author     Dimitry Andric <dim@FreeBSD.org>    2019-08-20 20:50:12 +0000
committer  Dimitry Andric <dim@FreeBSD.org>    2019-08-20 20:50:12 +0000
commit     e6d1592492a3a379186bfb02bd0f4eda0669c0d5 (patch)
tree       599ab169a01f1c86eda9adc774edaedde2f2db5b /lib/Target/AMDGPU
parent     1a56a5ead7a2e84bee8240f5f6b033b5f1707154 (diff)
Diffstat (limited to 'lib/Target/AMDGPU')
-rw-r--r-- lib/Target/AMDGPU/AMDGPU.h | 52
-rw-r--r-- lib/Target/AMDGPU/AMDGPU.td | 570
-rw-r--r-- lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp | 41
-rw-r--r-- lib/Target/AMDGPU/AMDGPUAliasAnalysis.h | 13
-rw-r--r-- lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp | 7
-rw-r--r-- lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp | 75
-rw-r--r-- lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp | 8
-rw-r--r-- lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp | 19
-rw-r--r-- lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h | 43
-rw-r--r-- lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 339
-rw-r--r-- lib/Target/AMDGPU/AMDGPUAsmPrinter.h | 17
-rw-r--r-- lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp | 314
-rw-r--r-- lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 362
-rw-r--r-- lib/Target/AMDGPU/AMDGPUCallLowering.h | 20
-rw-r--r-- lib/Target/AMDGPU/AMDGPUCallingConv.td | 49
-rw-r--r-- lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 136
-rw-r--r-- lib/Target/AMDGPU/AMDGPUFeatures.td | 18
-rw-r--r-- lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp | 7
-rw-r--r-- lib/Target/AMDGPU/AMDGPUFrameLowering.cpp | 7
-rw-r--r-- lib/Target/AMDGPU/AMDGPUFrameLowering.h | 7
-rw-r--r-- lib/Target/AMDGPU/AMDGPUGISel.td | 55
-rw-r--r-- lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def | 113
-rw-r--r-- lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp | 220
-rw-r--r-- lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h | 41
-rw-r--r-- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 802
-rw-r--r-- lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 363
-rw-r--r-- lib/Target/AMDGPU/AMDGPUISelLowering.h | 73
-rw-r--r-- lib/Target/AMDGPU/AMDGPUInline.cpp | 45
-rw-r--r-- lib/Target/AMDGPU/AMDGPUInstrInfo.cpp | 7
-rw-r--r-- lib/Target/AMDGPU/AMDGPUInstrInfo.h | 7
-rw-r--r-- lib/Target/AMDGPU/AMDGPUInstrInfo.td | 46
-rw-r--r-- lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 1469
-rw-r--r-- lib/Target/AMDGPU/AMDGPUInstructionSelector.h | 55
-rw-r--r-- lib/Target/AMDGPU/AMDGPUInstructions.td | 267
-rw-r--r-- lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp | 103
-rw-r--r-- lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h | 58
-rw-r--r-- lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 1357
-rw-r--r-- lib/Target/AMDGPU/AMDGPULegalizerInfo.h | 50
-rw-r--r-- lib/Target/AMDGPU/AMDGPULibCalls.cpp | 151
-rw-r--r-- lib/Target/AMDGPU/AMDGPULibFunc.cpp | 62
-rw-r--r-- lib/Target/AMDGPU/AMDGPULibFunc.h | 11
-rw-r--r-- lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp | 7
-rw-r--r-- lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp | 38
-rw-r--r-- lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp | 7
-rw-r--r-- lib/Target/AMDGPU/AMDGPUMCInstLower.cpp | 48
-rw-r--r-- lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp | 7
-rw-r--r-- lib/Target/AMDGPU/AMDGPUMachineFunction.cpp | 21
-rw-r--r-- lib/Target/AMDGPU/AMDGPUMachineFunction.h | 7
-rw-r--r-- lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp | 17
-rw-r--r-- lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h | 80
-rw-r--r-- lib/Target/AMDGPU/AMDGPUMacroFusion.cpp | 7
-rw-r--r-- lib/Target/AMDGPU/AMDGPUMacroFusion.h | 7
-rw-r--r-- lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp | 11
-rw-r--r-- lib/Target/AMDGPU/AMDGPUPTNote.h | 7
-rw-r--r-- lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp | 77
-rw-r--r-- lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h | 17
-rw-r--r-- lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 36
-rw-r--r-- lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp | 336
-rw-r--r-- lib/Target/AMDGPU/AMDGPURegAsmNames.inc.cpp | 353
-rw-r--r-- lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 1782
-rw-r--r-- lib/Target/AMDGPU/AMDGPURegisterBankInfo.h | 52
-rw-r--r-- lib/Target/AMDGPU/AMDGPURegisterBanks.td | 9
-rw-r--r-- lib/Target/AMDGPU/AMDGPURegisterInfo.cpp | 27
-rw-r--r-- lib/Target/AMDGPU/AMDGPURegisterInfo.h | 7
-rw-r--r-- lib/Target/AMDGPU/AMDGPURegisterInfo.td | 9
-rw-r--r-- lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp | 7
-rw-r--r-- lib/Target/AMDGPU/AMDGPUSearchableTables.td | 60
-rw-r--r-- lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 263
-rw-r--r-- lib/Target/AMDGPU/AMDGPUSubtarget.h | 311
-rw-r--r-- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 307
-rw-r--r-- lib/Target/AMDGPU/AMDGPUTargetMachine.h | 21
-rw-r--r-- lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp | 7
-rw-r--r-- lib/Target/AMDGPU/AMDGPUTargetObjectFile.h | 7
-rw-r--r-- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 38
-rw-r--r-- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h | 21
-rw-r--r-- lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp | 18
-rw-r--r-- lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp | 7
-rw-r--r-- lib/Target/AMDGPU/AMDILCFGStructurizer.cpp | 7
-rw-r--r-- lib/Target/AMDGPU/AMDKernelCodeT.h | 15
-rw-r--r-- lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 2786
-rw-r--r-- lib/Target/AMDGPU/BUFInstructions.td | 957
-rw-r--r-- lib/Target/AMDGPU/CaymanInstructions.td | 7
-rw-r--r-- lib/Target/AMDGPU/DSInstructions.td | 566
-rw-r--r-- lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp | 485
-rw-r--r-- lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h | 32
-rw-r--r-- lib/Target/AMDGPU/EvergreenInstructions.td | 7
-rw-r--r-- lib/Target/AMDGPU/FLATInstructions.td | 527
-rw-r--r-- lib/Target/AMDGPU/GCNDPPCombine.cpp | 259
-rw-r--r-- lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 826
-rw-r--r-- lib/Target/AMDGPU/GCNHazardRecognizer.h | 41
-rw-r--r-- lib/Target/AMDGPU/GCNILPSched.cpp | 7
-rw-r--r-- lib/Target/AMDGPU/GCNIterativeScheduler.cpp | 7
-rw-r--r-- lib/Target/AMDGPU/GCNIterativeScheduler.h | 7
-rw-r--r-- lib/Target/AMDGPU/GCNMinRegStrategy.cpp | 7
-rw-r--r-- lib/Target/AMDGPU/GCNNSAReassign.cpp | 343
-rw-r--r-- lib/Target/AMDGPU/GCNProcessors.td | 114
-rw-r--r-- lib/Target/AMDGPU/GCNRegBankReassign.cpp | 800
-rw-r--r-- lib/Target/AMDGPU/GCNRegPressure.cpp | 22
-rw-r--r-- lib/Target/AMDGPU/GCNRegPressure.h | 61
-rw-r--r-- lib/Target/AMDGPU/GCNSchedStrategy.cpp | 35
-rw-r--r-- lib/Target/AMDGPU/GCNSchedStrategy.h | 16
-rw-r--r-- lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp | 65
-rw-r--r-- lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp | 21
-rw-r--r-- lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp | 7
-rw-r--r-- lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h | 7
-rw-r--r-- lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h | 7
-rw-r--r-- lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp (renamed from lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp) | 537
-rw-r--r-- lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h (renamed from lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h) | 38
-rw-r--r-- lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp | 29
-rw-r--r-- lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h | 8
-rw-r--r-- lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp | 7
-rw-r--r-- lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h | 20
-rw-r--r-- lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp | 41
-rw-r--r-- lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h | 12
-rw-r--r-- lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp | 218
-rw-r--r-- lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h | 40
-rw-r--r-- lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp | 14
-rw-r--r-- lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp | 7
-rw-r--r-- lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp | 84
-rw-r--r-- lib/Target/AMDGPU/MIMGInstructions.td | 484
-rw-r--r-- lib/Target/AMDGPU/R600.td | 7
-rw-r--r-- lib/Target/AMDGPU/R600AsmPrinter.cpp | 7
-rw-r--r-- lib/Target/AMDGPU/R600AsmPrinter.h | 7
-rw-r--r-- lib/Target/AMDGPU/R600ClauseMergePass.cpp | 7
-rw-r--r-- lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp | 7
-rw-r--r-- lib/Target/AMDGPU/R600Defines.h | 7
-rw-r--r-- lib/Target/AMDGPU/R600EmitClauseMarkers.cpp | 7
-rw-r--r-- lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp | 7
-rw-r--r-- lib/Target/AMDGPU/R600FrameLowering.cpp | 7
-rw-r--r-- lib/Target/AMDGPU/R600FrameLowering.h | 7
-rw-r--r-- lib/Target/AMDGPU/R600ISelLowering.cpp | 37
-rw-r--r-- lib/Target/AMDGPU/R600ISelLowering.h | 14
-rw-r--r-- lib/Target/AMDGPU/R600InstrFormats.td | 7
-rw-r--r-- lib/Target/AMDGPU/R600InstrInfo.cpp | 8
-rw-r--r-- lib/Target/AMDGPU/R600InstrInfo.h | 7
-rw-r--r-- lib/Target/AMDGPU/R600Instructions.td | 35
-rw-r--r-- lib/Target/AMDGPU/R600MachineFunctionInfo.cpp | 7
-rw-r--r-- lib/Target/AMDGPU/R600MachineFunctionInfo.h | 7
-rw-r--r-- lib/Target/AMDGPU/R600MachineScheduler.cpp | 7
-rw-r--r-- lib/Target/AMDGPU/R600MachineScheduler.h | 7
-rw-r--r-- lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp | 7
-rw-r--r-- lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp | 22
-rw-r--r-- lib/Target/AMDGPU/R600Packetizer.cpp | 11
-rw-r--r-- lib/Target/AMDGPU/R600Processors.td | 18
-rw-r--r-- lib/Target/AMDGPU/R600RegisterInfo.cpp | 9
-rw-r--r-- lib/Target/AMDGPU/R600RegisterInfo.h | 9
-rw-r--r-- lib/Target/AMDGPU/R600Schedule.td | 7
-rw-r--r-- lib/Target/AMDGPU/R700Instructions.td | 7
-rw-r--r-- lib/Target/AMDGPU/SIAddIMGInit.cpp | 7
-rw-r--r-- lib/Target/AMDGPU/SIAnnotateControlFlow.cpp | 64
-rw-r--r-- lib/Target/AMDGPU/SIDebuggerInsertNops.cpp | 97
-rw-r--r-- lib/Target/AMDGPU/SIDefines.h | 178
-rw-r--r-- lib/Target/AMDGPU/SIFixSGPRCopies.cpp | 83
-rw-r--r-- lib/Target/AMDGPU/SIFixVGPRCopies.cpp | 7
-rw-r--r-- lib/Target/AMDGPU/SIFixWWMLiveness.cpp | 418
-rw-r--r-- lib/Target/AMDGPU/SIFixupVectorISel.cpp | 12
-rw-r--r-- lib/Target/AMDGPU/SIFoldOperands.cpp | 363
-rw-r--r-- lib/Target/AMDGPU/SIFormMemoryClauses.cpp | 22
-rw-r--r-- lib/Target/AMDGPU/SIFrameLowering.cpp | 810
-rw-r--r-- lib/Target/AMDGPU/SIFrameLowering.h | 28
-rw-r--r-- lib/Target/AMDGPU/SIISelLowering.cpp | 1918
-rw-r--r-- lib/Target/AMDGPU/SIISelLowering.h | 49
-rw-r--r-- lib/Target/AMDGPU/SIInsertSkips.cpp | 76
-rw-r--r-- lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 417
-rw-r--r-- lib/Target/AMDGPU/SIInstrFormats.td | 68
-rw-r--r-- lib/Target/AMDGPU/SIInstrInfo.cpp | 1415
-rw-r--r-- lib/Target/AMDGPU/SIInstrInfo.h | 125
-rw-r--r-- lib/Target/AMDGPU/SIInstrInfo.td | 654
-rw-r--r-- lib/Target/AMDGPU/SIInstructions.td | 425
-rw-r--r-- lib/Target/AMDGPU/SIIntrinsics.td | 19
-rw-r--r-- lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 60
-rw-r--r-- lib/Target/AMDGPU/SILowerControlFlow.cpp | 104
-rw-r--r-- lib/Target/AMDGPU/SILowerI1Copies.cpp | 107
-rw-r--r-- lib/Target/AMDGPU/SILowerSGPRSpills.cpp | 323
-rw-r--r-- lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 271
-rw-r--r-- lib/Target/AMDGPU/SIMachineFunctionInfo.h | 377
-rw-r--r-- lib/Target/AMDGPU/SIMachineScheduler.cpp | 11
-rw-r--r-- lib/Target/AMDGPU/SIMachineScheduler.h | 7
-rw-r--r-- lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 322
-rw-r--r-- lib/Target/AMDGPU/SIModeRegister.cpp | 9
-rw-r--r-- lib/Target/AMDGPU/SIOptimizeExecMasking.cpp | 98
-rw-r--r-- lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp | 155
-rw-r--r-- lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 36
-rw-r--r-- lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp | 221
-rw-r--r-- lib/Target/AMDGPU/SIProgramInfo.h | 21
-rw-r--r-- lib/Target/AMDGPU/SIRegisterInfo.cpp | 660
-rw-r--r-- lib/Target/AMDGPU/SIRegisterInfo.h | 78
-rw-r--r-- lib/Target/AMDGPU/SIRegisterInfo.td | 633
-rw-r--r-- lib/Target/AMDGPU/SISchedule.td | 71
-rw-r--r-- lib/Target/AMDGPU/SIShrinkInstructions.cpp | 140
-rw-r--r-- lib/Target/AMDGPU/SIWholeQuadMode.cpp | 82
-rw-r--r-- lib/Target/AMDGPU/SMInstructions.td | 359
-rw-r--r-- lib/Target/AMDGPU/SOPInstructions.td | 666
-rw-r--r-- lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp | 9
-rw-r--r-- lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.h | 29
-rw-r--r-- lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp | 36
-rw-r--r-- lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h | 14
-rw-r--r-- lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 410
-rw-r--r-- lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 203
-rw-r--r-- lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp | 723
-rw-r--r-- lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h | 135
-rw-r--r-- lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h | 11
-rw-r--r-- lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp | 7
-rw-r--r-- lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h | 7
-rw-r--r-- lib/Target/AMDGPU/VIInstrFormats.td | 7
-rw-r--r-- lib/Target/AMDGPU/VIInstructions.td | 7
-rw-r--r-- lib/Target/AMDGPU/VOP1Instructions.td | 487
-rw-r--r-- lib/Target/AMDGPU/VOP2Instructions.td | 889
-rw-r--r-- lib/Target/AMDGPU/VOP3Instructions.td | 501
-rw-r--r-- lib/Target/AMDGPU/VOP3PInstructions.td | 220
-rw-r--r-- lib/Target/AMDGPU/VOPCInstructions.td | 972
-rw-r--r-- lib/Target/AMDGPU/VOPInstructions.td | 182
212 files changed, 29044 insertions, 9242 deletions
diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h
index bb7801c172f6..19a8bd901629 100644
--- a/lib/Target/AMDGPU/AMDGPU.h
+++ b/lib/Target/AMDGPU/AMDGPU.h
@@ -1,9 +1,8 @@
//===-- AMDGPU.h - MachineFunction passes hw codegen --------------*- C++ -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
/// \file
//===----------------------------------------------------------------------===//
@@ -51,14 +50,16 @@ FunctionPass *createSIFixControlFlowLiveIntervalsPass();
FunctionPass *createSIOptimizeExecMaskingPreRAPass();
FunctionPass *createSIFixSGPRCopiesPass();
FunctionPass *createSIMemoryLegalizerPass();
-FunctionPass *createSIDebuggerInsertNopsPass();
FunctionPass *createSIInsertWaitcntsPass();
-FunctionPass *createSIFixWWMLivenessPass();
+FunctionPass *createSIPreAllocateWWMRegsPass();
FunctionPass *createSIFormMemoryClausesPass();
-FunctionPass *createAMDGPUSimplifyLibCallsPass(const TargetOptions &);
+FunctionPass *createAMDGPUSimplifyLibCallsPass(const TargetOptions &,
+ const TargetMachine *);
FunctionPass *createAMDGPUUseNativeCallsPass();
FunctionPass *createAMDGPUCodeGenPreparePass();
FunctionPass *createAMDGPUMachineCFGStructurizerPass();
+FunctionPass *createAMDGPUPropagateAttributesEarlyPass(const TargetMachine *);
+ModulePass *createAMDGPUPropagateAttributesLatePass(const TargetMachine *);
FunctionPass *createAMDGPURewriteOutArgumentsPass();
FunctionPass *createSIModeRegisterPass();
@@ -93,6 +94,12 @@ ModulePass *createAMDGPULowerKernelAttributesPass();
void initializeAMDGPULowerKernelAttributesPass(PassRegistry &);
extern char &AMDGPULowerKernelAttributesID;
+void initializeAMDGPUPropagateAttributesEarlyPass(PassRegistry &);
+extern char &AMDGPUPropagateAttributesEarlyID;
+
+void initializeAMDGPUPropagateAttributesLatePass(PassRegistry &);
+extern char &AMDGPUPropagateAttributesLateID;
+
void initializeAMDGPURewriteOutArgumentsPass(PassRegistry &);
extern char &AMDGPURewriteOutArgumentsID;
@@ -135,6 +142,9 @@ extern char &SIFixupVectorISelID;
void initializeSILowerI1CopiesPass(PassRegistry &);
extern char &SILowerI1CopiesID;
+void initializeSILowerSGPRSpillsPass(PassRegistry &);
+extern char &SILowerSGPRSpillsID;
+
void initializeSILoadStoreOptimizerPass(PassRegistry &);
extern char &SILoadStoreOptimizerID;
@@ -150,8 +160,8 @@ extern char &SIInsertSkipsPassID;
void initializeSIOptimizeExecMaskingPass(PassRegistry &);
extern char &SIOptimizeExecMaskingID;
-void initializeSIFixWWMLivenessPass(PassRegistry &);
-extern char &SIFixWWMLivenessID;
+void initializeSIPreAllocateWWMRegsPass(PassRegistry &);
+extern char &SIPreAllocateWWMRegsID;
void initializeAMDGPUSimplifyLibCallsPass(PassRegistry &);
extern char &AMDGPUSimplifyLibCallsID;
@@ -197,9 +207,6 @@ extern char &SIAnnotateControlFlowPassID;
void initializeSIMemoryLegalizerPass(PassRegistry&);
extern char &SIMemoryLegalizerID;
-void initializeSIDebuggerInsertNopsPass(PassRegistry&);
-extern char &SIDebuggerInsertNopsID;
-
void initializeSIModeRegisterPass(PassRegistry&);
extern char &SIModeRegisterID;
@@ -226,8 +233,11 @@ ModulePass *createAMDGPUOpenCLEnqueuedBlockLoweringPass();
void initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(PassRegistry &);
extern char &AMDGPUOpenCLEnqueuedBlockLoweringID;
-Target &getTheAMDGPUTarget();
-Target &getTheGCNTarget();
+void initializeGCNRegBankReassignPass(PassRegistry &);
+extern char &GCNRegBankReassignID;
+
+void initializeGCNNSAReassignPass(PassRegistry &);
+extern char &GCNNSAReassignID;
namespace AMDGPU {
enum TargetIndex {
@@ -250,21 +260,23 @@ enum TargetIndex {
namespace AMDGPUAS {
enum : unsigned {
// The maximum value for flat, generic, local, private, constant and region.
- MAX_AMDGPU_ADDRESS = 6,
+ MAX_AMDGPU_ADDRESS = 7,
FLAT_ADDRESS = 0, ///< Address space for flat memory.
GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0).
- REGION_ADDRESS = 2, ///< Address space for region memory.
+ REGION_ADDRESS = 2, ///< Address space for region memory. (GDS)
- CONSTANT_ADDRESS = 4, ///< Address space for constant memory (VTX2)
+ CONSTANT_ADDRESS = 4, ///< Address space for constant memory (VTX2).
LOCAL_ADDRESS = 3, ///< Address space for local memory.
PRIVATE_ADDRESS = 5, ///< Address space for private memory.
- CONSTANT_ADDRESS_32BIT = 6, ///< Address space for 32-bit constant memory
+ CONSTANT_ADDRESS_32BIT = 6, ///< Address space for 32-bit constant memory.
+
+ BUFFER_FAT_POINTER = 7, ///< Address space for 160-bit buffer fat pointers.
- /// Address space for direct addressible parameter memory (CONST0)
+ /// Address space for direct addressible parameter memory (CONST0).
PARAM_D_ADDRESS = 6,
- /// Address space for indirect addressible parameter memory (VTX1)
+ /// Address space for indirect addressible parameter memory (VTX1).
PARAM_I_ADDRESS = 7,
// Do not re-order the CONSTANT_BUFFER_* enums. Several places depend on
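
Editor's aside, not part of the patch: the hunk above adds BUFFER_FAT_POINTER = 7 and bumps MAX_AMDGPU_ADDRESS, and downstream code keys on these raw numbers (which is why the alias-rule table in AMDGPUAliasAnalysis.cpp, later in this diff, also grows). A minimal standalone C++ sketch of that kind of check, assuming only the enum values shown above (the helper name is illustrative, not an LLVM API):

// Standalone sketch, not LLVM source.
#include <cstdio>

namespace AMDGPUAS {
enum : unsigned {
  FLAT_ADDRESS = 0,
  GLOBAL_ADDRESS = 1,
  REGION_ADDRESS = 2,
  LOCAL_ADDRESS = 3,
  CONSTANT_ADDRESS = 4,
  PRIVATE_ADDRESS = 5,
  CONSTANT_ADDRESS_32BIT = 6,
  BUFFER_FAT_POINTER = 7,   // new in this change
  MAX_AMDGPU_ADDRESS = 7
};
} // namespace AMDGPUAS

// True only for the two constant variants; every other space, including the
// new buffer fat pointers, is treated as potentially writable.
static bool isConstantAddressSpace(unsigned AS) {
  return AS == AMDGPUAS::CONSTANT_ADDRESS ||
         AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
}

int main() {
  std::printf("%d\n", isConstantAddressSpace(AMDGPUAS::CONSTANT_ADDRESS_32BIT)); // 1
  std::printf("%d\n", isConstantAddressSpace(AMDGPUAS::BUFFER_FAT_POINTER));     // 0
}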
diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td
index 6a4cfe08e491..baeba534012c 100644
--- a/lib/Target/AMDGPU/AMDGPU.td
+++ b/lib/Target/AMDGPU/AMDGPU.td
@@ -1,9 +1,8 @@
//===-- AMDGPU.td - AMDGPU Tablegen files --------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===------------------------------------------------------------===//
@@ -61,6 +60,12 @@ def FeatureFlatScratchInsts : SubtargetFeature<"flat-scratch-insts",
"Have scratch_* flat memory instructions"
>;
+def FeatureScalarFlatScratchInsts : SubtargetFeature<"scalar-flat-scratch-insts",
+ "ScalarFlatScratchInsts",
+ "true",
+ "Have s_scratch_* flat memory instructions"
+>;
+
def FeatureAddNoCarryInsts : SubtargetFeature<"add-no-carry-insts",
"AddNoCarryInsts",
"true",
@@ -103,6 +108,12 @@ def FeatureFmaMixInsts : SubtargetFeature<"fma-mix-insts",
"Has v_fma_mix_f32, v_fma_mixlo_f16, v_fma_mixhi_f16 instructions"
>;
+def FeatureDoesNotSupportXNACK : SubtargetFeature<"no-xnack-support",
+ "DoesNotSupportXNACK",
+ "true",
+ "Hardware does not support XNACK"
+>;
+
// XNACK is disabled if SH_MEM_CONFIG.ADDRESS_MODE = GPUVM on chips that support
// XNACK. The current default kernel driver setting is:
// - graphics ring: XNACK disabled
@@ -116,12 +127,78 @@ def FeatureXNACK : SubtargetFeature<"xnack",
"Enable XNACK support"
>;
+def FeatureCuMode : SubtargetFeature<"cumode",
+ "EnableCuMode",
+ "true",
+ "Enable CU wavefront execution mode"
+>;
+
def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug",
"SGPRInitBug",
"true",
"VI SGPR initialization bug requiring a fixed SGPR allocation size"
>;
+def FeatureLdsMisalignedBug : SubtargetFeature<"lds-misaligned-bug",
+ "LDSMisalignedBug",
+ "true",
+ "Some GFX10 bug with misaligned multi-dword LDS access in WGP mode"
+>;
+
+def FeatureVcmpxPermlaneHazard : SubtargetFeature<"vcmpx-permlane-hazard",
+ "HasVcmpxPermlaneHazard",
+ "true",
+ "TODO: describe me"
+>;
+
+def FeatureVMEMtoScalarWriteHazard : SubtargetFeature<"vmem-to-scalar-write-hazard",
+ "HasVMEMtoScalarWriteHazard",
+ "true",
+ "VMEM instruction followed by scalar writing to EXEC mask, M0 or SGPR leads to incorrect execution."
+>;
+
+def FeatureSMEMtoVectorWriteHazard : SubtargetFeature<"smem-to-vector-write-hazard",
+ "HasSMEMtoVectorWriteHazard",
+ "true",
+ "s_load_dword followed by v_cmp page faults"
+>;
+
+def FeatureInstFwdPrefetchBug : SubtargetFeature<"inst-fwd-prefetch-bug",
+ "HasInstFwdPrefetchBug",
+ "true",
+ "S_INST_PREFETCH instruction causes shader to hang"
+>;
+
+def FeatureVcmpxExecWARHazard : SubtargetFeature<"vcmpx-exec-war-hazard",
+ "HasVcmpxExecWARHazard",
+ "true",
+ "V_CMPX WAR hazard on EXEC (V_CMPX issue ONLY)"
+>;
+
+def FeatureLdsBranchVmemWARHazard : SubtargetFeature<"lds-branch-vmem-war-hazard",
+ "HasLdsBranchVmemWARHazard",
+ "true",
+ "Switching between LDS and VMEM-tex not waiting VM_VSRC=0"
+>;
+
+def FeatureNSAtoVMEMBug : SubtargetFeature<"nsa-to-vmem-bug",
+ "HasNSAtoVMEMBug",
+ "true",
+ "MIMG-NSA followed by VMEM fail if EXEC_LO or EXEC_HI equals zero"
+>;
+
+def FeatureFlatSegmentOffsetBug : SubtargetFeature<"flat-segment-offset-bug",
+ "HasFlatSegmentOffsetBug",
+ "true",
+ "GFX10 bug, inst_offset ignored in flat segment"
+>;
+
+def FeatureOffset3fBug : SubtargetFeature<"offset-3f-bug",
+ "HasOffset3fBug",
+ "true",
+ "Branch offset of 3f hardware bug"
+>;
+
class SubtargetFeatureLDSBankCount <int Value> : SubtargetFeature <
"ldsbankcount"#Value,
"LDSBankCount",
@@ -144,10 +221,10 @@ def FeatureCIInsts : SubtargetFeature<"ci-insts",
"Additional instructions for CI+"
>;
-def FeatureVIInsts : SubtargetFeature<"vi-insts",
- "VIInsts",
+def FeatureGFX8Insts : SubtargetFeature<"gfx8-insts",
+ "GFX8Insts",
"true",
- "Additional instructions for VI+"
+ "Additional instructions for GFX8+"
>;
def FeatureGFX9Insts : SubtargetFeature<"gfx9-insts",
@@ -156,6 +233,18 @@ def FeatureGFX9Insts : SubtargetFeature<"gfx9-insts",
"Additional instructions for GFX9+"
>;
+def FeatureGFX10Insts : SubtargetFeature<"gfx10-insts",
+ "GFX10Insts",
+ "true",
+ "Additional instructions for GFX10+"
+>;
+
+def FeatureGFX7GFX8GFX9Insts : SubtargetFeature<"gfx7-gfx8-gfx9-insts",
+ "GFX7GFX8GFX9Insts",
+ "true",
+ "Instructions shared in GFX7, GFX8, GFX9"
+>;
+
def FeatureSMemRealTime : SubtargetFeature<"s-memrealtime",
"HasSMemRealTime",
"true",
@@ -246,12 +335,25 @@ def FeatureDPP : SubtargetFeature<"dpp",
"Support DPP (Data Parallel Primitives) extension"
>;
+// DPP8 allows arbitrary cross-lane swizzling withing groups of 8 lanes.
+def FeatureDPP8 : SubtargetFeature<"dpp8",
+ "HasDPP8",
+ "true",
+ "Support DPP8 (Data Parallel Primitives) extension"
+>;
+
def FeatureR128A16 : SubtargetFeature<"r128-a16",
"HasR128A16",
"true",
"Support 16 bit coordindates/gradients/lod/clamp/mip types on gfx9"
>;
+def FeatureNSAEncoding : SubtargetFeature<"nsa-encoding",
+ "HasNSAEncoding",
+ "true",
+ "Support NSA encoding for image instructions"
+>;
+
def FeatureIntClamp : SubtargetFeature<"int-clamp-insts",
"HasIntClamp",
"true",
@@ -270,10 +372,65 @@ def FeatureDLInsts : SubtargetFeature<"dl-insts",
"Has v_fmac_f32 and v_xnor_b32 instructions"
>;
-def FeatureDotInsts : SubtargetFeature<"dot-insts",
- "HasDotInsts",
+def FeatureDot1Insts : SubtargetFeature<"dot1-insts",
+ "HasDot1Insts",
+ "true",
+ "Has v_dot4_i32_i8 and v_dot8_i32_i4 instructions"
+>;
+
+def FeatureDot2Insts : SubtargetFeature<"dot2-insts",
+ "HasDot2Insts",
+ "true",
+ "Has v_dot2_f32_f16, v_dot2_i32_i16, v_dot2_u32_u16, v_dot4_u32_u8, v_dot8_u32_u4 instructions"
+>;
+
+def FeatureDot3Insts : SubtargetFeature<"dot3-insts",
+ "HasDot3Insts",
+ "true",
+ "Has v_dot8c_i32_i4 instruction"
+>;
+
+def FeatureDot4Insts : SubtargetFeature<"dot4-insts",
+ "HasDot4Insts",
+ "true",
+ "Has v_dot2c_i32_i16 instruction"
+>;
+
+def FeatureDot5Insts : SubtargetFeature<"dot5-insts",
+ "HasDot5Insts",
"true",
- "Has v_dot* instructions"
+ "Has v_dot2c_f32_f16 instruction"
+>;
+
+def FeatureDot6Insts : SubtargetFeature<"dot6-insts",
+ "HasDot6Insts",
+ "true",
+ "Has v_dot4c_i32_i8 instruction"
+>;
+
+def FeatureMAIInsts : SubtargetFeature<"mai-insts",
+ "HasMAIInsts",
+ "true",
+ "Has mAI instructions"
+>;
+
+def FeaturePkFmacF16Inst : SubtargetFeature<"pk-fmac-f16-inst",
+ "HasPkFmacF16Inst",
+ "true",
+ "Has v_pk_fmac_f16 instruction"
+>;
+
+def FeatureAtomicFaddInsts : SubtargetFeature<"atomic-fadd-insts",
+ "HasAtomicFaddInsts",
+ "true",
+ "Has buffer_atomic_add_f32, buffer_atomic_pk_add_f16, global_atomic_add_f32, "
+ "global_atomic_pk_add_f16 instructions"
+>;
+
+def FeatureDoesNotSupportSRAMECC : SubtargetFeature<"no-sram-ecc-support",
+ "DoesNotSupportSRAMECC",
+ "true",
+ "Hardware does not support SRAM ECC"
>;
def FeatureSRAMECC : SubtargetFeature<"sram-ecc",
@@ -282,6 +439,36 @@ def FeatureSRAMECC : SubtargetFeature<"sram-ecc",
"Enable SRAM ECC"
>;
+def FeatureNoSdstCMPX : SubtargetFeature<"no-sdst-cmpx",
+ "HasNoSdstCMPX",
+ "true",
+ "V_CMPX does not write VCC/SGPR in addition to EXEC"
+>;
+
+def FeatureVscnt : SubtargetFeature<"vscnt",
+ "HasVscnt",
+ "true",
+ "Has separate store vscnt counter"
+>;
+
+def FeatureRegisterBanking : SubtargetFeature<"register-banking",
+ "HasRegisterBanking",
+ "true",
+ "Has register banking"
+>;
+
+def FeatureVOP3Literal : SubtargetFeature<"vop3-literal",
+ "HasVOP3Literal",
+ "true",
+ "Can use one literal in VOP3"
+>;
+
+def FeatureNoDataDepHazard : SubtargetFeature<"no-data-dep-hazard",
+ "HasNoDataDepHazard",
+ "true",
+ "Does not need SW waitstates"
+>;
+
//===------------------------------------------------------------===//
// Subtarget Features (options and debugging)
//===------------------------------------------------------------===//
@@ -327,13 +514,6 @@ def FeatureMaxPrivateElementSize4 : FeatureMaxPrivateElementSize<4>;
def FeatureMaxPrivateElementSize8 : FeatureMaxPrivateElementSize<8>;
def FeatureMaxPrivateElementSize16 : FeatureMaxPrivateElementSize<16>;
-def FeatureEnableHugePrivateBuffer : SubtargetFeature<
- "huge-private-buffer",
- "EnableHugePrivateBuffer",
- "true",
- "Enable private/scratch buffer sizes greater than 128 GB"
->;
-
def FeatureDumpCode : SubtargetFeature <"DumpCode",
"DumpCode",
"true",
@@ -425,103 +605,123 @@ def FeatureDisable : SubtargetFeature<"",
"Dummy feature to disable assembler instructions"
>;
-def FeatureGCN : SubtargetFeature<"gcn",
- "IsGCN",
- "true",
- "GCN or newer GPU"
->;
-
class GCNSubtargetFeatureGeneration <string Value,
- list<SubtargetFeature> Implies> :
- SubtargetFeatureGeneration <Value, "GCNSubtarget", Implies>;
+ string FeatureName,
+ list<SubtargetFeature> Implies> :
+ SubtargetFeatureGeneration <Value, FeatureName, "GCNSubtarget", Implies>;
def FeatureSouthernIslands : GCNSubtargetFeatureGeneration<"SOUTHERN_ISLANDS",
+ "southern-islands",
[FeatureFP64, FeatureLocalMemorySize32768, FeatureMIMG_R128,
- FeatureWavefrontSize64, FeatureGCN,
- FeatureLDSBankCount32, FeatureMovrel, FeatureTrigReducedRange]
+ FeatureWavefrontSize64,
+ FeatureLDSBankCount32, FeatureMovrel, FeatureTrigReducedRange,
+ FeatureDoesNotSupportSRAMECC, FeatureDoesNotSupportXNACK]
>;
def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS",
+ "sea-islands",
[FeatureFP64, FeatureLocalMemorySize65536, FeatureMIMG_R128,
- FeatureWavefrontSize64, FeatureGCN, FeatureFlatAddressSpace,
- FeatureCIInsts, FeatureMovrel, FeatureTrigReducedRange]
+ FeatureWavefrontSize64, FeatureFlatAddressSpace,
+ FeatureCIInsts, FeatureMovrel, FeatureTrigReducedRange,
+ FeatureGFX7GFX8GFX9Insts, FeatureDoesNotSupportSRAMECC]
>;
def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
+ "volcanic-islands",
[FeatureFP64, FeatureLocalMemorySize65536, FeatureMIMG_R128,
- FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN,
- FeatureGCN3Encoding, FeatureCIInsts, FeatureVIInsts, Feature16BitInsts,
+ FeatureWavefrontSize64, FeatureFlatAddressSpace,
+ FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts,
FeatureSMemRealTime, FeatureVGPRIndexMode, FeatureMovrel,
FeatureScalarStores, FeatureInv2PiInlineImm,
FeatureSDWA, FeatureSDWAOutModsVOPC, FeatureSDWAMac, FeatureDPP,
- FeatureIntClamp, FeatureTrigReducedRange
+ FeatureIntClamp, FeatureTrigReducedRange, FeatureDoesNotSupportSRAMECC,
+ FeatureGFX8Insts, FeatureGFX7GFX8GFX9Insts
]
>;
def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9",
+ "gfx9",
[FeatureFP64, FeatureLocalMemorySize65536,
- FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN,
- FeatureGCN3Encoding, FeatureCIInsts, FeatureVIInsts, Feature16BitInsts,
+ FeatureWavefrontSize64, FeatureFlatAddressSpace,
+ FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts,
FeatureSMemRealTime, FeatureScalarStores, FeatureInv2PiInlineImm,
FeatureApertureRegs, FeatureGFX9Insts, FeatureVOP3P, FeatureVGPRIndexMode,
FeatureFastFMAF32, FeatureDPP, FeatureIntClamp,
FeatureSDWA, FeatureSDWAOmod, FeatureSDWAScalar, FeatureSDWASdst,
FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts,
- FeatureAddNoCarryInsts, FeatureScalarAtomics, FeatureR128A16
+ FeatureAddNoCarryInsts, FeatureGFX8Insts, FeatureGFX7GFX8GFX9Insts,
+ FeatureScalarFlatScratchInsts, FeatureScalarAtomics, FeatureR128A16
]
>;
-class SubtargetFeatureISAVersion <int Major, int Minor, int Stepping,
- list<SubtargetFeature> Implies>
- : SubtargetFeature <
- "isaver"#Major#"."#Minor#"."#Stepping,
- "IsaVersion",
- "ISAVersion"#Major#"_"#Minor#"_"#Stepping,
- "Instruction set version number",
- Implies
+def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10",
+ "gfx10",
+ [FeatureFP64, FeatureLocalMemorySize65536, FeatureMIMG_R128,
+ FeatureFlatAddressSpace,
+ FeatureCIInsts, Feature16BitInsts,
+ FeatureSMemRealTime, FeatureInv2PiInlineImm,
+ FeatureApertureRegs, FeatureGFX9Insts, FeatureGFX10Insts, FeatureVOP3P,
+ FeatureMovrel, FeatureFastFMAF32, FeatureDPP, FeatureIntClamp,
+ FeatureSDWA, FeatureSDWAOmod, FeatureSDWAScalar, FeatureSDWASdst,
+ FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts,
+ FeatureAddNoCarryInsts, FeatureFmaMixInsts, FeatureGFX8Insts,
+ FeatureNoSdstCMPX, FeatureVscnt, FeatureRegisterBanking,
+ FeatureVOP3Literal, FeatureDPP8,
+ FeatureNoDataDepHazard, FeaturePkFmacF16Inst, FeatureDoesNotSupportSRAMECC
+ ]
>;
-def FeatureISAVersion6_0_0 : SubtargetFeatureISAVersion <6,0,0,
- [FeatureSouthernIslands,
+class FeatureSet<list<SubtargetFeature> Features_> {
+ list<SubtargetFeature> Features = Features_;
+}
+
+def FeatureISAVersion6_0_0 : FeatureSet<[FeatureSouthernIslands,
FeatureFastFMAF32,
HalfRate64Ops,
FeatureLDSBankCount32,
+ FeatureDoesNotSupportXNACK,
FeatureCodeObjectV3]>;
-def FeatureISAVersion6_0_1 : SubtargetFeatureISAVersion <6,0,1,
+def FeatureISAVersion6_0_1 : FeatureSet<
[FeatureSouthernIslands,
FeatureLDSBankCount32,
+ FeatureDoesNotSupportXNACK,
FeatureCodeObjectV3]>;
-def FeatureISAVersion7_0_0 : SubtargetFeatureISAVersion <7,0,0,
+def FeatureISAVersion7_0_0 : FeatureSet<
[FeatureSeaIslands,
FeatureLDSBankCount32,
+ FeatureDoesNotSupportXNACK,
FeatureCodeObjectV3]>;
-def FeatureISAVersion7_0_1 : SubtargetFeatureISAVersion <7,0,1,
+def FeatureISAVersion7_0_1 : FeatureSet<
[FeatureSeaIslands,
HalfRate64Ops,
FeatureLDSBankCount32,
FeatureFastFMAF32,
+ FeatureDoesNotSupportXNACK,
FeatureCodeObjectV3]>;
-def FeatureISAVersion7_0_2 : SubtargetFeatureISAVersion <7,0,2,
+def FeatureISAVersion7_0_2 : FeatureSet<
[FeatureSeaIslands,
FeatureLDSBankCount16,
FeatureFastFMAF32,
+ FeatureDoesNotSupportXNACK,
FeatureCodeObjectV3]>;
-def FeatureISAVersion7_0_3 : SubtargetFeatureISAVersion <7,0,3,
+def FeatureISAVersion7_0_3 : FeatureSet<
[FeatureSeaIslands,
FeatureLDSBankCount16,
+ FeatureDoesNotSupportXNACK,
FeatureCodeObjectV3]>;
-def FeatureISAVersion7_0_4 : SubtargetFeatureISAVersion <7,0,4,
+def FeatureISAVersion7_0_4 : FeatureSet<
[FeatureSeaIslands,
FeatureLDSBankCount32,
+ FeatureDoesNotSupportXNACK,
FeatureCodeObjectV3]>;
-def FeatureISAVersion8_0_1 : SubtargetFeatureISAVersion <8,0,1,
+def FeatureISAVersion8_0_1 : FeatureSet<
[FeatureVolcanicIslands,
FeatureFastFMAF32,
HalfRate64Ops,
@@ -530,78 +730,151 @@ def FeatureISAVersion8_0_1 : SubtargetFeatureISAVersion <8,0,1,
FeatureUnpackedD16VMem,
FeatureCodeObjectV3]>;
-def FeatureISAVersion8_0_2 : SubtargetFeatureISAVersion <8,0,2,
+def FeatureISAVersion8_0_2 : FeatureSet<
[FeatureVolcanicIslands,
FeatureLDSBankCount32,
FeatureSGPRInitBug,
FeatureUnpackedD16VMem,
+ FeatureDoesNotSupportXNACK,
FeatureCodeObjectV3]>;
-def FeatureISAVersion8_0_3 : SubtargetFeatureISAVersion <8,0,3,
+def FeatureISAVersion8_0_3 : FeatureSet<
[FeatureVolcanicIslands,
FeatureLDSBankCount32,
FeatureUnpackedD16VMem,
+ FeatureDoesNotSupportXNACK,
FeatureCodeObjectV3]>;
-def FeatureISAVersion8_1_0 : SubtargetFeatureISAVersion <8,1,0,
+def FeatureISAVersion8_1_0 : FeatureSet<
[FeatureVolcanicIslands,
FeatureLDSBankCount16,
FeatureXNACK,
FeatureCodeObjectV3]>;
-def FeatureISAVersion9_0_0 : SubtargetFeatureISAVersion <9,0,0,
+def FeatureISAVersion9_0_0 : FeatureSet<
[FeatureGFX9,
FeatureMadMixInsts,
FeatureLDSBankCount32,
- FeatureCodeObjectV3]>;
+ FeatureCodeObjectV3,
+ FeatureDoesNotSupportXNACK,
+ FeatureDoesNotSupportSRAMECC]>;
-def FeatureISAVersion9_0_2 : SubtargetFeatureISAVersion <9,0,2,
+def FeatureISAVersion9_0_2 : FeatureSet<
[FeatureGFX9,
FeatureMadMixInsts,
FeatureLDSBankCount32,
FeatureXNACK,
+ FeatureDoesNotSupportSRAMECC,
FeatureCodeObjectV3]>;
-def FeatureISAVersion9_0_4 : SubtargetFeatureISAVersion <9,0,4,
+def FeatureISAVersion9_0_4 : FeatureSet<
[FeatureGFX9,
FeatureLDSBankCount32,
FeatureFmaMixInsts,
+ FeatureDoesNotSupportXNACK,
+ FeatureDoesNotSupportSRAMECC,
FeatureCodeObjectV3]>;
-def FeatureISAVersion9_0_6 : SubtargetFeatureISAVersion <9,0,6,
+def FeatureISAVersion9_0_6 : FeatureSet<
[FeatureGFX9,
HalfRate64Ops,
FeatureFmaMixInsts,
FeatureLDSBankCount32,
FeatureDLInsts,
- FeatureDotInsts,
+ FeatureDot1Insts,
+ FeatureDot2Insts,
+ FeatureDoesNotSupportXNACK,
+ FeatureCodeObjectV3]>;
+
+def FeatureISAVersion9_0_8 : FeatureSet<
+ [FeatureGFX9,
+ HalfRate64Ops,
+ FeatureFmaMixInsts,
+ FeatureLDSBankCount32,
+ FeatureDLInsts,
+ FeatureDot1Insts,
+ FeatureDot2Insts,
+ FeatureDot3Insts,
+ FeatureDot4Insts,
+ FeatureDot5Insts,
+ FeatureDot6Insts,
+ FeatureMAIInsts,
+ FeaturePkFmacF16Inst,
+ FeatureAtomicFaddInsts,
FeatureSRAMECC,
FeatureCodeObjectV3]>;
-def FeatureISAVersion9_0_9 : SubtargetFeatureISAVersion <9,0,9,
+def FeatureISAVersion9_0_9 : FeatureSet<
[FeatureGFX9,
FeatureMadMixInsts,
FeatureLDSBankCount32,
FeatureXNACK,
FeatureCodeObjectV3]>;
-//===----------------------------------------------------------------------===//
-// Debugger related subtarget features.
-//===----------------------------------------------------------------------===//
-
-def FeatureDebuggerInsertNops : SubtargetFeature<
- "amdgpu-debugger-insert-nops",
- "DebuggerInsertNops",
- "true",
- "Insert one nop instruction for each high level source statement"
->;
+// TODO: Organize more features into groups.
+def FeatureGroup {
+ // Bugs present on gfx10.1.
+ list<SubtargetFeature> GFX10_1_Bugs = [
+ FeatureVcmpxPermlaneHazard,
+ FeatureVMEMtoScalarWriteHazard,
+ FeatureSMEMtoVectorWriteHazard,
+ FeatureInstFwdPrefetchBug,
+ FeatureVcmpxExecWARHazard,
+ FeatureLdsBranchVmemWARHazard,
+ FeatureNSAtoVMEMBug,
+ FeatureOffset3fBug,
+ FeatureFlatSegmentOffsetBug
+ ];
+}
-def FeatureDebuggerEmitPrologue : SubtargetFeature<
- "amdgpu-debugger-emit-prologue",
- "DebuggerEmitPrologue",
- "true",
- "Emit debugger prologue"
->;
+def FeatureISAVersion10_1_0 : FeatureSet<
+ !listconcat(FeatureGroup.GFX10_1_Bugs,
+ [FeatureGFX10,
+ FeatureLDSBankCount32,
+ FeatureDLInsts,
+ FeatureNSAEncoding,
+ FeatureWavefrontSize32,
+ FeatureScalarStores,
+ FeatureScalarAtomics,
+ FeatureScalarFlatScratchInsts,
+ FeatureLdsMisalignedBug,
+ FeatureDoesNotSupportXNACK,
+ FeatureCodeObjectV3])>;
+
+def FeatureISAVersion10_1_1 : FeatureSet<
+ !listconcat(FeatureGroup.GFX10_1_Bugs,
+ [FeatureGFX10,
+ FeatureLDSBankCount32,
+ FeatureDLInsts,
+ FeatureDot1Insts,
+ FeatureDot2Insts,
+ FeatureDot5Insts,
+ FeatureDot6Insts,
+ FeatureNSAEncoding,
+ FeatureWavefrontSize32,
+ FeatureScalarStores,
+ FeatureScalarAtomics,
+ FeatureScalarFlatScratchInsts,
+ FeatureDoesNotSupportXNACK,
+ FeatureCodeObjectV3])>;
+
+def FeatureISAVersion10_1_2 : FeatureSet<
+ !listconcat(FeatureGroup.GFX10_1_Bugs,
+ [FeatureGFX10,
+ FeatureLDSBankCount32,
+ FeatureDLInsts,
+ FeatureDot1Insts,
+ FeatureDot2Insts,
+ FeatureDot5Insts,
+ FeatureDot6Insts,
+ FeatureNSAEncoding,
+ FeatureWavefrontSize32,
+ FeatureScalarStores,
+ FeatureScalarAtomics,
+ FeatureScalarFlatScratchInsts,
+ FeatureLdsMisalignedBug,
+ FeatureDoesNotSupportXNACK,
+ FeatureCodeObjectV3])>;
//===----------------------------------------------------------------------===//
@@ -682,23 +955,71 @@ def NullALU : InstrItinClass;
// Predicate helper class
//===----------------------------------------------------------------------===//
-def isSICI : Predicate<
- "Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||"
- "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS"
->, AssemblerPredicate<"!FeatureGCN3Encoding">;
+def isGFX6 :
+ Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS">,
+ AssemblerPredicate<"FeatureSouthernIslands">;
+
+def isGFX6GFX7 :
+ Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||"
+ "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS">,
+ AssemblerPredicate<"!FeatureGCN3Encoding,!FeatureGFX10Insts">;
+
+def isGFX6GFX7GFX10 :
+ Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||"
+ "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||"
+ "Subtarget->getGeneration() == AMDGPUSubtarget::GFX10">,
+ AssemblerPredicate<"!FeatureGCN3Encoding">;
+
+def isGFX7Only :
+ Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS">,
+ AssemblerPredicate<"!FeatureGCN3Encoding,FeatureCIInsts,!FeatureGFX10Insts">;
+
+def isGFX7GFX10 :
+ Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||"
+ "Subtarget->getGeneration() == AMDGPUSubtarget::GFX10">,
+ AssemblerPredicate<"!FeatureGCN3Encoding,FeatureCIInsts">;
+
+def isGFX7GFX8GFX9 :
+ Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||"
+ "Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||"
+ "Subtarget->getGeneration() == AMDGPUSubtarget::GFX9">,
+ AssemblerPredicate<"FeatureGFX7GFX8GFX9Insts">;
+
+def isGFX6GFX7GFX8GFX9 :
+ Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||"
+ "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||"
+ "Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||"
+ "Subtarget->getGeneration() == AMDGPUSubtarget::GFX9">,
+ AssemblerPredicate<"!FeatureGFX10Insts">;
+
+def isGFX7Plus :
+ Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS">,
+ AssemblerPredicate<"FeatureCIInsts">;
+
+def isGFX8Plus :
+ Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS">,
+ AssemblerPredicate<"FeatureGFX8Insts">;
-def isVI : Predicate <
- "Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS">,
- AssemblerPredicate<"FeatureGCN3Encoding">;
+def isGFX8Only : Predicate<"Subtarget->getGeneration() =="
+ "AMDGPUSubtarget::VOLCANIC_ISLANDS">,
+ AssemblerPredicate <"FeatureVolcanicIslands">;
-def isGFX9 : Predicate <
- "Subtarget->getGeneration() >= AMDGPUSubtarget::GFX9">,
+def isGFX9Plus :
+ Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX9">,
AssemblerPredicate<"FeatureGFX9Insts">;
-// TODO: Either the name to be changed or we simply use IsCI!
-def isCIVI : Predicate <
- "Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS">,
- AssemblerPredicate<"FeatureCIInsts">;
+def isGFX9Only : Predicate <
+ "Subtarget->getGeneration() == AMDGPUSubtarget::GFX9">,
+ AssemblerPredicate<"FeatureGCN3Encoding,FeatureGFX9Insts">;
+
+def isGFX8GFX9 :
+ Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||"
+ "Subtarget->getGeneration() == AMDGPUSubtarget::GFX9">,
+ AssemblerPredicate<"FeatureGFX8Insts,FeatureGCN3Encoding">;
+
+def isGFX10Plus :
+ Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10">,
+ AssemblerPredicate<"FeatureGFX10Insts">;
def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">,
AssemblerPredicate<"FeatureFlatAddressSpace">;
@@ -707,6 +1028,8 @@ def HasFlatGlobalInsts : Predicate<"Subtarget->hasFlatGlobalInsts()">,
AssemblerPredicate<"FeatureFlatGlobalInsts">;
def HasFlatScratchInsts : Predicate<"Subtarget->hasFlatScratchInsts()">,
AssemblerPredicate<"FeatureFlatScratchInsts">;
+def HasScalarFlatScratchInsts : Predicate<"Subtarget->hasScalarFlatScratchInsts()">,
+ AssemblerPredicate<"FeatureScalarFlatScratchInsts">;
def HasD16LoadStore : Predicate<"Subtarget->hasD16LoadStore()">,
AssemblerPredicate<"FeatureGFX9Insts">;
@@ -716,7 +1039,7 @@ def HasPackedD16VMem : Predicate<"!Subtarget->hasUnpackedD16VMem()">,
AssemblerPredicate<"!FeatureUnpackedD16VMem">;
def D16PreservesUnusedBits :
- Predicate<"Subtarget->hasD16LoadStore() && !Subtarget->isSRAMECCEnabled()">,
+ Predicate<"Subtarget->d16PreservesUnusedBits()">,
AssemblerPredicate<"FeatureGFX9Insts,!FeatureSRAMECC">;
def LDSRequiresM0Init : Predicate<"Subtarget->ldsRequiresM0Init()">;
@@ -728,38 +1051,54 @@ def HasDSAddTid : Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX9
def HasAddNoCarryInsts : Predicate<"Subtarget->hasAddNoCarry()">,
AssemblerPredicate<"FeatureAddNoCarryInsts">;
-def NotHasAddNoCarryInsts : Predicate<"!Subtarget->hasAddNoCarry()">,
- AssemblerPredicate<"!FeatureAddNoCarryInsts">;
+def NotHasAddNoCarryInsts : Predicate<"!Subtarget->hasAddNoCarry()">;
def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">,
AssemblerPredicate<"Feature16BitInsts">;
def HasVOP3PInsts : Predicate<"Subtarget->hasVOP3PInsts()">,
AssemblerPredicate<"FeatureVOP3P">;
-def NotHasVOP3PInsts : Predicate<"!Subtarget->hasVOP3PInsts()">,
- AssemblerPredicate<"!FeatureVOP3P">;
-
def HasSDWA : Predicate<"Subtarget->hasSDWA()">,
AssemblerPredicate<"FeatureSDWA,FeatureVolcanicIslands">;
-def HasSDWA9 : Predicate<"Subtarget->hasSDWA()">,
- AssemblerPredicate<"FeatureSDWA,FeatureGFX9">;
+def HasSDWA9 :
+ Predicate<"Subtarget->hasSDWA()">,
+ AssemblerPredicate<"FeatureGCN3Encoding,FeatureGFX9Insts,FeatureSDWA">;
+
+def HasSDWA10 :
+ Predicate<"Subtarget->hasSDWA()">,
+ AssemblerPredicate<"!FeatureGCN3Encoding,FeatureGFX10Insts,FeatureSDWA">;
def HasDPP : Predicate<"Subtarget->hasDPP()">,
- AssemblerPredicate<"FeatureDPP">;
+ AssemblerPredicate<"FeatureGCN3Encoding,FeatureDPP">;
+
+def HasDPP8 : Predicate<"Subtarget->hasDPP8()">,
+ AssemblerPredicate<"!FeatureGCN3Encoding,FeatureGFX10Insts,FeatureDPP8">;
def HasR128A16 : Predicate<"Subtarget->hasR128A16()">,
AssemblerPredicate<"FeatureR128A16">;
+def HasDPP16 : Predicate<"Subtarget->hasDPP()">,
+ AssemblerPredicate<"!FeatureGCN3Encoding,FeatureGFX10Insts,FeatureDPP">;
+
def HasIntClamp : Predicate<"Subtarget->hasIntClamp()">,
AssemblerPredicate<"FeatureIntClamp">;
def HasMadMixInsts : Predicate<"Subtarget->hasMadMixInsts()">,
AssemblerPredicate<"FeatureMadMixInsts">;
+def HasScalarStores : Predicate<"Subtarget->hasScalarStores()">,
+ AssemblerPredicate<"FeatureScalarStores">;
+
def HasScalarAtomics : Predicate<"Subtarget->hasScalarAtomics()">,
AssemblerPredicate<"FeatureScalarAtomics">;
+def HasNoSdstCMPX : Predicate<"Subtarget->hasNoSdstCMPX()">,
+ AssemblerPredicate<"FeatureNoSdstCMPX">;
+
+def HasSdstCMPX : Predicate<"!Subtarget->hasNoSdstCMPX()">,
+ AssemblerPredicate<"!FeatureNoSdstCMPX">;
+
def has16BankLDS : Predicate<"Subtarget->getLDSBankCount() == 16">;
def has32BankLDS : Predicate<"Subtarget->getLDSBankCount() == 32">;
def HasVGPRIndexMode : Predicate<"Subtarget->hasVGPRIndexMode()">,
@@ -773,9 +1112,35 @@ def HasFmaMixInsts : Predicate<"Subtarget->hasFmaMixInsts()">,
def HasDLInsts : Predicate<"Subtarget->hasDLInsts()">,
AssemblerPredicate<"FeatureDLInsts">;
-def HasDotInsts : Predicate<"Subtarget->hasDotInsts()">,
- AssemblerPredicate<"FeatureDotInsts">;
+def HasDot1Insts : Predicate<"Subtarget->hasDot1Insts()">,
+ AssemblerPredicate<"FeatureDot1Insts">;
+
+def HasDot2Insts : Predicate<"Subtarget->hasDot2Insts()">,
+ AssemblerPredicate<"FeatureDot2Insts">;
+
+def HasDot3Insts : Predicate<"Subtarget->hasDot3Insts()">,
+ AssemblerPredicate<"FeatureDot3Insts">;
+
+def HasDot4Insts : Predicate<"Subtarget->hasDot4Insts()">,
+ AssemblerPredicate<"FeatureDot4Insts">;
+
+def HasDot5Insts : Predicate<"Subtarget->hasDot5Insts()">,
+ AssemblerPredicate<"FeatureDot5Insts">;
+
+def HasDot6Insts : Predicate<"Subtarget->hasDot6Insts()">,
+ AssemblerPredicate<"FeatureDot6Insts">;
+
+def HasMAIInsts : Predicate<"Subtarget->hasMAIInsts()">,
+ AssemblerPredicate<"FeatureMAIInsts">;
+
+def HasPkFmacF16Inst : Predicate<"Subtarget->hasPkFmacF16Inst()">,
+ AssemblerPredicate<"FeaturePkFmacF16Inst">;
+
+def HasAtomicFaddInsts : Predicate<"Subtarget->hasAtomicFaddInsts()">,
+ AssemblerPredicate<"FeatureAtomicFaddInsts">;
+def HasOffset3fBug : Predicate<"!Subtarget->hasOffset3fBug()">,
+ AssemblerPredicate<"FeatureOffset3fBug">;
def EnableLateCFGStructurize : Predicate<
"EnableLateStructurizeCFG">;
@@ -784,7 +1149,6 @@ def EnableLateCFGStructurize : Predicate<
include "SISchedule.td"
include "GCNProcessors.td"
include "AMDGPUInstrInfo.td"
-include "SIIntrinsics.td"
include "AMDGPURegisterInfo.td"
include "AMDGPURegisterBanks.td"
include "AMDGPUInstructions.td"
diff --git a/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
index 73709ba13643..bba132c3bc46 100644
--- a/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
@@ -1,9 +1,8 @@
//===- AMDGPUAliasAnalysis ------------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
@@ -54,20 +53,21 @@ void AMDGPUAAWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesAll();
}
-// These arrays are indexed by address space value enum elements 0 ... to 6
-static const AliasResult ASAliasRules[7][7] = {
- /* Flat Global Region Group Constant Private Constant 32-bit */
- /* Flat */ {MayAlias, MayAlias, MayAlias, MayAlias, MayAlias, MayAlias, MayAlias},
- /* Global */ {MayAlias, MayAlias, NoAlias , NoAlias , MayAlias, NoAlias , MayAlias},
- /* Region */ {MayAlias, NoAlias , NoAlias , NoAlias, MayAlias, NoAlias , MayAlias},
- /* Group */ {MayAlias, NoAlias , NoAlias , MayAlias, NoAlias , NoAlias , NoAlias},
- /* Constant */ {MayAlias, MayAlias, MayAlias, NoAlias , NoAlias, NoAlias , MayAlias},
- /* Private */ {MayAlias, NoAlias , NoAlias , NoAlias , NoAlias , MayAlias, NoAlias},
- /* Constant 32-bit */ {MayAlias, MayAlias, MayAlias, NoAlias , MayAlias, NoAlias , NoAlias}
+// These arrays are indexed by address space value enum elements 0 ... to 7
+static const AliasResult ASAliasRules[8][8] = {
+ /* Flat Global Region Group Constant Private Constant 32-bit Buffer Fat Ptr */
+ /* Flat */ {MayAlias, MayAlias, NoAlias, MayAlias, MayAlias, MayAlias, MayAlias, MayAlias},
+ /* Global */ {MayAlias, MayAlias, NoAlias , NoAlias , MayAlias, NoAlias , MayAlias, MayAlias},
+ /* Region */ {NoAlias, NoAlias , MayAlias, NoAlias , NoAlias, NoAlias , NoAlias, NoAlias},
+ /* Group */ {MayAlias, NoAlias , NoAlias , MayAlias, NoAlias , NoAlias , NoAlias , NoAlias},
+ /* Constant */ {MayAlias, MayAlias, NoAlias, NoAlias , NoAlias , NoAlias , MayAlias, MayAlias},
+ /* Private */ {MayAlias, NoAlias , NoAlias , NoAlias , NoAlias , MayAlias, NoAlias , NoAlias},
+ /* Constant 32-bit */ {MayAlias, MayAlias, NoAlias, NoAlias , MayAlias, NoAlias , NoAlias , MayAlias},
+ /* Buffer Fat Ptr */ {MayAlias, MayAlias, NoAlias , NoAlias , MayAlias, NoAlias , MayAlias, MayAlias}
};
static AliasResult getAliasResult(unsigned AS1, unsigned AS2) {
- static_assert(AMDGPUAS::MAX_AMDGPU_ADDRESS <= 6, "Addr space out of range");
+ static_assert(AMDGPUAS::MAX_AMDGPU_ADDRESS <= 7, "Addr space out of range");
if (AS1 > AMDGPUAS::MAX_AMDGPU_ADDRESS || AS2 > AMDGPUAS::MAX_AMDGPU_ADDRESS)
return MayAlias;
@@ -76,7 +76,8 @@ static AliasResult getAliasResult(unsigned AS1, unsigned AS2) {
}
AliasResult AMDGPUAAResult::alias(const MemoryLocation &LocA,
- const MemoryLocation &LocB) {
+ const MemoryLocation &LocB,
+ AAQueryInfo &AAQI) {
unsigned asA = LocA.Ptr->getType()->getPointerAddressSpace();
unsigned asB = LocB.Ptr->getType()->getPointerAddressSpace();
@@ -85,11 +86,11 @@ AliasResult AMDGPUAAResult::alias(const MemoryLocation &LocA,
return Result;
// Forward the query to the next alias analysis.
- return AAResultBase::alias(LocA, LocB);
+ return AAResultBase::alias(LocA, LocB, AAQI);
}
bool AMDGPUAAResult::pointsToConstantMemory(const MemoryLocation &Loc,
- bool OrLocal) {
+ AAQueryInfo &AAQI, bool OrLocal) {
const Value *Base = GetUnderlyingObject(Loc.Ptr, DL);
unsigned AS = Base->getType()->getPointerAddressSpace();
if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
@@ -106,7 +107,7 @@ bool AMDGPUAAResult::pointsToConstantMemory(const MemoryLocation &Loc,
// Only assume constant memory for arguments on kernels.
switch (F->getCallingConv()) {
default:
- return AAResultBase::pointsToConstantMemory(Loc, OrLocal);
+ return AAResultBase::pointsToConstantMemory(Loc, AAQI, OrLocal);
case CallingConv::AMDGPU_LS:
case CallingConv::AMDGPU_HS:
case CallingConv::AMDGPU_ES:
@@ -133,5 +134,5 @@ bool AMDGPUAAResult::pointsToConstantMemory(const MemoryLocation &Loc,
return true;
}
}
- return AAResultBase::pointsToConstantMemory(Loc, OrLocal);
+ return AAResultBase::pointsToConstantMemory(Loc, AAQI, OrLocal);
}
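
Editor's aside, not part of the patch: the hunks above grow ASAliasRules from 7x7 to 8x8 to cover the new buffer-fat-pointer address space. A minimal standalone C++ sketch of the lookup pattern that table implements, with a toy rule matrix (the real entries are the ones in the diff); out-of-range address spaces fall back to MayAlias, matching the guard in getAliasResult:

// Standalone sketch, not LLVM source.
#include <cstdio>

enum AliasResult { NoAlias, MayAlias };
constexpr unsigned MaxAS = 7; // MAX_AMDGPU_ADDRESS after this patch

static AliasResult Rules[MaxAS + 1][MaxAS + 1]; // zero-initialized to NoAlias

static AliasResult getAliasResult(unsigned AS1, unsigned AS2) {
  if (AS1 > MaxAS || AS2 > MaxAS)
    return MayAlias; // unknown address space: answer conservatively
  return Rules[AS1][AS2];
}

int main() {
  // Toy fill: only identical address spaces may alias.
  for (unsigned I = 0; I <= MaxAS; ++I)
    Rules[I][I] = MayAlias;
  std::printf("%d %d\n", getAliasResult(3, 3), getAliasResult(1, 99)); // 1 1
}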
diff --git a/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h
index d76c9fc48199..fb722920900f 100644
--- a/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h
+++ b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h
@@ -1,9 +1,8 @@
//===- AMDGPUAliasAnalysis --------------------------------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
@@ -45,8 +44,10 @@ public:
/// By definition, this result is stateless and so remains valid.
bool invalidate(Function &, const PreservedAnalyses &) { return false; }
- AliasResult alias(const MemoryLocation &LocA, const MemoryLocation &LocB);
- bool pointsToConstantMemory(const MemoryLocation &Loc, bool OrLocal);
+ AliasResult alias(const MemoryLocation &LocA, const MemoryLocation &LocB,
+ AAQueryInfo &AAQI);
+ bool pointsToConstantMemory(const MemoryLocation &Loc, AAQueryInfo &AAQI,
+ bool OrLocal);
private:
bool Aliases(const MDNode *A, const MDNode *B) const;
diff --git a/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
index fc65430b745f..4c1dbd4c5304 100644
--- a/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
@@ -1,9 +1,8 @@
//===-- AMDGPUAlwaysInlinePass.cpp - Promote Allocas ----------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
index 896ac9c87779..419ebb2240ad 100644
--- a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
@@ -1,9 +1,8 @@
//===- AMDGPUAnnotateKernelFeaturesPass.cpp -------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -46,8 +45,11 @@ namespace {
class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
private:
const TargetMachine *TM = nullptr;
+ SmallVector<CallGraphNode*, 8> NodeList;
bool addFeatureAttributes(Function &F);
+ bool processUniformWorkGroupAttribute();
+ bool propagateUniformWorkGroupAttribute(Function &Caller, Function &Callee);
public:
static char ID;
@@ -186,7 +188,6 @@ static bool handleAttr(Function &Parent, const Function &Callee,
Parent.addFnAttr(Name);
return true;
}
-
return false;
}
@@ -213,6 +214,56 @@ static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
handleAttr(Parent, Callee, AttrName);
}
+bool AMDGPUAnnotateKernelFeatures::processUniformWorkGroupAttribute() {
+ bool Changed = false;
+
+ for (auto *Node : reverse(NodeList)) {
+ Function *Caller = Node->getFunction();
+
+ for (auto I : *Node) {
+ Function *Callee = std::get<1>(I)->getFunction();
+ if (Callee)
+ Changed = propagateUniformWorkGroupAttribute(*Caller, *Callee);
+ }
+ }
+
+ return Changed;
+}
+
+bool AMDGPUAnnotateKernelFeatures::propagateUniformWorkGroupAttribute(
+ Function &Caller, Function &Callee) {
+
+ // Check for externally defined function
+ if (!Callee.hasExactDefinition()) {
+ Callee.addFnAttr("uniform-work-group-size", "false");
+ if (!Caller.hasFnAttribute("uniform-work-group-size"))
+ Caller.addFnAttr("uniform-work-group-size", "false");
+
+ return true;
+ }
+ // Check if the Caller has the attribute
+ if (Caller.hasFnAttribute("uniform-work-group-size")) {
+ // Check if the value of the attribute is true
+ if (Caller.getFnAttribute("uniform-work-group-size")
+ .getValueAsString().equals("true")) {
+ // Propagate the attribute to the Callee, if it does not have it
+ if (!Callee.hasFnAttribute("uniform-work-group-size")) {
+ Callee.addFnAttr("uniform-work-group-size", "true");
+ return true;
+ }
+ } else {
+ Callee.addFnAttr("uniform-work-group-size", "false");
+ return true;
+ }
+ } else {
+ // If the attribute is absent, set it as false
+ Caller.addFnAttr("uniform-work-group-size", "false");
+ Callee.addFnAttr("uniform-work-group-size", "false");
+ return true;
+ }
+ return false;
+}
+
bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
bool HasFlat = ST.hasFlatAddressSpace();
@@ -293,15 +344,21 @@ bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
}
bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) {
- Module &M = SCC.getCallGraph().getModule();
- Triple TT(M.getTargetTriple());
-
bool Changed = false;
+
for (CallGraphNode *I : SCC) {
+ // Build a list of CallGraphNodes from most number of uses to least
+ if (I->getNumReferences())
+ NodeList.push_back(I);
+ else {
+ processUniformWorkGroupAttribute();
+ NodeList.clear();
+ }
+
Function *F = I->getFunction();
+ // Add feature attributes
if (!F || F->isDeclaration())
continue;
-
Changed |= addFeatureAttributes(*F);
}
diff --git a/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp b/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
index f88e3b0dac86..71121ade0a49 100644
--- a/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
@@ -1,9 +1,8 @@
//===-- AMDGPUAnnotateUniformValues.cpp - ---------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -14,7 +13,6 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
-#include "AMDGPUIntrinsicInfo.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
diff --git a/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp b/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
index 7465cf22b5a4..99a01ca3a2fd 100644
--- a/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
@@ -1,15 +1,15 @@
//===----------------------------------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "AMDGPUArgumentUsageInfo.h"
#include "SIRegisterInfo.h"
+#include "llvm/Support/NativeFormatting.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -27,9 +27,16 @@ void ArgDescriptor::print(raw_ostream &OS,
}
if (isRegister())
- OS << "Reg " << printReg(getRegister(), TRI) << '\n';
+ OS << "Reg " << printReg(getRegister(), TRI);
else
- OS << "Stack offset " << getStackOffset() << '\n';
+ OS << "Stack offset " << getStackOffset();
+
+ if (isMasked()) {
+ OS << " & ";
+ llvm::write_hex(OS, Mask, llvm::HexPrintStyle::PrefixLower);
+ }
+
+ OS << '\n';
}
char AMDGPUArgumentUsageInfo::ID = 0;
diff --git a/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
index f0e6d1b83f15..097730441ed8 100644
--- a/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
@@ -1,9 +1,8 @@
//==- AMDGPUArgumentUsageInfo.h - Function Arg Usage Info --------*- C++ -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -11,6 +10,7 @@
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H
#include "llvm/ADT/DenseMap.h"
+#include "llvm/CodeGen/Register.h"
#include "llvm/IR/Function.h"
#include "llvm/Pass.h"
@@ -29,22 +29,31 @@ private:
friend class AMDGPUArgumentUsageInfo;
union {
- unsigned Register;
+ Register Reg;
unsigned StackOffset;
};
+ // Bitmask to locate argument within the register.
+ unsigned Mask;
+
bool IsStack : 1;
bool IsSet : 1;
- ArgDescriptor(unsigned Val = 0, bool IsStack = false, bool IsSet = false)
- : Register(Val), IsStack(IsStack), IsSet(IsSet) {}
public:
- static ArgDescriptor createRegister(unsigned Reg) {
- return ArgDescriptor(Reg, false, true);
+ ArgDescriptor(unsigned Val = 0, unsigned Mask = ~0u,
+ bool IsStack = false, bool IsSet = false)
+ : Reg(Val), Mask(Mask), IsStack(IsStack), IsSet(IsSet) {}
+
+ static ArgDescriptor createRegister(Register Reg, unsigned Mask = ~0u) {
+ return ArgDescriptor(Reg, Mask, false, true);
+ }
+
+ static ArgDescriptor createStack(Register Reg, unsigned Mask = ~0u) {
+ return ArgDescriptor(Reg, Mask, true, true);
}
- static ArgDescriptor createStack(unsigned Reg) {
- return ArgDescriptor(Reg, true, true);
+ static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask) {
+ return ArgDescriptor(Arg.Reg, Mask, Arg.IsStack, Arg.IsSet);
}
bool isSet() const {
@@ -59,9 +68,9 @@ public:
return !IsStack;
}
- unsigned getRegister() const {
+ Register getRegister() const {
assert(!IsStack);
- return Register;
+ return Reg;
}
unsigned getStackOffset() const {
@@ -69,6 +78,14 @@ public:
return StackOffset;
}
+ unsigned getMask() const {
+ return Mask;
+ }
+
+ bool isMasked() const {
+ return Mask != ~0u;
+ }
+
void print(raw_ostream &OS, const TargetRegisterInfo *TRI = nullptr) const;
};
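The new Mask field above lets a single physical register carry several packed arguments, and isMasked() flags descriptors that only own part of the register. Below is a minimal sketch of the idea; the 10-bit field layout and the PackedArg/extractField names are assumptions for illustration only, not taken from this patch.

// Sketch of locating a sub-field of a packed register argument, e.g. several
// workitem IDs sharing one 32-bit register in 10-bit fields.
#include <cassert>
#include <cstdint>

struct PackedArg {
  unsigned Reg;       // physical register holding the packed value
  uint32_t Mask;      // bits of Reg that belong to this argument
  bool isMasked() const { return Mask != ~0u; }
};

static uint32_t extractField(uint32_t RegValue, uint32_t Mask) {
  // Shift the field down by the position of the mask's lowest set bit.
  unsigned Shift = __builtin_ctz(Mask);
  return (RegValue & Mask) >> Shift;
}

int main() {
  PackedArg IdX{0, 0x000003ffu};        // bits [9:0]
  PackedArg IdY{0, 0x000ffc00u};        // bits [19:10]
  uint32_t RegValue = (7u << 10) | 5u;  // y = 7, x = 5 packed together
  assert(IdX.isMasked() && extractField(RegValue, IdX.Mask) == 5);
  assert(IdY.isMasked() && extractField(RegValue, IdY.Mask) == 7);
}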
diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 2ded7cdb6489..743ac64b8f10 100644
--- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -1,9 +1,8 @@
//===-- AMDGPUAsmPrinter.cpp - AMDGPU assembly printer -------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -20,7 +19,7 @@
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
-#include "InstPrinter/AMDGPUInstPrinter.h"
+#include "MCTargetDesc/AMDGPUInstPrinter.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "MCTargetDesc/AMDGPUTargetStreamer.h"
#include "R600AsmPrinter.h"
@@ -31,10 +30,12 @@
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
+#include "TargetInfo/AMDGPUTargetInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
@@ -100,7 +101,7 @@ extern "C" void LLVMInitializeAMDGPUAsmPrinter() {
AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM,
std::unique_ptr<MCStreamer> Streamer)
: AsmPrinter(TM, std::move(Streamer)) {
- if (IsaInfo::hasCodeObjectV3(getSTI()))
+ if (IsaInfo::hasCodeObjectV3(getGlobalSTI()))
HSAMetadataStream.reset(new MetadataStreamerV3());
else
HSAMetadataStream.reset(new MetadataStreamerV2());
@@ -110,7 +111,7 @@ StringRef AMDGPUAsmPrinter::getPassName() const {
return "AMDGPU Assembly Printer";
}
-const MCSubtargetInfo* AMDGPUAsmPrinter::getSTI() const {
+const MCSubtargetInfo *AMDGPUAsmPrinter::getGlobalSTI() const {
return TM.getMCSubtargetInfo();
}
@@ -121,10 +122,10 @@ AMDGPUTargetStreamer* AMDGPUAsmPrinter::getTargetStreamer() const {
}
void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) {
- if (IsaInfo::hasCodeObjectV3(getSTI())) {
+ if (IsaInfo::hasCodeObjectV3(getGlobalSTI())) {
std::string ExpectedTarget;
raw_string_ostream ExpectedTargetOS(ExpectedTarget);
- IsaInfo::streamIsaVersion(getSTI(), ExpectedTargetOS);
+ IsaInfo::streamIsaVersion(getGlobalSTI(), ExpectedTargetOS);
getTargetStreamer()->EmitDirectiveAMDGCNTarget(ExpectedTarget);
}
@@ -137,9 +138,9 @@ void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) {
HSAMetadataStream->begin(M);
if (TM.getTargetTriple().getOS() == Triple::AMDPAL)
- readPALMetadata(M);
+ getTargetStreamer()->getPALMetadata()->readFromIR(M);
- if (IsaInfo::hasCodeObjectV3(getSTI()))
+ if (IsaInfo::hasCodeObjectV3(getGlobalSTI()))
return;
// HSA emits NT_AMDGPU_HSA_CODE_OBJECT_VERSION for code objects v2.
@@ -147,7 +148,7 @@ void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) {
getTargetStreamer()->EmitDirectiveHSACodeObjectVersion(2, 1);
// HSA and PAL emit NT_AMDGPU_HSA_ISA for code objects v2.
- IsaVersion Version = getIsaVersion(getSTI()->getCPU());
+ IsaVersion Version = getIsaVersion(getGlobalSTI()->getCPU());
getTargetStreamer()->EmitDirectiveHSACodeObjectISA(
Version.Major, Version.Minor, Version.Stepping, "AMD", "AMDGPU");
}
@@ -157,11 +158,11 @@ void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) {
if (!getTargetStreamer())
return;
- if (!IsaInfo::hasCodeObjectV3(getSTI())) {
+ if (!IsaInfo::hasCodeObjectV3(getGlobalSTI())) {
// Emit ISA Version (NT_AMD_AMDGPU_ISA).
std::string ISAVersionString;
raw_string_ostream ISAVersionStream(ISAVersionString);
- IsaInfo::streamIsaVersion(getSTI(), ISAVersionStream);
+ IsaInfo::streamIsaVersion(getGlobalSTI(), ISAVersionStream);
getTargetStreamer()->EmitISAVersion(ISAVersionStream.str());
}
@@ -172,20 +173,6 @@ void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) {
(void)Success;
assert(Success && "Malformed HSA Metadata");
}
-
- if (!IsaInfo::hasCodeObjectV3(getSTI())) {
- // Emit PAL Metadata (NT_AMD_AMDGPU_PAL_METADATA).
- if (TM.getTargetTriple().getOS() == Triple::AMDPAL) {
- // Copy the PAL metadata from the map where we collected it into a vector,
- // then write it as a .note.
- PALMD::Metadata PALMetadataVector;
- for (auto i : PALMetadataMap) {
- PALMetadataVector.push_back(i.first);
- PALMetadataVector.push_back(i.second);
- }
- getTargetStreamer()->EmitPALMetadata(PALMetadataVector);
- }
- }
}
bool AMDGPUAsmPrinter::isBlockOnlyReachableByFallthrough(
@@ -225,7 +212,8 @@ void AMDGPUAsmPrinter::EmitFunctionBodyEnd() {
const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
if (!MFI.isEntryFunction())
return;
- if (!IsaInfo::hasCodeObjectV3(getSTI()) ||
+
+ if (!IsaInfo::hasCodeObjectV3(getGlobalSTI()) ||
TM.getTargetTriple().getOS() != Triple::AMDHSA)
return;
@@ -243,23 +231,25 @@ void AMDGPUAsmPrinter::EmitFunctionBodyEnd() {
if (ReadOnlySection.getAlignment() < 64)
ReadOnlySection.setAlignment(64);
+ const MCSubtargetInfo &STI = MF->getSubtarget();
+
SmallString<128> KernelName;
getNameWithPrefix(KernelName, &MF->getFunction());
getTargetStreamer()->EmitAmdhsaKernelDescriptor(
- *getSTI(), KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo),
+ STI, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo),
CurrentProgramInfo.NumVGPRsForWavesPerEU,
CurrentProgramInfo.NumSGPRsForWavesPerEU -
- IsaInfo::getNumExtraSGPRs(getSTI(),
+ IsaInfo::getNumExtraSGPRs(&STI,
CurrentProgramInfo.VCCUsed,
CurrentProgramInfo.FlatUsed),
CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
- hasXNACK(*getSTI()));
+ hasXNACK(STI));
Streamer.PopSection();
}
void AMDGPUAsmPrinter::EmitFunctionEntryLabel() {
- if (IsaInfo::hasCodeObjectV3(getSTI()) &&
+ if (IsaInfo::hasCodeObjectV3(getGlobalSTI()) &&
TM.getTargetTriple().getOS() == Triple::AMDHSA) {
AsmPrinter::EmitFunctionEntryLabel();
return;
@@ -273,8 +263,7 @@ void AMDGPUAsmPrinter::EmitFunctionEntryLabel() {
getTargetStreamer()->EmitAMDGPUSymbolType(
SymbolName, ELF::STT_AMDGPU_HSA_KERNEL);
}
- const GCNSubtarget &STI = MF->getSubtarget<GCNSubtarget>();
- if (STI.dumpCode()) {
+ if (DumpCodeInstEmitter) {
// Disassemble function name label to text.
DisasmLines.push_back(MF->getName().str() + ":");
DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
@@ -285,8 +274,7 @@ void AMDGPUAsmPrinter::EmitFunctionEntryLabel() {
}
void AMDGPUAsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) const {
- const GCNSubtarget &STI = MBB.getParent()->getSubtarget<GCNSubtarget>();
- if (STI.dumpCode() && !isBlockOnlyReachableByFallthrough(&MBB)) {
+ if (DumpCodeInstEmitter && !isBlockOnlyReachableByFallthrough(&MBB)) {
// Write a line for the basic block label if it is not only fallthrough.
DisasmLines.push_back(
(Twine("BB") + Twine(getFunctionNumber())
@@ -298,38 +286,57 @@ void AMDGPUAsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) const {
}
void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
+ if (GV->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+ if (GV->hasInitializer() && !isa<UndefValue>(GV->getInitializer())) {
+ OutContext.reportError({},
+ Twine(GV->getName()) +
+ ": unsupported initializer for address space");
+ return;
+ }
+
+ // LDS variables aren't emitted in HSA or PAL yet.
+ const Triple::OSType OS = TM.getTargetTriple().getOS();
+ if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
+ return;
- // Group segment variables aren't emitted in HSA.
- if (AMDGPU::isGroupSegment(GV))
+ MCSymbol *GVSym = getSymbol(GV);
+
+ GVSym->redefineIfPossible();
+ if (GVSym->isDefined() || GVSym->isVariable())
+ report_fatal_error("symbol '" + Twine(GVSym->getName()) +
+ "' is already defined");
+
+ const DataLayout &DL = GV->getParent()->getDataLayout();
+ uint64_t Size = DL.getTypeAllocSize(GV->getValueType());
+ unsigned Align = GV->getAlignment();
+ if (!Align)
+ Align = 4;
+
+ EmitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration());
+ EmitLinkage(GV, GVSym);
+ if (auto TS = getTargetStreamer())
+ TS->emitAMDGPULDS(GVSym, Size, Align);
return;
+ }
AsmPrinter::EmitGlobalVariable(GV);
}
bool AMDGPUAsmPrinter::doFinalization(Module &M) {
CallGraphResourceInfo.clear();
- return AsmPrinter::doFinalization(M);
-}
-// For the amdpal OS type, read the amdgpu.pal.metadata supplied by the
-// frontend into our PALMetadataMap, ready for per-function modification. It
-// is a NamedMD containing an MDTuple containing a number of MDNodes each of
-// which is an integer value, and each two integer values forms a key=value
-// pair that we store as PALMetadataMap[key]=value in the map.
-void AMDGPUAsmPrinter::readPALMetadata(Module &M) {
- auto NamedMD = M.getNamedMetadata("amdgpu.pal.metadata");
- if (!NamedMD || !NamedMD->getNumOperands())
- return;
- auto Tuple = dyn_cast<MDTuple>(NamedMD->getOperand(0));
- if (!Tuple)
- return;
- for (unsigned I = 0, E = Tuple->getNumOperands() & -2; I != E; I += 2) {
- auto Key = mdconst::dyn_extract<ConstantInt>(Tuple->getOperand(I));
- auto Val = mdconst::dyn_extract<ConstantInt>(Tuple->getOperand(I + 1));
- if (!Key || !Val)
- continue;
- PALMetadataMap[Key->getZExtValue()] = Val->getZExtValue();
+ // Pad with s_code_end to help tools and guard against instruction prefetch
+ // causing stale data in caches. Arguably this should be done by the linker,
+ // which is why this isn't done for Mesa.
+ const MCSubtargetInfo &STI = *getGlobalSTI();
+ if (AMDGPU::isGFX10(STI) &&
+ (STI.getTargetTriple().getOS() == Triple::AMDHSA ||
+ STI.getTargetTriple().getOS() == Triple::AMDPAL)) {
+ OutStreamer->SwitchSection(getObjFileLowering().getTextSection());
+ getTargetStreamer()->EmitCodeEnd();
}
+
+ return AsmPrinter::doFinalization(M);
}
// Print comments that apply to both callable functions and entry points.
@@ -376,6 +383,10 @@ uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
KernelCodeProperties |=
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
}
+ if (MF.getSubtarget<GCNSubtarget>().isWave32()) {
+ KernelCodeProperties |=
+ amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
+ }
return KernelCodeProperties;
}
@@ -435,6 +446,18 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
EmitProgramInfoSI(MF, CurrentProgramInfo);
}
+ DumpCodeInstEmitter = nullptr;
+ if (STM.dumpCode()) {
+ // For -dumpcode, get the assembler out of the streamer, even if it does
+ // not really want to let us have it. This only works with -filetype=obj.
+ bool SaveFlag = OutStreamer->getUseAssemblerInfoForParsing();
+ OutStreamer->setUseAssemblerInfoForParsing(true);
+ MCAssembler *Assembler = OutStreamer->getAssemblerPtr();
+ OutStreamer->setUseAssemblerInfoForParsing(SaveFlag);
+ if (Assembler)
+ DumpCodeInstEmitter = Assembler->getEmitterPtr();
+ }
+
DisasmLines.clear();
HexLines.clear();
DisasmLineMaxLen = 0;
@@ -486,15 +509,6 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
OutStreamer->emitRawComment(
" WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);
- if (MF.getSubtarget<GCNSubtarget>().debuggerEmitPrologue()) {
- OutStreamer->emitRawComment(
- " DebuggerWavefrontPrivateSegmentOffsetSGPR: s" +
- Twine(CurrentProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR), false);
- OutStreamer->emitRawComment(
- " DebuggerPrivateSegmentBufferSGPR: s" +
- Twine(CurrentProgramInfo.DebuggerPrivateSegmentBufferSGPR), false);
- }
-
OutStreamer->emitRawComment(
" COMPUTE_PGM_RSRC2:USER_SGPR: " +
Twine(G_00B84C_USER_SGPR(CurrentProgramInfo.ComputePGMRSrc2)), false);
@@ -516,7 +530,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
false);
}
- if (STM.dumpCode()) {
+ if (DumpCodeInstEmitter) {
OutStreamer->SwitchSection(
Context.getELFSection(".AMDGPU.disasm", ELF::SHT_NOTE, 0));
@@ -620,6 +634,11 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
HighestVGPRReg = Reg;
break;
}
+ MCPhysReg AReg = AMDGPU::AGPR0 + TRI.getHWRegIndex(Reg);
+ if (MRI.isPhysRegUsed(AReg)) {
+ HighestVGPRReg = AReg;
+ break;
+ }
}
MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
@@ -665,8 +684,12 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
case AMDGPU::SRC_SHARED_LIMIT:
case AMDGPU::SRC_PRIVATE_BASE:
case AMDGPU::SRC_PRIVATE_LIMIT:
+ case AMDGPU::SGPR_NULL:
continue;
+ case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
+ llvm_unreachable("src_pops_exiting_wave_id should not be used");
+
case AMDGPU::NoRegister:
assert(MI.isDebugInstr());
continue;
@@ -687,6 +710,9 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
case AMDGPU::XNACK_MASK_HI:
llvm_unreachable("xnack_mask registers should not be used");
+ case AMDGPU::LDS_DIRECT:
+ llvm_unreachable("lds_direct register should not be used");
+
case AMDGPU::TBA:
case AMDGPU::TBA_LO:
case AMDGPU::TBA_HI:
@@ -695,6 +721,15 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
case AMDGPU::TMA_HI:
llvm_unreachable("trap handler registers should not be used");
+ case AMDGPU::SRC_VCCZ:
+ llvm_unreachable("src_vccz register should not be used");
+
+ case AMDGPU::SRC_EXECZ:
+ llvm_unreachable("src_execz register should not be used");
+
+ case AMDGPU::SRC_SCC:
+ llvm_unreachable("src_scc register should not be used");
+
default:
break;
}
@@ -707,6 +742,9 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
} else if (AMDGPU::VGPR_32RegClass.contains(Reg)) {
IsSGPR = false;
Width = 1;
+ } else if (AMDGPU::AGPR_32RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 1;
} else if (AMDGPU::SReg_64RegClass.contains(Reg)) {
assert(!AMDGPU::TTMP_64RegClass.contains(Reg) &&
"trap handler registers should not be used");
@@ -715,9 +753,14 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
} else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
IsSGPR = false;
Width = 2;
+ } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 2;
} else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
IsSGPR = false;
Width = 3;
+ } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
+ Width = 3;
} else if (AMDGPU::SReg_128RegClass.contains(Reg)) {
assert(!AMDGPU::TTMP_128RegClass.contains(Reg) &&
"trap handler registers should not be used");
@@ -726,6 +769,9 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
} else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
IsSGPR = false;
Width = 4;
+ } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 4;
} else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
assert(!AMDGPU::TTMP_256RegClass.contains(Reg) &&
"trap handler registers should not be used");
@@ -742,6 +788,18 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
} else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
IsSGPR = false;
Width = 16;
+ } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 16;
+ } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
+ IsSGPR = true;
+ Width = 32;
+ } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 32;
+ } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 32;
} else {
llvm_unreachable("Unknown register class");
}
@@ -767,8 +825,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
// 48 SGPRs - vcc, - flat_scr, -xnack
int MaxSGPRGuess =
- 47 - IsaInfo::getNumExtraSGPRs(getSTI(), true,
- ST.hasFlatAddressSpace());
+ 47 - IsaInfo::getNumExtraSGPRs(&ST, true, ST.hasFlatAddressSpace());
MaxSGPR = std::max(MaxSGPR, MaxSGPRGuess);
MaxVGPR = std::max(MaxVGPR, 23);
@@ -779,9 +836,19 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
} else {
// We force CodeGen to run in SCC order, so the callee's register
// usage etc. should be the cumulative usage of all callees.
+
auto I = CallGraphResourceInfo.find(Callee);
- assert(I != CallGraphResourceInfo.end() &&
- "callee should have been handled before caller");
+ if (I == CallGraphResourceInfo.end()) {
+ // Avoid crashing on undefined behavior with an illegal call to a
+ // kernel. If a callsite's calling convention doesn't match the
+ // function's, it's undefined behavior. If the callsite calling
+ // convention does match, that would have errored earlier.
+ // FIXME: The verifier shouldn't allow this.
+ if (AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
+ report_fatal_error("invalid call to entry function");
+
+ llvm_unreachable("callee should have been handled before caller");
+ }
MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
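The lookup above relies on callees being processed before their callers, so a caller's register high-water mark can be folded in from CallGraphResourceInfo. A small self-contained sketch of that bottom-up max, with made-up register counts:

// Bottom-up resource propagation: the caller's highest used register index is
// the max of its own uses and every callee's usage (counts converted to the
// highest index by subtracting one).
#include <algorithm>
#include <cstdio>
#include <map>
#include <string>

struct ResourceInfo { int NumExplicitSGPR; int NumVGPR; };

int main() {
  std::map<std::string, ResourceInfo> CallGraphResourceInfo = {
      {"leaf_a", {24, 12}}, {"leaf_b", {40, 36}}};

  int MaxSGPR = 15, MaxVGPR = 7;   // caller's own highest used register indices
  for (const char *Callee : {"leaf_a", "leaf_b"}) {
    const ResourceInfo &RI = CallGraphResourceInfo.at(Callee);
    MaxSGPR = std::max(RI.NumExplicitSGPR - 1, MaxSGPR);
    MaxVGPR = std::max(RI.NumVGPR - 1, MaxVGPR);
  }
  std::printf("caller needs %d SGPRs, %d VGPRs\n", MaxSGPR + 1, MaxVGPR + 1);
}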
@@ -825,14 +892,12 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- const SIInstrInfo *TII = STM.getInstrInfo();
- const SIRegisterInfo *RI = &TII->getRegisterInfo();
// TODO(scott.linder): The calculations related to SGPR/VGPR blocks are
// duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
// unified.
unsigned ExtraSGPRs = IsaInfo::getNumExtraSGPRs(
- getSTI(), ProgInfo.VCCUsed, ProgInfo.FlatUsed);
+ &STM, ProgInfo.VCCUsed, ProgInfo.FlatUsed);
// Check the addressable register limit before we add ExtraSGPRs.
if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
@@ -918,24 +983,15 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
ProgInfo.VGPRBlocks = IsaInfo::getNumVGPRBlocks(
&STM, ProgInfo.NumVGPRsForWavesPerEU);
- // Update DebuggerWavefrontPrivateSegmentOffsetSGPR and
- // DebuggerPrivateSegmentBufferSGPR fields if "amdgpu-debugger-emit-prologue"
- // attribute was requested.
- if (STM.debuggerEmitPrologue()) {
- ProgInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR =
- RI->getHWRegIndex(MFI->getScratchWaveOffsetReg());
- ProgInfo.DebuggerPrivateSegmentBufferSGPR =
- RI->getHWRegIndex(MFI->getScratchRSrcReg());
- }
-
// Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
// register.
ProgInfo.FloatMode = getFPMode(MF);
- ProgInfo.IEEEMode = STM.enableIEEEBit(MF);
+ const SIModeRegisterDefaults Mode = MFI->getMode();
+ ProgInfo.IEEEMode = Mode.IEEE;
// Make clamp modifier on NaN input returns 0.
- ProgInfo.DX10Clamp = STM.enableDX10Clamp();
+ ProgInfo.DX10Clamp = Mode.DX10Clamp;
unsigned LDSAlignShift;
if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
@@ -963,6 +1019,11 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
1ULL << ScratchAlignShift) >>
ScratchAlignShift;
+ if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
+ ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1;
+ ProgInfo.MemOrdered = 1;
+ }
+
ProgInfo.ComputePGMRSrc1 =
S_00B848_VGPRS(ProgInfo.VGPRBlocks) |
S_00B848_SGPRS(ProgInfo.SGPRBlocks) |
@@ -971,7 +1032,9 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
S_00B848_PRIV(ProgInfo.Priv) |
S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp) |
S_00B848_DEBUG_MODE(ProgInfo.DebugMode) |
- S_00B848_IEEE_MODE(ProgInfo.IEEEMode);
+ S_00B848_IEEE_MODE(ProgInfo.IEEEMode) |
+ S_00B848_WGP_MODE(ProgInfo.WgpMode) |
+ S_00B848_MEM_ORDERED(ProgInfo.MemOrdered);
// 0 = X, 1 = XY, 2 = XYZ
unsigned TIDIGCompCnt = 0;
@@ -1053,71 +1116,38 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
// This is the equivalent of EmitProgramInfoSI above, but for when the OS type
// is AMDPAL. It stores each compute/SPI register setting and other PAL
-// metadata items into the PALMetadataMap, combining with any provided by the
-// frontend as LLVM metadata. Once all functions are written, PALMetadataMap is
-// then written as a single block in the .note section.
+// metadata items into the PALMD::Metadata, combining with any provided by the
+// frontend as LLVM metadata. Once all functions are written, the PAL metadata
+// is then written as a single block in the .note section.
void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
const SIProgramInfo &CurrentProgramInfo) {
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- // Given the calling convention, calculate the register number for rsrc1. In
- // principle the register number could change in future hardware, but we know
- // it is the same for gfx6-9 (except that LS and ES don't exist on gfx9), so
- // we can use the same fixed value that .AMDGPU.config has for Mesa. Note
- // that we use a register number rather than a byte offset, so we need to
- // divide by 4.
- unsigned Rsrc1Reg = getRsrcReg(MF.getFunction().getCallingConv()) / 4;
- unsigned Rsrc2Reg = Rsrc1Reg + 1;
- // Also calculate the PAL metadata key for *S_SCRATCH_SIZE. It can be used
- // with a constant offset to access any non-register shader-specific PAL
- // metadata key.
- unsigned ScratchSizeKey = PALMD::Key::CS_SCRATCH_SIZE;
- switch (MF.getFunction().getCallingConv()) {
- case CallingConv::AMDGPU_PS:
- ScratchSizeKey = PALMD::Key::PS_SCRATCH_SIZE;
- break;
- case CallingConv::AMDGPU_VS:
- ScratchSizeKey = PALMD::Key::VS_SCRATCH_SIZE;
- break;
- case CallingConv::AMDGPU_GS:
- ScratchSizeKey = PALMD::Key::GS_SCRATCH_SIZE;
- break;
- case CallingConv::AMDGPU_ES:
- ScratchSizeKey = PALMD::Key::ES_SCRATCH_SIZE;
- break;
- case CallingConv::AMDGPU_HS:
- ScratchSizeKey = PALMD::Key::HS_SCRATCH_SIZE;
- break;
- case CallingConv::AMDGPU_LS:
- ScratchSizeKey = PALMD::Key::LS_SCRATCH_SIZE;
- break;
- }
- unsigned NumUsedVgprsKey = ScratchSizeKey +
- PALMD::Key::VS_NUM_USED_VGPRS - PALMD::Key::VS_SCRATCH_SIZE;
- unsigned NumUsedSgprsKey = ScratchSizeKey +
- PALMD::Key::VS_NUM_USED_SGPRS - PALMD::Key::VS_SCRATCH_SIZE;
- PALMetadataMap[NumUsedVgprsKey] = CurrentProgramInfo.NumVGPRsForWavesPerEU;
- PALMetadataMap[NumUsedSgprsKey] = CurrentProgramInfo.NumSGPRsForWavesPerEU;
+ auto CC = MF.getFunction().getCallingConv();
+ auto MD = getTargetStreamer()->getPALMetadata();
+
+ MD->setEntryPoint(CC, MF.getFunction().getName());
+ MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU);
+ MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU);
if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
- PALMetadataMap[Rsrc1Reg] |= CurrentProgramInfo.ComputePGMRSrc1;
- PALMetadataMap[Rsrc2Reg] |= CurrentProgramInfo.ComputePGMRSrc2;
- // ScratchSize is in bytes, 16 aligned.
- PALMetadataMap[ScratchSizeKey] |=
- alignTo(CurrentProgramInfo.ScratchSize, 16);
+ MD->setRsrc1(CC, CurrentProgramInfo.ComputePGMRSrc1);
+ MD->setRsrc2(CC, CurrentProgramInfo.ComputePGMRSrc2);
} else {
- PALMetadataMap[Rsrc1Reg] |= S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) |
- S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks);
+ MD->setRsrc1(CC, S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) |
+ S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks));
if (CurrentProgramInfo.ScratchBlocks > 0)
- PALMetadataMap[Rsrc2Reg] |= S_00B84C_SCRATCH_EN(1);
- // ScratchSize is in bytes, 16 aligned.
- PALMetadataMap[ScratchSizeKey] |=
- alignTo(CurrentProgramInfo.ScratchSize, 16);
+ MD->setRsrc2(CC, S_00B84C_SCRATCH_EN(1));
}
+  // ScratchSize is in bytes, aligned to 16.
+ MD->setScratchSize(CC, alignTo(CurrentProgramInfo.ScratchSize, 16));
if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
- PALMetadataMap[Rsrc2Reg] |=
- S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks);
- PALMetadataMap[R_0286CC_SPI_PS_INPUT_ENA / 4] |= MFI->getPSInputEnable();
- PALMetadataMap[R_0286D0_SPI_PS_INPUT_ADDR / 4] |= MFI->getPSInputAddr();
+ MD->setRsrc2(CC, S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks));
+ MD->setSpiPsInputEna(MFI->getPSInputEnable());
+ MD->setSpiPsInputAddr(MFI->getPSInputAddr());
}
+
+ const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
+ if (STM.isWave32())
+ MD->setWave32(MF.getFunction().getCallingConv());
}
// This is supposed to be log2(Size)
@@ -1144,12 +1174,12 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
- AMDGPU::initDefaultAMDKernelCodeT(Out, getSTI());
+ AMDGPU::initDefaultAMDKernelCodeT(Out, &STM);
Out.compute_pgm_resource_registers =
CurrentProgramInfo.ComputePGMRSrc1 |
(CurrentProgramInfo.ComputePGMRSrc2 << 32);
- Out.code_properties = AMD_CODE_PROPERTY_IS_PTR64;
+ Out.code_properties |= AMD_CODE_PROPERTY_IS_PTR64;
if (CurrentProgramInfo.DynamicCallStack)
Out.code_properties |= AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK;
@@ -1181,9 +1211,6 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
if (MFI->hasDispatchPtr())
Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
- if (STM.debuggerSupported())
- Out.code_properties |= AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED;
-
if (STM.isXNACKEnabled())
Out.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED;
@@ -1196,22 +1223,14 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
// These alignment values are specified in powers of two, so alignment =
// 2^n. The minimum alignment is 2^4 = 16.
- Out.kernarg_segment_alignment = std::max((size_t)4,
+ Out.kernarg_segment_alignment = std::max<size_t>(4,
countTrailingZeros(MaxKernArgAlign));
-
- if (STM.debuggerEmitPrologue()) {
- Out.debug_wavefront_private_segment_offset_sgpr =
- CurrentProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR;
- Out.debug_private_segment_buffer_sgpr =
- CurrentProgramInfo.DebuggerPrivateSegmentBufferSGPR;
- }
}
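The kernarg_segment_alignment computation above stores log2 of the alignment with a floor of 4 (16 bytes). A tiny worked example, assuming MaxKernArgAlign is a non-zero power of two:

// The field stores log2(alignment), clamped to a minimum of 4, i.e. 2^4 = 16.
#include <cassert>
#include <cstdint>

static unsigned encodeKernargAlign(uint32_t MaxKernArgAlign) {
  unsigned Log2 = __builtin_ctz(MaxKernArgAlign);   // countTrailingZeros
  return Log2 < 4 ? 4u : Log2;
}

int main() {
  assert(encodeKernargAlign(8) == 4);    // below the minimum: clamp to 2^4 = 16
  assert(encodeKernargAlign(16) == 4);
  assert(encodeKernargAlign(64) == 6);   // 2^6 = 64-byte alignment
}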
bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
- unsigned AsmVariant,
const char *ExtraCode, raw_ostream &O) {
// First try the generic code, which knows about modifiers like 'c' and 'n'.
- if (!AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O))
+ if (!AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O))
return false;
if (ExtraCode && ExtraCode[0]) {
diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index 167ac4b21e1e..cf77034329ef 100644
--- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -1,9 +1,8 @@
//===-- AMDGPUAsmPrinter.h - Print AMDGPU assembly code ---------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -33,6 +32,7 @@ namespace llvm {
class AMDGPUMachineFunction;
class AMDGPUTargetStreamer;
+class MCCodeEmitter;
class MCOperand;
class GCNSubtarget;
@@ -57,12 +57,12 @@ private:
DenseMap<const Function *, SIFunctionResourceInfo> CallGraphResourceInfo;
std::unique_ptr<AMDGPU::HSAMD::MetadataStreamer> HSAMetadataStream;
- std::map<uint32_t, uint32_t> PALMetadataMap;
+
+ MCCodeEmitter *DumpCodeInstEmitter = nullptr;
uint64_t getFunctionCodeSize(const MachineFunction &MF) const;
SIFunctionResourceInfo analyzeResourceUsage(const MachineFunction &MF) const;
- void readPALMetadata(Module &M);
void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF);
void getAmdKernelCode(amd_kernel_code_t &Out, const SIProgramInfo &KernelInfo,
const MachineFunction &MF) const;
@@ -95,7 +95,7 @@ public:
StringRef getPassName() const override;
- const MCSubtargetInfo* getSTI() const;
+ const MCSubtargetInfo* getGlobalSTI() const;
AMDGPUTargetStreamer* getTargetStreamer() const;
@@ -137,8 +137,7 @@ public:
const MachineBasicBlock *MBB) const override;
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
- unsigned AsmVariant, const char *ExtraCode,
- raw_ostream &O) override;
+ const char *ExtraCode, raw_ostream &O) override;
protected:
mutable std::vector<std::string> DisasmLines, HexLines;
diff --git a/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index 644e4fd558ba..8a92e7d923fb 100644
--- a/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -1,9 +1,8 @@
//===-- AMDGPUAtomicOptimizer.cpp -----------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -31,6 +30,7 @@ namespace {
enum DPP_CTRL {
DPP_ROW_SR1 = 0x111,
DPP_ROW_SR2 = 0x112,
+ DPP_ROW_SR3 = 0x113,
DPP_ROW_SR4 = 0x114,
DPP_ROW_SR8 = 0x118,
DPP_WF_SR1 = 0x138,
@@ -40,7 +40,7 @@ enum DPP_CTRL {
struct ReplacementInfo {
Instruction *I;
- Instruction::BinaryOps Op;
+ AtomicRMWInst::BinOp Op;
unsigned ValIdx;
bool ValDivergent;
};
@@ -55,10 +55,8 @@ private:
bool HasDPP;
bool IsPixelShader;
- void optimizeAtomic(Instruction &I, Instruction::BinaryOps Op,
- unsigned ValIdx, bool ValDivergent) const;
-
- void setConvergent(CallInst *const CI) const;
+ void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx,
+ bool ValDivergent) const;
public:
static char ID;
@@ -122,16 +120,20 @@ void AMDGPUAtomicOptimizer::visitAtomicRMWInst(AtomicRMWInst &I) {
break;
}
- Instruction::BinaryOps Op;
+ AtomicRMWInst::BinOp Op = I.getOperation();
- switch (I.getOperation()) {
+ switch (Op) {
default:
return;
case AtomicRMWInst::Add:
- Op = Instruction::Add;
- break;
case AtomicRMWInst::Sub:
- Op = Instruction::Sub;
+ case AtomicRMWInst::And:
+ case AtomicRMWInst::Or:
+ case AtomicRMWInst::Xor:
+ case AtomicRMWInst::Max:
+ case AtomicRMWInst::Min:
+ case AtomicRMWInst::UMax:
+ case AtomicRMWInst::UMin:
break;
}
@@ -163,7 +165,7 @@ void AMDGPUAtomicOptimizer::visitAtomicRMWInst(AtomicRMWInst &I) {
}
void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) {
- Instruction::BinaryOps Op;
+ AtomicRMWInst::BinOp Op;
switch (I.getIntrinsicID()) {
default:
@@ -171,12 +173,47 @@ void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) {
case Intrinsic::amdgcn_buffer_atomic_add:
case Intrinsic::amdgcn_struct_buffer_atomic_add:
case Intrinsic::amdgcn_raw_buffer_atomic_add:
- Op = Instruction::Add;
+ Op = AtomicRMWInst::Add;
break;
case Intrinsic::amdgcn_buffer_atomic_sub:
case Intrinsic::amdgcn_struct_buffer_atomic_sub:
case Intrinsic::amdgcn_raw_buffer_atomic_sub:
- Op = Instruction::Sub;
+ Op = AtomicRMWInst::Sub;
+ break;
+ case Intrinsic::amdgcn_buffer_atomic_and:
+ case Intrinsic::amdgcn_struct_buffer_atomic_and:
+ case Intrinsic::amdgcn_raw_buffer_atomic_and:
+ Op = AtomicRMWInst::And;
+ break;
+ case Intrinsic::amdgcn_buffer_atomic_or:
+ case Intrinsic::amdgcn_struct_buffer_atomic_or:
+ case Intrinsic::amdgcn_raw_buffer_atomic_or:
+ Op = AtomicRMWInst::Or;
+ break;
+ case Intrinsic::amdgcn_buffer_atomic_xor:
+ case Intrinsic::amdgcn_struct_buffer_atomic_xor:
+ case Intrinsic::amdgcn_raw_buffer_atomic_xor:
+ Op = AtomicRMWInst::Xor;
+ break;
+ case Intrinsic::amdgcn_buffer_atomic_smin:
+ case Intrinsic::amdgcn_struct_buffer_atomic_smin:
+ case Intrinsic::amdgcn_raw_buffer_atomic_smin:
+ Op = AtomicRMWInst::Min;
+ break;
+ case Intrinsic::amdgcn_buffer_atomic_umin:
+ case Intrinsic::amdgcn_struct_buffer_atomic_umin:
+ case Intrinsic::amdgcn_raw_buffer_atomic_umin:
+ Op = AtomicRMWInst::UMin;
+ break;
+ case Intrinsic::amdgcn_buffer_atomic_smax:
+ case Intrinsic::amdgcn_struct_buffer_atomic_smax:
+ case Intrinsic::amdgcn_raw_buffer_atomic_smax:
+ Op = AtomicRMWInst::Max;
+ break;
+ case Intrinsic::amdgcn_buffer_atomic_umax:
+ case Intrinsic::amdgcn_struct_buffer_atomic_umax:
+ case Intrinsic::amdgcn_raw_buffer_atomic_umax:
+ Op = AtomicRMWInst::UMax;
break;
}
@@ -208,12 +245,68 @@ void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) {
ToReplace.push_back(Info);
}
+// Use the builder to create the non-atomic counterpart of the specified
+// atomicrmw binary op.
+static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
+ Value *LHS, Value *RHS) {
+ CmpInst::Predicate Pred;
+
+ switch (Op) {
+ default:
+ llvm_unreachable("Unhandled atomic op");
+ case AtomicRMWInst::Add:
+ return B.CreateBinOp(Instruction::Add, LHS, RHS);
+ case AtomicRMWInst::Sub:
+ return B.CreateBinOp(Instruction::Sub, LHS, RHS);
+ case AtomicRMWInst::And:
+ return B.CreateBinOp(Instruction::And, LHS, RHS);
+ case AtomicRMWInst::Or:
+ return B.CreateBinOp(Instruction::Or, LHS, RHS);
+ case AtomicRMWInst::Xor:
+ return B.CreateBinOp(Instruction::Xor, LHS, RHS);
+
+ case AtomicRMWInst::Max:
+ Pred = CmpInst::ICMP_SGT;
+ break;
+ case AtomicRMWInst::Min:
+ Pred = CmpInst::ICMP_SLT;
+ break;
+ case AtomicRMWInst::UMax:
+ Pred = CmpInst::ICMP_UGT;
+ break;
+ case AtomicRMWInst::UMin:
+ Pred = CmpInst::ICMP_ULT;
+ break;
+ }
+ Value *Cond = B.CreateICmp(Pred, LHS, RHS);
+ return B.CreateSelect(Cond, LHS, RHS);
+}
+
+static APInt getIdentityValueForAtomicOp(AtomicRMWInst::BinOp Op,
+ unsigned BitWidth) {
+ switch (Op) {
+ default:
+ llvm_unreachable("Unhandled atomic op");
+ case AtomicRMWInst::Add:
+ case AtomicRMWInst::Sub:
+ case AtomicRMWInst::Or:
+ case AtomicRMWInst::Xor:
+ case AtomicRMWInst::UMax:
+ return APInt::getMinValue(BitWidth);
+ case AtomicRMWInst::And:
+ case AtomicRMWInst::UMin:
+ return APInt::getMaxValue(BitWidth);
+ case AtomicRMWInst::Max:
+ return APInt::getSignedMinValue(BitWidth);
+ case AtomicRMWInst::Min:
+ return APInt::getSignedMaxValue(BitWidth);
+ }
+}
+
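A quick self-contained check that the identity values chosen above really are neutral elements for each operation, with min and max written as compare-plus-select in the same way buildNonAtomicBinOp does:

// Combining the identity with any value x must give back x for each op.
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <limits>

int main() {
  const uint32_t X = 0xdeadbeefu;
  // add/sub/or/xor/umax: identity is 0 (unsigned minimum).
  assert((X + 0u) == X && (X - 0u) == X && (X | 0u) == X && (X ^ 0u) == X);
  assert(std::max(X, std::numeric_limits<uint32_t>::min()) == X);
  // and/umin: identity is all-ones (unsigned maximum).
  assert((X & ~0u) == X);
  assert(std::min(X, std::numeric_limits<uint32_t>::max()) == X);
  // smax/smin: identities are the signed minimum / signed maximum.
  const int32_t S = -42;
  int32_t SMax = (S > std::numeric_limits<int32_t>::min())
                     ? S : std::numeric_limits<int32_t>::min();   // icmp sgt + select
  int32_t SMin = (S < std::numeric_limits<int32_t>::max())
                     ? S : std::numeric_limits<int32_t>::max();   // icmp slt + select
  assert(SMax == S && SMin == S);
}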
void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
- Instruction::BinaryOps Op,
+ AtomicRMWInst::BinOp Op,
unsigned ValIdx,
bool ValDivergent) const {
- LLVMContext &Context = I.getContext();
-
// Start building just before the instruction.
IRBuilder<> B(&I);
@@ -251,115 +344,130 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
Value *const V = I.getOperand(ValIdx);
// We need to know how many lanes are active within the wavefront, and we do
- // this by getting the exec register, which tells us all the lanes that are
- // active.
- MDNode *const RegName =
- llvm::MDNode::get(Context, llvm::MDString::get(Context, "exec"));
- Value *const Metadata = llvm::MetadataAsValue::get(Context, RegName);
- CallInst *const Exec =
- B.CreateIntrinsic(Intrinsic::read_register, {B.getInt64Ty()}, {Metadata});
- setConvergent(Exec);
+ // this by doing a ballot of active lanes.
+ CallInst *const Ballot = B.CreateIntrinsic(
+ Intrinsic::amdgcn_icmp, {B.getInt64Ty(), B.getInt32Ty()},
+ {B.getInt32(1), B.getInt32(0), B.getInt32(CmpInst::ICMP_NE)});
// We need to know how many lanes are active within the wavefront that are
// below us. If we counted each lane linearly starting from 0, a lane is
// below us only if its associated index was less than ours. We do this by
// using the mbcnt intrinsic.
- Value *const BitCast = B.CreateBitCast(Exec, VecTy);
+ Value *const BitCast = B.CreateBitCast(Ballot, VecTy);
Value *const ExtractLo = B.CreateExtractElement(BitCast, B.getInt32(0));
Value *const ExtractHi = B.CreateExtractElement(BitCast, B.getInt32(1));
CallInst *const PartialMbcnt = B.CreateIntrinsic(
Intrinsic::amdgcn_mbcnt_lo, {}, {ExtractLo, B.getInt32(0)});
- CallInst *const Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {},
- {ExtractHi, PartialMbcnt});
+ Value *const Mbcnt =
+ B.CreateIntCast(B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {},
+ {ExtractHi, PartialMbcnt}),
+ Ty, false);
- Value *const MbcntCast = B.CreateIntCast(Mbcnt, Ty, false);
+ Value *const Identity = B.getInt(getIdentityValueForAtomicOp(Op, TyBitWidth));
- Value *LaneOffset = nullptr;
+ Value *ExclScan = nullptr;
Value *NewV = nullptr;
// If we have a divergent value in each lane, we need to combine the value
// using DPP.
if (ValDivergent) {
- // First we need to set all inactive invocations to 0, so that they can
- // correctly contribute to the final result.
- CallInst *const SetInactive = B.CreateIntrinsic(
- Intrinsic::amdgcn_set_inactive, Ty, {V, B.getIntN(TyBitWidth, 0)});
- setConvergent(SetInactive);
- NewV = SetInactive;
-
- const unsigned Iters = 6;
- const unsigned DPPCtrl[Iters] = {DPP_ROW_SR1, DPP_ROW_SR2,
- DPP_ROW_SR4, DPP_ROW_SR8,
- DPP_ROW_BCAST15, DPP_ROW_BCAST31};
- const unsigned RowMask[Iters] = {0xf, 0xf, 0xf, 0xf, 0xa, 0xc};
-
- // This loop performs an inclusive scan across the wavefront, with all lanes
+ // First we need to set all inactive invocations to the identity value, so
+ // that they can correctly contribute to the final result.
+ CallInst *const SetInactive =
+ B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
+
+ CallInst *const FirstDPP =
+ B.CreateIntrinsic(Intrinsic::amdgcn_update_dpp, Ty,
+ {Identity, SetInactive, B.getInt32(DPP_WF_SR1),
+ B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});
+ ExclScan = FirstDPP;
+
+ const unsigned Iters = 7;
+ const unsigned DPPCtrl[Iters] = {
+ DPP_ROW_SR1, DPP_ROW_SR2, DPP_ROW_SR3, DPP_ROW_SR4,
+ DPP_ROW_SR8, DPP_ROW_BCAST15, DPP_ROW_BCAST31};
+ const unsigned RowMask[Iters] = {0xf, 0xf, 0xf, 0xf, 0xf, 0xa, 0xc};
+ const unsigned BankMask[Iters] = {0xf, 0xf, 0xf, 0xe, 0xc, 0xf, 0xf};
+
+ // This loop performs an exclusive scan across the wavefront, with all lanes
// active (by using the WWM intrinsic).
for (unsigned Idx = 0; Idx < Iters; Idx++) {
- CallInst *const DPP = B.CreateIntrinsic(Intrinsic::amdgcn_mov_dpp, Ty,
- {NewV, B.getInt32(DPPCtrl[Idx]),
- B.getInt32(RowMask[Idx]),
- B.getInt32(0xf), B.getFalse()});
- setConvergent(DPP);
- Value *const WWM = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, DPP);
-
- NewV = B.CreateBinOp(Op, NewV, WWM);
- NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, NewV);
+ Value *const UpdateValue = Idx < 3 ? FirstDPP : ExclScan;
+ CallInst *const DPP = B.CreateIntrinsic(
+ Intrinsic::amdgcn_update_dpp, Ty,
+ {Identity, UpdateValue, B.getInt32(DPPCtrl[Idx]),
+ B.getInt32(RowMask[Idx]), B.getInt32(BankMask[Idx]), B.getFalse()});
+
+ ExclScan = buildNonAtomicBinOp(B, Op, ExclScan, DPP);
}
- // NewV has returned the inclusive scan of V, but for the lane offset we
- // require an exclusive scan. We do this by shifting the values from the
- // entire wavefront right by 1, and by setting the bound_ctrl (last argument
- // to the intrinsic below) to true, we can guarantee that 0 will be shifted
- // into the 0'th invocation.
- CallInst *const DPP =
- B.CreateIntrinsic(Intrinsic::amdgcn_mov_dpp, {Ty},
- {NewV, B.getInt32(DPP_WF_SR1), B.getInt32(0xf),
- B.getInt32(0xf), B.getTrue()});
- setConvergent(DPP);
- LaneOffset = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, DPP);
+ NewV = buildNonAtomicBinOp(B, Op, SetInactive, ExclScan);
  // Read the value from the last lane, which has accumulated the values of
- // each active lane in the wavefront. This will be our new value with which
- // we will provide to the atomic operation.
+ // each active lane in the wavefront. This will be our new value which we
+ // will provide to the atomic operation.
if (TyBitWidth == 64) {
Value *const ExtractLo = B.CreateTrunc(NewV, B.getInt32Ty());
Value *const ExtractHi =
B.CreateTrunc(B.CreateLShr(NewV, B.getInt64(32)), B.getInt32Ty());
CallInst *const ReadLaneLo = B.CreateIntrinsic(
Intrinsic::amdgcn_readlane, {}, {ExtractLo, B.getInt32(63)});
- setConvergent(ReadLaneLo);
CallInst *const ReadLaneHi = B.CreateIntrinsic(
Intrinsic::amdgcn_readlane, {}, {ExtractHi, B.getInt32(63)});
- setConvergent(ReadLaneHi);
Value *const PartialInsert = B.CreateInsertElement(
UndefValue::get(VecTy), ReadLaneLo, B.getInt32(0));
Value *const Insert =
B.CreateInsertElement(PartialInsert, ReadLaneHi, B.getInt32(1));
NewV = B.CreateBitCast(Insert, Ty);
} else if (TyBitWidth == 32) {
- CallInst *const ReadLane = B.CreateIntrinsic(Intrinsic::amdgcn_readlane,
- {}, {NewV, B.getInt32(63)});
- setConvergent(ReadLane);
- NewV = ReadLane;
+ NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
+ {NewV, B.getInt32(63)});
} else {
llvm_unreachable("Unhandled atomic bit width");
}
+
+ // Finally mark the readlanes in the WWM section.
+ NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, NewV);
} else {
- // Get the total number of active lanes we have by using popcount.
- Instruction *const Ctpop = B.CreateUnaryIntrinsic(Intrinsic::ctpop, Exec);
- Value *const CtpopCast = B.CreateIntCast(Ctpop, Ty, false);
-
- // Calculate the new value we will be contributing to the atomic operation
- // for the entire wavefront.
- NewV = B.CreateMul(V, CtpopCast);
- LaneOffset = B.CreateMul(V, MbcntCast);
+ switch (Op) {
+ default:
+ llvm_unreachable("Unhandled atomic op");
+
+ case AtomicRMWInst::Add:
+ case AtomicRMWInst::Sub: {
+ // The new value we will be contributing to the atomic operation is the
+ // old value times the number of active lanes.
+ Value *const Ctpop = B.CreateIntCast(
+ B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
+ NewV = B.CreateMul(V, Ctpop);
+ break;
+ }
+
+ case AtomicRMWInst::And:
+ case AtomicRMWInst::Or:
+ case AtomicRMWInst::Max:
+ case AtomicRMWInst::Min:
+ case AtomicRMWInst::UMax:
+ case AtomicRMWInst::UMin:
+ // These operations with a uniform value are idempotent: doing the atomic
+ // operation multiple times has the same effect as doing it once.
+ NewV = V;
+ break;
+
+ case AtomicRMWInst::Xor:
+ // The new value we will be contributing to the atomic operation is the
+ // old value times the parity of the number of active lanes.
+ Value *const Ctpop = B.CreateIntCast(
+ B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
+ NewV = B.CreateMul(V, B.CreateAnd(Ctpop, 1));
+ break;
+ }
}
// We only want a single lane to enter our new control flow, and we do this
// by checking if there are any active lanes below us. Only one lane will
// have 0 active lanes below us, so that will be the only one to progress.
- Value *const Cond = B.CreateICmpEQ(MbcntCast, B.getIntN(TyBitWidth, 0));
+ Value *const Cond = B.CreateICmpEQ(Mbcnt, B.getIntN(TyBitWidth, 0));
// Store I's original basic block before we split the block.
BasicBlock *const EntryBB = I.getParent();
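The DPP sequence above (wf_sr1 followed by the row shifts and broadcasts, all inside WWM) computes an exclusive scan of the per-lane values across the wavefront. The sketch below models the same result with ordinary array shifts for the add case; the 64-lane array and the shift-by-powers-of-two loop are a simplification, not the DPP encoding itself.

// Exclusive prefix-sum across a 64-lane wave: lane i ends up with the combined
// value of all lanes below it, and lane 63 plus its own value gives the total
// that a single lane feeds to the atomic.
#include <array>
#include <cassert>
#include <cstdint>

constexpr unsigned WaveSize = 64;

static std::array<uint32_t, WaveSize>
exclusiveScanAdd(const std::array<uint32_t, WaveSize> &V) {
  std::array<uint32_t, WaveSize> Scan{};
  // Shift right by one lane first (the DPP_WF_SR1 step), identity into lane 0.
  for (unsigned I = WaveSize - 1; I > 0; --I)
    Scan[I] = V[I - 1];
  Scan[0] = 0;
  // Then combine with progressively larger shifts (the row_srN / bcast steps).
  for (unsigned Shift = 1; Shift < WaveSize; Shift *= 2)
    for (unsigned I = WaveSize; I-- > Shift;)
      Scan[I] += Scan[I - Shift];
  return Scan;
}

int main() {
  std::array<uint32_t, WaveSize> V{};
  for (unsigned I = 0; I != WaveSize; ++I)
    V[I] = I + 1;                        // lane i contributes i + 1
  auto Scan = exclusiveScanAdd(V);
  assert(Scan[0] == 0);
  assert(Scan[5] == 1 + 2 + 3 + 4 + 5);  // sum of lanes 0..4
  assert(Scan[63] + V[63] == WaveSize * (WaveSize + 1) / 2);
}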
@@ -401,20 +509,16 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
B.CreateTrunc(B.CreateLShr(PHI, B.getInt64(32)), B.getInt32Ty());
CallInst *const ReadFirstLaneLo =
B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo);
- setConvergent(ReadFirstLaneLo);
CallInst *const ReadFirstLaneHi =
B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi);
- setConvergent(ReadFirstLaneHi);
Value *const PartialInsert = B.CreateInsertElement(
UndefValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
Value *const Insert =
B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
BroadcastI = B.CreateBitCast(Insert, Ty);
} else if (TyBitWidth == 32) {
- CallInst *const ReadFirstLane =
- B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI);
- setConvergent(ReadFirstLane);
- BroadcastI = ReadFirstLane;
+
+ BroadcastI = B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI);
} else {
llvm_unreachable("Unhandled atomic bit width");
}
@@ -423,7 +527,31 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
// get our individual lane's slice into the result. We use the lane offset we
// previously calculated combined with the atomic result value we got from the
// first lane, to get our lane's index into the atomic result.
- Value *const Result = B.CreateBinOp(Op, BroadcastI, LaneOffset);
+ Value *LaneOffset = nullptr;
+ if (ValDivergent) {
+ LaneOffset = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, ExclScan);
+ } else {
+ switch (Op) {
+ default:
+ llvm_unreachable("Unhandled atomic op");
+ case AtomicRMWInst::Add:
+ case AtomicRMWInst::Sub:
+ LaneOffset = B.CreateMul(V, Mbcnt);
+ break;
+ case AtomicRMWInst::And:
+ case AtomicRMWInst::Or:
+ case AtomicRMWInst::Max:
+ case AtomicRMWInst::Min:
+ case AtomicRMWInst::UMax:
+ case AtomicRMWInst::UMin:
+ LaneOffset = B.CreateSelect(Cond, Identity, V);
+ break;
+ case AtomicRMWInst::Xor:
+ LaneOffset = B.CreateMul(V, B.CreateAnd(Mbcnt, 1));
+ break;
+ }
+ }
+ Value *const Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);
if (IsPixelShader) {
// Need a final PHI to reconverge to above the helper lane branch mask.
@@ -442,10 +570,6 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
I.eraseFromParent();
}
-void AMDGPUAtomicOptimizer::setConvergent(CallInst *const CI) const {
- CI->addAttribute(AttributeList::FunctionIndex, Attribute::Convergent);
-}
-
INITIALIZE_PASS_BEGIN(AMDGPUAtomicOptimizer, DEBUG_TYPE,
"AMDGPU atomic optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
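For a uniform input the pass avoids the scan entirely: one lane applies the whole wavefront's contribution and every lane rebuilds the value it would have seen from the broadcast result plus its own lane offset. The sketch below checks that arithmetic for the add case against a serial sequence of atomics; the active mask and values are made up for the example.

// One elected lane performs Old + V * popcount(active), and lane i recovers
// Old + V * mbcnt(i), where mbcnt(i) counts the active lanes below lane i.
#include <cassert>
#include <cstdint>

static unsigned popcount(uint64_t X) { return __builtin_popcountll(X); }

// Number of active lanes strictly below lane I (the mbcnt_lo/hi pair).
static unsigned mbcnt(uint64_t ActiveMask, unsigned I) {
  return popcount(ActiveMask & ((1ull << I) - 1));
}

int main() {
  const uint64_t Active = 0xF0F0F0F0F0F0F0F0ull;  // arbitrary set of active lanes
  const uint32_t Old = 1000, V = 7;               // uniform value per lane

  // What the single elected lane adds to memory.
  uint32_t NewMem = Old + V * popcount(Active);

  // Serial reference: perform the atomics one active lane at a time.
  uint32_t Serial = Old;
  for (unsigned Lane = 0; Lane != 64; ++Lane) {
    if (!(Active >> Lane & 1))
      continue;
    uint32_t Returned = Serial;                   // value an atomic add returns
    Serial += V;
    // The optimized form: broadcast result plus this lane's offset.
    assert(Returned == Old + V * mbcnt(Active, Lane));
  }
  assert(Serial == NewMem);
  // For xor the same idea uses parity: V * (popcount & 1) and V * (mbcnt & 1).
}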
diff --git a/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index daef37f9c21f..b107c357196d 100644
--- a/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -1,9 +1,8 @@
//===-- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp - Call lowering -----===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -21,28 +20,98 @@
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/LowLevelTypeImpl.h"
using namespace llvm;
+namespace {
+
+struct OutgoingArgHandler : public CallLowering::ValueHandler {
+ OutgoingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
+ MachineInstrBuilder MIB, CCAssignFn *AssignFn)
+ : ValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {}
+
+ MachineInstrBuilder MIB;
+
+ Register getStackAddress(uint64_t Size, int64_t Offset,
+ MachinePointerInfo &MPO) override {
+ llvm_unreachable("not implemented");
+ }
+
+ void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
+ MachinePointerInfo &MPO, CCValAssign &VA) override {
+ llvm_unreachable("not implemented");
+ }
+
+ void assignValueToReg(Register ValVReg, Register PhysReg,
+ CCValAssign &VA) override {
+ MIB.addUse(PhysReg);
+ MIRBuilder.buildCopy(PhysReg, ValVReg);
+ }
+
+ bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ const CallLowering::ArgInfo &Info,
+ CCState &State) override {
+ return AssignFn(ValNo, ValVT, LocVT, LocInfo, Info.Flags, State);
+ }
+};
+
+}
+
AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI)
: CallLowering(&TLI) {
}
bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
const Value *Val,
- ArrayRef<unsigned> VRegs) const {
- // FIXME: Add support for non-void returns.
- if (Val)
+ ArrayRef<Register> VRegs) const {
+
+ MachineFunction &MF = MIRBuilder.getMF();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ MFI->setIfReturnsVoid(!Val);
+
+ if (!Val) {
+ MIRBuilder.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
+ return true;
+ }
+
+ Register VReg = VRegs[0];
+
+ const Function &F = MF.getFunction();
+ auto &DL = F.getParent()->getDataLayout();
+ if (!AMDGPU::isShader(F.getCallingConv()))
return false;
- MIRBuilder.buildInstr(AMDGPU::S_ENDPGM);
+
+ const AMDGPUTargetLowering &TLI = *getTLI<AMDGPUTargetLowering>();
+ SmallVector<EVT, 4> SplitVTs;
+ SmallVector<uint64_t, 4> Offsets;
+ ArgInfo OrigArg{VReg, Val->getType()};
+ setArgFlags(OrigArg, AttributeList::ReturnIndex, DL, F);
+ ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs, &Offsets, 0);
+
+ SmallVector<ArgInfo, 8> SplitArgs;
+ CCAssignFn *AssignFn = CCAssignFnForReturn(F.getCallingConv(), false);
+ for (unsigned i = 0, e = Offsets.size(); i != e; ++i) {
+ Type *SplitTy = SplitVTs[i].getTypeForEVT(F.getContext());
+ SplitArgs.push_back({VRegs[i], SplitTy, OrigArg.Flags, OrigArg.IsFixed});
+ }
+ auto RetInstr = MIRBuilder.buildInstrNoInsert(AMDGPU::SI_RETURN_TO_EPILOG);
+ OutgoingArgHandler Handler(MIRBuilder, MRI, RetInstr, AssignFn);
+ if (!handleAssignments(MIRBuilder, SplitArgs, Handler))
+ return false;
+ MIRBuilder.insertInstr(RetInstr);
+
return true;
}
-unsigned AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &MIRBuilder,
+Register AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &MIRBuilder,
Type *ParamTy,
uint64_t Offset) const {
@@ -53,12 +122,12 @@ unsigned AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &MIRBuilder,
const DataLayout &DL = F.getParent()->getDataLayout();
PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUAS::CONSTANT_ADDRESS);
LLT PtrType = getLLTForType(*PtrTy, DL);
- unsigned DstReg = MRI.createGenericVirtualRegister(PtrType);
- unsigned KernArgSegmentPtr =
+ Register DstReg = MRI.createGenericVirtualRegister(PtrType);
+ Register KernArgSegmentPtr =
MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
- unsigned KernArgSegmentVReg = MRI.getLiveInVirtReg(KernArgSegmentPtr);
+ Register KernArgSegmentVReg = MRI.getLiveInVirtReg(KernArgSegmentPtr);
- unsigned OffsetReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
+ Register OffsetReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
MIRBuilder.buildConstant(OffsetReg, Offset);
MIRBuilder.buildGEP(DstReg, KernArgSegmentVReg, OffsetReg);
@@ -69,14 +138,14 @@ unsigned AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &MIRBuilder,
void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &MIRBuilder,
Type *ParamTy, uint64_t Offset,
unsigned Align,
- unsigned DstReg) const {
+ Register DstReg) const {
MachineFunction &MF = MIRBuilder.getMF();
const Function &F = MF.getFunction();
const DataLayout &DL = F.getParent()->getDataLayout();
PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUAS::CONSTANT_ADDRESS);
MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
unsigned TypeSize = DL.getTypeStoreSize(ParamTy);
- unsigned PtrReg = lowerParameterPtr(MIRBuilder, ParamTy, Offset);
+ Register PtrReg = lowerParameterPtr(MIRBuilder, ParamTy, Offset);
MachineMemOperand *MMO =
MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad |
@@ -87,93 +156,233 @@ void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &MIRBuilder,
MIRBuilder.buildLoad(DstReg, PtrReg, *MMO);
}
-bool AMDGPUCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
- const Function &F,
- ArrayRef<unsigned> VRegs) const {
- // AMDGPU_GS and AMDGP_HS are not supported yet.
- if (F.getCallingConv() == CallingConv::AMDGPU_GS ||
- F.getCallingConv() == CallingConv::AMDGPU_HS)
- return false;
+static Register findFirstFreeSGPR(CCState &CCInfo) {
+ unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
+ for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
+ if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
+ return AMDGPU::SGPR0 + Reg;
+ }
+ }
+ llvm_unreachable("Cannot allocate sgpr");
+}
- MachineFunction &MF = MIRBuilder.getMF();
- const GCNSubtarget *Subtarget = &MF.getSubtarget<GCNSubtarget>();
+static void allocateSpecialEntryInputVGPRs(CCState &CCInfo,
+ MachineFunction &MF,
+ const SIRegisterInfo &TRI,
+ SIMachineFunctionInfo &Info) {
+ const LLT S32 = LLT::scalar(32);
MachineRegisterInfo &MRI = MF.getRegInfo();
- SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
- const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
- const DataLayout &DL = F.getParent()->getDataLayout();
- SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());
+ if (Info.hasWorkItemIDX()) {
+ Register Reg = AMDGPU::VGPR0;
+ MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
+
+ CCInfo.AllocateReg(Reg);
+ Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg));
+ }
+
+ if (Info.hasWorkItemIDY()) {
+ Register Reg = AMDGPU::VGPR1;
+ MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
+
+ CCInfo.AllocateReg(Reg);
+ Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
+ }
+
+ if (Info.hasWorkItemIDZ()) {
+ Register Reg = AMDGPU::VGPR2;
+ MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
+
+ CCInfo.AllocateReg(Reg);
+ Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
+ }
+}
+// Allocate special inputs passed in user SGPRs.
+static void allocateHSAUserSGPRs(CCState &CCInfo,
+ MachineIRBuilder &MIRBuilder,
+ MachineFunction &MF,
+ const SIRegisterInfo &TRI,
+ SIMachineFunctionInfo &Info) {
// FIXME: How should these inputs interact with inreg / custom SGPR inputs?
- if (Info->hasPrivateSegmentBuffer()) {
- unsigned PrivateSegmentBufferReg = Info->addPrivateSegmentBuffer(*TRI);
- MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SReg_128RegClass);
+ if (Info.hasPrivateSegmentBuffer()) {
+ unsigned PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
+ MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
CCInfo.AllocateReg(PrivateSegmentBufferReg);
}
- if (Info->hasDispatchPtr()) {
- unsigned DispatchPtrReg = Info->addDispatchPtr(*TRI);
- // FIXME: Need to add reg as live-in
+ if (Info.hasDispatchPtr()) {
+ unsigned DispatchPtrReg = Info.addDispatchPtr(TRI);
+ MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(DispatchPtrReg);
}
- if (Info->hasQueuePtr()) {
- unsigned QueuePtrReg = Info->addQueuePtr(*TRI);
- // FIXME: Need to add reg as live-in
+ if (Info.hasQueuePtr()) {
+ unsigned QueuePtrReg = Info.addQueuePtr(TRI);
+ MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(QueuePtrReg);
}
- if (Info->hasKernargSegmentPtr()) {
- unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI);
- const LLT P2 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
- unsigned VReg = MRI.createGenericVirtualRegister(P2);
+ if (Info.hasKernargSegmentPtr()) {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
+ const LLT P4 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
+ Register VReg = MRI.createGenericVirtualRegister(P4);
MRI.addLiveIn(InputPtrReg, VReg);
MIRBuilder.getMBB().addLiveIn(InputPtrReg);
MIRBuilder.buildCopy(VReg, InputPtrReg);
CCInfo.AllocateReg(InputPtrReg);
}
- if (Info->hasDispatchID()) {
- unsigned DispatchIDReg = Info->addDispatchID(*TRI);
- // FIXME: Need to add reg as live-in
+ if (Info.hasDispatchID()) {
+ unsigned DispatchIDReg = Info.addDispatchID(TRI);
+ MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(DispatchIDReg);
}
- if (Info->hasFlatScratchInit()) {
- unsigned FlatScratchInitReg = Info->addFlatScratchInit(*TRI);
- // FIXME: Need to add reg as live-in
+ if (Info.hasFlatScratchInit()) {
+ unsigned FlatScratchInitReg = Info.addFlatScratchInit(TRI);
+ MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(FlatScratchInitReg);
}
+ // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
+ // these from the dispatch pointer.
+}
+
+static void allocateSystemSGPRs(CCState &CCInfo,
+ MachineFunction &MF,
+ SIMachineFunctionInfo &Info,
+ CallingConv::ID CallConv,
+ bool IsShader) {
+ const LLT S32 = LLT::scalar(32);
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ if (Info.hasWorkGroupIDX()) {
+ Register Reg = Info.addWorkGroupIDX();
+ MRI.setType(MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass), S32);
+ CCInfo.AllocateReg(Reg);
+ }
+
+ if (Info.hasWorkGroupIDY()) {
+ Register Reg = Info.addWorkGroupIDY();
+ MRI.setType(MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass), S32);
+ CCInfo.AllocateReg(Reg);
+ }
+
+ if (Info.hasWorkGroupIDZ()) {
+ unsigned Reg = Info.addWorkGroupIDZ();
+ MRI.setType(MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass), S32);
+ CCInfo.AllocateReg(Reg);
+ }
+
+ if (Info.hasWorkGroupInfo()) {
+ unsigned Reg = Info.addWorkGroupInfo();
+ MRI.setType(MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass), S32);
+ CCInfo.AllocateReg(Reg);
+ }
+
+ if (Info.hasPrivateSegmentWaveByteOffset()) {
+ // Scratch wave offset passed in system SGPR.
+ unsigned PrivateSegmentWaveByteOffsetReg;
+
+ if (IsShader) {
+ PrivateSegmentWaveByteOffsetReg =
+ Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
+
+ // This is true if the scratch wave byte offset doesn't have a fixed
+ // location.
+ if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
+ PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
+ Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
+ }
+ } else
+ PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
+
+ MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
+ CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
+ }
+}
+
+bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
+ MachineIRBuilder &MIRBuilder, const Function &F,
+ ArrayRef<ArrayRef<Register>> VRegs) const {
+ MachineFunction &MF = MIRBuilder.getMF();
+ const GCNSubtarget *Subtarget = &MF.getSubtarget<GCNSubtarget>();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
+ const DataLayout &DL = F.getParent()->getDataLayout();
+
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());
+
+ allocateHSAUserSGPRs(CCInfo, MIRBuilder, MF, *TRI, *Info);
+
+ unsigned i = 0;
+ const unsigned KernArgBaseAlign = 16;
+ const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset(F);
+ uint64_t ExplicitArgOffset = 0;
+
+ // TODO: Align down to dword alignment and extract bits for extending loads.
+ for (auto &Arg : F.args()) {
+ Type *ArgTy = Arg.getType();
+ unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
+ if (AllocSize == 0)
+ continue;
+
+ unsigned ABIAlign = DL.getABITypeAlignment(ArgTy);
+
+ uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset;
+ ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize;
+
+ ArrayRef<Register> OrigArgRegs = VRegs[i];
+ Register ArgReg =
+ OrigArgRegs.size() == 1
+ ? OrigArgRegs[0]
+ : MRI.createGenericVirtualRegister(getLLTForType(*ArgTy, DL));
+ unsigned Align = MinAlign(KernArgBaseAlign, ArgOffset);
+ ArgOffset = alignTo(ArgOffset, DL.getABITypeAlignment(ArgTy));
+ lowerParameter(MIRBuilder, ArgTy, ArgOffset, Align, ArgReg);
+ if (OrigArgRegs.size() > 1)
+ unpackRegs(OrigArgRegs, ArgReg, ArgTy, MIRBuilder);
+ ++i;
+ }
+
+ allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
+ allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false);
+ return true;
+}
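
The per-argument offsets in the loop above come from plain alignTo/MinAlign arithmetic against the 16-byte kernarg segment alignment. A minimal standalone sketch of that computation, using made-up argument sizes and alignments rather than real DataLayout queries:

#include <cstdint>
#include <cstdio>

// Round Value up to the next multiple of Align (Align must be a power of two).
static uint64_t alignTo(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) & ~(Align - 1);
}

// llvm::MinAlign: largest power of two dividing both A and B.
static uint64_t minAlign(uint64_t A, uint64_t B) {
  return (A | B) & (1 + ~(A | B));
}

int main() {
  // Hypothetical kernel arguments: (size, ABI alignment) in bytes.
  const struct { uint64_t Size, ABIAlign; } Args[] = {{4, 4}, {8, 8}, {16, 16}};
  const uint64_t KernArgBaseAlign = 16; // segment base is 16-byte aligned
  const uint64_t BaseOffset = 0;        // stand-in for getExplicitKernelArgOffset(F)
  uint64_t ExplicitArgOffset = 0;

  for (const auto &A : Args) {
    uint64_t ArgOffset = alignTo(ExplicitArgOffset, A.ABIAlign) + BaseOffset;
    ExplicitArgOffset = alignTo(ExplicitArgOffset, A.ABIAlign) + A.Size;
    // The load alignment is limited by how the offset sits within the segment.
    uint64_t LoadAlign = minAlign(KernArgBaseAlign, ArgOffset);
    std::printf("arg at offset %llu, load align %llu\n",
                (unsigned long long)ArgOffset, (unsigned long long)LoadAlign);
  }
  return 0;
}
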
+
+bool AMDGPUCallLowering::lowerFormalArguments(
+ MachineIRBuilder &MIRBuilder, const Function &F,
+ ArrayRef<ArrayRef<Register>> VRegs) const {
// The infrastructure for normal calling convention lowering is essentially
// useless for kernels. We want to avoid any kind of legalization or argument
// splitting.
- if (F.getCallingConv() == CallingConv::AMDGPU_KERNEL) {
- unsigned i = 0;
- const unsigned KernArgBaseAlign = 16;
- const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset(F);
- uint64_t ExplicitArgOffset = 0;
-
- // TODO: Align down to dword alignment and extract bits for extending loads.
- for (auto &Arg : F.args()) {
- Type *ArgTy = Arg.getType();
- unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
- if (AllocSize == 0)
- continue;
+ if (F.getCallingConv() == CallingConv::AMDGPU_KERNEL)
+ return lowerFormalArgumentsKernel(MIRBuilder, F, VRegs);
- unsigned ABIAlign = DL.getABITypeAlignment(ArgTy);
+  // AMDGPU_GS and AMDGPU_HS are not supported yet.
+ if (F.getCallingConv() == CallingConv::AMDGPU_GS ||
+ F.getCallingConv() == CallingConv::AMDGPU_HS)
+ return false;
+
+ MachineFunction &MF = MIRBuilder.getMF();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
+ const DataLayout &DL = F.getParent()->getDataLayout();
- uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset;
- ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize;
+ bool IsShader = AMDGPU::isShader(F.getCallingConv());
- unsigned Align = MinAlign(KernArgBaseAlign, ArgOffset);
- ArgOffset = alignTo(ArgOffset, DL.getABITypeAlignment(ArgTy));
- lowerParameter(MIRBuilder, ArgTy, ArgOffset, Align, VRegs[i]);
- ++i;
- }
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());
- return true;
+ if (Info->hasImplicitBufferPtr()) {
+ unsigned ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI);
+ MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
+ CCInfo.AllocateReg(ImplicitBufferPtrReg);
}
unsigned NumArgs = F.arg_size();
@@ -186,7 +395,8 @@ bool AMDGPUCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
  // We can only handle simple value types at the moment.
ISD::ArgFlagsTy Flags;
- ArgInfo OrigArg{VRegs[i], CurOrigArg->getType()};
+ assert(VRegs[i].size() == 1 && "Can't lower into more than one register");
+ ArgInfo OrigArg{VRegs[i][0], CurOrigArg->getType()};
setArgFlags(OrigArg, i + 1, DL, F);
Flags.setOrigAlign(DL.getABITypeAlignment(CurOrigArg->getType()));
@@ -239,11 +449,15 @@ bool AMDGPUCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
OrigArgIdx != NumArgs && i != ArgLocs.size(); ++Arg, ++OrigArgIdx) {
if (Skipped.test(OrigArgIdx))
continue;
- CCValAssign &VA = ArgLocs[i++];
- MRI.addLiveIn(VA.getLocReg(), VRegs[OrigArgIdx]);
- MIRBuilder.getMBB().addLiveIn(VA.getLocReg());
- MIRBuilder.buildCopy(VRegs[OrigArgIdx], VA.getLocReg());
+ assert(VRegs[OrigArgIdx].size() == 1 &&
+ "Can't lower into more than 1 reg");
+ CCValAssign &VA = ArgLocs[i++];
+ MRI.addLiveIn(VA.getLocReg(), VRegs[OrigArgIdx][0]);
+ MIRBuilder.getMBB().addLiveIn(VA.getLocReg());
+ MIRBuilder.buildCopy(VRegs[OrigArgIdx][0], VA.getLocReg());
}
+
+ allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), IsShader);
return true;
}
diff --git a/lib/Target/AMDGPU/AMDGPUCallLowering.h b/lib/Target/AMDGPU/AMDGPUCallLowering.h
index ed859716218e..3599659cac6a 100644
--- a/lib/Target/AMDGPU/AMDGPUCallLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUCallLowering.h
@@ -1,9 +1,8 @@
//===- lib/Target/AMDGPU/AMDGPUCallLowering.h - Call lowering -*- C++ -*---===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
@@ -23,20 +22,25 @@ namespace llvm {
class AMDGPUTargetLowering;
class AMDGPUCallLowering: public CallLowering {
- unsigned lowerParameterPtr(MachineIRBuilder &MIRBuilder, Type *ParamTy,
+ Register lowerParameterPtr(MachineIRBuilder &MIRBuilder, Type *ParamTy,
uint64_t Offset) const;
void lowerParameter(MachineIRBuilder &MIRBuilder, Type *ParamTy,
uint64_t Offset, unsigned Align,
- unsigned DstReg) const;
+ Register DstReg) const;
public:
AMDGPUCallLowering(const AMDGPUTargetLowering &TLI);
bool lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val,
- ArrayRef<unsigned> VRegs) const override;
+ ArrayRef<Register> VRegs) const override;
+
+ bool lowerFormalArgumentsKernel(MachineIRBuilder &MIRBuilder,
+ const Function &F,
+ ArrayRef<ArrayRef<Register>> VRegs) const;
+
bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
- ArrayRef<unsigned> VRegs) const override;
+ ArrayRef<ArrayRef<Register>> VRegs) const override;
static CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg);
static CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg);
};
diff --git a/lib/Target/AMDGPU/AMDGPUCallingConv.td b/lib/Target/AMDGPU/AMDGPUCallingConv.td
index 367f120b5fa6..3688cd77542e 100644
--- a/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -1,9 +1,8 @@
//===---- AMDCallingConv.td - Calling Conventions for Radeon GPUs ---------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -24,7 +23,16 @@ def CC_SI : CallingConv<[
SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,
SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29, SGPR30, SGPR31,
- SGPR32, SGPR33, SGPR34, SGPR35, SGPR36, SGPR37, SGPR38, SGPR39
+ SGPR32, SGPR33, SGPR34, SGPR35, SGPR36, SGPR37, SGPR38, SGPR39,
+ SGPR40, SGPR41, SGPR42, SGPR43, SGPR44, SGPR45, SGPR46, SGPR47,
+ SGPR48, SGPR49, SGPR50, SGPR51, SGPR52, SGPR53, SGPR54, SGPR55,
+ SGPR56, SGPR57, SGPR58, SGPR59, SGPR60, SGPR61, SGPR62, SGPR63,
+ SGPR64, SGPR65, SGPR66, SGPR67, SGPR68, SGPR69, SGPR70, SGPR71,
+ SGPR72, SGPR73, SGPR74, SGPR75, SGPR76, SGPR77, SGPR78, SGPR79,
+ SGPR80, SGPR81, SGPR82, SGPR83, SGPR84, SGPR85, SGPR86, SGPR87,
+ SGPR88, SGPR89, SGPR90, SGPR91, SGPR92, SGPR93, SGPR94, SGPR95,
+ SGPR96, SGPR97, SGPR98, SGPR99, SGPR100, SGPR101, SGPR102, SGPR103,
+ SGPR104, SGPR105
]>>>,
// We have no way of referring to the generated register tuples
@@ -60,7 +68,16 @@ def RetCC_SI_Shader : CallingConv<[
SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,
SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29, SGPR30, SGPR31,
- SGPR32, SGPR33, SGPR34, SGPR35, SGPR36, SGPR37, SGPR38, SGPR39
+ SGPR32, SGPR33, SGPR34, SGPR35, SGPR36, SGPR37, SGPR38, SGPR39,
+ SGPR40, SGPR41, SGPR42, SGPR43, SGPR44, SGPR45, SGPR46, SGPR47,
+ SGPR48, SGPR49, SGPR50, SGPR51, SGPR52, SGPR53, SGPR54, SGPR55,
+ SGPR56, SGPR57, SGPR58, SGPR59, SGPR60, SGPR61, SGPR62, SGPR63,
+ SGPR64, SGPR65, SGPR66, SGPR67, SGPR68, SGPR69, SGPR70, SGPR71,
+ SGPR72, SGPR73, SGPR74, SGPR75, SGPR76, SGPR77, SGPR78, SGPR79,
+ SGPR80, SGPR81, SGPR82, SGPR83, SGPR84, SGPR85, SGPR86, SGPR87,
+ SGPR88, SGPR89, SGPR90, SGPR91, SGPR92, SGPR93, SGPR94, SGPR95,
+ SGPR96, SGPR97, SGPR98, SGPR99, SGPR100, SGPR101, SGPR102, SGPR103,
+ SGPR104, SGPR105
]>>,
// 32*4 + 4 is the minimum for a fetch shader with 32 outputs.
@@ -93,12 +110,22 @@ def CSR_AMDGPU_VGPRs_32_255 : CalleeSavedRegs<
(sequence "VGPR%u", 32, 255)
>;
-def CSR_AMDGPU_SGPRs_32_103 : CalleeSavedRegs<
- (sequence "SGPR%u", 32, 103)
+def CSR_AMDGPU_SGPRs_32_105 : CalleeSavedRegs<
+ (sequence "SGPR%u", 32, 105)
+>;
+
+// Just to get the regmask, not for calling convention purposes.
+def CSR_AMDGPU_AllVGPRs : CalleeSavedRegs<
+ (sequence "VGPR%u", 0, 255)
+>;
+
+// Just to get the regmask, not for calling convention purposes.
+def CSR_AMDGPU_AllAllocatableSRegs : CalleeSavedRegs<
+ (add (sequence "SGPR%u", 0, 105), VCC_LO, VCC_HI)
>;
def CSR_AMDGPU_HighRegs : CalleeSavedRegs<
- (add CSR_AMDGPU_VGPRs_32_255, CSR_AMDGPU_SGPRs_32_103)
+ (add CSR_AMDGPU_VGPRs_32_255, CSR_AMDGPU_SGPRs_32_105)
>;
// Calling convention for leaf functions
@@ -111,10 +138,12 @@ def CC_AMDGPU_Func : CallingConv<[
VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31]>>,
- CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64, v4i16, v4f16], CCCustom<"allocateVGPRTuple">>,
+ CCIfType<[i64, f64, v2i32, v2f32, v3i32, v3f32, v4i32, v4f32, v5i32, v5f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64, v4i16, v4f16], CCCustom<"allocateVGPRTuple">>,
CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>>,
CCIfType<[i64, f64, v2i32, v2f32], CCAssignToStack<8, 4>>,
+ CCIfType<[v3i32, v3f32], CCAssignToStack<12, 4>>,
CCIfType<[v4i32, v4f32, v2i64, v2f64], CCAssignToStack<16, 4>>,
+ CCIfType<[v5i32, v5f32], CCAssignToStack<20, 4>>,
CCIfType<[v8i32, v8f32], CCAssignToStack<32, 4>>,
CCIfType<[v16i32, v16f32], CCAssignToStack<64, 4>>
]>;
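
With the added CCIfType lines, overflow arguments of the new vector types land in fixed-size, 4-byte-aligned stack slots (12 bytes for v3i32/v3f32, 20 bytes for v5i32/v5f32). A rough standalone model of that slot assignment, with hypothetical argument sizes taken from the CCAssignToStack entries above:

#include <cstdint>
#include <cstdio>

static uint64_t alignTo(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) & ~(Align - 1);
}

int main() {
  // Hypothetical overflow arguments: i32, v3i32, v4i32, v5i32 (byte sizes).
  const uint64_t SlotSizes[] = {4, 12, 16, 20};
  uint64_t Offset = 0;
  for (uint64_t Size : SlotSizes) {
    Offset = alignTo(Offset, 4); // every slot is only 4-byte aligned
    std::printf("slot of %2llu bytes at stack offset %llu\n",
                (unsigned long long)Size, (unsigned long long)Offset);
    Offset += Size;
  }
  return 0;
}
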
diff --git a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 4dc1e67c573d..b750c6b5f6d2 100644
--- a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -1,9 +1,8 @@
//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -62,6 +61,7 @@ class AMDGPUCodeGenPrepare : public FunctionPass,
AssumptionCache *AC = nullptr;
LegacyDivergenceAnalysis *DA = nullptr;
Module *Mod = nullptr;
+ const DataLayout *DL = nullptr;
bool HasUnsafeFPMath = false;
/// Copies exact/nsw/nuw flags (if any) from binary operation \p I to
@@ -134,6 +134,16 @@ class AMDGPUCodeGenPrepare : public FunctionPass,
/// \returns True.
bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;
+
+ unsigned numBitsUnsigned(Value *Op, unsigned ScalarSize) const;
+ unsigned numBitsSigned(Value *Op, unsigned ScalarSize) const;
+ bool isI24(Value *V, unsigned ScalarSize) const;
+ bool isU24(Value *V, unsigned ScalarSize) const;
+
+ /// Replace mul instructions with llvm.amdgcn.mul.u24 or llvm.amdgcn.mul.s24.
+ /// SelectionDAG has an issue where an and asserting the bits are known
+ bool replaceMulWithMul24(BinaryOperator &I) const;
+
/// Expands 24 bit div or rem.
Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I,
Value *Num, Value *Den,
@@ -393,6 +403,118 @@ bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32(
return true;
}
+unsigned AMDGPUCodeGenPrepare::numBitsUnsigned(Value *Op,
+ unsigned ScalarSize) const {
+ KnownBits Known = computeKnownBits(Op, *DL, 0, AC);
+ return ScalarSize - Known.countMinLeadingZeros();
+}
+
+unsigned AMDGPUCodeGenPrepare::numBitsSigned(Value *Op,
+ unsigned ScalarSize) const {
+  // In order for this to be a signed 24-bit value, bit 23 must
+  // be a sign bit.
+ return ScalarSize - ComputeNumSignBits(Op, *DL, 0, AC);
+}
+
+bool AMDGPUCodeGenPrepare::isI24(Value *V, unsigned ScalarSize) const {
+ return ScalarSize >= 24 && // Types less than 24-bit should be treated
+ // as unsigned 24-bit values.
+ numBitsSigned(V, ScalarSize) < 24;
+}
+
+bool AMDGPUCodeGenPrepare::isU24(Value *V, unsigned ScalarSize) const {
+ return numBitsUnsigned(V, ScalarSize) <= 24;
+}
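
These helpers boil down to counting leading zero or sign bits and checking whether at most 24 significant bits remain. A standalone illustration of the same arithmetic on plain 32-bit constants (using GCC/Clang builtins rather than the KnownBits/ComputeNumSignBits machinery):

#include <cstdint>
#include <cstdio>

// Bits needed to hold V as an unsigned value.
static unsigned numBitsUnsigned(uint32_t V) {
  return V == 0 ? 0 : 32 - __builtin_clz(V);
}

// ScalarSize minus the number of redundant sign bits, mirroring
// numBitsSigned() above for a 32-bit value.
static unsigned numBitsSigned(int32_t V) {
  uint32_t U = V < 0 ? ~uint32_t(V) : uint32_t(V);
  unsigned SignBits = U == 0 ? 32 : __builtin_clz(U);
  return 32 - SignBits;
}

int main() {
  std::printf("0x00ffffff: %u unsigned bits, isU24 = %d\n",
              numBitsUnsigned(0x00ffffffu),
              numBitsUnsigned(0x00ffffffu) <= 24);
  std::printf("-4096:      %u signed bits,  isI24 = %d\n",
              numBitsSigned(-4096), numBitsSigned(-4096) < 24);
  return 0;
}
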
+
+static void extractValues(IRBuilder<> &Builder,
+ SmallVectorImpl<Value *> &Values, Value *V) {
+ VectorType *VT = dyn_cast<VectorType>(V->getType());
+ if (!VT) {
+ Values.push_back(V);
+ return;
+ }
+
+ for (int I = 0, E = VT->getNumElements(); I != E; ++I)
+ Values.push_back(Builder.CreateExtractElement(V, I));
+}
+
+static Value *insertValues(IRBuilder<> &Builder,
+ Type *Ty,
+ SmallVectorImpl<Value *> &Values) {
+ if (Values.size() == 1)
+ return Values[0];
+
+ Value *NewVal = UndefValue::get(Ty);
+ for (int I = 0, E = Values.size(); I != E; ++I)
+ NewVal = Builder.CreateInsertElement(NewVal, Values[I], I);
+
+ return NewVal;
+}
+
+bool AMDGPUCodeGenPrepare::replaceMulWithMul24(BinaryOperator &I) const {
+ if (I.getOpcode() != Instruction::Mul)
+ return false;
+
+ Type *Ty = I.getType();
+ unsigned Size = Ty->getScalarSizeInBits();
+ if (Size <= 16 && ST->has16BitInsts())
+ return false;
+
+ // Prefer scalar if this could be s_mul_i32
+ if (DA->isUniform(&I))
+ return false;
+
+ Value *LHS = I.getOperand(0);
+ Value *RHS = I.getOperand(1);
+ IRBuilder<> Builder(&I);
+ Builder.SetCurrentDebugLocation(I.getDebugLoc());
+
+ Intrinsic::ID IntrID = Intrinsic::not_intrinsic;
+
+ // TODO: Should this try to match mulhi24?
+ if (ST->hasMulU24() && isU24(LHS, Size) && isU24(RHS, Size)) {
+ IntrID = Intrinsic::amdgcn_mul_u24;
+ } else if (ST->hasMulI24() && isI24(LHS, Size) && isI24(RHS, Size)) {
+ IntrID = Intrinsic::amdgcn_mul_i24;
+ } else
+ return false;
+
+ SmallVector<Value *, 4> LHSVals;
+ SmallVector<Value *, 4> RHSVals;
+ SmallVector<Value *, 4> ResultVals;
+ extractValues(Builder, LHSVals, LHS);
+ extractValues(Builder, RHSVals, RHS);
+
+
+ IntegerType *I32Ty = Builder.getInt32Ty();
+ FunctionCallee Intrin = Intrinsic::getDeclaration(Mod, IntrID);
+ for (int I = 0, E = LHSVals.size(); I != E; ++I) {
+ Value *LHS, *RHS;
+ if (IntrID == Intrinsic::amdgcn_mul_u24) {
+ LHS = Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty);
+ RHS = Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty);
+ } else {
+ LHS = Builder.CreateSExtOrTrunc(LHSVals[I], I32Ty);
+ RHS = Builder.CreateSExtOrTrunc(RHSVals[I], I32Ty);
+ }
+
+ Value *Result = Builder.CreateCall(Intrin, {LHS, RHS});
+
+ if (IntrID == Intrinsic::amdgcn_mul_u24) {
+ ResultVals.push_back(Builder.CreateZExtOrTrunc(Result,
+ LHSVals[I]->getType()));
+ } else {
+ ResultVals.push_back(Builder.CreateSExtOrTrunc(Result,
+ LHSVals[I]->getType()));
+ }
+ }
+
+ I.replaceAllUsesWith(insertValues(Builder, Ty, ResultVals));
+ I.eraseFromParent();
+
+ return true;
+}
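
The rewrite is only correct because, once both operands fit in 24 bits, a 24-bit multiply reproduces the low 32 bits of the full product. A small standalone check of that property, modeling mul_u24/mul_i24 by masking or sign-extending to 24 bits (not by calling the real intrinsics):

#include <cassert>
#include <cstdint>

// Model of the unsigned 24-bit multiply: only bits [23:0] of each operand.
static uint32_t mulU24(uint32_t A, uint32_t B) {
  return (A & 0xffffffu) * (B & 0xffffffu);
}

// Model of the signed 24-bit multiply: sign-extend bit 23, keep the low 32 bits.
static uint32_t mulI24(uint32_t A, uint32_t B) {
  auto SExt24 = [](uint32_t V) -> uint32_t {
    V &= 0xffffffu;
    return (V & 0x800000u) ? (V | 0xff000000u) : V;
  };
  return SExt24(A) * SExt24(B); // unsigned wraparound == low 32 bits
}

int main() {
  // Both operands fit in 24 unsigned bits.
  uint32_t UA = 0x1234u, UB = 0xff00u;
  assert(mulU24(UA, UB) == UA * UB);

  // Both operands fit in 24 signed bits (-4096 and 100000).
  uint32_t SA = uint32_t(-4096), SB = 100000u;
  assert(mulI24(SA, SB) == SA * SB);
  return 0;
}
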
+
static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv, bool HasDenormals) {
const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
if (!CNum)
@@ -757,6 +879,9 @@ bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
DA->isUniform(&I) && promoteUniformOpToI32(I))
return true;
+ if (replaceMulWithMul24(I))
+ return true;
+
bool Changed = false;
Instruction::BinaryOps Opc = I.getOpcode();
Type *Ty = I.getType();
@@ -807,7 +932,7 @@ bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) {
Type *I32Ty = Builder.getInt32Ty();
Type *PT = PointerType::get(I32Ty, I.getPointerAddressSpace());
Value *BitCast= Builder.CreateBitCast(I.getPointerOperand(), PT);
- LoadInst *WidenLoad = Builder.CreateLoad(BitCast);
+ LoadInst *WidenLoad = Builder.CreateLoad(I32Ty, BitCast);
WidenLoad->copyMetadata(I);
// If we have range metadata, we need to convert the type, and not make
@@ -883,6 +1008,7 @@ bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
Mod = &M;
+ DL = &Mod->getDataLayout();
return false;
}
diff --git a/lib/Target/AMDGPU/AMDGPUFeatures.td b/lib/Target/AMDGPU/AMDGPUFeatures.td
index 3c7d8a8fc550..ea3952c316e4 100644
--- a/lib/Target/AMDGPU/AMDGPUFeatures.td
+++ b/lib/Target/AMDGPU/AMDGPUFeatures.td
@@ -1,9 +1,8 @@
//===-- AMDGPUFeatures.td - AMDGPU Feature Definitions -----*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -50,17 +49,12 @@ def FeatureWavefrontSize16 : SubtargetFeatureWavefrontSize<16>;
def FeatureWavefrontSize32 : SubtargetFeatureWavefrontSize<32>;
def FeatureWavefrontSize64 : SubtargetFeatureWavefrontSize<64>;
-class SubtargetFeatureGeneration <string Value, string Subtarget,
+class SubtargetFeatureGeneration <string Value, string FeatureName,
+ string Subtarget,
list<SubtargetFeature> Implies> :
- SubtargetFeature <Value, "Gen", Subtarget#"::"#Value,
+ SubtargetFeature <FeatureName, "Gen", Subtarget#"::"#Value,
Value#" GPU generation", Implies>;
-def FeatureDX10Clamp : SubtargetFeature<"dx10-clamp",
- "DX10Clamp",
- "true",
- "clamp modifier clamps NaNs to 0.0"
->;
-
def FeaturePromoteAlloca : SubtargetFeature <"promote-alloca",
"EnablePromoteAlloca",
"true",
diff --git a/lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp b/lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp
index 6e2a981d3396..9ba04d113c70 100644
--- a/lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp
+++ b/lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp
@@ -1,9 +1,8 @@
//===-- AMDGPUFixFunctionBitcasts.cpp - Fix function bitcasts -------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
diff --git a/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp b/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp
index e32ca9653b3a..e80797736363 100644
--- a/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp
@@ -1,9 +1,8 @@
//===----------------------- AMDGPUFrameLowering.cpp ----------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//==-----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/AMDGPUFrameLowering.h b/lib/Target/AMDGPU/AMDGPUFrameLowering.h
index ee836bf8a631..48b64488303e 100644
--- a/lib/Target/AMDGPU/AMDGPUFrameLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUFrameLowering.h
@@ -1,9 +1,8 @@
//===--------------------- AMDGPUFrameLowering.h ----------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/AMDGPUGISel.td b/lib/Target/AMDGPU/AMDGPUGISel.td
index 59bb2a16e0f3..cad4c2ef404c 100644
--- a/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -1,9 +1,8 @@
//===-- AMDGPUGIsel.td - AMDGPU GlobalISel Patterns---------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// This file contains patterns that should only be used by GlobalISel. For
@@ -13,6 +12,10 @@
include "AMDGPU.td"
+def p0 : PtrValueType<i64, 0>;
+def p1 : PtrValueType<i64, 1>;
+def p4 : PtrValueType<i64, 4>;
+
def sd_vsrc0 : ComplexPattern<i32, 1, "">;
def gi_vsrc0 :
GIComplexOperandMatcher<s32, "selectVSRC0">,
@@ -35,6 +38,33 @@ def gi_vop3omods :
GIComplexOperandMatcher<s32, "selectVOP3OMods">,
GIComplexPatternEquiv<VOP3OMods>;
+def gi_smrd_imm :
+ GIComplexOperandMatcher<s64, "selectSmrdImm">,
+ GIComplexPatternEquiv<SMRDImm>;
+
+def gi_smrd_imm32 :
+ GIComplexOperandMatcher<s64, "selectSmrdImm32">,
+ GIComplexPatternEquiv<SMRDImm32>;
+
+def gi_smrd_sgpr :
+ GIComplexOperandMatcher<s64, "selectSmrdSgpr">,
+ GIComplexPatternEquiv<SMRDSgpr>;
+
+def gi_flat_offset :
+ GIComplexOperandMatcher<s64, "selectFlatOffset">,
+ GIComplexPatternEquiv<FLATOffset>;
+def gi_flat_offset_signed :
+ GIComplexOperandMatcher<s64, "selectFlatOffsetSigned">,
+ GIComplexPatternEquiv<FLATOffsetSigned>;
+
+def gi_mubuf_scratch_offset :
+ GIComplexOperandMatcher<s32, "selectMUBUFScratchOffset">,
+ GIComplexPatternEquiv<MUBUFScratchOffset>;
+def gi_mubuf_scratch_offen :
+ GIComplexOperandMatcher<s32, "selectMUBUFScratchOffen">,
+ GIComplexPatternEquiv<MUBUFScratchOffen>;
+
+
class GISelSop2Pat <
SDPatternOperator node,
Instruction inst,
@@ -113,15 +143,6 @@ multiclass GISelVop2IntrPat <
def : GISelSop2Pat <or, S_OR_B32, i32>;
def : GISelVop2Pat <or, V_OR_B32_e32, i32>;
-def : GISelSop2Pat <sra, S_ASHR_I32, i32>;
-let AddedComplexity = 100 in {
-let SubtargetPredicate = isSICI in {
-def : GISelVop2Pat <sra, V_ASHR_I32_e32, i32>;
-}
-def : GISelVop2CommutePat <sra, V_ASHRREV_I32_e32, i32>;
-}
-def : GISelVop3Pat2CommutePat <sra, V_ASHRREV_I32_e64, i32>;
-
// FIXME: We can't re-use SelectionDAG patterns here because they match
// against a custom SDNode and we would need to create a generic machine
// instruction that is equivalent to the custom SDNode. This would also require
@@ -135,3 +156,11 @@ defm : GISelVop2IntrPat <int_maxnum, V_MAX_F32_e32, f32>;
def : GISelVop3Pat2ModsPat <int_maxnum, V_MAX_F64, f64>;
defm : GISelVop2IntrPat <int_minnum, V_MIN_F32_e32, f32>;
def : GISelVop3Pat2ModsPat <int_minnum, V_MIN_F64, f64>;
+
+// Since GlobalISel is more flexible than SelectionDAG, I think we can get
+// away with adding patterns for integer types and not legalizing all
+// loads and stores to vector types. This should help simplify the load/store
+// legalization.
+foreach Ty = [i64, p0, p1, p4] in {
+ defm : SMRD_Pattern <"S_LOAD_DWORDX2", Ty>;
+}
diff --git a/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def b/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
index 6eab59ab4e09..0a1f48231b18 100644
--- a/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
+++ b/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
@@ -1,9 +1,8 @@
//===- AMDGPUGenRegisterBankInfo.def -----------------------------*- C++ -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
@@ -92,6 +91,28 @@ const RegisterBankInfo::ValueMapping ValMappings[] {
{&PartMappings[17], 1}
};
+const RegisterBankInfo::PartialMapping SGPROnly64BreakDown[] {
+ /*32-bit op*/ {0, 32, SGPRRegBank},
+ /*2x32-bit op*/ {0, 32, SGPRRegBank},
+ {32, 32, SGPRRegBank},
+/*<2x32-bit> op*/ {0, 64, SGPRRegBank},
+
+ /*32-bit op*/ {0, 32, VGPRRegBank},
+ /*2x32-bit op*/ {0, 32, VGPRRegBank},
+ {32, 32, VGPRRegBank},
+};
+
+
+// For some instructions which can only operate on 64 bits in the scalar version.
+const RegisterBankInfo::ValueMapping ValMappingsSGPR64OnlyVGPR32[] {
+ /*32-bit sgpr*/ {&SGPROnly64BreakDown[0], 1},
+ /*2 x 32-bit sgpr*/ {&SGPROnly64BreakDown[1], 2},
+ /*64-bit sgpr */ {&SGPROnly64BreakDown[3], 1},
+
+ /*32-bit vgpr*/ {&SGPROnly64BreakDown[4], 1},
+ /*2 x 32-bit vgpr*/ {&SGPROnly64BreakDown[5], 2}
+};
+
enum ValueMappingIdx {
SCCStartIdx = 0,
SGPRStartIdx = 2,
@@ -128,5 +149,89 @@ const RegisterBankInfo::ValueMapping *getValueMapping(unsigned BankID,
return &ValMappings[Idx];
}
+const RegisterBankInfo::ValueMapping *getValueMappingSGPR64Only(unsigned BankID,
+ unsigned Size) {
+ if (Size != 64)
+ return getValueMapping(BankID, Size);
+
+ if (BankID == AMDGPU::VGPRRegBankID)
+ return &ValMappingsSGPR64OnlyVGPR32[4];
+
+ assert(BankID == AMDGPU::SGPRRegBankID);
+ return &ValMappingsSGPR64OnlyVGPR32[2];
+}
+
+const RegisterBankInfo::PartialMapping LoadSGPROnlyBreakDown[] {
+ /* 256-bit load */ {0, 256, SGPRRegBank},
+ /* 512-bit load */ {0, 512, SGPRRegBank},
+ /* 8 32-bit loads */ {0, 32, VGPRRegBank}, {32, 32, VGPRRegBank},
+ {64, 32, VGPRRegBank}, {96, 32, VGPRRegBank},
+ {128, 32, VGPRRegBank}, {160, 32, VGPRRegBank},
+ {192, 32, VGPRRegBank}, {224, 32, VGPRRegBank},
+ /* 16 32-bit loads */ {0, 32, VGPRRegBank}, {32, 32, VGPRRegBank},
+ {64, 32, VGPRRegBank}, {96, 32, VGPRRegBank},
+ {128, 32, VGPRRegBank}, {160, 32, VGPRRegBank},
+ {192, 32, VGPRRegBank}, {224, 32, VGPRRegBank},
+ {256, 32, VGPRRegBank}, {288, 32, VGPRRegBank},
+ {320, 32, VGPRRegBank}, {352, 32, VGPRRegBank},
+ {384, 32, VGPRRegBank}, {416, 32, VGPRRegBank},
+ {448, 32, VGPRRegBank}, {480, 32, VGPRRegBank},
+ /* 4 64-bit loads */ {0, 64, VGPRRegBank}, {64, 64, VGPRRegBank},
+ {128, 64, VGPRRegBank}, {192, 64, VGPRRegBank},
+ /* 8 64-bit loads */ {0, 64, VGPRRegBank}, {64, 64, VGPRRegBank},
+ {128, 64, VGPRRegBank}, {192, 64, VGPRRegBank},
+ {256, 64, VGPRRegBank}, {320, 64, VGPRRegBank},
+ {384, 64, VGPRRegBank}, {448, 64, VGPRRegBank},
+
+ /* FIXME: The generic register bank select does not support complex
+ * break downs where the number of vector elements does not equal the
+ * number of breakdowns.
+ * FIXME: register bank select now tries to handle complex break downs,
+ * but it emits an illegal instruction:
+ * %1:vgpr(<8 x s32>) = G_CONCAT_VECTORS %2:vgpr(s128), %3:vgpr(s128)
+ */
+ /* 2 128-bit loads */ {0, 128, VGPRRegBank}, {128, 128, VGPRRegBank},
+ /* 4 128-bit loads */ {0, 128, VGPRRegBank}, {128, 128, VGPRRegBank},
+ {256, 128, VGPRRegBank}, {384, 128, VGPRRegBank}
+};
+
+const RegisterBankInfo::ValueMapping ValMappingsLoadSGPROnly[] {
+ /* 256-bit load */ {&LoadSGPROnlyBreakDown[0], 1},
+ /* 512-bit load */ {&LoadSGPROnlyBreakDown[1], 1},
+ /* <8 x i32> load */ {&LoadSGPROnlyBreakDown[2], 8},
+ /* <16 x i32> load */ {&LoadSGPROnlyBreakDown[10], 16},
+ /* <4 x i64> load */ {&LoadSGPROnlyBreakDown[26], 4},
+ /* <8 x i64> load */ {&LoadSGPROnlyBreakDown[30], 8}
+};
+
+const RegisterBankInfo::ValueMapping *
+getValueMappingLoadSGPROnly(unsigned BankID, LLT SizeTy) {
+ unsigned Size = SizeTy.getSizeInBits();
+ if (Size < 256 || BankID == AMDGPU::SGPRRegBankID)
+ return getValueMapping(BankID, Size);
+
+ assert((Size == 256 || Size == 512) && BankID == AMDGPU::VGPRRegBankID);
+
+  // Default to using the non-split ValueMappings; we will use these if
+ // the register bank is SGPR or if we don't know how to handle the vector
+ // type.
+ unsigned Idx = Size == 256 ? 0 : 1;
+
+ // We need to split this load if it has a vgpr pointer.
+ if (BankID == AMDGPU::VGPRRegBankID) {
+ if (SizeTy == LLT::vector(8, 32))
+ Idx = 2;
+ else if (SizeTy == LLT::vector(16, 32))
+ Idx = 3;
+ else if (SizeTy == LLT::vector(4, 64))
+ Idx = 4;
+ else if (SizeTy == LLT::vector(8, 64))
+ Idx = 5;
+ }
+
+ return &ValMappingsLoadSGPROnly[Idx];
+}
+
+
} // End AMDGPU namespace.
} // End llvm namespace.
diff --git a/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
index c38b0e61558b..b31de0af5018 100644
--- a/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -1,9 +1,8 @@
//===--- AMDGPUHSAMetadataStreamer.cpp --------------------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -240,23 +239,7 @@ MetadataStreamerV2::getHSACodeProps(const MachineFunction &MF,
Kernel::DebugProps::Metadata
MetadataStreamerV2::getHSADebugProps(const MachineFunction &MF,
const SIProgramInfo &ProgramInfo) const {
- const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
- HSAMD::Kernel::DebugProps::Metadata HSADebugProps;
-
- if (!STM.debuggerSupported())
- return HSADebugProps;
-
- HSADebugProps.mDebuggerABIVersion.push_back(1);
- HSADebugProps.mDebuggerABIVersion.push_back(0);
-
- if (STM.debuggerEmitPrologue()) {
- HSADebugProps.mPrivateSegmentBufferSGPR =
- ProgramInfo.DebuggerPrivateSegmentBufferSGPR;
- HSADebugProps.mWavefrontPrivateSegmentOffsetSGPR =
- ProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR;
- }
-
- return HSADebugProps;
+ return HSAMD::Kernel::DebugProps::Metadata();
}
void MetadataStreamerV2::emitVersion() {
@@ -452,6 +435,10 @@ void MetadataStreamerV2::emitHiddenKernelArgs(const Function &Func) {
emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenNone);
}
}
+
+ // Emit the pointer argument for multi-grid object.
+ if (HiddenArgNumBytes >= 56)
+ emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenMultiGridSyncArg);
}
bool MetadataStreamerV2::emitTo(AMDGPUTargetStreamer &TargetStreamer) {
@@ -506,20 +493,16 @@ void MetadataStreamerV3::dump(StringRef HSAMetadataString) const {
void MetadataStreamerV3::verify(StringRef HSAMetadataString) const {
errs() << "AMDGPU HSA Metadata Parser Test: ";
- std::shared_ptr<msgpack::Node> FromHSAMetadataString =
- std::make_shared<msgpack::MapNode>();
+ msgpack::Document FromHSAMetadataString;
- yaml::Input YIn(HSAMetadataString);
- YIn >> FromHSAMetadataString;
- if (YIn.error()) {
+ if (!FromHSAMetadataString.fromYAML(HSAMetadataString)) {
errs() << "FAIL\n";
return;
}
std::string ToHSAMetadataString;
raw_string_ostream StrOS(ToHSAMetadataString);
- yaml::Output YOut(StrOS);
- YOut << FromHSAMetadataString;
+ FromHSAMetadataString.toYAML(StrOS);
errs() << (HSAMetadataString == StrOS.str() ? "PASS" : "FAIL") << '\n';
if (HSAMetadataString != ToHSAMetadataString) {
@@ -653,23 +636,23 @@ std::string MetadataStreamerV3::getTypeName(Type *Ty, bool Signed) const {
}
}
-std::shared_ptr<msgpack::ArrayNode>
+msgpack::ArrayDocNode
MetadataStreamerV3::getWorkGroupDimensions(MDNode *Node) const {
- auto Dims = std::make_shared<msgpack::ArrayNode>();
+ auto Dims = HSAMetadataDoc->getArrayNode();
if (Node->getNumOperands() != 3)
return Dims;
for (auto &Op : Node->operands())
- Dims->push_back(std::make_shared<msgpack::ScalarNode>(
- mdconst::extract<ConstantInt>(Op)->getZExtValue()));
+ Dims.push_back(Dims.getDocument()->getNode(
+ uint64_t(mdconst::extract<ConstantInt>(Op)->getZExtValue())));
return Dims;
}
void MetadataStreamerV3::emitVersion() {
- auto Version = std::make_shared<msgpack::ArrayNode>();
- Version->push_back(std::make_shared<msgpack::ScalarNode>(V3::VersionMajor));
- Version->push_back(std::make_shared<msgpack::ScalarNode>(V3::VersionMinor));
- getRootMetadata("amdhsa.version") = std::move(Version);
+ auto Version = HSAMetadataDoc->getArrayNode();
+ Version.push_back(Version.getDocument()->getNode(VersionMajor));
+ Version.push_back(Version.getDocument()->getNode(VersionMinor));
+ getRootMetadata("amdhsa.version") = Version;
}
void MetadataStreamerV3::emitPrintf(const Module &Mod) {
@@ -677,16 +660,16 @@ void MetadataStreamerV3::emitPrintf(const Module &Mod) {
if (!Node)
return;
- auto Printf = std::make_shared<msgpack::ArrayNode>();
+ auto Printf = HSAMetadataDoc->getArrayNode();
for (auto Op : Node->operands())
if (Op->getNumOperands())
- Printf->push_back(std::make_shared<msgpack::ScalarNode>(
- cast<MDString>(Op->getOperand(0))->getString()));
- getRootMetadata("amdhsa.printf") = std::move(Printf);
+ Printf.push_back(Printf.getDocument()->getNode(
+ cast<MDString>(Op->getOperand(0))->getString(), /*Copy=*/true));
+ getRootMetadata("amdhsa.printf") = Printf;
}
void MetadataStreamerV3::emitKernelLanguage(const Function &Func,
- msgpack::MapNode &Kern) {
+ msgpack::MapDocNode Kern) {
// TODO: What about other languages?
auto Node = Func.getParent()->getNamedMetadata("opencl.ocl.version");
if (!Node || !Node->getNumOperands())
@@ -695,77 +678,50 @@ void MetadataStreamerV3::emitKernelLanguage(const Function &Func,
if (Op0->getNumOperands() <= 1)
return;
- Kern[".language"] = std::make_shared<msgpack::ScalarNode>("OpenCL C");
- auto LanguageVersion = std::make_shared<msgpack::ArrayNode>();
- LanguageVersion->push_back(std::make_shared<msgpack::ScalarNode>(
+ Kern[".language"] = Kern.getDocument()->getNode("OpenCL C");
+ auto LanguageVersion = Kern.getDocument()->getArrayNode();
+ LanguageVersion.push_back(Kern.getDocument()->getNode(
mdconst::extract<ConstantInt>(Op0->getOperand(0))->getZExtValue()));
- LanguageVersion->push_back(std::make_shared<msgpack::ScalarNode>(
+ LanguageVersion.push_back(Kern.getDocument()->getNode(
mdconst::extract<ConstantInt>(Op0->getOperand(1))->getZExtValue()));
- Kern[".language_version"] = std::move(LanguageVersion);
+ Kern[".language_version"] = LanguageVersion;
}
void MetadataStreamerV3::emitKernelAttrs(const Function &Func,
- msgpack::MapNode &Kern) {
+ msgpack::MapDocNode Kern) {
if (auto Node = Func.getMetadata("reqd_work_group_size"))
Kern[".reqd_workgroup_size"] = getWorkGroupDimensions(Node);
if (auto Node = Func.getMetadata("work_group_size_hint"))
Kern[".workgroup_size_hint"] = getWorkGroupDimensions(Node);
if (auto Node = Func.getMetadata("vec_type_hint")) {
- Kern[".vec_type_hint"] = std::make_shared<msgpack::ScalarNode>(getTypeName(
- cast<ValueAsMetadata>(Node->getOperand(0))->getType(),
- mdconst::extract<ConstantInt>(Node->getOperand(1))->getZExtValue()));
+ Kern[".vec_type_hint"] = Kern.getDocument()->getNode(
+ getTypeName(
+ cast<ValueAsMetadata>(Node->getOperand(0))->getType(),
+ mdconst::extract<ConstantInt>(Node->getOperand(1))->getZExtValue()),
+ /*Copy=*/true);
}
if (Func.hasFnAttribute("runtime-handle")) {
- Kern[".device_enqueue_symbol"] = std::make_shared<msgpack::ScalarNode>(
- Func.getFnAttribute("runtime-handle").getValueAsString().str());
+ Kern[".device_enqueue_symbol"] = Kern.getDocument()->getNode(
+ Func.getFnAttribute("runtime-handle").getValueAsString().str(),
+ /*Copy=*/true);
}
}
void MetadataStreamerV3::emitKernelArgs(const Function &Func,
- msgpack::MapNode &Kern) {
+ msgpack::MapDocNode Kern) {
unsigned Offset = 0;
- auto Args = std::make_shared<msgpack::ArrayNode>();
+ auto Args = HSAMetadataDoc->getArrayNode();
for (auto &Arg : Func.args())
- emitKernelArg(Arg, Offset, *Args);
-
- emitHiddenKernelArgs(Func, Offset, *Args);
-
- // TODO: What about other languages?
- if (Func.getParent()->getNamedMetadata("opencl.ocl.version")) {
- auto &DL = Func.getParent()->getDataLayout();
- auto Int64Ty = Type::getInt64Ty(Func.getContext());
-
- emitKernelArg(DL, Int64Ty, "hidden_global_offset_x", Offset, *Args);
- emitKernelArg(DL, Int64Ty, "hidden_global_offset_y", Offset, *Args);
- emitKernelArg(DL, Int64Ty, "hidden_global_offset_z", Offset, *Args);
-
- auto Int8PtrTy =
- Type::getInt8PtrTy(Func.getContext(), AMDGPUAS::GLOBAL_ADDRESS);
+ emitKernelArg(Arg, Offset, Args);
- // Emit "printf buffer" argument if printf is used, otherwise emit dummy
- // "none" argument.
- if (Func.getParent()->getNamedMetadata("llvm.printf.fmts"))
- emitKernelArg(DL, Int8PtrTy, "hidden_printf_buffer", Offset, *Args);
- else
- emitKernelArg(DL, Int8PtrTy, "hidden_none", Offset, *Args);
+ emitHiddenKernelArgs(Func, Offset, Args);
- // Emit "default queue" and "completion action" arguments if enqueue kernel
- // is used, otherwise emit dummy "none" arguments.
- if (Func.hasFnAttribute("calls-enqueue-kernel")) {
- emitKernelArg(DL, Int8PtrTy, "hidden_default_queue", Offset, *Args);
- emitKernelArg(DL, Int8PtrTy, "hidden_completion_action", Offset, *Args);
- } else {
- emitKernelArg(DL, Int8PtrTy, "hidden_none", Offset, *Args);
- emitKernelArg(DL, Int8PtrTy, "hidden_none", Offset, *Args);
- }
- }
-
- Kern[".args"] = std::move(Args);
+ Kern[".args"] = Args;
}
void MetadataStreamerV3::emitKernelArg(const Argument &Arg, unsigned &Offset,
- msgpack::ArrayNode &Args) {
+ msgpack::ArrayDocNode Args) {
auto Func = Arg.getParent();
auto ArgNo = Arg.getArgNo();
const MDNode *Node;
@@ -822,36 +778,35 @@ void MetadataStreamerV3::emitKernelArg(const Argument &Arg, unsigned &Offset,
void MetadataStreamerV3::emitKernelArg(const DataLayout &DL, Type *Ty,
StringRef ValueKind, unsigned &Offset,
- msgpack::ArrayNode &Args,
+ msgpack::ArrayDocNode Args,
unsigned PointeeAlign, StringRef Name,
StringRef TypeName,
StringRef BaseTypeName,
StringRef AccQual, StringRef TypeQual) {
- auto ArgPtr = std::make_shared<msgpack::MapNode>();
- auto &Arg = *ArgPtr;
+ auto Arg = Args.getDocument()->getMapNode();
if (!Name.empty())
- Arg[".name"] = std::make_shared<msgpack::ScalarNode>(Name);
+ Arg[".name"] = Arg.getDocument()->getNode(Name, /*Copy=*/true);
if (!TypeName.empty())
- Arg[".type_name"] = std::make_shared<msgpack::ScalarNode>(TypeName);
+ Arg[".type_name"] = Arg.getDocument()->getNode(TypeName, /*Copy=*/true);
auto Size = DL.getTypeAllocSize(Ty);
auto Align = DL.getABITypeAlignment(Ty);
- Arg[".size"] = std::make_shared<msgpack::ScalarNode>(Size);
+ Arg[".size"] = Arg.getDocument()->getNode(Size);
Offset = alignTo(Offset, Align);
- Arg[".offset"] = std::make_shared<msgpack::ScalarNode>(Offset);
+ Arg[".offset"] = Arg.getDocument()->getNode(Offset);
Offset += Size;
- Arg[".value_kind"] = std::make_shared<msgpack::ScalarNode>(ValueKind);
+ Arg[".value_kind"] = Arg.getDocument()->getNode(ValueKind, /*Copy=*/true);
Arg[".value_type"] =
- std::make_shared<msgpack::ScalarNode>(getValueType(Ty, BaseTypeName));
+ Arg.getDocument()->getNode(getValueType(Ty, BaseTypeName), /*Copy=*/true);
if (PointeeAlign)
- Arg[".pointee_align"] = std::make_shared<msgpack::ScalarNode>(PointeeAlign);
+ Arg[".pointee_align"] = Arg.getDocument()->getNode(PointeeAlign);
if (auto PtrTy = dyn_cast<PointerType>(Ty))
if (auto Qualifier = getAddressSpaceQualifier(PtrTy->getAddressSpace()))
- Arg[".address_space"] = std::make_shared<msgpack::ScalarNode>(*Qualifier);
+ Arg[".address_space"] = Arg.getDocument()->getNode(*Qualifier, /*Copy=*/true);
if (auto AQ = getAccessQualifier(AccQual))
- Arg[".access"] = std::make_shared<msgpack::ScalarNode>(*AQ);
+ Arg[".access"] = Arg.getDocument()->getNode(*AQ, /*Copy=*/true);
// TODO: Emit Arg[".actual_access"].
@@ -859,21 +814,21 @@ void MetadataStreamerV3::emitKernelArg(const DataLayout &DL, Type *Ty,
TypeQual.split(SplitTypeQuals, " ", -1, false);
for (StringRef Key : SplitTypeQuals) {
if (Key == "const")
- Arg[".is_const"] = std::make_shared<msgpack::ScalarNode>(true);
+ Arg[".is_const"] = Arg.getDocument()->getNode(true);
else if (Key == "restrict")
- Arg[".is_restrict"] = std::make_shared<msgpack::ScalarNode>(true);
+ Arg[".is_restrict"] = Arg.getDocument()->getNode(true);
else if (Key == "volatile")
- Arg[".is_volatile"] = std::make_shared<msgpack::ScalarNode>(true);
+ Arg[".is_volatile"] = Arg.getDocument()->getNode(true);
else if (Key == "pipe")
- Arg[".is_pipe"] = std::make_shared<msgpack::ScalarNode>(true);
+ Arg[".is_pipe"] = Arg.getDocument()->getNode(true);
}
- Args.push_back(std::move(ArgPtr));
+ Args.push_back(Arg);
}
void MetadataStreamerV3::emitHiddenKernelArgs(const Function &Func,
unsigned &Offset,
- msgpack::ArrayNode &Args) {
+ msgpack::ArrayDocNode Args) {
int HiddenArgNumBytes =
getIntegerAttribute(Func, "amdgpu-implicitarg-num-bytes", 0);
@@ -913,56 +868,58 @@ void MetadataStreamerV3::emitHiddenKernelArgs(const Function &Func,
emitKernelArg(DL, Int8PtrTy, "hidden_none", Offset, Args);
}
}
+
+ // Emit the pointer argument for multi-grid object.
+ if (HiddenArgNumBytes >= 56)
+ emitKernelArg(DL, Int8PtrTy, "hidden_multigrid_sync_arg", Offset, Args);
}
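
For context on the new slot: the hidden kernel arguments are appended in fixed 8-byte slots gated on the value of the "amdgpu-implicitarg-num-bytes" attribute, and this patch adds one more pointer slot, the multi-grid sync argument, once that value reaches 56. A rough sketch of the resulting layout; only the >= 56 slot is taken from this change, and the lower thresholds are an assumption about the surrounding streamer code:

#include <cstdio>

int main() {
  const int HiddenArgNumBytes = 56; // hypothetical attribute value
  if (HiddenArgNumBytes >= 24)
    std::puts("hidden_global_offset_x/y/z          (3 x i64)");
  if (HiddenArgNumBytes >= 32)
    std::puts("hidden_printf_buffer or hidden_none (i8*)");
  if (HiddenArgNumBytes >= 48)
    std::puts("hidden_default_queue + hidden_completion_action, or 2 x hidden_none");
  if (HiddenArgNumBytes >= 56)
    std::puts("hidden_multigrid_sync_arg           (i8*, added here)");
  return 0;
}
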
-std::shared_ptr<msgpack::MapNode>
+msgpack::MapDocNode
MetadataStreamerV3::getHSAKernelProps(const MachineFunction &MF,
const SIProgramInfo &ProgramInfo) const {
const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
const Function &F = MF.getFunction();
- auto HSAKernelProps = std::make_shared<msgpack::MapNode>();
- auto &Kern = *HSAKernelProps;
+ auto Kern = HSAMetadataDoc->getMapNode();
unsigned MaxKernArgAlign;
- Kern[".kernarg_segment_size"] = std::make_shared<msgpack::ScalarNode>(
+ Kern[".kernarg_segment_size"] = Kern.getDocument()->getNode(
STM.getKernArgSegmentSize(F, MaxKernArgAlign));
Kern[".group_segment_fixed_size"] =
- std::make_shared<msgpack::ScalarNode>(ProgramInfo.LDSSize);
+ Kern.getDocument()->getNode(ProgramInfo.LDSSize);
Kern[".private_segment_fixed_size"] =
- std::make_shared<msgpack::ScalarNode>(ProgramInfo.ScratchSize);
+ Kern.getDocument()->getNode(ProgramInfo.ScratchSize);
Kern[".kernarg_segment_align"] =
- std::make_shared<msgpack::ScalarNode>(std::max(uint32_t(4), MaxKernArgAlign));
+ Kern.getDocument()->getNode(std::max(uint32_t(4), MaxKernArgAlign));
Kern[".wavefront_size"] =
- std::make_shared<msgpack::ScalarNode>(STM.getWavefrontSize());
- Kern[".sgpr_count"] = std::make_shared<msgpack::ScalarNode>(ProgramInfo.NumSGPR);
- Kern[".vgpr_count"] = std::make_shared<msgpack::ScalarNode>(ProgramInfo.NumVGPR);
+ Kern.getDocument()->getNode(STM.getWavefrontSize());
+ Kern[".sgpr_count"] = Kern.getDocument()->getNode(ProgramInfo.NumSGPR);
+ Kern[".vgpr_count"] = Kern.getDocument()->getNode(ProgramInfo.NumVGPR);
Kern[".max_flat_workgroup_size"] =
- std::make_shared<msgpack::ScalarNode>(MFI.getMaxFlatWorkGroupSize());
+ Kern.getDocument()->getNode(MFI.getMaxFlatWorkGroupSize());
Kern[".sgpr_spill_count"] =
- std::make_shared<msgpack::ScalarNode>(MFI.getNumSpilledSGPRs());
+ Kern.getDocument()->getNode(MFI.getNumSpilledSGPRs());
Kern[".vgpr_spill_count"] =
- std::make_shared<msgpack::ScalarNode>(MFI.getNumSpilledVGPRs());
+ Kern.getDocument()->getNode(MFI.getNumSpilledVGPRs());
- return HSAKernelProps;
+ return Kern;
}
bool MetadataStreamerV3::emitTo(AMDGPUTargetStreamer &TargetStreamer) {
- return TargetStreamer.EmitHSAMetadata(getHSAMetadataRoot(), true);
+ return TargetStreamer.EmitHSAMetadata(*HSAMetadataDoc, true);
}
void MetadataStreamerV3::begin(const Module &Mod) {
emitVersion();
emitPrintf(Mod);
- getRootMetadata("amdhsa.kernels").reset(new msgpack::ArrayNode());
+ getRootMetadata("amdhsa.kernels") = HSAMetadataDoc->getArrayNode();
}
void MetadataStreamerV3::end() {
std::string HSAMetadataString;
raw_string_ostream StrOS(HSAMetadataString);
- yaml::Output YOut(StrOS);
- YOut << HSAMetadataRoot;
+ HSAMetadataDoc->toYAML(StrOS);
if (DumpHSAMetadata)
dump(StrOS.str());
@@ -973,25 +930,24 @@ void MetadataStreamerV3::end() {
void MetadataStreamerV3::emitKernel(const MachineFunction &MF,
const SIProgramInfo &ProgramInfo) {
auto &Func = MF.getFunction();
- auto KernelProps = getHSAKernelProps(MF, ProgramInfo);
+ auto Kern = getHSAKernelProps(MF, ProgramInfo);
assert(Func.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
Func.getCallingConv() == CallingConv::SPIR_KERNEL);
- auto &KernelsNode = getRootMetadata("amdhsa.kernels");
- auto Kernels = cast<msgpack::ArrayNode>(KernelsNode.get());
+ auto Kernels =
+ getRootMetadata("amdhsa.kernels").getArray(/*Convert=*/true);
{
- auto &Kern = *KernelProps;
- Kern[".name"] = std::make_shared<msgpack::ScalarNode>(Func.getName());
- Kern[".symbol"] = std::make_shared<msgpack::ScalarNode>(
- (Twine(Func.getName()) + Twine(".kd")).str());
+ Kern[".name"] = Kern.getDocument()->getNode(Func.getName());
+ Kern[".symbol"] = Kern.getDocument()->getNode(
+ (Twine(Func.getName()) + Twine(".kd")).str(), /*Copy=*/true);
emitKernelLanguage(Func, Kern);
emitKernelAttrs(Func, Kern);
emitKernelArgs(Func, Kern);
}
- Kernels->push_back(std::move(KernelProps));
+ Kernels.push_back(Kern);
}
} // end namespace HSAMD
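
As a reference for the new API shape: the streamer now builds metadata through msgpack::Document nodes instead of shared_ptr-owned msgpack::Node trees. A minimal standalone sketch using the same calls that appear above (getRoot, getMapNode, getArrayNode, getNode, toYAML); the key names and values are made up for illustration:

#include "llvm/BinaryFormat/MsgPackDocument.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  msgpack::Document Doc;
  auto Root = Doc.getRoot().getMap(/*Convert=*/true);

  // Scalars come from the owning document; strings whose storage may go away
  // are copied into the document with /*Copy=*/true, as in the streamer.
  auto Kern = Doc.getMapNode();
  Kern[".name"] = Doc.getNode("example_kernel", /*Copy=*/true);
  Kern[".wavefront_size"] = Doc.getNode(uint64_t(64));

  auto Kernels = Doc.getArrayNode();
  Kernels.push_back(Kern);
  Root["amdhsa.kernels"] = Kernels;

  // Serialize the whole document as YAML, mirroring MetadataStreamerV3::end().
  Doc.toYAML(outs());
  return 0;
}
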
diff --git a/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
index afc09baf952d..2eecddbd7b01 100644
--- a/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
+++ b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
@@ -1,9 +1,8 @@
//===--- AMDGPUHSAMetadataStreamer.h ----------------------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -19,7 +18,7 @@
#include "AMDGPU.h"
#include "AMDKernelCodeT.h"
#include "llvm/ADT/StringRef.h"
-#include "llvm/BinaryFormat/MsgPackTypes.h"
+#include "llvm/BinaryFormat/MsgPackDocument.h"
#include "llvm/Support/AMDGPUMetadata.h"
namespace llvm {
@@ -52,8 +51,8 @@ public:
class MetadataStreamerV3 final : public MetadataStreamer {
private:
- std::shared_ptr<msgpack::Node> HSAMetadataRoot =
- std::make_shared<msgpack::MapNode>();
+ std::unique_ptr<msgpack::Document> HSAMetadataDoc =
+ llvm::make_unique<msgpack::Document>();
void dump(StringRef HSAMetadataString) const;
@@ -70,41 +69,39 @@ private:
std::string getTypeName(Type *Ty, bool Signed) const;
- std::shared_ptr<msgpack::ArrayNode>
- getWorkGroupDimensions(MDNode *Node) const;
+ msgpack::ArrayDocNode getWorkGroupDimensions(MDNode *Node) const;
- std::shared_ptr<msgpack::MapNode>
- getHSAKernelProps(const MachineFunction &MF,
- const SIProgramInfo &ProgramInfo) const;
+ msgpack::MapDocNode getHSAKernelProps(const MachineFunction &MF,
+ const SIProgramInfo &ProgramInfo) const;
void emitVersion();
void emitPrintf(const Module &Mod);
- void emitKernelLanguage(const Function &Func, msgpack::MapNode &Kern);
+ void emitKernelLanguage(const Function &Func, msgpack::MapDocNode Kern);
- void emitKernelAttrs(const Function &Func, msgpack::MapNode &Kern);
+ void emitKernelAttrs(const Function &Func, msgpack::MapDocNode Kern);
- void emitKernelArgs(const Function &Func, msgpack::MapNode &Kern);
+ void emitKernelArgs(const Function &Func, msgpack::MapDocNode Kern);
void emitKernelArg(const Argument &Arg, unsigned &Offset,
- msgpack::ArrayNode &Args);
+ msgpack::ArrayDocNode Args);
void emitKernelArg(const DataLayout &DL, Type *Ty, StringRef ValueKind,
- unsigned &Offset, msgpack::ArrayNode &Args,
+ unsigned &Offset, msgpack::ArrayDocNode Args,
unsigned PointeeAlign = 0, StringRef Name = "",
StringRef TypeName = "", StringRef BaseTypeName = "",
StringRef AccQual = "", StringRef TypeQual = "");
void emitHiddenKernelArgs(const Function &Func, unsigned &Offset,
- msgpack::ArrayNode &Args);
+ msgpack::ArrayDocNode Args);
- std::shared_ptr<msgpack::Node> &getRootMetadata(StringRef Key) {
- return (*cast<msgpack::MapNode>(HSAMetadataRoot.get()))[Key];
+ msgpack::DocNode &getRootMetadata(StringRef Key) {
+ return HSAMetadataDoc->getRoot().getMap(/*Convert=*/true)[Key];
}
- std::shared_ptr<msgpack::Node> &getHSAMetadataRoot() {
- return HSAMetadataRoot;
+ msgpack::DocNode &getHSAMetadataRoot() {
+ return HSAMetadataDoc->getRoot();
}
public:
diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index a0a045e72a58..ea730539f834 100644
--- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1,9 +1,8 @@
//===-- AMDGPUISelDAGToDAG.cpp - A dag to dag inst selector for AMDGPU ----===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//==-----------------------------------------------------------------------===//
//
@@ -40,6 +39,9 @@
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/BasicBlock.h"
+#ifdef EXPENSIVE_CHECKS
+#include "llvm/IR/Dominators.h"
+#endif
#include "llvm/IR/Instruction.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/Casting.h"
@@ -52,6 +54,8 @@
#include <new>
#include <vector>
+#define DEBUG_TYPE "isel"
+
using namespace llvm;
namespace llvm {
@@ -66,6 +70,57 @@ class R600InstrInfo;
namespace {
+static bool isNullConstantOrUndef(SDValue V) {
+ if (V.isUndef())
+ return true;
+
+ ConstantSDNode *Const = dyn_cast<ConstantSDNode>(V);
+ return Const != nullptr && Const->isNullValue();
+}
+
+static bool getConstantValue(SDValue N, uint32_t &Out) {
+  // This is only used for packed vectors, where using 0 for undef should
+ // always be good.
+ if (N.isUndef()) {
+ Out = 0;
+ return true;
+ }
+
+ if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) {
+ Out = C->getAPIntValue().getSExtValue();
+ return true;
+ }
+
+ if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N)) {
+ Out = C->getValueAPF().bitcastToAPInt().getSExtValue();
+ return true;
+ }
+
+ return false;
+}
+
+// TODO: Handle undef as zero
+static SDNode *packConstantV2I16(const SDNode *N, SelectionDAG &DAG,
+ bool Negate = false) {
+ assert(N->getOpcode() == ISD::BUILD_VECTOR && N->getNumOperands() == 2);
+ uint32_t LHSVal, RHSVal;
+ if (getConstantValue(N->getOperand(0), LHSVal) &&
+ getConstantValue(N->getOperand(1), RHSVal)) {
+ SDLoc SL(N);
+ uint32_t K = Negate ?
+ (-LHSVal & 0xffff) | (-RHSVal << 16) :
+ (LHSVal & 0xffff) | (RHSVal << 16);
+ return DAG.getMachineNode(AMDGPU::S_MOV_B32, SL, N->getValueType(0),
+ DAG.getTargetConstant(K, SL, MVT::i32));
+ }
+
+ return nullptr;
+}
+
+static SDNode *packNegConstantV2I16(const SDNode *N, SelectionDAG &DAG) {
+ return packConstantV2I16(N, DAG, true);
+}
+
/// AMDGPU specific code to select AMDGPU machine instructions for
/// SelectionDAG operations.
class AMDGPUDAGToDAGISel : public SelectionDAGISel {
@@ -84,12 +139,18 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AMDGPUArgumentUsageInfo>();
- AU.addRequired<AMDGPUPerfHintAnalysis>();
AU.addRequired<LegacyDivergenceAnalysis>();
+#ifdef EXPENSIVE_CHECKS
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+#endif
SelectionDAGISel::getAnalysisUsage(AU);
}
+ bool matchLoadD16FromBuildVector(SDNode *N) const;
+
bool runOnMachineFunction(MachineFunction &MF) override;
+ void PreprocessISelDAG() override;
void Select(SDNode *N) override;
StringRef getPassName() const override;
void PostprocessISelDAG() override;
@@ -100,19 +161,24 @@ protected:
private:
std::pair<SDValue, SDValue> foldFrameIndex(SDValue N) const;
bool isNoNanSrc(SDValue N) const;
- bool isInlineImmediate(const SDNode *N) const;
+ bool isInlineImmediate(const SDNode *N, bool Negated = false) const;
+ bool isNegInlineImmediate(const SDNode *N) const {
+ return isInlineImmediate(N, true);
+ }
+
bool isVGPRImm(const SDNode *N) const;
bool isUniformLoad(const SDNode *N) const;
bool isUniformBr(const SDNode *N) const;
MachineSDNode *buildSMovImm64(SDLoc &DL, uint64_t Val, EVT VT) const;
- SDNode *glueCopyToM0(SDNode *N) const;
+ SDNode *glueCopyToM0LDSInit(SDNode *N) const;
+ SDNode *glueCopyToM0(SDNode *N, SDValue Val) const;
const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const;
virtual bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset);
virtual bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset);
- bool isDSOffsetLegal(const SDValue &Base, unsigned Offset,
+ bool isDSOffsetLegal(SDValue Base, unsigned Offset,
unsigned OffsetBits) const;
bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const;
bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
@@ -120,10 +186,10 @@ private:
bool SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
SDValue &SOffset, SDValue &Offset, SDValue &Offen,
SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC,
- SDValue &TFE) const;
+ SDValue &TFE, SDValue &DLC) const;
bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
SDValue &SOffset, SDValue &Offset, SDValue &GLC,
- SDValue &SLC, SDValue &TFE) const;
+ SDValue &SLC, SDValue &TFE, SDValue &DLC) const;
bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
SDValue &VAddr, SDValue &SOffset, SDValue &Offset,
SDValue &SLC) const;
@@ -136,19 +202,19 @@ private:
bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset,
SDValue &Offset, SDValue &GLC, SDValue &SLC,
- SDValue &TFE) const;
+ SDValue &TFE, SDValue &DLC) const;
bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
SDValue &Offset, SDValue &SLC) const;
bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
SDValue &Offset) const;
- bool SelectFlatAtomic(SDValue Addr, SDValue &VAddr,
+ bool SelectFlatAtomic(SDNode *N, SDValue Addr, SDValue &VAddr,
SDValue &Offset, SDValue &SLC) const;
- bool SelectFlatAtomicSigned(SDValue Addr, SDValue &VAddr,
+ bool SelectFlatAtomicSigned(SDNode *N, SDValue Addr, SDValue &VAddr,
SDValue &Offset, SDValue &SLC) const;
template <bool IsSigned>
- bool SelectFlatOffset(SDValue Addr, SDValue &VAddr,
+ bool SelectFlatOffset(SDNode *N, SDValue Addr, SDValue &VAddr,
SDValue &Offset, SDValue &SLC) const;
bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset,
@@ -164,6 +230,7 @@ private:
bool SelectMOVRELOffset(SDValue Index, SDValue &Base, SDValue &Offset) const;
bool SelectVOP3Mods_NNaN(SDValue In, SDValue &Src, SDValue &SrcMods) const;
+ bool SelectVOP3Mods_f32(SDValue In, SDValue &Src, SDValue &SrcMods) const;
bool SelectVOP3ModsImpl(SDValue In, SDValue &Src, unsigned &SrcMods) const;
bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
bool SelectVOP3NoMods(SDValue In, SDValue &Src) const;
@@ -193,11 +260,13 @@ private:
bool SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src, unsigned &Mods) const;
bool SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
- bool SelectHi16Elt(SDValue In, SDValue &Src) const;
+ SDValue getHi16Elt(SDValue In) const;
void SelectADD_SUB_I64(SDNode *N);
+ void SelectAddcSubb(SDNode *N);
void SelectUADDO_USUBO(SDNode *N);
void SelectDIV_SCALE(SDNode *N);
+ void SelectDIV_FMAS(SDNode *N);
void SelectMAD_64_32(SDNode *N);
void SelectFMA_W_CHAIN(SDNode *N);
void SelectFMUL_W_CHAIN(SDNode *N);
@@ -210,6 +279,10 @@ private:
void SelectBRCOND(SDNode *N);
void SelectFMAD_FMA(SDNode *N);
void SelectATOMIC_CMP_SWAP(SDNode *N);
+ void SelectDSAppendConsume(SDNode *N, unsigned IntrID);
+ void SelectDS_GWS(SDNode *N, unsigned IntrID);
+ void SelectINTRINSIC_W_CHAIN(SDNode *N);
+ void SelectINTRINSIC_VOID(SDNode *N);
protected:
// Include the pieces autogenerated from the target description.
@@ -235,11 +308,49 @@ public:
SDValue &Offset) override;
bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void PreprocessISelDAG() override {}
+
protected:
// Include the pieces autogenerated from the target description.
#include "R600GenDAGISel.inc"
};
+static SDValue stripBitcast(SDValue Val) {
+ return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
+}
+
+// Figure out if this is really an extract of the high 16-bits of a dword.
+static bool isExtractHiElt(SDValue In, SDValue &Out) {
+ In = stripBitcast(In);
+ if (In.getOpcode() != ISD::TRUNCATE)
+ return false;
+
+ SDValue Srl = In.getOperand(0);
+ if (Srl.getOpcode() == ISD::SRL) {
+ if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
+ if (ShiftAmt->getZExtValue() == 16) {
+ Out = stripBitcast(Srl.getOperand(0));
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+// Look through operations that obscure just looking at the low 16-bits of the
+// same register.
+static SDValue stripExtractLoElt(SDValue In) {
+ if (In.getOpcode() == ISD::TRUNCATE) {
+ SDValue Src = In.getOperand(0);
+ if (Src.getValueType().getSizeInBits() == 32)
+ return stripBitcast(Src);
+ }
+
+ return In;
+}
+
} // end anonymous namespace
INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISel, "amdgpu-isel",
@@ -247,6 +358,10 @@ INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISel, "amdgpu-isel",
INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfo)
INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysis)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
+#ifdef EXPENSIVE_CHECKS
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+#endif
INITIALIZE_PASS_END(AMDGPUDAGToDAGISel, "amdgpu-isel",
"AMDGPU DAG->DAG Pattern Instruction Selection", false, false)
@@ -265,10 +380,125 @@ FunctionPass *llvm::createR600ISelDag(TargetMachine *TM,
}
bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
+#ifdef EXPENSIVE_CHECKS
+  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ for (auto &L : LI->getLoopsInPreorder()) {
+ assert(L->isLCSSAForm(DT));
+ }
+#endif
Subtarget = &MF.getSubtarget<GCNSubtarget>();
return SelectionDAGISel::runOnMachineFunction(MF);
}
+bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
+ assert(Subtarget->d16PreservesUnusedBits());
+ MVT VT = N->getValueType(0).getSimpleVT();
+ if (VT != MVT::v2i16 && VT != MVT::v2f16)
+ return false;
+
+ SDValue Lo = N->getOperand(0);
+ SDValue Hi = N->getOperand(1);
+
+ LoadSDNode *LdHi = dyn_cast<LoadSDNode>(stripBitcast(Hi));
+
+ // build_vector lo, (load ptr) -> load_d16_hi ptr, lo
+ // build_vector lo, (zextload ptr from i8) -> load_d16_hi_u8 ptr, lo
+ // build_vector lo, (sextload ptr from i8) -> load_d16_hi_i8 ptr, lo
+
+ // Need to check for possible indirect dependencies on the other half of the
+ // vector to avoid introducing a cycle.
+ if (LdHi && Hi.hasOneUse() && !LdHi->isPredecessorOf(Lo.getNode())) {
+ SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
+
+ SDValue TiedIn = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Lo);
+ SDValue Ops[] = {
+ LdHi->getChain(), LdHi->getBasePtr(), TiedIn
+ };
+
+ unsigned LoadOp = AMDGPUISD::LOAD_D16_HI;
+ if (LdHi->getMemoryVT() == MVT::i8) {
+ LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD ?
+ AMDGPUISD::LOAD_D16_HI_I8 : AMDGPUISD::LOAD_D16_HI_U8;
+ } else {
+ assert(LdHi->getMemoryVT() == MVT::i16);
+ }
+
+ SDValue NewLoadHi =
+ CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdHi), VTList,
+ Ops, LdHi->getMemoryVT(),
+ LdHi->getMemOperand());
+
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadHi);
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdHi, 1), NewLoadHi.getValue(1));
+ return true;
+ }
+
+ // build_vector (load ptr), hi -> load_d16_lo ptr, hi
+ // build_vector (zextload ptr from i8), hi -> load_d16_lo_u8 ptr, hi
+ // build_vector (sextload ptr from i8), hi -> load_d16_lo_i8 ptr, hi
+ LoadSDNode *LdLo = dyn_cast<LoadSDNode>(stripBitcast(Lo));
+ if (LdLo && Lo.hasOneUse()) {
+ SDValue TiedIn = getHi16Elt(Hi);
+ if (!TiedIn || LdLo->isPredecessorOf(TiedIn.getNode()))
+ return false;
+
+ SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
+ unsigned LoadOp = AMDGPUISD::LOAD_D16_LO;
+ if (LdLo->getMemoryVT() == MVT::i8) {
+ LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD ?
+ AMDGPUISD::LOAD_D16_LO_I8 : AMDGPUISD::LOAD_D16_LO_U8;
+ } else {
+ assert(LdLo->getMemoryVT() == MVT::i16);
+ }
+
+ TiedIn = CurDAG->getNode(ISD::BITCAST, SDLoc(N), VT, TiedIn);
+
+ SDValue Ops[] = {
+ LdLo->getChain(), LdLo->getBasePtr(), TiedIn
+ };
+
+ SDValue NewLoadLo =
+ CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdLo), VTList,
+ Ops, LdLo->getMemoryVT(),
+ LdLo->getMemOperand());
+
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadLo);
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdLo, 1), NewLoadLo.getValue(1));
+ return true;
+ }
+
+ return false;
+}
+
+void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
+ if (!Subtarget->d16PreservesUnusedBits())
+ return;
+
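+  // Walk the DAG from the last node backwards and try to fold each 16-bit
+  // BUILD_VECTOR into a d16 hi/lo load.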
+ SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
+
+ bool MadeChange = false;
+ while (Position != CurDAG->allnodes_begin()) {
+ SDNode *N = &*--Position;
+ if (N->use_empty())
+ continue;
+
+ switch (N->getOpcode()) {
+ case ISD::BUILD_VECTOR:
+ MadeChange |= matchLoadD16FromBuildVector(N);
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (MadeChange) {
+ CurDAG->RemoveDeadNodes();
+ LLVM_DEBUG(dbgs() << "After PreProcess:\n";
+ CurDAG->dump(););
+ }
+}
+
bool AMDGPUDAGToDAGISel::isNoNanSrc(SDValue N) const {
if (TM.Options.NoNaNsFPMath)
return true;
@@ -280,14 +510,26 @@ bool AMDGPUDAGToDAGISel::isNoNanSrc(SDValue N) const {
return CurDAG->isKnownNeverNaN(N);
}
-bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const {
+bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N,
+ bool Negated) const {
+ if (N->isUndef())
+ return true;
+
const SIInstrInfo *TII = Subtarget->getInstrInfo();
+ if (Negated) {
+ if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
+ return TII->isInlineConstant(-C->getAPIntValue());
+
+ if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
+ return TII->isInlineConstant(-C->getValueAPF().bitcastToAPInt());
- if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
- return TII->isInlineConstant(C->getAPIntValue());
+ } else {
+ if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
+ return TII->isInlineConstant(C->getAPIntValue());
- if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
- return TII->isInlineConstant(C->getValueAPF().bitcastToAPInt());
+ if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
+ return TII->isInlineConstant(C->getValueAPF().bitcastToAPInt());
+ }
return false;
}
@@ -340,37 +582,48 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
}
}
-SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N) const {
- if (cast<MemSDNode>(N)->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS ||
- !Subtarget->ldsRequiresM0Init())
- return N;
-
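+// Copy Val into M0 and glue the copy onto N's chain so the selected
+// instruction reads the intended M0 value.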
+SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
const SITargetLowering& Lowering =
- *static_cast<const SITargetLowering*>(getTargetLowering());
+ *static_cast<const SITargetLowering*>(getTargetLowering());
- // Write max value to m0 before each load operation
+ assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain");
- SDValue M0 = Lowering.copyToM0(*CurDAG, CurDAG->getEntryNode(), SDLoc(N),
- CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32));
+ SDValue M0 = Lowering.copyToM0(*CurDAG, N->getOperand(0), SDLoc(N),
+ Val);
SDValue Glue = M0.getValue(1);
SmallVector <SDValue, 8> Ops;
- for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
- Ops.push_back(N->getOperand(i));
- }
+ Ops.push_back(M0); // Replace the chain.
+ for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
+ Ops.push_back(N->getOperand(i));
+
Ops.push_back(Glue);
return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
}
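+// Write the maximum value (-1) to M0 for LDS accesses, or the GDS size for
+// region accesses, when the subtarget requires M0 to be initialized.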
+SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const {
+ unsigned AS = cast<MemSDNode>(N)->getAddressSpace();
+ if (AS == AMDGPUAS::LOCAL_ADDRESS) {
+ if (Subtarget->ldsRequiresM0Init())
+ return glueCopyToM0(N, CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32));
+ } else if (AS == AMDGPUAS::REGION_ADDRESS) {
+ MachineFunction &MF = CurDAG->getMachineFunction();
+ unsigned Value = MF.getInfo<SIMachineFunctionInfo>()->getGDSSize();
+ return
+ glueCopyToM0(N, CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i32));
+ }
+ return N;
+}
+
MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
EVT VT) const {
SDNode *Lo = CurDAG->getMachineNode(
AMDGPU::S_MOV_B32, DL, MVT::i32,
- CurDAG->getConstant(Imm & 0xFFFFFFFF, DL, MVT::i32));
+ CurDAG->getTargetConstant(Imm & 0xFFFFFFFF, DL, MVT::i32));
SDNode *Hi =
CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
- CurDAG->getConstant(Imm >> 32, DL, MVT::i32));
+ CurDAG->getTargetConstant(Imm >> 32, DL, MVT::i32));
const SDValue Ops[] = {
CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
@@ -385,31 +638,23 @@ static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) {
return AMDGPU::SReg_32_XM0RegClassID;
case 2:
return AMDGPU::SReg_64RegClassID;
+ case 3:
+ return AMDGPU::SGPR_96RegClassID;
case 4:
return AMDGPU::SReg_128RegClassID;
+ case 5:
+ return AMDGPU::SGPR_160RegClassID;
case 8:
return AMDGPU::SReg_256RegClassID;
case 16:
return AMDGPU::SReg_512RegClassID;
+ case 32:
+ return AMDGPU::SReg_1024RegClassID;
}
llvm_unreachable("invalid vector size");
}
-static bool getConstantValue(SDValue N, uint32_t &Out) {
- if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) {
- Out = C->getAPIntValue().getZExtValue();
- return true;
- }
-
- if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N)) {
- Out = C->getValueAPF().bitcastToAPInt().getZExtValue();
- return true;
- }
-
- return false;
-}
-
void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
EVT VT = N->getValueType(0);
unsigned NumVectorElts = VT.getVectorNumElements();
@@ -423,12 +668,12 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
return;
}
- assert(NumVectorElts <= 16 && "Vectors with more than 16 elements not "
+ assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not "
"supported yet");
- // 16 = Max Num Vector Elements
+ // 32 = Max Num Vector Elements
// 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
// 1 = Vector Register Class
- SmallVector<SDValue, 16 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);
+ SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);
RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
bool IsRegSeq = true;
@@ -470,10 +715,10 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
if (isa<AtomicSDNode>(N) ||
(Opc == AMDGPUISD::ATOMIC_INC || Opc == AMDGPUISD::ATOMIC_DEC ||
- Opc == AMDGPUISD::ATOMIC_LOAD_FADD ||
+ Opc == ISD::ATOMIC_LOAD_FADD ||
Opc == AMDGPUISD::ATOMIC_LOAD_FMIN ||
Opc == AMDGPUISD::ATOMIC_LOAD_FMAX))
- N = glueCopyToM0(N);
+ N = glueCopyToM0LDSInit(N);
switch (Opc) {
default:
@@ -491,6 +736,13 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
SelectADD_SUB_I64(N);
return;
}
+ case ISD::ADDCARRY:
+ case ISD::SUBCARRY:
+ if (N->getValueType(0) != MVT::i32)
+ break;
+
+ SelectAddcSubb(N);
+ return;
case ISD::UADDO:
case ISD::USUBO: {
SelectUADDO_USUBO(N);
@@ -511,12 +763,8 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
unsigned NumVectorElts = VT.getVectorNumElements();
if (VT.getScalarSizeInBits() == 16) {
if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) {
- uint32_t LHSVal, RHSVal;
- if (getConstantValue(N->getOperand(0), LHSVal) &&
- getConstantValue(N->getOperand(1), RHSVal)) {
- uint32_t K = LHSVal | (RHSVal << 16);
- CurDAG->SelectNodeTo(N, AMDGPU::S_MOV_B32, VT,
- CurDAG->getTargetConstant(K, SDLoc(N), MVT::i32));
+ if (SDNode *Packed = packConstantV2I16(N, *CurDAG)) {
+ ReplaceNode(N, Packed);
return;
}
}
@@ -571,7 +819,7 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
case ISD::STORE:
case ISD::ATOMIC_LOAD:
case ISD::ATOMIC_STORE: {
- N = glueCopyToM0(N);
+ N = glueCopyToM0LDSInit(N);
break;
}
@@ -606,6 +854,10 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
SelectDIV_SCALE(N);
return;
}
+ case AMDGPUISD::DIV_FMAS: {
+ SelectDIV_FMAS(N);
+ return;
+ }
case AMDGPUISD::MAD_I64_I32:
case AMDGPUISD::MAD_U64_U32: {
SelectMAD_64_32(N);
@@ -649,6 +901,16 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
SelectCode(N);
return;
}
+
+ break;
+ }
+ case ISD::INTRINSIC_W_CHAIN: {
+ SelectINTRINSIC_W_CHAIN(N);
+ return;
+ }
+ case ISD::INTRINSIC_VOID: {
+ SelectINTRINSIC_VOID(N);
+ return;
}
}
@@ -763,6 +1025,19 @@ void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
ReplaceNode(N, RegSequence);
}
+void AMDGPUDAGToDAGISel::SelectAddcSubb(SDNode *N) {
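+  // i32 ADDCARRY/SUBCARRY map directly onto the VALU carry-in add/sub with
+  // the clamp bit cleared.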
+ SDLoc DL(N);
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ SDValue CI = N->getOperand(2);
+
+ unsigned Opc = N->getOpcode() == ISD::ADDCARRY ? AMDGPU::V_ADDC_U32_e64
+ : AMDGPU::V_SUBB_U32_e64;
+ CurDAG->SelectNodeTo(
+ N, Opc, N->getVTList(),
+ {LHS, RHS, CI, CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
+}
+
void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
// The name of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned
// carry out despite the _i32 name. These were renamed in VI to _U32.
@@ -770,8 +1045,10 @@ void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
unsigned Opc = N->getOpcode() == ISD::UADDO ?
AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64;
- CurDAG->SelectNodeTo(N, Opc, N->getVTList(),
- { N->getOperand(0), N->getOperand(1) });
+ CurDAG->SelectNodeTo(
+ N, Opc, N->getVTList(),
+ {N->getOperand(0), N->getOperand(1),
+ CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
}
void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
@@ -816,6 +1093,35 @@ void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}
+void AMDGPUDAGToDAGISel::SelectDIV_FMAS(SDNode *N) {
+ const GCNSubtarget *ST = static_cast<const GCNSubtarget *>(Subtarget);
+ const SIRegisterInfo *TRI = ST->getRegisterInfo();
+
+ SDLoc SL(N);
+ EVT VT = N->getValueType(0);
+
+ assert(VT == MVT::f32 || VT == MVT::f64);
+
+ unsigned Opc
+ = (VT == MVT::f64) ? AMDGPU::V_DIV_FMAS_F64 : AMDGPU::V_DIV_FMAS_F32;
+
+ SDValue CarryIn = N->getOperand(3);
+ // V_DIV_FMAS implicitly reads VCC.
+ SDValue VCC = CurDAG->getCopyToReg(CurDAG->getEntryNode(), SL,
+ TRI->getVCC(), CarryIn, SDValue());
+
+ SDValue Ops[10];
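+  // Ops layout: {src0_mods, src0, src1_mods, src1, src2_mods, src2, clamp,
+  //              omod, VCC copy, glue}.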
+
+ SelectVOP3Mods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
+ SelectVOP3Mods(N->getOperand(1), Ops[3], Ops[2]);
+ SelectVOP3Mods(N->getOperand(2), Ops[5], Ops[4]);
+
+ Ops[8] = VCC;
+ Ops[9] = VCC.getValue(1);
+
+ CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
+}
+
// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
@@ -829,13 +1135,13 @@ void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}
-bool AMDGPUDAGToDAGISel::isDSOffsetLegal(const SDValue &Base, unsigned Offset,
+bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset,
unsigned OffsetBits) const {
if ((OffsetBits == 16 && !isUInt<16>(Offset)) ||
(OffsetBits == 8 && !isUInt<8>(Offset)))
return false;
- if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS ||
+ if (Subtarget->hasUsableDSOffset() ||
Subtarget->unsafeDSOffsetFoldingEnabled())
return true;
@@ -871,13 +1177,20 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
Zero, Addr.getOperand(1));
if (isDSOffsetLegal(Sub, ByteOffset, 16)) {
+ SmallVector<SDValue, 3> Opnds;
+ Opnds.push_back(Zero);
+ Opnds.push_back(Addr.getOperand(1));
+
// FIXME: Select to VOP3 version for with-carry.
- unsigned SubOp = Subtarget->hasAddNoCarry() ?
- AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_I32_e32;
+ unsigned SubOp = AMDGPU::V_SUB_I32_e32;
+ if (Subtarget->hasAddNoCarry()) {
+ SubOp = AMDGPU::V_SUB_U32_e64;
+ Opnds.push_back(
+ CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
+ }
- MachineSDNode *MachineSub
- = CurDAG->getMachineNode(SubOp, DL, MVT::i32,
- Zero, Addr.getOperand(1));
+ MachineSDNode *MachineSub =
+ CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds);
Base = SDValue(MachineSub, 0);
Offset = CurDAG->getTargetConstant(ByteOffset, DL, MVT::i16);
@@ -945,12 +1258,18 @@ bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
Zero, Addr.getOperand(1));
if (isDSOffsetLegal(Sub, DWordOffset1, 8)) {
- unsigned SubOp = Subtarget->hasAddNoCarry() ?
- AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_I32_e32;
+ SmallVector<SDValue, 3> Opnds;
+ Opnds.push_back(Zero);
+ Opnds.push_back(Addr.getOperand(1));
+ unsigned SubOp = AMDGPU::V_SUB_I32_e32;
+ if (Subtarget->hasAddNoCarry()) {
+ SubOp = AMDGPU::V_SUB_U32_e64;
+ Opnds.push_back(
+ CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
+ }
MachineSDNode *MachineSub
- = CurDAG->getMachineNode(SubOp, DL, MVT::i32,
- Zero, Addr.getOperand(1));
+ = CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds);
Base = SDValue(MachineSub, 0);
Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8);
@@ -989,7 +1308,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
SDValue &Offset, SDValue &Offen,
SDValue &Idxen, SDValue &Addr64,
SDValue &GLC, SDValue &SLC,
- SDValue &TFE) const {
+ SDValue &TFE, SDValue &DLC) const {
// Subtarget prefers to use flat instruction
if (Subtarget->useFlatForGlobal())
return false;
@@ -1001,6 +1320,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
if (!SLC.getNode())
SLC = CurDAG->getTargetConstant(0, DL, MVT::i1);
TFE = CurDAG->getTargetConstant(0, DL, MVT::i1);
+ DLC = CurDAG->getTargetConstant(0, DL, MVT::i1);
Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1);
Offen = CurDAG->getTargetConstant(0, DL, MVT::i1);
@@ -1079,15 +1399,16 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
SDValue &VAddr, SDValue &SOffset,
SDValue &Offset, SDValue &GLC,
- SDValue &SLC, SDValue &TFE) const {
+ SDValue &SLC, SDValue &TFE,
+ SDValue &DLC) const {
SDValue Ptr, Offen, Idxen, Addr64;
// addr64 bit was removed for volcanic islands.
- if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+ if (!Subtarget->hasAddr64())
return false;
if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
- GLC, SLC, TFE))
+ GLC, SLC, TFE, DLC))
return false;
ConstantSDNode *C = cast<ConstantSDNode>(Addr64);
@@ -1109,9 +1430,9 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
SDValue &Offset,
SDValue &SLC) const {
SLC = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i1);
- SDValue GLC, TFE;
+ SDValue GLC, TFE, DLC;
- return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE);
+ return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE, DLC);
}
static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) {
@@ -1127,10 +1448,10 @@ std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const
SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(),
FI->getValueType(0));
- // If we can resolve this to a frame index access, this is relative to the
- // frame pointer SGPR.
- return std::make_pair(TFI, CurDAG->getRegister(Info->getFrameOffsetReg(),
- MVT::i32));
+ // If we can resolve this to a frame index access, this will be relative to
+ // either the stack or frame pointer SGPR.
+ return std::make_pair(
+ TFI, CurDAG->getRegister(Info->getStackPtrOffsetReg(), MVT::i32));
}
// If we don't know this private access is a local stack object, it needs to
@@ -1236,13 +1557,13 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
SDValue &SOffset, SDValue &Offset,
SDValue &GLC, SDValue &SLC,
- SDValue &TFE) const {
+ SDValue &TFE, SDValue &DLC) const {
SDValue Ptr, VAddr, Offen, Idxen, Addr64;
const SIInstrInfo *TII =
static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
- GLC, SLC, TFE))
+ GLC, SLC, TFE, DLC))
return false;
if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
@@ -1264,57 +1585,42 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
SDValue &Soffset, SDValue &Offset
) const {
- SDValue GLC, SLC, TFE;
+ SDValue GLC, SLC, TFE, DLC;
- return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE);
+ return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC);
}
bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
SDValue &Soffset, SDValue &Offset,
SDValue &SLC) const {
- SDValue GLC, TFE;
+ SDValue GLC, TFE, DLC;
- return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE);
+ return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC);
}
template <bool IsSigned>
-bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDValue Addr,
+bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N,
+ SDValue Addr,
SDValue &VAddr,
SDValue &Offset,
SDValue &SLC) const {
- int64_t OffsetVal = 0;
-
- if (Subtarget->hasFlatInstOffsets() &&
- CurDAG->isBaseWithConstantOffset(Addr)) {
- SDValue N0 = Addr.getOperand(0);
- SDValue N1 = Addr.getOperand(1);
- int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
-
- if ((IsSigned && isInt<13>(COffsetVal)) ||
- (!IsSigned && isUInt<12>(COffsetVal))) {
- Addr = N0;
- OffsetVal = COffsetVal;
- }
- }
-
- VAddr = Addr;
- Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i16);
- SLC = CurDAG->getTargetConstant(0, SDLoc(), MVT::i1);
-
- return true;
+ return static_cast<const SITargetLowering*>(getTargetLowering())->
+ SelectFlatOffset(IsSigned, *CurDAG, N, Addr, VAddr, Offset, SLC);
}
-bool AMDGPUDAGToDAGISel::SelectFlatAtomic(SDValue Addr,
+bool AMDGPUDAGToDAGISel::SelectFlatAtomic(SDNode *N,
+ SDValue Addr,
SDValue &VAddr,
SDValue &Offset,
SDValue &SLC) const {
- return SelectFlatOffset<false>(Addr, VAddr, Offset, SLC);
+ return SelectFlatOffset<false>(N, Addr, VAddr, Offset, SLC);
}
-bool AMDGPUDAGToDAGISel::SelectFlatAtomicSigned(SDValue Addr,
+bool AMDGPUDAGToDAGISel::SelectFlatAtomicSigned(SDNode *N,
+ SDValue Addr,
SDValue &VAddr,
SDValue &Offset,
SDValue &SLC) const {
- return SelectFlatOffset<true>(Addr, VAddr, Offset, SLC);
+ return SelectFlatOffset<true>(N, Addr, VAddr, Offset, SLC);
}
bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
@@ -1619,9 +1925,12 @@ void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
return;
}
+ const GCNSubtarget *ST = static_cast<const GCNSubtarget *>(Subtarget);
+ const SIRegisterInfo *TRI = ST->getRegisterInfo();
+
bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N);
unsigned BrOp = UseSCCBr ? AMDGPU::S_CBRANCH_SCC1 : AMDGPU::S_CBRANCH_VCCNZ;
- unsigned CondReg = UseSCCBr ? AMDGPU::SCC : AMDGPU::VCC;
+ unsigned CondReg = UseSCCBr ? (unsigned)AMDGPU::SCC : TRI->getVCC();
SDLoc SL(N);
if (!UseSCCBr) {
@@ -1638,9 +1947,13 @@ void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
// the S_AND when is unnecessary. But it would be better to add a separate
// pass after SIFixSGPRCopies to do the unnecessary S_AND removal, so it
// catches both cases.
- Cond = SDValue(CurDAG->getMachineNode(AMDGPU::S_AND_B64, SL, MVT::i1,
- CurDAG->getRegister(AMDGPU::EXEC, MVT::i1),
- Cond),
+ Cond = SDValue(CurDAG->getMachineNode(ST->isWave32() ? AMDGPU::S_AND_B32
+ : AMDGPU::S_AND_B64,
+ SL, MVT::i1,
+ CurDAG->getRegister(ST->isWave32() ? AMDGPU::EXEC_LO
+ : AMDGPU::EXEC,
+ MVT::i1),
+ Cond),
0);
}
@@ -1761,6 +2074,183 @@ void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) {
CurDAG->RemoveDeadNode(N);
}
+void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
+ // The address is assumed to be uniform, so if it ends up in a VGPR, it will
+ // be copied to an SGPR with readfirstlane.
+ unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ?
+ AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
+
+ SDValue Chain = N->getOperand(0);
+ SDValue Ptr = N->getOperand(2);
+ MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
+ MachineMemOperand *MMO = M->getMemOperand();
+ bool IsGDS = M->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
+
+ SDValue Offset;
+ if (CurDAG->isBaseWithConstantOffset(Ptr)) {
+ SDValue PtrBase = Ptr.getOperand(0);
+ SDValue PtrOffset = Ptr.getOperand(1);
+
+ const APInt &OffsetVal = cast<ConstantSDNode>(PtrOffset)->getAPIntValue();
+ if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue(), 16)) {
+ N = glueCopyToM0(N, PtrBase);
+ Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
+ }
+ }
+
+ if (!Offset) {
+ N = glueCopyToM0(N, Ptr);
+ Offset = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
+ }
+
+ SDValue Ops[] = {
+ Offset,
+ CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32),
+ Chain,
+ N->getOperand(N->getNumOperands() - 1) // New glue
+ };
+
+ SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
+ CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
+}
+
+static unsigned gwsIntrinToOpcode(unsigned IntrID) {
+ switch (IntrID) {
+ case Intrinsic::amdgcn_ds_gws_init:
+ return AMDGPU::DS_GWS_INIT;
+ case Intrinsic::amdgcn_ds_gws_barrier:
+ return AMDGPU::DS_GWS_BARRIER;
+ case Intrinsic::amdgcn_ds_gws_sema_v:
+ return AMDGPU::DS_GWS_SEMA_V;
+ case Intrinsic::amdgcn_ds_gws_sema_br:
+ return AMDGPU::DS_GWS_SEMA_BR;
+ case Intrinsic::amdgcn_ds_gws_sema_p:
+ return AMDGPU::DS_GWS_SEMA_P;
+ case Intrinsic::amdgcn_ds_gws_sema_release_all:
+ return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
+ default:
+ llvm_unreachable("not a gws intrinsic");
+ }
+}
+
+void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
+ if (IntrID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
+ !Subtarget->hasGWSSemaReleaseAll()) {
+ // Let this error.
+ SelectCode(N);
+ return;
+ }
+
+ // Chain, intrinsic ID, vsrc, offset
+ const bool HasVSrc = N->getNumOperands() == 4;
+ assert(HasVSrc || N->getNumOperands() == 3);
+
+ SDLoc SL(N);
+ SDValue BaseOffset = N->getOperand(HasVSrc ? 3 : 2);
+ int ImmOffset = 0;
+ MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
+ MachineMemOperand *MMO = M->getMemOperand();
+
+ // Don't worry if the offset ends up in a VGPR. Only one lane will have
+ // effect, so SIFixSGPRCopies will validly insert readfirstlane.
+
+ // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
+ // offset field) % 64. Some versions of the programming guide omit the m0
+ // part, or claim it's from offset 0.
+ if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(BaseOffset)) {
+ // If we have a constant offset, try to use the default value for m0 as a
+ // base to possibly avoid setting it up.
+ glueCopyToM0(N, CurDAG->getTargetConstant(-1, SL, MVT::i32));
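+    // With M0 = -1, M0[21:16] contributes 63, so adding 1 to the requested
+    // offset makes the (base + M0[21:16] + offset) sum wrap back to it mod 64.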
+ ImmOffset = ConstOffset->getZExtValue() + 1;
+ } else {
+ if (CurDAG->isBaseWithConstantOffset(BaseOffset)) {
+ ImmOffset = BaseOffset.getConstantOperandVal(1);
+ BaseOffset = BaseOffset.getOperand(0);
+ }
+
+ // Prefer to do the shift in an SGPR since it should be possible to use m0
+ // as the result directly. If it's already an SGPR, it will be eliminated
+ // later.
+ SDNode *SGPROffset
+ = CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL, MVT::i32,
+ BaseOffset);
+ // Shift to offset in m0
+ SDNode *M0Base
+ = CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
+ SDValue(SGPROffset, 0),
+ CurDAG->getTargetConstant(16, SL, MVT::i32));
+ glueCopyToM0(N, SDValue(M0Base, 0));
+ }
+
+ SDValue V0;
+ SDValue Chain = N->getOperand(0);
+ SDValue Glue;
+ if (HasVSrc) {
+ SDValue VSrc0 = N->getOperand(2);
+
+ // The manual doesn't mention this, but it seems only v0 works.
+ V0 = CurDAG->getRegister(AMDGPU::VGPR0, MVT::i32);
+
+ SDValue CopyToV0 = CurDAG->getCopyToReg(
+ N->getOperand(0), SL, V0, VSrc0,
+ N->getOperand(N->getNumOperands() - 1));
+ Chain = CopyToV0;
+ Glue = CopyToV0.getValue(1);
+ }
+
+ SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32);
+
+ // TODO: Can this just be removed from the instruction?
+ SDValue GDS = CurDAG->getTargetConstant(1, SL, MVT::i1);
+
+ const unsigned Opc = gwsIntrinToOpcode(IntrID);
+ SmallVector<SDValue, 5> Ops;
+ if (HasVSrc)
+ Ops.push_back(V0);
+ Ops.push_back(OffsetField);
+ Ops.push_back(GDS);
+ Ops.push_back(Chain);
+
+ if (HasVSrc)
+ Ops.push_back(Glue);
+
+ SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
+ CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
+}
+
+void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
+ unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ switch (IntrID) {
+ case Intrinsic::amdgcn_ds_append:
+ case Intrinsic::amdgcn_ds_consume: {
+ if (N->getValueType(0) != MVT::i32)
+ break;
+ SelectDSAppendConsume(N, IntrID);
+ return;
+ }
+ }
+
+ SelectCode(N);
+}
+
+void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
+ unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ switch (IntrID) {
+ case Intrinsic::amdgcn_ds_gws_init:
+ case Intrinsic::amdgcn_ds_gws_barrier:
+ case Intrinsic::amdgcn_ds_gws_sema_v:
+ case Intrinsic::amdgcn_ds_gws_sema_br:
+ case Intrinsic::amdgcn_ds_gws_sema_p:
+ case Intrinsic::amdgcn_ds_gws_sema_release_all:
+ SelectDS_GWS(N, IntrID);
+ return;
+ default:
+ break;
+ }
+
+ SelectCode(N);
+}
+
bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
unsigned &Mods) const {
Mods = 0;
@@ -1796,6 +2286,15 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods_NNaN(SDValue In, SDValue &Src,
return isNoNanSrc(Src);
}
+bool AMDGPUDAGToDAGISel::SelectVOP3Mods_f32(SDValue In, SDValue &Src,
+ SDValue &SrcMods) const {
+ if (In.getValueType() == MVT::f32)
+ return SelectVOP3Mods(In, Src, SrcMods);
+ Src = In;
+  SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
+ return true;
+}
+
bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const {
if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG)
return false;
@@ -1833,41 +2332,6 @@ bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
return true;
}
-static SDValue stripBitcast(SDValue Val) {
- return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
-}
-
-// Figure out if this is really an extract of the high 16-bits of a dword.
-static bool isExtractHiElt(SDValue In, SDValue &Out) {
- In = stripBitcast(In);
- if (In.getOpcode() != ISD::TRUNCATE)
- return false;
-
- SDValue Srl = In.getOperand(0);
- if (Srl.getOpcode() == ISD::SRL) {
- if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
- if (ShiftAmt->getZExtValue() == 16) {
- Out = stripBitcast(Srl.getOperand(0));
- return true;
- }
- }
- }
-
- return false;
-}
-
-// Look through operations that obscure just looking at the low 16-bits of the
-// same register.
-static SDValue stripExtractLoElt(SDValue In) {
- if (In.getOpcode() == ISD::TRUNCATE) {
- SDValue Src = In.getOperand(0);
- if (Src.getValueType().getSizeInBits() == 32)
- return stripBitcast(Src);
- }
-
- return In;
-}
-
bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
SDValue &SrcMods) const {
unsigned Mods = 0;
@@ -2020,39 +2484,31 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
return true;
}
-// TODO: Can we identify things like v_mad_mixhi_f16?
-bool AMDGPUDAGToDAGISel::SelectHi16Elt(SDValue In, SDValue &Src) const {
- if (In.isUndef()) {
- Src = In;
- return true;
- }
+SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
+ if (In.isUndef())
+ return CurDAG->getUNDEF(MVT::i32);
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In)) {
SDLoc SL(In);
- SDValue K = CurDAG->getTargetConstant(C->getZExtValue() << 16, SL, MVT::i32);
- MachineSDNode *MovK = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
- SL, MVT::i32, K);
- Src = SDValue(MovK, 0);
- return true;
+ return CurDAG->getConstant(C->getZExtValue() << 16, SL, MVT::i32);
}
if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(In)) {
SDLoc SL(In);
- SDValue K = CurDAG->getTargetConstant(
+ return CurDAG->getConstant(
C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32);
- MachineSDNode *MovK = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
- SL, MVT::i32, K);
- Src = SDValue(MovK, 0);
- return true;
}
- return isExtractHiElt(In, Src);
+ SDValue Src;
+ if (isExtractHiElt(In, Src))
+ return Src;
+
+ return SDValue();
}
bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
- if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) {
- return false;
- }
+ assert(CurDAG->getTarget().getTargetTriple().getArch() == Triple::amdgcn);
+
const SIRegisterInfo *SIRI =
static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
const SIInstrInfo * SII =
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 6951c915b177..39016ed37193 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -1,9 +1,8 @@
//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -21,7 +20,6 @@
#include "AMDGPU.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUFrameLowering.h"
-#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPURegisterInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
@@ -65,9 +63,9 @@ static bool allocateSGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
case MVT::v2f32:
case MVT::v4i16:
case MVT::v4f16: {
- // Up to SGPR0-SGPR39
+ // Up to SGPR0-SGPR105
return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
- &AMDGPU::SGPR_64RegClass, 20);
+ &AMDGPU::SGPR_64RegClass, 53);
}
default:
return false;
@@ -152,15 +150,24 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
+ setOperationAction(ISD::LOAD, MVT::v3f32, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32);
+
setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
+ setOperationAction(ISD::LOAD, MVT::v5f32, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);
+
setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
+ setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);
+
setOperationAction(ISD::LOAD, MVT::i64, Promote);
AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
@@ -237,15 +244,24 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::STORE, MVT::v2f32, Promote);
AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
+ setOperationAction(ISD::STORE, MVT::v3f32, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32);
+
setOperationAction(ISD::STORE, MVT::v4f32, Promote);
AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
+ setOperationAction(ISD::STORE, MVT::v5f32, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);
+
setOperationAction(ISD::STORE, MVT::v8f32, Promote);
AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
setOperationAction(ISD::STORE, MVT::v16f32, Promote);
AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
+ setOperationAction(ISD::STORE, MVT::v32f32, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);
+
setOperationAction(ISD::STORE, MVT::i64, Promote);
AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
@@ -327,16 +343,28 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
// Expand to fneg + fadd.
setOperationAction(ISD::FSUB, MVT::f64, Expand);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v3i32, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v3f32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v5i32, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v5f32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3f32, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3i32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5f32, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5i32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f32, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16i32, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32f32, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32i32, Custom);
setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom);
@@ -394,7 +422,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
static const MVT::SimpleValueType VectorIntTypes[] = {
- MVT::v2i32, MVT::v4i32
+ MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32
};
for (MVT VT : VectorIntTypes) {
@@ -436,7 +464,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
}
static const MVT::SimpleValueType FloatVectorTypes[] = {
- MVT::v2f32, MVT::v4f32
+ MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32
};
for (MVT VT : FloatVectorTypes) {
@@ -478,9 +506,15 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SELECT, MVT::v2f32, Promote);
AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
+ setOperationAction(ISD::SELECT, MVT::v3f32, Promote);
+ AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);
+
setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
+ setOperationAction(ISD::SELECT, MVT::v5f32, Promote);
+ AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);
+
// There are no libcalls of any kind.
for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I)
setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
@@ -499,6 +533,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
// vector compares until that is fixed.
setHasMultipleConditionRegisters(true);
+ setMinCmpXchgSizeInBits(32);
+ setSupportsUnalignedAtomics(false);
+
PredictableSelectIsExpensive = false;
// We want to find all load dependencies for long chains of stores to enable
@@ -592,6 +629,7 @@ static bool hasSourceMods(const SDNode *N) {
case ISD::FDIV:
case ISD::FREM:
case ISD::INLINEASM:
+ case ISD::INLINEASM_BR:
case AMDGPUISD::INTERP_P1:
case AMDGPUISD::INTERP_P2:
case AMDGPUISD::DIV_SCALE:
@@ -640,7 +678,8 @@ bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
// The backend supports 32 and 64 bit floating point immediates.
// FIXME: Why are we reporting vectors of FP immediates as legal?
-bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
+bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
+ bool ForCodeSize) const {
EVT ScalarVT = VT.getScalarType();
return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
(ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
@@ -690,8 +729,9 @@ bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
return (OldSize < 32);
}
-bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy,
- EVT CastTy) const {
+bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy,
+ const SelectionDAG &DAG,
+ const MachineMemOperand &MMO) const {
assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
@@ -701,8 +741,12 @@ bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy,
unsigned LScalarSize = LoadTy.getScalarSizeInBits();
unsigned CastScalarSize = CastTy.getScalarSizeInBits();
- return (LScalarSize < CastScalarSize) ||
- (CastScalarSize >= 32);
+ if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
+ return false;
+
+ bool Fast = false;
+ return allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), CastTy,
+ MMO, &Fast) && Fast;
}
// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
@@ -849,9 +893,6 @@ bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
bool IsVarArg) {
switch (CC) {
- case CallingConv::AMDGPU_KERNEL:
- case CallingConv::SPIR_KERNEL:
- llvm_unreachable("kernels should not be handled here");
case CallingConv::AMDGPU_VS:
case CallingConv::AMDGPU_GS:
case CallingConv::AMDGPU_PS:
@@ -864,8 +905,10 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
case CallingConv::Fast:
case CallingConv::Cold:
return CC_AMDGPU_Func;
+ case CallingConv::AMDGPU_KERNEL:
+ case CallingConv::SPIR_KERNEL:
default:
- report_fatal_error("Unsupported calling convention.");
+ report_fatal_error("Unsupported calling convention for call");
}
}
@@ -1010,9 +1053,10 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
MemVT = MemVT.getScalarType();
- if (MemVT.isExtended()) {
- // This should really only happen if we have vec3 arguments
- assert(MemVT.isVector() && MemVT.getVectorNumElements() == 3);
+ // Round up vec3/vec5 argument.
+ if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
+ assert(MemVT.getVectorNumElements() == 3 ||
+ MemVT.getVectorNumElements() == 5);
MemVT = MemVT.getPow2VectorType(State.getContext());
}
@@ -1372,6 +1416,41 @@ SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
}
+// Split a vector type into two parts. The first part is a power of two vector.
+// The second part is whatever is left over, and is a scalar if it would
+// otherwise be a 1-vector.
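+// For example, v3f32 splits into (v2f32, f32) and v5i32 into (v4i32, i32).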
+std::pair<EVT, EVT>
+AMDGPUTargetLowering::getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const {
+ EVT LoVT, HiVT;
+ EVT EltVT = VT.getVectorElementType();
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
+ LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
+ HiVT = NumElts - LoNumElts == 1
+ ? EltVT
+ : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
+ return std::make_pair(LoVT, HiVT);
+}
+
+// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
+// scalar.
+std::pair<SDValue, SDValue>
+AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL,
+ const EVT &LoVT, const EVT &HiVT,
+ SelectionDAG &DAG) const {
+ assert(LoVT.getVectorNumElements() +
+ (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
+ N.getValueType().getVectorNumElements() &&
+ "More vector elements requested than available!");
+ auto IdxTy = getVectorIdxTy(DAG.getDataLayout());
+ SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N,
+ DAG.getConstant(0, DL, IdxTy));
+ SDValue Hi = DAG.getNode(
+ HiVT.isVector() ? ISD::EXTRACT_SUBVECTOR : ISD::EXTRACT_VECTOR_ELT, DL,
+ HiVT, N, DAG.getConstant(LoVT.getVectorNumElements(), DL, IdxTy));
+ return std::make_pair(Lo, Hi);
+}
+
SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
SelectionDAG &DAG) const {
LoadSDNode *Load = cast<LoadSDNode>(Op);
@@ -1393,9 +1472,9 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
EVT LoMemVT, HiMemVT;
SDValue Lo, Hi;
- std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
- std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT);
- std::tie(Lo, Hi) = DAG.SplitVector(Op, SL, LoVT, HiVT);
+ std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
+ std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
+ std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);
unsigned Size = LoMemVT.getStoreSize();
unsigned BaseAlign = Load->getAlignment();
@@ -1410,15 +1489,52 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
- SDValue Ops[] = {
- DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad),
- DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
- LoLoad.getValue(1), HiLoad.getValue(1))
- };
+ auto IdxTy = getVectorIdxTy(DAG.getDataLayout());
+ SDValue Join;
+ if (LoVT == HiVT) {
+ // This is the case that the vector is power of two so was evenly split.
+ Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
+ } else {
+ Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getUNDEF(VT), LoLoad,
+ DAG.getConstant(0, SL, IdxTy));
+ Join = DAG.getNode(HiVT.isVector() ? ISD::INSERT_SUBVECTOR
+ : ISD::INSERT_VECTOR_ELT,
+ SL, VT, Join, HiLoad,
+ DAG.getConstant(LoVT.getVectorNumElements(), SL, IdxTy));
+ }
+
+ SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
+ LoLoad.getValue(1), HiLoad.getValue(1))};
return DAG.getMergeValues(Ops, SL);
}
+// Widen a vector load from vec3 to vec4.
+SDValue AMDGPUTargetLowering::WidenVectorLoad(SDValue Op,
+ SelectionDAG &DAG) const {
+ LoadSDNode *Load = cast<LoadSDNode>(Op);
+ EVT VT = Op.getValueType();
+ assert(VT.getVectorNumElements() == 3);
+ SDValue BasePtr = Load->getBasePtr();
+ EVT MemVT = Load->getMemoryVT();
+ SDLoc SL(Op);
+ const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
+ unsigned BaseAlign = Load->getAlignment();
+
+ EVT WideVT =
+ EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
+ EVT WideMemVT =
+ EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(), 4);
+ SDValue WideLoad = DAG.getExtLoad(
+ Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
+ WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
+ return DAG.getMergeValues(
+ {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
+ DAG.getConstant(0, SL, getVectorIdxTy(DAG.getDataLayout()))),
+ WideLoad.getValue(1)},
+ SL);
+}
+
SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
SelectionDAG &DAG) const {
StoreSDNode *Store = cast<StoreSDNode>(Op);
@@ -1439,9 +1555,9 @@ SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
EVT LoMemVT, HiMemVT;
SDValue Lo, Hi;
- std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
- std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT);
- std::tie(Lo, Hi) = DAG.SplitVector(Val, SL, LoVT, HiVT);
+ std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
+ std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
+ std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);
SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());
@@ -2788,6 +2904,54 @@ bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
return true;
}
+// Find a load or store from corresponding pattern root.
+// Roots may be build_vector, bitconvert or their combinations.
+static MemSDNode* findMemSDNode(SDNode *N) {
+ N = AMDGPUTargetLowering::stripBitcast(SDValue(N,0)).getNode();
+ if (MemSDNode *MN = dyn_cast<MemSDNode>(N))
+ return MN;
+ assert(isa<BuildVectorSDNode>(N));
+ for (SDValue V : N->op_values())
+ if (MemSDNode *MN =
+ dyn_cast<MemSDNode>(AMDGPUTargetLowering::stripBitcast(V)))
+ return MN;
+ llvm_unreachable("cannot find MemSDNode in the pattern!");
+}
+
+bool AMDGPUTargetLowering::SelectFlatOffset(bool IsSigned,
+ SelectionDAG &DAG,
+ SDNode *N,
+ SDValue Addr,
+ SDValue &VAddr,
+ SDValue &Offset,
+ SDValue &SLC) const {
+ const GCNSubtarget &ST =
+ DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
+ int64_t OffsetVal = 0;
+
+ if (ST.hasFlatInstOffsets() &&
+ (!ST.hasFlatSegmentOffsetBug() ||
+ findMemSDNode(N)->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS) &&
+ DAG.isBaseWithConstantOffset(Addr)) {
+ SDValue N0 = Addr.getOperand(0);
+ SDValue N1 = Addr.getOperand(1);
+ int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
+
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ if (TII->isLegalFLATOffset(COffsetVal, findMemSDNode(N)->getAddressSpace(),
+ IsSigned)) {
+ Addr = N0;
+ OffsetVal = COffsetVal;
+ }
+ }
+
+ VAddr = Addr;
+ Offset = DAG.getTargetConstant(OffsetVal, SDLoc(), MVT::i16);
+ SLC = DAG.getTargetConstant(0, SDLoc(), MVT::i1);
+
+ return true;
+}
+
// Replace load of an illegal type with a store of a bitcast to a friendlier
// type.
SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
@@ -2812,7 +2976,8 @@ SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
// Expand unaligned loads earlier than legalization. Due to visitation order
// problems during legalization, the emitted instructions to pack and unpack
// the bytes again are not eliminated in the case of an unaligned copy.
- if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast)) {
+ if (!allowsMisalignedMemoryAccesses(
+ VT, AS, Align, LN->getMemOperand()->getFlags(), &IsFast)) {
if (VT.isVector())
return scalarizeVectorLoad(LN, DAG);
@@ -2864,7 +3029,8 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
// order problems during legalization, the emitted instructions to pack and
// unpack the bytes again are not eliminated in the case of an unaligned
// copy.
- if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast)) {
+ if (!allowsMisalignedMemoryAccesses(
+ VT, AS, Align, SN->getMemOperand()->getFlags(), &IsFast)) {
if (VT.isVector())
return scalarizeVectorStore(SN, DAG);
@@ -3049,30 +3215,44 @@ SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
- if (N->getValueType(0) != MVT::i64)
- return SDValue();
-
- const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
if (!RHS)
return SDValue();
+ EVT VT = N->getValueType(0);
+ SDValue LHS = N->getOperand(0);
unsigned ShiftAmt = RHS->getZExtValue();
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc SL(N);
+
+  // fold (srl (and x, (c1 << c2)), c2) -> (and (srl x, c2), c1)
+  // This improves the ability to match BFE patterns in isel.
+ if (LHS.getOpcode() == ISD::AND) {
+ if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
+ if (Mask->getAPIntValue().isShiftedMask() &&
+ Mask->getAPIntValue().countTrailingZeros() == ShiftAmt) {
+ return DAG.getNode(
+ ISD::AND, SL, VT,
+ DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0), N->getOperand(1)),
+ DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1), N->getOperand(1)));
+ }
+ }
+ }
+
+ if (VT != MVT::i64)
+ return SDValue();
+
if (ShiftAmt < 32)
return SDValue();
// srl i64:x, C for C >= 32
// =>
// build_pair (srl hi_32(x), C - 32), 0
-
- SelectionDAG &DAG = DCI.DAG;
- SDLoc SL(N);
-
SDValue One = DAG.getConstant(1, SL, MVT::i32);
SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
- SDValue VecOp = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, N->getOperand(0));
- SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
- VecOp, One);
+ SDValue VecOp = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, LHS);
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecOp, One);
SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
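
A minimal check of the fold introduced above, using concrete values: when the AND mask is a shifted contiguous mask whose trailing-zero count equals the shift amount, masking then shifting equals shifting then masking with the unshifted mask.

// Verify (srl (and x, (c1 << c2)), c2) == (and (srl x, c2), c1) for a sample.
#include <cassert>
#include <cstdint>

int main() {
  uint32_t x = 0xDEADBEEF;
  uint32_t c1 = 0xFF;                     // contiguous mask
  uint32_t c2 = 8;                        // shift == trailing zeros of (c1 << c2)
  uint32_t lhs = (x & (c1 << c2)) >> c2;  // original pattern
  uint32_t rhs = (x >> c2) & c1;          // rewritten form, matches BFE in isel
  assert(lhs == rhs);
  return 0;
}
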
@@ -3090,7 +3270,7 @@ SDValue AMDGPUTargetLowering::performTruncateCombine(
SDValue Src = N->getOperand(0);
// vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
- if (Src.getOpcode() == ISD::BITCAST) {
+ if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
SDValue Vec = Src.getOperand(0);
if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
SDValue Elt0 = Vec.getOperand(0);
@@ -3478,13 +3658,11 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
SelectionDAG &DAG = DCI.DAG;
- if ((DAG.isConstantValueOfAnyType(True) ||
- DAG.isConstantValueOfAnyType(True)) &&
- (!DAG.isConstantValueOfAnyType(False) &&
- !DAG.isConstantValueOfAnyType(False))) {
+ if (DAG.isConstantValueOfAnyType(True) &&
+ !DAG.isConstantValueOfAnyType(False)) {
// Swap cmp + select pair to move constant to false input.
// This will allow using VOPC cndmasks more often.
- // select (setcc x, y), k, x -> select (setcc y, x) x, x
+ // select (setcc x, y), k, x -> select (setccinv x, y), x, k
SDLoc SL(N);
ISD::CondCode NewCC = getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
@@ -3594,6 +3772,8 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
RHS = RHS.getOperand(0);
SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
+ if (Res.getOpcode() != ISD::FADD)
+ return SDValue(); // Op got folded away.
if (!N0.hasOneUse())
DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
return Res;
@@ -3613,6 +3793,8 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
+ if (Res.getOpcode() != Opc)
+ return SDValue(); // Op got folded away.
if (!N0.hasOneUse())
DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
return Res;
@@ -3640,6 +3822,8 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
RHS = RHS.getOperand(0);
SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
+ if (Res.getOpcode() != Opc)
+ return SDValue(); // Op got folded away.
if (!N0.hasOneUse())
DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
return Res;
@@ -3668,6 +3852,8 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
unsigned Opposite = inverseMinMax(Opc);
SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
+ if (Res.getOpcode() != Opposite)
+ return SDValue(); // Op got folded away.
if (!N0.hasOneUse())
DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
return Res;
@@ -3678,6 +3864,8 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags());
SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags());
+ if (Res.getOpcode() != AMDGPUISD::FMED3)
+ return SDValue(); // Op got folded away.
if (!N0.hasOneUse())
DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
return Res;
@@ -4051,9 +4239,19 @@ SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
const ArgDescriptor &Arg) const {
assert(Arg && "Attempting to load missing argument");
- if (Arg.isRegister())
- return CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL);
- return loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
+ SDValue V = Arg.isRegister() ?
+ CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL) :
+ loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
+
+ if (!Arg.isMasked())
+ return V;
+
+ unsigned Mask = Arg.getMask();
+ unsigned Shift = countTrailingZeros<unsigned>(Mask);
+ V = DAG.getNode(ISD::SRL, SL, VT, V,
+ DAG.getShiftAmountConstant(Shift, VT, SL));
+ return DAG.getNode(ISD::AND, SL, VT, V,
+ DAG.getConstant(Mask >> Shift, SL, VT));
}
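
The masked-argument path above extracts a bitfield from a packed live-in: shift right by the mask's trailing zeros, then AND with the shifted-down mask. A small standalone sketch follows; the 10-bit field position is an arbitrary example, not taken from the patch.

// Model of the mask/shift handling in loadInputValue.
#include <cassert>
#include <cstdint>

static unsigned extractMaskedArg(unsigned V, unsigned Mask) {
  unsigned Shift = __builtin_ctz(Mask);   // countTrailingZeros
  return (V >> Shift) & (Mask >> Shift);
}

int main() {
  unsigned Mask = 0x3FFu << 20;                // field packed in bits [29:20]
  unsigned Packed = (123u << 20) | 0xFFFFFu;   // other fields set to ones
  assert(extractMaskedArg(Packed, Mask) == 123u);
  return 0;
}
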
uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
@@ -4175,6 +4373,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
NODE_NAME_CASE(CONST_DATA_PTR)
NODE_NAME_CASE(PC_ADD_REL_OFFSET)
+ NODE_NAME_CASE(LDS)
NODE_NAME_CASE(KILL)
NODE_NAME_CASE(DUMMY_CHAIN)
case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
@@ -4185,24 +4384,38 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(INTERP_MOV)
NODE_NAME_CASE(INTERP_P1)
NODE_NAME_CASE(INTERP_P2)
+ NODE_NAME_CASE(INTERP_P1LL_F16)
+ NODE_NAME_CASE(INTERP_P1LV_F16)
+ NODE_NAME_CASE(INTERP_P2_F16)
+ NODE_NAME_CASE(LOAD_D16_HI)
+ NODE_NAME_CASE(LOAD_D16_LO)
+ NODE_NAME_CASE(LOAD_D16_HI_I8)
+ NODE_NAME_CASE(LOAD_D16_HI_U8)
+ NODE_NAME_CASE(LOAD_D16_LO_I8)
+ NODE_NAME_CASE(LOAD_D16_LO_U8)
NODE_NAME_CASE(STORE_MSKOR)
NODE_NAME_CASE(LOAD_CONSTANT)
NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
- NODE_NAME_CASE(TBUFFER_STORE_FORMAT_X3)
NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16)
NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)
+ NODE_NAME_CASE(DS_ORDERED_COUNT)
NODE_NAME_CASE(ATOMIC_CMP_SWAP)
NODE_NAME_CASE(ATOMIC_INC)
NODE_NAME_CASE(ATOMIC_DEC)
- NODE_NAME_CASE(ATOMIC_LOAD_FADD)
NODE_NAME_CASE(ATOMIC_LOAD_FMIN)
NODE_NAME_CASE(ATOMIC_LOAD_FMAX)
NODE_NAME_CASE(BUFFER_LOAD)
+ NODE_NAME_CASE(BUFFER_LOAD_UBYTE)
+ NODE_NAME_CASE(BUFFER_LOAD_USHORT)
+ NODE_NAME_CASE(BUFFER_LOAD_BYTE)
+ NODE_NAME_CASE(BUFFER_LOAD_SHORT)
NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
NODE_NAME_CASE(SBUFFER_LOAD)
NODE_NAME_CASE(BUFFER_STORE)
+ NODE_NAME_CASE(BUFFER_STORE_BYTE)
+ NODE_NAME_CASE(BUFFER_STORE_SHORT)
NODE_NAME_CASE(BUFFER_STORE_FORMAT)
NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16)
NODE_NAME_CASE(BUFFER_ATOMIC_SWAP)
@@ -4216,6 +4429,10 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(BUFFER_ATOMIC_OR)
NODE_NAME_CASE(BUFFER_ATOMIC_XOR)
NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
+ NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
+ NODE_NAME_CASE(BUFFER_ATOMIC_PK_FADD)
+ NODE_NAME_CASE(ATOMIC_FADD)
+ NODE_NAME_CASE(ATOMIC_PK_FADD)
case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
}
@@ -4367,6 +4584,23 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
}
break;
}
+ case AMDGPUISD::BUFFER_LOAD_UBYTE: {
+ Known.Zero.setHighBits(24);
+ break;
+ }
+ case AMDGPUISD::BUFFER_LOAD_USHORT: {
+ Known.Zero.setHighBits(16);
+ break;
+ }
+ case AMDGPUISD::LDS: {
+ auto GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode());
+ unsigned Align = GA->getGlobal()->getAlignment();
+
+ Known.Zero.setHighBits(16);
+ if (Align)
+ Known.Zero.setLowBits(Log2_32(Align));
+ break;
+ }
case ISD::INTRINSIC_WO_CHAIN: {
unsigned IID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
switch (IID) {
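
The known-bits facts recorded above can be sanity-checked with plain integers: a zero-extended byte or short load clears the high 24 or 16 bits, and an LDS address of a global aligned to N bytes has log2(N) known-zero low bits. A quick standalone illustration:

// Plain-integer illustration of the Known.Zero settings above.
#include <cassert>
#include <cstdint>

int main() {
  uint32_t ByteLoad = 0xAB;               // ubyte load result, zext to i32
  assert((ByteLoad & 0xFFFFFF00u) == 0);  // Known.Zero.setHighBits(24)

  uint32_t Align = 8;                     // 8-byte aligned LDS object
  uint32_t LDSAddr = 5 * Align;           // any multiple of the alignment
  assert((LDSAddr & (Align - 1)) == 0);   // Known.Zero.setLowBits(Log2_32(8))
  return 0;
}
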
@@ -4412,6 +4646,14 @@ unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
case AMDGPUISD::CARRY:
case AMDGPUISD::BORROW:
return 31;
+ case AMDGPUISD::BUFFER_LOAD_BYTE:
+ return 25;
+ case AMDGPUISD::BUFFER_LOAD_SHORT:
+ return 17;
+ case AMDGPUISD::BUFFER_LOAD_UBYTE:
+ return 24;
+ case AMDGPUISD::BUFFER_LOAD_USHORT:
+ return 16;
case AMDGPUISD::FP_TO_FP16:
case AMDGPUISD::FP16_ZEXT:
return 16;
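
The sign-bit counts above follow from the load widths: a sign-extended i8 in an i32 repeats its sign bit across bits 31..7 (25 bits), an i16 across bits 31..15 (17 bits), and the zero-extending forms have 24 or 16 known-zero high bits. A small standalone check:

// Count leading bits equal to the sign bit, as ComputeNumSignBits reports.
#include <cassert>
#include <cstdint>

static int numSignBits(int32_t V) {
  int N = 1;
  for (int I = 30; I >= 0 && (((V >> I) & 1) == (int)((uint32_t)V >> 31)); --I)
    ++N;
  return N;
}

int main() {
  int32_t SExtByte = (int32_t)(int8_t)0x80;   // -128 sign-extended to i32
  int32_t ZExtByte = (int32_t)(uint8_t)0x80;  // 128 zero-extended to i32
  assert(numSignBits(SExtByte) >= 25);        // BUFFER_LOAD_BYTE case
  assert(numSignBits(ZExtByte) >= 24);        // BUFFER_LOAD_UBYTE case
  return 0;
}
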
@@ -4519,7 +4761,12 @@ bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
TargetLowering::AtomicExpansionKind
AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
- if (RMW->getOperation() == AtomicRMWInst::Nand)
+ switch (RMW->getOperation()) {
+ case AtomicRMWInst::Nand:
+ case AtomicRMWInst::FAdd:
+ case AtomicRMWInst::FSub:
return AtomicExpansionKind::CmpXChg;
- return AtomicExpansionKind::None;
+ default:
+ return AtomicExpansionKind::None;
+ }
}
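
Returning CmpXChg here makes AtomicExpandPass lower these operations to a compare-exchange loop. A rough model of that loop for a floating-point add, written with std::atomic instead of LLVM IR (a sketch, not the pass's actual output):

// CAS-loop shape produced when an atomicrmw (e.g. fadd) is expanded.
#include <atomic>
#include <cassert>

static float atomicFAdd(std::atomic<float> &A, float V) {
  float Old = A.load();
  while (!A.compare_exchange_weak(Old, Old + V))
    ;  // Old is reloaded on failure; retry with the fresh value
  return Old;
}

int main() {
  std::atomic<float> Acc{1.0f};
  atomicFAdd(Acc, 2.5f);
  assert(Acc.load() == 3.5f);
  return 0;
}
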
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 0d22cb2e3e20..fe7ad694943d 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -1,9 +1,8 @@
//===-- AMDGPUISelLowering.h - AMDGPU Lowering Interface --------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -111,9 +110,23 @@ protected:
SDValue getLoHalf64(SDValue Op, SelectionDAG &DAG) const;
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const;
+ /// Split a vector type into two parts. The first part is a power of two
+ /// vector. The second part is whatever is left over, and is a scalar if it
+ /// would otherwise be a 1-vector.
+ std::pair<EVT, EVT> getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const;
+
+ /// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
+ /// a scalar type.
+ std::pair<SDValue, SDValue> splitVector(const SDValue &N, const SDLoc &DL,
+ const EVT &LoVT, const EVT &HighVT,
+ SelectionDAG &DAG) const;
+
/// Split a vector load into 2 loads of half the vector.
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const;
+ /// Widen a vector load from vec3 to vec4.
+ SDValue WidenVectorLoad(SDValue Op, SelectionDAG &DAG) const;
+
/// Split a vector store into 2 stores of half the vector.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const;
@@ -162,13 +175,15 @@ public:
MVT getVectorIdxTy(const DataLayout &) const override;
bool isSelectSupported(SelectSupportKind) const override;
- bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
+ bool isFPImmLegal(const APFloat &Imm, EVT VT,
+ bool ForCodeSize) const override;
bool ShouldShrinkFPConstant(EVT VT) const override;
bool shouldReduceLoadWidth(SDNode *Load,
ISD::LoadExtType ExtType,
EVT ExtVT) const override;
- bool isLoadBitCastBeneficial(EVT, EVT) const final;
+ bool isLoadBitCastBeneficial(EVT, EVT, const SelectionDAG &DAG,
+ const MachineMemOperand &MMO) const final;
bool storeOfVectorConstantIsCheap(EVT MemVT,
unsigned NumElem,
@@ -212,15 +227,15 @@ public:
const char* getTargetNodeName(unsigned Opcode) const override;
- // FIXME: Turn off MergeConsecutiveStores() before Instruction Selection
- // for AMDGPU.
- // A commit ( git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@319036
- // 91177308-0d34-0410-b5e6-96231b3b80d8 ) turned on
- // MergeConsecutiveStores() before Instruction Selection for all targets.
- // Enough AMDGPU compiles go into an infinite loop ( MergeConsecutiveStores()
- // merges two stores; LegalizeStoreOps() un-merges; MergeConsecutiveStores()
- // re-merges, etc. ) to warrant turning it off for now.
- bool mergeStoresAfterLegalization() const override { return false; }
+ // FIXME: Turn off MergeConsecutiveStores() before Instruction Selection for
+ // AMDGPU. Commit r319036,
+ // (https://github.com/llvm/llvm-project/commit/db77e57ea86d941a4262ef60261692f4cb6893e6)
+ // turned on MergeConsecutiveStores() before Instruction Selection for all
+ // targets. Enough AMDGPU compiles go into an infinite loop (
+ // MergeConsecutiveStores() merges two stores; LegalizeStoreOps() un-merges;
+ // MergeConsecutiveStores() re-merges, etc. ) to warrant turning it off for
+ // now.
+ bool mergeStoresAfterLegalization(EVT) const override { return false; }
bool isFsqrtCheap(SDValue Operand, SelectionDAG &DAG) const override {
return true;
@@ -309,6 +324,10 @@ public:
}
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override;
+
+ bool SelectFlatOffset(bool IsSigned, SelectionDAG &DAG, SDNode *N,
+ SDValue Addr, SDValue &VAddr, SDValue &Offset,
+ SDValue &SLC) const;
};
namespace AMDGPUISD {
@@ -463,28 +482,44 @@ enum NodeType : unsigned {
INTERP_MOV,
INTERP_P1,
INTERP_P2,
+ INTERP_P1LL_F16,
+ INTERP_P1LV_F16,
+ INTERP_P2_F16,
PC_ADD_REL_OFFSET,
+ LDS,
KILL,
DUMMY_CHAIN,
FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE,
+ LOAD_D16_HI,
+ LOAD_D16_LO,
+ LOAD_D16_HI_I8,
+ LOAD_D16_HI_U8,
+ LOAD_D16_LO_I8,
+ LOAD_D16_LO_U8,
+
STORE_MSKOR,
LOAD_CONSTANT,
TBUFFER_STORE_FORMAT,
- TBUFFER_STORE_FORMAT_X3,
TBUFFER_STORE_FORMAT_D16,
TBUFFER_LOAD_FORMAT,
TBUFFER_LOAD_FORMAT_D16,
+ DS_ORDERED_COUNT,
ATOMIC_CMP_SWAP,
ATOMIC_INC,
ATOMIC_DEC,
- ATOMIC_LOAD_FADD,
ATOMIC_LOAD_FMIN,
ATOMIC_LOAD_FMAX,
BUFFER_LOAD,
+ BUFFER_LOAD_UBYTE,
+ BUFFER_LOAD_USHORT,
+ BUFFER_LOAD_BYTE,
+ BUFFER_LOAD_SHORT,
BUFFER_LOAD_FORMAT,
BUFFER_LOAD_FORMAT_D16,
SBUFFER_LOAD,
BUFFER_STORE,
+ BUFFER_STORE_BYTE,
+ BUFFER_STORE_SHORT,
BUFFER_STORE_FORMAT,
BUFFER_STORE_FORMAT_D16,
BUFFER_ATOMIC_SWAP,
@@ -498,6 +533,10 @@ enum NodeType : unsigned {
BUFFER_ATOMIC_OR,
BUFFER_ATOMIC_XOR,
BUFFER_ATOMIC_CMPSWAP,
+ BUFFER_ATOMIC_FADD,
+ BUFFER_ATOMIC_PK_FADD,
+ ATOMIC_FADD,
+ ATOMIC_PK_FADD,
LAST_AMDGPU_ISD_NUMBER
};
diff --git a/lib/Target/AMDGPU/AMDGPUInline.cpp b/lib/Target/AMDGPU/AMDGPUInline.cpp
index 945c9acd379a..f4df20b8f03e 100644
--- a/lib/Target/AMDGPU/AMDGPUInline.cpp
+++ b/lib/Target/AMDGPU/AMDGPUInline.cpp
@@ -1,9 +1,8 @@
//===- AMDGPUInline.cpp - Code to perform simple function inlining --------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -40,7 +39,7 @@ using namespace llvm;
#define DEBUG_TYPE "inline"
static cl::opt<int>
-ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(2200),
+ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(1500),
cl::desc("Cost of alloca argument"));
// If the amount of scratch memory to eliminate exceeds our ability to allocate
@@ -50,6 +49,12 @@ static cl::opt<unsigned>
ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden, cl::init(256),
cl::desc("Maximum alloca size to use for inline cost"));
+// Inliner constraint to achieve reasonable compilation time
+static cl::opt<size_t>
+MaxBB("amdgpu-inline-max-bb", cl::Hidden, cl::init(300),
+ cl::desc("Maximum BB number allowed in a function after inlining"
+ " (compile time constraint)"));
+
namespace {
class AMDGPUInliner : public LegacyInlinerBase {
@@ -112,7 +117,8 @@ unsigned AMDGPUInliner::getInlineThreshold(CallSite CS) const {
Callee->hasFnAttribute(Attribute::InlineHint);
if (InlineHint && Params.HintThreshold && Params.HintThreshold > Thres
&& !Caller->hasFnAttribute(Attribute::MinSize))
- Thres = Params.HintThreshold.getValue();
+ Thres = Params.HintThreshold.getValue() *
+ TTIWP->getTTI(*Callee).getInliningThresholdMultiplier();
const DataLayout &DL = Caller->getParent()->getDataLayout();
if (!Callee)
@@ -124,10 +130,11 @@ unsigned AMDGPUInliner::getInlineThreshold(CallSite CS) const {
uint64_t AllocaSize = 0;
SmallPtrSet<const AllocaInst *, 8> AIVisited;
for (Value *PtrArg : CS.args()) {
- Type *Ty = PtrArg->getType();
- if (!Ty->isPointerTy() ||
- Ty->getPointerAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
+ PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType());
+ if (!Ty || (Ty->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS &&
+ Ty->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS))
continue;
+
PtrArg = GetUnderlyingObject(PtrArg, DL);
if (const AllocaInst *AI = dyn_cast<AllocaInst>(PtrArg)) {
if (!AI->isStaticAlloca() || !AIVisited.insert(AI).second)
@@ -170,7 +177,6 @@ static bool isWrapperOnlyCall(CallSite CS) {
InlineCost AMDGPUInliner::getInlineCost(CallSite CS) {
Function *Callee = CS.getCalledFunction();
Function *Caller = CS.getCaller();
- TargetTransformInfo &TTI = TTIWP->getTTI(*Callee);
if (!Callee || Callee->isDeclaration())
return llvm::InlineCost::getNever("undefined callee");
@@ -178,13 +184,15 @@ InlineCost AMDGPUInliner::getInlineCost(CallSite CS) {
if (CS.isNoInline())
return llvm::InlineCost::getNever("noinline");
+ TargetTransformInfo &TTI = TTIWP->getTTI(*Callee);
if (!TTI.areInlineCompatible(Caller, Callee))
return llvm::InlineCost::getNever("incompatible");
if (CS.hasFnAttr(Attribute::AlwaysInline)) {
- if (isInlineViable(*Callee))
+ auto IsViable = isInlineViable(*Callee);
+ if (IsViable)
return llvm::InlineCost::getAlways("alwaysinline viable");
- return llvm::InlineCost::getNever("alwaysinline unviable");
+ return llvm::InlineCost::getNever(IsViable.message);
}
if (isWrapperOnlyCall(CS))
@@ -206,6 +214,15 @@ InlineCost AMDGPUInliner::getInlineCost(CallSite CS) {
return ACT->getAssumptionCache(F);
};
- return llvm::getInlineCost(CS, Callee, LocalParams, TTI, GetAssumptionCache,
- None, PSI, RemarksEnabled ? &ORE : nullptr);
+ auto IC = llvm::getInlineCost(cast<CallBase>(*CS.getInstruction()), Callee,
+ LocalParams, TTI, GetAssumptionCache, None, PSI,
+ RemarksEnabled ? &ORE : nullptr);
+
+ if (IC && !IC.isAlways() && !Callee->hasFnAttribute(Attribute::InlineHint)) {
+ // A single BB does not increase the total BB count, thus subtract 1.
+ size_t Size = Caller->size() + Callee->size() - 1;
+ if (MaxBB && Size > MaxBB)
+ return llvm::InlineCost::getNever("max number of bb exceeded");
+ }
+ return IC;
}
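
A standalone restatement of the new basic-block budget above: inlining is rejected when the combined caller and callee block count (minus the one merged block) exceeds amdgpu-inline-max-bb, and a limit of 0 disables the check.

// Model of the compile-time constraint added to AMDGPUInliner::getInlineCost.
#include <cassert>
#include <cstddef>

static bool exceedsBBBudget(size_t CallerBBs, size_t CalleeBBs, size_t MaxBB) {
  size_t Size = CallerBBs + CalleeBBs - 1;  // one block merges at the call site
  return MaxBB && Size > MaxBB;
}

int main() {
  assert(!exceedsBBBudget(200, 100, 300));  // 299 <= 300, allowed
  assert(exceedsBBBudget(250, 100, 300));   // 349 > 300, rejected
  assert(!exceedsBBBudget(1000, 1000, 0));  // limit of 0 disables the check
  return 0;
}
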
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
index 07aa7c2cc8ad..9951cbf2326e 100644
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
@@ -1,9 +1,8 @@
//===-- AMDGPUInstrInfo.cpp - Base class for AMD GPU InstrInfo ------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/lib/Target/AMDGPU/AMDGPUInstrInfo.h
index 2f8166da0d33..698189e14c21 100644
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.h
@@ -1,9 +1,8 @@
//===-- AMDGPUInstrInfo.h - AMDGPU Instruction Information ------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index 82644be26563..4a8446955496 100644
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -1,9 +1,8 @@
//===-- AMDGPUInstrInfo.td - AMDGPU DAG nodes --------------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -51,27 +50,21 @@ def AMDGPUFmasOp : SDTypeProfile<1, 4,
def AMDGPUKillSDT : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
def AMDGPUIfOp : SDTypeProfile<1, 2,
- [SDTCisVT<0, i64>, SDTCisVT<1, i1>, SDTCisVT<2, OtherVT>]
+ [SDTCisVT<0, i1>, SDTCisVT<1, i1>, SDTCisVT<2, OtherVT>]
>;
def AMDGPUElseOp : SDTypeProfile<1, 2,
- [SDTCisVT<0, i64>, SDTCisVT<1, i64>, SDTCisVT<2, OtherVT>]
+ [SDTCisVT<0, i1>, SDTCisVT<1, i1>, SDTCisVT<2, OtherVT>]
>;
def AMDGPULoopOp : SDTypeProfile<0, 2,
- [SDTCisVT<0, i64>, SDTCisVT<1, OtherVT>]
+ [SDTCisVT<0, i1>, SDTCisVT<1, OtherVT>]
>;
def AMDGPUIfBreakOp : SDTypeProfile<1, 2,
- [SDTCisVT<0, i64>, SDTCisVT<1, i1>, SDTCisVT<2, i64>]
->;
-
-def AMDGPUAddeSubeOp : SDTypeProfile<2, 3,
- [SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisVT<0, i32>, SDTCisVT<1, i1>, SDTCisVT<4, i1>]
+ [SDTCisVT<0, i1>, SDTCisVT<1, i1>, SDTCisVT<2, i1>]
>;
-def SDT_AMDGPUTCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>]>;
-
//===----------------------------------------------------------------------===//
// AMDGPU DAG Nodes
//
@@ -96,7 +89,8 @@ def AMDGPUcall : SDNode<"AMDGPUISD::CALL",
SDNPVariadic]
>;
-def AMDGPUtc_return: SDNode<"AMDGPUISD::TC_RETURN", SDT_AMDGPUTCRET,
+def AMDGPUtc_return: SDNode<"AMDGPUISD::TC_RETURN",
+ SDTypeProfile<0, 3, [SDTCisPtrTy<0>]>,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
>;
@@ -205,14 +199,8 @@ def AMDGPUcarry : SDNode<"AMDGPUISD::CARRY", SDTIntBinOp, []>;
// out = (src1 > src0) ? 1 : 0
def AMDGPUborrow : SDNode<"AMDGPUISD::BORROW", SDTIntBinOp, []>;
-// TODO: remove AMDGPUadde/AMDGPUsube when ADDCARRY/SUBCARRY get their own
-// nodes in TargetSelectionDAG.td.
-def AMDGPUadde : SDNode<"ISD::ADDCARRY", AMDGPUAddeSubeOp, []>;
-
-def AMDGPUsube : SDNode<"ISD::SUBCARRY", AMDGPUAddeSubeOp, []>;
-
def AMDGPUSetCCOp : SDTypeProfile<1, 3, [ // setcc
- SDTCisVT<0, i64>, SDTCisSameAs<1, 2>, SDTCisVT<3, OtherVT>
+ SDTCisInt<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, OtherVT>
]>;
def AMDGPUsetcc : SDNode<"AMDGPUISD::SETCC", AMDGPUSetCCOp>;
@@ -251,7 +239,8 @@ def AMDGPUdiv_scale : SDNode<"AMDGPUISD::DIV_SCALE", AMDGPUDivScaleOp>;
// Special case divide FMA with scale and flags (src0 = Quotient,
// src1 = Denominator, src2 = Numerator).
-def AMDGPUdiv_fmas : SDNode<"AMDGPUISD::DIV_FMAS", AMDGPUFmasOp>;
+def AMDGPUdiv_fmas : SDNode<"AMDGPUISD::DIV_FMAS", AMDGPUFmasOp,
+ [SDNPOptInGlue]>;
// Single or double precision division fixup.
// Special case divide fixup and flags(src0 = Quotient, src1 =
@@ -370,6 +359,17 @@ def AMDGPUinterp_p2 : SDNode<"AMDGPUISD::INTERP_P2",
SDTypeProfile<1, 4, [SDTCisFP<0>]>,
[SDNPInGlue]>;
+def AMDGPUinterp_p1ll_f16 : SDNode<"AMDGPUISD::INTERP_P1LL_F16",
+ SDTypeProfile<1, 7, [SDTCisFP<0>]>,
+ [SDNPInGlue, SDNPOutGlue]>;
+
+def AMDGPUinterp_p1lv_f16 : SDNode<"AMDGPUISD::INTERP_P1LV_F16",
+ SDTypeProfile<1, 9, [SDTCisFP<0>]>,
+ [SDNPInGlue, SDNPOutGlue]>;
+
+def AMDGPUinterp_p2_f16 : SDNode<"AMDGPUISD::INTERP_P2_F16",
+ SDTypeProfile<1, 8, [SDTCisFP<0>]>,
+ [SDNPInGlue]>;
def AMDGPUkill : SDNode<"AMDGPUISD::KILL", AMDGPUKillSDT,
[SDNPHasChain, SDNPSideEffect]>;
diff --git a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 8eb49d49b2e0..901a2eaa8829 100644
--- a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1,9 +1,8 @@
//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
@@ -18,10 +17,11 @@
#include "AMDGPURegisterInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
-#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
+#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -35,6 +35,7 @@
#define DEBUG_TYPE "amdgpu-isel"
using namespace llvm;
+using namespace MIPatternMatch;
#define GET_GLOBALISEL_IMPL
#define AMDGPUSubtarget GCNSubtarget
@@ -60,11 +61,101 @@ AMDGPUInstructionSelector::AMDGPUInstructionSelector(
const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
+static bool isSCC(Register Reg, const MachineRegisterInfo &MRI) {
+ if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ return Reg == AMDGPU::SCC;
+
+ auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
+ const TargetRegisterClass *RC =
+ RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
+ if (RC) {
+ // FIXME: This is ambiguous for wave32. This could be SCC or VCC, but the
+ // context of the register bank has been lost.
+ if (RC->getID() != AMDGPU::SReg_32_XM0RegClassID)
+ return false;
+ const LLT Ty = MRI.getType(Reg);
+ return Ty.isValid() && Ty.getSizeInBits() == 1;
+ }
+
+ const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
+ return RB->getID() == AMDGPU::SCCRegBankID;
+}
+
+bool AMDGPUInstructionSelector::isVCC(Register Reg,
+ const MachineRegisterInfo &MRI) const {
+ if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ return Reg == TRI.getVCC();
+
+ auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
+ const TargetRegisterClass *RC =
+ RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
+ if (RC) {
+ const LLT Ty = MRI.getType(Reg);
+ return RC->hasSuperClassEq(TRI.getBoolRC()) &&
+ Ty.isValid() && Ty.getSizeInBits() == 1;
+ }
+
+ const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
+ return RB->getID() == AMDGPU::VCCRegBankID;
+}
+
bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
+ const DebugLoc &DL = I.getDebugLoc();
MachineBasicBlock *BB = I.getParent();
MachineFunction *MF = BB->getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
I.setDesc(TII.get(TargetOpcode::COPY));
+
+ const MachineOperand &Src = I.getOperand(1);
+ MachineOperand &Dst = I.getOperand(0);
+ Register DstReg = Dst.getReg();
+ Register SrcReg = Src.getReg();
+
+ if (isVCC(DstReg, MRI)) {
+ if (SrcReg == AMDGPU::SCC) {
+ const TargetRegisterClass *RC
+ = TRI.getConstrainedRegClassForOperand(Dst, MRI);
+ if (!RC)
+ return true;
+ return RBI.constrainGenericRegister(DstReg, *RC, MRI);
+ }
+
+ if (!isVCC(SrcReg, MRI)) {
+ // TODO: Should probably leave the copy and let copyPhysReg expand it.
+ if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), MRI))
+ return false;
+
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
+ .addImm(0)
+ .addReg(SrcReg);
+
+ if (!MRI.getRegClassOrNull(SrcReg))
+ MRI.setRegClass(SrcReg, TRI.getConstrainedRegClassForOperand(Src, MRI));
+ I.eraseFromParent();
+ return true;
+ }
+
+ const TargetRegisterClass *RC =
+ TRI.getConstrainedRegClassForOperand(Dst, MRI);
+ if (RC && !RBI.constrainGenericRegister(DstReg, *RC, MRI))
+ return false;
+
+ // Don't constrain the source register to a class so the def instruction
+ // handles it (unless it's undef).
+ //
+ // FIXME: This is a hack. When selecting the def, we need to know
+ // specifically that the result is VCCRegBank, and not just an SGPR
+ // with size 1. An SReg_32 with size 1 is ambiguous with wave32.
+ if (Src.isUndef()) {
+ const TargetRegisterClass *SrcRC =
+ TRI.getConstrainedRegClassForOperand(Src, MRI);
+ if (SrcRC && !RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI))
+ return false;
+ }
+
+ return true;
+ }
+
for (const MachineOperand &MO : I.operands()) {
if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
continue;
@@ -78,15 +169,54 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
return true;
}
+bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
+ MachineBasicBlock *BB = I.getParent();
+ MachineFunction *MF = BB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+
+ const Register DefReg = I.getOperand(0).getReg();
+ const LLT DefTy = MRI.getType(DefReg);
+
+ // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
+
+ const RegClassOrRegBank &RegClassOrBank =
+ MRI.getRegClassOrRegBank(DefReg);
+
+ const TargetRegisterClass *DefRC
+ = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
+ if (!DefRC) {
+ if (!DefTy.isValid()) {
+ LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
+ return false;
+ }
+
+ const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
+ if (RB.getID() == AMDGPU::SCCRegBankID) {
+ LLVM_DEBUG(dbgs() << "illegal scc phi\n");
+ return false;
+ }
+
+ DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB, MRI);
+ if (!DefRC) {
+ LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
+ return false;
+ }
+ }
+
+ I.setDesc(TII.get(TargetOpcode::PHI));
+ return RBI.constrainGenericRegister(DefReg, *DefRC, MRI);
+}
+
MachineOperand
AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
+ const TargetRegisterClass &SubRC,
unsigned SubIdx) const {
MachineInstr *MI = MO.getParent();
MachineBasicBlock *BB = MO.getParent()->getParent();
MachineFunction *MF = BB->getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
- unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ Register DstReg = MRI.createVirtualRegister(&SubRC);
if (MO.isReg()) {
unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
@@ -118,51 +248,273 @@ static int64_t getConstant(const MachineInstr *MI) {
return MI->getOperand(1).getCImm()->getSExtValue();
}
-bool AMDGPUInstructionSelector::selectG_ADD(MachineInstr &I) const {
+static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
+ switch (Opc) {
+ case AMDGPU::G_AND:
+ return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
+ case AMDGPU::G_OR:
+ return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
+ case AMDGPU::G_XOR:
+ return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
+ default:
+ llvm_unreachable("not a bit op");
+ }
+}
+
+bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
MachineBasicBlock *BB = I.getParent();
MachineFunction *MF = BB->getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
- unsigned Size = RBI.getSizeInBits(I.getOperand(0).getReg(), MRI, TRI);
- unsigned DstLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
- unsigned DstHi = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ MachineOperand &Dst = I.getOperand(0);
+ MachineOperand &Src0 = I.getOperand(1);
+ MachineOperand &Src1 = I.getOperand(2);
+ Register DstReg = Dst.getReg();
+ unsigned Size = RBI.getSizeInBits(DstReg, MRI, TRI);
+
+ const RegisterBank *DstRB = RBI.getRegBank(DstReg, MRI, TRI);
+ if (DstRB->getID() == AMDGPU::VCCRegBankID) {
+ const TargetRegisterClass *RC = TRI.getBoolRC();
+ unsigned InstOpc = getLogicalBitOpcode(I.getOpcode(),
+ RC == &AMDGPU::SReg_64RegClass);
+ I.setDesc(TII.get(InstOpc));
+
+ // FIXME: Hack to avoid turning the register bank into a register class.
+ // The selector for G_ICMP relies on seeing that the register bank for the
+ // result is VCC. In wave32, if we constrain the registers to SReg_32 here,
+ // it will be ambiguous whether it's a scalar or vector bool.
+ if (Src0.isUndef() && !MRI.getRegClassOrNull(Src0.getReg()))
+ MRI.setRegClass(Src0.getReg(), RC);
+ if (Src1.isUndef() && !MRI.getRegClassOrNull(Src1.getReg()))
+ MRI.setRegClass(Src1.getReg(), RC);
+
+ return RBI.constrainGenericRegister(DstReg, *RC, MRI);
+ }
- if (Size != 64)
- return false;
+ // TODO: Should this allow an SCC bank result, and produce a copy from SCC for
+ // the result?
+ if (DstRB->getID() == AMDGPU::SGPRRegBankID) {
+ unsigned InstOpc = getLogicalBitOpcode(I.getOpcode(), Size > 32);
+ I.setDesc(TII.get(InstOpc));
- DebugLoc DL = I.getDebugLoc();
+ const TargetRegisterClass *RC
+ = TRI.getConstrainedRegClassForOperand(Dst, MRI);
+ if (!RC)
+ return false;
+ return RBI.constrainGenericRegister(DstReg, *RC, MRI) &&
+ RBI.constrainGenericRegister(Src0.getReg(), *RC, MRI) &&
+ RBI.constrainGenericRegister(Src1.getReg(), *RC, MRI);
+ }
- MachineOperand Lo1(getSubOperand64(I.getOperand(1), AMDGPU::sub0));
- MachineOperand Lo2(getSubOperand64(I.getOperand(2), AMDGPU::sub0));
+ return false;
+}
- BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
- .add(Lo1)
- .add(Lo2);
+bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
+ MachineBasicBlock *BB = I.getParent();
+ MachineFunction *MF = BB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ Register DstReg = I.getOperand(0).getReg();
+ const DebugLoc &DL = I.getDebugLoc();
+ unsigned Size = RBI.getSizeInBits(DstReg, MRI, TRI);
+ const RegisterBank *DstRB = RBI.getRegBank(DstReg, MRI, TRI);
+ const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
+ const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;
- MachineOperand Hi1(getSubOperand64(I.getOperand(1), AMDGPU::sub1));
- MachineOperand Hi2(getSubOperand64(I.getOperand(2), AMDGPU::sub1));
+ if (Size == 32) {
+ if (IsSALU) {
+ const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
+ MachineInstr *Add =
+ BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
+ .add(I.getOperand(1))
+ .add(I.getOperand(2));
+ I.eraseFromParent();
+ return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
+ }
- BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
- .add(Hi1)
- .add(Hi2);
+ if (STI.hasAddNoCarry()) {
+ const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
+ I.setDesc(TII.get(Opc));
+ I.addOperand(*MF, MachineOperand::CreateImm(0));
+ I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
+ return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+ }
- BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), I.getOperand(0).getReg())
- .addReg(DstLo)
- .addImm(AMDGPU::sub0)
- .addReg(DstHi)
- .addImm(AMDGPU::sub1);
+ const unsigned Opc = Sub ? AMDGPU::V_SUB_I32_e64 : AMDGPU::V_ADD_I32_e64;
- for (MachineOperand &MO : I.explicit_operands()) {
- if (!MO.isReg() || TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
- continue;
- RBI.constrainGenericRegister(MO.getReg(), AMDGPU::SReg_64RegClass, MRI);
+ Register UnusedCarry = MRI.createVirtualRegister(TRI.getWaveMaskRegClass());
+ MachineInstr *Add
+ = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
+ .addDef(UnusedCarry, RegState::Dead)
+ .add(I.getOperand(1))
+ .add(I.getOperand(2))
+ .addImm(0);
+ I.eraseFromParent();
+ return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
}
+ assert(!Sub && "illegal sub should not reach here");
+
+ const TargetRegisterClass &RC
+ = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
+ const TargetRegisterClass &HalfRC
+ = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;
+
+ MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
+ MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
+ MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
+ MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));
+
+ Register DstLo = MRI.createVirtualRegister(&HalfRC);
+ Register DstHi = MRI.createVirtualRegister(&HalfRC);
+
+ if (IsSALU) {
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
+ .add(Lo1)
+ .add(Lo2);
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
+ .add(Hi1)
+ .add(Hi2);
+ } else {
+ const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
+ Register CarryReg = MRI.createVirtualRegister(CarryRC);
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_I32_e64), DstLo)
+ .addDef(CarryReg)
+ .add(Lo1)
+ .add(Lo2)
+ .addImm(0);
+ MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
+ .addDef(MRI.createVirtualRegister(CarryRC), RegState::Dead)
+ .add(Hi1)
+ .add(Hi2)
+ .addReg(CarryReg, RegState::Kill)
+ .addImm(0);
+
+ if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
+ return false;
+ }
+
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
+ .addReg(DstLo)
+ .addImm(AMDGPU::sub0)
+ .addReg(DstHi)
+ .addImm(AMDGPU::sub1);
+
+
+ if (!RBI.constrainGenericRegister(DstReg, RC, MRI))
+ return false;
+
+ I.eraseFromParent();
+ return true;
+}
+
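
The 64-bit path above splits the addition into a low 32-bit add that produces a carry and a high add that consumes it, then reassembles the halves with REG_SEQUENCE. A scalar model of the same arithmetic:

// 64-bit add built from two 32-bit adds plus a carry, as in selectG_ADD_SUB.
#include <cassert>
#include <cstdint>

static uint64_t add64ViaHalves(uint64_t A, uint64_t B) {
  uint32_t ALo = (uint32_t)A, AHi = (uint32_t)(A >> 32);
  uint32_t BLo = (uint32_t)B, BHi = (uint32_t)(B >> 32);
  uint32_t Lo = ALo + BLo;            // S_ADD_U32 / V_ADD_I32
  uint32_t Carry = Lo < ALo;          // carry-out of the low add
  uint32_t Hi = AHi + BHi + Carry;    // S_ADDC_U32 / V_ADDC_U32
  return ((uint64_t)Hi << 32) | Lo;   // REG_SEQUENCE of the two halves
}

int main() {
  assert(add64ViaHalves(0xFFFFFFFFull, 1) == 0x100000000ull);
  assert(add64ViaHalves(0x0123456789ABCDEFull, 0xFEDCBA9876543210ull) ==
         0x0123456789ABCDEFull + 0xFEDCBA9876543210ull);
  return 0;
}
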
+bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
+ MachineBasicBlock *BB = I.getParent();
+ MachineFunction *MF = BB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ assert(I.getOperand(2).getImm() % 32 == 0);
+ unsigned SubReg = TRI.getSubRegFromChannel(I.getOperand(2).getImm() / 32);
+ const DebugLoc &DL = I.getDebugLoc();
+ MachineInstr *Copy = BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY),
+ I.getOperand(0).getReg())
+ .addReg(I.getOperand(1).getReg(), 0, SubReg);
+
+ for (const MachineOperand &MO : Copy->operands()) {
+ const TargetRegisterClass *RC =
+ TRI.getConstrainedRegClassForOperand(MO, MRI);
+ if (!RC)
+ continue;
+ RBI.constrainGenericRegister(MO.getReg(), *RC, MRI);
+ }
I.eraseFromParent();
return true;
}
+bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
+ MachineBasicBlock *BB = MI.getParent();
+ MachineFunction *MF = BB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ Register DstReg = MI.getOperand(0).getReg();
+ LLT DstTy = MRI.getType(DstReg);
+ LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
+
+ const unsigned SrcSize = SrcTy.getSizeInBits();
+ if (SrcSize < 32)
+ return false;
+
+ const DebugLoc &DL = MI.getDebugLoc();
+ const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, TRI);
+ const unsigned DstSize = DstTy.getSizeInBits();
+ const TargetRegisterClass *DstRC =
+ TRI.getRegClassForSizeOnBank(DstSize, *DstBank, MRI);
+ if (!DstRC)
+ return false;
+
+ ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
+ MachineInstrBuilder MIB =
+ BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
+ for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
+ MachineOperand &Src = MI.getOperand(I + 1);
+ MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
+ MIB.addImm(SubRegs[I]);
+
+ const TargetRegisterClass *SrcRC
+ = TRI.getConstrainedRegClassForOperand(Src, MRI);
+ if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, MRI))
+ return false;
+ }
+
+ if (!RBI.constrainGenericRegister(DstReg, *DstRC, MRI))
+ return false;
+
+ MI.eraseFromParent();
+ return true;
+}
+
+bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
+ MachineBasicBlock *BB = MI.getParent();
+ MachineFunction *MF = BB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ const int NumDst = MI.getNumOperands() - 1;
+
+ MachineOperand &Src = MI.getOperand(NumDst);
+
+ Register SrcReg = Src.getReg();
+ Register DstReg0 = MI.getOperand(0).getReg();
+ LLT DstTy = MRI.getType(DstReg0);
+ LLT SrcTy = MRI.getType(SrcReg);
+
+ const unsigned DstSize = DstTy.getSizeInBits();
+ const unsigned SrcSize = SrcTy.getSizeInBits();
+ const DebugLoc &DL = MI.getDebugLoc();
+ const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, TRI);
+
+ const TargetRegisterClass *SrcRC =
+ TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, MRI);
+ if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI))
+ return false;
+
+ const unsigned SrcFlags = getUndefRegState(Src.isUndef());
+
+ // Note we could have mixed SGPR and VGPR destination banks for an SGPR
+ // source, and this relies on the fact that the same subregister indices are
+ // used for both.
+ ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
+ for (int I = 0, E = NumDst; I != E; ++I) {
+ MachineOperand &Dst = MI.getOperand(I);
+ BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
+ .addReg(SrcReg, SrcFlags, SubRegs[I]);
+
+ const TargetRegisterClass *DstRC =
+ TRI.getConstrainedRegClassForOperand(Dst, MRI);
+ if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, MRI))
+ return false;
+ }
+
+ MI.eraseFromParent();
+ return true;
+}
+
bool AMDGPUInstructionSelector::selectG_GEP(MachineInstr &I) const {
- return selectG_ADD(I);
+ return selectG_ADD_SUB(I);
}
bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
@@ -170,47 +522,200 @@ bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
MachineFunction *MF = BB->getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
const MachineOperand &MO = I.getOperand(0);
- const TargetRegisterClass *RC =
- TRI.getConstrainedRegClassForOperand(MO, MRI);
- if (RC)
+
+ // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
+ // regbank check here is to know why getConstrainedRegClassForOperand failed.
+ const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, MRI);
+ if ((!RC && !MRI.getRegBankOrNull(MO.getReg())) ||
+ (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, MRI))) {
+ I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
+ return true;
+ }
+
+ return false;
+}
+
+bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
+ MachineBasicBlock *BB = I.getParent();
+ MachineFunction *MF = BB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ unsigned SubReg = TRI.getSubRegFromChannel(I.getOperand(3).getImm() / 32);
+ DebugLoc DL = I.getDebugLoc();
+ MachineInstr *Ins = BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG))
+ .addDef(I.getOperand(0).getReg())
+ .addReg(I.getOperand(1).getReg())
+ .addReg(I.getOperand(2).getReg())
+ .addImm(SubReg);
+
+ for (const MachineOperand &MO : Ins->operands()) {
+ if (!MO.isReg())
+ continue;
+ if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
+ continue;
+
+ const TargetRegisterClass *RC =
+ TRI.getConstrainedRegClassForOperand(MO, MRI);
+ if (!RC)
+ continue;
RBI.constrainGenericRegister(MO.getReg(), *RC, MRI);
- I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
+ }
+ I.eraseFromParent();
return true;
}
-bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I,
- CodeGenCoverage &CoverageInfo) const {
- unsigned IntrinsicID = I.getOperand(1).getIntrinsicID();
-
+bool AMDGPUInstructionSelector::selectG_INTRINSIC(
+ MachineInstr &I, CodeGenCoverage &CoverageInfo) const {
+ unsigned IntrinsicID = I.getOperand(I.getNumExplicitDefs()).getIntrinsicID();
switch (IntrinsicID) {
- default:
- break;
case Intrinsic::maxnum:
case Intrinsic::minnum:
case Intrinsic::amdgcn_cvt_pkrtz:
return selectImpl(I, CoverageInfo);
-
- case Intrinsic::amdgcn_kernarg_segment_ptr: {
- MachineFunction *MF = I.getParent()->getParent();
+ case Intrinsic::amdgcn_if_break: {
+ MachineBasicBlock *BB = I.getParent();
+ MachineFunction *MF = BB->getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
- const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
- const ArgDescriptor *InputPtrReg;
- const TargetRegisterClass *RC;
- const DebugLoc &DL = I.getDebugLoc();
-
- std::tie(InputPtrReg, RC)
- = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
- if (!InputPtrReg)
- report_fatal_error("missing kernarg segment ptr");
- BuildMI(*I.getParent(), &I, DL, TII.get(AMDGPU::COPY))
+ // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
+ // SelectionDAG uses for wave32 vs wave64.
+ BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
.add(I.getOperand(0))
- .addReg(MRI.getLiveInVirtReg(InputPtrReg->getRegister()));
+ .add(I.getOperand(2))
+ .add(I.getOperand(3));
+
+ Register DstReg = I.getOperand(0).getReg();
+ Register Src0Reg = I.getOperand(2).getReg();
+ Register Src1Reg = I.getOperand(3).getReg();
+
I.eraseFromParent();
+
+ for (Register Reg : { DstReg, Src0Reg, Src1Reg }) {
+ if (!MRI.getRegClassOrNull(Reg))
+ MRI.setRegClass(Reg, TRI.getWaveMaskRegClass());
+ }
+
return true;
}
+ default:
+ return selectImpl(I, CoverageInfo);
+ }
+}
+
+static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size) {
+ if (Size != 32 && Size != 64)
+ return -1;
+ switch (P) {
+ default:
+ llvm_unreachable("Unknown condition code!");
+ case CmpInst::ICMP_NE:
+ return Size == 32 ? AMDGPU::V_CMP_NE_U32_e64 : AMDGPU::V_CMP_NE_U64_e64;
+ case CmpInst::ICMP_EQ:
+ return Size == 32 ? AMDGPU::V_CMP_EQ_U32_e64 : AMDGPU::V_CMP_EQ_U64_e64;
+ case CmpInst::ICMP_SGT:
+ return Size == 32 ? AMDGPU::V_CMP_GT_I32_e64 : AMDGPU::V_CMP_GT_I64_e64;
+ case CmpInst::ICMP_SGE:
+ return Size == 32 ? AMDGPU::V_CMP_GE_I32_e64 : AMDGPU::V_CMP_GE_I64_e64;
+ case CmpInst::ICMP_SLT:
+ return Size == 32 ? AMDGPU::V_CMP_LT_I32_e64 : AMDGPU::V_CMP_LT_I64_e64;
+ case CmpInst::ICMP_SLE:
+ return Size == 32 ? AMDGPU::V_CMP_LE_I32_e64 : AMDGPU::V_CMP_LE_I64_e64;
+ case CmpInst::ICMP_UGT:
+ return Size == 32 ? AMDGPU::V_CMP_GT_U32_e64 : AMDGPU::V_CMP_GT_U64_e64;
+ case CmpInst::ICMP_UGE:
+ return Size == 32 ? AMDGPU::V_CMP_GE_U32_e64 : AMDGPU::V_CMP_GE_U64_e64;
+ case CmpInst::ICMP_ULT:
+ return Size == 32 ? AMDGPU::V_CMP_LT_U32_e64 : AMDGPU::V_CMP_LT_U64_e64;
+ case CmpInst::ICMP_ULE:
+ return Size == 32 ? AMDGPU::V_CMP_LE_U32_e64 : AMDGPU::V_CMP_LE_U64_e64;
}
- return false;
+}
+
+int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
+ unsigned Size) const {
+ if (Size == 64) {
+ if (!STI.hasScalarCompareEq64())
+ return -1;
+
+ switch (P) {
+ case CmpInst::ICMP_NE:
+ return AMDGPU::S_CMP_LG_U64;
+ case CmpInst::ICMP_EQ:
+ return AMDGPU::S_CMP_EQ_U64;
+ default:
+ return -1;
+ }
+ }
+
+ if (Size != 32)
+ return -1;
+
+ switch (P) {
+ case CmpInst::ICMP_NE:
+ return AMDGPU::S_CMP_LG_U32;
+ case CmpInst::ICMP_EQ:
+ return AMDGPU::S_CMP_EQ_U32;
+ case CmpInst::ICMP_SGT:
+ return AMDGPU::S_CMP_GT_I32;
+ case CmpInst::ICMP_SGE:
+ return AMDGPU::S_CMP_GE_I32;
+ case CmpInst::ICMP_SLT:
+ return AMDGPU::S_CMP_LT_I32;
+ case CmpInst::ICMP_SLE:
+ return AMDGPU::S_CMP_LE_I32;
+ case CmpInst::ICMP_UGT:
+ return AMDGPU::S_CMP_GT_U32;
+ case CmpInst::ICMP_UGE:
+ return AMDGPU::S_CMP_GE_U32;
+ case CmpInst::ICMP_ULT:
+ return AMDGPU::S_CMP_LT_U32;
+ case CmpInst::ICMP_ULE:
+ return AMDGPU::S_CMP_LE_U32;
+ default:
+ llvm_unreachable("Unknown condition code!");
+ }
+}
+
+bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const {
+ MachineBasicBlock *BB = I.getParent();
+ MachineFunction *MF = BB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ const DebugLoc &DL = I.getDebugLoc();
+
+ unsigned SrcReg = I.getOperand(2).getReg();
+ unsigned Size = RBI.getSizeInBits(SrcReg, MRI, TRI);
+
+ auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
+
+ unsigned CCReg = I.getOperand(0).getReg();
+ if (isSCC(CCReg, MRI)) {
+ int Opcode = getS_CMPOpcode(Pred, Size);
+ if (Opcode == -1)
+ return false;
+ MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
+ .add(I.getOperand(2))
+ .add(I.getOperand(3));
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
+ .addReg(AMDGPU::SCC);
+ bool Ret =
+ constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
+ RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, MRI);
+ I.eraseFromParent();
+ return Ret;
+ }
+
+ int Opcode = getV_CMPOpcode(Pred, Size);
+ if (Opcode == -1)
+ return false;
+
+ MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode),
+ I.getOperand(0).getReg())
+ .add(I.getOperand(2))
+ .add(I.getOperand(3));
+ RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
+ *TRI.getBoolRC(), MRI);
+ bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
+ I.eraseFromParent();
+ return Ret;
}
static MachineInstr *
@@ -232,8 +737,7 @@ buildEXP(const TargetInstrInfo &TII, MachineInstr *Insert, unsigned Tgt,
}
bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
- MachineInstr &I,
- CodeGenCoverage &CoverageInfo) const {
+ MachineInstr &I, CodeGenCoverage &CoverageInfo) const {
MachineBasicBlock *BB = I.getParent();
MachineFunction *MF = BB->getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
@@ -272,8 +776,72 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
I.eraseFromParent();
return constrainSelectedInstRegOperands(*Exp, TII, TRI, RBI);
}
+ case Intrinsic::amdgcn_end_cf: {
+ // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
+ // SelectionDAG uses for wave32 vs wave64.
+ BuildMI(*BB, &I, I.getDebugLoc(),
+ TII.get(AMDGPU::SI_END_CF))
+ .add(I.getOperand(1));
+
+ Register Reg = I.getOperand(1).getReg();
+ I.eraseFromParent();
+
+ if (!MRI.getRegClassOrNull(Reg))
+ MRI.setRegClass(Reg, TRI.getWaveMaskRegClass());
+ return true;
}
- return false;
+ default:
+ return selectImpl(I, CoverageInfo);
+ }
+}
+
+bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
+ MachineBasicBlock *BB = I.getParent();
+ MachineFunction *MF = BB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ const DebugLoc &DL = I.getDebugLoc();
+
+ unsigned DstReg = I.getOperand(0).getReg();
+ unsigned Size = RBI.getSizeInBits(DstReg, MRI, TRI);
+ assert(Size <= 32 || Size == 64);
+ const MachineOperand &CCOp = I.getOperand(1);
+ unsigned CCReg = CCOp.getReg();
+ if (isSCC(CCReg, MRI)) {
+ unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
+ AMDGPU::S_CSELECT_B32;
+ MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
+ .addReg(CCReg);
+
+ // The generic constrainSelectedInstRegOperands doesn't work for the scc register
+ // bank, because it does not cover the register class that we use to
+ // represent it. So we need to set the register class manually here.
+ if (!MRI.getRegClassOrNull(CCReg))
+ MRI.setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, MRI));
+ MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
+ .add(I.getOperand(2))
+ .add(I.getOperand(3));
+
+ bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI) |
+ constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
+ I.eraseFromParent();
+ return Ret;
+ }
+
+ // Wide VGPR select should have been split in RegBankSelect.
+ if (Size > 32)
+ return false;
+
+ MachineInstr *Select =
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+ .addImm(0)
+ .add(I.getOperand(3))
+ .addImm(0)
+ .add(I.getOperand(2))
+ .add(I.getOperand(1));
+
+ bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
+ I.eraseFromParent();
+ return Ret;
}
bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const {
@@ -281,10 +849,16 @@ bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const {
MachineFunction *MF = BB->getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
DebugLoc DL = I.getDebugLoc();
+ unsigned PtrSize = RBI.getSizeInBits(I.getOperand(1).getReg(), MRI, TRI);
+ if (PtrSize != 64) {
+ LLVM_DEBUG(dbgs() << "Unhandled address space\n");
+ return false;
+ }
+
unsigned StoreSize = RBI.getSizeInBits(I.getOperand(0).getReg(), MRI, TRI);
unsigned Opcode;
- // FIXME: Select store instruction based on address space
+ // FIXME: Remove this when integers > s32 are selected naturally.
switch (StoreSize) {
default:
return false;
@@ -307,7 +881,8 @@ bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const {
.add(I.getOperand(0))
.addImm(0) // offset
.addImm(0) // glc
- .addImm(0); // slc
+ .addImm(0) // slc
+ .addImm(0); // dlc
// Now that we selected an opcode, we need to constrain the register
@@ -318,6 +893,218 @@ bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const {
return Ret;
}
+static int sizeToSubRegIndex(unsigned Size) {
+ switch (Size) {
+ case 32:
+ return AMDGPU::sub0;
+ case 64:
+ return AMDGPU::sub0_sub1;
+ case 96:
+ return AMDGPU::sub0_sub1_sub2;
+ case 128:
+ return AMDGPU::sub0_sub1_sub2_sub3;
+ case 256:
+ return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7;
+ default:
+ if (Size < 32)
+ return AMDGPU::sub0;
+ if (Size > 256)
+ return -1;
+ return sizeToSubRegIndex(PowerOf2Ceil(Size));
+ }
+}
+
+bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
+ MachineBasicBlock *BB = I.getParent();
+ MachineFunction *MF = BB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+
+ unsigned DstReg = I.getOperand(0).getReg();
+ unsigned SrcReg = I.getOperand(1).getReg();
+ const LLT DstTy = MRI.getType(DstReg);
+ const LLT SrcTy = MRI.getType(SrcReg);
+ if (!DstTy.isScalar())
+ return false;
+
+ const RegisterBank *DstRB = RBI.getRegBank(DstReg, MRI, TRI);
+ const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, MRI, TRI);
+ if (SrcRB != DstRB)
+ return false;
+
+ unsigned DstSize = DstTy.getSizeInBits();
+ unsigned SrcSize = SrcTy.getSizeInBits();
+
+ const TargetRegisterClass *SrcRC
+ = TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, MRI);
+ const TargetRegisterClass *DstRC
+ = TRI.getRegClassForSizeOnBank(DstSize, *DstRB, MRI);
+
+ if (SrcSize > 32) {
+ int SubRegIdx = sizeToSubRegIndex(DstSize);
+ if (SubRegIdx == -1)
+ return false;
+
+ // Deal with weird cases where the class only partially supports the subreg
+ // index.
+ SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
+ if (!SrcRC)
+ return false;
+
+ I.getOperand(1).setSubReg(SubRegIdx);
+ }
+
+ if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
+ !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
+ LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
+ return false;
+ }
+
+ I.setDesc(TII.get(TargetOpcode::COPY));
+ return true;
+}
+
+/// \returns true if a bitmask for \p Size bits will be an inline immediate.
+static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
+ Mask = maskTrailingOnes<unsigned>(Size);
+ int SignedMask = static_cast<int>(Mask);
+ return SignedMask >= -16 && SignedMask <= 64;
+}
+
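
shouldUseAndMask above prefers a single AND when the trailing-ones mask for the source width is encodable as an inline immediate, i.e. its signed 32-bit value lies in [-16, 64]. A standalone restatement (maskTrailingOnes is modeled by hand):

// Which zero-extension widths can use V_AND_B32 with an inline immediate.
#include <cassert>
#include <cstdint>

static bool maskIsInlineImm(unsigned Size, uint32_t &Mask) {
  Mask = Size >= 32 ? ~0u : ((1u << Size) - 1);  // maskTrailingOnes
  int32_t Signed = (int32_t)Mask;
  return Signed >= -16 && Signed <= 64;
}

int main() {
  uint32_t M;
  assert(maskIsInlineImm(1, M) && M == 1);      // zext from i1: and 1
  assert(maskIsInlineImm(6, M) && M == 63);     // 63 fits the inline range
  assert(!maskIsInlineImm(8, M));               // 255 does not; use BFE instead
  assert(maskIsInlineImm(32, M) && M == ~0u);   // -1 is an inline immediate
  return 0;
}
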
+bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
+ bool Signed = I.getOpcode() == AMDGPU::G_SEXT;
+ const DebugLoc &DL = I.getDebugLoc();
+ MachineBasicBlock &MBB = *I.getParent();
+ MachineFunction &MF = *MBB.getParent();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const unsigned DstReg = I.getOperand(0).getReg();
+ const unsigned SrcReg = I.getOperand(1).getReg();
+
+ const LLT DstTy = MRI.getType(DstReg);
+ const LLT SrcTy = MRI.getType(SrcReg);
+ const LLT S1 = LLT::scalar(1);
+ const unsigned SrcSize = SrcTy.getSizeInBits();
+ const unsigned DstSize = DstTy.getSizeInBits();
+ if (!DstTy.isScalar())
+ return false;
+
+ const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, TRI);
+
+ if (SrcBank->getID() == AMDGPU::SCCRegBankID) {
+ if (SrcTy != S1 || DstSize > 64) // Invalid
+ return false;
+
+ unsigned Opcode =
+ DstSize > 32 ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
+ const TargetRegisterClass *DstRC =
+ DstSize > 32 ? &AMDGPU::SReg_64RegClass : &AMDGPU::SReg_32RegClass;
+
+ // FIXME: Create an extra copy to avoid incorrectly constraining the result
+ // of the scc producer.
+ unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ BuildMI(MBB, I, DL, TII.get(AMDGPU::COPY), TmpReg)
+ .addReg(SrcReg);
+ BuildMI(MBB, I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
+ .addReg(TmpReg);
+
+ // The instruction operands are backwards from what you would expect.
+ BuildMI(MBB, I, DL, TII.get(Opcode), DstReg)
+ .addImm(0)
+ .addImm(Signed ? -1 : 1);
+ return RBI.constrainGenericRegister(DstReg, *DstRC, MRI);
+ }
+
+ if (SrcBank->getID() == AMDGPU::VCCRegBankID && DstSize <= 32) {
+ if (SrcTy != S1) // Invalid
+ return false;
+
+ MachineInstr *ExtI =
+ BuildMI(MBB, I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+ .addImm(0) // src0_modifiers
+ .addImm(0) // src0
+ .addImm(0) // src1_modifiers
+ .addImm(Signed ? -1 : 1) // src1
+ .addUse(SrcReg);
+ return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
+ }
+
+ if (I.getOpcode() == AMDGPU::G_ANYEXT)
+ return selectCOPY(I);
+
+ if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
+ // 64-bit should have been split up in RegBankSelect
+
+ // Try to use an and with a mask if it will save code size.
+ unsigned Mask;
+ if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
+ MachineInstr *ExtI =
+ BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
+ .addImm(Mask)
+ .addReg(SrcReg);
+ return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
+ }
+
+ const unsigned BFE = Signed ? AMDGPU::V_BFE_I32 : AMDGPU::V_BFE_U32;
+ MachineInstr *ExtI =
+ BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
+ .addReg(SrcReg)
+ .addImm(0) // Offset
+ .addImm(SrcSize); // Width
+ return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
+ }
+
+ if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
+ if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI))
+ return false;
+
+ if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
+ const unsigned SextOpc = SrcSize == 8 ?
+ AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
+ BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
+ .addReg(SrcReg);
+ return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, MRI);
+ }
+
+ const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
+ const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
+
+ // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width.
+ if (DstSize > 32 && SrcSize <= 32) {
+ // We need a 64-bit register source, but the high bits don't matter.
+ unsigned ExtReg
+ = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned UndefReg
+ = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
+ BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
+ .addReg(SrcReg)
+ .addImm(AMDGPU::sub0)
+ .addReg(UndefReg)
+ .addImm(AMDGPU::sub1);
+
+ BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
+ .addReg(ExtReg)
+ .addImm(SrcSize << 16);
+
+ return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, MRI);
+ }
+
+ unsigned Mask;
+ if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
+ BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
+ .addReg(SrcReg)
+ .addImm(Mask);
+ } else {
+ BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
+ .addReg(SrcReg)
+ .addImm(SrcSize << 16);
+ }
+
+ return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, MRI);
+ }
+
+ return false;
+}
+
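// Illustrative, standalone arithmetic for the scalar BFE immediate built with
// "SrcSize << 16" above: the offset occupies bits [5:0] and the width bits
// [22:16]. The helper name is hypothetical.
static unsigned encodeScalarBFEImm(unsigned Offset, unsigned Width) {
  // Offset is a 6-bit field, width a 7-bit field.
  return ((Width & 0x7f) << 16) | (Offset & 0x3f);
}
// The extensions above always extract from bit 0, so the immediate is simply
// encodeScalarBFEImm(0, SrcSize) == SrcSize << 16.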
bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
MachineBasicBlock *BB = I.getParent();
MachineFunction *MF = BB->getParent();
@@ -423,7 +1210,7 @@ void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
getAddrModeInfo(*PtrMI, MRI, AddrInfo);
}
-static bool isInstrUniform(const MachineInstr &MI) {
+bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
if (!MI.hasOneMemOperand())
return false;
@@ -445,52 +1232,6 @@ static bool isInstrUniform(const MachineInstr &MI) {
return I && I->getMetadata("amdgpu.uniform");
}
-static unsigned getSmrdOpcode(unsigned BaseOpcode, unsigned LoadSize) {
-
- if (LoadSize == 32)
- return BaseOpcode;
-
- switch (BaseOpcode) {
- case AMDGPU::S_LOAD_DWORD_IMM:
- switch (LoadSize) {
- case 64:
- return AMDGPU::S_LOAD_DWORDX2_IMM;
- case 128:
- return AMDGPU::S_LOAD_DWORDX4_IMM;
- case 256:
- return AMDGPU::S_LOAD_DWORDX8_IMM;
- case 512:
- return AMDGPU::S_LOAD_DWORDX16_IMM;
- }
- break;
- case AMDGPU::S_LOAD_DWORD_IMM_ci:
- switch (LoadSize) {
- case 64:
- return AMDGPU::S_LOAD_DWORDX2_IMM_ci;
- case 128:
- return AMDGPU::S_LOAD_DWORDX4_IMM_ci;
- case 256:
- return AMDGPU::S_LOAD_DWORDX8_IMM_ci;
- case 512:
- return AMDGPU::S_LOAD_DWORDX16_IMM_ci;
- }
- break;
- case AMDGPU::S_LOAD_DWORD_SGPR:
- switch (LoadSize) {
- case 64:
- return AMDGPU::S_LOAD_DWORDX2_SGPR;
- case 128:
- return AMDGPU::S_LOAD_DWORDX4_SGPR;
- case 256:
- return AMDGPU::S_LOAD_DWORDX8_SGPR;
- case 512:
- return AMDGPU::S_LOAD_DWORDX16_SGPR;
- }
- break;
- }
- llvm_unreachable("Invalid base smrd opcode or size");
-}
-
bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
for (const GEPInfo &GEPInfo : AddrInfo) {
if (!GEPInfo.VgprParts.empty())
@@ -499,125 +1240,77 @@ bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
return false;
}
-bool AMDGPUInstructionSelector::selectSMRD(MachineInstr &I,
- ArrayRef<GEPInfo> AddrInfo) const {
-
- if (!I.hasOneMemOperand())
- return false;
-
- if ((*I.memoperands_begin())->getAddrSpace() != AMDGPUAS::CONSTANT_ADDRESS &&
- (*I.memoperands_begin())->getAddrSpace() != AMDGPUAS::CONSTANT_ADDRESS_32BIT)
- return false;
-
- if (!isInstrUniform(I))
- return false;
-
- if (hasVgprParts(AddrInfo))
- return false;
+bool AMDGPUInstructionSelector::selectG_LOAD(MachineInstr &I) const {
+ // TODO: Can/should we insert m0 initialization here for DS instructions and
+ // call the normal selector?
+ return false;
+}
+bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
MachineBasicBlock *BB = I.getParent();
MachineFunction *MF = BB->getParent();
- const GCNSubtarget &Subtarget = MF->getSubtarget<GCNSubtarget>();
MachineRegisterInfo &MRI = MF->getRegInfo();
- unsigned DstReg = I.getOperand(0).getReg();
+ MachineOperand &CondOp = I.getOperand(0);
+ Register CondReg = CondOp.getReg();
const DebugLoc &DL = I.getDebugLoc();
- unsigned Opcode;
- unsigned LoadSize = RBI.getSizeInBits(DstReg, MRI, TRI);
-
- if (!AddrInfo.empty() && AddrInfo[0].SgprParts.size() == 1) {
-
- const GEPInfo &GEPInfo = AddrInfo[0];
-
- unsigned PtrReg = GEPInfo.SgprParts[0];
- int64_t EncodedImm = AMDGPU::getSMRDEncodedOffset(Subtarget, GEPInfo.Imm);
- if (AMDGPU::isLegalSMRDImmOffset(Subtarget, GEPInfo.Imm)) {
- Opcode = getSmrdOpcode(AMDGPU::S_LOAD_DWORD_IMM, LoadSize);
- MachineInstr *SMRD = BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg)
- .addReg(PtrReg)
- .addImm(EncodedImm)
- .addImm(0); // glc
- return constrainSelectedInstRegOperands(*SMRD, TII, TRI, RBI);
- }
+ unsigned BrOpcode;
+ Register CondPhysReg;
+ const TargetRegisterClass *ConstrainRC;
+
+ // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
+ // whether the branch is uniform when selecting the instruction. In
+ // GlobalISel, we should push that decision into RegBankSelect. Assume for now
+ // RegBankSelect knows what it's doing if the branch condition is scc, even
+ // though it currently does not.
+ if (isSCC(CondReg, MRI)) {
+ CondPhysReg = AMDGPU::SCC;
+ BrOpcode = AMDGPU::S_CBRANCH_SCC1;
+ ConstrainRC = &AMDGPU::SReg_32_XM0RegClass;
+ } else if (isVCC(CondReg, MRI)) {
+ // FIXME: Do we have to insert an and with exec here, like in SelectionDAG?
+ // We sort of know, based on the register bank, that a VCC producer ands
+ // inactive lanes with 0. What if there were a logical operation combining
+ // vcc producers from different blocks/with different exec masks?
+ // FIXME: Should scc->vcc copies and with exec?
+ CondPhysReg = TRI.getVCC();
+ BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
+ ConstrainRC = TRI.getBoolRC();
+ } else
+ return false;
- if (Subtarget.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS &&
- isUInt<32>(EncodedImm)) {
- Opcode = getSmrdOpcode(AMDGPU::S_LOAD_DWORD_IMM_ci, LoadSize);
- MachineInstr *SMRD = BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg)
- .addReg(PtrReg)
- .addImm(EncodedImm)
- .addImm(0); // glc
- return constrainSelectedInstRegOperands(*SMRD, TII, TRI, RBI);
- }
+ if (!MRI.getRegClassOrNull(CondReg))
+ MRI.setRegClass(CondReg, ConstrainRC);
- if (isUInt<32>(GEPInfo.Imm)) {
- Opcode = getSmrdOpcode(AMDGPU::S_LOAD_DWORD_SGPR, LoadSize);
- unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
- BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), OffsetReg)
- .addImm(GEPInfo.Imm);
-
- MachineInstr *SMRD = BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg)
- .addReg(PtrReg)
- .addReg(OffsetReg)
- .addImm(0); // glc
- return constrainSelectedInstRegOperands(*SMRD, TII, TRI, RBI);
- }
- }
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
+ .addReg(CondReg);
+ BuildMI(*BB, &I, DL, TII.get(BrOpcode))
+ .addMBB(I.getOperand(1).getMBB());
- unsigned PtrReg = I.getOperand(1).getReg();
- Opcode = getSmrdOpcode(AMDGPU::S_LOAD_DWORD_IMM, LoadSize);
- MachineInstr *SMRD = BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg)
- .addReg(PtrReg)
- .addImm(0)
- .addImm(0); // glc
- return constrainSelectedInstRegOperands(*SMRD, TII, TRI, RBI);
+ I.eraseFromParent();
+ return true;
}
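// A toy sketch of the condition-bank dispatch implemented above; the strings
// stand in for the real physical registers, opcodes and register classes
// picked from TRI and the opcode tables.
enum class CondBank { SCC, VCC, Other };
struct BranchLowering { const char *CondPhysReg; const char *BrOpcode; };

static bool pickBranchLowering(CondBank Bank, BranchLowering &Out) {
  switch (Bank) {
  case CondBank::SCC: // uniform branch: copy cond into SCC, S_CBRANCH_SCC1
    Out = {"SCC", "S_CBRANCH_SCC1"};
    return true;
  case CondBank::VCC: // divergent branch: copy cond into VCC, S_CBRANCH_VCCNZ
    Out = {"VCC", "S_CBRANCH_VCCNZ"};
    return true;
  default:            // anything else: bail out, as selectG_BRCOND does
    return false;
  }
}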
-
-bool AMDGPUInstructionSelector::selectG_LOAD(MachineInstr &I) const {
+bool AMDGPUInstructionSelector::selectG_FRAME_INDEX(MachineInstr &I) const {
MachineBasicBlock *BB = I.getParent();
MachineFunction *MF = BB->getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
- DebugLoc DL = I.getDebugLoc();
- unsigned DstReg = I.getOperand(0).getReg();
- unsigned PtrReg = I.getOperand(1).getReg();
- unsigned LoadSize = RBI.getSizeInBits(DstReg, MRI, TRI);
- unsigned Opcode;
-
- SmallVector<GEPInfo, 4> AddrInfo;
-
- getAddrModeInfo(I, MRI, AddrInfo);
-
- if (selectSMRD(I, AddrInfo)) {
- I.eraseFromParent();
- return true;
- }
- switch (LoadSize) {
- default:
- llvm_unreachable("Load size not supported\n");
- case 32:
- Opcode = AMDGPU::FLAT_LOAD_DWORD;
- break;
- case 64:
- Opcode = AMDGPU::FLAT_LOAD_DWORDX2;
- break;
- }
+ Register DstReg = I.getOperand(0).getReg();
+ const RegisterBank *DstRB = RBI.getRegBank(DstReg, MRI, TRI);
+ const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
+ I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
+ if (IsVGPR)
+ I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
- MachineInstr *Flat = BuildMI(*BB, &I, DL, TII.get(Opcode))
- .add(I.getOperand(0))
- .addReg(PtrReg)
- .addImm(0) // offset
- .addImm(0) // glc
- .addImm(0); // slc
-
- bool Ret = constrainSelectedInstRegOperands(*Flat, TII, TRI, RBI);
- I.eraseFromParent();
- return Ret;
+ return RBI.constrainGenericRegister(
+ DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, MRI);
}
bool AMDGPUInstructionSelector::select(MachineInstr &I,
CodeGenCoverage &CoverageInfo) const {
+ if (I.isPHI())
+ return selectPHI(I);
if (!isPreISelGenericOpcode(I.getOpcode())) {
if (I.isCopy())
@@ -626,28 +1319,75 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I,
}
switch (I.getOpcode()) {
- default:
+ case TargetOpcode::G_AND:
+ case TargetOpcode::G_OR:
+ case TargetOpcode::G_XOR:
+ if (selectG_AND_OR_XOR(I))
+ return true;
return selectImpl(I, CoverageInfo);
case TargetOpcode::G_ADD:
- return selectG_ADD(I);
+ case TargetOpcode::G_SUB:
+ if (selectG_ADD_SUB(I))
+ return true;
+ LLVM_FALLTHROUGH;
+ default:
+ return selectImpl(I, CoverageInfo);
case TargetOpcode::G_INTTOPTR:
case TargetOpcode::G_BITCAST:
return selectCOPY(I);
case TargetOpcode::G_CONSTANT:
case TargetOpcode::G_FCONSTANT:
return selectG_CONSTANT(I);
+ case TargetOpcode::G_EXTRACT:
+ return selectG_EXTRACT(I);
+ case TargetOpcode::G_MERGE_VALUES:
+ case TargetOpcode::G_BUILD_VECTOR:
+ case TargetOpcode::G_CONCAT_VECTORS:
+ return selectG_MERGE_VALUES(I);
+ case TargetOpcode::G_UNMERGE_VALUES:
+ return selectG_UNMERGE_VALUES(I);
case TargetOpcode::G_GEP:
return selectG_GEP(I);
case TargetOpcode::G_IMPLICIT_DEF:
return selectG_IMPLICIT_DEF(I);
+ case TargetOpcode::G_INSERT:
+ return selectG_INSERT(I);
case TargetOpcode::G_INTRINSIC:
return selectG_INTRINSIC(I, CoverageInfo);
case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
return selectG_INTRINSIC_W_SIDE_EFFECTS(I, CoverageInfo);
+ case TargetOpcode::G_ICMP:
+ if (selectG_ICMP(I))
+ return true;
+ return selectImpl(I, CoverageInfo);
case TargetOpcode::G_LOAD:
- return selectG_LOAD(I);
+ return selectImpl(I, CoverageInfo);
+ case TargetOpcode::G_SELECT:
+ return selectG_SELECT(I);
case TargetOpcode::G_STORE:
+ if (selectImpl(I, CoverageInfo))
+ return true;
return selectG_STORE(I);
+ case TargetOpcode::G_TRUNC:
+ return selectG_TRUNC(I);
+ case TargetOpcode::G_SEXT:
+ case TargetOpcode::G_ZEXT:
+ case TargetOpcode::G_ANYEXT:
+ if (selectG_SZA_EXT(I)) {
+ I.eraseFromParent();
+ return true;
+ }
+
+ return false;
+ case TargetOpcode::G_BRCOND:
+ return selectG_BRCOND(I);
+ case TargetOpcode::G_FRAME_INDEX:
+ return selectG_FRAME_INDEX(I);
+ case TargetOpcode::G_FENCE:
+ // FIXME: Tablegen importer doesn't handle the imm operands correctly, and
+ // is checking for G_CONSTANT
+ I.setDesc(TII.get(AMDGPU::ATOMIC_FENCE));
+ return true;
}
return false;
}
@@ -660,6 +1400,26 @@ AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
}
+std::pair<Register, unsigned>
+AMDGPUInstructionSelector::selectVOP3ModsImpl(
+ Register Src, const MachineRegisterInfo &MRI) const {
+ unsigned Mods = 0;
+ MachineInstr *MI = MRI.getVRegDef(Src);
+
+ if (MI && MI->getOpcode() == AMDGPU::G_FNEG) {
+ Src = MI->getOperand(1).getReg();
+ Mods |= SISrcMods::NEG;
+ MI = MRI.getVRegDef(Src);
+ }
+
+ if (MI && MI->getOpcode() == AMDGPU::G_FABS) {
+ Src = MI->getOperand(1).getReg();
+ Mods |= SISrcMods::ABS;
+ }
+
+ return std::make_pair(Src, Mods);
+}
+
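// A small self-contained sketch of the modifier fold above, on a toy def
// chain: peel at most one fneg (NEG bit) and then one fabs (ABS bit). The
// ToyDef type and the NEG/ABS bit values are stand-ins, not the real
// SISrcMods encoding.
#include <utility>
#include <vector>

enum class ToyOp { FNeg, FAbs, Other };
struct ToyDef { ToyOp Op; int Src; }; // Src: index of this value's operand def

static std::pair<int, unsigned> foldSrcMods(const std::vector<ToyDef> &Defs,
                                            int Idx) {
  const unsigned NEG = 1, ABS = 2; // assumed bit values
  unsigned Mods = 0;
  if (Defs[Idx].Op == ToyOp::FNeg) { Mods |= NEG; Idx = Defs[Idx].Src; }
  if (Defs[Idx].Op == ToyOp::FAbs) { Mods |= ABS; Idx = Defs[Idx].Src; }
  return {Idx, Mods}; // the underlying source plus the src_mods immediate
}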
///
/// This will select either an SGPR or VGPR operand and will save us from
/// having to write an extra tablegen pattern.
@@ -672,11 +1432,18 @@ AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
+ MachineRegisterInfo &MRI
+ = Root.getParent()->getParent()->getParent()->getRegInfo();
+
+ Register Src;
+ unsigned Mods;
+ std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(), MRI);
+
return {{
- [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
- [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // src0_mods
- [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
- [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
}};
}
InstructionSelector::ComplexRendererFns
@@ -690,8 +1457,274 @@ AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
+ MachineRegisterInfo &MRI
+ = Root.getParent()->getParent()->getParent()->getRegInfo();
+
+ Register Src;
+ unsigned Mods;
+ std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(), MRI);
+
return {{
- [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
- [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // src_mods
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
+ }};
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
+ MachineRegisterInfo &MRI =
+ Root.getParent()->getParent()->getParent()->getRegInfo();
+
+ SmallVector<GEPInfo, 4> AddrInfo;
+ getAddrModeInfo(*Root.getParent(), MRI, AddrInfo);
+
+ if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
+ return None;
+
+ const GEPInfo &GEPInfo = AddrInfo[0];
+
+ if (!AMDGPU::isLegalSMRDImmOffset(STI, GEPInfo.Imm))
+ return None;
+
+ unsigned PtrReg = GEPInfo.SgprParts[0];
+ int64_t EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm);
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(EncodedImm); }
+ }};
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
+ MachineRegisterInfo &MRI =
+ Root.getParent()->getParent()->getParent()->getRegInfo();
+
+ SmallVector<GEPInfo, 4> AddrInfo;
+ getAddrModeInfo(*Root.getParent(), MRI, AddrInfo);
+
+ if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
+ return None;
+
+ const GEPInfo &GEPInfo = AddrInfo[0];
+ unsigned PtrReg = GEPInfo.SgprParts[0];
+ int64_t EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm);
+ if (!isUInt<32>(EncodedImm))
+ return None;
+
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(EncodedImm); }
+ }};
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
+ MachineInstr *MI = Root.getParent();
+ MachineBasicBlock *MBB = MI->getParent();
+ MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+
+ SmallVector<GEPInfo, 4> AddrInfo;
+ getAddrModeInfo(*MI, MRI, AddrInfo);
+
+ // FIXME: We should shrink the GEP if the offset is known to be <= 32 bits,
+ // then we can select all ptr + 32-bit offsets, not just immediate offsets.
+ if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
+ return None;
+
+ const GEPInfo &GEPInfo = AddrInfo[0];
+ if (!GEPInfo.Imm || !isUInt<32>(GEPInfo.Imm))
+ return None;
+
+ // If we make it this far we have a load with a 32-bit immediate offset.
+ // It is OK to select this using an SGPR offset, because we have already
+ // failed trying to select this load into one of the _IMM variants since
+ // the _IMM Patterns are considered before the _SGPR patterns.
+ unsigned PtrReg = GEPInfo.SgprParts[0];
+ unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg)
+ .addImm(GEPInfo.Imm);
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); }
+ }};
+}
+
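// Sketch of the ordering the comment above depends on, roughly mirroring the
// deleted selectSMRD logic: the *_IMM complex pattern is tried first, then the
// CI 32-bit immediate form, and *_SGPR only fires once both immediate forms
// have been rejected. Names here are illustrative.
enum class SmrdForm { Imm, Imm32, Sgpr, None };

static SmrdForm pickSmrdForm(bool LegalEncodedImm, bool IsCISubtarget,
                             bool OffsetFitsUInt32) {
  if (LegalEncodedImm)
    return SmrdForm::Imm;                 // S_LOAD_*_IMM
  if (IsCISubtarget && OffsetFitsUInt32)
    return SmrdForm::Imm32;               // S_LOAD_*_IMM_ci
  if (OffsetFitsUInt32)
    return SmrdForm::Sgpr;                // offset materialized via S_MOV_B32
  return SmrdForm::None;
}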
+template <bool Signed>
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const {
+ MachineInstr *MI = Root.getParent();
+ MachineBasicBlock *MBB = MI->getParent();
+ MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+
+ InstructionSelector::ComplexRendererFns Default = {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // offset
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // slc
+ }};
+
+ if (!STI.hasFlatInstOffsets())
+ return Default;
+
+ const MachineInstr *OpDef = MRI.getVRegDef(Root.getReg());
+ if (!OpDef || OpDef->getOpcode() != AMDGPU::G_GEP)
+ return Default;
+
+ Optional<int64_t> Offset =
+ getConstantVRegVal(OpDef->getOperand(2).getReg(), MRI);
+ if (!Offset.hasValue())
+ return Default;
+
+ unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
+ if (!TII.isLegalFLATOffset(Offset.getValue(), AddrSpace, Signed))
+ return Default;
+
+ Register BasePtr = OpDef->getOperand(1).getReg();
+
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(BasePtr); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset.getValue()); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // slc
+ }};
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
+ return selectFlatOffsetImpl<false>(Root);
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectFlatOffsetSigned(MachineOperand &Root) const {
+ return selectFlatOffsetImpl<true>(Root);
+}
+
+// FIXME: Implement
+static bool signBitIsZero(const MachineOperand &Op,
+ const MachineRegisterInfo &MRI) {
+ return false;
+}
+
+static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) {
+ auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>();
+ return PSV && PSV->isStack();
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
+ MachineInstr *MI = Root.getParent();
+ MachineBasicBlock *MBB = MI->getParent();
+ MachineFunction *MF = MBB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
+
+ int64_t Offset = 0;
+ if (mi_match(Root.getReg(), MRI, m_ICst(Offset))) {
+ Register HighBits = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+ // TODO: Should this be inside the render function? The iterator seems to
+ // move.
+ BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
+ HighBits)
+ .addImm(Offset & ~4095);
+
+ return {{[=](MachineInstrBuilder &MIB) { // rsrc
+ MIB.addReg(Info->getScratchRSrcReg());
+ },
+ [=](MachineInstrBuilder &MIB) { // vaddr
+ MIB.addReg(HighBits);
+ },
+ [=](MachineInstrBuilder &MIB) { // soffset
+ const MachineMemOperand *MMO = *MI->memoperands_begin();
+ const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();
+
+ Register SOffsetReg = isStackPtrRelative(PtrInfo)
+ ? Info->getStackPtrOffsetReg()
+ : Info->getScratchWaveOffsetReg();
+ MIB.addReg(SOffsetReg);
+ },
+ [=](MachineInstrBuilder &MIB) { // offset
+ MIB.addImm(Offset & 4095);
+ }}};
+ }
+
+ assert(Offset == 0);
+
+ // Try to fold a frame index directly into the MUBUF vaddr field, and any
+ // offsets.
+ Optional<int> FI;
+ Register VAddr = Root.getReg();
+ if (const MachineInstr *RootDef = MRI.getVRegDef(Root.getReg())) {
+ if (isBaseWithConstantOffset(Root, MRI)) {
+ const MachineOperand &LHS = RootDef->getOperand(1);
+ const MachineOperand &RHS = RootDef->getOperand(2);
+ const MachineInstr *LHSDef = MRI.getVRegDef(LHS.getReg());
+ const MachineInstr *RHSDef = MRI.getVRegDef(RHS.getReg());
+ if (LHSDef && RHSDef) {
+ int64_t PossibleOffset =
+ RHSDef->getOperand(1).getCImm()->getSExtValue();
+ if (SIInstrInfo::isLegalMUBUFImmOffset(PossibleOffset) &&
+ (!STI.privateMemoryResourceIsRangeChecked() ||
+ signBitIsZero(LHS, MRI))) {
+ if (LHSDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
+ FI = LHSDef->getOperand(1).getIndex();
+ else
+ VAddr = LHS.getReg();
+ Offset = PossibleOffset;
+ }
+ }
+ } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
+ FI = RootDef->getOperand(1).getIndex();
+ }
+ }
+
+ // If we don't know this private access is a local stack object, it needs to
+ // be relative to the entry point's scratch wave offset register.
+ // TODO: Should split large offsets that don't fit like above.
+ // TODO: Don't use scratch wave offset just because the offset didn't fit.
+ Register SOffset = FI.hasValue() ? Info->getStackPtrOffsetReg()
+ : Info->getScratchWaveOffsetReg();
+
+ return {{[=](MachineInstrBuilder &MIB) { // rsrc
+ MIB.addReg(Info->getScratchRSrcReg());
+ },
+ [=](MachineInstrBuilder &MIB) { // vaddr
+ if (FI.hasValue())
+ MIB.addFrameIndex(FI.getValue());
+ else
+ MIB.addReg(VAddr);
+ },
+ [=](MachineInstrBuilder &MIB) { // soffset
+ MIB.addReg(SOffset);
+ },
+ [=](MachineInstrBuilder &MIB) { // offset
+ MIB.addImm(Offset);
+ }}};
+}
+
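// Standalone arithmetic for the constant-offset split above: the bits that do
// not fit the 12-bit MUBUF immediate are moved into the vaddr register via
// V_MOV_B32, and only the low 12 bits stay in the offset field.
#include <cstdint>
#include <utility>

static std::pair<int64_t, int64_t> splitScratchOffset(int64_t Offset) {
  int64_t HighBits = Offset & ~4095; // goes into the vaddr VGPR
  int64_t LowBits  = Offset & 4095;  // fits the MUBUF offset field
  return {HighBits, LowBits};        // e.g. 0x1234 -> {0x1000, 0x234}
}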
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectMUBUFScratchOffset(
+ MachineOperand &Root) const {
+ MachineInstr *MI = Root.getParent();
+ MachineBasicBlock *MBB = MI->getParent();
+ MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+
+ int64_t Offset = 0;
+ if (!mi_match(Root.getReg(), MRI, m_ICst(Offset)) ||
+ !SIInstrInfo::isLegalMUBUFImmOffset(Offset))
+ return {};
+
+ const MachineFunction *MF = MBB->getParent();
+ const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
+ const MachineMemOperand *MMO = *MI->memoperands_begin();
+ const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();
+
+ Register SOffsetReg = isStackPtrRelative(PtrInfo)
+ ? Info->getStackPtrOffsetReg()
+ : Info->getScratchWaveOffsetReg();
+ return {{
+ [=](MachineInstrBuilder &MIB) {
+ MIB.addReg(Info->getScratchRSrcReg());
+ }, // rsrc
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffsetReg); }, // soffset
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
}};
}
diff --git a/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 449431adc561..4f489ddfb23d 100644
--- a/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -1,9 +1,8 @@
//===- AMDGPUInstructionSelector --------------------------------*- C++ -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
@@ -18,7 +17,9 @@
#include "AMDGPUArgumentUsageInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/Register.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
+#include "llvm/IR/InstrTypes.h"
namespace {
#define GET_GLOBALISEL_PREDICATE_BITSET
@@ -58,24 +59,45 @@ private:
GEPInfo(const MachineInstr &GEP) : GEP(GEP), Imm(0) { }
};
+ bool isInstrUniform(const MachineInstr &MI) const;
+ bool isVCC(Register Reg, const MachineRegisterInfo &MRI) const;
+
/// tblgen-erated 'select' implementation.
bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const;
- MachineOperand getSubOperand64(MachineOperand &MO, unsigned SubIdx) const;
+ MachineOperand getSubOperand64(MachineOperand &MO,
+ const TargetRegisterClass &SubRC,
+ unsigned SubIdx) const;
bool selectCOPY(MachineInstr &I) const;
+ bool selectPHI(MachineInstr &I) const;
+ bool selectG_TRUNC(MachineInstr &I) const;
+ bool selectG_SZA_EXT(MachineInstr &I) const;
bool selectG_CONSTANT(MachineInstr &I) const;
- bool selectG_ADD(MachineInstr &I) const;
+ bool selectG_AND_OR_XOR(MachineInstr &I) const;
+ bool selectG_ADD_SUB(MachineInstr &I) const;
+ bool selectG_EXTRACT(MachineInstr &I) const;
+ bool selectG_MERGE_VALUES(MachineInstr &I) const;
+ bool selectG_UNMERGE_VALUES(MachineInstr &I) const;
bool selectG_GEP(MachineInstr &I) const;
bool selectG_IMPLICIT_DEF(MachineInstr &I) const;
+ bool selectG_INSERT(MachineInstr &I) const;
bool selectG_INTRINSIC(MachineInstr &I, CodeGenCoverage &CoverageInfo) const;
bool selectG_INTRINSIC_W_SIDE_EFFECTS(MachineInstr &I,
CodeGenCoverage &CoverageInfo) const;
+ int getS_CMPOpcode(CmpInst::Predicate P, unsigned Size) const;
+ bool selectG_ICMP(MachineInstr &I) const;
bool hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const;
void getAddrModeInfo(const MachineInstr &Load, const MachineRegisterInfo &MRI,
SmallVectorImpl<GEPInfo> &AddrInfo) const;
bool selectSMRD(MachineInstr &I, ArrayRef<GEPInfo> AddrInfo) const;
bool selectG_LOAD(MachineInstr &I) const;
+ bool selectG_SELECT(MachineInstr &I) const;
bool selectG_STORE(MachineInstr &I) const;
+ bool selectG_BRCOND(MachineInstr &I) const;
+ bool selectG_FRAME_INDEX(MachineInstr &I) const;
+
+ std::pair<Register, unsigned>
+ selectVOP3ModsImpl(Register Src, const MachineRegisterInfo &MRI) const;
InstructionSelector::ComplexRendererFns
selectVCSRC(MachineOperand &Root) const;
@@ -90,6 +112,27 @@ private:
InstructionSelector::ComplexRendererFns
selectVOP3Mods(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
+ selectSmrdImm(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
+ selectSmrdImm32(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
+ selectSmrdSgpr(MachineOperand &Root) const;
+
+ template <bool Signed>
+ InstructionSelector::ComplexRendererFns
+ selectFlatOffsetImpl(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
+ selectFlatOffset(MachineOperand &Root) const;
+
+ InstructionSelector::ComplexRendererFns
+ selectFlatOffsetSigned(MachineOperand &Root) const;
+
+ InstructionSelector::ComplexRendererFns
+ selectMUBUFScratchOffen(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
+ selectMUBUFScratchOffset(MachineOperand &Root) const;
+
const SIInstrInfo &TII;
const SIRegisterInfo &TRI;
const AMDGPURegisterBankInfo &RBI;
diff --git a/lib/Target/AMDGPU/AMDGPUInstructions.td b/lib/Target/AMDGPU/AMDGPUInstructions.td
index eb8f2002ff2d..61bc415c839d 100644
--- a/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -1,9 +1,8 @@
//===-- AMDGPUInstructions.td - Common instruction defs ---*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -12,6 +11,18 @@
//
//===----------------------------------------------------------------------===//
+class AddressSpacesImpl {
+ int Flat = 0;
+ int Global = 1;
+ int Region = 2;
+ int Local = 3;
+ int Constant = 4;
+ int Private = 5;
+}
+
+def AddrSpaces : AddressSpacesImpl;
+
+
class AMDGPUInst <dag outs, dag ins, string asm = "",
list<dag> pattern = []> : Instruction {
field bit isRegisterLoad = 0;
@@ -66,17 +77,15 @@ class ILFormat<dag outs, dag ins, string asmstr, list<dag> pattern>
def TruePredicate : Predicate<"true">;
-// Exists to help track down where SubtargetPredicate isn't set rather
-// than letting tablegen crash with an unhelpful error.
-def InvalidPred : Predicate<"predicate not set on instruction or pattern">;
-
class PredicateControl {
- Predicate SubtargetPredicate = InvalidPred;
+ Predicate SubtargetPredicate = TruePredicate;
list<Predicate> AssemblerPredicates = [];
Predicate AssemblerPredicate = TruePredicate;
+ Predicate WaveSizePredicate = TruePredicate;
list<Predicate> OtherPredicates = [];
list<Predicate> Predicates = !listconcat([SubtargetPredicate,
- AssemblerPredicate],
+ AssemblerPredicate,
+ WaveSizePredicate],
AssemblerPredicates,
OtherPredicates);
}
@@ -326,6 +335,10 @@ def TEX_SHADOW_ARRAY : PatLeaf<
// Load/Store Pattern Fragments
//===----------------------------------------------------------------------===//
+class AddressSpaceList<list<int> AS> {
+ list<int> AddrSpaces = AS;
+}
+
class Aligned8Bytes <dag ops, dag frag> : PatFrag <ops, frag, [{
return cast<MemSDNode>(N)->getAlignment() % 8 == 0;
}]>;
@@ -344,21 +357,25 @@ class StoreHi16<SDPatternOperator op> : PatFrag <
(ops node:$value, node:$ptr), (op (srl node:$value, (i32 16)), node:$ptr)
>;
-class PrivateAddress : CodePatPred<[{
- return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS;
-}]>;
+def LoadAddress_constant : AddressSpaceList<[ AddrSpaces.Constant ]>;
+def LoadAddress_global : AddressSpaceList<[ AddrSpaces.Global, AddrSpaces.Constant ]>;
+def StoreAddress_global : AddressSpaceList<[ AddrSpaces.Global ]>;
-class ConstantAddress : CodePatPred<[{
- return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS;
-}]>;
+def LoadAddress_flat : AddressSpaceList<[ AddrSpaces.Flat,
+ AddrSpaces.Global,
+ AddrSpaces.Constant ]>;
+def StoreAddress_flat : AddressSpaceList<[ AddrSpaces.Flat, AddrSpaces.Global ]>;
+
+def LoadAddress_private : AddressSpaceList<[ AddrSpaces.Private ]>;
+def StoreAddress_private : AddressSpaceList<[ AddrSpaces.Private ]>;
+
+def LoadAddress_local : AddressSpaceList<[ AddrSpaces.Local ]>;
+def StoreAddress_local : AddressSpaceList<[ AddrSpaces.Local ]>;
+
+def LoadAddress_region : AddressSpaceList<[ AddrSpaces.Region ]>;
+def StoreAddress_region : AddressSpaceList<[ AddrSpaces.Region ]>;
-class LocalAddress : CodePatPred<[{
- return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
-}]>;
-class GlobalAddress : CodePatPred<[{
- return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
-}]>;
class GlobalLoadAddress : CodePatPred<[{
auto AS = cast<MemSDNode>(N)->getAddressSpace();
@@ -372,86 +389,126 @@ class FlatLoadAddress : CodePatPred<[{
AS == AMDGPUAS::CONSTANT_ADDRESS;
}]>;
-class FlatStoreAddress : CodePatPred<[{
- const auto AS = cast<MemSDNode>(N)->getAddressSpace();
- return AS == AMDGPUAS::FLAT_ADDRESS ||
- AS == AMDGPUAS::GLOBAL_ADDRESS;
+class GlobalAddress : CodePatPred<[{
+ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
}]>;
-class AZExtLoadBase <SDPatternOperator ld_node>: PatFrag<(ops node:$ptr),
- (ld_node node:$ptr), [{
- LoadSDNode *L = cast<LoadSDNode>(N);
- return L->getExtensionType() == ISD::ZEXTLOAD ||
- L->getExtensionType() == ISD::EXTLOAD;
+class PrivateAddress : CodePatPred<[{
+ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS;
}]>;
-def az_extload : AZExtLoadBase <unindexedload>;
-
-def az_extloadi8 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{
- return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8;
+class LocalAddress : CodePatPred<[{
+ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
}]>;
-def az_extloadi16 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{
- return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16;
+class RegionAddress : CodePatPred<[{
+ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
}]>;
-def az_extloadi32 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{
- return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i32;
+class FlatStoreAddress : CodePatPred<[{
+ const auto AS = cast<MemSDNode>(N)->getAddressSpace();
+ return AS == AMDGPUAS::FLAT_ADDRESS ||
+ AS == AMDGPUAS::GLOBAL_ADDRESS;
}]>;
-class PrivateLoad <SDPatternOperator op> : LoadFrag <op>, PrivateAddress;
+// TODO: Remove these once stores are converted to the new PatFrag format.
class PrivateStore <SDPatternOperator op> : StoreFrag <op>, PrivateAddress;
-
-class LocalLoad <SDPatternOperator op> : LoadFrag <op>, LocalAddress;
class LocalStore <SDPatternOperator op> : StoreFrag <op>, LocalAddress;
-
-class GlobalLoad <SDPatternOperator op> : LoadFrag<op>, GlobalLoadAddress;
+class RegionStore <SDPatternOperator op> : StoreFrag <op>, RegionAddress;
class GlobalStore <SDPatternOperator op> : StoreFrag<op>, GlobalAddress;
-
-class FlatLoad <SDPatternOperator op> : LoadFrag <op>, FlatLoadAddress;
class FlatStore <SDPatternOperator op> : StoreFrag <op>, FlatStoreAddress;
-class ConstantLoad <SDPatternOperator op> : LoadFrag <op>, ConstantAddress;
+foreach as = [ "global", "flat", "constant", "local", "private", "region" ] in {
+let AddressSpaces = !cast<AddressSpaceList>("LoadAddress_"#as).AddrSpaces in {
-def load_private : PrivateLoad <load>;
-def az_extloadi8_private : PrivateLoad <az_extloadi8>;
-def sextloadi8_private : PrivateLoad <sextloadi8>;
-def az_extloadi16_private : PrivateLoad <az_extloadi16>;
-def sextloadi16_private : PrivateLoad <sextloadi16>;
+def load_#as : PatFrag<(ops node:$ptr), (unindexedload node:$ptr)> {
+ let IsLoad = 1;
+ let IsNonExtLoad = 1;
+}
-def store_private : PrivateStore <store>;
-def truncstorei8_private : PrivateStore<truncstorei8>;
-def truncstorei16_private : PrivateStore <truncstorei16>;
-def store_hi16_private : StoreHi16 <truncstorei16>, PrivateAddress;
-def truncstorei8_hi16_private : StoreHi16<truncstorei8>, PrivateAddress;
+def extloadi8_#as : PatFrag<(ops node:$ptr), (extload node:$ptr)> {
+ let IsLoad = 1;
+ let MemoryVT = i8;
+}
+
+def extloadi16_#as : PatFrag<(ops node:$ptr), (extload node:$ptr)> {
+ let IsLoad = 1;
+ let MemoryVT = i16;
+}
+def sextloadi8_#as : PatFrag<(ops node:$ptr), (sextload node:$ptr)> {
+ let IsLoad = 1;
+ let MemoryVT = i8;
+}
+
+def sextloadi16_#as : PatFrag<(ops node:$ptr), (sextload node:$ptr)> {
+ let IsLoad = 1;
+ let MemoryVT = i16;
+}
+
+def zextloadi8_#as : PatFrag<(ops node:$ptr), (zextload node:$ptr)> {
+ let IsLoad = 1;
+ let MemoryVT = i8;
+}
+
+def zextloadi16_#as : PatFrag<(ops node:$ptr), (zextload node:$ptr)> {
+ let IsLoad = 1;
+ let MemoryVT = i16;
+}
+
+def atomic_load_32_#as : PatFrag<(ops node:$ptr), (atomic_load_32 node:$ptr)> {
+ let IsAtomic = 1;
+ let MemoryVT = i32;
+}
-def load_global : GlobalLoad <load>;
-def sextloadi8_global : GlobalLoad <sextloadi8>;
-def az_extloadi8_global : GlobalLoad <az_extloadi8>;
-def sextloadi16_global : GlobalLoad <sextloadi16>;
-def az_extloadi16_global : GlobalLoad <az_extloadi16>;
-def atomic_load_global : GlobalLoad<atomic_load>;
+def atomic_load_64_#as : PatFrag<(ops node:$ptr), (atomic_load_64 node:$ptr)> {
+ let IsAtomic = 1;
+ let MemoryVT = i64;
+}
+
+def store_#as : PatFrag<(ops node:$val, node:$ptr),
+ (unindexedstore node:$val, node:$ptr)> {
+ let IsStore = 1;
+ let IsTruncStore = 0;
+}
+
+// truncstore fragments.
+def truncstore_#as : PatFrag<(ops node:$val, node:$ptr),
+ (unindexedstore node:$val, node:$ptr)> {
+ let IsStore = 1;
+ let IsTruncStore = 1;
+}
+
+// TODO: We don't really need the truncstore here. We can use
+// unindexedstore with MemoryVT directly, which will save an
+// unnecessary check that the memory size is less than the value type
+// in the generated matcher table.
+def truncstorei8_#as : PatFrag<(ops node:$val, node:$ptr),
+ (truncstore node:$val, node:$ptr)> {
+ let IsStore = 1;
+ let MemoryVT = i8;
+}
+
+def truncstorei16_#as : PatFrag<(ops node:$val, node:$ptr),
+ (truncstore node:$val, node:$ptr)> {
+ let IsStore = 1;
+ let MemoryVT = i16;
+}
+
+defm atomic_store_#as : binary_atomic_op<atomic_store>;
+
+} // End let AddressSpaces = ...
+} // End foreach AddrSpace
+
+
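// For orientation, a throwaway C++ sketch of the names the foreach above
// expands to: one fragment family per address-space suffix (atomic_store_*
// additionally goes through the binary_atomic_op multiclass).
#include <cstdio>

int main() {
  const char *AS[] = {"global", "flat", "constant", "local", "private",
                      "region"};
  const char *Frags[] = {"load", "extloadi8", "extloadi16", "sextloadi8",
                         "sextloadi16", "zextloadi8", "zextloadi16",
                         "atomic_load_32", "atomic_load_64", "store",
                         "truncstore", "truncstorei8", "truncstorei16",
                         "atomic_store"};
  for (const char *S : AS)
    for (const char *F : Frags)
      std::printf("%s_%s\n", F, S); // e.g. load_global, truncstorei8_flat
  return 0;
}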
+def store_hi16_private : StoreHi16 <truncstorei16>, PrivateAddress;
+def truncstorei8_hi16_private : StoreHi16<truncstorei8>, PrivateAddress;
-def store_global : GlobalStore <store>;
-def truncstorei8_global : GlobalStore <truncstorei8>;
-def truncstorei16_global : GlobalStore <truncstorei16>;
def store_atomic_global : GlobalStore<atomic_store>;
def truncstorei8_hi16_global : StoreHi16 <truncstorei8>, GlobalAddress;
def truncstorei16_hi16_global : StoreHi16 <truncstorei16>, GlobalAddress;
-def load_local : LocalLoad <load>;
-def az_extloadi8_local : LocalLoad <az_extloadi8>;
-def sextloadi8_local : LocalLoad <sextloadi8>;
-def az_extloadi16_local : LocalLoad <az_extloadi16>;
-def sextloadi16_local : LocalLoad <sextloadi16>;
-def atomic_load_32_local : LocalLoad<atomic_load_32>;
-def atomic_load_64_local : LocalLoad<atomic_load_64>;
-
-def store_local : LocalStore <store>;
-def truncstorei8_local : LocalStore <truncstorei8>;
-def truncstorei16_local : LocalStore <truncstorei16>;
def store_local_hi16 : StoreHi16 <truncstorei16>, LocalAddress;
def truncstorei8_local_hi16 : StoreHi16<truncstorei8>, LocalAddress;
def atomic_store_local : LocalStore <atomic_store>;
@@ -472,34 +529,24 @@ def store_align16_local : Aligned16Bytes <
(ops node:$val, node:$ptr), (store_local node:$val, node:$ptr)
>;
-def load_flat : FlatLoad <load>;
-def az_extloadi8_flat : FlatLoad <az_extloadi8>;
-def sextloadi8_flat : FlatLoad <sextloadi8>;
-def az_extloadi16_flat : FlatLoad <az_extloadi16>;
-def sextloadi16_flat : FlatLoad <sextloadi16>;
-def atomic_load_flat : FlatLoad<atomic_load>;
-
-def store_flat : FlatStore <store>;
-def truncstorei8_flat : FlatStore <truncstorei8>;
-def truncstorei16_flat : FlatStore <truncstorei16>;
def atomic_store_flat : FlatStore <atomic_store>;
def truncstorei8_hi16_flat : StoreHi16<truncstorei8>, FlatStoreAddress;
def truncstorei16_hi16_flat : StoreHi16<truncstorei16>, FlatStoreAddress;
-def constant_load : ConstantLoad<load>;
-def sextloadi8_constant : ConstantLoad <sextloadi8>;
-def az_extloadi8_constant : ConstantLoad <az_extloadi8>;
-def sextloadi16_constant : ConstantLoad <sextloadi16>;
-def az_extloadi16_constant : ConstantLoad <az_extloadi16>;
-
-
class local_binary_atomic_op<SDNode atomic_op> :
PatFrag<(ops node:$ptr, node:$value),
(atomic_op node:$ptr, node:$value), [{
return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
}]>;
+class region_binary_atomic_op<SDNode atomic_op> :
+ PatFrag<(ops node:$ptr, node:$value),
+ (atomic_op node:$ptr, node:$value), [{
+ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
+}]>;
+
+
def atomic_swap_local : local_binary_atomic_op<atomic_swap>;
def atomic_load_add_local : local_binary_atomic_op<atomic_load_add>;
def atomic_load_sub_local : local_binary_atomic_op<atomic_load_sub>;
@@ -524,13 +571,22 @@ class AtomicCmpSwapLocal <SDNode cmp_swap_node> : PatFrag<
return AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
}]>;
+class AtomicCmpSwapRegion <SDNode cmp_swap_node> : PatFrag<
+ (ops node:$ptr, node:$cmp, node:$swap),
+ (cmp_swap_node node:$ptr, node:$cmp, node:$swap), [{
+ AtomicSDNode *AN = cast<AtomicSDNode>(N);
+ return AN->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
+}]>;
+
def atomic_cmp_swap_local : AtomicCmpSwapLocal <atomic_cmp_swap>;
+class global_binary_atomic_op_frag<SDNode atomic_op> : PatFrag<
+ (ops node:$ptr, node:$value),
+ (atomic_op node:$ptr, node:$value),
+ [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;}]>;
+
multiclass global_binary_atomic_op<SDNode atomic_op> {
- def "" : PatFrag<
- (ops node:$ptr, node:$value),
- (atomic_op node:$ptr, node:$value),
- [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;}]>;
+ def "" : global_binary_atomic_op_frag<atomic_op>;
def _noret : PatFrag<
(ops node:$ptr, node:$value),
@@ -585,7 +641,6 @@ int TWO_PI_INV = 0x3e22f983;
int FP_UINT_MAX_PLUS_1 = 0x4f800000; // 1 << 32 in floating point encoding
int FP16_ONE = 0x3C00;
int FP16_NEG_ONE = 0xBC00;
-int V2FP16_ONE = 0x3C003C00;
int FP32_ONE = 0x3f800000;
int FP32_NEG_ONE = 0xbf800000;
int FP64_ONE = 0x3ff0000000000000;
@@ -626,9 +681,7 @@ class Extract_Element <ValueType sub_type, ValueType vec_type, int sub_idx,
: AMDGPUPat<
(sub_type (extractelt vec_type:$src, sub_idx)),
(EXTRACT_SUBREG $src, sub_reg)
-> {
- let SubtargetPredicate = TruePredicate;
-}
+>;
/* Insert element pattern */
class Insert_Element <ValueType elem_type, ValueType vec_type,
@@ -636,9 +689,7 @@ class Insert_Element <ValueType elem_type, ValueType vec_type,
: AMDGPUPat <
(insertelt vec_type:$vec, elem_type:$elem, sub_idx),
(INSERT_SUBREG $vec, $elem, sub_reg)
-> {
- let SubtargetPredicate = TruePredicate;
-}
+>;
// XXX: Convert to new syntax and use COPY_TO_REG, once the DFAPacketizer
// can handle COPY instructions.
@@ -811,7 +862,7 @@ multiclass IntMed3Pat<Instruction med3Inst,
SDPatternOperator max_oneuse,
ValueType vt = i32> {
- // This matches 16 permutations of
+ // This matches 16 permutations of
// min(max(a, b), max(min(a, b), c))
def : AMDGPUPat <
(min (max_oneuse vt:$src0, vt:$src1),
@@ -819,7 +870,7 @@ multiclass IntMed3Pat<Instruction med3Inst,
(med3Inst vt:$src0, vt:$src1, vt:$src2)
>;
- // This matches 16 permutations of
+ // This matches 16 permutations of
// max(min(x, y), min(max(x, y), z))
def : AMDGPUPat <
(max (min_oneuse vt:$src0, vt:$src1),
@@ -827,7 +878,7 @@ multiclass IntMed3Pat<Instruction med3Inst,
(med3Inst $src0, $src1, $src2)
>;
}
-
+
// Special conversion patterns
def cvt_rpi_i32_f32 : PatFrag <
diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp
deleted file mode 100644
index 02108ca3ddd7..000000000000
--- a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp
+++ /dev/null
@@ -1,103 +0,0 @@
-//===- AMDGPUIntrinsicInfo.cpp - AMDGPU Intrinsic Information ---*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//==-----------------------------------------------------------------------===//
-//
-/// \file
-/// AMDGPU Implementation of the IntrinsicInfo class.
-//
-//===-----------------------------------------------------------------------===//
-
-#include "AMDGPUIntrinsicInfo.h"
-#include "AMDGPUSubtarget.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/Module.h"
-
-using namespace llvm;
-
-AMDGPUIntrinsicInfo::AMDGPUIntrinsicInfo()
- : TargetIntrinsicInfo() {}
-
-static const char *const IntrinsicNameTable[] = {
-#define GET_INTRINSIC_NAME_TABLE
-#include "AMDGPUGenIntrinsicImpl.inc"
-#undef GET_INTRINSIC_NAME_TABLE
-};
-
-namespace {
-#define GET_INTRINSIC_ATTRIBUTES
-#include "AMDGPUGenIntrinsicImpl.inc"
-#undef GET_INTRINSIC_ATTRIBUTES
-}
-
-StringRef AMDGPUIntrinsicInfo::getName(unsigned IntrID,
- ArrayRef<Type *> Tys) const {
- if (IntrID < Intrinsic::num_intrinsics)
- return StringRef();
-
- assert(IntrID < SIIntrinsic::num_AMDGPU_intrinsics &&
- "Invalid intrinsic ID");
-
- return IntrinsicNameTable[IntrID - Intrinsic::num_intrinsics];
-}
-
-std::string AMDGPUIntrinsicInfo::getName(unsigned IntrID, Type **Tys,
- unsigned NumTys) const {
- return getName(IntrID, makeArrayRef(Tys, NumTys)).str();
-}
-
-FunctionType *AMDGPUIntrinsicInfo::getType(LLVMContext &Context, unsigned ID,
- ArrayRef<Type*> Tys) const {
- // FIXME: Re-use Intrinsic::getType machinery
- llvm_unreachable("unhandled intrinsic");
-}
-
-unsigned AMDGPUIntrinsicInfo::lookupName(const char *NameData,
- unsigned Len) const {
- StringRef Name(NameData, Len);
- if (!Name.startswith("llvm."))
- return 0; // All intrinsics start with 'llvm.'
-
- // Look for a name match in our table. If the intrinsic is not overloaded,
- // require an exact match. If it is overloaded, require a prefix match. The
- // AMDGPU enum starts at Intrinsic::num_intrinsics.
- int Idx = Intrinsic::lookupLLVMIntrinsicByName(IntrinsicNameTable, Name);
- if (Idx >= 0) {
- bool IsPrefixMatch = Name.size() > strlen(IntrinsicNameTable[Idx]);
- return IsPrefixMatch == isOverloaded(Idx + 1)
- ? Intrinsic::num_intrinsics + Idx
- : 0;
- }
-
- return 0;
-}
-
-bool AMDGPUIntrinsicInfo::isOverloaded(unsigned id) const {
-// Overload Table
-#define GET_INTRINSIC_OVERLOAD_TABLE
-#include "AMDGPUGenIntrinsicImpl.inc"
-#undef GET_INTRINSIC_OVERLOAD_TABLE
-}
-
-Function *AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID,
- ArrayRef<Type *> Tys) const {
- FunctionType *FTy = getType(M->getContext(), IntrID, Tys);
- Function *F
- = cast<Function>(M->getOrInsertFunction(getName(IntrID, Tys), FTy));
-
- AttributeList AS =
- getAttributes(M->getContext(), static_cast<SIIntrinsic::ID>(IntrID));
- F->setAttributes(AS);
- return F;
-}
-
-Function *AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID,
- Type **Tys,
- unsigned NumTys) const {
- return getDeclaration(M, IntrID, makeArrayRef(Tys, NumTys));
-}
diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h
deleted file mode 100644
index a1a094dded23..000000000000
--- a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h
+++ /dev/null
@@ -1,58 +0,0 @@
-//===- AMDGPUIntrinsicInfo.h - AMDGPU Intrinsic Information ------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//==-----------------------------------------------------------------------===//
-//
-/// \file
-/// Interface for the AMDGPU Implementation of the Intrinsic Info class.
-//
-//===-----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUINTRINSICINFO_H
-#define LLVM_LIB_TARGET_AMDGPU_AMDGPUINTRINSICINFO_H
-
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/Target/TargetIntrinsicInfo.h"
-
-namespace llvm {
-class TargetMachine;
-
-namespace SIIntrinsic {
-enum ID {
- last_non_AMDGPU_intrinsic = Intrinsic::num_intrinsics - 1,
-#define GET_INTRINSIC_ENUM_VALUES
-#include "AMDGPUGenIntrinsicEnums.inc"
-#undef GET_INTRINSIC_ENUM_VALUES
- , num_AMDGPU_intrinsics
-};
-
-} // end namespace SIIntrinsic
-
-class AMDGPUIntrinsicInfo final : public TargetIntrinsicInfo {
-public:
- AMDGPUIntrinsicInfo();
-
- StringRef getName(unsigned IntrId, ArrayRef<Type *> Tys = None) const;
-
- std::string getName(unsigned IntrId, Type **Tys = nullptr,
- unsigned NumTys = 0) const override;
-
- unsigned lookupName(const char *Name, unsigned Len) const override;
- bool isOverloaded(unsigned IID) const override;
- Function *getDeclaration(Module *M, unsigned ID,
- Type **Tys = nullptr,
- unsigned NumTys = 0) const override;
-
- Function *getDeclaration(Module *M, unsigned ID,
- ArrayRef<Type *> = None) const;
-
- FunctionType *getType(LLVMContext &Context, unsigned ID,
- ArrayRef<Type*> Tys = None) const;
-};
-
-} // end namespace llvm
-
-#endif
diff --git a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index ef85c1040545..670f6225fbf7 100644
--- a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -1,9 +1,8 @@
//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
@@ -15,17 +14,93 @@
#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUTargetMachine.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"
+#define DEBUG_TYPE "amdgpu-legalinfo"
+
using namespace llvm;
using namespace LegalizeActions;
+using namespace LegalizeMutations;
+using namespace LegalityPredicates;
+
+
+static LegalityPredicate isMultiple32(unsigned TypeIdx,
+ unsigned MaxSize = 512) {
+ return [=](const LegalityQuery &Query) {
+ const LLT Ty = Query.Types[TypeIdx];
+ const LLT EltTy = Ty.getScalarType();
+ return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
+ };
+}
+
+static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
+ return [=](const LegalityQuery &Query) {
+ const LLT Ty = Query.Types[TypeIdx];
+ return Ty.isVector() &&
+ Ty.getNumElements() % 2 != 0 &&
+ Ty.getElementType().getSizeInBits() < 32;
+ };
+}
-AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST,
- const GCNTargetMachine &TM) {
+static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
+ return [=](const LegalityQuery &Query) {
+ const LLT Ty = Query.Types[TypeIdx];
+ const LLT EltTy = Ty.getElementType();
+ return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
+ };
+}
+
+static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
+ return [=](const LegalityQuery &Query) {
+ const LLT Ty = Query.Types[TypeIdx];
+ const LLT EltTy = Ty.getElementType();
+ unsigned Size = Ty.getSizeInBits();
+ unsigned Pieces = (Size + 63) / 64;
+ unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
+ return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
+ };
+}
+
+static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
+ return [=](const LegalityQuery &Query) {
+ const LLT QueryTy = Query.Types[TypeIdx];
+ return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
+ };
+}
+
+static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
+ return [=](const LegalityQuery &Query) {
+ const LLT QueryTy = Query.Types[TypeIdx];
+ return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
+ };
+}
+
+// Any combination of 32 or 64-bit elements up to 512 bits, and multiples of
+// v2s16.
+static LegalityPredicate isRegisterType(unsigned TypeIdx) {
+ return [=](const LegalityQuery &Query) {
+ const LLT Ty = Query.Types[TypeIdx];
+ if (Ty.isVector()) {
+ const int EltSize = Ty.getElementType().getSizeInBits();
+ return EltSize == 32 || EltSize == 64 ||
+ (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
+ EltSize == 128 || EltSize == 256;
+ }
+
+ return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 512;
+ };
+}
+
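// Standalone arithmetic for the two mutations above, treating a vector type
// as just (NumElts, EltBits): oneMoreElement pads a small odd vector by one
// element, and fewerEltsToSize64Vector resizes a wide vector into roughly
// 64-bit pieces.
#include <utility>

static std::pair<unsigned, unsigned> fewerEltsToSize64(unsigned NumElts,
                                                       unsigned EltBits) {
  unsigned Size = NumElts * EltBits;
  unsigned Pieces = (Size + 63) / 64;
  unsigned NewNumElts = (NumElts + 1) / Pieces;
  return {NewNumElts, EltBits};
}
// e.g. <4 x s32> (128 bits): Pieces = 2 -> <2 x s32>;
//      <3 x s32> ( 96 bits): Pieces = 2 -> <2 x s32>.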
+AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
+ const GCNTargetMachine &TM)
+ : ST(ST_) {
using namespace TargetOpcode;
auto GetAddrSpacePtr = [&TM](unsigned AS) {
@@ -33,13 +108,16 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST,
};
const LLT S1 = LLT::scalar(1);
+ const LLT S8 = LLT::scalar(8);
+ const LLT S16 = LLT::scalar(16);
const LLT S32 = LLT::scalar(32);
const LLT S64 = LLT::scalar(64);
+ const LLT S128 = LLT::scalar(128);
+ const LLT S256 = LLT::scalar(256);
const LLT S512 = LLT::scalar(512);
const LLT V2S16 = LLT::vector(2, 16);
const LLT V4S16 = LLT::vector(4, 16);
- const LLT V8S16 = LLT::vector(8, 16);
const LLT V2S32 = LLT::vector(2, 32);
const LLT V3S32 = LLT::vector(3, 32);
@@ -79,156 +157,428 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST,
const LLT CodePtr = FlatPtr;
- const LLT AddrSpaces[] = {
- GlobalPtr,
- ConstantPtr,
- LocalPtr,
- FlatPtr,
- PrivatePtr
+ const std::initializer_list<LLT> AddrSpaces64 = {
+ GlobalPtr, ConstantPtr, FlatPtr
+ };
+
+ const std::initializer_list<LLT> AddrSpaces32 = {
+ LocalPtr, PrivatePtr
+ };
+
+ const std::initializer_list<LLT> FPTypesBase = {
+ S32, S64
+ };
+
+ const std::initializer_list<LLT> FPTypes16 = {
+ S32, S64, S16
+ };
+
+ const std::initializer_list<LLT> FPTypesPK16 = {
+ S32, S64, S16, V2S16
};
setAction({G_BRCOND, S1}, Legal);
- setAction({G_ADD, S32}, Legal);
- setAction({G_ASHR, S32}, Legal);
- setAction({G_SUB, S32}, Legal);
- setAction({G_MUL, S32}, Legal);
+ // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
+ // elements for v3s16
+ getActionDefinitionsBuilder(G_PHI)
+ .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
+ .legalFor(AllS32Vectors)
+ .legalFor(AllS64Vectors)
+ .legalFor(AddrSpaces64)
+ .legalFor(AddrSpaces32)
+ .clampScalar(0, S32, S256)
+ .widenScalarToNextPow2(0, 32)
+ .clampMaxNumElements(0, S32, 16)
+ .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
+ .legalIf(isPointer(0));
- // FIXME: 64-bit ones only legal for scalar
+ if (ST.has16BitInsts()) {
+ getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
+ .legalFor({S32, S16})
+ .clampScalar(0, S16, S32)
+ .scalarize(0);
+ } else {
+ getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
+ .legalFor({S32})
+ .clampScalar(0, S32, S32)
+ .scalarize(0);
+ }
+
+ getActionDefinitionsBuilder({G_UMULH, G_SMULH})
+ .legalFor({S32})
+ .clampScalar(0, S32, S32)
+ .scalarize(0);
+
+ // Report legal for any types we can handle anywhere. For the cases only legal
+ // on the SALU, RegBankSelect will be able to re-legalize.
getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
- .legalFor({S32, S1, S64, V2S32});
+ .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
+ .clampScalar(0, S32, S64)
+ .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
+ .fewerElementsIf(vectorWiderThan(0, 32), fewerEltsToSize64Vector(0))
+ .widenScalarToNextPow2(0)
+ .scalarize(0);
getActionDefinitionsBuilder({G_UADDO, G_SADDO, G_USUBO, G_SSUBO,
G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
- .legalFor({{S32, S1}});
+ .legalFor({{S32, S1}})
+ .clampScalar(0, S32, S32);
- setAction({G_BITCAST, V2S16}, Legal);
- setAction({G_BITCAST, 1, S32}, Legal);
+ getActionDefinitionsBuilder(G_BITCAST)
+ .legalForCartesianProduct({S32, V2S16})
+ .legalForCartesianProduct({S64, V2S32, V4S16})
+ .legalForCartesianProduct({V2S64, V4S32})
+ // Don't worry about the size constraint.
+ .legalIf(all(isPointer(0), isPointer(1)));
- setAction({G_BITCAST, S32}, Legal);
- setAction({G_BITCAST, 1, V2S16}, Legal);
-
- getActionDefinitionsBuilder(G_FCONSTANT)
- .legalFor({S32, S64});
+ if (ST.has16BitInsts()) {
+ getActionDefinitionsBuilder(G_FCONSTANT)
+ .legalFor({S32, S64, S16})
+ .clampScalar(0, S16, S64);
+ } else {
+ getActionDefinitionsBuilder(G_FCONSTANT)
+ .legalFor({S32, S64})
+ .clampScalar(0, S32, S64);
+ }
- // G_IMPLICIT_DEF is a no-op so we can make it legal for any value type that
- // can fit in a register.
- // FIXME: We need to legalize several more operations before we can add
- // a test case for size > 512.
getActionDefinitionsBuilder(G_IMPLICIT_DEF)
- .legalIf([=](const LegalityQuery &Query) {
- return Query.Types[0].getSizeInBits() <= 512;
- })
- .clampScalar(0, S1, S512);
+ .legalFor({S1, S32, S64, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
+ ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
+ .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
+ .clampScalarOrElt(0, S32, S512)
+ .legalIf(isMultiple32(0))
+ .widenScalarToNextPow2(0, 32)
+ .clampMaxNumElements(0, S32, 16);
- getActionDefinitionsBuilder(G_CONSTANT)
- .legalFor({S1, S32, S64});
// FIXME: i1 operands to intrinsics should always be legal, but other i1
// values may not be legal. We need to figure out how to distinguish
// between these two scenarios.
- setAction({G_CONSTANT, S1}, Legal);
+ getActionDefinitionsBuilder(G_CONSTANT)
+ .legalFor({S1, S32, S64, GlobalPtr,
+ LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
+ .clampScalar(0, S32, S64)
+ .widenScalarToNextPow2(0)
+ .legalIf(isPointer(0));
setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
- getActionDefinitionsBuilder(
- { G_FADD, G_FMUL, G_FNEG, G_FABS, G_FMA})
+ auto &FPOpActions = getActionDefinitionsBuilder(
+ { G_FADD, G_FMUL, G_FNEG, G_FABS, G_FMA, G_FCANONICALIZE})
.legalFor({S32, S64});
- getActionDefinitionsBuilder(G_FPTRUNC)
- .legalFor({{S32, S64}});
+ if (ST.has16BitInsts()) {
+ if (ST.hasVOP3PInsts())
+ FPOpActions.legalFor({S16, V2S16});
+ else
+ FPOpActions.legalFor({S16});
+ }
- // Use actual fsub instruction
- setAction({G_FSUB, S32}, Legal);
+ auto &MinNumMaxNum = getActionDefinitionsBuilder({
+ G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
+
+ if (ST.hasVOP3PInsts()) {
+ MinNumMaxNum.customFor(FPTypesPK16)
+ .clampMaxNumElements(0, S16, 2)
+ .clampScalar(0, S16, S64)
+ .scalarize(0);
+ } else if (ST.has16BitInsts()) {
+ MinNumMaxNum.customFor(FPTypes16)
+ .clampScalar(0, S16, S64)
+ .scalarize(0);
+ } else {
+ MinNumMaxNum.customFor(FPTypesBase)
+ .clampScalar(0, S32, S64)
+ .scalarize(0);
+ }
- // Must use fadd + fneg
- setAction({G_FSUB, S64}, Lower);
+ // TODO: Implement
+ getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
- setAction({G_FCMP, S1}, Legal);
- setAction({G_FCMP, 1, S32}, Legal);
- setAction({G_FCMP, 1, S64}, Legal);
+ if (ST.hasVOP3PInsts())
+ FPOpActions.clampMaxNumElements(0, S16, 2);
+ FPOpActions
+ .scalarize(0)
+ .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
- setAction({G_ZEXT, S64}, Legal);
- setAction({G_ZEXT, 1, S32}, Legal);
+ if (ST.has16BitInsts()) {
+ getActionDefinitionsBuilder(G_FSQRT)
+ .legalFor({S32, S64, S16})
+ .scalarize(0)
+ .clampScalar(0, S16, S64);
+ } else {
+ getActionDefinitionsBuilder(G_FSQRT)
+ .legalFor({S32, S64})
+ .scalarize(0)
+ .clampScalar(0, S32, S64);
+ }
- setAction({G_SEXT, S64}, Legal);
- setAction({G_SEXT, 1, S32}, Legal);
+ getActionDefinitionsBuilder(G_FPTRUNC)
+ .legalFor({{S32, S64}, {S16, S32}})
+ .scalarize(0);
- setAction({G_ANYEXT, S64}, Legal);
- setAction({G_ANYEXT, 1, S32}, Legal);
+ getActionDefinitionsBuilder(G_FPEXT)
+ .legalFor({{S64, S32}, {S32, S16}})
+ .lowerFor({{S64, S16}}) // FIXME: Implement
+ .scalarize(0);
- setAction({G_FPTOSI, S32}, Legal);
- setAction({G_FPTOSI, 1, S32}, Legal);
+ // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
+ getActionDefinitionsBuilder(G_FCOPYSIGN).lower();
- setAction({G_SITOFP, S32}, Legal);
- setAction({G_SITOFP, 1, S32}, Legal);
+ getActionDefinitionsBuilder(G_FSUB)
+ // Use actual fsub instruction
+ .legalFor({S32})
+ // Must use fadd + fneg
+ .lowerFor({S64, S16, V2S16})
+ .scalarize(0)
+ .clampScalar(0, S32, S64);
- setAction({G_UITOFP, S32}, Legal);
- setAction({G_UITOFP, 1, S32}, Legal);
+ getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
+ .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
+ {S32, S1}, {S64, S1}, {S16, S1},
+ // FIXME: Hack
+ {S64, LLT::scalar(33)},
+ {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
+ .scalarize(0);
- setAction({G_FPTOUI, S32}, Legal);
- setAction({G_FPTOUI, 1, S32}, Legal);
+ getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
+ .legalFor({{S32, S32}, {S64, S32}})
+ .lowerFor({{S32, S64}})
+ .customFor({{S64, S64}})
+ .scalarize(0);
- setAction({G_FPOW, S32}, Legal);
- setAction({G_FEXP2, S32}, Legal);
- setAction({G_FLOG2, S32}, Legal);
+ getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
+ .legalFor({{S32, S32}, {S32, S64}})
+ .scalarize(0);
- getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_INTRINSIC_ROUND})
- .legalFor({S32, S64});
+ getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
+ .legalFor({S32, S64})
+ .scalarize(0);
- for (LLT PtrTy : AddrSpaces) {
- LLT IdxTy = LLT::scalar(PtrTy.getSizeInBits());
- setAction({G_GEP, PtrTy}, Legal);
- setAction({G_GEP, 1, IdxTy}, Legal);
+ if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
+ getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
+ .legalFor({S32, S64})
+ .clampScalar(0, S32, S64)
+ .scalarize(0);
+ } else {
+ getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
+ .legalFor({S32})
+ .customFor({S64})
+ .clampScalar(0, S32, S64)
+ .scalarize(0);
}
+ getActionDefinitionsBuilder(G_GEP)
+ .legalForCartesianProduct(AddrSpaces64, {S64})
+ .legalForCartesianProduct(AddrSpaces32, {S32})
+ .scalarize(0);
+
setAction({G_BLOCK_ADDR, CodePtr}, Legal);
- setAction({G_ICMP, S1}, Legal);
- setAction({G_ICMP, 1, S32}, Legal);
+ auto &CmpBuilder =
+ getActionDefinitionsBuilder(G_ICMP)
+ .legalForCartesianProduct(
+ {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
+ .legalFor({{S1, S32}, {S1, S64}});
+ if (ST.has16BitInsts()) {
+ CmpBuilder.legalFor({{S1, S16}});
+ }
+
+ CmpBuilder
+ .widenScalarToNextPow2(1)
+ .clampScalar(1, S32, S64)
+ .scalarize(0)
+ .legalIf(all(typeIs(0, S1), isPointer(1)));
+
+ getActionDefinitionsBuilder(G_FCMP)
+ .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
+ .widenScalarToNextPow2(1)
+ .clampScalar(1, S32, S64)
+ .scalarize(0);
+
+  // FIXME: fexp, flog2, flog10 need to be custom lowered.
+ getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
+ G_FLOG, G_FLOG2, G_FLOG10})
+ .legalFor({S32})
+ .scalarize(0);
+
+ // The 64-bit versions produce 32-bit results, but only on the SALU.
+ getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
+ G_CTTZ, G_CTTZ_ZERO_UNDEF,
+ G_CTPOP})
+ .legalFor({{S32, S32}, {S32, S64}})
+ .clampScalar(0, S32, S32)
+ .clampScalar(1, S32, S64)
+ .scalarize(0)
+ .widenScalarToNextPow2(0, 32)
+ .widenScalarToNextPow2(1, 32);
+
+ // TODO: Expand for > s32
+ getActionDefinitionsBuilder(G_BSWAP)
+ .legalFor({S32})
+ .clampScalar(0, S32, S32)
+ .scalarize(0);
+
+ if (ST.has16BitInsts()) {
+ if (ST.hasVOP3PInsts()) {
+ getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
+ .legalFor({S32, S16, V2S16})
+ .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
+ .clampMaxNumElements(0, S16, 2)
+ .clampScalar(0, S16, S32)
+ .widenScalarToNextPow2(0)
+ .scalarize(0);
+ } else {
+ getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
+ .legalFor({S32, S16})
+ .widenScalarToNextPow2(0)
+ .clampScalar(0, S16, S32)
+ .scalarize(0);
+ }
+ } else {
+ getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
+ .legalFor({S32})
+ .clampScalar(0, S32, S32)
+ .widenScalarToNextPow2(0)
+ .scalarize(0);
+ }
- setAction({G_CTLZ, S32}, Legal);
- setAction({G_CTLZ_ZERO_UNDEF, S32}, Legal);
- setAction({G_CTTZ, S32}, Legal);
- setAction({G_CTTZ_ZERO_UNDEF, S32}, Legal);
- setAction({G_BSWAP, S32}, Legal);
- setAction({G_CTPOP, S32}, Legal);
+ auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
+ return [=](const LegalityQuery &Query) {
+ return Query.Types[TypeIdx0].getSizeInBits() <
+ Query.Types[TypeIdx1].getSizeInBits();
+ };
+ };
+
+ auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
+ return [=](const LegalityQuery &Query) {
+ return Query.Types[TypeIdx0].getSizeInBits() >
+ Query.Types[TypeIdx1].getSizeInBits();
+ };
+ };
getActionDefinitionsBuilder(G_INTTOPTR)
- .legalIf([](const LegalityQuery &Query) {
- return true;
- });
+ // List the common cases
+ .legalForCartesianProduct(AddrSpaces64, {S64})
+ .legalForCartesianProduct(AddrSpaces32, {S32})
+ .scalarize(0)
+ // Accept any address space as long as the size matches
+ .legalIf(sameSize(0, 1))
+ .widenScalarIf(smallerThan(1, 0),
+ [](const LegalityQuery &Query) {
+ return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
+ })
+ .narrowScalarIf(greaterThan(1, 0),
+ [](const LegalityQuery &Query) {
+ return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
+ });
getActionDefinitionsBuilder(G_PTRTOINT)
- .legalIf([](const LegalityQuery &Query) {
- return true;
- });
+ // List the common cases
+ .legalForCartesianProduct(AddrSpaces64, {S64})
+ .legalForCartesianProduct(AddrSpaces32, {S32})
+ .scalarize(0)
+ // Accept any address space as long as the size matches
+ .legalIf(sameSize(0, 1))
+ .widenScalarIf(smallerThan(0, 1),
+ [](const LegalityQuery &Query) {
+ return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
+ })
+ .narrowScalarIf(
+ greaterThan(0, 1),
+ [](const LegalityQuery &Query) {
+ return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
+ });
+
+ if (ST.hasFlatAddressSpace()) {
+ getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
+ .scalarize(0)
+ .custom();
+ }
+  // TODO: Should load to s16 be legal? Most loads extend to 32 bits, but we
+ // handle some operations by just promoting the register during
+ // selection. There are also d16 loads on GFX9+ which preserve the high bits.
getActionDefinitionsBuilder({G_LOAD, G_STORE})
- .legalIf([=, &ST](const LegalityQuery &Query) {
+ .narrowScalarIf([](const LegalityQuery &Query) {
+ unsigned Size = Query.Types[0].getSizeInBits();
+ unsigned MemSize = Query.MMODescrs[0].SizeInBits;
+ return (Size > 32 && MemSize < Size);
+ },
+ [](const LegalityQuery &Query) {
+ return std::make_pair(0, LLT::scalar(32));
+ })
+ .fewerElementsIf([=](const LegalityQuery &Query) {
+ unsigned MemSize = Query.MMODescrs[0].SizeInBits;
+ return (MemSize == 96) &&
+ Query.Types[0].isVector() &&
+ !ST.hasDwordx3LoadStores();
+ },
+ [=](const LegalityQuery &Query) {
+ return std::make_pair(0, V2S32);
+ })
+ .legalIf([=](const LegalityQuery &Query) {
const LLT &Ty0 = Query.Types[0];
+ unsigned Size = Ty0.getSizeInBits();
+ unsigned MemSize = Query.MMODescrs[0].SizeInBits;
+ if (Size < 32 || (Size > 32 && MemSize < Size))
+ return false;
+
+ if (Ty0.isVector() && Size != MemSize)
+ return false;
+
// TODO: Decompose private loads into 4-byte components.
// TODO: Illegal flat loads on SI
- switch (Ty0.getSizeInBits()) {
+ switch (MemSize) {
+ case 8:
+ case 16:
+ return Size == 32;
case 32:
case 64:
case 128:
return true;
case 96:
- // XXX hasLoadX3
- return (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS);
+ return ST.hasDwordx3LoadStores();
case 256:
case 512:
- // TODO: constant loads
+      // TODO: Possibly support loads of i256 and i512. This will require
+      // adding i256 and i512 types to MVT in order to be able to use
+      // TableGen.
+      // TODO: Add support for other vector types; this will require
+      // defining more value mappings for the new types.
+ return Ty0.isVector() && (Ty0.getScalarType().getSizeInBits() == 32 ||
+ Ty0.getScalarType().getSizeInBits() == 64);
+
default:
return false;
}
- });
+ })
+ .clampScalar(0, S32, S64);
+ // FIXME: Handle alignment requirements.
+ auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
+ .legalForTypesWithMemDesc({
+ {S32, GlobalPtr, 8, 8},
+ {S32, GlobalPtr, 16, 8},
+ {S32, LocalPtr, 8, 8},
+ {S32, LocalPtr, 16, 8},
+ {S32, PrivatePtr, 8, 8},
+ {S32, PrivatePtr, 16, 8}});
+ if (ST.hasFlatAddressSpace()) {
+ ExtLoads.legalForTypesWithMemDesc({{S32, FlatPtr, 8, 8},
+ {S32, FlatPtr, 16, 8}});
+ }
+
+ ExtLoads.clampScalar(0, S32, S32)
+ .widenScalarToNextPow2(0)
+ .unsupportedIfMemSizeNotPow2()
+ .lower();
+
auto &Atomics = getActionDefinitionsBuilder(
{G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
@@ -240,84 +590,805 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST,
Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
}
- setAction({G_SELECT, S32}, Legal);
- setAction({G_SELECT, 1, S1}, Legal);
+ // TODO: Pointer types, any 32-bit or 64-bit vector
+ getActionDefinitionsBuilder(G_SELECT)
+ .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
+ GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
+ LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1})
+ .clampScalar(0, S16, S64)
+ .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
+ .fewerElementsIf(numElementsNotEven(0), scalarize(0))
+ .scalarize(1)
+ .clampMaxNumElements(0, S32, 2)
+ .clampMaxNumElements(0, LocalPtr, 2)
+ .clampMaxNumElements(0, PrivatePtr, 2)
+ .scalarize(0)
+ .widenScalarToNextPow2(0)
+ .legalIf(all(isPointer(0), typeIs(1, S1)));
- setAction({G_SHL, S32}, Legal);
+ // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
+ // be more flexible with the shift amount type.
+ auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
+ .legalFor({{S32, S32}, {S64, S32}});
+ if (ST.has16BitInsts()) {
+ if (ST.hasVOP3PInsts()) {
+ Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
+ .clampMaxNumElements(0, S16, 2);
+ } else
+ Shifts.legalFor({{S16, S32}, {S16, S16}});
-
- // FIXME: When RegBankSelect inserts copies, it will only create new
- // registers with scalar types. This means we can end up with
- // G_LOAD/G_STORE/G_GEP instruction with scalar types for their pointer
- // operands. In assert builds, the instruction selector will assert
- // if it sees a generic instruction which isn't legal, so we need to
- // tell it that scalar types are legal for pointer operands
- setAction({G_GEP, S64}, Legal);
+ Shifts.clampScalar(1, S16, S32);
+ Shifts.clampScalar(0, S16, S64);
+ Shifts.widenScalarToNextPow2(0, 16);
+ } else {
+ // Make sure we legalize the shift amount type first, as the general
+ // expansion for the shifted type will produce much worse code if it hasn't
+ // been truncated already.
+ Shifts.clampScalar(1, S32, S32);
+ Shifts.clampScalar(0, S32, S64);
+ Shifts.widenScalarToNextPow2(0, 32);
+ }
+ Shifts.scalarize(0);
for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
+ unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
+ unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
+ unsigned IdxTypeIdx = 2;
+
getActionDefinitionsBuilder(Op)
- .legalIf([=](const LegalityQuery &Query) {
- const LLT &VecTy = Query.Types[1];
- const LLT &IdxTy = Query.Types[2];
- return VecTy.getSizeInBits() % 32 == 0 &&
- VecTy.getSizeInBits() <= 512 &&
- IdxTy.getSizeInBits() == 32;
- });
+ .customIf([=](const LegalityQuery &Query) {
+ const LLT EltTy = Query.Types[EltTypeIdx];
+ const LLT VecTy = Query.Types[VecTypeIdx];
+ const LLT IdxTy = Query.Types[IdxTypeIdx];
+ return (EltTy.getSizeInBits() == 16 ||
+ EltTy.getSizeInBits() % 32 == 0) &&
+ VecTy.getSizeInBits() % 32 == 0 &&
+ VecTy.getSizeInBits() <= 512 &&
+ IdxTy.getSizeInBits() == 32;
+ })
+ .clampScalar(EltTypeIdx, S32, S64)
+ .clampScalar(VecTypeIdx, S32, S64)
+ .clampScalar(IdxTypeIdx, S32, S32);
}
- // FIXME: Doesn't handle extract of illegal sizes.
- getActionDefinitionsBuilder({G_EXTRACT, G_INSERT})
- .legalIf([=](const LegalityQuery &Query) {
- const LLT &Ty0 = Query.Types[0];
- const LLT &Ty1 = Query.Types[1];
- return (Ty0.getSizeInBits() % 32 == 0) &&
- (Ty1.getSizeInBits() % 32 == 0);
+ getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
+ .unsupportedIf([=](const LegalityQuery &Query) {
+ const LLT &EltTy = Query.Types[1].getElementType();
+ return Query.Types[0] != EltTy;
});
+ for (unsigned Op : {G_EXTRACT, G_INSERT}) {
+ unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
+ unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
+
+ // FIXME: Doesn't handle extract of illegal sizes.
+ getActionDefinitionsBuilder(Op)
+ .legalIf([=](const LegalityQuery &Query) {
+ const LLT BigTy = Query.Types[BigTyIdx];
+ const LLT LitTy = Query.Types[LitTyIdx];
+ return (BigTy.getSizeInBits() % 32 == 0) &&
+ (LitTy.getSizeInBits() % 16 == 0);
+ })
+ .widenScalarIf(
+ [=](const LegalityQuery &Query) {
+ const LLT BigTy = Query.Types[BigTyIdx];
+ return (BigTy.getScalarSizeInBits() < 16);
+ },
+ LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
+ .widenScalarIf(
+ [=](const LegalityQuery &Query) {
+ const LLT LitTy = Query.Types[LitTyIdx];
+ return (LitTy.getScalarSizeInBits() < 16);
+ },
+ LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
+ .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
+ .widenScalarToNextPow2(BigTyIdx, 32);
+
+ }
+
getActionDefinitionsBuilder(G_BUILD_VECTOR)
- .legalForCartesianProduct(AllS32Vectors, {S32})
- .legalForCartesianProduct(AllS64Vectors, {S64})
- .clampNumElements(0, V16S32, V16S32)
- .clampNumElements(0, V2S64, V8S64)
- .minScalarSameAs(1, 0);
+ .legalForCartesianProduct(AllS32Vectors, {S32})
+ .legalForCartesianProduct(AllS64Vectors, {S64})
+ .clampNumElements(0, V16S32, V16S32)
+ .clampNumElements(0, V2S64, V8S64)
+ .minScalarSameAs(1, 0)
+ .legalIf(isRegisterType(0))
+ .minScalarOrElt(0, S32);
- // TODO: Support any combination of v2s32
getActionDefinitionsBuilder(G_CONCAT_VECTORS)
- .legalFor({{V4S32, V2S32},
- {V8S32, V2S32},
- {V8S32, V4S32},
- {V4S64, V2S64},
- {V4S16, V2S16},
- {V8S16, V2S16},
- {V8S16, V4S16}});
+ .legalIf(isRegisterType(0));
// Merge/Unmerge
for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
+ auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
+ const LLT &Ty = Query.Types[TypeIdx];
+ if (Ty.isVector()) {
+ const LLT &EltTy = Ty.getElementType();
+ if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
+ return true;
+ if (!isPowerOf2_32(EltTy.getSizeInBits()))
+ return true;
+ }
+ return false;
+ };
+
getActionDefinitionsBuilder(Op)
+ .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
+ // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
+ // worth considering the multiples of 64 since 2*192 and 2*384 are not
+ // valid.
+ .clampScalar(LitTyIdx, S16, S256)
+ .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
+
+ // Break up vectors with weird elements into scalars
+ .fewerElementsIf(
+ [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
+ scalarize(0))
+ .fewerElementsIf(
+ [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
+ scalarize(1))
+ .clampScalar(BigTyIdx, S32, S512)
+ .widenScalarIf(
+ [=](const LegalityQuery &Query) {
+ const LLT &Ty = Query.Types[BigTyIdx];
+ return !isPowerOf2_32(Ty.getSizeInBits()) &&
+ Ty.getSizeInBits() % 16 != 0;
+ },
+ [=](const LegalityQuery &Query) {
+          // Pick the next power of 2, or a multiple of 64 over 128,
+          // whichever is smaller.
+ const LLT &Ty = Query.Types[BigTyIdx];
+ unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
+ if (NewSizeInBits >= 256) {
+ unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
+ if (RoundedTo < NewSizeInBits)
+ NewSizeInBits = RoundedTo;
+ }
+ return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
+ })
.legalIf([=](const LegalityQuery &Query) {
const LLT &BigTy = Query.Types[BigTyIdx];
const LLT &LitTy = Query.Types[LitTyIdx];
- return BigTy.getSizeInBits() % 32 == 0 &&
- LitTy.getSizeInBits() % 32 == 0 &&
+
+ if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
+ return false;
+ if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
+ return false;
+
+ return BigTy.getSizeInBits() % 16 == 0 &&
+ LitTy.getSizeInBits() % 16 == 0 &&
BigTy.getSizeInBits() <= 512;
})
// Any vectors left are the wrong size. Scalarize them.
- .fewerElementsIf([](const LegalityQuery &Query) { return true; },
- [](const LegalityQuery &Query) {
- return std::make_pair(
- 0, Query.Types[0].getElementType());
- })
- .fewerElementsIf([](const LegalityQuery &Query) { return true; },
- [](const LegalityQuery &Query) {
- return std::make_pair(
- 1, Query.Types[1].getElementType());
- });
-
+ .scalarize(0)
+ .scalarize(1);
}
computeTables();
verify(*ST.getInstrInfo());
}
+
+bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder,
+ GISelChangeObserver &Observer) const {
+ switch (MI.getOpcode()) {
+ case TargetOpcode::G_ADDRSPACE_CAST:
+ return legalizeAddrSpaceCast(MI, MRI, MIRBuilder);
+ case TargetOpcode::G_FRINT:
+ return legalizeFrint(MI, MRI, MIRBuilder);
+ case TargetOpcode::G_FCEIL:
+ return legalizeFceil(MI, MRI, MIRBuilder);
+ case TargetOpcode::G_INTRINSIC_TRUNC:
+ return legalizeIntrinsicTrunc(MI, MRI, MIRBuilder);
+ case TargetOpcode::G_SITOFP:
+ return legalizeITOFP(MI, MRI, MIRBuilder, true);
+ case TargetOpcode::G_UITOFP:
+ return legalizeITOFP(MI, MRI, MIRBuilder, false);
+ case TargetOpcode::G_FMINNUM:
+ case TargetOpcode::G_FMAXNUM:
+ case TargetOpcode::G_FMINNUM_IEEE:
+ case TargetOpcode::G_FMAXNUM_IEEE:
+ return legalizeMinNumMaxNum(MI, MRI, MIRBuilder);
+ case TargetOpcode::G_EXTRACT_VECTOR_ELT:
+ return legalizeExtractVectorElt(MI, MRI, MIRBuilder);
+ case TargetOpcode::G_INSERT_VECTOR_ELT:
+ return legalizeInsertVectorElt(MI, MRI, MIRBuilder);
+ default:
+ return false;
+ }
+
+ llvm_unreachable("expected switch to return");
+}
+
+Register AMDGPULegalizerInfo::getSegmentAperture(
+ unsigned AS,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder) const {
+ MachineFunction &MF = MIRBuilder.getMF();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const LLT S32 = LLT::scalar(32);
+
+ if (ST.hasApertureRegs()) {
+ // FIXME: Use inline constants (src_{shared, private}_base) instead of
+ // getreg.
+ unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
+ AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
+ AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
+ unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
+ AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
+ AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
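+    // Pack the hardware register ID, field offset, and width-minus-one into
+    // the immediate operand expected by S_GETREG_B32.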
+ unsigned Encoding =
+ AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
+ Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
+ WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
+
+ Register ApertureReg = MRI.createGenericVirtualRegister(S32);
+ Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+
+ MIRBuilder.buildInstr(AMDGPU::S_GETREG_B32)
+ .addDef(GetReg)
+ .addImm(Encoding);
+ MRI.setType(GetReg, S32);
+
+ auto ShiftAmt = MIRBuilder.buildConstant(S32, WidthM1 + 1);
+ MIRBuilder.buildInstr(TargetOpcode::G_SHL)
+ .addDef(ApertureReg)
+ .addUse(GetReg)
+ .addUse(ShiftAmt.getReg(0));
+
+ return ApertureReg;
+ }
+
+ Register QueuePtr = MRI.createGenericVirtualRegister(
+ LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
+
+ // FIXME: Placeholder until we can track the input registers.
+ MIRBuilder.buildConstant(QueuePtr, 0xdeadbeef);
+
+ // Offset into amd_queue_t for group_segment_aperture_base_hi /
+ // private_segment_aperture_base_hi.
+ uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
+
+ // FIXME: Don't use undef
+ Value *V = UndefValue::get(PointerType::get(
+ Type::getInt8Ty(MF.getFunction().getContext()),
+ AMDGPUAS::CONSTANT_ADDRESS));
+
+ MachinePointerInfo PtrInfo(V, StructOffset);
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ PtrInfo,
+ MachineMemOperand::MOLoad |
+ MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOInvariant,
+ 4,
+ MinAlign(64, StructOffset));
+
+ Register LoadResult = MRI.createGenericVirtualRegister(S32);
+ Register LoadAddr;
+
+ MIRBuilder.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
+ MIRBuilder.buildLoad(LoadResult, LoadAddr, *MMO);
+ return LoadResult;
+}
+
+bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
+ MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder) const {
+ MachineFunction &MF = MIRBuilder.getMF();
+
+ MIRBuilder.setInstr(MI);
+
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
+
+ LLT DstTy = MRI.getType(Dst);
+ LLT SrcTy = MRI.getType(Src);
+ unsigned DestAS = DstTy.getAddressSpace();
+ unsigned SrcAS = SrcTy.getAddressSpace();
+
+ // TODO: Avoid reloading from the queue ptr for each cast, or at least each
+ // vector element.
+ assert(!DstTy.isVector());
+
+ const AMDGPUTargetMachine &TM
+ = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
+
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
+ MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::G_BITCAST));
+ return true;
+ }
+
+ if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
+ assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
+ DestAS == AMDGPUAS::PRIVATE_ADDRESS);
+ unsigned NullVal = TM.getNullPointerValue(DestAS);
+
+ auto SegmentNull = MIRBuilder.buildConstant(DstTy, NullVal);
+ auto FlatNull = MIRBuilder.buildConstant(SrcTy, 0);
+
+ Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);
+
+ // Extract low 32-bits of the pointer.
+ MIRBuilder.buildExtract(PtrLo32, Src, 0);
+
+ Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
+ MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
+ MIRBuilder.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
+
+ MI.eraseFromParent();
+ return true;
+ }
+
+ assert(SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
+ SrcAS == AMDGPUAS::PRIVATE_ADDRESS);
+
+ auto SegmentNull =
+ MIRBuilder.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
+ auto FlatNull =
+ MIRBuilder.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
+
+ Register ApertureReg = getSegmentAperture(DestAS, MRI, MIRBuilder);
+
+ Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
+ MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));
+
+ Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);
+
+ // Coerce the type of the low half of the result so we can use merge_values.
+ Register SrcAsInt = MRI.createGenericVirtualRegister(LLT::scalar(32));
+ MIRBuilder.buildInstr(TargetOpcode::G_PTRTOINT)
+ .addDef(SrcAsInt)
+ .addUse(Src);
+
+ // TODO: Should we allow mismatched types but matching sizes in merges to
+ // avoid the ptrtoint?
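+  // The aperture register supplies the high 32 bits of the flat address; the
+  // low 32 bits come from the segment pointer.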
+ MIRBuilder.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
+ MIRBuilder.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));
+
+ MI.eraseFromParent();
+ return true;
+}
+
+bool AMDGPULegalizerInfo::legalizeFrint(
+ MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder) const {
+ MIRBuilder.setInstr(MI);
+
+ Register Src = MI.getOperand(1).getReg();
+ LLT Ty = MRI.getType(Src);
+ assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
+
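+  // Adding and then subtracting 2^52 (with the sign of the source) rounds the
+  // value to the nearest integer; values whose magnitude exceeds C2 (just
+  // below 2^52) are already integral and are passed through unchanged.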
+ APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
+ APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
+
+ auto C1 = MIRBuilder.buildFConstant(Ty, C1Val);
+ auto CopySign = MIRBuilder.buildFCopysign(Ty, C1, Src);
+
+ // TODO: Should this propagate fast-math-flags?
+ auto Tmp1 = MIRBuilder.buildFAdd(Ty, Src, CopySign);
+ auto Tmp2 = MIRBuilder.buildFSub(Ty, Tmp1, CopySign);
+
+ auto C2 = MIRBuilder.buildFConstant(Ty, C2Val);
+ auto Fabs = MIRBuilder.buildFAbs(Ty, Src);
+
+ auto Cond = MIRBuilder.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
+ MIRBuilder.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
+ return true;
+}
+
+bool AMDGPULegalizerInfo::legalizeFceil(
+ MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+ B.setInstr(MI);
+
+ const LLT S1 = LLT::scalar(1);
+ const LLT S64 = LLT::scalar(64);
+
+ Register Src = MI.getOperand(1).getReg();
+ assert(MRI.getType(Src) == S64);
+
+ // result = trunc(src)
+ // if (src > 0.0 && src != result)
+ // result += 1.0
+
+ auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});
+
+ const auto Zero = B.buildFConstant(S64, 0.0);
+ const auto One = B.buildFConstant(S64, 1.0);
+ auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
+ auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
+ auto And = B.buildAnd(S1, Lt0, NeTrunc);
+ auto Add = B.buildSelect(S64, And, One, Zero);
+
+ // TODO: Should this propagate fast-math-flags?
+ B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
+ return true;
+}
+
+static MachineInstrBuilder extractF64Exponent(unsigned Hi,
+ MachineIRBuilder &B) {
+ const unsigned FractBits = 52;
+ const unsigned ExpBits = 11;
+ LLT S32 = LLT::scalar(32);
+
+ auto Const0 = B.buildConstant(S32, FractBits - 32);
+ auto Const1 = B.buildConstant(S32, ExpBits);
+
+  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
+    .addUse(Hi)
+    .addUse(Const0.getReg(0))
+    .addUse(Const1.getReg(0));
+
+ return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
+}
+
+bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
+ MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+ B.setInstr(MI);
+
+ const LLT S1 = LLT::scalar(1);
+ const LLT S32 = LLT::scalar(32);
+ const LLT S64 = LLT::scalar(64);
+
+ Register Src = MI.getOperand(1).getReg();
+ assert(MRI.getType(Src) == S64);
+
+ // TODO: Should this use extract since the low half is unused?
+ auto Unmerge = B.buildUnmerge({S32, S32}, Src);
+ Register Hi = Unmerge.getReg(1);
+
+ // Extract the upper half, since this is where we will find the sign and
+ // exponent.
+ auto Exp = extractF64Exponent(Hi, B);
+
+ const unsigned FractBits = 52;
+
+ // Extract the sign bit.
+ const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
+ auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
+
+ const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
+
+ const auto Zero32 = B.buildConstant(S32, 0);
+
+  // Extend back to 64 bits.
+ auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});
+
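+  // Shifting the fraction mask right by the unbiased exponent leaves set bits
+  // only in the fractional bit positions; clearing those bits in the source
+  // truncates toward zero.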
+ auto Shr = B.buildAShr(S64, FractMask, Exp);
+ auto Not = B.buildNot(S64, Shr);
+ auto Tmp0 = B.buildAnd(S64, Src, Not);
+ auto FiftyOne = B.buildConstant(S32, FractBits - 1);
+
+ auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
+ auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
+
+ auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
+ B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
+ return true;
+}
+
+bool AMDGPULegalizerInfo::legalizeITOFP(
+ MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B, bool Signed) const {
+ B.setInstr(MI);
+
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
+
+ const LLT S64 = LLT::scalar(64);
+ const LLT S32 = LLT::scalar(32);
+
+ assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
+
+ auto Unmerge = B.buildUnmerge({S32, S32}, Src);
+
+ auto CvtHi = Signed ?
+ B.buildSITOFP(S64, Unmerge.getReg(1)) :
+ B.buildUITOFP(S64, Unmerge.getReg(1));
+
+ auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
+
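+  // Scale the converted high half by 2^32 with ldexp, then add the converted
+  // (always unsigned) low half to reconstruct the full 64-bit value.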
+ auto ThirtyTwo = B.buildConstant(S32, 32);
+ auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
+ .addUse(CvtHi.getReg(0))
+ .addUse(ThirtyTwo.getReg(0));
+
+ // TODO: Should this propagate fast-math-flags?
+ B.buildFAdd(Dst, LdExp, CvtLo);
+ MI.eraseFromParent();
+ return true;
+}
+
+bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
+ MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+ MachineFunction &MF = B.getMF();
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+ const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
+ MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
+
+ // With ieee_mode disabled, the instructions have the correct behavior
+ // already for G_FMINNUM/G_FMAXNUM
+ if (!MFI->getMode().IEEE)
+ return !IsIEEEOp;
+
+ if (IsIEEEOp)
+ return true;
+
+ MachineIRBuilder HelperBuilder(MI);
+ GISelObserverWrapper DummyObserver;
+ LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
+ HelperBuilder.setMBB(*MI.getParent());
+ return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
+}
+
+bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
+ MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+ // TODO: Should move some of this into LegalizerHelper.
+
+ // TODO: Promote dynamic indexing of s16 to s32
+ // TODO: Dynamic s64 indexing is only legal for SGPR.
+ Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
+ if (!IdxVal) // Dynamic case will be selected to register indexing.
+ return true;
+
+ Register Dst = MI.getOperand(0).getReg();
+ Register Vec = MI.getOperand(1).getReg();
+
+ LLT VecTy = MRI.getType(Vec);
+ LLT EltTy = VecTy.getElementType();
+ assert(EltTy == MRI.getType(Dst));
+
+ B.setInstr(MI);
+
+ if (IdxVal.getValue() < VecTy.getNumElements())
+ B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
+ else
+ B.buildUndef(Dst);
+
+ MI.eraseFromParent();
+ return true;
+}
+
+bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
+ MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+ // TODO: Should move some of this into LegalizerHelper.
+
+ // TODO: Promote dynamic indexing of s16 to s32
+ // TODO: Dynamic s64 indexing is only legal for SGPR.
+ Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
+ if (!IdxVal) // Dynamic case will be selected to register indexing.
+ return true;
+
+ Register Dst = MI.getOperand(0).getReg();
+ Register Vec = MI.getOperand(1).getReg();
+ Register Ins = MI.getOperand(2).getReg();
+
+ LLT VecTy = MRI.getType(Vec);
+ LLT EltTy = VecTy.getElementType();
+ assert(EltTy == MRI.getType(Ins));
+
+ B.setInstr(MI);
+
+ if (IdxVal.getValue() < VecTy.getNumElements())
+ B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
+ else
+ B.buildUndef(Dst);
+
+ MI.eraseFromParent();
+ return true;
+}
+
+// Return the use branch instruction, or null if the usage is invalid.
+static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
+ MachineRegisterInfo &MRI) {
+ Register CondDef = MI.getOperand(0).getReg();
+ if (!MRI.hasOneNonDBGUse(CondDef))
+ return nullptr;
+
+ MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
+ return UseMI.getParent() == MI.getParent() &&
+ UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr;
+}
+
+Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
+ Register Reg, LLT Ty) const {
+ Register LiveIn = MRI.getLiveInVirtReg(Reg);
+ if (LiveIn)
+ return LiveIn;
+
+ Register NewReg = MRI.createGenericVirtualRegister(Ty);
+ MRI.addLiveIn(Reg, NewReg);
+ return NewReg;
+}
+
+bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
+ const ArgDescriptor *Arg) const {
+ if (!Arg->isRegister())
+ return false; // TODO: Handle these
+
+ assert(Arg->getRegister() != 0);
+ assert(Arg->getRegister().isPhysical());
+
+ MachineRegisterInfo &MRI = *B.getMRI();
+
+ LLT Ty = MRI.getType(DstReg);
+ Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
+
+ if (Arg->isMasked()) {
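+    // The value occupies a bitfield within the register; shift it down to
+    // bit 0 and mask away the other packed fields.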
+ // TODO: Should we try to emit this once in the entry block?
+ const LLT S32 = LLT::scalar(32);
+ const unsigned Mask = Arg->getMask();
+ const unsigned Shift = countTrailingZeros<unsigned>(Mask);
+
+ auto ShiftAmt = B.buildConstant(S32, Shift);
+ auto LShr = B.buildLShr(S32, LiveIn, ShiftAmt);
+ B.buildAnd(DstReg, LShr, B.buildConstant(S32, Mask >> Shift));
+ } else
+ B.buildCopy(DstReg, LiveIn);
+
+  // Insert the argument copy if it doesn't already exist.
+ // FIXME: It seems EmitLiveInCopies isn't called anywhere?
+ if (!MRI.getVRegDef(LiveIn)) {
+ MachineBasicBlock &EntryMBB = B.getMF().front();
+ EntryMBB.addLiveIn(Arg->getRegister());
+ B.setInsertPt(EntryMBB, EntryMBB.begin());
+ B.buildCopy(LiveIn, Arg->getRegister());
+ }
+
+ return true;
+}
+
+bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
+ MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &B,
+ AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
+ B.setInstr(MI);
+
+ const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
+
+ const ArgDescriptor *Arg;
+ const TargetRegisterClass *RC;
+ std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
+ if (!Arg) {
+ LLVM_DEBUG(dbgs() << "Required arg register missing\n");
+ return false;
+ }
+
+ if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
+ MI.eraseFromParent();
+ return true;
+ }
+
+ return false;
+}
+
+bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+ const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
+ if (!MFI->isEntryFunction()) {
+ return legalizePreloadedArgIntrin(MI, MRI, B,
+ AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
+ }
+
+ B.setInstr(MI);
+
+ uint64_t Offset =
+ ST.getTargetLowering()->getImplicitParameterOffset(
+ B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
+ Register DstReg = MI.getOperand(0).getReg();
+ LLT DstTy = MRI.getType(DstReg);
+ LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
+
+ const ArgDescriptor *Arg;
+ const TargetRegisterClass *RC;
+ std::tie(Arg, RC)
+ = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
+ if (!Arg)
+ return false;
+
+ Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
+ if (!loadInputValue(KernargPtrReg, B, Arg))
+ return false;
+
+ B.buildGEP(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
+ MI.eraseFromParent();
+ return true;
+}
+
+bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+  // Replace the use of G_BRCOND with the exec manipulation and branch pseudos.
+ switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
+ case Intrinsic::amdgcn_if: {
+ if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
+ const SIRegisterInfo *TRI
+ = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
+
+ B.setInstr(*BrCond);
+ Register Def = MI.getOperand(1).getReg();
+ Register Use = MI.getOperand(3).getReg();
+ B.buildInstr(AMDGPU::SI_IF)
+ .addDef(Def)
+ .addUse(Use)
+ .addMBB(BrCond->getOperand(1).getMBB());
+
+ MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
+ MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
+ MI.eraseFromParent();
+ BrCond->eraseFromParent();
+ return true;
+ }
+
+ return false;
+ }
+ case Intrinsic::amdgcn_loop: {
+ if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
+ const SIRegisterInfo *TRI
+ = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
+
+ B.setInstr(*BrCond);
+ Register Reg = MI.getOperand(2).getReg();
+ B.buildInstr(AMDGPU::SI_LOOP)
+ .addUse(Reg)
+ .addMBB(BrCond->getOperand(1).getMBB());
+ MI.eraseFromParent();
+ BrCond->eraseFromParent();
+ MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
+ return true;
+ }
+
+ return false;
+ }
+ case Intrinsic::amdgcn_kernarg_segment_ptr:
+ return legalizePreloadedArgIntrin(
+ MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
+ case Intrinsic::amdgcn_implicitarg_ptr:
+ return legalizeImplicitArgPtr(MI, MRI, B);
+ case Intrinsic::amdgcn_workitem_id_x:
+ return legalizePreloadedArgIntrin(MI, MRI, B,
+ AMDGPUFunctionArgInfo::WORKITEM_ID_X);
+ case Intrinsic::amdgcn_workitem_id_y:
+ return legalizePreloadedArgIntrin(MI, MRI, B,
+ AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
+ case Intrinsic::amdgcn_workitem_id_z:
+ return legalizePreloadedArgIntrin(MI, MRI, B,
+ AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
+ case Intrinsic::amdgcn_workgroup_id_x:
+ return legalizePreloadedArgIntrin(MI, MRI, B,
+ AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
+ case Intrinsic::amdgcn_workgroup_id_y:
+ return legalizePreloadedArgIntrin(MI, MRI, B,
+ AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
+ case Intrinsic::amdgcn_workgroup_id_z:
+ return legalizePreloadedArgIntrin(MI, MRI, B,
+ AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
+ case Intrinsic::amdgcn_dispatch_ptr:
+ return legalizePreloadedArgIntrin(MI, MRI, B,
+ AMDGPUFunctionArgInfo::DISPATCH_PTR);
+ case Intrinsic::amdgcn_queue_ptr:
+ return legalizePreloadedArgIntrin(MI, MRI, B,
+ AMDGPUFunctionArgInfo::QUEUE_PTR);
+ case Intrinsic::amdgcn_implicit_buffer_ptr:
+ return legalizePreloadedArgIntrin(
+ MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
+ case Intrinsic::amdgcn_dispatch_id:
+ return legalizePreloadedArgIntrin(MI, MRI, B,
+ AMDGPUFunctionArgInfo::DISPATCH_ID);
+ default:
+ return true;
+ }
+
+ return true;
+}
diff --git a/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index 1cbd37c42c4b..3f1cc1d265dd 100644
--- a/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -1,9 +1,8 @@
//===- AMDGPULegalizerInfo ---------------------------------------*- C++ -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
@@ -16,6 +15,7 @@
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMACHINELEGALIZER_H
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
+#include "AMDGPUArgumentUsageInfo.h"
namespace llvm {
@@ -25,9 +25,51 @@ class GCNSubtarget;
/// This class provides the information for the target register banks.
class AMDGPULegalizerInfo : public LegalizerInfo {
+ const GCNSubtarget &ST;
+
public:
AMDGPULegalizerInfo(const GCNSubtarget &ST,
const GCNTargetMachine &TM);
+
+ bool legalizeCustom(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder,
+ GISelChangeObserver &Observer) const override;
+
+ Register getSegmentAperture(unsigned AddrSpace,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder) const;
+
+ bool legalizeAddrSpaceCast(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder) const;
+ bool legalizeFrint(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder) const;
+ bool legalizeFceil(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder) const;
+ bool legalizeIntrinsicTrunc(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder) const;
+ bool legalizeITOFP(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder, bool Signed) const;
+ bool legalizeMinNumMaxNum(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder) const;
+ bool legalizeExtractVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder) const;
+ bool legalizeInsertVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder) const;
+
+ Register getLiveInRegister(MachineRegisterInfo &MRI,
+ Register Reg, LLT Ty) const;
+
+ bool loadInputValue(Register DstReg, MachineIRBuilder &B,
+ const ArgDescriptor *Arg) const;
+ bool legalizePreloadedArgIntrin(
+ MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
+ AMDGPUFunctionArgInfo::PreloadedValue ArgType) const;
+
+ bool legalizeImplicitArgPtr(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
+ bool legalizeIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder) const override;
+
};
} // End llvm namespace.
#endif
diff --git a/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/lib/Target/AMDGPU/AMDGPULibCalls.cpp
index 14e880042691..ce0a9db7c7f4 100644
--- a/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ b/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -1,9 +1,8 @@
//===- AMDGPULibCalls.cpp -------------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -16,6 +15,7 @@
#include "AMDGPU.h"
#include "AMDGPULibFunc.h"
+#include "AMDGPUSubtarget.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/ADT/StringSet.h"
@@ -23,6 +23,7 @@
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
@@ -30,6 +31,7 @@
#include "llvm/IR/ValueSymbolTable.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <vector>
#include <cmath>
@@ -66,6 +68,8 @@ private:
typedef llvm::AMDGPULibFunc FuncInfo;
+ const TargetMachine *TM;
+
// -fuse-native.
bool AllNative = false;
@@ -73,7 +77,7 @@ private:
  // Return a pointer (pointer expr) to the function if a function definition with
// "FuncName" exists. It may create a new function prototype in pre-link mode.
- Constant *getFunction(Module *M, const FuncInfo& fInfo);
+ FunctionCallee getFunction(Module *M, const FuncInfo &fInfo);
// Replace a normal function with its native version.
bool replaceWithNative(CallInst *CI, const FuncInfo &FInfo);
@@ -135,12 +139,15 @@ private:
// __read_pipe/__write_pipe
bool fold_read_write_pipe(CallInst *CI, IRBuilder<> &B, FuncInfo &FInfo);
+ // llvm.amdgcn.wavefrontsize
+ bool fold_wavefrontsize(CallInst *CI, IRBuilder<> &B);
+
// Get insertion point at entry.
BasicBlock::iterator getEntryIns(CallInst * UI);
// Insert an Alloc instruction.
AllocaInst* insertAlloca(CallInst * UI, IRBuilder<> &B, const char *prefix);
  // Get a scalar native builtin single-argument FP function
- Constant* getNativeFunction(Module* M, const FuncInfo &FInfo);
+ FunctionCallee getNativeFunction(Module *M, const FuncInfo &FInfo);
protected:
CallInst *CI;
@@ -153,6 +160,8 @@ protected:
}
public:
+ AMDGPULibCalls(const TargetMachine *TM_ = nullptr) : TM(TM_) {}
+
bool fold(CallInst *CI, AliasAnalysis *AA = nullptr);
void initNativeFuncs();
@@ -167,15 +176,16 @@ namespace {
class AMDGPUSimplifyLibCalls : public FunctionPass {
- AMDGPULibCalls Simplifier;
-
const TargetOptions Options;
+ AMDGPULibCalls Simplifier;
+
public:
static char ID; // Pass identification
- AMDGPUSimplifyLibCalls(const TargetOptions &Opt = TargetOptions())
- : FunctionPass(ID), Options(Opt) {
+ AMDGPUSimplifyLibCalls(const TargetOptions &Opt = TargetOptions(),
+ const TargetMachine *TM = nullptr)
+ : FunctionPass(ID), Options(Opt), Simplifier(TM) {
initializeAMDGPUSimplifyLibCallsPass(*PassRegistry::getPassRegistry());
}
@@ -217,19 +227,19 @@ INITIALIZE_PASS(AMDGPUUseNativeCalls, "amdgpu-usenative",
false, false)
template <typename IRB>
-static CallInst *CreateCallEx(IRB &B, Value *Callee, Value *Arg,
+static CallInst *CreateCallEx(IRB &B, FunctionCallee Callee, Value *Arg,
const Twine &Name = "") {
CallInst *R = B.CreateCall(Callee, Arg, Name);
- if (Function* F = dyn_cast<Function>(Callee))
+ if (Function *F = dyn_cast<Function>(Callee.getCallee()))
R->setCallingConv(F->getCallingConv());
return R;
}
template <typename IRB>
-static CallInst *CreateCallEx2(IRB &B, Value *Callee, Value *Arg1, Value *Arg2,
- const Twine &Name = "") {
+static CallInst *CreateCallEx2(IRB &B, FunctionCallee Callee, Value *Arg1,
+ Value *Arg2, const Twine &Name = "") {
CallInst *R = B.CreateCall(Callee, {Arg1, Arg2}, Name);
- if (Function* F = dyn_cast<Function>(Callee))
+ if (Function *F = dyn_cast<Function>(Callee.getCallee()))
R->setCallingConv(F->getCallingConv());
return R;
}
@@ -472,7 +482,7 @@ static inline AMDGPULibFunc::EType getArgType(const AMDGPULibFunc& FInfo) {
return (AMDGPULibFunc::EType)FInfo.getLeads()[0].ArgType;
}
-Constant *AMDGPULibCalls::getFunction(Module *M, const FuncInfo& fInfo) {
+FunctionCallee AMDGPULibCalls::getFunction(Module *M, const FuncInfo &fInfo) {
// If we are doing PreLinkOpt, the function is external. So it is safe to
// use getOrInsertFunction() at this stage.
@@ -519,11 +529,11 @@ bool AMDGPULibCalls::sincosUseNative(CallInst *aCI, const FuncInfo &FInfo) {
nf.setPrefix(AMDGPULibFunc::NATIVE);
nf.setId(AMDGPULibFunc::EI_SIN);
- Constant *sinExpr = getFunction(M, nf);
+ FunctionCallee sinExpr = getFunction(M, nf);
nf.setPrefix(AMDGPULibFunc::NATIVE);
nf.setId(AMDGPULibFunc::EI_COS);
- Constant *cosExpr = getFunction(M, nf);
+ FunctionCallee cosExpr = getFunction(M, nf);
if (sinExpr && cosExpr) {
Value *sinval = CallInst::Create(sinExpr, opr0, "splitsin", aCI);
Value *cosval = CallInst::Create(cosExpr, opr0, "splitcos", aCI);
@@ -555,7 +565,7 @@ bool AMDGPULibCalls::useNative(CallInst *aCI) {
return sincosUseNative(aCI, FInfo);
FInfo.setPrefix(AMDGPULibFunc::NATIVE);
- Constant *F = getFunction(aCI->getModule(), FInfo);
+ FunctionCallee F = getFunction(aCI->getModule(), FInfo);
if (!F)
return false;
@@ -613,7 +623,7 @@ bool AMDGPULibCalls::fold_read_write_pipe(CallInst *CI, IRBuilder<> &B,
auto *FTy = FunctionType::get(Callee->getReturnType(),
ArrayRef<Type *>(ArgTys), false);
AMDGPULibFunc NewLibFunc(Name, FTy);
- auto *F = AMDGPULibFunc::getOrInsertFunction(M, NewLibFunc);
+ FunctionCallee F = AMDGPULibFunc::getOrInsertFunction(M, NewLibFunc);
if (!F)
return false;
@@ -640,14 +650,6 @@ bool AMDGPULibCalls::fold(CallInst *CI, AliasAnalysis *AA) {
// Ignore indirect calls.
if (Callee == 0) return false;
- FuncInfo FInfo;
- if (!parseFunctionName(Callee->getName(), &FInfo))
- return false;
-
- // Further check the number of arguments to see if they match.
- if (CI->getNumArgOperands() != FInfo.getNumArgs())
- return false;
-
BasicBlock *BB = CI->getParent();
LLVMContext &Context = CI->getParent()->getContext();
IRBuilder<> B(Context);
@@ -659,6 +661,21 @@ bool AMDGPULibCalls::fold(CallInst *CI, AliasAnalysis *AA) {
if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(CI))
B.setFastMathFlags(FPOp->getFastMathFlags());
+ switch (Callee->getIntrinsicID()) {
+ default:
+ break;
+ case Intrinsic::amdgcn_wavefrontsize:
+ return !EnablePreLink && fold_wavefrontsize(CI, B);
+ }
+
+ FuncInfo FInfo;
+ if (!parseFunctionName(Callee->getName(), &FInfo))
+ return false;
+
+ // Further check the number of arguments to see if they match.
+ if (CI->getNumArgOperands() != FInfo.getNumArgs())
+ return false;
+
if (TDOFold(CI, FInfo))
return true;
@@ -795,7 +812,7 @@ bool AMDGPULibCalls::replaceWithNative(CallInst *CI, const FuncInfo &FInfo) {
AMDGPULibFunc nf = FInfo;
nf.setPrefix(AMDGPULibFunc::NATIVE);
- if (Constant *FPExpr = getFunction(M, nf)) {
+ if (FunctionCallee FPExpr = getFunction(M, nf)) {
LLVM_DEBUG(dbgs() << "AMDIC: " << *CI << " ---> ");
CI->setCalledFunction(FPExpr);
@@ -848,7 +865,7 @@ bool AMDGPULibCalls::fold_divide(CallInst *CI, IRBuilder<> &B,
namespace llvm {
static double log2(double V) {
-#if _XOPEN_SOURCE >= 600 || _ISOC99_SOURCE || _POSIX_C_SOURCE >= 200112L
+#if _XOPEN_SOURCE >= 600 || defined(_ISOC99_SOURCE) || _POSIX_C_SOURCE >= 200112L
return ::log2(V);
#else
return log(V) / 0.693147180559945309417;
@@ -934,9 +951,10 @@ bool AMDGPULibCalls::fold_pow(CallInst *CI, IRBuilder<> &B,
if (CF && (CF->isExactlyValue(0.5) || CF->isExactlyValue(-0.5))) {
// pow[r](x, [-]0.5) = sqrt(x)
bool issqrt = CF->isExactlyValue(0.5);
- if (Constant *FPExpr = getFunction(M,
- AMDGPULibFunc(issqrt ? AMDGPULibFunc::EI_SQRT
- : AMDGPULibFunc::EI_RSQRT, FInfo))) {
+ if (FunctionCallee FPExpr =
+ getFunction(M, AMDGPULibFunc(issqrt ? AMDGPULibFunc::EI_SQRT
+ : AMDGPULibFunc::EI_RSQRT,
+ FInfo))) {
LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> "
<< FInfo.getName().c_str() << "(" << *opr0 << ")\n");
Value *nval = CreateCallEx(B,FPExpr, opr0, issqrt ? "__pow2sqrt"
@@ -1003,8 +1021,8 @@ bool AMDGPULibCalls::fold_pow(CallInst *CI, IRBuilder<> &B,
// powr ---> exp2(y * log2(x))
// pown/pow ---> powr(fabs(x), y) | (x & ((int)y << 31))
- Constant *ExpExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_EXP2,
- FInfo));
+ FunctionCallee ExpExpr =
+ getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_EXP2, FInfo));
if (!ExpExpr)
return false;
@@ -1090,8 +1108,8 @@ bool AMDGPULibCalls::fold_pow(CallInst *CI, IRBuilder<> &B,
Value *nval;
if (needabs) {
- Constant *AbsExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_FABS,
- FInfo));
+ FunctionCallee AbsExpr =
+ getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_FABS, FInfo));
if (!AbsExpr)
return false;
nval = CreateCallEx(B, AbsExpr, opr0, "__fabs");
@@ -1099,8 +1117,8 @@ bool AMDGPULibCalls::fold_pow(CallInst *CI, IRBuilder<> &B,
nval = cnval ? cnval : opr0;
}
if (needlog) {
- Constant *LogExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_LOG2,
- FInfo));
+ FunctionCallee LogExpr =
+ getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_LOG2, FInfo));
if (!LogExpr)
return false;
nval = CreateCallEx(B,LogExpr, nval, "__log2");
@@ -1159,8 +1177,8 @@ bool AMDGPULibCalls::fold_rootn(CallInst *CI, IRBuilder<> &B,
std::vector<const Type*> ParamsTys;
ParamsTys.push_back(opr0->getType());
Module *M = CI->getModule();
- if (Constant *FPExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_SQRT,
- FInfo))) {
+ if (FunctionCallee FPExpr =
+ getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_SQRT, FInfo))) {
LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> sqrt(" << *opr0 << ")\n");
Value *nval = CreateCallEx(B,FPExpr, opr0, "__rootn2sqrt");
replaceCall(nval);
@@ -1168,8 +1186,8 @@ bool AMDGPULibCalls::fold_rootn(CallInst *CI, IRBuilder<> &B,
}
} else if (ci_opr1 == 3) { // rootn(x, 3) = cbrt(x)
Module *M = CI->getModule();
- if (Constant *FPExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_CBRT,
- FInfo))) {
+ if (FunctionCallee FPExpr =
+ getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_CBRT, FInfo))) {
LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> cbrt(" << *opr0 << ")\n");
Value *nval = CreateCallEx(B,FPExpr, opr0, "__rootn2cbrt");
replaceCall(nval);
@@ -1186,8 +1204,8 @@ bool AMDGPULibCalls::fold_rootn(CallInst *CI, IRBuilder<> &B,
std::vector<const Type*> ParamsTys;
ParamsTys.push_back(opr0->getType());
Module *M = CI->getModule();
- if (Constant *FPExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_RSQRT,
- FInfo))) {
+ if (FunctionCallee FPExpr =
+ getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_RSQRT, FInfo))) {
LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> rsqrt(" << *opr0
<< ")\n");
Value *nval = CreateCallEx(B,FPExpr, opr0, "__rootn2rsqrt");
@@ -1243,7 +1261,8 @@ bool AMDGPULibCalls::fold_fma_mad(CallInst *CI, IRBuilder<> &B,
}
// Get a scalar native builtin single-argument FP function
-Constant* AMDGPULibCalls::getNativeFunction(Module* M, const FuncInfo& FInfo) {
+FunctionCallee AMDGPULibCalls::getNativeFunction(Module *M,
+ const FuncInfo &FInfo) {
if (getArgType(FInfo) == AMDGPULibFunc::F64 || !HasNative(FInfo.getId()))
return nullptr;
FuncInfo nf = FInfo;
@@ -1256,8 +1275,8 @@ bool AMDGPULibCalls::fold_sqrt(CallInst *CI, IRBuilder<> &B,
const FuncInfo &FInfo) {
if (getArgType(FInfo) == AMDGPULibFunc::F32 && (getVecSize(FInfo) == 1) &&
(FInfo.getPrefix() != AMDGPULibFunc::NATIVE)) {
- if (Constant *FPExpr = getNativeFunction(
- CI->getModule(), AMDGPULibFunc(AMDGPULibFunc::EI_SQRT, FInfo))) {
+ if (FunctionCallee FPExpr = getNativeFunction(
+ CI->getModule(), AMDGPULibFunc(AMDGPULibFunc::EI_SQRT, FInfo))) {
Value *opr0 = CI->getArgOperand(0);
LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> "
<< "sqrt(" << *opr0 << ")\n");
@@ -1334,7 +1353,7 @@ bool AMDGPULibCalls::fold_sincos(CallInst *CI, IRBuilder<> &B,
// function.
AMDGPULibFunc nf(AMDGPULibFunc::EI_SINCOS, fInfo);
nf.getLeads()[0].PtrKind = AMDGPULibFunc::getEPtrKindFromAddrSpace(AMDGPUAS::FLAT_ADDRESS);
- Function *Fsincos = dyn_cast_or_null<Function>(getFunction(M, nf));
+ FunctionCallee Fsincos = getFunction(M, nf);
if (!Fsincos) return false;
BasicBlock::iterator ItOld = B.GetInsertPoint();
@@ -1342,7 +1361,7 @@ bool AMDGPULibCalls::fold_sincos(CallInst *CI, IRBuilder<> &B,
B.SetInsertPoint(UI);
Value *P = Alloc;
- Type *PTy = Fsincos->getFunctionType()->getParamType(1);
+ Type *PTy = Fsincos.getFunctionType()->getParamType(1);
  // The allocaInst allocates the memory in the private address space. This
  // needs to be bitcast to the address space of the cos pointer type.
// In OpenCL 2.0 this is generic, while in 1.2 that is private.
@@ -1356,12 +1375,12 @@ bool AMDGPULibCalls::fold_sincos(CallInst *CI, IRBuilder<> &B,
if (!isSin) { // CI->cos, UI->sin
B.SetInsertPoint(&*ItOld);
UI->replaceAllUsesWith(&*Call);
- Instruction *Reload = B.CreateLoad(Alloc);
+ Instruction *Reload = B.CreateLoad(Alloc->getAllocatedType(), Alloc);
CI->replaceAllUsesWith(Reload);
UI->eraseFromParent();
CI->eraseFromParent();
} else { // CI->sin, UI->cos
- Instruction *Reload = B.CreateLoad(Alloc);
+ Instruction *Reload = B.CreateLoad(Alloc->getAllocatedType(), Alloc);
UI->replaceAllUsesWith(Reload);
CI->replaceAllUsesWith(Call);
UI->eraseFromParent();
@@ -1370,6 +1389,29 @@ bool AMDGPULibCalls::fold_sincos(CallInst *CI, IRBuilder<> &B,
return true;
}
+bool AMDGPULibCalls::fold_wavefrontsize(CallInst *CI, IRBuilder<> &B) {
+ if (!TM)
+ return false;
+
+ StringRef CPU = TM->getTargetCPU();
+ StringRef Features = TM->getTargetFeatureString();
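+  // Only fold when the wave size is actually known, i.e. when a specific CPU
+  // or an explicit wavefrontsize feature has been provided.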
+ if ((CPU.empty() || CPU.equals_lower("generic")) &&
+ (Features.empty() ||
+ Features.find_lower("wavefrontsize") == StringRef::npos))
+ return false;
+
+ Function *F = CI->getParent()->getParent();
+ const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(*F);
+ unsigned N = ST.getWavefrontSize();
+
+ LLVM_DEBUG(errs() << "AMDIC: fold_wavefrontsize (" << *CI << ") with "
+ << N << "\n");
+
+ CI->replaceAllUsesWith(ConstantInt::get(B.getInt32Ty(), N));
+ CI->eraseFromParent();
+ return true;
+}
+
// Get insertion point at entry.
BasicBlock::iterator AMDGPULibCalls::getEntryIns(CallInst * UI) {
Function * Func = UI->getParent()->getParent();
@@ -1679,8 +1721,9 @@ bool AMDGPULibCalls::evaluateCall(CallInst *aCI, FuncInfo &FInfo) {
}
// Public interface to the Simplify LibCalls pass.
-FunctionPass *llvm::createAMDGPUSimplifyLibCallsPass(const TargetOptions &Opt) {
- return new AMDGPUSimplifyLibCalls(Opt);
+FunctionPass *llvm::createAMDGPUSimplifyLibCallsPass(const TargetOptions &Opt,
+ const TargetMachine *TM) {
+ return new AMDGPUSimplifyLibCalls(Opt, TM);
}
FunctionPass *llvm::createAMDGPUUseNativeCallsPass() {
diff --git a/lib/Target/AMDGPU/AMDGPULibFunc.cpp b/lib/Target/AMDGPU/AMDGPULibFunc.cpp
index 4fc3fe0f105b..a5bac25701a0 100644
--- a/lib/Target/AMDGPU/AMDGPULibFunc.cpp
+++ b/lib/Target/AMDGPU/AMDGPULibFunc.cpp
@@ -1,9 +1,8 @@
//===-- AMDGPULibFunc.cpp -------------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -64,6 +63,8 @@ struct ManglingRule {
int getNumLeads() const { return (Lead[0] ? 1 : 0) + (Lead[1] ? 1 : 0); }
unsigned getNumArgs() const;
+
+ static StringMap<int> buildManglingRulesMap();
};
// Information about library functions with unmangled names.
@@ -77,16 +78,7 @@ class UnmangledFuncInfo {
// Number of entries in Table.
static const unsigned TableSize;
- // Map function name to index.
- class NameMap : public StringMap<unsigned> {
- public:
- NameMap() {
- for (unsigned I = 0; I != TableSize; ++I)
- (*this)[Table[I].Name] = I;
- }
- };
- friend class NameMap;
- static NameMap Map;
+ static StringMap<unsigned> buildNameMap();
public:
using ID = AMDGPULibFunc::EFuncId;
@@ -102,7 +94,8 @@ public:
static_cast<unsigned>(AMDGPULibFunc::EI_LAST_MANGLED);
}
static ID toFuncId(unsigned Index) {
- assert(Index < TableSize && "Invalid unmangled library function");
+ assert(Index < TableSize &&
+ "Invalid unmangled library function");
return static_cast<ID>(
Index + 1 + static_cast<unsigned>(AMDGPULibFunc::EI_LAST_MANGLED));
}
@@ -350,18 +343,7 @@ const UnmangledFuncInfo UnmangledFuncInfo::Table[] = {
};
const unsigned UnmangledFuncInfo::TableSize =
- sizeof(UnmangledFuncInfo::Table) / sizeof(UnmangledFuncInfo::Table[0]);
-
-UnmangledFuncInfo::NameMap UnmangledFuncInfo::Map;
-
-static const struct ManglingRulesMap : public StringMap<int> {
- ManglingRulesMap()
- : StringMap<int>(sizeof(manglingRules)/sizeof(manglingRules[0])) {
- int Id = 0;
- for (auto Rule : manglingRules)
- insert({ Rule.Name, Id++ });
- }
-} manglingRulesMap;
+ array_lengthof(UnmangledFuncInfo::Table);
static AMDGPULibFunc::Param getRetType(AMDGPULibFunc::EFuncId id,
const AMDGPULibFunc::Param (&Leads)[2]) {
@@ -569,7 +551,17 @@ static AMDGPULibFunc::ENamePrefix parseNamePrefix(StringRef& mangledName) {
return Pfx;
}
+StringMap<int> ManglingRule::buildManglingRulesMap() {
+ StringMap<int> Map(array_lengthof(manglingRules));
+ int Id = 0;
+ for (auto Rule : manglingRules)
+ Map.insert({Rule.Name, Id++});
+ return Map;
+}
+
bool AMDGPUMangledLibFunc::parseUnmangledName(StringRef FullName) {
+ static const StringMap<int> manglingRulesMap =
+ ManglingRule::buildManglingRulesMap();
FuncId = static_cast<EFuncId>(manglingRulesMap.lookup(FullName));
return FuncId != EI_NONE;
}
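
The map of mangling rules is now built on first use inside a function-local static, so nothing runs at program start-up and C++11 guarantees one-time, thread-safe construction. A minimal sketch of the pattern, with hypothetical names and entries:

#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
using namespace llvm;

// Hypothetical rule table, built lazily the first time lookupRule is called.
static StringMap<int> buildRuleMap() {
  StringMap<int> Map;
  Map.insert({"sin", 0}); // illustrative entries only
  Map.insert({"cos", 1});
  return Map;
}

static int lookupRule(StringRef Name) {
  static const StringMap<int> Map = buildRuleMap(); // C++11 "magic static"
  return Map.lookup(Name); // value-initialized (0) when the name is absent
}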
@@ -961,8 +953,8 @@ Function *AMDGPULibFunc::getFunction(Module *M, const AMDGPULibFunc &fInfo) {
return nullptr;
}
-Function *AMDGPULibFunc::getOrInsertFunction(Module *M,
- const AMDGPULibFunc &fInfo) {
+FunctionCallee AMDGPULibFunc::getOrInsertFunction(Module *M,
+ const AMDGPULibFunc &fInfo) {
std::string const FuncName = fInfo.mangle();
Function *F = dyn_cast_or_null<Function>(
M->getValueSymbolTable().lookup(FuncName));
@@ -988,7 +980,7 @@ Function *AMDGPULibFunc::getOrInsertFunction(Module *M,
}
}
- Constant *C = nullptr;
+ FunctionCallee C;
if (hasPtr) {
// Do not set extra attributes for functions with pointer arguments.
C = M->getOrInsertFunction(FuncName, FuncTy);
@@ -1002,10 +994,18 @@ Function *AMDGPULibFunc::getOrInsertFunction(Module *M,
C = M->getOrInsertFunction(FuncName, FuncTy, Attr);
}
- return cast<Function>(C);
+ return C;
+}
+
+StringMap<unsigned> UnmangledFuncInfo::buildNameMap() {
+ StringMap<unsigned> Map;
+ for (unsigned I = 0; I != TableSize; ++I)
+ Map[Table[I].Name] = I;
+ return Map;
}
bool UnmangledFuncInfo::lookup(StringRef Name, ID &Id) {
+ static const StringMap<unsigned> Map = buildNameMap();
auto Loc = Map.find(Name);
if (Loc != Map.end()) {
Id = toFuncId(Loc->second);
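
getOrInsertFunction now hands back a FunctionCallee, which carries the callee value together with its FunctionType; Module::getOrInsertFunction can return a bitcast of an existing declaration with a different prototype, so the old cast<Function> was never fully safe. A hedged sketch of how a caller consumes the result (my_sqrt is a made-up name used only for illustration):

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
using namespace llvm;

static Value *emitLibCall(Module &M, IRBuilder<> &B, Value *Arg) {
  FunctionType *FTy =
      FunctionType::get(B.getFloatTy(), {B.getFloatTy()}, /*isVarArg=*/false);
  // FunctionCallee bundles the callee with FTy, so no cast<Function> is needed.
  FunctionCallee Callee = M.getOrInsertFunction("my_sqrt", FTy);
  return B.CreateCall(Callee, {Arg});
}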
diff --git a/lib/Target/AMDGPU/AMDGPULibFunc.h b/lib/Target/AMDGPU/AMDGPULibFunc.h
index fe062384800a..2354ed7df205 100644
--- a/lib/Target/AMDGPU/AMDGPULibFunc.h
+++ b/lib/Target/AMDGPU/AMDGPULibFunc.h
@@ -1,9 +1,8 @@
//===-- AMDGPULibFunc.h ----------------------------------------*- C++ -*--===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -394,8 +393,8 @@ public:
}
static Function *getFunction(llvm::Module *M, const AMDGPULibFunc &fInfo);
- static Function *getOrInsertFunction(llvm::Module *M,
- const AMDGPULibFunc &fInfo);
+ static FunctionCallee getOrInsertFunction(llvm::Module *M,
+ const AMDGPULibFunc &fInfo);
static bool parse(StringRef MangledName, AMDGPULibFunc &Ptr);
private:
diff --git a/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp b/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
index 2cec8fe53283..15032969890e 100644
--- a/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
+++ b/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
@@ -1,9 +1,8 @@
//===-- AMDGPULowerIntrinsics.cpp -----------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
index 743dc7a0d00b..5dd5b3691e0a 100644
--- a/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
+++ b/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
@@ -1,9 +1,8 @@
//===-- AMDGPULowerKernelArguments.cpp ------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -110,8 +109,9 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
// modes on SI to know the high bits are 0 so pointer adds don't wrap. We
// can't represent this with range metadata because it's only allowed for
// integer types.
- if (PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
- ST.getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS)
+ if ((PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
+ PT->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) &&
+ !ST.hasUsableDSOffset())
continue;
// FIXME: We can replace this with equivalent alias.scope/noalias
@@ -132,6 +132,7 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
KernArgBaseAlign);
Value *ArgPtr;
+ Type *AdjustedArgTy;
if (DoShiftOpt) { // FIXME: Handle aggregate types
// Since we don't have sub-dword scalar loads, avoid doing an extload by
// loading earlier than the argument address, and extracting the relevant
@@ -139,30 +140,27 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
//
// Additionally widen any sub-dword load to i32 even if suitably aligned,
// so that CSE between different argument loads works easily.
-
ArgPtr = Builder.CreateConstInBoundsGEP1_64(
- KernArgSegment,
- AlignDownOffset,
- Arg.getName() + ".kernarg.offset.align.down");
- ArgPtr = Builder.CreateBitCast(ArgPtr,
- Builder.getInt32Ty()->getPointerTo(AS),
- ArgPtr->getName() + ".cast");
+ Builder.getInt8Ty(), KernArgSegment, AlignDownOffset,
+ Arg.getName() + ".kernarg.offset.align.down");
+ AdjustedArgTy = Builder.getInt32Ty();
} else {
ArgPtr = Builder.CreateConstInBoundsGEP1_64(
- KernArgSegment,
- EltOffset,
- Arg.getName() + ".kernarg.offset");
- ArgPtr = Builder.CreateBitCast(ArgPtr, ArgTy->getPointerTo(AS),
- ArgPtr->getName() + ".cast");
+ Builder.getInt8Ty(), KernArgSegment, EltOffset,
+ Arg.getName() + ".kernarg.offset");
+ AdjustedArgTy = ArgTy;
}
if (IsV3 && Size >= 32) {
V4Ty = VectorType::get(VT->getVectorElementType(), 4);
// Use the hack that clang uses to avoid SelectionDAG ruining v3 loads
- ArgPtr = Builder.CreateBitCast(ArgPtr, V4Ty->getPointerTo(AS));
+ AdjustedArgTy = V4Ty;
}
- LoadInst *Load = Builder.CreateAlignedLoad(ArgPtr, AdjustedAlign);
+ ArgPtr = Builder.CreateBitCast(ArgPtr, AdjustedArgTy->getPointerTo(AS),
+ ArgPtr->getName() + ".cast");
+ LoadInst *Load =
+ Builder.CreateAlignedLoad(AdjustedArgTy, ArgPtr, AdjustedAlign);
Load->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(Ctx, {}));
MDBuilder MDB(Ctx);
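
The rewritten lowering computes the raw byte offset with an i8 GEP off the kernarg segment, bitcasts to a pointer to the type it actually wants (i32 for the aligned-down sub-dword case, the argument type otherwise), and then issues an explicitly typed aligned load. A condensed sketch of that sequence, assuming the builder, segment pointer, offset, and address space are set up as in the pass:

#include "llvm/IR/IRBuilder.h"
using namespace llvm;

static LoadInst *loadKernArg(IRBuilder<> &Builder, Value *KernArgSegment,
                             uint64_t EltOffset, Type *LoadTy, unsigned AS,
                             unsigned Align) {
  // Byte-granular offset first, then cast to the load type's pointer.
  Value *Ptr = Builder.CreateConstInBoundsGEP1_64(
      Builder.getInt8Ty(), KernArgSegment, EltOffset, "kernarg.offset");
  Ptr = Builder.CreateBitCast(Ptr, LoadTy->getPointerTo(AS), "kernarg.cast");
  return Builder.CreateAlignedLoad(LoadTy, Ptr, Align);
}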
diff --git a/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp b/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
index a43dcef4cf0b..00e12f808783 100644
--- a/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
+++ b/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
@@ -1,9 +1,8 @@
//===-- AMDGPULowerKernelAttributes.cpp ------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index f6bdbf5e9be2..ae4c32c258a7 100644
--- a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -1,9 +1,8 @@
//===- AMDGPUMCInstLower.cpp - Lower AMDGPU MachineInstr to an MCInst -----===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -16,7 +15,7 @@
#include "AMDGPUAsmPrinter.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
-#include "InstPrinter/AMDGPUInstPrinter.h"
+#include "MCTargetDesc/AMDGPUInstPrinter.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "R600AsmPrinter.h"
#include "SIInstrInfo.h"
@@ -91,6 +90,10 @@ static MCSymbolRefExpr::VariantKind getVariantKind(unsigned MOFlags) {
return MCSymbolRefExpr::VK_AMDGPU_REL32_LO;
case SIInstrInfo::MO_REL32_HI:
return MCSymbolRefExpr::VK_AMDGPU_REL32_HI;
+ case SIInstrInfo::MO_ABS32_LO:
+ return MCSymbolRefExpr::VK_AMDGPU_ABS32_LO;
+ case SIInstrInfo::MO_ABS32_HI:
+ return MCSymbolRefExpr::VK_AMDGPU_ABS32_HI;
}
}
@@ -101,17 +104,22 @@ const MCExpr *AMDGPUMCInstLower::getLongBranchBlockExpr(
= MCSymbolRefExpr::create(MO.getMBB()->getSymbol(), Ctx);
const MCExpr *SrcBBSym = MCSymbolRefExpr::create(SrcBB.getSymbol(), Ctx);
- assert(SrcBB.front().getOpcode() == AMDGPU::S_GETPC_B64 &&
- ST.getInstrInfo()->get(AMDGPU::S_GETPC_B64).Size == 4);
+ // FIXME: The first half of this assert should be removed. This should
+ // probably be PC relative instead of using the source block symbol, and
+ // therefore the indirect branch expansion should use a bundle.
+ assert(
+ skipDebugInstructionsForward(SrcBB.begin(), SrcBB.end())->getOpcode() ==
+ AMDGPU::S_GETPC_B64 &&
+ ST.getInstrInfo()->get(AMDGPU::S_GETPC_B64).Size == 4);
// s_getpc_b64 returns the address of next instruction.
const MCConstantExpr *One = MCConstantExpr::create(4, Ctx);
SrcBBSym = MCBinaryExpr::createAdd(SrcBBSym, One, Ctx);
- if (MO.getTargetFlags() == AMDGPU::TF_LONG_BRANCH_FORWARD)
+ if (MO.getTargetFlags() == SIInstrInfo::MO_LONG_BRANCH_FORWARD)
return MCBinaryExpr::createSub(DestBBSym, SrcBBSym, Ctx);
- assert(MO.getTargetFlags() == AMDGPU::TF_LONG_BRANCH_BACKWARD);
+ assert(MO.getTargetFlags() == SIInstrInfo::MO_LONG_BRANCH_BACKWARD);
return MCBinaryExpr::createSub(SrcBBSym, DestBBSym, Ctx);
}
@@ -142,10 +150,13 @@ bool AMDGPUMCInstLower::lowerOperand(const MachineOperand &MO,
SmallString<128> SymbolName;
AP.getNameWithPrefix(SymbolName, GV);
MCSymbol *Sym = Ctx.getOrCreateSymbol(SymbolName);
- const MCExpr *SymExpr =
+ const MCExpr *Expr =
MCSymbolRefExpr::create(Sym, getVariantKind(MO.getTargetFlags()),Ctx);
- const MCExpr *Expr = MCBinaryExpr::createAdd(SymExpr,
- MCConstantExpr::create(MO.getOffset(), Ctx), Ctx);
+ int64_t Offset = MO.getOffset();
+ if (Offset != 0) {
+ Expr = MCBinaryExpr::createAdd(Expr,
+ MCConstantExpr::create(Offset, Ctx), Ctx);
+ }
MCOp = MCOperand::createExpr(Expr);
return true;
}
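
The lowering only wraps the symbol reference in an add when the operand carries a nonzero offset, so the common case emits a bare symbol rather than sym+0. A minimal sketch of that MCExpr construction, assuming an MCContext and symbol are already available:

#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
using namespace llvm;

// Build either "Sym" or "Sym + Offset" as an MCExpr.
static const MCExpr *symbolPlusOffset(MCSymbol *Sym, int64_t Offset,
                                      MCContext &Ctx) {
  const MCExpr *Expr = MCSymbolRefExpr::create(Sym, Ctx);
  if (Offset != 0)
    Expr = MCBinaryExpr::createAdd(Expr, MCConstantExpr::create(Offset, Ctx),
                                   Ctx);
  return Expr;
}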
@@ -321,14 +332,13 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
}
#endif
- if (STI.dumpCode()) {
- // Disassemble instruction/operands to text.
+ if (DumpCodeInstEmitter) {
+ // Disassemble instruction/operands to text
DisasmLines.resize(DisasmLines.size() + 1);
std::string &DisasmLine = DisasmLines.back();
raw_string_ostream DisasmStream(DisasmLine);
- AMDGPUInstPrinter InstPrinter(*TM.getMCAsmInfo(),
- *STI.getInstrInfo(),
+ AMDGPUInstPrinter InstPrinter(*TM.getMCAsmInfo(), *STI.getInstrInfo(),
*STI.getRegisterInfo());
InstPrinter.printInst(&TmpInst, DisasmStream, StringRef(), STI);
@@ -337,10 +347,8 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
SmallVector<char, 16> CodeBytes;
raw_svector_ostream CodeStream(CodeBytes);
- auto &ObjStreamer = static_cast<MCObjectStreamer&>(*OutStreamer);
- MCCodeEmitter &InstEmitter = ObjStreamer.getAssembler().getEmitter();
- InstEmitter.encodeInstruction(TmpInst, CodeStream, Fixups,
- MF->getSubtarget<MCSubtargetInfo>());
+ DumpCodeInstEmitter->encodeInstruction(
+ TmpInst, CodeStream, Fixups, MF->getSubtarget<MCSubtargetInfo>());
HexLines.resize(HexLines.size() + 1);
std::string &HexLine = HexLines.back();
raw_string_ostream HexStream(HexLine);
diff --git a/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp b/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
index 6f44e2dbb2d5..237490957058 100644
--- a/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
+++ b/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
@@ -1,9 +1,8 @@
//===- AMDGPUMachineCFGStructurizer.cpp - Machine code if conversion pass. ===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
index 13b4b50149ce..0d3a1f1a769f 100644
--- a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -1,9 +1,8 @@
//===-- AMDGPUMachineFunctionInfo.cpp ---------------------------------------=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -30,13 +29,13 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
// except reserved size is not correctly aligned.
const Function &F = MF.getFunction();
- if (auto *Resolver = MF.getMMI().getResolver()) {
- if (AMDGPUPerfHintAnalysis *PHA = static_cast<AMDGPUPerfHintAnalysis*>(
- Resolver->getAnalysisIfAvailable(&AMDGPUPerfHintAnalysisID, true))) {
- MemoryBound = PHA->isMemoryBound(&F);
- WaveLimiter = PHA->needsWaveLimiter(&F);
- }
- }
+ Attribute MemBoundAttr = F.getFnAttribute("amdgpu-memory-bound");
+ MemoryBound = MemBoundAttr.isStringAttribute() &&
+ MemBoundAttr.getValueAsString() == "true";
+
+ Attribute WaveLimitAttr = F.getFnAttribute("amdgpu-wave-limiter");
+ WaveLimiter = WaveLimitAttr.isStringAttribute() &&
+ WaveLimitAttr.getValueAsString() == "true";
CallingConv::ID CC = F.getCallingConv();
if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL)
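
Rather than querying the perf-hint analysis through the MMI resolver, the machine-function info now reads the results the analysis left behind as string function attributes. A small sketch of that attribute-reading idiom (the attribute names are the ones used in this patch):

#include "llvm/ADT/StringRef.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
using namespace llvm;

static bool hasTrueStringAttr(const Function &F, StringRef Name) {
  Attribute A = F.getFnAttribute(Name);
  return A.isStringAttribute() && A.getValueAsString() == "true";
}

// Usage:
//   bool MemoryBound = hasTrueStringAttr(F, "amdgpu-memory-bound");
//   bool WaveLimiter = hasTrueStringAttr(F, "amdgpu-wave-limiter");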
diff --git a/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/lib/Target/AMDGPU/AMDGPUMachineFunction.h
index 8d6b871bc03e..52987e2fa411 100644
--- a/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -1,9 +1,8 @@
//===-- AMDGPUMachineFunctionInfo.h -------------------------------*- C++ -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp b/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp
index 7b9f673c418c..4d9f08b3af01 100644
--- a/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp
@@ -1,9 +1,8 @@
//===--- AMDGPUMachineModuleInfo.cpp ----------------------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -24,6 +23,16 @@ AMDGPUMachineModuleInfo::AMDGPUMachineModuleInfo(const MachineModuleInfo &MMI)
AgentSSID = CTX.getOrInsertSyncScopeID("agent");
WorkgroupSSID = CTX.getOrInsertSyncScopeID("workgroup");
WavefrontSSID = CTX.getOrInsertSyncScopeID("wavefront");
+ SystemOneAddressSpaceSSID =
+ CTX.getOrInsertSyncScopeID("one-as");
+ AgentOneAddressSpaceSSID =
+ CTX.getOrInsertSyncScopeID("agent-one-as");
+ WorkgroupOneAddressSpaceSSID =
+ CTX.getOrInsertSyncScopeID("workgroup-one-as");
+ WavefrontOneAddressSpaceSSID =
+ CTX.getOrInsertSyncScopeID("wavefront-one-as");
+ SingleThreadOneAddressSpaceSSID =
+ CTX.getOrInsertSyncScopeID("singlethread-one-as");
}
} // end namespace llvm
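
Registering the "-one-as" names with the LLVMContext lets IR producers attach these single-address-space scopes to atomic operations and fences. A hedged sketch of emitting IR that uses one of them:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;

static void emitAgentOneAsFence(IRBuilder<> &B) {
  LLVMContext &Ctx = B.getContext();
  // Repeated registration returns the same ID, so this is safe to call here.
  SyncScope::ID AgentOneAs = Ctx.getOrInsertSyncScopeID("agent-one-as");
  // Textually: fence syncscope("agent-one-as") seq_cst
  B.CreateFence(AtomicOrdering::SequentiallyConsistent, AgentOneAs);
}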
diff --git a/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h b/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h
index 1219ab26fb69..2b0b8b42acfe 100644
--- a/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h
@@ -1,9 +1,8 @@
//===--- AMDGPUMachineModuleInfo.h ------------------------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -30,12 +29,22 @@ private:
// All supported memory/synchronization scopes can be found here:
// http://llvm.org/docs/AMDGPUUsage.html#memory-scopes
- /// Agent synchronization scope ID.
+ /// Agent synchronization scope ID (cross address space).
SyncScope::ID AgentSSID;
- /// Workgroup synchronization scope ID.
+ /// Workgroup synchronization scope ID (cross address space).
SyncScope::ID WorkgroupSSID;
- /// Wavefront synchronization scope ID.
+ /// Wavefront synchronization scope ID (cross address space).
SyncScope::ID WavefrontSSID;
+ /// System synchronization scope ID (single address space).
+ SyncScope::ID SystemOneAddressSpaceSSID;
+ /// Agent synchronization scope ID (single address space).
+ SyncScope::ID AgentOneAddressSpaceSSID;
+ /// Workgroup synchronization scope ID (single address space).
+ SyncScope::ID WorkgroupOneAddressSpaceSSID;
+ /// Wavefront synchronization scope ID (single address space).
+ SyncScope::ID WavefrontOneAddressSpaceSSID;
+ /// Single thread synchronization scope ID (single address space).
+ SyncScope::ID SingleThreadOneAddressSpaceSSID;
/// In AMDGPU target synchronization scopes are inclusive, meaning a
/// larger synchronization scope is inclusive of a smaller synchronization
@@ -44,35 +53,70 @@ private:
/// \returns \p SSID's inclusion ordering, or "None" if \p SSID is not
/// supported by the AMDGPU target.
Optional<uint8_t> getSyncScopeInclusionOrdering(SyncScope::ID SSID) const {
- if (SSID == SyncScope::SingleThread)
+ if (SSID == SyncScope::SingleThread ||
+ SSID == getSingleThreadOneAddressSpaceSSID())
return 0;
- else if (SSID == getWavefrontSSID())
+ else if (SSID == getWavefrontSSID() ||
+ SSID == getWavefrontOneAddressSpaceSSID())
return 1;
- else if (SSID == getWorkgroupSSID())
+ else if (SSID == getWorkgroupSSID() ||
+ SSID == getWorkgroupOneAddressSpaceSSID())
return 2;
- else if (SSID == getAgentSSID())
+ else if (SSID == getAgentSSID() ||
+ SSID == getAgentOneAddressSpaceSSID())
return 3;
- else if (SSID == SyncScope::System)
+ else if (SSID == SyncScope::System ||
+ SSID == getSystemOneAddressSpaceSSID())
return 4;
return None;
}
+ /// \returns True if \p SSID is restricted to a single address space, false
+ /// otherwise.
+ bool isOneAddressSpace(SyncScope::ID SSID) const {
+ return SSID == getSingleThreadOneAddressSpaceSSID() ||
+ SSID == getWavefrontOneAddressSpaceSSID() ||
+ SSID == getWorkgroupOneAddressSpaceSSID() ||
+ SSID == getAgentOneAddressSpaceSSID() ||
+ SSID == getSystemOneAddressSpaceSSID();
+ }
+
public:
AMDGPUMachineModuleInfo(const MachineModuleInfo &MMI);
- /// \returns Agent synchronization scope ID.
+ /// \returns Agent synchronization scope ID (cross address space).
SyncScope::ID getAgentSSID() const {
return AgentSSID;
}
- /// \returns Workgroup synchronization scope ID.
+ /// \returns Workgroup synchronization scope ID (cross address space).
SyncScope::ID getWorkgroupSSID() const {
return WorkgroupSSID;
}
- /// \returns Wavefront synchronization scope ID.
+ /// \returns Wavefront synchronization scope ID (cross address space).
SyncScope::ID getWavefrontSSID() const {
return WavefrontSSID;
}
+ /// \returns System synchronization scope ID (single address space).
+ SyncScope::ID getSystemOneAddressSpaceSSID() const {
+ return SystemOneAddressSpaceSSID;
+ }
+ /// \returns Agent synchronization scope ID (single address space).
+ SyncScope::ID getAgentOneAddressSpaceSSID() const {
+ return AgentOneAddressSpaceSSID;
+ }
+ /// \returns Workgroup synchronization scope ID (single address space).
+ SyncScope::ID getWorkgroupOneAddressSpaceSSID() const {
+ return WorkgroupOneAddressSpaceSSID;
+ }
+ /// \returns Wavefront synchronization scope ID (single address space).
+ SyncScope::ID getWavefrontOneAddressSpaceSSID() const {
+ return WavefrontOneAddressSpaceSSID;
+ }
+ /// \returns Single thread synchronization scope ID (single address space).
+ SyncScope::ID getSingleThreadOneAddressSpaceSSID() const {
+ return SingleThreadOneAddressSpaceSSID;
+ }
/// In AMDGPU target synchronization scopes are inclusive, meaning a
/// larger synchronization scope is inclusive of a smaller synchronization
@@ -88,7 +132,11 @@ public:
if (!AIO || !BIO)
return None;
- return AIO.getValue() > BIO.getValue();
+ bool IsAOneAddressSpace = isOneAddressSpace(A);
+ bool IsBOneAddressSpace = isOneAddressSpace(B);
+
+ return AIO.getValue() >= BIO.getValue() &&
+ (IsAOneAddressSpace == IsBOneAddressSpace || !IsAOneAddressSpace);
}
};
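
The strengthened check says scope A includes scope B only when A's ordering is at least B's and, if A is a single-address-space ("one-as") scope, B is one as well; a cross-address-space scope may still include a single-address-space one. A small standalone sketch of the rule, with the orderings written as plain integers (singlethread=0, wavefront=1, workgroup=2, agent=3, system=4):

// Detached restatement of the inclusion predicate above.
static bool isInclusion(unsigned AOrder, bool AIsOneAS,
                        unsigned BOrder, bool BIsOneAS) {
  return AOrder >= BOrder && (AIsOneAS == BIsOneAS || !AIsOneAS);
}

// Examples:
//   isInclusion(3, false, 2, true)  -> true   ("agent" includes "workgroup-one-as")
//   isInclusion(3, true,  2, false) -> false  ("agent-one-as" does not include "workgroup")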
diff --git a/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp b/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp
index 5e0b7d429022..8c11230f411a 100644
--- a/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp
+++ b/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp
@@ -1,9 +1,8 @@
//===--- AMDGPUMacroFusion.cpp - AMDGPU Macro Fusion ----------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/AMDGPUMacroFusion.h b/lib/Target/AMDGPU/AMDGPUMacroFusion.h
index 844958580a65..da4b3cf8bc24 100644
--- a/lib/Target/AMDGPU/AMDGPUMacroFusion.h
+++ b/lib/Target/AMDGPU/AMDGPUMacroFusion.h
@@ -1,9 +1,8 @@
//===- AMDGPUMacroFusion.h - AMDGPU Macro Fusion ----------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp b/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp
index 7bd8533a0ccf..f7231471c107 100644
--- a/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp
@@ -1,9 +1,8 @@
//===- AMDGPUOpenCLEnqueuedBlockLowering.cpp - Lower enqueued block -------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -120,11 +119,11 @@ bool AMDGPUOpenCLEnqueuedBlockLowering::runOnModule(Module &M) {
auto T = ArrayType::get(Type::getInt64Ty(C), 2);
auto *GV = new GlobalVariable(
M, T,
- /*IsConstant=*/false, GlobalValue::ExternalLinkage,
+ /*isConstant=*/false, GlobalValue::ExternalLinkage,
/*Initializer=*/Constant::getNullValue(T), RuntimeHandle,
/*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
AMDGPUAS::GLOBAL_ADDRESS,
- /*IsExternallyInitialized=*/false);
+ /*isExternallyInitialized=*/false);
LLVM_DEBUG(dbgs() << "runtime handle created: " << *GV << '\n');
for (auto U : F.users()) {
diff --git a/lib/Target/AMDGPU/AMDGPUPTNote.h b/lib/Target/AMDGPU/AMDGPUPTNote.h
index 2feff14d34a1..8b69f51c1a0d 100644
--- a/lib/Target/AMDGPU/AMDGPUPTNote.h
+++ b/lib/Target/AMDGPU/AMDGPUPTNote.h
@@ -1,9 +1,8 @@
//===-- AMDGPUNoteType.h - AMDGPU ELF PT_NOTE section info-------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp b/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
index e53a8fe7c074..9613d5a843b3 100644
--- a/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
+++ b/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
@@ -1,9 +1,8 @@
//===- AMDGPUPerfHintAnalysis.cpp - analysis of functions memory traffic --===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -18,6 +17,7 @@
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetPassConfig.h"
@@ -72,7 +72,7 @@ public:
const TargetLowering *TLI_)
: FIM(FIM_), DL(nullptr), TLI(TLI_) {}
- void runOnFunction(Function &F);
+ bool runOnFunction(Function &F);
private:
struct MemAccessInfo {
@@ -101,7 +101,7 @@ private:
const TargetLowering *TLI;
- void visit(const Function &F);
+ AMDGPUPerfHintAnalysis::FuncInfo *visit(const Function &F);
static bool isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &F);
static bool needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &F);
@@ -203,12 +203,8 @@ bool AMDGPUPerfHint::isIndirectAccess(const Instruction *Inst) const {
return false;
}
-void AMDGPUPerfHint::visit(const Function &F) {
- auto FIP = FIM.insert(std::make_pair(&F, AMDGPUPerfHintAnalysis::FuncInfo()));
- if (!FIP.second)
- return;
-
- AMDGPUPerfHintAnalysis::FuncInfo &FI = FIP.first->second;
+AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) {
+ AMDGPUPerfHintAnalysis::FuncInfo &FI = FIM[&F];
LLVM_DEBUG(dbgs() << "[AMDGPUPerfHint] process " << F.getName() << '\n');
@@ -234,10 +230,10 @@ void AMDGPUPerfHint::visit(const Function &F) {
if (&F == Callee) // Handle immediate recursion
continue;
- visit(*Callee);
auto Loc = FIM.find(Callee);
+ if (Loc == FIM.end())
+ continue;
- assert(Loc != FIM.end() && "No func info");
FI.MemInstCount += Loc->second.MemInstCount;
FI.InstCount += Loc->second.InstCount;
FI.IAMInstCount += Loc->second.IAMInstCount;
@@ -257,36 +253,39 @@ void AMDGPUPerfHint::visit(const Function &F) {
}
}
}
-}
-void AMDGPUPerfHint::runOnFunction(Function &F) {
- if (FIM.find(&F) != FIM.end())
- return;
+ return &FI;
+}
+bool AMDGPUPerfHint::runOnFunction(Function &F) {
const Module &M = *F.getParent();
DL = &M.getDataLayout();
- visit(F);
- auto Loc = FIM.find(&F);
+ if (F.hasFnAttribute("amdgpu-wave-limiter") &&
+ F.hasFnAttribute("amdgpu-memory-bound"))
+ return false;
+
+ const AMDGPUPerfHintAnalysis::FuncInfo *Info = visit(F);
- assert(Loc != FIM.end() && "No func info");
- LLVM_DEBUG(dbgs() << F.getName() << " MemInst: " << Loc->second.MemInstCount
+ LLVM_DEBUG(dbgs() << F.getName() << " MemInst: " << Info->MemInstCount
<< '\n'
- << " IAMInst: " << Loc->second.IAMInstCount << '\n'
- << " LSMInst: " << Loc->second.LSMInstCount << '\n'
- << " TotalInst: " << Loc->second.InstCount << '\n');
-
- auto &FI = Loc->second;
+ << " IAMInst: " << Info->IAMInstCount << '\n'
+ << " LSMInst: " << Info->LSMInstCount << '\n'
+ << " TotalInst: " << Info->InstCount << '\n');
- if (isMemBound(FI)) {
+ if (isMemBound(*Info)) {
LLVM_DEBUG(dbgs() << F.getName() << " is memory bound\n");
NumMemBound++;
+ F.addFnAttr("amdgpu-memory-bound", "true");
}
- if (AMDGPU::isEntryFunctionCC(F.getCallingConv()) && needLimitWave(FI)) {
+ if (AMDGPU::isEntryFunctionCC(F.getCallingConv()) && needLimitWave(*Info)) {
LLVM_DEBUG(dbgs() << F.getName() << " needs limit wave\n");
NumLimitWave++;
+ F.addFnAttr("amdgpu-wave-limiter", "true");
}
+
+ return true;
}
bool AMDGPUPerfHint::isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
@@ -365,17 +364,27 @@ bool AMDGPUPerfHint::MemAccessInfo::isLargeStride(
}
} // namespace
-bool AMDGPUPerfHintAnalysis::runOnFunction(Function &F) {
+bool AMDGPUPerfHintAnalysis::runOnSCC(CallGraphSCC &SCC) {
auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
if (!TPC)
return false;
const TargetMachine &TM = TPC->getTM<TargetMachine>();
- const TargetSubtargetInfo *ST = TM.getSubtargetImpl(F);
- AMDGPUPerfHint Analyzer(FIM, ST->getTargetLowering());
- Analyzer.runOnFunction(F);
- return false;
+ bool Changed = false;
+ for (CallGraphNode *I : SCC) {
+ Function *F = I->getFunction();
+ if (!F || F->isDeclaration())
+ continue;
+
+ const TargetSubtargetInfo *ST = TM.getSubtargetImpl(*F);
+ AMDGPUPerfHint Analyzer(FIM, ST->getTargetLowering());
+
+ if (Analyzer.runOnFunction(*F))
+ Changed = true;
+ }
+
+ return Changed;
}
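
Because a CallGraphSCCPass visits SCCs bottom-up, callees outside the current SCC have already been analyzed by the time their callers are processed, which is why the caller can simply look up the callee's FuncInfo instead of recursing into it. A skeleton of that traversal, as a sketch only:

#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/IR/Function.h"
using namespace llvm;

static bool processSCC(CallGraphSCC &SCC) {
  bool Changed = false;
  for (CallGraphNode *Node : SCC) {
    Function *F = Node->getFunction();
    if (!F || F->isDeclaration())
      continue; // external or indirect call graph node
    // ... analyze *F here; callees in earlier SCCs were handled already ...
    Changed = true; // illustrative; the real pass asks the analyzer
  }
  return Changed;
}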
bool AMDGPUPerfHintAnalysis::isMemoryBound(const Function *F) const {
diff --git a/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h b/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h
index be7f37cb6815..9599e09fbd96 100644
--- a/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h
+++ b/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h
@@ -1,9 +1,8 @@
-//===- AMDGPUPerfHintAnalysis.h - analysis of functions memory traffic ----===//
+//===- AMDGPUPerfHintAnalysis.h ---- analysis of memory traffic -*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -15,18 +14,20 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_MDGPUPERFHINTANALYSIS_H
#define LLVM_LIB_TARGET_AMDGPU_MDGPUPERFHINTANALYSIS_H
+
+#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/IR/ValueMap.h"
#include "llvm/Pass.h"
namespace llvm {
-struct AMDGPUPerfHintAnalysis : public FunctionPass {
+struct AMDGPUPerfHintAnalysis : public CallGraphSCCPass {
static char ID;
public:
- AMDGPUPerfHintAnalysis() : FunctionPass(ID) {}
+ AMDGPUPerfHintAnalysis() : CallGraphSCCPass(ID) {}
- bool runOnFunction(Function &F) override;
+ bool runOnSCC(CallGraphSCC &SCC) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesAll();
diff --git a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 5d087c099184..e4c9d6685d4a 100644
--- a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -1,9 +1,8 @@
//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -163,12 +162,16 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
bool SufficientLDS = hasSufficientLocalMem(F);
bool Changed = false;
BasicBlock &EntryBB = *F.begin();
- for (auto I = EntryBB.begin(), E = EntryBB.end(); I != E; ) {
- AllocaInst *AI = dyn_cast<AllocaInst>(I);
- ++I;
- if (AI)
- Changed |= handleAlloca(*AI, SufficientLDS);
+ SmallVector<AllocaInst *, 16> Allocas;
+ for (Instruction &I : EntryBB) {
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(&I))
+ Allocas.push_back(AI);
+ }
+
+ for (AllocaInst *AI : Allocas) {
+ if (handleAlloca(*AI, SufficientLDS))
+ Changed = true;
}
return Changed;
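
The loop now snapshots the entry block's allocas before calling handleAlloca, since handleAlloca can insert and erase instructions and would otherwise invalidate the iterator being advanced. The general collect-then-mutate idiom, as a sketch:

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

static void forEachEntryAlloca(BasicBlock &EntryBB,
                               function_ref<void(AllocaInst &)> Fn) {
  SmallVector<AllocaInst *, 16> Allocas;
  for (Instruction &I : EntryBB)
    if (auto *AI = dyn_cast<AllocaInst>(&I))
      Allocas.push_back(AI);
  for (AllocaInst *AI : Allocas)
    Fn(*AI); // safe even if Fn rewrites the block
}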
@@ -245,11 +248,11 @@ AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
// We could do a single 64-bit load here, but it's likely that the basic
// 32-bit and extract sequence is already present, and it is probably easier
// to CSE this. The loads should be mergable later anyway.
- Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 1);
- LoadInst *LoadXY = Builder.CreateAlignedLoad(GEPXY, 4);
+ Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(I32Ty, CastDispatchPtr, 1);
+ LoadInst *LoadXY = Builder.CreateAlignedLoad(I32Ty, GEPXY, 4);
- Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 2);
- LoadInst *LoadZU = Builder.CreateAlignedLoad(GEPZU, 4);
+ Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(I32Ty, CastDispatchPtr, 2);
+ LoadInst *LoadZU = Builder.CreateAlignedLoad(I32Ty, GEPZU, 4);
MDNode *MD = MDNode::get(Mod->getContext(), None);
LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
@@ -427,7 +430,7 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
- Value *VecValue = Builder.CreateLoad(BitCast);
+ Value *VecValue = Builder.CreateLoad(VectorTy, BitCast);
Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
Inst->replaceAllUsesWith(ExtractElement);
Inst->eraseFromParent();
@@ -442,7 +445,7 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
Value *Ptr = SI->getPointerOperand();
Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
- Value *VecValue = Builder.CreateLoad(BitCast);
+ Value *VecValue = Builder.CreateLoad(VectorTy, BitCast);
Value *NewVecValue = Builder.CreateInsertElement(VecValue,
SI->getValueOperand(),
Index);
@@ -919,7 +922,8 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
);
CallInst *NewCall = Builder.CreateCall(
- ObjectSize, {Src, Intr->getOperand(1), Intr->getOperand(2)});
+ ObjectSize,
+ {Src, Intr->getOperand(1), Intr->getOperand(2), Intr->getOperand(3)});
Intr->replaceAllUsesWith(NewCall);
Intr->eraseFromParent();
continue;
diff --git a/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp b/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp
new file mode 100644
index 000000000000..7a7addd0f5cf
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp
@@ -0,0 +1,336 @@
+//===--- AMDGPUPropagateAttributes.cpp --------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief This pass propagates attributes from kernels to the non-entry
+/// functions. Most of the library functions were not compiled for a specific
+/// ABI, yet they will be compiled correctly if the proper attributes are
+/// propagated from the caller.
+///
+/// The pass analyzes the call graph and propagates ABI target features
+/// through it.
+///
+/// It can run in two modes: as a function pass or as a module pass. The
+/// function pass simply propagates attributes. The module pass clones
+/// functions if there are callers with different ABIs. If a function is
+/// cloned, all call sites are updated to use the correct clone.
+///
+/// The function pass is limited in functionality but can run early in the
+/// pipeline. The module pass is more powerful but has to run late, so it
+/// misses library folding opportunities.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include <string>
+
+#define DEBUG_TYPE "amdgpu-propagate-attributes"
+
+using namespace llvm;
+
+namespace llvm {
+extern const SubtargetFeatureKV AMDGPUFeatureKV[AMDGPU::NumSubtargetFeatures-1];
+}
+
+namespace {
+
+class AMDGPUPropagateAttributes {
+ const FeatureBitset TargetFeatures = {
+ AMDGPU::FeatureWavefrontSize16,
+ AMDGPU::FeatureWavefrontSize32,
+ AMDGPU::FeatureWavefrontSize64
+ };
+
+ class Clone{
+ public:
+ Clone(FeatureBitset FeatureMask, Function *OrigF, Function *NewF) :
+ FeatureMask(FeatureMask), OrigF(OrigF), NewF(NewF) {}
+
+ FeatureBitset FeatureMask;
+ Function *OrigF;
+ Function *NewF;
+ };
+
+ const TargetMachine *TM;
+
+ // Clone functions as needed or just set attributes.
+ bool AllowClone;
+
+ // Option propagation roots.
+ SmallSet<Function *, 32> Roots;
+
+ // Clones of functions with their attributes.
+ SmallVector<Clone, 32> Clones;
+
+ // Find a clone with required features.
+ Function *findFunction(const FeatureBitset &FeaturesNeeded,
+ Function *OrigF);
+
+ // Clone function F and set NewFeatures on the clone.
+ // The clone takes the name of the original function.
+ Function *cloneWithFeatures(Function &F,
+ const FeatureBitset &NewFeatures);
+
+ // Set new function's features in place.
+ void setFeatures(Function &F, const FeatureBitset &NewFeatures);
+
+ std::string getFeatureString(const FeatureBitset &Features) const;
+
+ // Propagate attributes from Roots.
+ bool process();
+
+public:
+ AMDGPUPropagateAttributes(const TargetMachine *TM, bool AllowClone) :
+ TM(TM), AllowClone(AllowClone) {}
+
+ // Use F as a root and propagate its attributes.
+ bool process(Function &F);
+
+ // Propagate attributes starting from kernel functions.
+ bool process(Module &M);
+};
+
+// Allows attributes to be propagated early, but no cloning is allowed, as it
+// must be a function pass to run before any optimizations.
+// TODO: We should only need one instance of the module pass, but that needs
+// to be in the linker pipeline, which is currently not possible.
+class AMDGPUPropagateAttributesEarly : public FunctionPass {
+ const TargetMachine *TM;
+
+public:
+ static char ID; // Pass identification
+
+ AMDGPUPropagateAttributesEarly(const TargetMachine *TM = nullptr) :
+ FunctionPass(ID), TM(TM) {
+ initializeAMDGPUPropagateAttributesEarlyPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+};
+
+// Allows attributes to be propagated with cloning, but does so late in the
+// pipeline.
+class AMDGPUPropagateAttributesLate : public ModulePass {
+ const TargetMachine *TM;
+
+public:
+ static char ID; // Pass identification
+
+ AMDGPUPropagateAttributesLate(const TargetMachine *TM = nullptr) :
+ ModulePass(ID), TM(TM) {
+ initializeAMDGPUPropagateAttributesLatePass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override;
+};
+
+} // end anonymous namespace.
+
+char AMDGPUPropagateAttributesEarly::ID = 0;
+char AMDGPUPropagateAttributesLate::ID = 0;
+
+INITIALIZE_PASS(AMDGPUPropagateAttributesEarly,
+ "amdgpu-propagate-attributes-early",
+ "Early propagate attributes from kernels to functions",
+ false, false)
+INITIALIZE_PASS(AMDGPUPropagateAttributesLate,
+ "amdgpu-propagate-attributes-late",
+ "Late propagate attributes from kernels to functions",
+ false, false)
+
+Function *
+AMDGPUPropagateAttributes::findFunction(const FeatureBitset &FeaturesNeeded,
+ Function *OrigF) {
+ // TODO: search for clone's clones.
+ for (Clone &C : Clones)
+ if (C.OrigF == OrigF && FeaturesNeeded == C.FeatureMask)
+ return C.NewF;
+
+ return nullptr;
+}
+
+bool AMDGPUPropagateAttributes::process(Module &M) {
+ for (auto &F : M.functions())
+ if (AMDGPU::isEntryFunctionCC(F.getCallingConv()))
+ Roots.insert(&F);
+
+ return process();
+}
+
+bool AMDGPUPropagateAttributes::process(Function &F) {
+ Roots.insert(&F);
+ return process();
+}
+
+bool AMDGPUPropagateAttributes::process() {
+ bool Changed = false;
+ SmallSet<Function *, 32> NewRoots;
+ SmallSet<Function *, 32> Replaced;
+
+ if (Roots.empty())
+ return false;
+ Module &M = *(*Roots.begin())->getParent();
+
+ do {
+ Roots.insert(NewRoots.begin(), NewRoots.end());
+ NewRoots.clear();
+
+ for (auto &F : M.functions()) {
+ if (F.isDeclaration() || Roots.count(&F))
+ continue;
+
+ const FeatureBitset &CalleeBits =
+ TM->getSubtargetImpl(F)->getFeatureBits();
+ SmallVector<std::pair<CallBase *, Function *>, 32> ToReplace;
+
+ for (User *U : F.users()) {
+ Instruction *I = dyn_cast<Instruction>(U);
+ if (!I)
+ continue;
+ CallBase *CI = dyn_cast<CallBase>(I);
+ if (!CI)
+ continue;
+ Function *Caller = CI->getCaller();
+ if (!Caller)
+ continue;
+ if (!Roots.count(Caller))
+ continue;
+
+ const FeatureBitset &CallerBits =
+ TM->getSubtargetImpl(*Caller)->getFeatureBits() & TargetFeatures;
+
+ if (CallerBits == (CalleeBits & TargetFeatures)) {
+ NewRoots.insert(&F);
+ continue;
+ }
+
+ Function *NewF = findFunction(CallerBits, &F);
+ if (!NewF) {
+ FeatureBitset NewFeatures((CalleeBits & ~TargetFeatures) |
+ CallerBits);
+ if (!AllowClone) {
+ // This may set different features on different iterations if
+ // there is a contradiction in callers' attributes. In this case
+ // we rely on a second pass running on Module, which is allowed
+ // to clone.
+ setFeatures(F, NewFeatures);
+ NewRoots.insert(&F);
+ Changed = true;
+ break;
+ }
+
+ NewF = cloneWithFeatures(F, NewFeatures);
+ Clones.push_back(Clone(CallerBits, &F, NewF));
+ NewRoots.insert(NewF);
+ }
+
+ ToReplace.push_back(std::make_pair(CI, NewF));
+ Replaced.insert(&F);
+
+ Changed = true;
+ }
+
+ while (!ToReplace.empty()) {
+ auto R = ToReplace.pop_back_val();
+ R.first->setCalledFunction(R.second);
+ }
+ }
+ } while (!NewRoots.empty());
+
+ for (Function *F : Replaced) {
+ if (F->use_empty())
+ F->eraseFromParent();
+ }
+
+ return Changed;
+}
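
When no suitable clone exists, the new feature set keeps everything the callee already had except the tracked wavefront-size bits, which are replaced by the caller's; note that CallerBits is already masked with TargetFeatures at that point. A small sketch of that merge in isolation:

#include "llvm/MC/SubtargetFeature.h"
using namespace llvm;

// CallerBits is assumed to be pre-masked with TargetFeatures, as in the pass.
static FeatureBitset mergeFeatures(const FeatureBitset &CalleeBits,
                                   const FeatureBitset &CallerBits,
                                   const FeatureBitset &TargetFeatures) {
  FeatureBitset Merged((CalleeBits & ~TargetFeatures) | CallerBits);
  return Merged;
}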
+
+Function *
+AMDGPUPropagateAttributes::cloneWithFeatures(Function &F,
+ const FeatureBitset &NewFeatures) {
+ LLVM_DEBUG(dbgs() << "Cloning " << F.getName() << '\n');
+
+ ValueToValueMapTy dummy;
+ Function *NewF = CloneFunction(&F, dummy);
+ setFeatures(*NewF, NewFeatures);
+
+ // Swap names. If this is the only clone, it will retain the name of the
+ // now-dead value.
+ if (F.hasName()) {
+ std::string NewName = NewF->getName();
+ NewF->takeName(&F);
+ F.setName(NewName);
+
+ // The name has changed, so it no longer needs an external symbol.
+ F.setVisibility(GlobalValue::DefaultVisibility);
+ F.setLinkage(GlobalValue::InternalLinkage);
+ }
+
+ return NewF;
+}
+
+void AMDGPUPropagateAttributes::setFeatures(Function &F,
+ const FeatureBitset &NewFeatures) {
+ std::string NewFeatureStr = getFeatureString(NewFeatures);
+
+ LLVM_DEBUG(dbgs() << "Set features "
+ << getFeatureString(NewFeatures & TargetFeatures)
+ << " on " << F.getName() << '\n');
+
+ F.removeFnAttr("target-features");
+ F.addFnAttr("target-features", NewFeatureStr);
+}
+
+std::string
+AMDGPUPropagateAttributes::getFeatureString(const FeatureBitset &Features) const
+{
+ std::string Ret;
+ for (const SubtargetFeatureKV &KV : AMDGPUFeatureKV) {
+ if (Features[KV.Value])
+ Ret += (StringRef("+") + KV.Key + ",").str();
+ else if (TargetFeatures[KV.Value])
+ Ret += (StringRef("-") + KV.Key + ",").str();
+ }
+ Ret.pop_back(); // Remove last comma.
+ return Ret;
+}
+
+bool AMDGPUPropagateAttributesEarly::runOnFunction(Function &F) {
+ if (!TM || !AMDGPU::isEntryFunctionCC(F.getCallingConv()))
+ return false;
+
+ return AMDGPUPropagateAttributes(TM, false).process(F);
+}
+
+bool AMDGPUPropagateAttributesLate::runOnModule(Module &M) {
+ if (!TM)
+ return false;
+
+ return AMDGPUPropagateAttributes(TM, true).process(M);
+}
+
+FunctionPass
+*llvm::createAMDGPUPropagateAttributesEarlyPass(const TargetMachine *TM) {
+ return new AMDGPUPropagateAttributesEarly(TM);
+}
+
+ModulePass
+*llvm::createAMDGPUPropagateAttributesLatePass(const TargetMachine *TM) {
+ return new AMDGPUPropagateAttributesLate(TM);
+}
diff --git a/lib/Target/AMDGPU/AMDGPURegAsmNames.inc.cpp b/lib/Target/AMDGPU/AMDGPURegAsmNames.inc.cpp
deleted file mode 100644
index 36d88f52910d..000000000000
--- a/lib/Target/AMDGPU/AMDGPURegAsmNames.inc.cpp
+++ /dev/null
@@ -1,353 +0,0 @@
-//===-- AMDGPURegAsmNames.inc - Register asm names ----------*- C++ -*-----===//
-
-#ifdef AMDGPU_REG_ASM_NAMES
-
-static const char *const VGPR32RegNames[] = {
- "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8",
- "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17",
- "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
- "v27", "v28", "v29", "v30", "v31", "v32", "v33", "v34", "v35",
- "v36", "v37", "v38", "v39", "v40", "v41", "v42", "v43", "v44",
- "v45", "v46", "v47", "v48", "v49", "v50", "v51", "v52", "v53",
- "v54", "v55", "v56", "v57", "v58", "v59", "v60", "v61", "v62",
- "v63", "v64", "v65", "v66", "v67", "v68", "v69", "v70", "v71",
- "v72", "v73", "v74", "v75", "v76", "v77", "v78", "v79", "v80",
- "v81", "v82", "v83", "v84", "v85", "v86", "v87", "v88", "v89",
- "v90", "v91", "v92", "v93", "v94", "v95", "v96", "v97", "v98",
- "v99", "v100", "v101", "v102", "v103", "v104", "v105", "v106", "v107",
- "v108", "v109", "v110", "v111", "v112", "v113", "v114", "v115", "v116",
- "v117", "v118", "v119", "v120", "v121", "v122", "v123", "v124", "v125",
- "v126", "v127", "v128", "v129", "v130", "v131", "v132", "v133", "v134",
- "v135", "v136", "v137", "v138", "v139", "v140", "v141", "v142", "v143",
- "v144", "v145", "v146", "v147", "v148", "v149", "v150", "v151", "v152",
- "v153", "v154", "v155", "v156", "v157", "v158", "v159", "v160", "v161",
- "v162", "v163", "v164", "v165", "v166", "v167", "v168", "v169", "v170",
- "v171", "v172", "v173", "v174", "v175", "v176", "v177", "v178", "v179",
- "v180", "v181", "v182", "v183", "v184", "v185", "v186", "v187", "v188",
- "v189", "v190", "v191", "v192", "v193", "v194", "v195", "v196", "v197",
- "v198", "v199", "v200", "v201", "v202", "v203", "v204", "v205", "v206",
- "v207", "v208", "v209", "v210", "v211", "v212", "v213", "v214", "v215",
- "v216", "v217", "v218", "v219", "v220", "v221", "v222", "v223", "v224",
- "v225", "v226", "v227", "v228", "v229", "v230", "v231", "v232", "v233",
- "v234", "v235", "v236", "v237", "v238", "v239", "v240", "v241", "v242",
- "v243", "v244", "v245", "v246", "v247", "v248", "v249", "v250", "v251",
- "v252", "v253", "v254", "v255"
-};
-
-static const char *const SGPR32RegNames[] = {
- "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7", "s8", "s9",
- "s10", "s11", "s12", "s13", "s14", "s15", "s16", "s17", "s18", "s19",
- "s20", "s21", "s22", "s23", "s24", "s25", "s26", "s27", "s28", "s29",
- "s30", "s31", "s32", "s33", "s34", "s35", "s36", "s37", "s38", "s39",
- "s40", "s41", "s42", "s43", "s44", "s45", "s46", "s47", "s48", "s49",
- "s50", "s51", "s52", "s53", "s54", "s55", "s56", "s57", "s58", "s59",
- "s60", "s61", "s62", "s63", "s64", "s65", "s66", "s67", "s68", "s69",
- "s70", "s71", "s72", "s73", "s74", "s75", "s76", "s77", "s78", "s79",
- "s80", "s81", "s82", "s83", "s84", "s85", "s86", "s87", "s88", "s89",
- "s90", "s91", "s92", "s93", "s94", "s95", "s96", "s97", "s98", "s99",
- "s100", "s101", "s102", "s103"
-};
-
-static const char *const VGPR64RegNames[] = {
- "v[0:1]", "v[1:2]", "v[2:3]", "v[3:4]", "v[4:5]",
- "v[5:6]", "v[6:7]", "v[7:8]", "v[8:9]", "v[9:10]",
- "v[10:11]", "v[11:12]", "v[12:13]", "v[13:14]", "v[14:15]",
- "v[15:16]", "v[16:17]", "v[17:18]", "v[18:19]", "v[19:20]",
- "v[20:21]", "v[21:22]", "v[22:23]", "v[23:24]", "v[24:25]",
- "v[25:26]", "v[26:27]", "v[27:28]", "v[28:29]", "v[29:30]",
- "v[30:31]", "v[31:32]", "v[32:33]", "v[33:34]", "v[34:35]",
- "v[35:36]", "v[36:37]", "v[37:38]", "v[38:39]", "v[39:40]",
- "v[40:41]", "v[41:42]", "v[42:43]", "v[43:44]", "v[44:45]",
- "v[45:46]", "v[46:47]", "v[47:48]", "v[48:49]", "v[49:50]",
- "v[50:51]", "v[51:52]", "v[52:53]", "v[53:54]", "v[54:55]",
- "v[55:56]", "v[56:57]", "v[57:58]", "v[58:59]", "v[59:60]",
- "v[60:61]", "v[61:62]", "v[62:63]", "v[63:64]", "v[64:65]",
- "v[65:66]", "v[66:67]", "v[67:68]", "v[68:69]", "v[69:70]",
- "v[70:71]", "v[71:72]", "v[72:73]", "v[73:74]", "v[74:75]",
- "v[75:76]", "v[76:77]", "v[77:78]", "v[78:79]", "v[79:80]",
- "v[80:81]", "v[81:82]", "v[82:83]", "v[83:84]", "v[84:85]",
- "v[85:86]", "v[86:87]", "v[87:88]", "v[88:89]", "v[89:90]",
- "v[90:91]", "v[91:92]", "v[92:93]", "v[93:94]", "v[94:95]",
- "v[95:96]", "v[96:97]", "v[97:98]", "v[98:99]", "v[99:100]",
- "v[100:101]", "v[101:102]", "v[102:103]", "v[103:104]", "v[104:105]",
- "v[105:106]", "v[106:107]", "v[107:108]", "v[108:109]", "v[109:110]",
- "v[110:111]", "v[111:112]", "v[112:113]", "v[113:114]", "v[114:115]",
- "v[115:116]", "v[116:117]", "v[117:118]", "v[118:119]", "v[119:120]",
- "v[120:121]", "v[121:122]", "v[122:123]", "v[123:124]", "v[124:125]",
- "v[125:126]", "v[126:127]", "v[127:128]", "v[128:129]", "v[129:130]",
- "v[130:131]", "v[131:132]", "v[132:133]", "v[133:134]", "v[134:135]",
- "v[135:136]", "v[136:137]", "v[137:138]", "v[138:139]", "v[139:140]",
- "v[140:141]", "v[141:142]", "v[142:143]", "v[143:144]", "v[144:145]",
- "v[145:146]", "v[146:147]", "v[147:148]", "v[148:149]", "v[149:150]",
- "v[150:151]", "v[151:152]", "v[152:153]", "v[153:154]", "v[154:155]",
- "v[155:156]", "v[156:157]", "v[157:158]", "v[158:159]", "v[159:160]",
- "v[160:161]", "v[161:162]", "v[162:163]", "v[163:164]", "v[164:165]",
- "v[165:166]", "v[166:167]", "v[167:168]", "v[168:169]", "v[169:170]",
- "v[170:171]", "v[171:172]", "v[172:173]", "v[173:174]", "v[174:175]",
- "v[175:176]", "v[176:177]", "v[177:178]", "v[178:179]", "v[179:180]",
- "v[180:181]", "v[181:182]", "v[182:183]", "v[183:184]", "v[184:185]",
- "v[185:186]", "v[186:187]", "v[187:188]", "v[188:189]", "v[189:190]",
- "v[190:191]", "v[191:192]", "v[192:193]", "v[193:194]", "v[194:195]",
- "v[195:196]", "v[196:197]", "v[197:198]", "v[198:199]", "v[199:200]",
- "v[200:201]", "v[201:202]", "v[202:203]", "v[203:204]", "v[204:205]",
- "v[205:206]", "v[206:207]", "v[207:208]", "v[208:209]", "v[209:210]",
- "v[210:211]", "v[211:212]", "v[212:213]", "v[213:214]", "v[214:215]",
- "v[215:216]", "v[216:217]", "v[217:218]", "v[218:219]", "v[219:220]",
- "v[220:221]", "v[221:222]", "v[222:223]", "v[223:224]", "v[224:225]",
- "v[225:226]", "v[226:227]", "v[227:228]", "v[228:229]", "v[229:230]",
- "v[230:231]", "v[231:232]", "v[232:233]", "v[233:234]", "v[234:235]",
- "v[235:236]", "v[236:237]", "v[237:238]", "v[238:239]", "v[239:240]",
- "v[240:241]", "v[241:242]", "v[242:243]", "v[243:244]", "v[244:245]",
- "v[245:246]", "v[246:247]", "v[247:248]", "v[248:249]", "v[249:250]",
- "v[250:251]", "v[251:252]", "v[252:253]", "v[253:254]", "v[254:255]"
-};
-
-static const char *const VGPR96RegNames[] = {
- "v[0:2]", "v[1:3]", "v[2:4]", "v[3:5]", "v[4:6]",
- "v[5:7]", "v[6:8]", "v[7:9]", "v[8:10]", "v[9:11]",
- "v[10:12]", "v[11:13]", "v[12:14]", "v[13:15]", "v[14:16]",
- "v[15:17]", "v[16:18]", "v[17:19]", "v[18:20]", "v[19:21]",
- "v[20:22]", "v[21:23]", "v[22:24]", "v[23:25]", "v[24:26]",
- "v[25:27]", "v[26:28]", "v[27:29]", "v[28:30]", "v[29:31]",
- "v[30:32]", "v[31:33]", "v[32:34]", "v[33:35]", "v[34:36]",
- "v[35:37]", "v[36:38]", "v[37:39]", "v[38:40]", "v[39:41]",
- "v[40:42]", "v[41:43]", "v[42:44]", "v[43:45]", "v[44:46]",
- "v[45:47]", "v[46:48]", "v[47:49]", "v[48:50]", "v[49:51]",
- "v[50:52]", "v[51:53]", "v[52:54]", "v[53:55]", "v[54:56]",
- "v[55:57]", "v[56:58]", "v[57:59]", "v[58:60]", "v[59:61]",
- "v[60:62]", "v[61:63]", "v[62:64]", "v[63:65]", "v[64:66]",
- "v[65:67]", "v[66:68]", "v[67:69]", "v[68:70]", "v[69:71]",
- "v[70:72]", "v[71:73]", "v[72:74]", "v[73:75]", "v[74:76]",
- "v[75:77]", "v[76:78]", "v[77:79]", "v[78:80]", "v[79:81]",
- "v[80:82]", "v[81:83]", "v[82:84]", "v[83:85]", "v[84:86]",
- "v[85:87]", "v[86:88]", "v[87:89]", "v[88:90]", "v[89:91]",
- "v[90:92]", "v[91:93]", "v[92:94]", "v[93:95]", "v[94:96]",
- "v[95:97]", "v[96:98]", "v[97:99]", "v[98:100]", "v[99:101]",
- "v[100:102]", "v[101:103]", "v[102:104]", "v[103:105]", "v[104:106]",
- "v[105:107]", "v[106:108]", "v[107:109]", "v[108:110]", "v[109:111]",
- "v[110:112]", "v[111:113]", "v[112:114]", "v[113:115]", "v[114:116]",
- "v[115:117]", "v[116:118]", "v[117:119]", "v[118:120]", "v[119:121]",
- "v[120:122]", "v[121:123]", "v[122:124]", "v[123:125]", "v[124:126]",
- "v[125:127]", "v[126:128]", "v[127:129]", "v[128:130]", "v[129:131]",
- "v[130:132]", "v[131:133]", "v[132:134]", "v[133:135]", "v[134:136]",
- "v[135:137]", "v[136:138]", "v[137:139]", "v[138:140]", "v[139:141]",
- "v[140:142]", "v[141:143]", "v[142:144]", "v[143:145]", "v[144:146]",
- "v[145:147]", "v[146:148]", "v[147:149]", "v[148:150]", "v[149:151]",
- "v[150:152]", "v[151:153]", "v[152:154]", "v[153:155]", "v[154:156]",
- "v[155:157]", "v[156:158]", "v[157:159]", "v[158:160]", "v[159:161]",
- "v[160:162]", "v[161:163]", "v[162:164]", "v[163:165]", "v[164:166]",
- "v[165:167]", "v[166:168]", "v[167:169]", "v[168:170]", "v[169:171]",
- "v[170:172]", "v[171:173]", "v[172:174]", "v[173:175]", "v[174:176]",
- "v[175:177]", "v[176:178]", "v[177:179]", "v[178:180]", "v[179:181]",
- "v[180:182]", "v[181:183]", "v[182:184]", "v[183:185]", "v[184:186]",
- "v[185:187]", "v[186:188]", "v[187:189]", "v[188:190]", "v[189:191]",
- "v[190:192]", "v[191:193]", "v[192:194]", "v[193:195]", "v[194:196]",
- "v[195:197]", "v[196:198]", "v[197:199]", "v[198:200]", "v[199:201]",
- "v[200:202]", "v[201:203]", "v[202:204]", "v[203:205]", "v[204:206]",
- "v[205:207]", "v[206:208]", "v[207:209]", "v[208:210]", "v[209:211]",
- "v[210:212]", "v[211:213]", "v[212:214]", "v[213:215]", "v[214:216]",
- "v[215:217]", "v[216:218]", "v[217:219]", "v[218:220]", "v[219:221]",
- "v[220:222]", "v[221:223]", "v[222:224]", "v[223:225]", "v[224:226]",
- "v[225:227]", "v[226:228]", "v[227:229]", "v[228:230]", "v[229:231]",
- "v[230:232]", "v[231:233]", "v[232:234]", "v[233:235]", "v[234:236]",
- "v[235:237]", "v[236:238]", "v[237:239]", "v[238:240]", "v[239:241]",
- "v[240:242]", "v[241:243]", "v[242:244]", "v[243:245]", "v[244:246]",
- "v[245:247]", "v[246:248]", "v[247:249]", "v[248:250]", "v[249:251]",
- "v[250:252]", "v[251:253]", "v[252:254]", "v[253:255]"
-};
-
-static const char *const VGPR128RegNames[] = {
- "v[0:3]", "v[1:4]", "v[2:5]", "v[3:6]", "v[4:7]",
- "v[5:8]", "v[6:9]", "v[7:10]", "v[8:11]", "v[9:12]",
- "v[10:13]", "v[11:14]", "v[12:15]", "v[13:16]", "v[14:17]",
- "v[15:18]", "v[16:19]", "v[17:20]", "v[18:21]", "v[19:22]",
- "v[20:23]", "v[21:24]", "v[22:25]", "v[23:26]", "v[24:27]",
- "v[25:28]", "v[26:29]", "v[27:30]", "v[28:31]", "v[29:32]",
- "v[30:33]", "v[31:34]", "v[32:35]", "v[33:36]", "v[34:37]",
- "v[35:38]", "v[36:39]", "v[37:40]", "v[38:41]", "v[39:42]",
- "v[40:43]", "v[41:44]", "v[42:45]", "v[43:46]", "v[44:47]",
- "v[45:48]", "v[46:49]", "v[47:50]", "v[48:51]", "v[49:52]",
- "v[50:53]", "v[51:54]", "v[52:55]", "v[53:56]", "v[54:57]",
- "v[55:58]", "v[56:59]", "v[57:60]", "v[58:61]", "v[59:62]",
- "v[60:63]", "v[61:64]", "v[62:65]", "v[63:66]", "v[64:67]",
- "v[65:68]", "v[66:69]", "v[67:70]", "v[68:71]", "v[69:72]",
- "v[70:73]", "v[71:74]", "v[72:75]", "v[73:76]", "v[74:77]",
- "v[75:78]", "v[76:79]", "v[77:80]", "v[78:81]", "v[79:82]",
- "v[80:83]", "v[81:84]", "v[82:85]", "v[83:86]", "v[84:87]",
- "v[85:88]", "v[86:89]", "v[87:90]", "v[88:91]", "v[89:92]",
- "v[90:93]", "v[91:94]", "v[92:95]", "v[93:96]", "v[94:97]",
- "v[95:98]", "v[96:99]", "v[97:100]", "v[98:101]", "v[99:102]",
- "v[100:103]", "v[101:104]", "v[102:105]", "v[103:106]", "v[104:107]",
- "v[105:108]", "v[106:109]", "v[107:110]", "v[108:111]", "v[109:112]",
- "v[110:113]", "v[111:114]", "v[112:115]", "v[113:116]", "v[114:117]",
- "v[115:118]", "v[116:119]", "v[117:120]", "v[118:121]", "v[119:122]",
- "v[120:123]", "v[121:124]", "v[122:125]", "v[123:126]", "v[124:127]",
- "v[125:128]", "v[126:129]", "v[127:130]", "v[128:131]", "v[129:132]",
- "v[130:133]", "v[131:134]", "v[132:135]", "v[133:136]", "v[134:137]",
- "v[135:138]", "v[136:139]", "v[137:140]", "v[138:141]", "v[139:142]",
- "v[140:143]", "v[141:144]", "v[142:145]", "v[143:146]", "v[144:147]",
- "v[145:148]", "v[146:149]", "v[147:150]", "v[148:151]", "v[149:152]",
- "v[150:153]", "v[151:154]", "v[152:155]", "v[153:156]", "v[154:157]",
- "v[155:158]", "v[156:159]", "v[157:160]", "v[158:161]", "v[159:162]",
- "v[160:163]", "v[161:164]", "v[162:165]", "v[163:166]", "v[164:167]",
- "v[165:168]", "v[166:169]", "v[167:170]", "v[168:171]", "v[169:172]",
- "v[170:173]", "v[171:174]", "v[172:175]", "v[173:176]", "v[174:177]",
- "v[175:178]", "v[176:179]", "v[177:180]", "v[178:181]", "v[179:182]",
- "v[180:183]", "v[181:184]", "v[182:185]", "v[183:186]", "v[184:187]",
- "v[185:188]", "v[186:189]", "v[187:190]", "v[188:191]", "v[189:192]",
- "v[190:193]", "v[191:194]", "v[192:195]", "v[193:196]", "v[194:197]",
- "v[195:198]", "v[196:199]", "v[197:200]", "v[198:201]", "v[199:202]",
- "v[200:203]", "v[201:204]", "v[202:205]", "v[203:206]", "v[204:207]",
- "v[205:208]", "v[206:209]", "v[207:210]", "v[208:211]", "v[209:212]",
- "v[210:213]", "v[211:214]", "v[212:215]", "v[213:216]", "v[214:217]",
- "v[215:218]", "v[216:219]", "v[217:220]", "v[218:221]", "v[219:222]",
- "v[220:223]", "v[221:224]", "v[222:225]", "v[223:226]", "v[224:227]",
- "v[225:228]", "v[226:229]", "v[227:230]", "v[228:231]", "v[229:232]",
- "v[230:233]", "v[231:234]", "v[232:235]", "v[233:236]", "v[234:237]",
- "v[235:238]", "v[236:239]", "v[237:240]", "v[238:241]", "v[239:242]",
- "v[240:243]", "v[241:244]", "v[242:245]", "v[243:246]", "v[244:247]",
- "v[245:248]", "v[246:249]", "v[247:250]", "v[248:251]", "v[249:252]",
- "v[250:253]", "v[251:254]", "v[252:255]"
-};
-
-static const char *const VGPR256RegNames[] = {
- "v[0:7]", "v[1:8]", "v[2:9]", "v[3:10]", "v[4:11]",
- "v[5:12]", "v[6:13]", "v[7:14]", "v[8:15]", "v[9:16]",
- "v[10:17]", "v[11:18]", "v[12:19]", "v[13:20]", "v[14:21]",
- "v[15:22]", "v[16:23]", "v[17:24]", "v[18:25]", "v[19:26]",
- "v[20:27]", "v[21:28]", "v[22:29]", "v[23:30]", "v[24:31]",
- "v[25:32]", "v[26:33]", "v[27:34]", "v[28:35]", "v[29:36]",
- "v[30:37]", "v[31:38]", "v[32:39]", "v[33:40]", "v[34:41]",
- "v[35:42]", "v[36:43]", "v[37:44]", "v[38:45]", "v[39:46]",
- "v[40:47]", "v[41:48]", "v[42:49]", "v[43:50]", "v[44:51]",
- "v[45:52]", "v[46:53]", "v[47:54]", "v[48:55]", "v[49:56]",
- "v[50:57]", "v[51:58]", "v[52:59]", "v[53:60]", "v[54:61]",
- "v[55:62]", "v[56:63]", "v[57:64]", "v[58:65]", "v[59:66]",
- "v[60:67]", "v[61:68]", "v[62:69]", "v[63:70]", "v[64:71]",
- "v[65:72]", "v[66:73]", "v[67:74]", "v[68:75]", "v[69:76]",
- "v[70:77]", "v[71:78]", "v[72:79]", "v[73:80]", "v[74:81]",
- "v[75:82]", "v[76:83]", "v[77:84]", "v[78:85]", "v[79:86]",
- "v[80:87]", "v[81:88]", "v[82:89]", "v[83:90]", "v[84:91]",
- "v[85:92]", "v[86:93]", "v[87:94]", "v[88:95]", "v[89:96]",
- "v[90:97]", "v[91:98]", "v[92:99]", "v[93:100]", "v[94:101]",
- "v[95:102]", "v[96:103]", "v[97:104]", "v[98:105]", "v[99:106]",
- "v[100:107]", "v[101:108]", "v[102:109]", "v[103:110]", "v[104:111]",
- "v[105:112]", "v[106:113]", "v[107:114]", "v[108:115]", "v[109:116]",
- "v[110:117]", "v[111:118]", "v[112:119]", "v[113:120]", "v[114:121]",
- "v[115:122]", "v[116:123]", "v[117:124]", "v[118:125]", "v[119:126]",
- "v[120:127]", "v[121:128]", "v[122:129]", "v[123:130]", "v[124:131]",
- "v[125:132]", "v[126:133]", "v[127:134]", "v[128:135]", "v[129:136]",
- "v[130:137]", "v[131:138]", "v[132:139]", "v[133:140]", "v[134:141]",
- "v[135:142]", "v[136:143]", "v[137:144]", "v[138:145]", "v[139:146]",
- "v[140:147]", "v[141:148]", "v[142:149]", "v[143:150]", "v[144:151]",
- "v[145:152]", "v[146:153]", "v[147:154]", "v[148:155]", "v[149:156]",
- "v[150:157]", "v[151:158]", "v[152:159]", "v[153:160]", "v[154:161]",
- "v[155:162]", "v[156:163]", "v[157:164]", "v[158:165]", "v[159:166]",
- "v[160:167]", "v[161:168]", "v[162:169]", "v[163:170]", "v[164:171]",
- "v[165:172]", "v[166:173]", "v[167:174]", "v[168:175]", "v[169:176]",
- "v[170:177]", "v[171:178]", "v[172:179]", "v[173:180]", "v[174:181]",
- "v[175:182]", "v[176:183]", "v[177:184]", "v[178:185]", "v[179:186]",
- "v[180:187]", "v[181:188]", "v[182:189]", "v[183:190]", "v[184:191]",
- "v[185:192]", "v[186:193]", "v[187:194]", "v[188:195]", "v[189:196]",
- "v[190:197]", "v[191:198]", "v[192:199]", "v[193:200]", "v[194:201]",
- "v[195:202]", "v[196:203]", "v[197:204]", "v[198:205]", "v[199:206]",
- "v[200:207]", "v[201:208]", "v[202:209]", "v[203:210]", "v[204:211]",
- "v[205:212]", "v[206:213]", "v[207:214]", "v[208:215]", "v[209:216]",
- "v[210:217]", "v[211:218]", "v[212:219]", "v[213:220]", "v[214:221]",
- "v[215:222]", "v[216:223]", "v[217:224]", "v[218:225]", "v[219:226]",
- "v[220:227]", "v[221:228]", "v[222:229]", "v[223:230]", "v[224:231]",
- "v[225:232]", "v[226:233]", "v[227:234]", "v[228:235]", "v[229:236]",
- "v[230:237]", "v[231:238]", "v[232:239]", "v[233:240]", "v[234:241]",
- "v[235:242]", "v[236:243]", "v[237:244]", "v[238:245]", "v[239:246]",
- "v[240:247]", "v[241:248]", "v[242:249]", "v[243:250]", "v[244:251]",
- "v[245:252]", "v[246:253]", "v[247:254]", "v[248:255]"
-};
-
-static const char *const VGPR512RegNames[] = {
- "v[0:15]", "v[1:16]", "v[2:17]", "v[3:18]", "v[4:19]",
- "v[5:20]", "v[6:21]", "v[7:22]", "v[8:23]", "v[9:24]",
- "v[10:25]", "v[11:26]", "v[12:27]", "v[13:28]", "v[14:29]",
- "v[15:30]", "v[16:31]", "v[17:32]", "v[18:33]", "v[19:34]",
- "v[20:35]", "v[21:36]", "v[22:37]", "v[23:38]", "v[24:39]",
- "v[25:40]", "v[26:41]", "v[27:42]", "v[28:43]", "v[29:44]",
- "v[30:45]", "v[31:46]", "v[32:47]", "v[33:48]", "v[34:49]",
- "v[35:50]", "v[36:51]", "v[37:52]", "v[38:53]", "v[39:54]",
- "v[40:55]", "v[41:56]", "v[42:57]", "v[43:58]", "v[44:59]",
- "v[45:60]", "v[46:61]", "v[47:62]", "v[48:63]", "v[49:64]",
- "v[50:65]", "v[51:66]", "v[52:67]", "v[53:68]", "v[54:69]",
- "v[55:70]", "v[56:71]", "v[57:72]", "v[58:73]", "v[59:74]",
- "v[60:75]", "v[61:76]", "v[62:77]", "v[63:78]", "v[64:79]",
- "v[65:80]", "v[66:81]", "v[67:82]", "v[68:83]", "v[69:84]",
- "v[70:85]", "v[71:86]", "v[72:87]", "v[73:88]", "v[74:89]",
- "v[75:90]", "v[76:91]", "v[77:92]", "v[78:93]", "v[79:94]",
- "v[80:95]", "v[81:96]", "v[82:97]", "v[83:98]", "v[84:99]",
- "v[85:100]", "v[86:101]", "v[87:102]", "v[88:103]", "v[89:104]",
- "v[90:105]", "v[91:106]", "v[92:107]", "v[93:108]", "v[94:109]",
- "v[95:110]", "v[96:111]", "v[97:112]", "v[98:113]", "v[99:114]",
- "v[100:115]", "v[101:116]", "v[102:117]", "v[103:118]", "v[104:119]",
- "v[105:120]", "v[106:121]", "v[107:122]", "v[108:123]", "v[109:124]",
- "v[110:125]", "v[111:126]", "v[112:127]", "v[113:128]", "v[114:129]",
- "v[115:130]", "v[116:131]", "v[117:132]", "v[118:133]", "v[119:134]",
- "v[120:135]", "v[121:136]", "v[122:137]", "v[123:138]", "v[124:139]",
- "v[125:140]", "v[126:141]", "v[127:142]", "v[128:143]", "v[129:144]",
- "v[130:145]", "v[131:146]", "v[132:147]", "v[133:148]", "v[134:149]",
- "v[135:150]", "v[136:151]", "v[137:152]", "v[138:153]", "v[139:154]",
- "v[140:155]", "v[141:156]", "v[142:157]", "v[143:158]", "v[144:159]",
- "v[145:160]", "v[146:161]", "v[147:162]", "v[148:163]", "v[149:164]",
- "v[150:165]", "v[151:166]", "v[152:167]", "v[153:168]", "v[154:169]",
- "v[155:170]", "v[156:171]", "v[157:172]", "v[158:173]", "v[159:174]",
- "v[160:175]", "v[161:176]", "v[162:177]", "v[163:178]", "v[164:179]",
- "v[165:180]", "v[166:181]", "v[167:182]", "v[168:183]", "v[169:184]",
- "v[170:185]", "v[171:186]", "v[172:187]", "v[173:188]", "v[174:189]",
- "v[175:190]", "v[176:191]", "v[177:192]", "v[178:193]", "v[179:194]",
- "v[180:195]", "v[181:196]", "v[182:197]", "v[183:198]", "v[184:199]",
- "v[185:200]", "v[186:201]", "v[187:202]", "v[188:203]", "v[189:204]",
- "v[190:205]", "v[191:206]", "v[192:207]", "v[193:208]", "v[194:209]",
- "v[195:210]", "v[196:211]", "v[197:212]", "v[198:213]", "v[199:214]",
- "v[200:215]", "v[201:216]", "v[202:217]", "v[203:218]", "v[204:219]",
- "v[205:220]", "v[206:221]", "v[207:222]", "v[208:223]", "v[209:224]",
- "v[210:225]", "v[211:226]", "v[212:227]", "v[213:228]", "v[214:229]",
- "v[215:230]", "v[216:231]", "v[217:232]", "v[218:233]", "v[219:234]",
- "v[220:235]", "v[221:236]", "v[222:237]", "v[223:238]", "v[224:239]",
- "v[225:240]", "v[226:241]", "v[227:242]", "v[228:243]", "v[229:244]",
- "v[230:245]", "v[231:246]", "v[232:247]", "v[233:248]", "v[234:249]",
- "v[235:250]", "v[236:251]", "v[237:252]", "v[238:253]", "v[239:254]",
- "v[240:255]"
-};
-
-static const char *const SGPR64RegNames[] = {
- "s[0:1]", "s[2:3]", "s[4:5]", "s[6:7]", "s[8:9]", "s[10:11]",
- "s[12:13]", "s[14:15]", "s[16:17]", "s[18:19]", "s[20:21]", "s[22:23]",
- "s[24:25]", "s[26:27]", "s[28:29]", "s[30:31]", "s[32:33]", "s[34:35]",
- "s[36:37]", "s[38:39]", "s[40:41]", "s[42:43]", "s[44:45]", "s[46:47]",
- "s[48:49]", "s[50:51]", "s[52:53]", "s[54:55]", "s[56:57]", "s[58:59]",
- "s[60:61]", "s[62:63]", "s[64:65]", "s[66:67]", "s[68:69]", "s[70:71]",
- "s[72:73]", "s[74:75]", "s[76:77]", "s[78:79]", "s[80:81]", "s[82:83]",
- "s[84:85]", "s[86:87]", "s[88:89]", "s[90:91]", "s[92:93]", "s[94:95]",
- "s[96:97]", "s[98:99]", "s[100:101]", "s[102:103]"
-};
-
-static const char *const SGPR128RegNames[] = {
- "s[0:3]", "s[4:7]", "s[8:11]", "s[12:15]", "s[16:19]", "s[20:23]",
- "s[24:27]", "s[28:31]", "s[32:35]", "s[36:39]", "s[40:43]", "s[44:47]",
- "s[48:51]", "s[52:55]", "s[56:59]", "s[60:63]", "s[64:67]", "s[68:71]",
- "s[72:75]", "s[76:79]", "s[80:83]", "s[84:87]", "s[88:91]", "s[92:95]",
- "s[96:99]", "s[100:103]"
-};
-
-static const char *const SGPR256RegNames[] = {
- "s[0:7]", "s[4:11]", "s[8:15]", "s[12:19]", "s[16:23]",
- "s[20:27]", "s[24:31]", "s[28:35]", "s[32:39]", "s[36:43]",
- "s[40:47]", "s[44:51]", "s[48:55]", "s[52:59]", "s[56:63]",
- "s[60:67]", "s[64:71]", "s[68:75]", "s[72:79]", "s[76:83]",
- "s[80:87]", "s[84:91]", "s[88:95]", "s[92:99]", "s[96:103]"
-};
-
-static const char *const SGPR512RegNames[] = {
- "s[0:15]", "s[4:19]", "s[8:23]", "s[12:27]", "s[16:31]", "s[20:35]",
- "s[24:39]", "s[28:43]", "s[32:47]", "s[36:51]", "s[40:55]", "s[44:59]",
- "s[48:63]", "s[52:67]", "s[56:71]", "s[60:75]", "s[64:79]", "s[68:83]",
- "s[72:87]", "s[76:91]", "s[80:95]", "s[84:99]", "s[88:103]"
-};
-
-#endif
diff --git a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 7a760dcf7a90..815cbc5e26ee 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -1,9 +1,8 @@
//===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
@@ -14,9 +13,13 @@
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUInstrInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
@@ -31,6 +34,56 @@
using namespace llvm;
+namespace {
+
+// Observer to apply a register bank to new registers created by LegalizerHelper.
+class ApplyRegBankMapping final : public GISelChangeObserver {
+private:
+ MachineRegisterInfo &MRI;
+ const RegisterBank *NewBank;
+ SmallVector<MachineInstr *, 4> NewInsts;
+
+public:
+ ApplyRegBankMapping(MachineRegisterInfo &MRI_, const RegisterBank *RB)
+ : MRI(MRI_), NewBank(RB) {}
+
+ ~ApplyRegBankMapping() {
+ for (MachineInstr *MI : NewInsts)
+ applyBank(*MI);
+ }
+
+ /// Set any registers that don't have a register class or bank yet to the new bank.
+ void applyBank(MachineInstr &MI) {
+ for (MachineOperand &Op : MI.operands()) {
+ if (!Op.isReg())
+ continue;
+
+ Register Reg = Op.getReg();
+ if (MRI.getRegClassOrRegBank(Reg))
+ continue;
+
+ const RegisterBank *RB = NewBank;
+ // FIXME: This might not be enough to detect when SCC should be used.
+ if (MRI.getType(Reg) == LLT::scalar(1))
+ RB = (NewBank == &AMDGPU::SGPRRegBank ?
+ &AMDGPU::SCCRegBank : &AMDGPU::VCCRegBank);
+
+ MRI.setRegBank(Reg, *RB);
+ }
+ }
+
+ void erasingInstr(MachineInstr &MI) override {}
+
+ void createdInstr(MachineInstr &MI) override {
+ // At this point, the instruction was just inserted and has no operands.
+ NewInsts.push_back(&MI);
+ }
+
+ void changingInstr(MachineInstr &MI) override {}
+ void changedInstr(MachineInstr &MI) override {}
+};
+
+}
AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const TargetRegisterInfo &TRI)
: AMDGPUGenRegisterBankInfo(),
TRI(static_cast<const SIRegisterInfo*>(&TRI)) {
@@ -52,43 +105,62 @@ AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const TargetRegisterInfo &TRI)
}
-static bool isConstant(const MachineOperand &MO, int64_t &C) {
- const MachineFunction *MF = MO.getParent()->getParent()->getParent();
- const MachineRegisterInfo &MRI = MF->getRegInfo();
- const MachineInstr *Def = MRI.getVRegDef(MO.getReg());
- if (!Def)
- return false;
-
- if (Def->getOpcode() == AMDGPU::G_CONSTANT) {
- C = Def->getOperand(1).getCImm()->getSExtValue();
- return true;
- }
-
- if (Def->getOpcode() == AMDGPU::COPY)
- return isConstant(Def->getOperand(1), C);
-
- return false;
-}
-
unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
const RegisterBank &Src,
unsigned Size) const {
+ // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
if (Dst.getID() == AMDGPU::SGPRRegBankID &&
Src.getID() == AMDGPU::VGPRRegBankID) {
return std::numeric_limits<unsigned>::max();
}
- // SGPRRegBank with size 1 is actually vcc or another 64-bit sgpr written by
- // the valu.
- if (Size == 1 && Dst.getID() == AMDGPU::SCCRegBankID &&
+ // Bool values are tricky, because the meaning is based on context. The SCC
+ // and VCC banks are for the natural scalar and vector conditions produced by
+ // a compare.
+ //
+ // Legalization doesn't know about the necessary context, so an s1 use may
+ // have been a truncate from an arbitrary value, in which case a copy (lowered
+ // as a compare with 0) needs to be inserted.
+ if (Size == 1 &&
+ (Dst.getID() == AMDGPU::SCCRegBankID ||
+ Dst.getID() == AMDGPU::SGPRRegBankID) &&
(Src.getID() == AMDGPU::SGPRRegBankID ||
Src.getID() == AMDGPU::VGPRRegBankID ||
Src.getID() == AMDGPU::VCCRegBankID))
return std::numeric_limits<unsigned>::max();
+ if (Dst.getID() == AMDGPU::SCCRegBankID &&
+ Src.getID() == AMDGPU::VCCRegBankID)
+ return std::numeric_limits<unsigned>::max();
+
return RegisterBankInfo::copyCost(Dst, Src, Size);
}
+unsigned AMDGPURegisterBankInfo::getBreakDownCost(
+ const ValueMapping &ValMapping,
+ const RegisterBank *CurBank) const {
+ // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
+ // VGPR.
+ // FIXME: Is there a better way to do this?
+ if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
+ return 10; // This is expensive.
+
+ assert(ValMapping.NumBreakDowns == 2 &&
+ ValMapping.BreakDown[0].Length == 32 &&
+ ValMapping.BreakDown[0].StartIdx == 0 &&
+ ValMapping.BreakDown[1].Length == 32 &&
+ ValMapping.BreakDown[1].StartIdx == 32 &&
+ ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);
+
+ // 32-bit extract of a 64-bit value is just access of a subregister, so free.
+ // TODO: Cost of 0 hits assert, though it's not clear it's what we really
+ // want.
+
+ // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
+ // alignment restrictions, but this probably isn't important.
+ return 1;
+}
+
const RegisterBank &AMDGPURegisterBankInfo::getRegBankFromRegClass(
const TargetRegisterClass &RC) const {
@@ -98,6 +170,163 @@ const RegisterBank &AMDGPURegisterBankInfo::getRegBankFromRegClass(
return getRegBank(AMDGPU::VGPRRegBankID);
}
+template <unsigned NumOps>
+RegisterBankInfo::InstructionMappings
+AMDGPURegisterBankInfo::addMappingFromTable(
+ const MachineInstr &MI, const MachineRegisterInfo &MRI,
+ const std::array<unsigned, NumOps> RegSrcOpIdx,
+ ArrayRef<OpRegBankEntry<NumOps>> Table) const {
+
+ InstructionMappings AltMappings;
+
+ SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());
+
+ unsigned Sizes[NumOps];
+ for (unsigned I = 0; I < NumOps; ++I) {
+ Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
+ Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
+ }
+
+ for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
+ unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
+ Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
+ }
+
+ unsigned MappingID = 0;
+ for (const auto &Entry : Table) {
+ for (unsigned I = 0; I < NumOps; ++I) {
+ int OpIdx = RegSrcOpIdx[I];
+ Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
+ }
+
+ AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
+ getOperandsMapping(Operands),
+ Operands.size()));
+ }
+
+ return AltMappings;
+}
+
+RegisterBankInfo::InstructionMappings
+AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
+ const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
+ switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
+ case Intrinsic::amdgcn_readlane: {
+ static const OpRegBankEntry<3> Table[2] = {
+ // Perfectly legal.
+ { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
+
+ // Need a readfirstlane for the index.
+ { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
+ };
+
+ const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
+ return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
+ }
+ case Intrinsic::amdgcn_writelane: {
+ static const OpRegBankEntry<4> Table[4] = {
+ // Perfectly legal.
+ { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
+
+ // Need readfirstlane of first op
+ { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
+
+ // Need readfirstlane of second op
+ { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
+
+ // Need readfirstlane of both ops
+ { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
+ };
+
+ // dst, value, lane, old value
+ const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
+ return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
+ }
+ default:
+ return RegisterBankInfo::getInstrAlternativeMappings(MI);
+ }
+}
+
+RegisterBankInfo::InstructionMappings
+AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
+ const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
+
+ switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
+ case Intrinsic::amdgcn_buffer_load: {
+ static const OpRegBankEntry<3> Table[4] = {
+ // Perfectly legal.
+ { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
+ { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
+
+ // Waterfall loop needed for rsrc. In the worst case this will execute
+ // approximately an extra 10 * wavesize + 2 instructions.
+ { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },
+ { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1000 }
+ };
+
+ // rsrc, voffset, offset
+ const std::array<unsigned, 3> RegSrcOpIdx = { { 2, 3, 4 } };
+ return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
+ }
+ case Intrinsic::amdgcn_s_buffer_load: {
+ static const OpRegBankEntry<2> Table[4] = {
+ // Perfectly legal.
+ { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
+
+ // Only need 1 register in loop
+ { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },
+
+ // Have to waterfall the resource.
+ { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },
+
+ // Have to waterfall the resource, and the offset.
+ { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }
+ };
+
+ // rsrc, offset
+ const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
+ return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
+ }
+ case Intrinsic::amdgcn_ds_ordered_add:
+ case Intrinsic::amdgcn_ds_ordered_swap: {
+ // VGPR = M0, VGPR
+ static const OpRegBankEntry<3> Table[2] = {
+ // Perfectly legal.
+ { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
+
+ // Need a readfirstlane for m0
+ { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
+ };
+
+ const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
+ return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
+ }
+ case Intrinsic::amdgcn_s_sendmsg:
+ case Intrinsic::amdgcn_s_sendmsghalt: {
+ static const OpRegBankEntry<1> Table[2] = {
+ // Perfectly legal.
+ { { AMDGPU::SGPRRegBankID }, 1 },
+
+ // Need readlane
+ { { AMDGPU::VGPRRegBankID }, 3 }
+ };
+
+ const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
+ return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
+ }
+ default:
+ return RegisterBankInfo::getInstrAlternativeMappings(MI);
+ }
+}
+
+static bool isInstrUniform(const MachineInstr &MI) {
+ if (!MI.hasOneMemOperand())
+ return false;
+
+ const MachineMemOperand *MMO = *MI.memoperands_begin();
+ return AMDGPUInstrInfo::isUniformMMO(MMO);
+}
+
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappings(
const MachineInstr &MI) const {
@@ -108,31 +337,102 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings(
InstructionMappings AltMappings;
switch (MI.getOpcode()) {
- case TargetOpcode::G_LOAD: {
+ case TargetOpcode::G_AND:
+ case TargetOpcode::G_OR:
+ case TargetOpcode::G_XOR: {
unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
- // FIXME: Should we be hard coding the size for these mappings?
- const InstructionMapping &SSMapping = getInstructionMapping(
+
+ if (Size == 1) {
+ // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
+ const InstructionMapping &SCCMapping = getInstructionMapping(
1, 1, getOperandsMapping(
- {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
- AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64)}),
- 2); // Num Operands
+ {AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, Size),
+ AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
+ AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
+ 3); // Num Operands
+ AltMappings.push_back(&SCCMapping);
+
+ const InstructionMapping &SGPRMapping = getInstructionMapping(
+ 1, 1, getOperandsMapping(
+ {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
+ AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
+ AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
+ 3); // Num Operands
+ AltMappings.push_back(&SGPRMapping);
+
+ const InstructionMapping &VCCMapping0 = getInstructionMapping(
+ 2, 10, getOperandsMapping(
+ {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
+ AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
+ AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
+ 3); // Num Operands
+ AltMappings.push_back(&VCCMapping0);
+ return AltMappings;
+ }
+
+ if (Size != 64)
+ break;
+
+ const InstructionMapping &SSMapping = getInstructionMapping(
+ 1, 1, getOperandsMapping(
+ {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
+ AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
+ AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
+ 3); // Num Operands
AltMappings.push_back(&SSMapping);
const InstructionMapping &VVMapping = getInstructionMapping(
+ 2, 2, getOperandsMapping(
+ {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
+ AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
+ AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
+ 3); // Num Operands
+ AltMappings.push_back(&VVMapping);
+
+ const InstructionMapping &SVMapping = getInstructionMapping(
+ 3, 3, getOperandsMapping(
+ {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
+ AMDGPU::getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size),
+ AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
+ 3); // Num Operands
+ AltMappings.push_back(&SVMapping);
+
+ // SGPR in LHS is slightly preferable, so make the VS mapping more expensive
+ // than SV.
+ const InstructionMapping &VSMapping = getInstructionMapping(
+ 3, 4, getOperandsMapping(
+ {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
+ AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
+ AMDGPU::getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size)}),
+ 3); // Num Operands
+ AltMappings.push_back(&VSMapping);
+ break;
+ }
+ case TargetOpcode::G_LOAD: {
+ unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
+ LLT LoadTy = MRI.getType(MI.getOperand(0).getReg());
+ // FIXME: Should we be hard coding the size for these mappings?
+ if (isInstrUniform(MI)) {
+ const InstructionMapping &SSMapping = getInstructionMapping(
+ 1, 1, getOperandsMapping(
+ {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
+ AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64)}),
+ 2); // Num Operands
+ AltMappings.push_back(&SSMapping);
+ }
+
+ const InstructionMapping &VVMapping = getInstructionMapping(
2, 1, getOperandsMapping(
- {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
+ {AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy),
AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64)}),
2); // Num Operands
AltMappings.push_back(&VVMapping);
- // FIXME: Should this be the pointer-size (64-bits) or the size of the
- // register that will hold the bufffer resourc (128-bits).
- const InstructionMapping &VSMapping = getInstructionMapping(
- 3, 1, getOperandsMapping(
- {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
- AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64)}),
- 2); // Num Operands
- AltMappings.push_back(&VSMapping);
+ // It may be possible to have a vgpr = load sgpr mapping here, because
+ // the mubuf instructions support this kind of load, but probably for only
+ // gfx7 and older. However, the addressing mode matching in the instruction
+ // selector should be able to do a better job of detecting and selecting
+ // these kinds of loads from the vgpr = load vgpr mapping.
return AltMappings;
@@ -184,15 +484,32 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings(
AltMappings.push_back(&SSMapping);
const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
- getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
+ getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
- AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
- AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size)}),
+ AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
+ AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
4); // Num Operands
AltMappings.push_back(&VVMapping);
return AltMappings;
}
+ case TargetOpcode::G_SMIN:
+ case TargetOpcode::G_SMAX:
+ case TargetOpcode::G_UMIN:
+ case TargetOpcode::G_UMAX: {
+ static const OpRegBankEntry<3> Table[4] = {
+ { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
+ { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
+ { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
+
+ // Scalar requires cmp+select, and extends if 16-bit.
+ // FIXME: Should there be separate costs for 32 and 16-bit
+ { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 3 }
+ };
+
+ const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 1, 2 } };
+ return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
+ }
case TargetOpcode::G_UADDE:
case TargetOpcode::G_USUBE:
case TargetOpcode::G_SADDE:
@@ -234,23 +551,816 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings(
AltMappings.push_back(&VMapping);
return AltMappings;
}
+ case AMDGPU::G_INTRINSIC:
+ return getInstrAlternativeMappingsIntrinsic(MI, MRI);
+ case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
+ return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
default:
break;
}
return RegisterBankInfo::getInstrAlternativeMappings(MI);
}
-void AMDGPURegisterBankInfo::applyMappingImpl(
- const OperandsMapper &OpdMapper) const {
- return applyDefaultMapping(OpdMapper);
+void AMDGPURegisterBankInfo::split64BitValueForMapping(
+ MachineIRBuilder &B,
+ SmallVector<Register, 2> &Regs,
+ LLT HalfTy,
+ Register Reg) const {
+ assert(HalfTy.getSizeInBits() == 32);
+ MachineRegisterInfo *MRI = B.getMRI();
+ Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
+ Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
+ const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
+ MRI->setRegBank(LoLHS, *Bank);
+ MRI->setRegBank(HiLHS, *Bank);
+
+ Regs.push_back(LoLHS);
+ Regs.push_back(HiLHS);
+
+ B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
+ .addDef(LoLHS)
+ .addDef(HiLHS)
+ .addUse(Reg);
}
-static bool isInstrUniform(const MachineInstr &MI) {
- if (!MI.hasOneMemOperand())
+/// Replace the current type each register in \p Regs has with \p NewTy
+static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
+ LLT NewTy) {
+ for (Register Reg : Regs) {
+ assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
+ MRI.setType(Reg, NewTy);
+ }
+}
+
+static LLT getHalfSizedType(LLT Ty) {
+ if (Ty.isVector()) {
+ assert(Ty.getNumElements() % 2 == 0);
+ return LLT::scalarOrVector(Ty.getNumElements() / 2, Ty.getElementType());
+ }
+
+ assert(Ty.getSizeInBits() % 2 == 0);
+ return LLT::scalar(Ty.getSizeInBits() / 2);
+}
+
+/// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
+/// any of the required SGPR operands are VGPRs, perform a waterfall loop to
+/// execute the instruction for each unique combination of values in all lanes
+/// in the wave. The block will be split such that the rest of the instructions
+/// are moved to a new block.
+///
+/// Essentially performs this loop:
+///
+/// Save Execution Mask
+/// For (Lane : Wavefront) {
+/// Enable Lane, Disable all other lanes
+/// SGPR = read SGPR value for current lane from VGPR
+/// VGPRResult[Lane] = use_op SGPR
+/// }
+/// Restore Execution Mask
+///
+/// There is additional complexity because values are compared to identify the
+/// unique values used, so every lane sharing a value is handled in one pass.
+void AMDGPURegisterBankInfo::executeInWaterfallLoop(
+ MachineInstr &MI, MachineRegisterInfo &MRI,
+ ArrayRef<unsigned> OpIndices) const {
+ MachineFunction *MF = MI.getParent()->getParent();
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ MachineBasicBlock::iterator I(MI);
+
+ MachineBasicBlock &MBB = *MI.getParent();
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ // Use a set to avoid extra readfirstlanes in the case where multiple operands
+ // are the same register.
+ SmallSet<Register, 4> SGPROperandRegs;
+ for (unsigned Op : OpIndices) {
+ assert(MI.getOperand(Op).isUse());
+ Register Reg = MI.getOperand(Op).getReg();
+ const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
+ if (OpBank->getID() == AMDGPU::VGPRRegBankID)
+ SGPROperandRegs.insert(Reg);
+ }
+
+ // No operands need to be replaced, so no need to loop.
+ if (SGPROperandRegs.empty())
+ return;
+
+ MachineIRBuilder B(MI);
+ SmallVector<Register, 4> ResultRegs;
+ SmallVector<Register, 4> InitResultRegs;
+ SmallVector<Register, 4> PhiRegs;
+ for (MachineOperand &Def : MI.defs()) {
+ LLT ResTy = MRI.getType(Def.getReg());
+ const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI);
+ ResultRegs.push_back(Def.getReg());
+ Register InitReg = B.buildUndef(ResTy).getReg(0);
+ Register PhiReg = MRI.createGenericVirtualRegister(ResTy);
+ InitResultRegs.push_back(InitReg);
+ PhiRegs.push_back(PhiReg);
+ MRI.setRegBank(PhiReg, *DefBank);
+ MRI.setRegBank(InitReg, *DefBank);
+ }
+
+ Register SaveExecReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+ Register InitSaveExecReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+
+ // Don't bother using generic instructions/registers for the exec mask.
+ B.buildInstr(TargetOpcode::IMPLICIT_DEF)
+ .addDef(InitSaveExecReg);
+
+ Register PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ Register NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+
+ // To insert the loop we need to split the block. Move everything before this
+ // point to a new block, and insert a new empty block before this instruction.
+ MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
+ MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
+ MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
+ MachineFunction::iterator MBBI(MBB);
+ ++MBBI;
+ MF->insert(MBBI, LoopBB);
+ MF->insert(MBBI, RestoreExecBB);
+ MF->insert(MBBI, RemainderBB);
+
+ LoopBB->addSuccessor(RestoreExecBB);
+ LoopBB->addSuccessor(LoopBB);
+
+ // Move the rest of the block into a new block.
+ RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
+ RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
+
+ MBB.addSuccessor(LoopBB);
+ RestoreExecBB->addSuccessor(RemainderBB);
+
+ B.setInsertPt(*LoopBB, LoopBB->end());
+
+ B.buildInstr(TargetOpcode::PHI)
+ .addDef(PhiExec)
+ .addReg(InitSaveExecReg)
+ .addMBB(&MBB)
+ .addReg(NewExec)
+ .addMBB(LoopBB);
+
+ for (auto Result : zip(InitResultRegs, ResultRegs, PhiRegs)) {
+ B.buildInstr(TargetOpcode::G_PHI)
+ .addDef(std::get<2>(Result))
+ .addReg(std::get<0>(Result)) // Initial value / implicit_def
+ .addMBB(&MBB)
+ .addReg(std::get<1>(Result)) // Mid-loop value.
+ .addMBB(LoopBB);
+ }
+
+ // Move the instruction into the loop.
+ LoopBB->splice(LoopBB->end(), &MBB, I);
+ I = std::prev(LoopBB->end());
+
+ B.setInstr(*I);
+
+ Register CondReg;
+
+ for (MachineOperand &Op : MI.uses()) {
+ if (!Op.isReg())
+ continue;
+
+ assert(!Op.isDef());
+ if (SGPROperandRegs.count(Op.getReg())) {
+ LLT OpTy = MRI.getType(Op.getReg());
+ unsigned OpSize = OpTy.getSizeInBits();
+
+ // Can only do a readlane of 32-bit pieces.
+ if (OpSize == 32) {
+ // Avoid extra copies in the simple case of one 32-bit register.
+ Register CurrentLaneOpReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ MRI.setType(CurrentLaneOpReg, OpTy);
+
+ constrainGenericRegister(Op.getReg(), AMDGPU::VGPR_32RegClass, MRI);
+ // Read the next variant <- also loop target.
+ BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentLaneOpReg)
+ .addReg(Op.getReg());
+
+ Register NewCondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ bool First = CondReg == AMDGPU::NoRegister;
+ if (First)
+ CondReg = NewCondReg;
+
+ // Compare the value just read against the value in each lane.
+ B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64)
+ .addDef(NewCondReg)
+ .addReg(CurrentLaneOpReg)
+ .addReg(Op.getReg());
+ Op.setReg(CurrentLaneOpReg);
+
+ if (!First) {
+ Register AndReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+
+ // If there are multiple operands to consider, AND the conditions together.
+ B.buildInstr(AMDGPU::S_AND_B64)
+ .addDef(AndReg)
+ .addReg(NewCondReg)
+ .addReg(CondReg);
+ CondReg = AndReg;
+ }
+ } else {
+ LLT S32 = LLT::scalar(32);
+ SmallVector<Register, 8> ReadlanePieces;
+
+ // The compares can be done as 64-bit, but the extract needs to be done
+ // in 32-bit pieces.
+
+ bool Is64 = OpSize % 64 == 0;
+
+ LLT UnmergeTy = OpSize % 64 == 0 ? LLT::scalar(64) : LLT::scalar(32);
+ unsigned CmpOp = OpSize % 64 == 0 ? AMDGPU::V_CMP_EQ_U64_e64
+ : AMDGPU::V_CMP_EQ_U32_e64;
+
+ // Insert the unmerge before the loop.
+
+ B.setMBB(MBB);
+ auto Unmerge = B.buildUnmerge(UnmergeTy, Op.getReg());
+ B.setInstr(*I);
+
+ unsigned NumPieces = Unmerge->getNumOperands() - 1;
+ for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) {
+ unsigned UnmergePiece = Unmerge.getReg(PieceIdx);
+
+ Register CurrentLaneOpReg;
+ if (Is64) {
+ Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32);
+ Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32);
+
+ MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass);
+ MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass);
+ MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass);
+
+ // Read the next variant <- also loop target.
+ BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
+ CurrentLaneOpRegLo)
+ .addReg(UnmergePiece, 0, AMDGPU::sub0);
+
+ // Read the next variant <- also loop target.
+ BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
+ CurrentLaneOpRegHi)
+ .addReg(UnmergePiece, 0, AMDGPU::sub1);
+
+ CurrentLaneOpReg =
+ B.buildMerge(LLT::scalar(64),
+ {CurrentLaneOpRegLo, CurrentLaneOpRegHi})
+ .getReg(0);
+
+ MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass);
+
+ if (OpTy.getScalarSizeInBits() == 64) {
+ // If we need to produce a 64-bit element vector, use the merged pieces.
+ ReadlanePieces.push_back(CurrentLaneOpReg);
+ } else {
+ // 32-bit element type.
+ ReadlanePieces.push_back(CurrentLaneOpRegLo);
+ ReadlanePieces.push_back(CurrentLaneOpRegHi);
+ }
+ } else {
+ CurrentLaneOpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
+ MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass);
+ MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass);
+
+ // Read the next variant <- also loop target.
+ BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
+ CurrentLaneOpReg)
+ .addReg(UnmergePiece);
+ ReadlanePieces.push_back(CurrentLaneOpReg);
+ }
+
+ Register NewCondReg
+ = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+ bool First = CondReg == AMDGPU::NoRegister;
+ if (First)
+ CondReg = NewCondReg;
+
+ B.buildInstr(CmpOp)
+ .addDef(NewCondReg)
+ .addReg(CurrentLaneOpReg)
+ .addReg(UnmergePiece);
+
+ if (!First) {
+ Register AndReg
+ = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+
+ // If there are multiple operands to consider, AND the conditions together.
+ B.buildInstr(AMDGPU::S_AND_B64)
+ .addDef(AndReg)
+ .addReg(NewCondReg)
+ .addReg(CondReg);
+ CondReg = AndReg;
+ }
+ }
+
+ // FIXME: Build merge seems to switch to CONCAT_VECTORS but not
+ // BUILD_VECTOR
+ if (OpTy.isVector()) {
+ auto Merge = B.buildBuildVector(OpTy, ReadlanePieces);
+ Op.setReg(Merge.getReg(0));
+ } else {
+ auto Merge = B.buildMerge(OpTy, ReadlanePieces);
+ Op.setReg(Merge.getReg(0));
+ }
+
+ MRI.setRegBank(Op.getReg(), getRegBank(AMDGPU::SGPRRegBankID));
+ }
+ }
+ }
+
+ B.setInsertPt(*LoopBB, LoopBB->end());
+
+ // Update EXEC, save the original EXEC value to VCC.
+ B.buildInstr(AMDGPU::S_AND_SAVEEXEC_B64)
+ .addDef(NewExec)
+ .addReg(CondReg, RegState::Kill);
+
+ MRI.setSimpleHint(NewExec, CondReg);
+
+ // Update EXEC, switch all done bits to 0 and all todo bits to 1.
+ B.buildInstr(AMDGPU::S_XOR_B64_term)
+ .addDef(AMDGPU::EXEC)
+ .addReg(AMDGPU::EXEC)
+ .addReg(NewExec);
+
+ // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
+ // s_cbranch_scc0?
+
+ // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
+ B.buildInstr(AMDGPU::S_CBRANCH_EXECNZ)
+ .addMBB(LoopBB);
+
+ // Save the EXEC mask before the loop.
+ BuildMI(MBB, MBB.end(), DL, TII->get(AMDGPU::S_MOV_B64_term), SaveExecReg)
+ .addReg(AMDGPU::EXEC);
+
+ // Restore the EXEC mask after the loop.
+ B.setMBB(*RestoreExecBB);
+ B.buildInstr(AMDGPU::S_MOV_B64_term)
+ .addDef(AMDGPU::EXEC)
+ .addReg(SaveExecReg);
+}
+
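
For readers who have not seen the waterfall idiom before, here is a minimal standalone sketch of the loop structure built above. It is plain C++, not LLVM code: a 64-bit mask stands in for EXEC, an array stands in for a VGPR, and use_op is a hypothetical per-lane operation (it also uses the GCC/Clang __builtin_ctzll builtin). It mirrors the V_READFIRSTLANE_B32 / V_CMP_EQ / mask-update sequence: read the first active lane's value, handle every lane holding that same value at once, then retire those lanes until none remain.

    #include <cstdint>
    #include <cstdio>

    // Hypothetical operation that needs a uniform (scalar) operand.
    static uint32_t use_op(uint32_t ScalarVal) { return ScalarVal * 2; }

    int main() {
      const unsigned WaveSize = 8;                          // small wave for illustration
      uint32_t VGPR[WaveSize] = {5, 5, 7, 5, 9, 7, 7, 9};   // divergent operand
      uint32_t Result[WaveSize] = {};
      uint64_t Exec = (1ull << WaveSize) - 1;               // all lanes active
      const uint64_t SavedExec = Exec;                      // "Save Execution Mask"

      while (Exec) {                                        // S_CBRANCH_EXECNZ back-edge
        // V_READFIRSTLANE_B32: take the value of the first still-active lane.
        const unsigned FirstLane = __builtin_ctzll(Exec);
        const uint32_t ScalarVal = VGPR[FirstLane];

        // V_CMP_EQ_U32: mask of every active lane holding the same value.
        uint64_t CondMask = 0;
        for (unsigned Lane = 0; Lane < WaveSize; ++Lane)
          if (((Exec >> Lane) & 1) && VGPR[Lane] == ScalarVal)
            CondMask |= 1ull << Lane;

        // Run the operation for exactly those lanes.
        for (unsigned Lane = 0; Lane < WaveSize; ++Lane)
          if ((CondMask >> Lane) & 1)
            Result[Lane] = use_op(ScalarVal);

        // EXEC update: retire the lanes that were just handled.
        Exec &= ~CondMask;
      }

      Exec = SavedExec;                                     // "Restore Execution Mask"
      for (unsigned Lane = 0; Lane < WaveSize; ++Lane)
        printf("lane %u: %u\n", Lane, Result[Lane]);
      return 0;
    }
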
+// Legalize an operand that must be an SGPR by inserting a readfirstlane.
+void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
+ MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const {
+ Register Reg = MI.getOperand(OpIdx).getReg();
+ const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
+ if (Bank != &AMDGPU::VGPRRegBank)
+ return;
+
+ MachineIRBuilder B(MI);
+ Register SGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ B.buildInstr(AMDGPU::V_READFIRSTLANE_B32)
+ .addDef(SGPR)
+ .addReg(Reg);
+
+ const TargetRegisterClass *Constrained =
+ constrainGenericRegister(Reg, AMDGPU::VGPR_32RegClass, MRI);
+ (void)Constrained;
+ assert(Constrained && "Failed to constrain readfirstlane src reg");
+
+ MI.getOperand(OpIdx).setReg(SGPR);
+}
+
+// When regbankselect repairs registers, it will insert a repair instruction
+// which defines the repaired register. Then it calls applyMapping and expects
+// that the targets will either delete or rewrite the instructions that
+// originally wrote to the repaired registers. Because of this, we end up in a
+// situation where we have 2 instructions defining the same registers.
+static MachineInstr *getOtherVRegDef(const MachineRegisterInfo &MRI,
+ Register Reg,
+ const MachineInstr &MI) {
+ // Is there some way we can assert that there are exactly 2 def instructions?
+ for (MachineInstr &Other : MRI.def_instructions(Reg)) {
+ if (&Other != &MI)
+ return &Other;
+ }
+
+ return nullptr;
+}
+
+bool AMDGPURegisterBankInfo::applyMappingWideLoad(MachineInstr &MI,
+ const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
+ MachineRegisterInfo &MRI) const {
+ Register DstReg = MI.getOperand(0).getReg();
+ const LLT LoadTy = MRI.getType(DstReg);
+ unsigned LoadSize = LoadTy.getSizeInBits();
+ const unsigned MaxNonSmrdLoadSize = 128;
+ // 128-bit loads are supported for all instruction types.
+ if (LoadSize <= MaxNonSmrdLoadSize)
return false;
- const MachineMemOperand *MMO = *MI.memoperands_begin();
- return AMDGPUInstrInfo::isUniformMMO(MMO);
+ SmallVector<unsigned, 16> DefRegs(OpdMapper.getVRegs(0));
+ SmallVector<unsigned, 1> SrcRegs(OpdMapper.getVRegs(1));
+
+ // If the pointer is an SGPR, we have nothing to do.
+ if (SrcRegs.empty())
+ return false;
+
+ assert(LoadSize % MaxNonSmrdLoadSize == 0);
+
+ // We want to get the repair instruction now, because it will help us
+ // determine which instruction the legalizer inserts that will also
+ // write to DstReg.
+ MachineInstr *RepairInst = getOtherVRegDef(MRI, DstReg, MI);
+
+ // RegBankSelect only emits scalar types, so we need to reset the pointer
+ // operand to a pointer type.
+ Register BasePtrReg = SrcRegs[0];
+ LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
+ MRI.setType(BasePtrReg, PtrTy);
+
+ MachineIRBuilder B(MI);
+
+ unsigned SplitElts =
+ MaxNonSmrdLoadSize / LoadTy.getScalarType().getSizeInBits();
+ const LLT LoadSplitTy = LLT::vector(SplitElts, LoadTy.getScalarType());
+ ApplyRegBankMapping O(MRI, &AMDGPU::VGPRRegBank);
+ GISelObserverWrapper Observer(&O);
+ B.setChangeObserver(Observer);
+ LegalizerHelper Helper(B.getMF(), Observer, B);
+ if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
+ return false;
+
+ // At this point, the legalizer has split the original load into smaller
+ // loads. At the end of lowering, it inserts an instruction (LegalizedInst)
+ // that combines the outputs of the lower loads and writes it to DstReg.
+ // The register bank selector has also added the RepairInst which writes to
+ // DstReg as well.
+
+ MachineInstr *LegalizedInst = getOtherVRegDef(MRI, DstReg, *RepairInst);
+
+ // Replace the output of the LegalizedInst with a temporary register, since
+ // RepairInst already defines DstReg.
+ Register TmpReg = MRI.createGenericVirtualRegister(MRI.getType(DstReg));
+ LegalizedInst->getOperand(0).setReg(TmpReg);
+ B.setInsertPt(*RepairInst->getParent(), RepairInst);
+
+ for (unsigned DefIdx = 0, e = DefRegs.size(); DefIdx != e; ++DefIdx) {
+ Register IdxReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
+ B.buildConstant(IdxReg, DefIdx);
+ MRI.setRegBank(IdxReg, getRegBank(AMDGPU::VGPRRegBankID));
+ B.buildExtractVectorElement(DefRegs[DefIdx], TmpReg, IdxReg);
+ }
+
+ MRI.setRegBank(DstReg, getRegBank(AMDGPU::VGPRRegBankID));
+ return true;
+}
+
+// For cases where only a single copy is inserted for matching register banks,
+// replace the register in the instruction operand with that copy.
+static void substituteSimpleCopyRegs(
+ const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {
+ SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx));
+ if (!SrcReg.empty()) {
+ assert(SrcReg.size() == 1);
+ OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]);
+ }
+}
+
+void AMDGPURegisterBankInfo::applyMappingImpl(
+ const OperandsMapper &OpdMapper) const {
+ MachineInstr &MI = OpdMapper.getMI();
+ unsigned Opc = MI.getOpcode();
+ MachineRegisterInfo &MRI = OpdMapper.getMRI();
+ switch (Opc) {
+ case AMDGPU::G_SELECT: {
+ Register DstReg = MI.getOperand(0).getReg();
+ LLT DstTy = MRI.getType(DstReg);
+ if (DstTy.getSizeInBits() != 64)
+ break;
+
+ LLT HalfTy = getHalfSizedType(DstTy);
+
+ SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
+ SmallVector<Register, 1> Src0Regs(OpdMapper.getVRegs(1));
+ SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
+ SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3));
+
+ // All inputs are SGPRs, nothing special to do.
+ if (DefRegs.empty()) {
+ assert(Src1Regs.empty() && Src2Regs.empty());
+ break;
+ }
+
+ MachineIRBuilder B(MI);
+ if (Src0Regs.empty())
+ Src0Regs.push_back(MI.getOperand(1).getReg());
+ else {
+ assert(Src0Regs.size() == 1);
+ }
+
+ if (Src1Regs.empty())
+ split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
+ else {
+ setRegsToType(MRI, Src1Regs, HalfTy);
+ }
+
+ if (Src2Regs.empty())
+ split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg());
+ else
+ setRegsToType(MRI, Src2Regs, HalfTy);
+
+ setRegsToType(MRI, DefRegs, HalfTy);
+
+ B.buildSelect(DefRegs[0], Src0Regs[0], Src1Regs[0], Src2Regs[0]);
+ B.buildSelect(DefRegs[1], Src0Regs[0], Src1Regs[1], Src2Regs[1]);
+
+ MRI.setRegBank(DstReg, getRegBank(AMDGPU::VGPRRegBankID));
+ MI.eraseFromParent();
+ return;
+ }
+ case AMDGPU::G_AND:
+ case AMDGPU::G_OR:
+ case AMDGPU::G_XOR: {
+ // The 64-bit forms of these ops are only available on the SALU, so split
+ // into 2 32-bit ops if there is a VGPR input.
+ Register DstReg = MI.getOperand(0).getReg();
+ LLT DstTy = MRI.getType(DstReg);
+ if (DstTy.getSizeInBits() != 64)
+ break;
+
+ LLT HalfTy = getHalfSizedType(DstTy);
+ SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
+ SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
+ SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
+
+ // All inputs are SGPRs, nothing special to do.
+ if (DefRegs.empty()) {
+ assert(Src0Regs.empty() && Src1Regs.empty());
+ break;
+ }
+
+ assert(DefRegs.size() == 2);
+ assert(Src0Regs.size() == Src1Regs.size() &&
+ (Src0Regs.empty() || Src0Regs.size() == 2));
+
+ // Depending on where the source registers came from, the generic code may
+ // have decided to split the inputs already or not. If not, we still need to
+ // extract the values.
+ MachineIRBuilder B(MI);
+
+ if (Src0Regs.empty())
+ split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
+ else
+ setRegsToType(MRI, Src0Regs, HalfTy);
+
+ if (Src1Regs.empty())
+ split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
+ else
+ setRegsToType(MRI, Src1Regs, HalfTy);
+
+ setRegsToType(MRI, DefRegs, HalfTy);
+
+ B.buildInstr(Opc)
+ .addDef(DefRegs[0])
+ .addUse(Src0Regs[0])
+ .addUse(Src1Regs[0]);
+
+ B.buildInstr(Opc)
+ .addDef(DefRegs[1])
+ .addUse(Src0Regs[1])
+ .addUse(Src1Regs[1]);
+
+ MRI.setRegBank(DstReg, getRegBank(AMDGPU::VGPRRegBankID));
+ MI.eraseFromParent();
+ return;
+ }
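
The split above is sound because bitwise AND/OR/XOR act on each bit independently, so the low and high 32-bit halves can be computed separately and re-merged. A standalone check in plain C++ (not LLVM code), mirroring the G_UNMERGE_VALUES / two 32-bit ops / re-merge shape of the case above:

    #include <cassert>
    #include <cstdint>

    // 64-bit AND computed as two independent 32-bit ANDs on the halves.
    static uint64_t and64_via_halves(uint64_t A, uint64_t B) {
      const uint32_t Lo = (uint32_t)A & (uint32_t)B;
      const uint32_t Hi = (uint32_t)(A >> 32) & (uint32_t)(B >> 32);
      return ((uint64_t)Hi << 32) | Lo;   // re-merge the halves
    }

    int main() {
      const uint64_t A = 0x0123456789abcdefULL;
      const uint64_t B = 0xfedcba9876543210ULL;
      assert(and64_via_halves(A, B) == (A & B));
      return 0;
    }
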
+ case AMDGPU::G_ADD:
+ case AMDGPU::G_SUB:
+ case AMDGPU::G_MUL: {
+ Register DstReg = MI.getOperand(0).getReg();
+ LLT DstTy = MRI.getType(DstReg);
+ if (DstTy != LLT::scalar(16))
+ break;
+
+ const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI);
+ if (DstBank == &AMDGPU::VGPRRegBank)
+ break;
+
+ // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
+ MachineFunction *MF = MI.getParent()->getParent();
+ MachineIRBuilder B(MI);
+ ApplyRegBankMapping ApplySALU(MRI, &AMDGPU::SGPRRegBank);
+ GISelObserverWrapper Observer(&ApplySALU);
+ LegalizerHelper Helper(*MF, Observer, B);
+
+ if (Helper.widenScalar(MI, 0, LLT::scalar(32)) !=
+ LegalizerHelper::Legalized)
+ llvm_unreachable("widen scalar should have succeeded");
+ return;
+ }
+ case AMDGPU::G_SMIN:
+ case AMDGPU::G_SMAX:
+ case AMDGPU::G_UMIN:
+ case AMDGPU::G_UMAX: {
+ Register DstReg = MI.getOperand(0).getReg();
+ const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI);
+ if (DstBank == &AMDGPU::VGPRRegBank)
+ break;
+
+ MachineFunction *MF = MI.getParent()->getParent();
+ MachineIRBuilder B(MI);
+ ApplyRegBankMapping ApplySALU(MRI, &AMDGPU::SGPRRegBank);
+ GISelObserverWrapper Observer(&ApplySALU);
+ LegalizerHelper Helper(*MF, Observer, B);
+
+ // Turn scalar min/max into a compare and select.
+ LLT Ty = MRI.getType(DstReg);
+ LLT S32 = LLT::scalar(32);
+ LLT S16 = LLT::scalar(16);
+
+ if (Ty == S16) {
+ // Need to widen to s32, and expand as cmp + select.
+ if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
+ llvm_unreachable("widenScalar should have succeeded");
+
+ // FIXME: This is relying on widenScalar leaving MI in place.
+ if (Helper.lower(MI, 0, S32) != LegalizerHelper::Legalized)
+ llvm_unreachable("lower should have succeeded");
+ } else {
+ if (Helper.lower(MI, 0, Ty) != LegalizerHelper::Legalized)
+ llvm_unreachable("lower should have succeeded");
+ }
+
+ return;
+ }
+ case AMDGPU::G_SEXT:
+ case AMDGPU::G_ZEXT: {
+ Register SrcReg = MI.getOperand(1).getReg();
+ LLT SrcTy = MRI.getType(SrcReg);
+ bool Signed = Opc == AMDGPU::G_SEXT;
+
+ MachineIRBuilder B(MI);
+ const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
+
+ Register DstReg = MI.getOperand(0).getReg();
+ LLT DstTy = MRI.getType(DstReg);
+ if (DstTy.isScalar() &&
+ SrcBank != &AMDGPU::SGPRRegBank &&
+ SrcBank != &AMDGPU::SCCRegBank &&
+ SrcBank != &AMDGPU::VCCRegBank &&
+ // FIXME: Should handle any type that rounds to s64 when irregular
+ // breakdowns are supported.
+ DstTy.getSizeInBits() == 64 &&
+ SrcTy.getSizeInBits() <= 32) {
+ const LLT S32 = LLT::scalar(32);
+ SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
+
+ // Extend to 32-bit, and then extend the low half.
+ if (Signed) {
+ // TODO: Should really be buildSExtOrCopy
+ B.buildSExtOrTrunc(DefRegs[0], SrcReg);
+
+ // Replicate sign bit from 32-bit extended part.
+ auto ShiftAmt = B.buildConstant(S32, 31);
+ MRI.setRegBank(ShiftAmt.getReg(0), *SrcBank);
+ B.buildAShr(DefRegs[1], DefRegs[0], ShiftAmt);
+ } else {
+ B.buildZExtOrTrunc(DefRegs[0], SrcReg);
+ B.buildConstant(DefRegs[1], 0);
+ }
+
+ MRI.setRegBank(DstReg, *SrcBank);
+ MI.eraseFromParent();
+ return;
+ }
+
+ if (SrcTy != LLT::scalar(1))
+ return;
+
+ if (SrcBank == &AMDGPU::SCCRegBank || SrcBank == &AMDGPU::VCCRegBank) {
+ SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
+
+ const RegisterBank *DstBank = SrcBank == &AMDGPU::SCCRegBank ?
+ &AMDGPU::SGPRRegBank : &AMDGPU::VGPRRegBank;
+
+ unsigned DstSize = DstTy.getSizeInBits();
+ // 64-bit select is SGPR only
+ const bool UseSel64 = DstSize > 32 &&
+ SrcBank->getID() == AMDGPU::SCCRegBankID;
+
+ // TODO: Should s16 select be legal?
+ LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32);
+ auto True = B.buildConstant(SelType, Signed ? -1 : 1);
+ auto False = B.buildConstant(SelType, 0);
+
+ MRI.setRegBank(True.getReg(0), *DstBank);
+ MRI.setRegBank(False.getReg(0), *DstBank);
+ MRI.setRegBank(DstReg, *DstBank);
+
+ if (DstSize > 32 && SrcBank->getID() != AMDGPU::SCCRegBankID) {
+ B.buildSelect(DefRegs[0], SrcReg, True, False);
+ B.buildCopy(DefRegs[1], DefRegs[0]);
+ } else if (DstSize < 32) {
+ auto Sel = B.buildSelect(SelType, SrcReg, True, False);
+ MRI.setRegBank(Sel.getReg(0), *DstBank);
+ B.buildTrunc(DstReg, Sel);
+ } else {
+ B.buildSelect(DstReg, SrcReg, True, False);
+ }
+
+ MI.eraseFromParent();
+ return;
+ }
+
+ // Fix up the case with an s1 src that isn't a condition register. Use shifts
+ // instead of introducing a compare, to avoid an unnecessary condition
+ // register (and since there are no scalar 16-bit compares).
+ auto Ext = B.buildAnyExt(DstTy, SrcReg);
+ auto ShiftAmt = B.buildConstant(LLT::scalar(32), DstTy.getSizeInBits() - 1);
+ auto Shl = B.buildShl(DstTy, Ext, ShiftAmt);
+
+ if (MI.getOpcode() == AMDGPU::G_SEXT)
+ B.buildAShr(DstReg, Shl, ShiftAmt);
+ else
+ B.buildLShr(DstReg, Shl, ShiftAmt);
+
+ MRI.setRegBank(DstReg, *SrcBank);
+ MRI.setRegBank(Ext.getReg(0), *SrcBank);
+ MRI.setRegBank(ShiftAmt.getReg(0), *SrcBank);
+ MRI.setRegBank(Shl.getReg(0), *SrcBank);
+ MI.eraseFromParent();
+ return;
+ }
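
The shift trick can be checked in isolation: any-extend the s1 into the wider register, shift the boolean up to the sign bit, then shift back down arithmetically for sext or logically for zext. A standalone sketch in plain C++ (not LLVM code); it relies on arithmetic right shift of signed values, which is what the emitted G_ASHR provides on the target:

    #include <cassert>
    #include <cstdint>

    // Sign-extend bit 0 of Src to 32 bits: shl to the MSB, then arithmetic
    // shift right (the G_SHL + G_ASHR sequence built above).
    static int32_t sext_s1(uint32_t Src) {
      return (int32_t)(Src << 31) >> 31;
    }

    // Zero-extend bit 0 of Src: shl to the MSB, then logical shift right
    // (the G_SHL + G_LSHR sequence).
    static uint32_t zext_s1(uint32_t Src) {
      return (Src << 31) >> 31;
    }

    int main() {
      // Only bit 0 matters; whatever sits in the upper bits is discarded.
      assert(sext_s1(0xfffffff1u) == -1);
      assert(sext_s1(0x00000002u) == 0);
      assert(zext_s1(0xfffffff1u) == 1);
      assert(zext_s1(0x00000002u) == 0);
      return 0;
    }
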
+ case AMDGPU::G_EXTRACT_VECTOR_ELT:
+ applyDefaultMapping(OpdMapper);
+ executeInWaterfallLoop(MI, MRI, { 2 });
+ return;
+ case AMDGPU::G_INTRINSIC: {
+ switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
+ case Intrinsic::amdgcn_s_buffer_load: {
+ // FIXME: Move to G_INTRINSIC_W_SIDE_EFFECTS
+ executeInWaterfallLoop(MI, MRI, { 2, 3 });
+ return;
+ }
+ case Intrinsic::amdgcn_readlane: {
+ substituteSimpleCopyRegs(OpdMapper, 2);
+
+ assert(empty(OpdMapper.getVRegs(0)));
+ assert(empty(OpdMapper.getVRegs(3)));
+
+ // Make sure the index is an SGPR. It doesn't make sense to run this in a
+ // waterfall loop, so assume it's a uniform value.
+ constrainOpWithReadfirstlane(MI, MRI, 3); // Index
+ return;
+ }
+ case Intrinsic::amdgcn_writelane: {
+ assert(empty(OpdMapper.getVRegs(0)));
+ assert(empty(OpdMapper.getVRegs(2)));
+ assert(empty(OpdMapper.getVRegs(3)));
+
+ substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val
+ constrainOpWithReadfirstlane(MI, MRI, 2); // Source value
+ constrainOpWithReadfirstlane(MI, MRI, 3); // Index
+ return;
+ }
+ default:
+ break;
+ }
+ break;
+ }
+ case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
+ switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
+ case Intrinsic::amdgcn_buffer_load: {
+ executeInWaterfallLoop(MI, MRI, { 2 });
+ return;
+ }
+ case Intrinsic::amdgcn_ds_ordered_add:
+ case Intrinsic::amdgcn_ds_ordered_swap: {
+ // This is only allowed to execute with 1 lane, so readfirstlane is safe.
+ assert(empty(OpdMapper.getVRegs(0)));
+ substituteSimpleCopyRegs(OpdMapper, 3);
+ constrainOpWithReadfirstlane(MI, MRI, 2); // M0
+ return;
+ }
+ case Intrinsic::amdgcn_s_sendmsg:
+ case Intrinsic::amdgcn_s_sendmsghalt: {
+ // FIXME: Should this use a waterfall loop?
+ constrainOpWithReadfirstlane(MI, MRI, 2); // M0
+ return;
+ }
+ default:
+ break;
+ }
+ break;
+ }
+ case AMDGPU::G_LOAD: {
+ if (applyMappingWideLoad(MI, OpdMapper, MRI))
+ return;
+ break;
+ }
+ default:
+ break;
+ }
+
+ return applyDefaultMapping(OpdMapper);
}
bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
@@ -259,7 +1369,7 @@ bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
for (unsigned i = 0, e = MI.getNumOperands();i != e; ++i) {
if (!MI.getOperand(i).isReg())
continue;
- unsigned Reg = MI.getOperand(i).getReg();
+ Register Reg = MI.getOperand(i).getReg();
if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
if (Bank->getID() == AMDGPU::VGPRRegBankID)
return false;
@@ -299,7 +1409,7 @@ AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
if (MI.getOperand(OpdIdx).isIntrinsicID())
OpdsMapping[OpdIdx++] = nullptr;
- unsigned Reg1 = MI.getOperand(OpdIdx).getReg();
+ Register Reg1 = MI.getOperand(OpdIdx).getReg();
unsigned Size1 = getSizeInBits(Reg1, MRI, *TRI);
unsigned DefaultBankID = Size1 == 1 ?
@@ -309,7 +1419,11 @@ AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
OpdsMapping[OpdIdx++] = AMDGPU::getValueMapping(Bank1, Size1);
for (unsigned e = MI.getNumOperands(); OpdIdx != e; ++OpdIdx) {
- unsigned Size = getSizeInBits(MI.getOperand(OpdIdx).getReg(), MRI, *TRI);
+ const MachineOperand &MO = MI.getOperand(OpdIdx);
+ if (!MO.isReg())
+ continue;
+
+ unsigned Size = getSizeInBits(MO.getReg(), MRI, *TRI);
unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
OpdsMapping[OpdIdx] = AMDGPU::getValueMapping(BankID, Size);
}
@@ -325,7 +1439,11 @@ AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
- unsigned Size = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
+ const MachineOperand &Op = MI.getOperand(I);
+ if (!Op.isReg())
+ continue;
+
+ unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI);
OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
}
@@ -340,6 +1458,7 @@ AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
const MachineRegisterInfo &MRI = MF.getRegInfo();
SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
+ LLT LoadTy = MRI.getType(MI.getOperand(0).getReg());
unsigned PtrSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
const ValueMapping *ValMapping;
@@ -350,7 +1469,7 @@ AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
} else {
- ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
+ ValMapping = AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy);
// FIXME: What would happen if we used SGPRRegBankID here?
PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
}
@@ -366,7 +1485,7 @@ AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
}
unsigned
-AMDGPURegisterBankInfo::getRegBankID(unsigned Reg,
+AMDGPURegisterBankInfo::getRegBankID(Register Reg,
const MachineRegisterInfo &MRI,
const TargetRegisterInfo &TRI,
unsigned Default) const {
@@ -383,13 +1502,81 @@ AMDGPURegisterBankInfo::getRegBankID(unsigned Reg,
///
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
- const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
+ const MachineFunction &MF = *MI.getParent()->getParent();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ if (MI.isRegSequence()) {
+ // If any input is a VGPR, the result must be a VGPR. The default handling
+ // assumes any copy between banks is legal.
+ unsigned BankID = AMDGPU::SGPRRegBankID;
+
+ for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
+ auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI, *TRI);
+ // It doesn't make sense to use vcc or scc banks here, so just ignore
+ // them.
+ if (OpBank != AMDGPU::SGPRRegBankID) {
+ BankID = AMDGPU::VGPRRegBankID;
+ break;
+ }
+ }
+ unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
+
+ const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID));
+ return getInstructionMapping(
+ 1, /*Cost*/ 1,
+ /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
+ }
+
+ // The default handling is broken and doesn't handle illegal SGPR->VGPR copies
+ // properly.
+ //
+ // TODO: There are additional exec masking dependencies to analyze.
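+ // For example (illustrative): a G_PHI merging an SGPR value from one
+ // predecessor with a VGPR value from another must be mapped to VGPR, since
+ // copying a potentially divergent VGPR value into an SGPR is not legal.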
+ if (MI.getOpcode() == TargetOpcode::G_PHI) {
+ // TODO: Generate proper invalid bank enum.
+ int ResultBank = -1;
+
+ for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
+ unsigned Reg = MI.getOperand(I).getReg();
+ const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
+
+ // FIXME: Assuming VGPR for any undetermined inputs.
+ if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) {
+ ResultBank = AMDGPU::VGPRRegBankID;
+ break;
+ }
+
+ unsigned OpBank = Bank->getID();
+ // scc, scc -> sgpr
+ if (OpBank == AMDGPU::SCCRegBankID) {
+ // There's only one SCC register, so a phi requires copying to SGPR.
+ OpBank = AMDGPU::SGPRRegBankID;
+ } else if (OpBank == AMDGPU::VCCRegBankID) {
+ // vcc, vcc -> vcc
+ // vcc, sgpr -> vgpr
+ if (ResultBank != -1 && ResultBank != AMDGPU::VCCRegBankID) {
+ ResultBank = AMDGPU::VGPRRegBankID;
+ break;
+ }
+ }
+
+ ResultBank = OpBank;
+ }
+
+ assert(ResultBank != -1);
+
+ unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+
+ const ValueMapping &ValMap =
+ getValueMapping(0, Size, getRegBank(ResultBank));
+ return getInstructionMapping(
+ 1, /*Cost*/ 1,
+ /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
+ }
+
+ const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
if (Mapping.isValid())
return Mapping;
- const MachineFunction &MF = *MI.getParent()->getParent();
- const MachineRegisterInfo &MRI = MF.getRegInfo();
SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
switch (MI.getOpcode()) {
@@ -401,18 +1588,86 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_XOR: {
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
if (Size == 1) {
- OpdsMapping[0] = OpdsMapping[1] =
- OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
+ const RegisterBank *DstBank
+ = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI);
+
+ unsigned TargetBankID = -1;
+ unsigned BankLHS = -1;
+ unsigned BankRHS = -1;
+ if (DstBank) {
+ TargetBankID = DstBank->getID();
+ if (DstBank == &AMDGPU::VCCRegBank) {
+ TargetBankID = AMDGPU::VCCRegBankID;
+ BankLHS = AMDGPU::VCCRegBankID;
+ BankRHS = AMDGPU::VCCRegBankID;
+ } else if (DstBank == &AMDGPU::SCCRegBank) {
+ TargetBankID = AMDGPU::SCCRegBankID;
+ BankLHS = AMDGPU::SGPRRegBankID;
+ BankRHS = AMDGPU::SGPRRegBankID;
+ } else {
+ BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
+ AMDGPU::SGPRRegBankID);
+ BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
+ AMDGPU::SGPRRegBankID);
+ }
+ } else {
+ BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
+ AMDGPU::VCCRegBankID);
+ BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
+ AMDGPU::VCCRegBankID);
+
+ // Both inputs should be true booleans to produce a boolean result.
+ if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) {
+ TargetBankID = AMDGPU::VGPRRegBankID;
+ } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) {
+ TargetBankID = AMDGPU::VCCRegBankID;
+ BankLHS = AMDGPU::VCCRegBankID;
+ BankRHS = AMDGPU::VCCRegBankID;
+ } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) {
+ TargetBankID = AMDGPU::SGPRRegBankID;
+ } else if (BankLHS == AMDGPU::SCCRegBankID || BankRHS == AMDGPU::SCCRegBankID) {
+ // The operation must be done on a 32-bit register, but it will set
+ // scc. The result type could interchangeably be SCC or SGPR, since
+ // both values will be produced.
+ TargetBankID = AMDGPU::SCCRegBankID;
+ BankLHS = AMDGPU::SGPRRegBankID;
+ BankRHS = AMDGPU::SGPRRegBankID;
+ }
+ }
+
+ OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size);
+ OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size);
+ OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size);
+ break;
+ }
+
+ if (Size == 64) {
+
+ if (isSALUMapping(MI)) {
+ OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size);
+ OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
+ } else {
+ OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
+ unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI/*, DefaultBankID*/);
+ OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size);
+
+ unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI/*, DefaultBankID*/);
+ OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size);
+ }
+
break;
}
LLVM_FALLTHROUGH;
}
+ case AMDGPU::G_GEP:
case AMDGPU::G_ADD:
case AMDGPU::G_SUB:
case AMDGPU::G_MUL:
case AMDGPU::G_SHL:
+ case AMDGPU::G_LSHR:
+ case AMDGPU::G_ASHR:
case AMDGPU::G_UADDO:
case AMDGPU::G_SADDO:
case AMDGPU::G_USUBO:
@@ -421,6 +1676,12 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_SADDE:
case AMDGPU::G_USUBE:
case AMDGPU::G_SSUBE:
+ case AMDGPU::G_UMULH:
+ case AMDGPU::G_SMULH:
+ case AMDGPU::G_SMIN:
+ case AMDGPU::G_SMAX:
+ case AMDGPU::G_UMIN:
+ case AMDGPU::G_UMAX:
if (isSALUMapping(MI))
return getDefaultMappingSOP(MI);
LLVM_FALLTHROUGH;
@@ -431,11 +1692,14 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_FPTOUI:
case AMDGPU::G_FMUL:
case AMDGPU::G_FMA:
+ case AMDGPU::G_FSQRT:
case AMDGPU::G_SITOFP:
case AMDGPU::G_UITOFP:
case AMDGPU::G_FPTRUNC:
+ case AMDGPU::G_FPEXT:
case AMDGPU::G_FEXP2:
case AMDGPU::G_FLOG2:
+ case AMDGPU::G_FCANONICALIZE:
case AMDGPU::G_INTRINSIC_TRUNC:
case AMDGPU::G_INTRINSIC_ROUND:
return getDefaultMappingVOP(MI);
@@ -473,7 +1737,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[2] = nullptr;
break;
}
- case AMDGPU::G_MERGE_VALUES: {
+ case AMDGPU::G_MERGE_VALUES:
+ case AMDGPU::G_BUILD_VECTOR:
+ case AMDGPU::G_CONCAT_VECTORS: {
unsigned Bank = isSALUMapping(MI) ?
AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
@@ -502,8 +1768,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
break;
}
case AMDGPU::G_TRUNC: {
- unsigned Dst = MI.getOperand(0).getReg();
- unsigned Src = MI.getOperand(1).getReg();
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
unsigned Bank = getRegBankID(Src, MRI, *TRI);
unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
@@ -514,23 +1780,35 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_ZEXT:
case AMDGPU::G_SEXT:
case AMDGPU::G_ANYEXT: {
- unsigned Dst = MI.getOperand(0).getReg();
- unsigned Src = MI.getOperand(1).getReg();
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
- unsigned SrcBank = getRegBankID(Src, MRI, *TRI,
- SrcSize == 1 ? AMDGPU::SGPRRegBankID :
- AMDGPU::VGPRRegBankID);
- unsigned DstBank = SrcBank;
- if (SrcSize == 1) {
- if (SrcBank == AMDGPU::SGPRRegBankID)
- DstBank = AMDGPU::VGPRRegBankID;
- else
- DstBank = AMDGPU::SGPRRegBankID;
- }
-
- OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, DstSize);
- OpdsMapping[1] = AMDGPU::getValueMapping(SrcBank, SrcSize);
+
+ unsigned DstBank;
+ const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI);
+ assert(SrcBank);
+ switch (SrcBank->getID()) {
+ case AMDGPU::SCCRegBankID:
+ case AMDGPU::SGPRRegBankID:
+ DstBank = AMDGPU::SGPRRegBankID;
+ break;
+ default:
+ DstBank = AMDGPU::VGPRRegBankID;
+ break;
+ }
+
+ // TODO: Should anyext be split into 32-bit part as well?
+ if (MI.getOpcode() == AMDGPU::G_ANYEXT) {
+ OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, DstSize);
+ OpdsMapping[1] = AMDGPU::getValueMapping(SrcBank->getID(), SrcSize);
+ } else {
+ // Scalar extend can use 64-bit BFE, but VGPRs require extending to
+ // 32-bits, and then to 64.
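+ // Rough sketch (exact opcodes depend on later selection): the scalar path
+ // can become a single S_BFE_I64 / S_BFE_U64, while the vector path extends
+ // to 32 bits and then derives the high half, e.g. with a 31-bit arithmetic
+ // shift of the low half for sign extension.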
+ OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize);
+ OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(),
+ SrcSize);
+ }
break;
}
case AMDGPU::G_FCMP: {
@@ -542,16 +1820,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
break;
}
- case AMDGPU::G_GEP: {
- for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
- if (!MI.getOperand(i).isReg())
- continue;
-
- unsigned Size = MRI.getType(MI.getOperand(i).getReg()).getSizeInBits();
- OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
- }
- break;
- }
case AMDGPU::G_STORE: {
assert(MI.getOperand(0).isReg());
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
@@ -571,57 +1839,55 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
}
case AMDGPU::G_ICMP: {
+ auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);
- unsigned Op0Bank = Op2Bank == AMDGPU::SGPRRegBankID &&
- Op3Bank == AMDGPU::SGPRRegBankID ?
- AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID;
+
+ bool CanUseSCC = Op2Bank == AMDGPU::SGPRRegBankID &&
+ Op3Bank == AMDGPU::SGPRRegBankID &&
+ (Size == 32 || (Size == 64 &&
+ (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
+ MF.getSubtarget<GCNSubtarget>().hasScalarCompareEq64()));
+
+ unsigned Op0Bank = CanUseSCC ? AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID;
+
OpdsMapping[0] = AMDGPU::getValueMapping(Op0Bank, 1);
OpdsMapping[1] = nullptr; // Predicate Operand.
OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
OpdsMapping[3] = AMDGPU::getValueMapping(Op3Bank, Size);
break;
}
-
-
case AMDGPU::G_EXTRACT_VECTOR_ELT: {
- unsigned IdxOp = 2;
- int64_t Imm;
- // XXX - Do we really need to fully handle these? The constant case should
- // be legalized away before RegBankSelect?
-
- unsigned OutputBankID = isSALUMapping(MI) && isConstant(MI.getOperand(IdxOp), Imm) ?
+ unsigned OutputBankID = isSALUMapping(MI) ?
AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
-
+ unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
+ unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
- OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, MRI.getType(MI.getOperand(0).getReg()).getSizeInBits());
- OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, MRI.getType(MI.getOperand(1).getReg()).getSizeInBits());
+
+ OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, SrcSize);
+ OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, SrcSize);
// The index can be in either bank if the source vector is VGPR.
- OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, MRI.getType(MI.getOperand(2).getReg()).getSizeInBits());
+ OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);
break;
}
case AMDGPU::G_INSERT_VECTOR_ELT: {
- // XXX - Do we really need to fully handle these? The constant case should
- // be legalized away before RegBankSelect?
-
- int64_t Imm;
-
- unsigned IdxOp = MI.getOpcode() == AMDGPU::G_EXTRACT_VECTOR_ELT ? 2 : 3;
- unsigned BankID = isSALUMapping(MI) && isConstant(MI.getOperand(IdxOp), Imm) ?
- AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
-
-
+ unsigned OutputBankID = isSALUMapping(MI) ?
+ AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
- // TODO: Can do SGPR indexing, which would obviate the need for the
- // isConstant check.
- for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
- unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
- OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
- }
+ unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
+ unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
+ unsigned InsertEltBank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
+ unsigned IdxBank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);
+ OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
+ OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize);
+ OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBank, InsertSize);
+ // The index can be in either bank if the source vector is VGPR.
+ OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
break;
}
case AMDGPU::G_UNMERGE_VALUES: {
@@ -637,14 +1903,70 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
break;
}
case AMDGPU::G_INTRINSIC: {
- switch (MI.getOperand(1).getIntrinsicID()) {
+ switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
default:
return getInvalidInstructionMapping();
case Intrinsic::maxnum:
case Intrinsic::minnum:
+ case Intrinsic::amdgcn_div_fmas:
+ case Intrinsic::amdgcn_trig_preop:
+ case Intrinsic::amdgcn_sin:
+ case Intrinsic::amdgcn_cos:
+ case Intrinsic::amdgcn_log_clamp:
+ case Intrinsic::amdgcn_rcp:
+ case Intrinsic::amdgcn_rcp_legacy:
+ case Intrinsic::amdgcn_rsq:
+ case Intrinsic::amdgcn_rsq_legacy:
+ case Intrinsic::amdgcn_rsq_clamp:
+ case Intrinsic::amdgcn_ldexp:
+ case Intrinsic::amdgcn_frexp_mant:
+ case Intrinsic::amdgcn_frexp_exp:
+ case Intrinsic::amdgcn_fract:
case Intrinsic::amdgcn_cvt_pkrtz:
+ case Intrinsic::amdgcn_cvt_pknorm_i16:
+ case Intrinsic::amdgcn_cvt_pknorm_u16:
+ case Intrinsic::amdgcn_cvt_pk_i16:
+ case Intrinsic::amdgcn_cvt_pk_u16:
+ case Intrinsic::amdgcn_fmed3:
+ case Intrinsic::amdgcn_cubeid:
+ case Intrinsic::amdgcn_cubema:
+ case Intrinsic::amdgcn_cubesc:
+ case Intrinsic::amdgcn_cubetc:
+ case Intrinsic::amdgcn_sffbh:
+ case Intrinsic::amdgcn_fmad_ftz:
+ case Intrinsic::amdgcn_mbcnt_lo:
+ case Intrinsic::amdgcn_mbcnt_hi:
+ case Intrinsic::amdgcn_ubfe:
+ case Intrinsic::amdgcn_sbfe:
+ case Intrinsic::amdgcn_lerp:
+ case Intrinsic::amdgcn_sad_u8:
+ case Intrinsic::amdgcn_msad_u8:
+ case Intrinsic::amdgcn_sad_hi_u8:
+ case Intrinsic::amdgcn_sad_u16:
+ case Intrinsic::amdgcn_qsad_pk_u16_u8:
+ case Intrinsic::amdgcn_mqsad_pk_u16_u8:
+ case Intrinsic::amdgcn_mqsad_u32_u8:
+ case Intrinsic::amdgcn_cvt_pk_u8_f32:
+ case Intrinsic::amdgcn_alignbit:
+ case Intrinsic::amdgcn_alignbyte:
+ case Intrinsic::amdgcn_fdot2:
+ case Intrinsic::amdgcn_sdot2:
+ case Intrinsic::amdgcn_udot2:
+ case Intrinsic::amdgcn_sdot4:
+ case Intrinsic::amdgcn_udot4:
+ case Intrinsic::amdgcn_sdot8:
+ case Intrinsic::amdgcn_udot8:
+ case Intrinsic::amdgcn_fdiv_fast:
+ case Intrinsic::amdgcn_wwm:
+ case Intrinsic::amdgcn_wqm:
return getDefaultMappingVOP(MI);
- case Intrinsic::amdgcn_kernarg_segment_ptr: {
+ case Intrinsic::amdgcn_ds_permute:
+ case Intrinsic::amdgcn_ds_bpermute:
+ case Intrinsic::amdgcn_update_dpp:
+ return getDefaultMappingAllVGPR(MI);
+ case Intrinsic::amdgcn_kernarg_segment_ptr:
+ case Intrinsic::amdgcn_s_getpc:
+ case Intrinsic::amdgcn_groupstaticsize: {
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
break;
@@ -652,16 +1974,142 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_wqm_vote: {
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
OpdsMapping[0] = OpdsMapping[2]
- = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
+ = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size);
+ break;
+ }
+ case Intrinsic::amdgcn_s_buffer_load: {
+ // FIXME: This should be moved to G_INTRINSIC_W_SIDE_EFFECTS
+ Register RSrc = MI.getOperand(2).getReg(); // SGPR
+ Register Offset = MI.getOperand(3).getReg(); // SGPR/imm
+
+ unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ unsigned Size2 = MRI.getType(RSrc).getSizeInBits();
+ unsigned Size3 = MRI.getType(Offset).getSizeInBits();
+
+ unsigned RSrcBank = getRegBankID(RSrc, MRI, *TRI);
+ unsigned OffsetBank = getRegBankID(Offset, MRI, *TRI);
+
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size0);
+ OpdsMapping[1] = nullptr; // intrinsic id
+
+ // Lie and claim everything is legal, even though some need to be
+ // SGPRs. applyMapping will have to deal with it as a waterfall loop.
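+ // A waterfall loop has roughly this shape (illustrative pseudo-MIR, one
+ // readfirstlane/compare per 32-bit piece of the operand):
+ //   loop:
+ //     %cur  = V_READFIRSTLANE_B32 %divergent_op
+ //     %eq   = V_CMP_EQ_U32 %cur, %divergent_op
+ //     %save = S_AND_SAVEEXEC_B64 %eq
+ //     ... execute the instruction with the now-uniform %cur ...
+ //     S_XOR_B64 exec, exec, %save
+ //     S_CBRANCH_EXECNZ loop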
+ OpdsMapping[2] = AMDGPU::getValueMapping(RSrcBank, Size2); // rsrc
+ OpdsMapping[3] = AMDGPU::getValueMapping(OffsetBank, Size3);
+ OpdsMapping[4] = nullptr;
+ break;
+ }
+ case Intrinsic::amdgcn_div_scale: {
+ unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size);
+ OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size);
+
+ unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
+ OpdsMapping[3] = AMDGPU::getValueMapping(
+ getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI), SrcSize);
+ OpdsMapping[4] = AMDGPU::getValueMapping(
+ getRegBankID(MI.getOperand(4).getReg(), MRI, *TRI), SrcSize);
+
+ break;
+ }
+ case Intrinsic::amdgcn_class: {
+ Register Src0Reg = MI.getOperand(2).getReg();
+ Register Src1Reg = MI.getOperand(3).getReg();
+ unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits();
+ unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits();
+ unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
+ OpdsMapping[2] = AMDGPU::getValueMapping(getRegBankID(Src0Reg, MRI, *TRI),
+ Src0Size);
+ OpdsMapping[3] = AMDGPU::getValueMapping(getRegBankID(Src1Reg, MRI, *TRI),
+ Src1Size);
+ break;
+ }
+ case Intrinsic::amdgcn_icmp:
+ case Intrinsic::amdgcn_fcmp: {
+ unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ // This is not VCCRegBank because this is not used in boolean contexts.
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
+ unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
+ unsigned Op1Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
+ unsigned Op2Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);
+ OpdsMapping[2] = AMDGPU::getValueMapping(Op1Bank, OpSize);
+ OpdsMapping[3] = AMDGPU::getValueMapping(Op2Bank, OpSize);
+ break;
+ }
+ case Intrinsic::amdgcn_readlane: {
+ // This must be an SGPR, but accept a VGPR.
+ unsigned IdxReg = MI.getOperand(3).getReg();
+ unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
+ unsigned IdxBank = getRegBankID(IdxReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
+ OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
+ LLVM_FALLTHROUGH;
+ }
+ case Intrinsic::amdgcn_readfirstlane: {
+ unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
+ OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
+ break;
+ }
+ case Intrinsic::amdgcn_writelane: {
+ unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ unsigned SrcReg = MI.getOperand(2).getReg();
+ unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
+ unsigned SrcBank = getRegBankID(SrcReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
+ unsigned IdxReg = MI.getOperand(3).getReg();
+ unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
+ unsigned IdxBank = getRegBankID(IdxReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
+
+ // These 2 must be SGPRs, but accept VGPRs. Readfirstlane will be inserted
+ // to legalize.
+ OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize);
+ OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
+ OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
+ break;
+ }
+ case Intrinsic::amdgcn_if_break: {
+ unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
+ OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
+ OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
break;
}
}
break;
}
case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
- switch (MI.getOperand(0).getIntrinsicID()) {
+ switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
default:
return getInvalidInstructionMapping();
+ case Intrinsic::amdgcn_s_getreg:
+ case Intrinsic::amdgcn_s_memtime:
+ case Intrinsic::amdgcn_s_memrealtime:
+ case Intrinsic::amdgcn_s_get_waveid_in_workgroup: {
+ unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
+ break;
+ }
+ case Intrinsic::amdgcn_ds_append:
+ case Intrinsic::amdgcn_ds_consume:
+ case Intrinsic::amdgcn_ds_fadd:
+ case Intrinsic::amdgcn_ds_fmin:
+ case Intrinsic::amdgcn_ds_fmax:
+ case Intrinsic::amdgcn_atomic_inc:
+ case Intrinsic::amdgcn_atomic_dec:
+ return getDefaultMappingAllVGPR(MI);
+ case Intrinsic::amdgcn_ds_ordered_add:
+ case Intrinsic::amdgcn_ds_ordered_swap: {
+ unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
+ unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
+ AMDGPU::SGPRRegBankID);
+ OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32);
+ OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
+ break;
+ }
case Intrinsic::amdgcn_exp_compr:
OpdsMapping[0] = nullptr; // IntrinsicID
// FIXME: These are immediate values which can't be read from registers.
@@ -688,24 +2136,82 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[7] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
OpdsMapping[8] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
break;
+ case Intrinsic::amdgcn_buffer_load: {
+ Register RSrc = MI.getOperand(2).getReg(); // SGPR
+ Register VIndex = MI.getOperand(3).getReg(); // VGPR
+ Register Offset = MI.getOperand(4).getReg(); // SGPR/VGPR/imm
+
+ unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ unsigned Size2 = MRI.getType(RSrc).getSizeInBits();
+ unsigned Size3 = MRI.getType(VIndex).getSizeInBits();
+ unsigned Size4 = MRI.getType(Offset).getSizeInBits();
+
+ unsigned RSrcBank = getRegBankID(RSrc, MRI, *TRI);
+ unsigned OffsetBank = getRegBankID(Offset, MRI, *TRI);
+
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size0);
+ OpdsMapping[1] = nullptr; // intrinsic id
+
+ // Lie and claim everything is legal, even though some need to be
+ // SGPRs. applyMapping will have to deal with it as a waterfall loop.
+ OpdsMapping[2] = AMDGPU::getValueMapping(RSrcBank, Size2); // rsrc
+ OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size3);
+ OpdsMapping[4] = AMDGPU::getValueMapping(OffsetBank, Size4);
+ OpdsMapping[5] = nullptr;
+ OpdsMapping[6] = nullptr;
+ break;
+ }
+ case Intrinsic::amdgcn_s_sendmsg:
+ case Intrinsic::amdgcn_s_sendmsghalt: {
+ // This must be an SGPR, but accept a VGPR.
+ unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
+ AMDGPU::SGPRRegBankID);
+ OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
+ break;
+ }
+ case Intrinsic::amdgcn_end_cf: {
+ unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
+ OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
+ break;
+ }
}
break;
}
case AMDGPU::G_SELECT: {
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
- unsigned Op1Bank = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
+ unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
AMDGPU::SGPRRegBankID);
- unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
- unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);
- bool SGPRSrcs = Op1Bank == AMDGPU::SCCRegBankID &&
- Op2Bank == AMDGPU::SGPRRegBankID &&
+ unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI,
+ AMDGPU::SGPRRegBankID);
+ bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID &&
Op3Bank == AMDGPU::SGPRRegBankID;
- unsigned Bank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
- Op1Bank = SGPRSrcs ? AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID;
- OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);
- OpdsMapping[1] = AMDGPU::getValueMapping(Op1Bank, 1);
- OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);
- OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);
+
+ unsigned CondBankDefault = SGPRSrcs ?
+ AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID;
+ unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
+ CondBankDefault);
+ if (CondBank == AMDGPU::SGPRRegBankID)
+ CondBank = SGPRSrcs ? AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID;
+ else if (CondBank == AMDGPU::VGPRRegBankID)
+ CondBank = AMDGPU::VCCRegBankID;
+
+ unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SCCRegBankID ?
+ AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
+
+ assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SCCRegBankID);
+
+ if (Size == 64) {
+ OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
+ OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
+ OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
+ OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
+ } else {
+ OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);
+ OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
+ OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);
+ OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);
+ }
+
break;
}
@@ -737,6 +2243,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
}
}
- return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
+ return getInstructionMapping(/*ID*/1, /*Cost*/1,
+ getOperandsMapping(OpdsMapping),
MI.getNumOperands());
}
+
diff --git a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
index d29f4bc79a51..f3a96e2a6128 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
+++ b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
@@ -1,9 +1,8 @@
//===- AMDGPURegisterBankInfo -----------------------------------*- C++ -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
@@ -14,6 +13,7 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERBANKINFO_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERBANKINFO_H
+#include "llvm/CodeGen/Register.h"
#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
#define GET_REGBANK_DECLARATIONS
@@ -22,6 +22,8 @@
namespace llvm {
+class LLT;
+class MachineIRBuilder;
class SIRegisterInfo;
class TargetRegisterInfo;
@@ -36,16 +38,53 @@ protected:
class AMDGPURegisterBankInfo : public AMDGPUGenRegisterBankInfo {
const SIRegisterInfo *TRI;
+ void executeInWaterfallLoop(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ ArrayRef<unsigned> OpIndices) const;
+
+ void constrainOpWithReadfirstlane(MachineInstr &MI, MachineRegisterInfo &MRI,
+ unsigned OpIdx) const;
+ bool applyMappingWideLoad(MachineInstr &MI,
+ const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
+ MachineRegisterInfo &MRI) const;
+
/// See RegisterBankInfo::applyMapping.
void applyMappingImpl(const OperandsMapper &OpdMapper) const override;
const RegisterBankInfo::InstructionMapping &
getInstrMappingForLoad(const MachineInstr &MI) const;
- unsigned getRegBankID(unsigned Reg, const MachineRegisterInfo &MRI,
+ unsigned getRegBankID(Register Reg, const MachineRegisterInfo &MRI,
const TargetRegisterInfo &TRI,
unsigned Default = AMDGPU::VGPRRegBankID) const;
+ /// Split 64-bit value \p Reg into two 32-bit halves and populate them into \p
+ /// Regs. This appropriately sets the regbank of the new registers.
+ void split64BitValueForMapping(MachineIRBuilder &B,
+ SmallVector<Register, 2> &Regs,
+ LLT HalfTy,
+ Register Reg) const;
+
+ template <unsigned NumOps>
+ struct OpRegBankEntry {
+ int8_t RegBanks[NumOps];
+ int16_t Cost;
+ };
+
+ template <unsigned NumOps>
+ InstructionMappings
+ addMappingFromTable(const MachineInstr &MI, const MachineRegisterInfo &MRI,
+ const std::array<unsigned, NumOps> RegSrcOpIdx,
+ ArrayRef<OpRegBankEntry<NumOps>> Table) const;
+
+ RegisterBankInfo::InstructionMappings
+ getInstrAlternativeMappingsIntrinsic(
+ const MachineInstr &MI, const MachineRegisterInfo &MRI) const;
+
+ RegisterBankInfo::InstructionMappings
+ getInstrAlternativeMappingsIntrinsicWSideEffects(
+ const MachineInstr &MI, const MachineRegisterInfo &MRI) const;
+
bool isSALUMapping(const MachineInstr &MI) const;
const InstructionMapping &getDefaultMappingSOP(const MachineInstr &MI) const;
const InstructionMapping &getDefaultMappingVOP(const MachineInstr &MI) const;
@@ -57,6 +96,9 @@ public:
unsigned copyCost(const RegisterBank &A, const RegisterBank &B,
unsigned Size) const override;
+ unsigned getBreakDownCost(const ValueMapping &ValMapping,
+ const RegisterBank *CurBank = nullptr) const override;
+
const RegisterBank &
getRegBankFromRegClass(const TargetRegisterClass &RC) const override;
diff --git a/lib/Target/AMDGPU/AMDGPURegisterBanks.td b/lib/Target/AMDGPU/AMDGPURegisterBanks.td
index 570379a820e1..9555694fb106 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterBanks.td
+++ b/lib/Target/AMDGPU/AMDGPURegisterBanks.td
@@ -1,9 +1,8 @@
//=- AMDGPURegisterBank.td - Describe the AMDGPU Banks -------*- tablegen -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -15,7 +14,7 @@ def VGPRRegBank : RegisterBank<"VGPR",
[VGPR_32, VReg_64, VReg_96, VReg_128, VReg_256, VReg_512]
>;
-def SCCRegBank : RegisterBank <"SCC", [SCC_CLASS]>;
+def SCCRegBank : RegisterBank <"SCC", [SReg_32, SCC_CLASS]>;
// It is helpful to distinguish conditions from ordinary SGPRs.
def VCCRegBank : RegisterBank <"VCC", [SReg_64]>;
diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp b/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp
index 50f859addc2b..7cffdf1a4dcf 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp
@@ -1,9 +1,8 @@
//===-- AMDGPURegisterInfo.cpp - AMDGPU Register Information -------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -32,7 +31,10 @@ unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) {
AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4,
AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, AMDGPU::sub8, AMDGPU::sub9,
AMDGPU::sub10, AMDGPU::sub11, AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14,
- AMDGPU::sub15
+ AMDGPU::sub15, AMDGPU::sub16, AMDGPU::sub17, AMDGPU::sub18, AMDGPU::sub19,
+ AMDGPU::sub20, AMDGPU::sub21, AMDGPU::sub22, AMDGPU::sub23, AMDGPU::sub24,
+ AMDGPU::sub25, AMDGPU::sub26, AMDGPU::sub27, AMDGPU::sub28, AMDGPU::sub29,
+ AMDGPU::sub30, AMDGPU::sub31
};
assert(Channel < array_lengthof(SubRegs));
@@ -83,7 +85,18 @@ const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
}
}
-unsigned SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+ const SIFrameLowering *TFI =
+ MF.getSubtarget<GCNSubtarget>().getFrameLowering();
const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
- return FuncInfo->getFrameOffsetReg();
+ return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg()
+ : FuncInfo->getStackPtrOffsetReg();
+}
+
+const uint32_t *SIRegisterInfo::getAllVGPRRegMask() const {
+ return CSR_AMDGPU_AllVGPRs_RegMask;
+}
+
+const uint32_t *SIRegisterInfo::getAllAllocatableSRegMask() const {
+ return CSR_AMDGPU_AllAllocatableSRegs_RegMask;
}
diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.h b/lib/Target/AMDGPU/AMDGPURegisterInfo.h
index 922d974f2ebd..3453a8c1b0b3 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterInfo.h
+++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.h
@@ -1,9 +1,8 @@
//===-- AMDGPURegisterInfo.h - AMDGPURegisterInfo Interface -*- C++ -*-----===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.td b/lib/Target/AMDGPU/AMDGPURegisterInfo.td
index ceabae524414..ab71b7aa8a57 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterInfo.td
+++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.td
@@ -1,9 +1,8 @@
//===-- AMDGPURegisterInfo.td - AMDGPU register info -------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -13,7 +12,7 @@
let Namespace = "AMDGPU" in {
-foreach Index = 0-15 in {
+foreach Index = 0-31 in {
def sub#Index : SubRegIndex<32, !shl(Index, 5)>;
}
diff --git a/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp b/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
index efe501cb73c2..4f095087a57f 100644
--- a/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
+++ b/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
@@ -1,9 +1,8 @@
//===- AMDGPURewriteOutArgumentsPass.cpp - Create struct returns ----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index 9dbd7751b4d8..f8703c36127a 100644
--- a/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -1,9 +1,8 @@
//===-- AMDGPUSearchableTables.td - ------------------------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -49,6 +48,8 @@ def : SourceOfDivergence<int_amdgcn_workitem_id_z>;
def : SourceOfDivergence<int_amdgcn_interp_mov>;
def : SourceOfDivergence<int_amdgcn_interp_p1>;
def : SourceOfDivergence<int_amdgcn_interp_p2>;
+def : SourceOfDivergence<int_amdgcn_interp_p1_f16>;
+def : SourceOfDivergence<int_amdgcn_interp_p2_f16>;
def : SourceOfDivergence<int_amdgcn_mbcnt_hi>;
def : SourceOfDivergence<int_amdgcn_mbcnt_lo>;
def : SourceOfDivergence<int_r600_read_tidig_x>;
@@ -70,8 +71,59 @@ def : SourceOfDivergence<int_amdgcn_buffer_atomic_and>;
def : SourceOfDivergence<int_amdgcn_buffer_atomic_or>;
def : SourceOfDivergence<int_amdgcn_buffer_atomic_xor>;
def : SourceOfDivergence<int_amdgcn_buffer_atomic_cmpswap>;
+def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_swap>;
+def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_add>;
+def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_sub>;
+def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_smin>;
+def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_umin>;
+def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_smax>;
+def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_umax>;
+def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_and>;
+def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_or>;
+def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_xor>;
+def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_cmpswap>;
+def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_swap>;
+def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_add>;
+def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_sub>;
+def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_smin>;
+def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_umin>;
+def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_smax>;
+def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_umax>;
+def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_and>;
+def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_or>;
+def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_xor>;
+def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_cmpswap>;
def : SourceOfDivergence<int_amdgcn_ps_live>;
def : SourceOfDivergence<int_amdgcn_ds_swizzle>;
+def : SourceOfDivergence<int_amdgcn_ds_ordered_add>;
+def : SourceOfDivergence<int_amdgcn_ds_ordered_swap>;
+def : SourceOfDivergence<int_amdgcn_permlane16>;
+def : SourceOfDivergence<int_amdgcn_permlanex16>;
+def : SourceOfDivergence<int_amdgcn_mov_dpp>;
+def : SourceOfDivergence<int_amdgcn_mov_dpp8>;
+def : SourceOfDivergence<int_amdgcn_update_dpp>;
+
+def : SourceOfDivergence<int_amdgcn_mfma_f32_4x4x1f32>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_4x4x1f32>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_4x4x4f16>;
+def : SourceOfDivergence<int_amdgcn_mfma_i32_4x4x4i8>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_4x4x2bf16>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x1f32>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x4f32>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x4f16>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x16f16>;
+def : SourceOfDivergence<int_amdgcn_mfma_i32_16x16x4i8>;
+def : SourceOfDivergence<int_amdgcn_mfma_i32_16x16x16i8>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x2bf16>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x8bf16>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x1f32>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x2f32>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x4f16>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x8f16>;
+def : SourceOfDivergence<int_amdgcn_mfma_i32_32x32x4i8>;
+def : SourceOfDivergence<int_amdgcn_mfma_i32_32x32x8i8>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x2bf16>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x4bf16>;
foreach intr = AMDGPUImageDimAtomicIntrinsics in
def : SourceOfDivergence<intr>;
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index ed0cc70c3d9a..1eb9b83456c5 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -1,9 +1,8 @@
//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -41,12 +40,17 @@ using namespace llvm;
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"
+static cl::opt<bool> DisablePowerSched(
+ "amdgpu-disable-power-sched",
+ cl::desc("Disable scheduling to minimize mAI power bursts"),
+ cl::init(false));
+
GCNSubtarget::~GCNSubtarget() = default;
R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
StringRef GPU, StringRef FS) {
- SmallString<256> FullFS("+promote-alloca,+dx10-clamp,");
+ SmallString<256> FullFS("+promote-alloca,");
FullFS += FS;
ParseSubtargetFeatures(GPU, FullFS);
@@ -65,7 +69,7 @@ R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
- StringRef GPU, StringRef FS) {
+ StringRef GPU, StringRef FS) {
// Determine default and user-specified characteristics
// On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
// enabled, but some instructions do not respect them and they run at the
@@ -78,10 +82,11 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
// Similarly we want enable-prt-strict-null to be on by default and not to
// unset everything else if it is disabled
- SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,");
+ // Assuming ECC is enabled is the conservative default.
+ SmallString<256> FullFS("+promote-alloca,+load-store-opt,+sram-ecc,+xnack,");
if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
- FullFS += "+flat-address-space,+flat-for-global,+unaligned-buffer-access,+trap-handler,";
+ FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";
// FIXME: I don't think Evergreen has any useful support for
// denormals, but should be checked. Should we issue a warning somewhere
@@ -94,6 +99,16 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS
+ // Disable mutually exclusive bits.
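+ // For example (illustrative): a feature string containing "+wavefrontsize32"
+ // gets "-wavefrontsize16,-wavefrontsize64," prepended, so only the requested
+ // size remains enabled and FS itself can still override the defaults.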
+ if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
+ if (FS.find_lower("wavefrontsize16") == StringRef::npos)
+ FullFS += "-wavefrontsize16,";
+ if (FS.find_lower("wavefrontsize32") == StringRef::npos)
+ FullFS += "-wavefrontsize32,";
+ if (FS.find_lower("wavefrontsize64") == StringRef::npos)
+ FullFS += "-wavefrontsize64,";
+ }
+
FullFS += FS;
ParseSubtargetFeatures(GPU, FullFS);
@@ -124,8 +139,25 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
HasMovrel = true;
}
+ // Don't crash on invalid devices.
+ if (WavefrontSize == 0)
+ WavefrontSize = 64;
+
HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
+ if (DoesNotSupportXNACK && EnableXNACK) {
+ ToggleFeature(AMDGPU::FeatureXNACK);
+ EnableXNACK = false;
+ }
+
+ // ECC is on by default, but turn it off if the hardware doesn't support it.
+ // This matters for the gfx9 targets that have d16 loads but don't support
+ // ECC.
+ if (DoesNotSupportSRAMECC && EnableSRAMECC) {
+ ToggleFeature(AMDGPU::FeatureSRAMECC);
+ EnableSRAMECC = false;
+ }
+
return *this;
}
@@ -152,8 +184,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
AMDGPUGenSubtargetInfo(TT, GPU, FS),
AMDGPUSubtarget(TT),
TargetTriple(TT),
- Gen(SOUTHERN_ISLANDS),
- IsaVersion(ISAVersion0_0_0),
+ Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
InstrItins(getInstrItineraryForCPU(GPU)),
LDSBankCount(0),
MaxPrivateElementSize(0),
@@ -162,7 +193,6 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
HalfRate64Ops(false),
FP64FP16Denormals(false),
- DX10Clamp(false),
FlatForGlobal(false),
AutoWaitcntBeforeBarrier(false),
CodeObjectV3(false),
@@ -171,11 +201,10 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
HasApertureRegs(false),
EnableXNACK(false),
+ DoesNotSupportXNACK(false),
+ EnableCuMode(false),
TrapHandler(false),
- DebuggerInsertNops(false),
- DebuggerEmitPrologue(false),
- EnableHugePrivateBuffer(false),
EnableLoadStoreOpt(false),
EnableUnsafeDSOffsetFolding(false),
EnableSIScheduler(false),
@@ -186,8 +215,10 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
FP64(false),
GCN3Encoding(false),
CIInsts(false),
- VIInsts(false),
+ GFX8Insts(false),
GFX9Insts(false),
+ GFX10Insts(false),
+ GFX7GFX8GFX9Insts(false),
SGPRInitBug(false),
HasSMemRealTime(false),
HasIntClamp(false),
@@ -202,19 +233,47 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
HasSDWAMac(false),
HasSDWAOutModsVOPC(false),
HasDPP(false),
+ HasDPP8(false),
HasR128A16(false),
+ HasNSAEncoding(false),
HasDLInsts(false),
- HasDotInsts(false),
+ HasDot1Insts(false),
+ HasDot2Insts(false),
+ HasDot3Insts(false),
+ HasDot4Insts(false),
+ HasDot5Insts(false),
+ HasDot6Insts(false),
+ HasMAIInsts(false),
+ HasPkFmacF16Inst(false),
+ HasAtomicFaddInsts(false),
EnableSRAMECC(false),
+ DoesNotSupportSRAMECC(false),
+ HasNoSdstCMPX(false),
+ HasVscnt(false),
+ HasRegisterBanking(false),
+ HasVOP3Literal(false),
+ HasNoDataDepHazard(false),
FlatAddressSpace(false),
FlatInstOffsets(false),
FlatGlobalInsts(false),
FlatScratchInsts(false),
+ ScalarFlatScratchInsts(false),
AddNoCarryInsts(false),
HasUnpackedD16VMem(false),
+ LDSMisalignedBug(false),
ScalarizeGlobal(false),
+ HasVcmpxPermlaneHazard(false),
+ HasVMEMtoScalarWriteHazard(false),
+ HasSMEMtoVectorWriteHazard(false),
+ HasInstFwdPrefetchBug(false),
+ HasVcmpxExecWARHazard(false),
+ HasLdsBranchVmemWARHazard(false),
+ HasNSAtoVMEMBug(false),
+ HasOffset3fBug(false),
+ HasFlatSegmentOffsetBug(false),
+
FeatureDisable(false),
InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
TLInfo(TM, *this),
@@ -226,12 +285,34 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
*this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}
+unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
+ if (getGeneration() < GFX10)
+ return 1;
+
+ switch (Opcode) {
+ case AMDGPU::V_LSHLREV_B64:
+ case AMDGPU::V_LSHLREV_B64_gfx10:
+ case AMDGPU::V_LSHL_B64:
+ case AMDGPU::V_LSHRREV_B64:
+ case AMDGPU::V_LSHRREV_B64_gfx10:
+ case AMDGPU::V_LSHR_B64:
+ case AMDGPU::V_ASHRREV_I64:
+ case AMDGPU::V_ASHRREV_I64_gfx10:
+ case AMDGPU::V_ASHR_I64:
+ return 1;
+ }
+
+ return 2;
+}
+
unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
const Function &F) const {
if (NWaves == 1)
return getLocalMemorySize();
unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
+ if (!WorkGroupsPerCu)
+ return 0;
unsigned MaxWaves = getMaxWavesPerEU();
return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}
@@ -240,6 +321,8 @@ unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
const Function &F) const {
unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
+ if (!WorkGroupsPerCu)
+ return 0;
unsigned MaxWaves = getMaxWavesPerEU();
unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
@@ -260,7 +343,8 @@ AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
case CallingConv::AMDGPU_CS:
case CallingConv::AMDGPU_KERNEL:
case CallingConv::SPIR_KERNEL:
- return std::make_pair(getWavefrontSize() * 2, getWavefrontSize() * 4);
+ return std::make_pair(getWavefrontSize() * 2,
+ std::max(getWavefrontSize() * 4, 256u));
case CallingConv::AMDGPU_VS:
case CallingConv::AMDGPU_LS:
case CallingConv::AMDGPU_HS:
@@ -280,12 +364,6 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
std::pair<unsigned, unsigned> Default =
getDefaultFlatWorkGroupSize(F.getCallingConv());
- // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
- // starts using "amdgpu-flat-work-group-size" attribute.
- Default.second = AMDGPU::getIntegerAttribute(
- F, "amdgpu-max-work-group-size", Default.second);
- Default.first = std::min(Default.first, Default.second);
-
// Requested minimum/maximum flat work group sizes.
std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
F, "amdgpu-flat-work-group-size", Default);
@@ -319,10 +397,7 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
getMaxWavesPerEU(FlatWorkGroupSizes.second);
bool RequestedFlatWorkGroupSize = false;
- // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
- // starts using "amdgpu-flat-work-group-size" attribute.
- if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
- F.hasFnAttribute("amdgpu-flat-work-group-size")) {
+ if (F.hasFnAttribute("amdgpu-flat-work-group-size")) {
Default.first = MinImpliedByFlatWorkGroupSize;
RequestedFlatWorkGroupSize = true;
}
@@ -460,7 +535,6 @@ R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
FMA(false),
CaymanISA(false),
CFALUBug(false),
- DX10Clamp(false),
HasVertexCache(false),
R600ALUInst(false),
FP64(false),
@@ -486,7 +560,14 @@ void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
Policy.ShouldTrackLaneMasks = true;
}
+bool GCNSubtarget::hasMadF16() const {
+ return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
+}
+
unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
+ if (getGeneration() >= AMDGPUSubtarget::GFX10)
+ return 10;
+
if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
if (SGPRs <= 80)
return 10;
@@ -533,6 +614,9 @@ unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
+ if (getGeneration() >= AMDGPUSubtarget::GFX10)
+ return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.
+
if (MFI.hasFlatScratchInit()) {
if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
@@ -631,9 +715,7 @@ struct MemOpClusterMutation : ScheduleDAGMutation {
MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}
- void apply(ScheduleDAGInstrs *DAGInstrs) override {
- ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
-
+ void apply(ScheduleDAGInstrs *DAG) override {
SUnit *SUa = nullptr;
// Search for two consecutive memory operations and link them
// to prevent scheduler from moving them apart.
@@ -674,11 +756,130 @@ struct MemOpClusterMutation : ScheduleDAGMutation {
}
}
};
+
+struct FillMFMAShadowMutation : ScheduleDAGMutation {
+ const SIInstrInfo *TII;
+
+ ScheduleDAGMI *DAG;
+
+ FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}
+
+ bool isSALU(const SUnit *SU) const {
+ const MachineInstr *MI = SU->getInstr();
+ return MI && TII->isSALU(*MI) && !MI->isTerminator();
+ }
+
+ bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
+ if (Pred->NodeNum < Succ->NodeNum)
+ return true;
+
+ SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});
+
+ for (unsigned I = 0; I < Succs.size(); ++I) {
+ for (const SDep &SI : Succs[I]->Succs) {
+ const SUnit *SU = SI.getSUnit();
+ if (SU != Succs[I] && llvm::find(Succs, SU) == Succs.end())
+ Succs.push_back(SU);
+ }
+ }
+
+ SmallPtrSet<const SUnit*, 32> Visited;
+ while (!Preds.empty()) {
+ const SUnit *SU = Preds.pop_back_val();
+ if (llvm::find(Succs, SU) != Succs.end())
+ return false;
+ Visited.insert(SU);
+ for (const SDep &SI : SU->Preds)
+ if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
+ Preds.push_back(SI.getSUnit());
+ }
+
+ return true;
+ }
+
+  // Link as many SALU instructions in a chain as possible. Return the size
+  // of the chain. Links up to MaxChain instructions.
+ unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
+ SmallPtrSetImpl<SUnit *> &Visited) const {
+ SmallVector<SUnit *, 8> Worklist({To});
+ unsigned Linked = 0;
+
+ while (!Worklist.empty() && MaxChain-- > 0) {
+ SUnit *SU = Worklist.pop_back_val();
+ if (!Visited.insert(SU).second)
+ continue;
+
+ LLVM_DEBUG(dbgs() << "Inserting edge from\n" ; DAG->dumpNode(*From);
+ dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');
+
+ if (SU->addPred(SDep(From, SDep::Artificial), false))
+ ++Linked;
+
+ for (SDep &SI : From->Succs) {
+ SUnit *SUv = SI.getSUnit();
+ if (SUv != From && TII->isVALU(*SUv->getInstr()) && canAddEdge(SUv, SU))
+ SUv->addPred(SDep(SU, SDep::Artificial), false);
+ }
+
+ for (SDep &SI : SU->Succs) {
+ SUnit *Succ = SI.getSUnit();
+ if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
+ Worklist.push_back(Succ);
+ }
+ }
+
+ return Linked;
+ }
+
+ void apply(ScheduleDAGInstrs *DAGInstrs) override {
+ const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
+ if (!ST.hasMAIInsts() || DisablePowerSched)
+ return;
+ DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
+ const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
+ if (!TSchedModel || DAG->SUnits.empty())
+ return;
+
+    // Scan for long-latency MFMA instructions and try to add dependencies on
+    // available SALU instructions to give them a chance to fill the MFMA
+    // shadow. Filling the shadow with SALU rather than VALU instructions is
+    // desirable to prevent power consumption bursts and throttling.
+ auto LastSALU = DAG->SUnits.begin();
+ auto E = DAG->SUnits.end();
+ SmallPtrSet<SUnit*, 32> Visited;
+ for (SUnit &SU : DAG->SUnits) {
+ MachineInstr &MAI = *SU.getInstr();
+ if (!TII->isMAI(MAI) ||
+ MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32 ||
+ MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32)
+ continue;
+
+ unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;
+
+ LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
+ dbgs() << "Need " << Lat
+ << " instructions to cover latency.\n");
+
+ // Find up to Lat independent scalar instructions as early as
+ // possible such that they can be scheduled after this MFMA.
+ for ( ; Lat && LastSALU != E; ++LastSALU) {
+ if (Visited.count(&*LastSALU))
+ continue;
+
+ if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
+ continue;
+
+ Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
+ }
+ }
+ }
+};
} // namespace
void GCNSubtarget::getPostRAMutations(
std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
+ Mutations.push_back(llvm::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}
const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
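The FillMFMAShadowMutation added above follows the usual post-RA ScheduleDAGMutation pattern: walk DAG->SUnits and add artificial edges so the scheduler keeps a chosen ordering. A stripped-down sketch of that pattern is below; the class name and the particular edge choice are made up for illustration, only the LLVM API calls are real.

#include "llvm/CodeGen/ScheduleDAGInstrs.h"
#include "llvm/CodeGen/ScheduleDAGMutation.h"

namespace {
struct KeepLastAfterFirst : llvm::ScheduleDAGMutation {
  void apply(llvm::ScheduleDAGInstrs *DAG) override {
    if (DAG->SUnits.size() < 2)
      return;
    llvm::SUnit &First = DAG->SUnits.front();
    llvm::SUnit &Last = DAG->SUnits.back();
    // An artificial edge carries no data dependence; it only tells the
    // scheduler to keep Last after First. FillMFMAShadowMutation uses the
    // same call to park SALU work in the shadow of an MFMA.
    Last.addPred(llvm::SDep(&First, llvm::SDep::Artificial), /*Required=*/false);
  }
};
} // namespace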
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 5584759e5580..78c3b823946d 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -1,9 +1,8 @@
//=====-- AMDGPUSubtarget.h - Define Subtarget for AMDGPU ------*- C++ -*-====//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//==-----------------------------------------------------------------------===//
//
@@ -56,7 +55,8 @@ public:
SOUTHERN_ISLANDS = 4,
SEA_ISLANDS = 5,
VOLCANIC_ISLANDS = 6,
- GFX9 = 7
+ GFX9 = 7,
+ GFX10 = 8
};
private:
@@ -246,26 +246,6 @@ public:
class GCNSubtarget : public AMDGPUGenSubtargetInfo,
public AMDGPUSubtarget {
public:
- enum {
- ISAVersion0_0_0,
- ISAVersion6_0_0,
- ISAVersion6_0_1,
- ISAVersion7_0_0,
- ISAVersion7_0_1,
- ISAVersion7_0_2,
- ISAVersion7_0_3,
- ISAVersion7_0_4,
- ISAVersion8_0_1,
- ISAVersion8_0_2,
- ISAVersion8_0_3,
- ISAVersion8_1_0,
- ISAVersion9_0_0,
- ISAVersion9_0_2,
- ISAVersion9_0_4,
- ISAVersion9_0_6,
- ISAVersion9_0_9,
- };
-
enum TrapHandlerAbi {
TrapHandlerAbiNone = 0,
TrapHandlerAbiHsa = 1
@@ -297,7 +277,6 @@ protected:
// Basic subtarget description.
Triple TargetTriple;
unsigned Gen;
- unsigned IsaVersion;
InstrItineraryData InstrItins;
int LDSBankCount;
unsigned MaxPrivateElementSize;
@@ -308,7 +287,6 @@ protected:
  // Dynamically set bits that enable features.
bool FP64FP16Denormals;
- bool DX10Clamp;
bool FlatForGlobal;
bool AutoWaitcntBeforeBarrier;
bool CodeObjectV3;
@@ -316,12 +294,11 @@ protected:
bool UnalignedBufferAccess;
bool HasApertureRegs;
bool EnableXNACK;
+ bool DoesNotSupportXNACK;
+ bool EnableCuMode;
bool TrapHandler;
- bool DebuggerInsertNops;
- bool DebuggerEmitPrologue;
// Used as options.
- bool EnableHugePrivateBuffer;
bool EnableLoadStoreOpt;
bool EnableUnsafeDSOffsetFolding;
bool EnableSIScheduler;
@@ -336,8 +313,10 @@ protected:
bool IsGCN;
bool GCN3Encoding;
bool CIInsts;
- bool VIInsts;
+ bool GFX8Insts;
bool GFX9Insts;
+ bool GFX10Insts;
+ bool GFX7GFX8GFX9Insts;
bool SGPRInitBug;
bool HasSMemRealTime;
bool HasIntClamp;
@@ -352,23 +331,51 @@ protected:
bool HasSDWAMac;
bool HasSDWAOutModsVOPC;
bool HasDPP;
+ bool HasDPP8;
bool HasR128A16;
+ bool HasNSAEncoding;
bool HasDLInsts;
- bool HasDotInsts;
+ bool HasDot1Insts;
+ bool HasDot2Insts;
+ bool HasDot3Insts;
+ bool HasDot4Insts;
+ bool HasDot5Insts;
+ bool HasDot6Insts;
+ bool HasMAIInsts;
+ bool HasPkFmacF16Inst;
+ bool HasAtomicFaddInsts;
bool EnableSRAMECC;
+ bool DoesNotSupportSRAMECC;
+ bool HasNoSdstCMPX;
+ bool HasVscnt;
+ bool HasRegisterBanking;
+ bool HasVOP3Literal;
+ bool HasNoDataDepHazard;
bool FlatAddressSpace;
bool FlatInstOffsets;
bool FlatGlobalInsts;
bool FlatScratchInsts;
+ bool ScalarFlatScratchInsts;
bool AddNoCarryInsts;
bool HasUnpackedD16VMem;
bool R600ALUInst;
bool CaymanISA;
bool CFALUBug;
+ bool LDSMisalignedBug;
bool HasVertexCache;
short TexVTXClauseSize;
bool ScalarizeGlobal;
+ bool HasVcmpxPermlaneHazard;
+ bool HasVMEMtoScalarWriteHazard;
+ bool HasSMEMtoVectorWriteHazard;
+ bool HasInstFwdPrefetchBug;
+ bool HasVcmpxExecWARHazard;
+ bool HasLdsBranchVmemWARHazard;
+ bool HasNSAtoVMEMBug;
+ bool HasOffset3fBug;
+ bool HasFlatSegmentOffsetBug;
+
// Dummy feature to use for assembler in tablegen.
bool FeatureDisable;
@@ -378,6 +385,9 @@ private:
SITargetLowering TLInfo;
SIFrameLowering FrameLowering;
+ // See COMPUTE_TMPRING_SIZE.WAVESIZE, 13-bit field in units of 256-dword.
+ static const unsigned MaxWaveScratchSize = (256 * 4) * ((1 << 13) - 1);
+
public:
GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
const GCNTargetMachine &TM);
@@ -437,6 +447,11 @@ public:
return Log2_32(WavefrontSize);
}
+  /// Return the number of high bits known to be zero for a frame index.
+ unsigned getKnownHighZeroBitsForFrameIndex() const {
+ return countLeadingZeros(MaxWaveScratchSize) + getWavefrontSizeLog2();
+ }
+
int getLDSBankCount() const {
return LDSBankCount;
}
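As a sanity check on getKnownHighZeroBitsForFrameIndex() above, here is a standalone sketch of the arithmetic for a wave64 target, assuming 32-bit values; the helper below is a stand-in for llvm::countLeadingZeros and is not part of this commit.

#include <cassert>
#include <cstdint>

static unsigned countLeadingZeros32(uint32_t V) {
  unsigned N = 0;
  for (uint32_t Bit = 1u << 31; Bit != 0 && (V & Bit) == 0; Bit >>= 1)
    ++N;
  return N;
}

int main() {
  // MaxWaveScratchSize = (256 dwords * 4 bytes) * (2^13 - 1) = 8387584 bytes.
  const uint32_t MaxWaveScratchSize = (256 * 4) * ((1u << 13) - 1);
  const unsigned WavefrontSizeLog2 = 6; // wave64
  assert(countLeadingZeros32(MaxWaveScratchSize) == 9); // value fits in 23 bits
  // Per-lane scratch offsets therefore fit in 32 - 9 - 6 = 17 bits, so the
  // top 15 bits of a frame index are known to be zero.
  assert(countLeadingZeros32(MaxWaveScratchSize) + WavefrontSizeLog2 == 15);
  return 0;
}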
@@ -445,6 +460,8 @@ public:
return MaxPrivateElementSize;
}
+ unsigned getConstantBusLimit(unsigned Opcode) const;
+
bool hasIntClamp() const {
return HasIntClamp;
}
@@ -473,6 +490,12 @@ public:
return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS);
}
+ // Return true if the target only has the reverse operand versions of VALU
+ // shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32).
+ bool hasOnlyRevVALUShifts() const {
+ return getGeneration() >= VOLCANIC_ISLANDS;
+ }
+
bool hasBFE() const {
return true;
}
@@ -525,14 +548,48 @@ public:
return isAmdHsaOS() ? TrapHandlerAbiHsa : TrapHandlerAbiNone;
}
- bool enableHugePrivateBuffer() const {
- return EnableHugePrivateBuffer;
+ /// True if the offset field of DS instructions works as expected. On SI, the
+ /// offset uses a 16-bit adder and does not always wrap properly.
+ bool hasUsableDSOffset() const {
+ return getGeneration() >= SEA_ISLANDS;
}
bool unsafeDSOffsetFoldingEnabled() const {
return EnableUnsafeDSOffsetFolding;
}
+ /// Condition output from div_scale is usable.
+ bool hasUsableDivScaleConditionOutput() const {
+ return getGeneration() != SOUTHERN_ISLANDS;
+ }
+
+ /// Extra wait hazard is needed in some cases before
+ /// s_cbranch_vccnz/s_cbranch_vccz.
+ bool hasReadVCCZBug() const {
+ return getGeneration() <= SEA_ISLANDS;
+ }
+
+ /// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR
+ /// was written by a VALU instruction.
+ bool hasSMRDReadVALUDefHazard() const {
+ return getGeneration() == SOUTHERN_ISLANDS;
+ }
+
+ /// A read of an SGPR by a VMEM instruction requires 5 wait states when the
+ /// SGPR was written by a VALU Instruction.
+ bool hasVMEMReadSGPRVALUDefHazard() const {
+ return getGeneration() >= VOLCANIC_ISLANDS;
+ }
+
+ bool hasRFEHazards() const {
+ return getGeneration() >= VOLCANIC_ISLANDS;
+ }
+
+ /// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32.
+ unsigned getSetRegWaitStates() const {
+ return getGeneration() <= SEA_ISLANDS ? 1 : 2;
+ }
+
bool dumpCode() const {
return DumpCode;
}
@@ -554,14 +611,6 @@ public:
return getGeneration() >= AMDGPUSubtarget::GFX9;
}
- bool enableDX10Clamp() const {
- return DX10Clamp;
- }
-
- bool enableIEEEBit(const MachineFunction &MF) const {
- return AMDGPU::isCompute(MF.getFunction().getCallingConv());
- }
-
bool useFlatForGlobal() const {
return FlatForGlobal;
}
@@ -572,6 +621,11 @@ public:
return CIInsts && EnableDS128;
}
+ /// Have v_trunc_f64, v_ceil_f64, v_rndne_f64
+ bool haveRoundOpsF64() const {
+ return CIInsts;
+ }
+
/// \returns If MUBUF instructions always perform range checking, even for
/// buffer resources used for private memory access.
bool privateMemoryResourceIsRangeChecked() const {
@@ -613,10 +667,18 @@ public:
return EnableXNACK;
}
+ bool isCuModeEnabled() const {
+ return EnableCuMode;
+ }
+
bool hasFlatAddressSpace() const {
return FlatAddressSpace;
}
+ bool hasFlatScrRegister() const {
+ return hasFlatAddressSpace();
+ }
+
bool hasFlatInstOffsets() const {
return FlatInstOffsets;
}
@@ -629,6 +691,14 @@ public:
return FlatScratchInsts;
}
+ bool hasScalarFlatScratchInsts() const {
+ return ScalarFlatScratchInsts;
+ }
+
+ bool hasFlatSegmentOffsetBug() const {
+ return HasFlatSegmentOffsetBug;
+ }
+
bool hasFlatLgkmVMemCountInOrder() const {
return getGeneration() > GFX9;
}
@@ -637,12 +707,34 @@ public:
return getGeneration() >= GFX9;
}
+ bool d16PreservesUnusedBits() const {
+ return hasD16LoadStore() && !isSRAMECCEnabled();
+ }
+
+ bool hasD16Images() const {
+ return getGeneration() >= VOLCANIC_ISLANDS;
+ }
+
  /// Return true if most LDS instructions have an m0 use that requires m0 to
  /// be initialized.
bool ldsRequiresM0Init() const {
return getGeneration() < GFX9;
}
+ // True if the hardware rewinds and replays GWS operations if a wave is
+ // preempted.
+ //
+ // If this is false, a GWS operation requires testing if a nack set the
+ // MEM_VIOL bit, and repeating if so.
+ bool hasGWSAutoReplay() const {
+ return getGeneration() >= GFX9;
+ }
+
+  /// \returns true if the target has the ds_gws_sema_release_all instruction.
+ bool hasGWSSemaReleaseAll() const {
+ return CIInsts;
+ }
+
bool hasAddNoCarry() const {
return AddNoCarryInsts;
}
@@ -680,22 +772,74 @@ public:
return HasSDWAOutModsVOPC;
}
- bool vmemWriteNeedsExpWaitcnt() const {
- return getGeneration() < SEA_ISLANDS;
- }
-
bool hasDLInsts() const {
return HasDLInsts;
}
- bool hasDotInsts() const {
- return HasDotInsts;
+ bool hasDot1Insts() const {
+ return HasDot1Insts;
+ }
+
+ bool hasDot2Insts() const {
+ return HasDot2Insts;
+ }
+
+ bool hasDot3Insts() const {
+ return HasDot3Insts;
+ }
+
+ bool hasDot4Insts() const {
+ return HasDot4Insts;
+ }
+
+ bool hasDot5Insts() const {
+ return HasDot5Insts;
+ }
+
+ bool hasDot6Insts() const {
+ return HasDot6Insts;
+ }
+
+ bool hasMAIInsts() const {
+ return HasMAIInsts;
+ }
+
+ bool hasPkFmacF16Inst() const {
+ return HasPkFmacF16Inst;
+ }
+
+ bool hasAtomicFaddInsts() const {
+ return HasAtomicFaddInsts;
}
bool isSRAMECCEnabled() const {
return EnableSRAMECC;
}
+ bool hasNoSdstCMPX() const {
+ return HasNoSdstCMPX;
+ }
+
+ bool hasVscnt() const {
+ return HasVscnt;
+ }
+
+ bool hasRegisterBanking() const {
+ return HasRegisterBanking;
+ }
+
+ bool hasVOP3Literal() const {
+ return HasVOP3Literal;
+ }
+
+ bool hasNoDataDepHazard() const {
+ return HasNoDataDepHazard;
+ }
+
+ bool vmemWriteNeedsExpWaitcnt() const {
+ return getGeneration() < SEA_ISLANDS;
+ }
+
// Scratch is allocated in 256 dword per wave blocks for the entire
  // wavefront. When viewed from the perspective of an arbitrary workitem, this
// is 4-byte aligned.
@@ -792,29 +936,34 @@ public:
return HasScalarAtomics;
}
+ bool hasLDSFPAtomics() const {
+ return GFX8Insts;
+ }
bool hasDPP() const {
return HasDPP;
}
+ bool hasDPP8() const {
+ return HasDPP8;
+ }
+
bool hasR128A16() const {
return HasR128A16;
}
- bool enableSIScheduler() const {
- return EnableSIScheduler;
+ bool hasOffset3fBug() const {
+ return HasOffset3fBug;
}
- bool debuggerSupported() const {
- return debuggerInsertNops() && debuggerEmitPrologue();
+ bool hasNSAEncoding() const {
+ return HasNSAEncoding;
}
- bool debuggerInsertNops() const {
- return DebuggerInsertNops;
- }
+ bool hasMadF16() const;
- bool debuggerEmitPrologue() const {
- return DebuggerEmitPrologue;
+ bool enableSIScheduler() const {
+ return EnableSIScheduler;
}
bool loadStoreOptEnabled() const {
@@ -835,15 +984,48 @@ public:
}
bool hasSMovFedHazard() const {
- return getGeneration() >= AMDGPUSubtarget::GFX9;
+ return getGeneration() == AMDGPUSubtarget::GFX9;
}
bool hasReadM0MovRelInterpHazard() const {
- return getGeneration() >= AMDGPUSubtarget::GFX9;
+ return getGeneration() == AMDGPUSubtarget::GFX9;
}
bool hasReadM0SendMsgHazard() const {
- return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS;
+ return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
+ getGeneration() <= AMDGPUSubtarget::GFX9;
+ }
+
+ bool hasVcmpxPermlaneHazard() const {
+ return HasVcmpxPermlaneHazard;
+ }
+
+ bool hasVMEMtoScalarWriteHazard() const {
+ return HasVMEMtoScalarWriteHazard;
+ }
+
+ bool hasSMEMtoVectorWriteHazard() const {
+ return HasSMEMtoVectorWriteHazard;
+ }
+
+ bool hasLDSMisalignedBug() const {
+ return LDSMisalignedBug && !EnableCuMode;
+ }
+
+ bool hasInstFwdPrefetchBug() const {
+ return HasInstFwdPrefetchBug;
+ }
+
+ bool hasVcmpxExecWARHazard() const {
+ return HasVcmpxExecWARHazard;
+ }
+
+ bool hasLdsBranchVmemWARHazard() const {
+ return HasLdsBranchVmemWARHazard;
+ }
+
+ bool hasNSAtoVMEMBug() const {
+ return HasNSAtoVMEMBug;
}
/// Return the maximum number of waves per SIMD for kernels using \p SGPRs
@@ -957,6 +1139,14 @@ public:
std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations)
const override;
+ bool isWave32() const {
+ return WavefrontSize == 32;
+ }
+
+ const TargetRegisterClass *getBoolRC() const {
+ return getRegisterInfo()->getBoolRC();
+ }
+
/// \returns Maximum number of work groups per compute unit supported by the
/// subtarget and limited by given \p FlatWorkGroupSize.
unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
@@ -994,7 +1184,6 @@ private:
bool FMA;
bool CaymanISA;
bool CFALUBug;
- bool DX10Clamp;
bool HasVertexCache;
bool R600ALUInst;
bool FP64;
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index e8cefdbf74b9..0ea8db04c298 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1,9 +1,8 @@
//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -25,11 +24,14 @@
#include "GCNIterativeScheduler.h"
#include "GCNSchedStrategy.h"
#include "R600MachineScheduler.h"
+#include "SIMachineFunctionInfo.h"
#include "SIMachineScheduler.h"
+#include "TargetInfo/AMDGPUTargetInfo.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/GlobalISel/Legalizer.h"
#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
+#include "llvm/CodeGen/MIRParser/MIParser.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
@@ -67,6 +69,11 @@ EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
cl::desc("Run early if-conversion"),
cl::init(false));
+static cl::opt<bool>
+OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
+ cl::desc("Run pre-RA exec mask optimizations"),
+ cl::init(true));
+
static cl::opt<bool> EnableR600IfConvert(
"r600-if-convert",
cl::desc("Use if conversion pass"),
@@ -109,7 +116,7 @@ static cl::opt<bool> EnableSDWAPeephole(
static cl::opt<bool> EnableDPPCombine(
"amdgpu-dpp-combine",
cl::desc("Enable DPP combiner"),
- cl::init(false));
+ cl::init(true));
// Enable address space based alias analysis
static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
@@ -123,11 +130,11 @@ static cl::opt<bool, true> LateCFGStructurize(
cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG),
cl::Hidden);
-static cl::opt<bool, true> EnableAMDGPUFunctionCalls(
+static cl::opt<bool, true> EnableAMDGPUFunctionCallsOpt(
"amdgpu-function-calls",
cl::desc("Enable AMDGPU function call support"),
cl::location(AMDGPUTargetMachine::EnableFunctionCalls),
- cl::init(false),
+ cl::init(true),
cl::Hidden);
// Enable lib calls simplifications
@@ -143,6 +150,12 @@ static cl::opt<bool> EnableLowerKernelArguments(
cl::init(true),
cl::Hidden);
+static cl::opt<bool> EnableRegReassign(
+ "amdgpu-reassign-regs",
+ cl::desc("Enable register reassign optimizations on gfx10+"),
+ cl::init(true),
+ cl::Hidden);
+
// Enable atomic optimization
static cl::opt<bool> EnableAtomicOptimizations(
"amdgpu-atomic-optimizations",
@@ -157,6 +170,18 @@ static cl::opt<bool> EnableSIModeRegisterPass(
cl::init(true),
cl::Hidden);
+// Used in lit tests to prevent dead-code elimination of inspected patterns.
+static cl::opt<bool>
+EnableDCEInRA("amdgpu-dce-in-ra",
+ cl::init(true), cl::Hidden,
+ cl::desc("Enable machine DCE inside regalloc"));
+
+static cl::opt<bool> EnableScalarIRPasses(
+ "amdgpu-scalar-ir-passes",
+ cl::desc("Enable scalar IR passes"),
+ cl::init(true),
+ cl::Hidden);
+
extern "C" void LLVMInitializeAMDGPUTarget() {
// Register the target
RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
@@ -172,6 +197,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
initializeAMDGPUDAGToDAGISelPass(*PR);
initializeGCNDPPCombinePass(*PR);
initializeSILowerI1CopiesPass(*PR);
+ initializeSILowerSGPRSpillsPass(*PR);
initializeSIFixSGPRCopiesPass(*PR);
initializeSIFixVGPRCopiesPass(*PR);
initializeSIFixupVectorISelPass(*PR);
@@ -192,6 +218,8 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
initializeAMDGPUPromoteAllocaPass(*PR);
initializeAMDGPUCodeGenPreparePass(*PR);
+ initializeAMDGPUPropagateAttributesEarlyPass(*PR);
+ initializeAMDGPUPropagateAttributesLatePass(*PR);
initializeAMDGPURewriteOutArgumentsPass(*PR);
initializeAMDGPUUnifyMetadataPass(*PR);
initializeSIAnnotateControlFlowPass(*PR);
@@ -201,9 +229,8 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
initializeSILowerControlFlowPass(*PR);
initializeSIInsertSkipsPass(*PR);
initializeSIMemoryLegalizerPass(*PR);
- initializeSIDebuggerInsertNopsPass(*PR);
initializeSIOptimizeExecMaskingPass(*PR);
- initializeSIFixWWMLivenessPass(*PR);
+ initializeSIPreAllocateWWMRegsPass(*PR);
initializeSIFormMemoryClausesPass(*PR);
initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
initializeAMDGPUAAWrapperPassPass(*PR);
@@ -211,6 +238,8 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
initializeAMDGPUUseNativeCallsPass(*PR);
initializeAMDGPUSimplifyLibCallsPass(*PR);
initializeAMDGPUInlinerPass(*PR);
+ initializeGCNRegBankReassignPass(*PR);
+ initializeGCNNSAReassignPass(*PR);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -295,10 +324,11 @@ static StringRef computeDataLayout(const Triple &TT) {
}
// 32-bit private, local, and region pointers. 64-bit global, constant and
- // flat.
+  // flat pointers, and non-integral buffer fat pointers.
return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
"-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
- "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5";
+ "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
+ "-ni:7";
}
LLVM_READNONE
@@ -306,8 +336,9 @@ static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
if (!GPU.empty())
return GPU;
+ // Need to default to a target with flat support for HSA.
if (TT.getArch() == Triple::amdgcn)
- return "generic";
+ return TT.getOS() == Triple::AMDHSA ? "generic-hsa" : "generic";
return "r600";
}
@@ -363,24 +394,25 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
bool EnableOpt = getOptLevel() > CodeGenOpt::None;
bool Internalize = InternalizeSymbols;
- bool EarlyInline = EarlyInlineAll && EnableOpt && !EnableAMDGPUFunctionCalls;
+ bool EarlyInline = EarlyInlineAll && EnableOpt && !EnableFunctionCalls;
bool AMDGPUAA = EnableAMDGPUAliasAnalysis && EnableOpt;
bool LibCallSimplify = EnableLibCallSimplify && EnableOpt;
- if (EnableAMDGPUFunctionCalls) {
+ if (EnableFunctionCalls) {
delete Builder.Inliner;
Builder.Inliner = createAMDGPUFunctionInliningPass();
}
Builder.addExtension(
PassManagerBuilder::EP_ModuleOptimizerEarly,
- [Internalize, EarlyInline, AMDGPUAA](const PassManagerBuilder &,
- legacy::PassManagerBase &PM) {
+ [Internalize, EarlyInline, AMDGPUAA, this](const PassManagerBuilder &,
+ legacy::PassManagerBase &PM) {
if (AMDGPUAA) {
PM.add(createAMDGPUAAWrapperPass());
PM.add(createAMDGPUExternalAAWrapperPass());
}
PM.add(createAMDGPUUnifyMetadataPass());
+ PM.add(createAMDGPUPropagateAttributesLatePass(this));
if (Internalize) {
PM.add(createInternalizePass(mustPreserveGV));
PM.add(createGlobalDCEPass());
@@ -392,15 +424,16 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
const auto &Opt = Options;
Builder.addExtension(
PassManagerBuilder::EP_EarlyAsPossible,
- [AMDGPUAA, LibCallSimplify, &Opt](const PassManagerBuilder &,
- legacy::PassManagerBase &PM) {
+ [AMDGPUAA, LibCallSimplify, &Opt, this](const PassManagerBuilder &,
+ legacy::PassManagerBase &PM) {
if (AMDGPUAA) {
PM.add(createAMDGPUAAWrapperPass());
PM.add(createAMDGPUExternalAAWrapperPass());
}
+ PM.add(llvm::createAMDGPUPropagateAttributesEarlyPass(this));
PM.add(llvm::createAMDGPUUseNativeCallsPass());
if (LibCallSimplify)
- PM.add(llvm::createAMDGPUSimplifyLibCallsPass(Opt));
+ PM.add(llvm::createAMDGPUSimplifyLibCallsPass(Opt, this));
});
Builder.addExtension(
@@ -428,6 +461,11 @@ R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT,
CodeGenOpt::Level OL, bool JIT)
: AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {
setRequiresStructuredCFG(true);
+
+ // Override the default since calls aren't supported for r600.
+ if (EnableFunctionCalls &&
+ EnableAMDGPUFunctionCallsOpt.getNumOccurrences() == 0)
+ EnableFunctionCalls = false;
}
const R600Subtarget *R600TargetMachine::getSubtargetImpl(
@@ -528,8 +566,14 @@ public:
bool addPreISel() override;
bool addInstSelector() override;
bool addGCPasses() override;
+
+ std::unique_ptr<CSEConfigBase> getCSEConfig() const override;
};
+std::unique_ptr<CSEConfigBase> AMDGPUPassConfig::getCSEConfig() const {
+ return getStandardCSEConfigForOpt(TM->getOptLevel());
+}
+
class R600PassConfig final : public AMDGPUPassConfig {
public:
R600PassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
@@ -572,9 +616,10 @@ public:
bool addLegalizeMachineIR() override;
bool addRegBankSelect() override;
bool addGlobalInstructionSelect() override;
- void addFastRegAlloc(FunctionPass *RegAllocPass) override;
- void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override;
+ void addFastRegAlloc() override;
+ void addOptimizedRegAlloc() override;
void addPreRegAlloc() override;
+ bool addPreRewrite() override;
void addPostRegAlloc() override;
void addPreSched2() override;
void addPreEmitPass() override;
@@ -614,12 +659,16 @@ void AMDGPUPassConfig::addIRPasses() {
disablePass(&FuncletLayoutID);
disablePass(&PatchableFunctionID);
- addPass(createAtomicExpandPass());
-
// This must occur before inlining, as the inliner will not look through
// bitcast calls.
addPass(createAMDGPUFixFunctionBitcastsPass());
+  // Run the attribute propagation pass in the backend in case opt was not run.
+ addPass(createAMDGPUPropagateAttributesEarlyPass(&TM));
+
+ addPass(createAtomicExpandPass());
+
addPass(createAMDGPULowerIntrinsicsPass());
// Function calls are not supported, so make sure we inline everything.
@@ -652,7 +701,8 @@ void AMDGPUPassConfig::addIRPasses() {
if (EnableSROA)
addPass(createSROAPass());
- addStraightLineScalarOptimizationPasses();
+ if (EnableScalarIRPasses)
+ addStraightLineScalarOptimizationPasses();
if (EnableAMDGPUAliasAnalysis) {
addPass(createAMDGPUAAWrapperPass());
@@ -678,15 +728,20 @@ void AMDGPUPassConfig::addIRPasses() {
// %1 = shl %a, 2
//
// but EarlyCSE can do neither of them.
- if (getOptLevel() != CodeGenOpt::None)
+ if (getOptLevel() != CodeGenOpt::None && EnableScalarIRPasses)
addEarlyCSEOrGVNPass();
}
void AMDGPUPassConfig::addCodeGenPrepare() {
+ if (TM->getTargetTriple().getArch() == Triple::amdgcn)
+ addPass(createAMDGPUAnnotateKernelFeaturesPass());
+
if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
EnableLowerKernelArguments)
addPass(createAMDGPULowerKernelArgumentsPass());
+ addPass(&AMDGPUPerfHintAnalysisID);
+
TargetPassConfig::addCodeGenPrepare();
if (EnableLoadStoreVectorizer)
@@ -700,7 +755,8 @@ bool AMDGPUPassConfig::addPreISel() {
}
bool AMDGPUPassConfig::addInstSelector() {
- addPass(createAMDGPUISelDag(&getAMDGPUTargetMachine(), getOptLevel()));
+ // Defer the verifier until FinalizeISel.
+ addPass(createAMDGPUISelDag(&getAMDGPUTargetMachine(), getOptLevel()), false);
return false;
}
@@ -770,7 +826,6 @@ bool GCNPassConfig::addPreISel() {
// FIXME: We need to run a pass to propagate the attributes when calls are
// supported.
- addPass(createAMDGPUAnnotateKernelFeaturesPass());
// Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
// regions formed by them.
@@ -783,6 +838,7 @@ bool GCNPassConfig::addPreISel() {
if (!LateCFGStructurize) {
addPass(createSIAnnotateControlFlowPass());
}
+ addPass(createLCSSAPass());
return false;
}
@@ -856,7 +912,7 @@ void GCNPassConfig::addPreRegAlloc() {
addPass(createSIWholeQuadModePass());
}
-void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
+void GCNPassConfig::addFastRegAlloc() {
// FIXME: We have to disable the verifier here because of PHIElimination +
// TwoAddressInstructions disabling it.
@@ -865,28 +921,40 @@ void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
// SI_ELSE will introduce a copy of the tied operand source after the else.
insertPass(&PHIEliminationID, &SILowerControlFlowID, false);
- // This must be run after SILowerControlFlow, since it needs to use the
- // machine-level CFG, but before register allocation.
- insertPass(&SILowerControlFlowID, &SIFixWWMLivenessID, false);
+ // This must be run just after RegisterCoalescing.
+ insertPass(&RegisterCoalescerID, &SIPreAllocateWWMRegsID, false);
- TargetPassConfig::addFastRegAlloc(RegAllocPass);
+ TargetPassConfig::addFastRegAlloc();
}
-void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
- insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);
-
- insertPass(&SIOptimizeExecMaskingPreRAID, &SIFormMemoryClausesID);
+void GCNPassConfig::addOptimizedRegAlloc() {
+ if (OptExecMaskPreRA) {
+ insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);
+ insertPass(&SIOptimizeExecMaskingPreRAID, &SIFormMemoryClausesID);
+ } else {
+ insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);
+ }
// This must be run immediately after phi elimination and before
// TwoAddressInstructions, otherwise the processing of the tied operand of
// SI_ELSE will introduce a copy of the tied operand source after the else.
insertPass(&PHIEliminationID, &SILowerControlFlowID, false);
- // This must be run after SILowerControlFlow, since it needs to use the
- // machine-level CFG, but before register allocation.
- insertPass(&SILowerControlFlowID, &SIFixWWMLivenessID, false);
+ // This must be run just after RegisterCoalescing.
+ insertPass(&RegisterCoalescerID, &SIPreAllocateWWMRegsID, false);
+
+ if (EnableDCEInRA)
+ insertPass(&RenameIndependentSubregsID, &DeadMachineInstructionElimID);
- TargetPassConfig::addOptimizedRegAlloc(RegAllocPass);
+ TargetPassConfig::addOptimizedRegAlloc();
+}
+
+bool GCNPassConfig::addPreRewrite() {
+ if (EnableRegReassign) {
+ addPass(&GCNNSAReassignID);
+ addPass(&GCNRegBankReassignID);
+ }
+ return true;
}
void GCNPassConfig::addPostRegAlloc() {
@@ -894,6 +962,9 @@ void GCNPassConfig::addPostRegAlloc() {
if (getOptLevel() > CodeGenOpt::None)
addPass(&SIOptimizeExecMaskingID);
TargetPassConfig::addPostRegAlloc();
+
+ // Equivalent of PEI for SGPRs.
+ addPass(&SILowerSGPRSpillsID);
}
void GCNPassConfig::addPreSched2() {
@@ -919,10 +990,164 @@ void GCNPassConfig::addPreEmitPass() {
addPass(&PostRAHazardRecognizerID);
addPass(&SIInsertSkipsPassID);
- addPass(createSIDebuggerInsertNopsPass());
addPass(&BranchRelaxationPassID);
}
TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
return new GCNPassConfig(*this, PM);
}
+
+yaml::MachineFunctionInfo *GCNTargetMachine::createDefaultFuncInfoYAML() const {
+ return new yaml::SIMachineFunctionInfo();
+}
+
+yaml::MachineFunctionInfo *
+GCNTargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const {
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ return new yaml::SIMachineFunctionInfo(*MFI,
+ *MF.getSubtarget().getRegisterInfo());
+}
+
+bool GCNTargetMachine::parseMachineFunctionInfo(
+ const yaml::MachineFunctionInfo &MFI_, PerFunctionMIParsingState &PFS,
+ SMDiagnostic &Error, SMRange &SourceRange) const {
+ const yaml::SIMachineFunctionInfo &YamlMFI =
+ reinterpret_cast<const yaml::SIMachineFunctionInfo &>(MFI_);
+ MachineFunction &MF = PFS.MF;
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+ MFI->initializeBaseYamlFields(YamlMFI);
+
+ auto parseRegister = [&](const yaml::StringValue &RegName, unsigned &RegVal) {
+ if (parseNamedRegisterReference(PFS, RegVal, RegName.Value, Error)) {
+ SourceRange = RegName.SourceRange;
+ return true;
+ }
+
+ return false;
+ };
+
+ auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) {
+    // Create a diagnostic for the register string literal.
+ const MemoryBuffer &Buffer =
+ *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());
+ Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1,
+ RegName.Value.size(), SourceMgr::DK_Error,
+ "incorrect register class for field", RegName.Value,
+ None, None);
+ SourceRange = RegName.SourceRange;
+ return true;
+ };
+
+ if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) ||
+ parseRegister(YamlMFI.ScratchWaveOffsetReg, MFI->ScratchWaveOffsetReg) ||
+ parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) ||
+ parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg))
+ return true;
+
+ if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG &&
+ !AMDGPU::SReg_128RegClass.contains(MFI->ScratchRSrcReg)) {
+ return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg);
+ }
+
+ if (MFI->ScratchWaveOffsetReg != AMDGPU::SCRATCH_WAVE_OFFSET_REG &&
+ !AMDGPU::SGPR_32RegClass.contains(MFI->ScratchWaveOffsetReg)) {
+ return diagnoseRegisterClass(YamlMFI.ScratchWaveOffsetReg);
+ }
+
+ if (MFI->FrameOffsetReg != AMDGPU::FP_REG &&
+ !AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) {
+ return diagnoseRegisterClass(YamlMFI.FrameOffsetReg);
+ }
+
+ if (MFI->StackPtrOffsetReg != AMDGPU::SP_REG &&
+ !AMDGPU::SGPR_32RegClass.contains(MFI->StackPtrOffsetReg)) {
+ return diagnoseRegisterClass(YamlMFI.StackPtrOffsetReg);
+ }
+
+ auto parseAndCheckArgument = [&](const Optional<yaml::SIArgument> &A,
+ const TargetRegisterClass &RC,
+ ArgDescriptor &Arg, unsigned UserSGPRs,
+ unsigned SystemSGPRs) {
+ // Skip parsing if it's not present.
+ if (!A)
+ return false;
+
+ if (A->IsRegister) {
+ unsigned Reg;
+ if (parseNamedRegisterReference(PFS, Reg, A->RegisterName.Value, Error)) {
+ SourceRange = A->RegisterName.SourceRange;
+ return true;
+ }
+ if (!RC.contains(Reg))
+ return diagnoseRegisterClass(A->RegisterName);
+ Arg = ArgDescriptor::createRegister(Reg);
+ } else
+ Arg = ArgDescriptor::createStack(A->StackOffset);
+ // Check and apply the optional mask.
+ if (A->Mask)
+ Arg = ArgDescriptor::createArg(Arg, A->Mask.getValue());
+
+ MFI->NumUserSGPRs += UserSGPRs;
+ MFI->NumSystemSGPRs += SystemSGPRs;
+ return false;
+ };
+
+ if (YamlMFI.ArgInfo &&
+ (parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer,
+ AMDGPU::SReg_128RegClass,
+ MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) ||
+ parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr,
+ AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchPtr,
+ 2, 0) ||
+ parseAndCheckArgument(YamlMFI.ArgInfo->QueuePtr, AMDGPU::SReg_64RegClass,
+ MFI->ArgInfo.QueuePtr, 2, 0) ||
+ parseAndCheckArgument(YamlMFI.ArgInfo->KernargSegmentPtr,
+ AMDGPU::SReg_64RegClass,
+ MFI->ArgInfo.KernargSegmentPtr, 2, 0) ||
+ parseAndCheckArgument(YamlMFI.ArgInfo->DispatchID,
+ AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchID,
+ 2, 0) ||
+ parseAndCheckArgument(YamlMFI.ArgInfo->FlatScratchInit,
+ AMDGPU::SReg_64RegClass,
+ MFI->ArgInfo.FlatScratchInit, 2, 0) ||
+ parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentSize,
+ AMDGPU::SGPR_32RegClass,
+ MFI->ArgInfo.PrivateSegmentSize, 0, 0) ||
+ parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX,
+ AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX,
+ 0, 1) ||
+ parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDY,
+ AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDY,
+ 0, 1) ||
+ parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDZ,
+ AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDZ,
+ 0, 1) ||
+ parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupInfo,
+ AMDGPU::SGPR_32RegClass,
+ MFI->ArgInfo.WorkGroupInfo, 0, 1) ||
+ parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentWaveByteOffset,
+ AMDGPU::SGPR_32RegClass,
+ MFI->ArgInfo.PrivateSegmentWaveByteOffset, 0, 1) ||
+ parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitArgPtr,
+ AMDGPU::SReg_64RegClass,
+ MFI->ArgInfo.ImplicitArgPtr, 0, 0) ||
+ parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitBufferPtr,
+ AMDGPU::SReg_64RegClass,
+ MFI->ArgInfo.ImplicitBufferPtr, 2, 0) ||
+ parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDX,
+ AMDGPU::VGPR_32RegClass,
+ MFI->ArgInfo.WorkItemIDX, 0, 0) ||
+ parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDY,
+ AMDGPU::VGPR_32RegClass,
+ MFI->ArgInfo.WorkItemIDY, 0, 0) ||
+ parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDZ,
+ AMDGPU::VGPR_32RegClass,
+ MFI->ArgInfo.WorkItemIDZ, 0, 0)))
+ return true;
+
+ MFI->Mode.IEEE = YamlMFI.Mode.IEEE;
+ MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp;
+
+ return false;
+}
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/lib/Target/AMDGPU/AMDGPUTargetMachine.h
index 62fbe71d1902..70fa3961236f 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -1,9 +1,8 @@
//===-- AMDGPUTargetMachine.h - AMDGPU TargetMachine Interface --*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -15,7 +14,6 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETMACHINE_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETMACHINE_H
-#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/StringMap.h"
@@ -95,7 +93,6 @@ public:
class GCNTargetMachine final : public AMDGPUTargetMachine {
private:
- AMDGPUIntrinsicInfo IntrinsicInfo;
mutable StringMap<std::unique_ptr<GCNSubtarget>> SubtargetMap;
public:
@@ -110,13 +107,17 @@ public:
TargetTransformInfo getTargetTransformInfo(const Function &F) override;
- const AMDGPUIntrinsicInfo *getIntrinsicInfo() const override {
- return &IntrinsicInfo;
- }
-
bool useIPRA() const override {
return true;
}
+
+ yaml::MachineFunctionInfo *createDefaultFuncInfoYAML() const override;
+ yaml::MachineFunctionInfo *
+ convertFuncInfoToYAML(const MachineFunction &MF) const override;
+ bool parseMachineFunctionInfo(const yaml::MachineFunctionInfo &,
+ PerFunctionMIParsingState &PFS,
+ SMDiagnostic &Error,
+ SMRange &SourceRange) const override;
};
} // end namespace llvm
diff --git a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp
index c4e1efde130b..6569980d2c75 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp
@@ -1,9 +1,8 @@
//===-- AMDGPUHSATargetObjectFile.cpp - AMDGPU Object Files ---------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h
index a4ae1a2c18c2..819bebb7932d 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h
+++ b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h
@@ -1,9 +1,8 @@
//===-- AMDGPUTargetObjectFile.h - AMDGPU Object Info ----*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 11e4ba4b5010..aaed280a1270 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1,9 +1,8 @@
//===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -118,8 +117,10 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
    // Add a small bonus for each such "if" statement.
if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) {
if (UP.Threshold < MaxBoost && Br->isConditional()) {
- if (L->isLoopExiting(Br->getSuccessor(0)) ||
- L->isLoopExiting(Br->getSuccessor(1)))
+ BasicBlock *Succ0 = Br->getSuccessor(0);
+ BasicBlock *Succ1 = Br->getSuccessor(1);
+ if ((L->contains(Succ0) && L->isLoopExiting(Succ0)) ||
+ (L->contains(Succ1) && L->isLoopExiting(Succ1)))
continue;
if (dependsOnLocalPhi(L, Br->getCondition())) {
UP.Threshold += UnrollThresholdIf;
@@ -141,7 +142,7 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
unsigned Threshold = 0;
if (AS == AMDGPUAS::PRIVATE_ADDRESS)
Threshold = ThresholdPrivate;
- else if (AS == AMDGPUAS::LOCAL_ADDRESS)
+ else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
Threshold = ThresholdLocal;
else
continue;
@@ -159,7 +160,8 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
if (AllocaSize > MaxAlloca)
continue;
- } else if (AS == AMDGPUAS::LOCAL_ADDRESS) {
+ } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
+ AS == AMDGPUAS::REGION_ADDRESS) {
LocalGEPsSeen++;
      // Inhibit unrolling for local memory if we have seen addressing that is
      // not based on a variable; most likely we will be unable to combine it.
@@ -254,7 +256,8 @@ unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
- AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
+ AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
+ AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER) {
return 512;
}
@@ -308,6 +311,8 @@ bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
switch (Inst->getIntrinsicID()) {
case Intrinsic::amdgcn_atomic_inc:
case Intrinsic::amdgcn_atomic_dec:
+ case Intrinsic::amdgcn_ds_ordered_add:
+ case Intrinsic::amdgcn_ds_ordered_swap:
case Intrinsic::amdgcn_ds_fadd:
case Intrinsic::amdgcn_ds_fmin:
case Intrinsic::amdgcn_ds_fmax: {
@@ -399,7 +404,7 @@ int GCNTTIImpl::getArithmeticInstrCost(
if (SLT == MVT::f64) {
int Cost = 4 * get64BitInstrCost() + 7 * getQuarterRateInstrCost();
// Add cost of workaround.
- if (ST->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS)
+ if (!ST->hasUsableDivScaleConditionOutput())
Cost += 3 * getFullRateInstrCost();
return LT.first * Cost * NElts;
@@ -577,6 +582,8 @@ bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
return false;
case Intrinsic::amdgcn_readfirstlane:
case Intrinsic::amdgcn_readlane:
+ case Intrinsic::amdgcn_icmp:
+ case Intrinsic::amdgcn_fcmp:
return true;
}
}
@@ -607,7 +614,7 @@ unsigned GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
}
bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
- const Function *Callee) const {
+ const Function *Callee) const {
const TargetMachine &TM = getTLI()->getTargetMachine();
const FeatureBitset &CallerBits =
TM.getSubtargetImpl(*Caller)->getFeatureBits();
@@ -616,7 +623,14 @@ bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
- return ((RealCallerBits & RealCalleeBits) == RealCalleeBits);
+ if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
+ return false;
+
+  // FIXME: dx10_clamp could just take the caller's setting, but there seems
+  // to be no way to support merging of backend-defined attributes.
+ AMDGPU::SIModeRegisterDefaults CallerMode(*Caller);
+ AMDGPU::SIModeRegisterDefaults CalleeMode(*Callee);
+ return CallerMode.isInlineCompatible(CalleeMode);
}
void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
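The new areInlineCompatible() check above is two-staged: a feature-bit subset test followed by a floating-point mode comparison. A minimal sketch of the subset idiom with plain bitmasks; the feature names are invented for illustration and llvm::FeatureBitset is replaced by a uint64_t.

#include <cassert>
#include <cstdint>

static bool isFeatureSubset(uint64_t CallerBits, uint64_t CalleeBits) {
  // Inlining is allowed only if every feature the callee relies on is also
  // available in the caller.
  return (CallerBits & CalleeBits) == CalleeBits;
}

int main() {
  const uint64_t FeatA = 1u << 0, FeatB = 1u << 1; // hypothetical features
  assert(isFeatureSubset(FeatA | FeatB, FeatA));   // callee needs less: OK
  assert(!isFeatureSubset(FeatA, FeatA | FeatB));  // callee needs more: reject
  return 0;
}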
diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 397c5c6fa6fb..6f1bf5a26f0d 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -1,9 +1,8 @@
//===- AMDGPUTargetTransformInfo.h - AMDGPU specific TTI --------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -78,13 +77,16 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
AMDGPU::FeatureUnalignedScratchAccess,
AMDGPU::FeatureAutoWaitcntBeforeBarrier,
- AMDGPU::FeatureDebuggerEmitPrologue,
- AMDGPU::FeatureDebuggerInsertNops,
// Property of the kernel/environment which can't actually differ.
AMDGPU::FeatureSGPRInitBug,
AMDGPU::FeatureXNACK,
AMDGPU::FeatureTrapHandler,
+ AMDGPU::FeatureCodeObjectV3,
+
+  // The default assumption needs to be that ECC is enabled, but no directly
+  // exposed operations depend on it, so it can be safely inlined.
+ AMDGPU::FeatureSRAMECC,
// Perf-tuning features
AMDGPU::FeatureFastFMAF32,
@@ -178,8 +180,7 @@ public:
// don't use flat addressing.
if (IsGraphicsShader)
return -1;
- return ST->hasFlatAddressSpace() ?
- AMDGPUAS::FLAT_ADDRESS : AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
+ return AMDGPUAS::FLAT_ADDRESS;
}
unsigned getVectorSplitCost() { return 0; }
@@ -190,7 +191,9 @@ public:
bool areInlineCompatible(const Function *Caller,
const Function *Callee) const;
- unsigned getInliningThresholdMultiplier() { return 9; }
+ unsigned getInliningThresholdMultiplier() { return 7; }
+
+ int getInlinerVectorBonusPercent() { return 0; }
int getArithmeticReductionCost(unsigned Opcode,
Type *Ty,
diff --git a/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
index ced3f6f567e2..396e0ed2e76c 100644
--- a/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
+++ b/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
@@ -1,9 +1,8 @@
//===- AMDGPUUnifyDivergentExitNodes.cpp ----------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -199,14 +198,11 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
BranchInst::Create(LoopHeaderBB, DummyReturnBB, BoolTrue, BB);
} else { // Conditional branch.
// Create a new transition block to hold the conditional branch.
- BasicBlock *TransitionBB = BasicBlock::Create(F.getContext(),
- "TransitionBlock", &F);
-
- // Move BI from BB to the new transition block.
- BI->removeFromParent();
- TransitionBB->getInstList().push_back(BI);
+ BasicBlock *TransitionBB = BB->splitBasicBlock(BI, "TransitionBlock");
- // Create a branch that will always branch to the transition block.
+    // Create a branch that will always branch to the transition block and
+    // that references DummyReturnBB.
+ BB->getTerminator()->eraseFromParent();
BranchInst::Create(TransitionBB, DummyReturnBB, BoolTrue, BB);
}
}
diff --git a/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp b/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp
index 1f6d9234c1ed..d4401a22a1ad 100644
--- a/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp
+++ b/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp
@@ -1,9 +1,8 @@
//===- AMDGPUUnifyMetadata.cpp - Unify OpenCL metadata --------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
index 11cd49e5b3dc..12f2e9519c9e 100644
--- a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
+++ b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
@@ -1,9 +1,8 @@
//===- AMDILCFGStructurizer.cpp - CFG Structurizer ------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//==-----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/AMDKernelCodeT.h b/lib/Target/AMDGPU/AMDKernelCodeT.h
index 289642aaa2d0..3e658a144c1f 100644
--- a/lib/Target/AMDGPU/AMDKernelCodeT.h
+++ b/lib/Target/AMDGPU/AMDKernelCodeT.h
@@ -1,9 +1,8 @@
//===-- AMDGPUKernelCodeT.h - Print AMDGPU assembly code ---------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file AMDKernelCodeT.h
@@ -127,8 +126,12 @@ enum amd_code_property_mask_t {
AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH = 1,
AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT,
- AMD_CODE_PROPERTY_RESERVED1_SHIFT = 10,
- AMD_CODE_PROPERTY_RESERVED1_WIDTH = 6,
+ AMD_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32_SHIFT = 10,
+ AMD_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32_WIDTH = 1,
+ AMD_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32 = ((1 << AMD_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32_SHIFT,
+
+ AMD_CODE_PROPERTY_RESERVED1_SHIFT = 11,
+ AMD_CODE_PROPERTY_RESERVED1_WIDTH = 5,
AMD_CODE_PROPERTY_RESERVED1 = ((1 << AMD_CODE_PROPERTY_RESERVED1_WIDTH) - 1) << AMD_CODE_PROPERTY_RESERVED1_SHIFT,
/// Control wave ID base counter for GDS ordered-append. Used to set
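The bitfield arithmetic in the amd_code_property_mask_t hunk above can be checked mechanically: the new ENABLE_WAVEFRONT_SIZE32 flag takes the lowest bit of the former 6-bit reserved range, and the shrunken RESERVED1 covers the rest. A small compile-time sketch, not taken from the source:

#include <cstdint>

constexpr uint32_t maskFor(unsigned Shift, unsigned Width) {
  return ((1u << Width) - 1u) << Shift;
}

static_assert(maskFor(10, 1) == 0x400, "wavefront-size32 flag is bit 10");
static_assert(maskFor(11, 5) == 0xF800, "reserved1 now spans bits 11-15");
static_assert((maskFor(10, 1) | maskFor(11, 5)) == maskFor(10, 6),
              "together they cover the old 6-bit reserved range");

int main() { return 0; }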
diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 3f9af27a2e5e..6d678966c98e 100644
--- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -1,9 +1,8 @@
//===- AMDGPUAsmParser.cpp - Parse SI asm to MCInst instructions ----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -13,6 +12,7 @@
#include "MCTargetDesc/AMDGPUTargetStreamer.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
+#include "TargetInfo/AMDGPUTargetInfo.h"
#include "Utils/AMDGPUAsmUtils.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "Utils/AMDKernelCodeTUtils.h"
@@ -69,7 +69,7 @@ namespace {
class AMDGPUAsmParser;
-enum RegisterKind { IS_UNKNOWN, IS_VGPR, IS_SGPR, IS_TTMP, IS_SPECIAL };
+enum RegisterKind { IS_UNKNOWN, IS_VGPR, IS_SGPR, IS_AGPR, IS_TTMP, IS_SPECIAL };
//===----------------------------------------------------------------------===//
// Operand
@@ -103,14 +103,14 @@ public:
int64_t getFPModifiersOperand() const {
int64_t Operand = 0;
- Operand |= Abs ? SISrcMods::ABS : 0;
- Operand |= Neg ? SISrcMods::NEG : 0;
+ Operand |= Abs ? SISrcMods::ABS : 0u;
+ Operand |= Neg ? SISrcMods::NEG : 0u;
return Operand;
}
int64_t getIntModifiersOperand() const {
int64_t Operand = 0;
- Operand |= Sext ? SISrcMods::SEXT : 0;
+ Operand |= Sext ? SISrcMods::SEXT : 0u;
return Operand;
}
@@ -140,21 +140,25 @@ public:
ImmTyInstOffset,
ImmTyOffset0,
ImmTyOffset1,
+ ImmTyDLC,
ImmTyGLC,
ImmTySLC,
ImmTyTFE,
ImmTyD16,
ImmTyClampSI,
ImmTyOModSI,
+ ImmTyDPP8,
ImmTyDppCtrl,
ImmTyDppRowMask,
ImmTyDppBankMask,
ImmTyDppBoundCtrl,
+ ImmTyDppFi,
ImmTySdwaDstSel,
ImmTySdwaSrc0Sel,
ImmTySdwaSrc1Sel,
ImmTySdwaDstUnused,
ImmTyDMask,
+ ImmTyDim,
ImmTyUNorm,
ImmTyDA,
ImmTyR128A16,
@@ -174,9 +178,15 @@ public:
ImmTyNegLo,
ImmTyNegHi,
ImmTySwizzle,
- ImmTyHigh
+ ImmTyGprIdxMode,
+ ImmTyHigh,
+ ImmTyBLGP,
+ ImmTyCBSZ,
+ ImmTyABID,
+ ImmTyEndpgm,
};
+private:
struct TokOp {
const char *Data;
unsigned Length;
@@ -191,7 +201,6 @@ public:
struct RegOp {
unsigned RegNo;
- bool IsForcedVOP3;
Modifiers Mods;
};
@@ -202,6 +211,7 @@ public:
const MCExpr *Expr;
};
+public:
bool isToken() const override {
if (Kind == Token)
return true;
@@ -231,32 +241,32 @@ public:
return isRegKind() && !hasModifiers();
}
- bool isRegOrImmWithInputMods(MVT type) const {
- return isRegKind() || isInlinableImm(type);
+ bool isRegOrImmWithInputMods(unsigned RCID, MVT type) const {
+ return isRegClass(RCID) || isInlinableImm(type) || isLiteralImm(type);
}
bool isRegOrImmWithInt16InputMods() const {
- return isRegOrImmWithInputMods(MVT::i16);
+ return isRegOrImmWithInputMods(AMDGPU::VS_32RegClassID, MVT::i16);
}
bool isRegOrImmWithInt32InputMods() const {
- return isRegOrImmWithInputMods(MVT::i32);
+ return isRegOrImmWithInputMods(AMDGPU::VS_32RegClassID, MVT::i32);
}
bool isRegOrImmWithInt64InputMods() const {
- return isRegOrImmWithInputMods(MVT::i64);
+ return isRegOrImmWithInputMods(AMDGPU::VS_64RegClassID, MVT::i64);
}
bool isRegOrImmWithFP16InputMods() const {
- return isRegOrImmWithInputMods(MVT::f16);
+ return isRegOrImmWithInputMods(AMDGPU::VS_32RegClassID, MVT::f16);
}
bool isRegOrImmWithFP32InputMods() const {
- return isRegOrImmWithInputMods(MVT::f32);
+ return isRegOrImmWithInputMods(AMDGPU::VS_32RegClassID, MVT::f32);
}
bool isRegOrImmWithFP64InputMods() const {
- return isRegOrImmWithInputMods(MVT::f64);
+ return isRegOrImmWithInputMods(AMDGPU::VS_64RegClassID, MVT::f64);
}
bool isVReg() const {
@@ -268,8 +278,12 @@ public:
isRegClass(AMDGPU::VReg_512RegClassID);
}
+ bool isVReg32() const {
+ return isRegClass(AMDGPU::VGPR_32RegClassID);
+ }
+
bool isVReg32OrOff() const {
- return isOff() || isRegClass(AMDGPU::VGPR_32RegClassID);
+ return isOff() || isVReg32();
}
bool isSDWAOperand(MVT type) const;
@@ -289,6 +303,7 @@ public:
bool isClampSI() const { return isImmTy(ImmTyClampSI); }
bool isOModSI() const { return isImmTy(ImmTyOModSI); }
bool isDMask() const { return isImmTy(ImmTyDMask); }
+ bool isDim() const { return isImmTy(ImmTyDim); }
bool isUNorm() const { return isImmTy(ImmTyUNorm); }
bool isDA() const { return isImmTy(ImmTyDA); }
bool isR128A16() const { return isImmTy(ImmTyR128A16); }
@@ -301,13 +316,13 @@ public:
bool isIdxen() const { return isImmTy(ImmTyIdxen); }
bool isAddr64() const { return isImmTy(ImmTyAddr64); }
bool isOffset() const { return isImmTy(ImmTyOffset) && isUInt<16>(getImm()); }
- bool isOffset0() const { return isImmTy(ImmTyOffset0) && isUInt<16>(getImm()); }
+ bool isOffset0() const { return isImmTy(ImmTyOffset0) && isUInt<8>(getImm()); }
bool isOffset1() const { return isImmTy(ImmTyOffset1) && isUInt<8>(getImm()); }
- bool isOffsetU12() const { return (isImmTy(ImmTyOffset) || isImmTy(ImmTyInstOffset)) && isUInt<12>(getImm()); }
- bool isOffsetS13() const { return (isImmTy(ImmTyOffset) || isImmTy(ImmTyInstOffset)) && isInt<13>(getImm()); }
+ bool isFlatOffset() const { return isImmTy(ImmTyOffset) || isImmTy(ImmTyInstOffset); }
bool isGDS() const { return isImmTy(ImmTyGDS); }
bool isLDS() const { return isImmTy(ImmTyLDS); }
+ bool isDLC() const { return isImmTy(ImmTyDLC); }
bool isGLC() const { return isImmTy(ImmTyGLC); }
bool isSLC() const { return isImmTy(ImmTySLC); }
bool isTFE() const { return isImmTy(ImmTyTFE); }
@@ -316,6 +331,7 @@ public:
bool isBankMask() const { return isImmTy(ImmTyDppBankMask); }
bool isRowMask() const { return isImmTy(ImmTyDppRowMask); }
bool isBoundCtrl() const { return isImmTy(ImmTyDppBoundCtrl); }
+ bool isFI() const { return isImmTy(ImmTyDppFi); }
bool isSDWADstSel() const { return isImmTy(ImmTySdwaDstSel); }
bool isSDWASrc0Sel() const { return isImmTy(ImmTySdwaSrc0Sel); }
bool isSDWASrc1Sel() const { return isImmTy(ImmTySdwaSrc1Sel); }
@@ -339,6 +355,8 @@ public:
bool isRegClass(unsigned RCID) const;
+ bool isInlineValue() const;
+
bool isRegOrInlineNoMods(unsigned RCID, MVT type) const {
return (isRegClass(RCID) || isInlinableImm(type)) && !hasModifiers();
}
@@ -359,6 +377,8 @@ public:
return isRegOrInlineNoMods(AMDGPU::SReg_64RegClassID, MVT::i64);
}
+ bool isBoolReg() const;
+
bool isSCSrcF16() const {
return isRegOrInlineNoMods(AMDGPU::SReg_32RegClassID, MVT::f16);
}
@@ -411,6 +431,11 @@ public:
return isSSrcF16();
}
+ bool isSSrcOrLdsB32() const {
+ return isRegOrInlineNoMods(AMDGPU::SRegOrLds_32RegClassID, MVT::i32) ||
+ isLiteralImm(MVT::i32) || isExpr();
+ }
+
bool isVCSrcB32() const {
return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::i32);
}
@@ -456,8 +481,7 @@ public:
}
bool isVSrcV2B16() const {
- llvm_unreachable("cannot happen");
- return isVSrcB16();
+ return isVSrcB16() || isLiteralImm(MVT::v2i16);
}
bool isVSrcF32() const {
@@ -473,8 +497,127 @@ public:
}
bool isVSrcV2F16() const {
- llvm_unreachable("cannot happen");
- return isVSrcF16();
+ return isVSrcF16() || isLiteralImm(MVT::v2f16);
+ }
+
+ bool isVISrcB32() const {
+ return isRegOrInlineNoMods(AMDGPU::VGPR_32RegClassID, MVT::i32);
+ }
+
+ bool isVISrcB16() const {
+ return isRegOrInlineNoMods(AMDGPU::VGPR_32RegClassID, MVT::i16);
+ }
+
+ bool isVISrcV2B16() const {
+ return isVISrcB16();
+ }
+
+ bool isVISrcF32() const {
+ return isRegOrInlineNoMods(AMDGPU::VGPR_32RegClassID, MVT::f32);
+ }
+
+ bool isVISrcF16() const {
+ return isRegOrInlineNoMods(AMDGPU::VGPR_32RegClassID, MVT::f16);
+ }
+
+ bool isVISrcV2F16() const {
+ return isVISrcF16() || isVISrcB32();
+ }
+
+ bool isAISrcB32() const {
+ return isRegOrInlineNoMods(AMDGPU::AGPR_32RegClassID, MVT::i32);
+ }
+
+ bool isAISrcB16() const {
+ return isRegOrInlineNoMods(AMDGPU::AGPR_32RegClassID, MVT::i16);
+ }
+
+ bool isAISrcV2B16() const {
+ return isAISrcB16();
+ }
+
+ bool isAISrcF32() const {
+ return isRegOrInlineNoMods(AMDGPU::AGPR_32RegClassID, MVT::f32);
+ }
+
+ bool isAISrcF16() const {
+ return isRegOrInlineNoMods(AMDGPU::AGPR_32RegClassID, MVT::f16);
+ }
+
+ bool isAISrcV2F16() const {
+ return isAISrcF16() || isAISrcB32();
+ }
+
+ bool isAISrc_128B32() const {
+ return isRegOrInlineNoMods(AMDGPU::AReg_128RegClassID, MVT::i32);
+ }
+
+ bool isAISrc_128B16() const {
+ return isRegOrInlineNoMods(AMDGPU::AReg_128RegClassID, MVT::i16);
+ }
+
+ bool isAISrc_128V2B16() const {
+ return isAISrc_128B16();
+ }
+
+ bool isAISrc_128F32() const {
+ return isRegOrInlineNoMods(AMDGPU::AReg_128RegClassID, MVT::f32);
+ }
+
+ bool isAISrc_128F16() const {
+ return isRegOrInlineNoMods(AMDGPU::AReg_128RegClassID, MVT::f16);
+ }
+
+ bool isAISrc_128V2F16() const {
+ return isAISrc_128F16() || isAISrc_128B32();
+ }
+
+ bool isAISrc_512B32() const {
+ return isRegOrInlineNoMods(AMDGPU::AReg_512RegClassID, MVT::i32);
+ }
+
+ bool isAISrc_512B16() const {
+ return isRegOrInlineNoMods(AMDGPU::AReg_512RegClassID, MVT::i16);
+ }
+
+ bool isAISrc_512V2B16() const {
+ return isAISrc_512B16();
+ }
+
+ bool isAISrc_512F32() const {
+ return isRegOrInlineNoMods(AMDGPU::AReg_512RegClassID, MVT::f32);
+ }
+
+ bool isAISrc_512F16() const {
+ return isRegOrInlineNoMods(AMDGPU::AReg_512RegClassID, MVT::f16);
+ }
+
+ bool isAISrc_512V2F16() const {
+ return isAISrc_512F16() || isAISrc_512B32();
+ }
+
+ bool isAISrc_1024B32() const {
+ return isRegOrInlineNoMods(AMDGPU::AReg_1024RegClassID, MVT::i32);
+ }
+
+ bool isAISrc_1024B16() const {
+ return isRegOrInlineNoMods(AMDGPU::AReg_1024RegClassID, MVT::i16);
+ }
+
+ bool isAISrc_1024V2B16() const {
+ return isAISrc_1024B16();
+ }
+
+ bool isAISrc_1024F32() const {
+ return isRegOrInlineNoMods(AMDGPU::AReg_1024RegClassID, MVT::f32);
+ }
+
+ bool isAISrc_1024F16() const {
+ return isRegOrInlineNoMods(AMDGPU::AReg_1024RegClassID, MVT::f16);
+ }
+
+ bool isAISrc_1024V2F16() const {
+ return isAISrc_1024F16() || isAISrc_1024B32();
}
bool isKImmFP32() const {
@@ -504,10 +647,15 @@ public:
bool isSMRDOffset8() const;
bool isSMRDOffset20() const;
bool isSMRDLiteralOffset() const;
+ bool isDPP8() const;
bool isDPPCtrl() const;
+ bool isBLGP() const;
+ bool isCBSZ() const;
+ bool isABID() const;
bool isGPRIdxMode() const;
bool isS16Imm() const;
bool isU16Imm() const;
+ bool isEndpgm() const;
StringRef getExpressionAsToken() const {
assert(isExpr());
@@ -535,6 +683,7 @@ public:
}
unsigned getReg() const override {
+ assert(isRegKind());
return Reg.RegNo;
}
@@ -594,6 +743,10 @@ public:
void addRegOperands(MCInst &Inst, unsigned N) const;
+ void addBoolRegOperands(MCInst &Inst, unsigned N) const {
+ addRegOperands(Inst, N);
+ }
+
void addRegOrImmOperands(MCInst &Inst, unsigned N) const {
if (isRegKind())
addRegOperands(Inst, N);
@@ -661,6 +814,7 @@ public:
case ImmTyInstOffset: OS << "InstOffset"; break;
case ImmTyOffset0: OS << "Offset0"; break;
case ImmTyOffset1: OS << "Offset1"; break;
+ case ImmTyDLC: OS << "DLC"; break;
case ImmTyGLC: OS << "GLC"; break;
case ImmTySLC: OS << "SLC"; break;
case ImmTyTFE: OS << "TFE"; break;
@@ -668,15 +822,18 @@ public:
case ImmTyFORMAT: OS << "FORMAT"; break;
case ImmTyClampSI: OS << "ClampSI"; break;
case ImmTyOModSI: OS << "OModSI"; break;
+ case ImmTyDPP8: OS << "DPP8"; break;
case ImmTyDppCtrl: OS << "DppCtrl"; break;
case ImmTyDppRowMask: OS << "DppRowMask"; break;
case ImmTyDppBankMask: OS << "DppBankMask"; break;
case ImmTyDppBoundCtrl: OS << "DppBoundCtrl"; break;
+ case ImmTyDppFi: OS << "FI"; break;
case ImmTySdwaDstSel: OS << "SdwaDstSel"; break;
case ImmTySdwaSrc0Sel: OS << "SdwaSrc0Sel"; break;
case ImmTySdwaSrc1Sel: OS << "SdwaSrc1Sel"; break;
case ImmTySdwaDstUnused: OS << "SdwaDstUnused"; break;
case ImmTyDMask: OS << "DMask"; break;
+ case ImmTyDim: OS << "Dim"; break;
case ImmTyUNorm: OS << "UNorm"; break;
case ImmTyDA: OS << "DA"; break;
case ImmTyR128A16: OS << "R128A16"; break;
@@ -695,7 +852,12 @@ public:
case ImmTyNegLo: OS << "NegLo"; break;
case ImmTyNegHi: OS << "NegHi"; break;
case ImmTySwizzle: OS << "Swizzle"; break;
+ case ImmTyGprIdxMode: OS << "GprIdxMode"; break;
case ImmTyHigh: OS << "High"; break;
+ case ImmTyBLGP: OS << "BLGP"; break;
+ case ImmTyCBSZ: OS << "CBSZ"; break;
+ case ImmTyABID: OS << "ABID"; break;
+ case ImmTyEndpgm: OS << "Endpgm"; break;
}
}
@@ -747,12 +909,10 @@ public:
static AMDGPUOperand::Ptr CreateReg(const AMDGPUAsmParser *AsmParser,
unsigned RegNo, SMLoc S,
- SMLoc E,
- bool ForceVOP3) {
+ SMLoc E) {
auto Op = llvm::make_unique<AMDGPUOperand>(Register, AsmParser);
Op->Reg.RegNo = RegNo;
Op->Reg.Mods = Modifiers();
- Op->Reg.IsForcedVOP3 = ForceVOP3;
Op->StartLoc = S;
Op->EndLoc = E;
return Op;
@@ -817,6 +977,7 @@ public:
void usesRegister(RegisterKind RegKind, unsigned DwordRegIndex, unsigned RegWidth) {
switch (RegKind) {
case IS_SGPR: usesSgprAt(DwordRegIndex + RegWidth - 1); break;
+ case IS_AGPR: // fall through
case IS_VGPR: usesVgprAt(DwordRegIndex + RegWidth - 1); break;
default: break;
}
@@ -853,6 +1014,8 @@ private:
/// \param VCCUsed [in] Whether VCC special SGPR is reserved.
/// \param FlatScrUsed [in] Whether FLAT_SCRATCH special SGPR is reserved.
/// \param XNACKUsed [in] Whether XNACK_MASK special SGPR is reserved.
+ /// \param EnableWavefrontSize32 [in] Value of ENABLE_WAVEFRONT_SIZE32 kernel
+ /// descriptor field, if valid.
/// \param NextFreeVGPR [in] Max VGPR number referenced, plus one.
/// \param VGPRRange [in] Token range, used for VGPR diagnostics.
/// \param NextFreeSGPR [in] Max SGPR number referenced, plus one.
@@ -861,9 +1024,10 @@ private:
/// \param SGPRBlocks [out] Result SGPR block count.
bool calculateGPRBlocks(const FeatureBitset &Features, bool VCCUsed,
bool FlatScrUsed, bool XNACKUsed,
- unsigned NextFreeVGPR, SMRange VGPRRange,
- unsigned NextFreeSGPR, SMRange SGPRRange,
- unsigned &VGPRBlocks, unsigned &SGPRBlocks);
+ Optional<bool> EnableWavefrontSize32, unsigned NextFreeVGPR,
+ SMRange VGPRRange, unsigned NextFreeSGPR,
+ SMRange SGPRRange, unsigned &VGPRBlocks,
+ unsigned &SGPRBlocks);
bool ParseDirectiveAMDGCNTarget();
bool ParseDirectiveAMDHSAKernel();
bool ParseDirectiveMajorMinor(uint32_t &Major, uint32_t &Minor);
@@ -876,7 +1040,15 @@ private:
bool ParseDirectiveISAVersion();
bool ParseDirectiveHSAMetadata();
+ bool ParseDirectivePALMetadataBegin();
bool ParseDirectivePALMetadata();
+ bool ParseDirectiveAMDGPULDS();
+
+ /// Common code to parse out a block of text (typically YAML) between start and
+ /// end directives.
+ bool ParseToEndDirective(const char *AssemblerDirectiveBegin,
+ const char *AssemblerDirectiveEnd,
+ std::string &CollectString);
bool AddNextRegisterToList(unsigned& Reg, unsigned& RegWidth,
RegisterKind RegKind, unsigned Reg1,
@@ -884,6 +1056,8 @@ private:
bool ParseAMDGPURegister(RegisterKind& RegKind, unsigned& Reg,
unsigned& RegNum, unsigned& RegWidth,
unsigned *DwordRegIndex);
+ bool isRegister();
+ bool isRegister(const AsmToken &Token, const AsmToken &NextToken) const;
Optional<StringRef> getGprCountSymbolName(RegisterKind RegKind);
void initializeGprCountSymbol(RegisterKind RegKind);
bool updateGprCountSymbols(RegisterKind RegKind, unsigned DwordRegIndex,
@@ -897,6 +1071,10 @@ public:
enum AMDGPUMatchResultTy {
Match_PreferE32 = FIRST_TARGET_MATCH_RESULT_TY
};
+ enum OperandMode {
+ OperandMode_Default,
+ OperandMode_NSA,
+ };
using OptionalImmIndexMap = std::map<AMDGPUOperand::ImmTy, unsigned>;
@@ -908,7 +1086,7 @@ public:
if (getFeatureBits().none()) {
// Set default features.
- copySTI().ToggleFeature("SOUTHERN_ISLANDS");
+ copySTI().ToggleFeature("southern-islands");
}
setAvailableFeatures(ComputeAvailableFeatures(getFeatureBits()));
@@ -924,6 +1102,10 @@ public:
MCSymbol *Sym =
Ctx.getOrCreateSymbol(Twine(".amdgcn.gfx_generation_number"));
Sym->setVariableValue(MCConstantExpr::create(ISA.Major, Ctx));
+ Sym = Ctx.getOrCreateSymbol(Twine(".amdgcn.gfx_generation_minor"));
+ Sym->setVariableValue(MCConstantExpr::create(ISA.Minor, Ctx));
+ Sym = Ctx.getOrCreateSymbol(Twine(".amdgcn.gfx_generation_stepping"));
+ Sym->setVariableValue(MCConstantExpr::create(ISA.Stepping, Ctx));
} else {
MCSymbol *Sym =
Ctx.getOrCreateSymbol(Twine(".option.machine_version_major"));
@@ -969,6 +1151,10 @@ public:
return AMDGPU::isGFX9(getSTI());
}
+ bool isGFX10() const {
+ return AMDGPU::isGFX10(getSTI());
+ }
+
bool hasInv2PiInlineImm() const {
return getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm];
}
@@ -978,7 +1164,11 @@ public:
}
bool hasSGPR102_SGPR103() const {
- return !isVI();
+ return !isVI() && !isGFX9();
+ }
+
+ bool hasSGPR104_SGPR105() const {
+ return isGFX10();
}
bool hasIntClamp() const {
@@ -1024,7 +1214,8 @@ public:
uint64_t &ErrorInfo,
bool MatchingInlineAsm) override;
bool ParseDirective(AsmToken DirectiveID) override;
- OperandMatchResultTy parseOperand(OperandVector &Operands, StringRef Mnemonic);
+ OperandMatchResultTy parseOperand(OperandVector &Operands, StringRef Mnemonic,
+ OperandMode Mode = OperandMode_Default);
StringRef parseMnemonicSuffix(StringRef Name);
bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
SMLoc NameLoc, OperandVector &Operands) override;
@@ -1037,11 +1228,11 @@ public:
AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone,
bool (*ConvertResult)(int64_t &) = nullptr);
- OperandMatchResultTy parseOperandArrayWithPrefix(
- const char *Prefix,
- OperandVector &Operands,
- AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone,
- bool (*ConvertResult)(int64_t&) = nullptr);
+ OperandMatchResultTy
+ parseOperandArrayWithPrefix(const char *Prefix,
+ OperandVector &Operands,
+ AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone,
+ bool (*ConvertResult)(int64_t&) = nullptr);
OperandMatchResultTy
parseNamedBit(const char *Name, OperandVector &Operands,
@@ -1049,10 +1240,15 @@ public:
OperandMatchResultTy parseStringWithPrefix(StringRef Prefix,
StringRef &Value);
- bool parseAbsoluteExpr(int64_t &Val, bool AbsMod = false);
- OperandMatchResultTy parseImm(OperandVector &Operands, bool AbsMod = false);
+ bool isModifier();
+ bool isOperandModifier(const AsmToken &Token, const AsmToken &NextToken) const;
+ bool isRegOrOperandModifier(const AsmToken &Token, const AsmToken &NextToken) const;
+ bool isNamedOperandModifier(const AsmToken &Token, const AsmToken &NextToken) const;
+ bool isOpcodeModifierWithVal(const AsmToken &Token, const AsmToken &NextToken) const;
+ bool parseSP3NegModifier();
+ OperandMatchResultTy parseImm(OperandVector &Operands, bool HasSP3AbsModifier = false);
OperandMatchResultTy parseReg(OperandVector &Operands);
- OperandMatchResultTy parseRegOrImm(OperandVector &Operands, bool AbsMod = false);
+ OperandMatchResultTy parseRegOrImm(OperandVector &Operands, bool HasSP3AbsMod = false);
OperandMatchResultTy parseRegOrImmWithFPInputMods(OperandVector &Operands, bool AllowImm = true);
OperandMatchResultTy parseRegOrImmWithIntInputMods(OperandVector &Operands, bool AllowImm = true);
OperandMatchResultTy parseRegWithFPInputMods(OperandVector &Operands);
@@ -1073,33 +1269,63 @@ private:
struct OperandInfoTy {
int64_t Id;
bool IsSymbolic = false;
+ bool IsDefined = false;
OperandInfoTy(int64_t Id_) : Id(Id_) {}
};
- bool parseSendMsgConstruct(OperandInfoTy &Msg, OperandInfoTy &Operation, int64_t &StreamId);
- bool parseHwregConstruct(OperandInfoTy &HwReg, int64_t &Offset, int64_t &Width);
+ bool parseSendMsgBody(OperandInfoTy &Msg, OperandInfoTy &Op, OperandInfoTy &Stream);
+ bool validateSendMsg(const OperandInfoTy &Msg,
+ const OperandInfoTy &Op,
+ const OperandInfoTy &Stream,
+ const SMLoc Loc);
+
+ bool parseHwregBody(OperandInfoTy &HwReg, int64_t &Offset, int64_t &Width);
+ bool validateHwreg(const OperandInfoTy &HwReg,
+ const int64_t Offset,
+ const int64_t Width,
+ const SMLoc Loc);
void errorExpTgt();
OperandMatchResultTy parseExpTgtImpl(StringRef Str, uint8_t &Val);
+ SMLoc getFlatOffsetLoc(const OperandVector &Operands) const;
- bool validateInstruction(const MCInst &Inst, const SMLoc &IDLoc);
+ bool validateInstruction(const MCInst &Inst, const SMLoc &IDLoc, const OperandVector &Operands);
+ bool validateFlatOffset(const MCInst &Inst, const OperandVector &Operands);
+ bool validateSOPLiteral(const MCInst &Inst) const;
bool validateConstantBusLimitations(const MCInst &Inst);
bool validateEarlyClobberLimitations(const MCInst &Inst);
bool validateIntClampSupported(const MCInst &Inst);
bool validateMIMGAtomicDMask(const MCInst &Inst);
bool validateMIMGGatherDMask(const MCInst &Inst);
bool validateMIMGDataSize(const MCInst &Inst);
+ bool validateMIMGAddrSize(const MCInst &Inst);
bool validateMIMGD16(const MCInst &Inst);
+ bool validateMIMGDim(const MCInst &Inst);
+ bool validateLdsDirect(const MCInst &Inst);
+ bool validateOpSel(const MCInst &Inst);
+ bool validateVccOperand(unsigned Reg) const;
+ bool validateVOP3Literal(const MCInst &Inst) const;
bool usesConstantBus(const MCInst &Inst, unsigned OpIdx);
bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const;
unsigned findImplicitSGPRReadInVOP(const MCInst &Inst) const;
+ bool isId(const StringRef Id) const;
+ bool isId(const AsmToken &Token, const StringRef Id) const;
+ bool isToken(const AsmToken::TokenKind Kind) const;
bool trySkipId(const StringRef Id);
+ bool trySkipId(const StringRef Id, const AsmToken::TokenKind Kind);
bool trySkipToken(const AsmToken::TokenKind Kind);
bool skipToken(const AsmToken::TokenKind Kind, const StringRef ErrMsg);
bool parseString(StringRef &Val, const StringRef ErrMsg = "expected a string");
+ void peekTokens(MutableArrayRef<AsmToken> Tokens);
+ AsmToken::TokenKind getTokenKind() const;
bool parseExpr(int64_t &Imm);
+ StringRef getTokenStr() const;
+ AsmToken peekToken();
+ AsmToken getToken() const;
+ SMLoc getLoc() const;
+ void lex();
public:
OperandMatchResultTy parseOptionalOperand(OperandVector &Operands);
@@ -1110,6 +1336,7 @@ public:
OperandMatchResultTy parseInterpSlot(OperandVector &Operands);
OperandMatchResultTy parseInterpAttr(OperandVector &Operands);
OperandMatchResultTy parseSOppBrTarget(OperandVector &Operands);
+ OperandMatchResultTy parseBoolReg(OperandVector &Operands);
bool parseSwizzleOperands(const unsigned OpNum, int64_t* Op,
const unsigned MinVal,
@@ -1124,20 +1351,23 @@ public:
bool parseSwizzleSwap(int64_t &Imm);
bool parseSwizzleReverse(int64_t &Imm);
+ OperandMatchResultTy parseGPRIdxMode(OperandVector &Operands);
+ int64_t parseGPRIdxMacro();
+
void cvtMubuf(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, false, false); }
void cvtMubufAtomic(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, false); }
void cvtMubufAtomicReturn(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, true); }
void cvtMubufLds(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, false, false, true); }
void cvtMtbuf(MCInst &Inst, const OperandVector &Operands);
+ AMDGPUOperand::Ptr defaultDLC() const;
AMDGPUOperand::Ptr defaultGLC() const;
AMDGPUOperand::Ptr defaultSLC() const;
AMDGPUOperand::Ptr defaultSMRDOffset8() const;
AMDGPUOperand::Ptr defaultSMRDOffset20() const;
AMDGPUOperand::Ptr defaultSMRDLiteralOffset() const;
- AMDGPUOperand::Ptr defaultOffsetU12() const;
- AMDGPUOperand::Ptr defaultOffsetS13() const;
+ AMDGPUOperand::Ptr defaultFlatOffset() const;
OperandMatchResultTy parseOModOperand(OperandVector &Operands);
@@ -1153,11 +1383,15 @@ public:
bool IsAtomic = false);
void cvtMIMGAtomic(MCInst &Inst, const OperandVector &Operands);
+ OperandMatchResultTy parseDim(OperandVector &Operands);
+ OperandMatchResultTy parseDPP8(OperandVector &Operands);
OperandMatchResultTy parseDPPCtrl(OperandVector &Operands);
AMDGPUOperand::Ptr defaultRowMask() const;
AMDGPUOperand::Ptr defaultBankMask() const;
AMDGPUOperand::Ptr defaultBoundCtrl() const;
- void cvtDPP(MCInst &Inst, const OperandVector &Operands);
+ AMDGPUOperand::Ptr defaultFI() const;
+ void cvtDPP(MCInst &Inst, const OperandVector &Operands, bool IsDPP8 = false);
+ void cvtDPP8(MCInst &Inst, const OperandVector &Operands) { cvtDPP(Inst, Operands, true); }
OperandMatchResultTy parseSDWASel(OperandVector &Operands, StringRef Prefix,
AMDGPUOperand::ImmTy Type);
@@ -1168,6 +1402,13 @@ public:
void cvtSdwaVOPC(MCInst &Inst, const OperandVector &Operands);
void cvtSDWA(MCInst &Inst, const OperandVector &Operands,
uint64_t BasicInstType, bool skipVcc = false);
+
+ AMDGPUOperand::Ptr defaultBLGP() const;
+ AMDGPUOperand::Ptr defaultCBSZ() const;
+ AMDGPUOperand::Ptr defaultABID() const;
+
+ OperandMatchResultTy parseEndpgmOp(OperandVector &Operands);
+ AMDGPUOperand::Ptr defaultEndpgmImmOperands() const;
};
struct OptionalOperand {
@@ -1203,6 +1444,8 @@ static const fltSemantics *getOpFltSemantics(uint8_t OperandType) {
case AMDGPU::OPERAND_REG_IMM_FP32:
case AMDGPU::OPERAND_REG_INLINE_C_INT32:
case AMDGPU::OPERAND_REG_INLINE_C_FP32:
+ case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
return &APFloat::IEEEsingle();
case AMDGPU::OPERAND_REG_IMM_INT64:
case AMDGPU::OPERAND_REG_IMM_FP64:
@@ -1215,6 +1458,12 @@ static const fltSemantics *getOpFltSemantics(uint8_t OperandType) {
case AMDGPU::OPERAND_REG_INLINE_C_FP16:
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16:
+ case AMDGPU::OPERAND_REG_IMM_V2INT16:
+ case AMDGPU::OPERAND_REG_IMM_V2FP16:
return &APFloat::IEEEhalf();
default:
llvm_unreachable("unsupported fp type");
@@ -1243,7 +1492,20 @@ static bool canLosslesslyConvertToFPType(APFloat &FPLiteral, MVT VT) {
return true;
}
+static bool isSafeTruncation(int64_t Val, unsigned Size) {
+ return isUIntN(Size, Val) || isIntN(Size, Val);
+}
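As a side note, isSafeTruncation above accepts a 64-bit value whenever it fits the target width either as an unsigned or as a signed quantity. A minimal standalone sketch of the same check, assuming plain standard C++ in place of the llvm::isUIntN/isIntN helpers used in the patch:

#include <cassert>
#include <cstdint>

// Hypothetical stand-in for the parser's isSafeTruncation: Val survives
// truncation to Size bits if it is representable either as an unsigned
// or as a signed Size-bit integer.
static bool safeTruncationSketch(int64_t Val, unsigned Size) {
  assert(Size > 0 && Size <= 64);
  if (Size == 64)
    return true;
  const int64_t SMin = -(int64_t(1) << (Size - 1));
  const int64_t SMax = (int64_t(1) << (Size - 1)) - 1;
  const uint64_t UMax = (uint64_t(1) << Size) - 1;
  return static_cast<uint64_t>(Val) <= UMax || (Val >= SMin && Val <= SMax);
}

int main() {
  assert(safeTruncationSketch(0xFFFF, 16));   // fits as unsigned 16-bit
  assert(safeTruncationSketch(-1, 16));       // fits as signed 16-bit
  assert(!safeTruncationSketch(0x10000, 16)); // needs 17 bits
}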
+
bool AMDGPUOperand::isInlinableImm(MVT type) const {
+
+ // This is a hack to enable named inline values like
+ // shared_base with both 32-bit and 64-bit operands.
+ // Note that these values are defined as
+ // 32-bit operands only.
+ if (isInlineValue()) {
+ return true;
+ }
+
if (!isImmTy(ImmTyNone)) {
// Only plain immediates are inlinable (e.g. "clamp" attribute is not)
return false;
@@ -1282,6 +1544,10 @@ bool AMDGPUOperand::isInlinableImm(MVT type) const {
AsmParser->hasInv2PiInlineImm());
}
+ if (!isSafeTruncation(Imm.Val, type.getScalarSizeInBits())) {
+ return false;
+ }
+
if (type.getScalarSizeInBits() == 16) {
return AMDGPU::isInlinableLiteral16(
static_cast<int16_t>(Literal.getLoBits(16).getSExtValue()),
@@ -1315,7 +1581,7 @@ bool AMDGPUOperand::isLiteralImm(MVT type) const {
// FIXME: 64-bit operands can zero extend, sign extend, or pad zeroes for FP
// types.
- return isUIntN(Size, Imm.Val) || isIntN(Size, Imm.Val);
+ return isSafeTruncation(Imm.Val, Size);
}
// We got fp literal token
@@ -1330,8 +1596,14 @@ bool AMDGPUOperand::isLiteralImm(MVT type) const {
return false;
}
+ // We allow fp literals with f16x2 operands assuming that the specified
+ // literal goes into the lower half and the upper half is zero. We also
+ // require that the literal can be losslessly converted to f16.
+ MVT ExpectedType = (type == MVT::v2f16)? MVT::f16 :
+ (type == MVT::v2i16)? MVT::i16 : type;
+
APFloat FPLiteral(APFloat::IEEEdouble(), APInt(64, Imm.Val));
- return canLosslesslyConvertToFPType(FPLiteral, type);
+ return canLosslesslyConvertToFPType(FPLiteral, ExpectedType);
}
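The new isLiteralImm path accepts an fp literal for a packed-f16 operand only if it converts losslessly to f16, conceptually landing in the low half with the high half zero. The real check goes through APFloat; the sketch below only illustrates the lossless-conversion criterion, and uses f32 instead of f16 purely to keep it within standard C++:

#include <cassert>

// Illustrative analogue of canLosslesslyConvertToFPType: a double literal is
// acceptable if rounding it to the narrower type and widening it back
// reproduces the original value. The parser performs this test for f16 via
// APFloat; this f32 version is only a simplified stand-in.
static bool convertsLosslesslyToF32(double D) {
  float F = static_cast<float>(D); // round to the narrower format
  return static_cast<double>(F) == D;
}

int main() {
  assert(convertsLosslesslyToF32(1.5));   // exactly representable
  assert(convertsLosslesslyToF32(-0.25)); // exactly representable
  assert(!convertsLosslesslyToF32(0.1));  // rounds, so it would be rejected
}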
bool AMDGPUOperand::isRegClass(unsigned RCID) const {
@@ -1340,9 +1612,9 @@ bool AMDGPUOperand::isRegClass(unsigned RCID) const {
bool AMDGPUOperand::isSDWAOperand(MVT type) const {
if (AsmParser->isVI())
- return isVReg();
- else if (AsmParser->isGFX9())
- return isRegKind() || isInlinableImm(type);
+ return isVReg32();
+ else if (AsmParser->isGFX9() || AsmParser->isGFX10())
+ return isRegClass(AMDGPU::VS_32RegClassID) || isInlinableImm(type);
else
return false;
}
@@ -1363,6 +1635,11 @@ bool AMDGPUOperand::isSDWAInt32Operand() const {
return isSDWAOperand(MVT::i32);
}
+bool AMDGPUOperand::isBoolReg() const {
+ return AsmParser->getFeatureBits()[AMDGPU::FeatureWavefrontSize64] ?
+ isSCSrcB64() : isSCSrcB32();
+}
+
uint64_t AMDGPUOperand::applyInputFPModifiers(uint64_t Val, unsigned Size) const
{
assert(isImmTy(ImmTyNone) && Imm.Mods.hasFPModifiers());
@@ -1441,12 +1718,20 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
case AMDGPU::OPERAND_REG_IMM_FP32:
case AMDGPU::OPERAND_REG_INLINE_C_INT32:
case AMDGPU::OPERAND_REG_INLINE_C_FP32:
+ case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
case AMDGPU::OPERAND_REG_IMM_INT16:
case AMDGPU::OPERAND_REG_IMM_FP16:
case AMDGPU::OPERAND_REG_INLINE_C_INT16:
case AMDGPU::OPERAND_REG_INLINE_C_FP16:
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
- case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: {
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16:
+ case AMDGPU::OPERAND_REG_IMM_V2INT16:
+ case AMDGPU::OPERAND_REG_IMM_V2FP16: {
bool lost;
APFloat FPLiteral(APFloat::IEEEdouble(), Literal);
// Convert literal to single precision
@@ -1456,11 +1741,6 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
// checked earlier in isLiteralImm()
uint64_t ImmVal = FPLiteral.bitcastToAPInt().getZExtValue();
- if (OpTy == AMDGPU::OPERAND_REG_INLINE_C_V2INT16 ||
- OpTy == AMDGPU::OPERAND_REG_INLINE_C_V2FP16) {
- ImmVal |= (ImmVal << 16);
- }
-
Inst.addOperand(MCOperand::createImm(ImmVal));
return;
}
@@ -1471,15 +1751,18 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
return;
}
- // We got int literal token.
+ // We got int literal token.
// Only sign extend inline immediates.
- // FIXME: No errors on truncation
switch (OpTy) {
case AMDGPU::OPERAND_REG_IMM_INT32:
case AMDGPU::OPERAND_REG_IMM_FP32:
case AMDGPU::OPERAND_REG_INLINE_C_INT32:
case AMDGPU::OPERAND_REG_INLINE_C_FP32:
- if (isInt<32>(Val) &&
+ case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
+ case AMDGPU::OPERAND_REG_IMM_V2INT16:
+ case AMDGPU::OPERAND_REG_IMM_V2FP16:
+ if (isSafeTruncation(Val, 32) &&
AMDGPU::isInlinableLiteral32(static_cast<int32_t>(Val),
AsmParser->hasInv2PiInlineImm())) {
Inst.addOperand(MCOperand::createImm(Val));
@@ -1505,7 +1788,9 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
case AMDGPU::OPERAND_REG_IMM_FP16:
case AMDGPU::OPERAND_REG_INLINE_C_INT16:
case AMDGPU::OPERAND_REG_INLINE_C_FP16:
- if (isInt<16>(Val) &&
+ case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
+ if (isSafeTruncation(Val, 16) &&
AMDGPU::isInlinableLiteral16(static_cast<int16_t>(Val),
AsmParser->hasInv2PiInlineImm())) {
Inst.addOperand(MCOperand::createImm(Val));
@@ -1516,14 +1801,14 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
return;
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
- case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: {
- auto LiteralVal = static_cast<uint16_t>(Literal.getLoBits(16).getZExtValue());
- assert(AMDGPU::isInlinableLiteral16(LiteralVal,
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: {
+ assert(isSafeTruncation(Val, 16));
+ assert(AMDGPU::isInlinableLiteral16(static_cast<int16_t>(Val),
AsmParser->hasInv2PiInlineImm()));
- uint32_t ImmVal = static_cast<uint32_t>(LiteralVal) << 16 |
- static_cast<uint32_t>(LiteralVal);
- Inst.addOperand(MCOperand::createImm(ImmVal));
+ Inst.addOperand(MCOperand::createImm(Val));
return;
}
default:
@@ -1552,6 +1837,27 @@ void AMDGPUOperand::addRegOperands(MCInst &Inst, unsigned N) const {
Inst.addOperand(MCOperand::createReg(AMDGPU::getMCReg(getReg(), AsmParser->getSTI())));
}
+static bool isInlineValue(unsigned Reg) {
+ switch (Reg) {
+ case AMDGPU::SRC_SHARED_BASE:
+ case AMDGPU::SRC_SHARED_LIMIT:
+ case AMDGPU::SRC_PRIVATE_BASE:
+ case AMDGPU::SRC_PRIVATE_LIMIT:
+ case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
+ return true;
+ case AMDGPU::SRC_VCCZ:
+ case AMDGPU::SRC_EXECZ:
+ case AMDGPU::SRC_SCC:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool AMDGPUOperand::isInlineValue() const {
+ return isRegKind() && ::isInlineValue(getReg());
+}
+
//===----------------------------------------------------------------------===//
// AsmParser
//===----------------------------------------------------------------------===//
@@ -1585,6 +1891,15 @@ static int getRegClass(RegisterKind Is, unsigned RegWidth) {
case 8: return AMDGPU::SGPR_256RegClassID;
case 16: return AMDGPU::SGPR_512RegClassID;
}
+ } else if (Is == IS_AGPR) {
+ switch (RegWidth) {
+ default: return -1;
+ case 1: return AMDGPU::AGPR_32RegClassID;
+ case 2: return AMDGPU::AReg_64RegClassID;
+ case 4: return AMDGPU::AReg_128RegClassID;
+ case 16: return AMDGPU::AReg_512RegClassID;
+ case 32: return AMDGPU::AReg_1024RegClassID;
+ }
}
return -1;
}
@@ -1595,8 +1910,25 @@ static unsigned getSpecialRegForName(StringRef RegName) {
.Case("vcc", AMDGPU::VCC)
.Case("flat_scratch", AMDGPU::FLAT_SCR)
.Case("xnack_mask", AMDGPU::XNACK_MASK)
+ .Case("shared_base", AMDGPU::SRC_SHARED_BASE)
+ .Case("src_shared_base", AMDGPU::SRC_SHARED_BASE)
+ .Case("shared_limit", AMDGPU::SRC_SHARED_LIMIT)
+ .Case("src_shared_limit", AMDGPU::SRC_SHARED_LIMIT)
+ .Case("private_base", AMDGPU::SRC_PRIVATE_BASE)
+ .Case("src_private_base", AMDGPU::SRC_PRIVATE_BASE)
+ .Case("private_limit", AMDGPU::SRC_PRIVATE_LIMIT)
+ .Case("src_private_limit", AMDGPU::SRC_PRIVATE_LIMIT)
+ .Case("pops_exiting_wave_id", AMDGPU::SRC_POPS_EXITING_WAVE_ID)
+ .Case("src_pops_exiting_wave_id", AMDGPU::SRC_POPS_EXITING_WAVE_ID)
+ .Case("lds_direct", AMDGPU::LDS_DIRECT)
+ .Case("src_lds_direct", AMDGPU::LDS_DIRECT)
.Case("m0", AMDGPU::M0)
- .Case("scc", AMDGPU::SCC)
+ .Case("vccz", AMDGPU::SRC_VCCZ)
+ .Case("src_vccz", AMDGPU::SRC_VCCZ)
+ .Case("execz", AMDGPU::SRC_EXECZ)
+ .Case("src_execz", AMDGPU::SRC_EXECZ)
+ .Case("scc", AMDGPU::SRC_SCC)
+ .Case("src_scc", AMDGPU::SRC_SCC)
.Case("tba", AMDGPU::TBA)
.Case("tma", AMDGPU::TMA)
.Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
@@ -1611,6 +1943,7 @@ static unsigned getSpecialRegForName(StringRef RegName) {
.Case("tma_hi", AMDGPU::TMA_HI)
.Case("tba_lo", AMDGPU::TBA_LO)
.Case("tba_hi", AMDGPU::TBA_HI)
+ .Case("null", AMDGPU::SGPR_NULL)
.Default(0);
}
@@ -1663,6 +1996,7 @@ bool AMDGPUAsmParser::AddNextRegisterToList(unsigned &Reg, unsigned &RegWidth,
return false;
case IS_VGPR:
case IS_SGPR:
+ case IS_AGPR:
case IS_TTMP:
if (Reg1 != Reg + RegWidth) {
return false;
@@ -1674,6 +2008,53 @@ bool AMDGPUAsmParser::AddNextRegisterToList(unsigned &Reg, unsigned &RegWidth,
}
}
+static const StringRef Registers[] = {
+ { "v" },
+ { "s" },
+ { "ttmp" },
+ { "acc" },
+ { "a" },
+};
+
+bool
+AMDGPUAsmParser::isRegister(const AsmToken &Token,
+ const AsmToken &NextToken) const {
+
+ // A list of consecutive registers: [s0,s1,s2,s3]
+ if (Token.is(AsmToken::LBrac))
+ return true;
+
+ if (!Token.is(AsmToken::Identifier))
+ return false;
+
+ // A single register like s0 or a range of registers like s[0:1]
+
+ StringRef RegName = Token.getString();
+
+ for (StringRef Reg : Registers) {
+ if (RegName.startswith(Reg)) {
+ if (Reg.size() < RegName.size()) {
+ unsigned RegNum;
+ // A single register with an index: rXX
+ if (!RegName.substr(Reg.size()).getAsInteger(10, RegNum))
+ return true;
+ } else {
+ // A range of registers: r[XX:YY].
+ if (NextToken.is(AsmToken::LBrac))
+ return true;
+ }
+ }
+ }
+
+ return getSpecialRegForName(RegName);
+}
+
+bool
+AMDGPUAsmParser::isRegister()
+{
+ return isRegister(getToken(), peekToken());
+}
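The new isRegister look-ahead decides, without consuming tokens, whether the upcoming text names a register: a '[' starts a register list, otherwise an identifier must be one of the known prefixes (v, s, ttmp, acc, a) followed by a decimal index or by a '[' range, or a special register name. A self-contained sketch of that heuristic, with the '[' look-ahead modelled by a boolean and only a couple of the special names included:

#include <cassert>
#include <cctype>
#include <string>

// Simplified stand-in for the parser's register look-ahead. The prefix list
// mirrors the Registers table above; NextIsLBrac models peeking at a '['.
static bool looksLikeRegisterSketch(const std::string &Name, bool NextIsLBrac) {
  static const char *Prefixes[] = {"ttmp", "acc", "v", "s", "a"};
  for (const char *P : Prefixes) {
    const std::string Prefix(P);
    if (Name.compare(0, Prefix.size(), Prefix) != 0)
      continue;
    if (Name.size() > Prefix.size()) {
      // A single register with a decimal index, e.g. v255 or ttmp12.
      bool AllDigits = true;
      for (size_t I = Prefix.size(); I < Name.size(); ++I)
        if (!std::isdigit(static_cast<unsigned char>(Name[I])))
          AllDigits = false;
      if (AllDigits)
        return true;
    } else if (NextIsLBrac) {
      // A register range, e.g. s[0:1].
      return true;
    }
  }
  return Name == "vcc" || Name == "m0"; // a couple of the special names
}

int main() {
  assert(looksLikeRegisterSketch("v0", false));
  assert(looksLikeRegisterSketch("s", true)); // s[0:1]
  assert(looksLikeRegisterSketch("ttmp12", false));
  assert(!looksLikeRegisterSketch("foo", false));
}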
+
bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind &RegKind, unsigned &Reg,
unsigned &RegNum, unsigned &RegWidth,
unsigned *DwordRegIndex) {
@@ -1692,6 +2073,9 @@ bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind &RegKind, unsigned &Reg,
} else if (RegName[0] == 's') {
RegNumIndex = 1;
RegKind = IS_SGPR;
+ } else if (RegName[0] == 'a') {
+ RegNumIndex = RegName.startswith("acc") ? 3 : 1;
+ RegKind = IS_AGPR;
} else if (RegName.startswith("ttmp")) {
RegNumIndex = strlen("ttmp");
RegKind = IS_TTMP;
@@ -1773,6 +2157,7 @@ bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind &RegKind, unsigned &Reg,
break;
case IS_VGPR:
case IS_SGPR:
+ case IS_AGPR:
case IS_TTMP:
{
unsigned Size = 1;
@@ -1859,6 +2244,8 @@ std::unique_ptr<AMDGPUOperand> AMDGPUAsmParser::parseRegister() {
unsigned Reg, RegNum, RegWidth, DwordRegIndex;
if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth, &DwordRegIndex)) {
+ //FIXME: improve error messages (bug 41303).
+ Error(StartLoc, "not a valid operand.");
return nullptr;
}
if (AMDGPU::IsaInfo::hasCodeObjectV3(&getSTI())) {
@@ -1866,201 +2253,260 @@ std::unique_ptr<AMDGPUOperand> AMDGPUAsmParser::parseRegister() {
return nullptr;
} else
KernelScope.usesRegister(RegKind, DwordRegIndex, RegWidth);
- return AMDGPUOperand::CreateReg(this, Reg, StartLoc, EndLoc, false);
+ return AMDGPUOperand::CreateReg(this, Reg, StartLoc, EndLoc);
}
-bool
-AMDGPUAsmParser::parseAbsoluteExpr(int64_t &Val, bool AbsMod) {
- if (AbsMod && getLexer().peekTok().is(AsmToken::Pipe) &&
- (getLexer().getKind() == AsmToken::Integer ||
- getLexer().getKind() == AsmToken::Real)) {
- // This is a workaround for handling operands like these:
- // |1.0|
- // |-1|
- // This syntax is not compatible with syntax of standard
- // MC expressions (due to the trailing '|').
-
- SMLoc EndLoc;
- const MCExpr *Expr;
+OperandMatchResultTy
+AMDGPUAsmParser::parseImm(OperandVector &Operands, bool HasSP3AbsModifier) {
+ // TODO: add syntactic sugar for 1/(2*PI)
- if (getParser().parsePrimaryExpr(Expr, EndLoc)) {
- return true;
- }
+ assert(!isRegister());
+ assert(!isModifier());
- return !Expr->evaluateAsAbsolute(Val);
+ const auto& Tok = getToken();
+ const auto& NextTok = peekToken();
+ bool IsReal = Tok.is(AsmToken::Real);
+ SMLoc S = getLoc();
+ bool Negate = false;
+
+ if (!IsReal && Tok.is(AsmToken::Minus) && NextTok.is(AsmToken::Real)) {
+ lex();
+ IsReal = true;
+ Negate = true;
}
- return getParser().parseAbsoluteExpression(Val);
-}
+ if (IsReal) {
+ // Floating-point expressions are not supported;
+ // only floating-point literals with an optional
+ // sign are allowed.
-OperandMatchResultTy
-AMDGPUAsmParser::parseImm(OperandVector &Operands, bool AbsMod) {
- // TODO: add syntactic sugar for 1/(2*PI)
- bool Minus = false;
- if (getLexer().getKind() == AsmToken::Minus) {
- const AsmToken NextToken = getLexer().peekTok();
- if (!NextToken.is(AsmToken::Integer) &&
- !NextToken.is(AsmToken::Real)) {
- return MatchOperand_NoMatch;
- }
- Minus = true;
- Parser.Lex();
- }
+ StringRef Num = getTokenStr();
+ lex();
- SMLoc S = Parser.getTok().getLoc();
- switch(getLexer().getKind()) {
- case AsmToken::Integer: {
- int64_t IntVal;
- if (parseAbsoluteExpr(IntVal, AbsMod))
+ APFloat RealVal(APFloat::IEEEdouble());
+ auto roundMode = APFloat::rmNearestTiesToEven;
+ if (RealVal.convertFromString(Num, roundMode) == APFloat::opInvalidOp) {
return MatchOperand_ParseFail;
- if (Minus)
- IntVal *= -1;
- Operands.push_back(AMDGPUOperand::CreateImm(this, IntVal, S));
+ }
+ if (Negate)
+ RealVal.changeSign();
+
+ Operands.push_back(
+ AMDGPUOperand::CreateImm(this, RealVal.bitcastToAPInt().getZExtValue(), S,
+ AMDGPUOperand::ImmTyNone, true));
+
return MatchOperand_Success;
- }
- case AsmToken::Real: {
+
+ } else {
int64_t IntVal;
- if (parseAbsoluteExpr(IntVal, AbsMod))
- return MatchOperand_ParseFail;
+ const MCExpr *Expr;
+ SMLoc S = getLoc();
+
+ if (HasSP3AbsModifier) {
+ // This is a workaround for handling expressions
+ // as arguments of SP3 'abs' modifier, for example:
+ // |1.0|
+ // |-1|
+ // |1+x|
+ // This syntax is not compatible with syntax of standard
+ // MC expressions (due to the trailing '|').
+ SMLoc EndLoc;
+ if (getParser().parsePrimaryExpr(Expr, EndLoc))
+ return MatchOperand_ParseFail;
+ } else {
+ if (Parser.parseExpression(Expr))
+ return MatchOperand_ParseFail;
+ }
+
+ if (Expr->evaluateAsAbsolute(IntVal)) {
+ Operands.push_back(AMDGPUOperand::CreateImm(this, IntVal, S));
+ } else {
+ Operands.push_back(AMDGPUOperand::CreateExpr(this, Expr, S));
+ }
- APFloat F(BitsToDouble(IntVal));
- if (Minus)
- F.changeSign();
- Operands.push_back(
- AMDGPUOperand::CreateImm(this, F.bitcastToAPInt().getZExtValue(), S,
- AMDGPUOperand::ImmTyNone, true));
return MatchOperand_Success;
}
- default:
- return MatchOperand_NoMatch;
- }
+
+ return MatchOperand_NoMatch;
}
OperandMatchResultTy
AMDGPUAsmParser::parseReg(OperandVector &Operands) {
+ if (!isRegister())
+ return MatchOperand_NoMatch;
+
if (auto R = parseRegister()) {
assert(R->isReg());
- R->Reg.IsForcedVOP3 = isForcedVOP3();
Operands.push_back(std::move(R));
return MatchOperand_Success;
}
- return MatchOperand_NoMatch;
+ return MatchOperand_ParseFail;
}
OperandMatchResultTy
-AMDGPUAsmParser::parseRegOrImm(OperandVector &Operands, bool AbsMod) {
- auto res = parseImm(Operands, AbsMod);
+AMDGPUAsmParser::parseRegOrImm(OperandVector &Operands, bool HasSP3AbsMod) {
+ auto res = parseReg(Operands);
if (res != MatchOperand_NoMatch) {
return res;
+ } else if (isModifier()) {
+ return MatchOperand_NoMatch;
+ } else {
+ return parseImm(Operands, HasSP3AbsMod);
}
+}
- return parseReg(Operands);
+bool
+AMDGPUAsmParser::isNamedOperandModifier(const AsmToken &Token, const AsmToken &NextToken) const {
+ if (Token.is(AsmToken::Identifier) && NextToken.is(AsmToken::LParen)) {
+ const auto &str = Token.getString();
+ return str == "abs" || str == "neg" || str == "sext";
+ }
+ return false;
}
-OperandMatchResultTy
-AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands,
- bool AllowImm) {
- bool Negate = false, Negate2 = false, Abs = false, Abs2 = false;
+bool
+AMDGPUAsmParser::isOpcodeModifierWithVal(const AsmToken &Token, const AsmToken &NextToken) const {
+ return Token.is(AsmToken::Identifier) && NextToken.is(AsmToken::Colon);
+}
- if (getLexer().getKind()== AsmToken::Minus) {
- const AsmToken NextToken = getLexer().peekTok();
+bool
+AMDGPUAsmParser::isOperandModifier(const AsmToken &Token, const AsmToken &NextToken) const {
+ return isNamedOperandModifier(Token, NextToken) || Token.is(AsmToken::Pipe);
+}
- // Disable ambiguous constructs like '--1' etc. Should use neg(-1) instead.
- if (NextToken.is(AsmToken::Minus)) {
- Error(Parser.getTok().getLoc(), "invalid syntax, expected 'neg' modifier");
- return MatchOperand_ParseFail;
- }
+bool
+AMDGPUAsmParser::isRegOrOperandModifier(const AsmToken &Token, const AsmToken &NextToken) const {
+ return isRegister(Token, NextToken) || isOperandModifier(Token, NextToken);
+}
+
+// Check if this is an operand modifier or an opcode modifier
+// which may look like an expression but is not. We should
+// avoid parsing these modifiers as expressions. Currently
+// recognized sequences are:
+// |...|
+// abs(...)
+// neg(...)
+// sext(...)
+// -reg
+// -|...|
+// -abs(...)
+// name:...
+// Note that simple opcode modifiers like 'gds' may be parsed as
+// expressions; this is a special case. See getExpressionAsToken.
+//
+bool
+AMDGPUAsmParser::isModifier() {
- // '-' followed by an integer literal N should be interpreted as integer
- // negation rather than a floating-point NEG modifier applied to N.
- // Beside being contr-intuitive, such use of floating-point NEG modifier
- // results in different meaning of integer literals used with VOP1/2/C
- // and VOP3, for example:
- // v_exp_f32_e32 v5, -1 // VOP1: src0 = 0xFFFFFFFF
- // v_exp_f32_e64 v5, -1 // VOP3: src0 = 0x80000001
- // Negative fp literals should be handled likewise for unifomtity
- if (!NextToken.is(AsmToken::Integer) && !NextToken.is(AsmToken::Real)) {
- Parser.Lex();
- Negate = true;
- }
+ AsmToken Tok = getToken();
+ AsmToken NextToken[2];
+ peekTokens(NextToken);
+
+ return isOperandModifier(Tok, NextToken[0]) ||
+ (Tok.is(AsmToken::Minus) && isRegOrOperandModifier(NextToken[0], NextToken[1])) ||
+ isOpcodeModifierWithVal(Tok, NextToken[0]);
+}
+
+// Check if the current token is an SP3 'neg' modifier.
+// Currently this modifier is allowed in the following contexts:
+//
+// 1. Before a register, e.g. "-v0", "-v[...]" or "-[v0,v1]".
+// 2. Before an 'abs' modifier: -abs(...)
+// 3. Before an SP3 'abs' modifier: -|...|
+//
+// In all other cases "-" is handled as a part
+// of an expression that follows the sign.
+//
+// Note: When "-" is followed by an integer literal N,
+// it is interpreted as integer negation rather than
+// a floating-point NEG modifier applied to N.
+// Besides being counter-intuitive, such use of the
+// floating-point NEG modifier would have given integer
+// literals different meanings with VOP1/2/C and VOP3,
+// for example:
+// v_exp_f32_e32 v5, -1 // VOP1: src0 = 0xFFFFFFFF
+// v_exp_f32_e64 v5, -1 // VOP3: src0 = 0x80000001
+// Negative fp literals with a preceding "-" are
+// handled likewise, for uniformity.
+//
+bool
+AMDGPUAsmParser::parseSP3NegModifier() {
+
+ AsmToken NextToken[2];
+ peekTokens(NextToken);
+
+ if (isToken(AsmToken::Minus) &&
+ (isRegister(NextToken[0], NextToken[1]) ||
+ NextToken[0].is(AsmToken::Pipe) ||
+ isId(NextToken[0], "abs"))) {
+ lex();
+ return true;
}
- if (getLexer().getKind() == AsmToken::Identifier &&
- Parser.getTok().getString() == "neg") {
- if (Negate) {
- Error(Parser.getTok().getLoc(), "expected register or immediate");
- return MatchOperand_ParseFail;
- }
- Parser.Lex();
- Negate2 = true;
- if (getLexer().isNot(AsmToken::LParen)) {
- Error(Parser.getTok().getLoc(), "expected left paren after neg");
- return MatchOperand_ParseFail;
- }
- Parser.Lex();
+ return false;
+}
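The worked values in the comment above can be reproduced directly. Nothing below is part of the parser; it merely spells out the two interpretations of "-1" that the comment contrasts (ordinary integer negation versus the literal 1 displayed with its sign bit set, as in src0 = 0x80000001):

#include <cassert>
#include <cstdint>

int main() {
  // "-1" parsed as an integer literal: ordinary two's-complement negation.
  const uint32_t AsIntegerNegation = static_cast<uint32_t>(-1); // 0xFFFFFFFF

  // "-1" parsed as an FP NEG modifier applied to the literal 1: the comment
  // above displays this as the 32-bit payload with the IEEE-754 sign bit set.
  const uint32_t AsFpNegModifier = UINT32_C(1) | UINT32_C(0x80000000);

  assert(AsIntegerNegation == 0xFFFFFFFFu);
  assert(AsFpNegModifier == 0x80000001u);
}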
+
+OperandMatchResultTy
+AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands,
+ bool AllowImm) {
+ bool Neg, SP3Neg;
+ bool Abs, SP3Abs;
+ SMLoc Loc;
+
+ // Disable ambiguous constructs like '--1' etc. Should use neg(-1) instead.
+ if (isToken(AsmToken::Minus) && peekToken().is(AsmToken::Minus)) {
+ Error(getLoc(), "invalid syntax, expected 'neg' modifier");
+ return MatchOperand_ParseFail;
}
- if (getLexer().getKind() == AsmToken::Identifier &&
- Parser.getTok().getString() == "abs") {
- Parser.Lex();
- Abs2 = true;
- if (getLexer().isNot(AsmToken::LParen)) {
- Error(Parser.getTok().getLoc(), "expected left paren after abs");
- return MatchOperand_ParseFail;
- }
- Parser.Lex();
+ SP3Neg = parseSP3NegModifier();
+
+ Loc = getLoc();
+ Neg = trySkipId("neg");
+ if (Neg && SP3Neg) {
+ Error(Loc, "expected register or immediate");
+ return MatchOperand_ParseFail;
}
+ if (Neg && !skipToken(AsmToken::LParen, "expected left paren after neg"))
+ return MatchOperand_ParseFail;
- if (getLexer().getKind() == AsmToken::Pipe) {
- if (Abs2) {
- Error(Parser.getTok().getLoc(), "expected register or immediate");
- return MatchOperand_ParseFail;
- }
- Parser.Lex();
- Abs = true;
+ Abs = trySkipId("abs");
+ if (Abs && !skipToken(AsmToken::LParen, "expected left paren after abs"))
+ return MatchOperand_ParseFail;
+
+ Loc = getLoc();
+ SP3Abs = trySkipToken(AsmToken::Pipe);
+ if (Abs && SP3Abs) {
+ Error(Loc, "expected register or immediate");
+ return MatchOperand_ParseFail;
}
OperandMatchResultTy Res;
if (AllowImm) {
- Res = parseRegOrImm(Operands, Abs);
+ Res = parseRegOrImm(Operands, SP3Abs);
} else {
Res = parseReg(Operands);
}
if (Res != MatchOperand_Success) {
- return Res;
+ return (SP3Neg || Neg || SP3Abs || Abs)? MatchOperand_ParseFail : Res;
}
- AMDGPUOperand::Modifiers Mods;
- if (Abs) {
- if (getLexer().getKind() != AsmToken::Pipe) {
- Error(Parser.getTok().getLoc(), "expected vertical bar");
- return MatchOperand_ParseFail;
- }
- Parser.Lex();
- Mods.Abs = true;
- }
- if (Abs2) {
- if (getLexer().isNot(AsmToken::RParen)) {
- Error(Parser.getTok().getLoc(), "expected closing parentheses");
- return MatchOperand_ParseFail;
- }
- Parser.Lex();
- Mods.Abs = true;
- }
+ if (SP3Abs && !skipToken(AsmToken::Pipe, "expected vertical bar"))
+ return MatchOperand_ParseFail;
+ if (Abs && !skipToken(AsmToken::RParen, "expected closing parentheses"))
+ return MatchOperand_ParseFail;
+ if (Neg && !skipToken(AsmToken::RParen, "expected closing parentheses"))
+ return MatchOperand_ParseFail;
- if (Negate) {
- Mods.Neg = true;
- } else if (Negate2) {
- if (getLexer().isNot(AsmToken::RParen)) {
- Error(Parser.getTok().getLoc(), "expected closing parentheses");
- return MatchOperand_ParseFail;
- }
- Parser.Lex();
- Mods.Neg = true;
- }
+ AMDGPUOperand::Modifiers Mods;
+ Mods.Abs = Abs || SP3Abs;
+ Mods.Neg = Neg || SP3Neg;
if (Mods.hasFPModifiers()) {
AMDGPUOperand &Op = static_cast<AMDGPUOperand &>(*Operands.back());
+ if (Op.isExpr()) {
+ Error(Op.getStartLoc(), "expected an absolute expression");
+ return MatchOperand_ParseFail;
+ }
Op.setModifiers(Mods);
}
return MatchOperand_Success;
@@ -2069,18 +2515,9 @@ AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands,
OperandMatchResultTy
AMDGPUAsmParser::parseRegOrImmWithIntInputMods(OperandVector &Operands,
bool AllowImm) {
- bool Sext = false;
-
- if (getLexer().getKind() == AsmToken::Identifier &&
- Parser.getTok().getString() == "sext") {
- Parser.Lex();
- Sext = true;
- if (getLexer().isNot(AsmToken::LParen)) {
- Error(Parser.getTok().getLoc(), "expected left paren after sext");
- return MatchOperand_ParseFail;
- }
- Parser.Lex();
- }
+ bool Sext = trySkipId("sext");
+ if (Sext && !skipToken(AsmToken::LParen, "expected left paren after sext"))
+ return MatchOperand_ParseFail;
OperandMatchResultTy Res;
if (AllowImm) {
@@ -2089,21 +2526,21 @@ AMDGPUAsmParser::parseRegOrImmWithIntInputMods(OperandVector &Operands,
Res = parseReg(Operands);
}
if (Res != MatchOperand_Success) {
- return Res;
+ return Sext? MatchOperand_ParseFail : Res;
}
+ if (Sext && !skipToken(AsmToken::RParen, "expected closing parentheses"))
+ return MatchOperand_ParseFail;
+
AMDGPUOperand::Modifiers Mods;
- if (Sext) {
- if (getLexer().isNot(AsmToken::RParen)) {
- Error(Parser.getTok().getLoc(), "expected closing parentheses");
- return MatchOperand_ParseFail;
- }
- Parser.Lex();
- Mods.Sext = true;
- }
+ Mods.Sext = Sext;
if (Mods.hasIntModifiers()) {
AMDGPUOperand &Op = static_cast<AMDGPUOperand &>(*Operands.back());
+ if (Op.isExpr()) {
+ Error(Op.getStartLoc(), "expected an absolute expression");
+ return MatchOperand_ParseFail;
+ }
Op.setModifiers(Mods);
}
@@ -2121,21 +2558,24 @@ AMDGPUAsmParser::parseRegWithIntInputMods(OperandVector &Operands) {
}
OperandMatchResultTy AMDGPUAsmParser::parseVReg32OrOff(OperandVector &Operands) {
+ auto Loc = getLoc();
+ if (trySkipId("off")) {
+ Operands.push_back(AMDGPUOperand::CreateImm(this, 0, Loc,
+ AMDGPUOperand::ImmTyOff, false));
+ return MatchOperand_Success;
+ }
+
+ if (!isRegister())
+ return MatchOperand_NoMatch;
+
std::unique_ptr<AMDGPUOperand> Reg = parseRegister();
if (Reg) {
Operands.push_back(std::move(Reg));
return MatchOperand_Success;
}
- const AsmToken &Tok = Parser.getTok();
- if (Tok.getString() == "off") {
- Operands.push_back(AMDGPUOperand::CreateImm(this, 0, Tok.getLoc(),
- AMDGPUOperand::ImmTyOff, false));
- Parser.Lex();
- return MatchOperand_Success;
- }
+ return MatchOperand_ParseFail;
- return MatchOperand_NoMatch;
}
unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
@@ -2163,15 +2603,6 @@ unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
}
}
- if ((TSFlags & SIInstrFlags::FLAT) && !hasFlatOffsets()) {
- // FIXME: Produces error without correct column reported.
- auto OpNum =
- AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::offset);
- const auto &Op = Inst.getOperand(OpNum);
- if (Op.getImm() != 0)
- return Match_InvalidOperand;
- }
-
return Match_Success;
}
@@ -2214,7 +2645,10 @@ unsigned AMDGPUAsmParser::findImplicitSGPRReadInVOP(const MCInst &Inst) const {
switch (Reg) {
case AMDGPU::FLAT_SCR:
case AMDGPU::VCC:
+ case AMDGPU::VCC_LO:
+ case AMDGPU::VCC_HI:
case AMDGPU::M0:
+ case AMDGPU::SGPR_NULL:
return Reg;
default:
break;
@@ -2248,7 +2682,11 @@ bool AMDGPUAsmParser::isInlineConstant(const MCInst &Inst,
case 2: {
const unsigned OperandType = Desc.OpInfo[OpIdx].OperandType;
if (OperandType == AMDGPU::OPERAND_REG_INLINE_C_V2INT16 ||
- OperandType == AMDGPU::OPERAND_REG_INLINE_C_V2FP16) {
+ OperandType == AMDGPU::OPERAND_REG_INLINE_C_V2FP16 ||
+ OperandType == AMDGPU::OPERAND_REG_INLINE_AC_V2INT16 ||
+ OperandType == AMDGPU::OPERAND_REG_INLINE_AC_V2FP16 ||
+ OperandType == AMDGPU::OPERAND_REG_IMM_V2INT16 ||
+ OperandType == AMDGPU::OPERAND_REG_IMM_V2FP16) {
return AMDGPU::isInlinableLiteralV216(Val, hasInv2PiInlineImm());
} else {
return AMDGPU::isInlinableLiteral16(Val, hasInv2PiInlineImm());
@@ -2272,6 +2710,8 @@ bool AMDGPUAsmParser::validateConstantBusLimitations(const MCInst &Inst) {
const unsigned Opcode = Inst.getOpcode();
const MCInstrDesc &Desc = MII.get(Opcode);
unsigned ConstantBusUseCount = 0;
+ unsigned NumLiterals = 0;
+ unsigned LiteralSize;
if (Desc.TSFlags &
(SIInstrFlags::VOPC |
@@ -2283,8 +2723,10 @@ bool AMDGPUAsmParser::validateConstantBusLimitations(const MCInst &Inst) {
++ConstantBusUseCount;
}
+ SmallDenseSet<unsigned> SGPRsUsed;
unsigned SGPRUsed = findImplicitSGPRReadInVOP(Inst);
if (SGPRUsed != AMDGPU::NoRegister) {
+ SGPRsUsed.insert(SGPRUsed);
++ConstantBusUseCount;
}
@@ -2307,16 +2749,41 @@ bool AMDGPUAsmParser::validateConstantBusLimitations(const MCInst &Inst) {
// flat_scratch_lo, flat_scratch_hi
// are theoretically valid but they are disabled anyway.
// Note that this code mimics SIInstrInfo::verifyInstruction
- if (Reg != SGPRUsed) {
+ if (!SGPRsUsed.count(Reg)) {
+ SGPRsUsed.insert(Reg);
++ConstantBusUseCount;
}
- SGPRUsed = Reg;
} else { // Expression or a literal
- ++ConstantBusUseCount;
+
+ if (Desc.OpInfo[OpIdx].OperandType == MCOI::OPERAND_IMMEDIATE)
+ continue; // special operand like VINTERP attr_chan
+
+ // An instruction may use only one literal.
+ // This has been validated in a previous step;
+ // see validateVOP3Literal.
+ // The literal may, however, be used by more than one operand.
+ // If all these operands are of the same size,
+ // this literal counts as one scalar value.
+ // Otherwise it counts as 2 scalar values.
+ // See "GFX10 Shader Programming", section 3.6.2.3.
+
+ unsigned Size = AMDGPU::getOperandSize(Desc, OpIdx);
+ if (Size < 4) Size = 4;
+
+ if (NumLiterals == 0) {
+ NumLiterals = 1;
+ LiteralSize = Size;
+ } else if (LiteralSize != Size) {
+ NumLiterals = 2;
+ }
}
}
}
}
+ ConstantBusUseCount += NumLiterals;
+
+ if (isGFX10())
+ return ConstantBusUseCount <= 2;
return ConstantBusUseCount <= 1;
}
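The literal accounting described in the comments above can be isolated into a few lines: a single literal used by several operands counts as one constant-bus scalar value when every use has the same (clamped-to-dword) size, and as two otherwise; GFX10 then allows up to two constant-bus reads where earlier targets allow one. A minimal sketch of just the literal part, with the per-operand sizes as hypothetical inputs (the parser derives them from the MCInstrDesc):

#include <cassert>
#include <vector>

// Counts how many constant-bus scalar values the single VOP literal
// contributes, given the byte sizes of the operands that use it.
static unsigned literalBusUseSketch(const std::vector<unsigned> &OperandSizes) {
  unsigned NumLiterals = 0;
  unsigned LiteralSize = 0;
  for (unsigned Size : OperandSizes) {
    if (Size < 4)
      Size = 4; // sub-dword uses still occupy a 32-bit literal slot
    if (NumLiterals == 0) {
      NumLiterals = 1;
      LiteralSize = Size;
    } else if (LiteralSize != Size) {
      NumLiterals = 2;
    }
  }
  return NumLiterals;
}

int main() {
  assert(literalBusUseSketch({4, 4}) == 1); // same size: one scalar value
  assert(literalBusUseSketch({4, 8}) == 2); // mixed sizes: two scalar values
  assert(literalBusUseSketch({}) == 0);     // literal not used at all
}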
@@ -2405,6 +2872,46 @@ bool AMDGPUAsmParser::validateMIMGDataSize(const MCInst &Inst) {
return (VDataSize / 4) == DataSize + TFESize;
}
+bool AMDGPUAsmParser::validateMIMGAddrSize(const MCInst &Inst) {
+ const unsigned Opc = Inst.getOpcode();
+ const MCInstrDesc &Desc = MII.get(Opc);
+
+ if ((Desc.TSFlags & SIInstrFlags::MIMG) == 0 || !isGFX10())
+ return true;
+
+ const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
+ const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
+ AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
+ int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
+ int SrsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
+ int DimIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::dim);
+
+ assert(VAddr0Idx != -1);
+ assert(SrsrcIdx != -1);
+ assert(DimIdx != -1);
+ assert(SrsrcIdx > VAddr0Idx);
+
+ unsigned Dim = Inst.getOperand(DimIdx).getImm();
+ const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfoByEncoding(Dim);
+ bool IsNSA = SrsrcIdx - VAddr0Idx > 1;
+ unsigned VAddrSize =
+ IsNSA ? SrsrcIdx - VAddr0Idx
+ : AMDGPU::getRegOperandSize(getMRI(), Desc, VAddr0Idx) / 4;
+
+ unsigned AddrSize = BaseOpcode->NumExtraArgs +
+ (BaseOpcode->Gradients ? DimInfo->NumGradients : 0) +
+ (BaseOpcode->Coordinates ? DimInfo->NumCoords : 0) +
+ (BaseOpcode->LodOrClampOrMip ? 1 : 0);
+ if (!IsNSA) {
+ if (AddrSize > 8)
+ AddrSize = 16;
+ else if (AddrSize > 4)
+ AddrSize = 8;
+ }
+
+ return VAddrSize == AddrSize;
+}
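For readers unfamiliar with MIMG addressing: the expected VADDR size is the sum of the extra arguments, gradient and coordinate dwords, plus one for an optional LOD/clamp/mip, and without NSA the packed address is rounded up to the next legal register tuple. A sketch of that arithmetic under stated assumptions; the counts passed in below are hypothetical stand-ins for the fields the parser reads from the MIMG opcode and dim tables:

#include <cassert>

// Mirrors the expected-address-size computation in validateMIMGAddrSize.
static unsigned expectedMimgAddrSizeSketch(unsigned NumExtraArgs,
                                           unsigned NumGradients,
                                           unsigned NumCoords,
                                           bool HasLodOrClampOrMip,
                                           bool IsNSA) {
  unsigned AddrSize =
      NumExtraArgs + NumGradients + NumCoords + (HasLodOrClampOrMip ? 1 : 0);
  if (!IsNSA) {
    // Packed addresses must land in a legal VGPR tuple: round up to 8 or 16.
    if (AddrSize > 8)
      AddrSize = 16;
    else if (AddrSize > 4)
      AddrSize = 8;
  }
  return AddrSize;
}

int main() {
  assert(expectedMimgAddrSizeSketch(0, 0, 2, false, false) == 2); // 2 coords
  assert(expectedMimgAddrSizeSketch(0, 0, 3, true, false) == 4);  // 3 coords + LOD
  assert(expectedMimgAddrSizeSketch(0, 4, 2, false, false) == 8); // 6 rounds up to 8
  assert(expectedMimgAddrSizeSketch(0, 4, 2, false, true) == 6);  // NSA keeps the exact count
}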
+
bool AMDGPUAsmParser::validateMIMGAtomicDMask(const MCInst &Inst) {
const unsigned Opc = Inst.getOpcode();
@@ -2461,8 +2968,346 @@ bool AMDGPUAsmParser::validateMIMGD16(const MCInst &Inst) {
return true;
}
+bool AMDGPUAsmParser::validateMIMGDim(const MCInst &Inst) {
+ const unsigned Opc = Inst.getOpcode();
+ const MCInstrDesc &Desc = MII.get(Opc);
+
+ if ((Desc.TSFlags & SIInstrFlags::MIMG) == 0)
+ return true;
+
+ int DimIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::dim);
+ if (DimIdx < 0)
+ return true;
+
+ long Imm = Inst.getOperand(DimIdx).getImm();
+ if (Imm < 0 || Imm >= 8)
+ return false;
+
+ return true;
+}
+
+static bool IsRevOpcode(const unsigned Opcode)
+{
+ switch (Opcode) {
+ case AMDGPU::V_SUBREV_F32_e32:
+ case AMDGPU::V_SUBREV_F32_e64:
+ case AMDGPU::V_SUBREV_F32_e32_gfx10:
+ case AMDGPU::V_SUBREV_F32_e32_gfx6_gfx7:
+ case AMDGPU::V_SUBREV_F32_e32_vi:
+ case AMDGPU::V_SUBREV_F32_e64_gfx10:
+ case AMDGPU::V_SUBREV_F32_e64_gfx6_gfx7:
+ case AMDGPU::V_SUBREV_F32_e64_vi:
+
+ case AMDGPU::V_SUBREV_I32_e32:
+ case AMDGPU::V_SUBREV_I32_e64:
+ case AMDGPU::V_SUBREV_I32_e32_gfx6_gfx7:
+ case AMDGPU::V_SUBREV_I32_e64_gfx6_gfx7:
+
+ case AMDGPU::V_SUBBREV_U32_e32:
+ case AMDGPU::V_SUBBREV_U32_e64:
+ case AMDGPU::V_SUBBREV_U32_e32_gfx6_gfx7:
+ case AMDGPU::V_SUBBREV_U32_e32_vi:
+ case AMDGPU::V_SUBBREV_U32_e64_gfx6_gfx7:
+ case AMDGPU::V_SUBBREV_U32_e64_vi:
+
+ case AMDGPU::V_SUBREV_U32_e32:
+ case AMDGPU::V_SUBREV_U32_e64:
+ case AMDGPU::V_SUBREV_U32_e32_gfx9:
+ case AMDGPU::V_SUBREV_U32_e32_vi:
+ case AMDGPU::V_SUBREV_U32_e64_gfx9:
+ case AMDGPU::V_SUBREV_U32_e64_vi:
+
+ case AMDGPU::V_SUBREV_F16_e32:
+ case AMDGPU::V_SUBREV_F16_e64:
+ case AMDGPU::V_SUBREV_F16_e32_gfx10:
+ case AMDGPU::V_SUBREV_F16_e32_vi:
+ case AMDGPU::V_SUBREV_F16_e64_gfx10:
+ case AMDGPU::V_SUBREV_F16_e64_vi:
+
+ case AMDGPU::V_SUBREV_U16_e32:
+ case AMDGPU::V_SUBREV_U16_e64:
+ case AMDGPU::V_SUBREV_U16_e32_vi:
+ case AMDGPU::V_SUBREV_U16_e64_vi:
+
+ case AMDGPU::V_SUBREV_CO_U32_e32_gfx9:
+ case AMDGPU::V_SUBREV_CO_U32_e64_gfx10:
+ case AMDGPU::V_SUBREV_CO_U32_e64_gfx9:
+
+ case AMDGPU::V_SUBBREV_CO_U32_e32_gfx9:
+ case AMDGPU::V_SUBBREV_CO_U32_e64_gfx9:
+
+ case AMDGPU::V_SUBREV_NC_U32_e32_gfx10:
+ case AMDGPU::V_SUBREV_NC_U32_e64_gfx10:
+
+ case AMDGPU::V_SUBREV_CO_CI_U32_e32_gfx10:
+ case AMDGPU::V_SUBREV_CO_CI_U32_e64_gfx10:
+
+ case AMDGPU::V_LSHRREV_B32_e32:
+ case AMDGPU::V_LSHRREV_B32_e64:
+ case AMDGPU::V_LSHRREV_B32_e32_gfx6_gfx7:
+ case AMDGPU::V_LSHRREV_B32_e64_gfx6_gfx7:
+ case AMDGPU::V_LSHRREV_B32_e32_vi:
+ case AMDGPU::V_LSHRREV_B32_e64_vi:
+ case AMDGPU::V_LSHRREV_B32_e32_gfx10:
+ case AMDGPU::V_LSHRREV_B32_e64_gfx10:
+
+ case AMDGPU::V_ASHRREV_I32_e32:
+ case AMDGPU::V_ASHRREV_I32_e64:
+ case AMDGPU::V_ASHRREV_I32_e32_gfx10:
+ case AMDGPU::V_ASHRREV_I32_e32_gfx6_gfx7:
+ case AMDGPU::V_ASHRREV_I32_e32_vi:
+ case AMDGPU::V_ASHRREV_I32_e64_gfx10:
+ case AMDGPU::V_ASHRREV_I32_e64_gfx6_gfx7:
+ case AMDGPU::V_ASHRREV_I32_e64_vi:
+
+ case AMDGPU::V_LSHLREV_B32_e32:
+ case AMDGPU::V_LSHLREV_B32_e64:
+ case AMDGPU::V_LSHLREV_B32_e32_gfx10:
+ case AMDGPU::V_LSHLREV_B32_e32_gfx6_gfx7:
+ case AMDGPU::V_LSHLREV_B32_e32_vi:
+ case AMDGPU::V_LSHLREV_B32_e64_gfx10:
+ case AMDGPU::V_LSHLREV_B32_e64_gfx6_gfx7:
+ case AMDGPU::V_LSHLREV_B32_e64_vi:
+
+ case AMDGPU::V_LSHLREV_B16_e32:
+ case AMDGPU::V_LSHLREV_B16_e64:
+ case AMDGPU::V_LSHLREV_B16_e32_vi:
+ case AMDGPU::V_LSHLREV_B16_e64_vi:
+ case AMDGPU::V_LSHLREV_B16_gfx10:
+
+ case AMDGPU::V_LSHRREV_B16_e32:
+ case AMDGPU::V_LSHRREV_B16_e64:
+ case AMDGPU::V_LSHRREV_B16_e32_vi:
+ case AMDGPU::V_LSHRREV_B16_e64_vi:
+ case AMDGPU::V_LSHRREV_B16_gfx10:
+
+ case AMDGPU::V_ASHRREV_I16_e32:
+ case AMDGPU::V_ASHRREV_I16_e64:
+ case AMDGPU::V_ASHRREV_I16_e32_vi:
+ case AMDGPU::V_ASHRREV_I16_e64_vi:
+ case AMDGPU::V_ASHRREV_I16_gfx10:
+
+ case AMDGPU::V_LSHLREV_B64:
+ case AMDGPU::V_LSHLREV_B64_gfx10:
+ case AMDGPU::V_LSHLREV_B64_vi:
+
+ case AMDGPU::V_LSHRREV_B64:
+ case AMDGPU::V_LSHRREV_B64_gfx10:
+ case AMDGPU::V_LSHRREV_B64_vi:
+
+ case AMDGPU::V_ASHRREV_I64:
+ case AMDGPU::V_ASHRREV_I64_gfx10:
+ case AMDGPU::V_ASHRREV_I64_vi:
+
+ case AMDGPU::V_PK_LSHLREV_B16:
+ case AMDGPU::V_PK_LSHLREV_B16_gfx10:
+ case AMDGPU::V_PK_LSHLREV_B16_vi:
+
+ case AMDGPU::V_PK_LSHRREV_B16:
+ case AMDGPU::V_PK_LSHRREV_B16_gfx10:
+ case AMDGPU::V_PK_LSHRREV_B16_vi:
+ case AMDGPU::V_PK_ASHRREV_I16:
+ case AMDGPU::V_PK_ASHRREV_I16_gfx10:
+ case AMDGPU::V_PK_ASHRREV_I16_vi:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool AMDGPUAsmParser::validateLdsDirect(const MCInst &Inst) {
+
+ using namespace SIInstrFlags;
+ const unsigned Opcode = Inst.getOpcode();
+ const MCInstrDesc &Desc = MII.get(Opcode);
+
+ // The lds_direct register is defined so that it can be used
+ // with 9-bit source operands only. Ignore encodings that do not accept them.
+ if ((Desc.TSFlags & (VOP1 | VOP2 | VOP3 | VOPC | VOP3P | SIInstrFlags::SDWA)) == 0)
+ return true;
+
+ const int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
+ const int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
+ const int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
+
+ const int SrcIndices[] = { Src1Idx, Src2Idx };
+
+ // lds_direct cannot be specified as either src1 or src2.
+ for (int SrcIdx : SrcIndices) {
+ if (SrcIdx == -1) break;
+ const MCOperand &Src = Inst.getOperand(SrcIdx);
+ if (Src.isReg() && Src.getReg() == LDS_DIRECT) {
+ return false;
+ }
+ }
+
+ if (Src0Idx == -1)
+ return true;
+
+ const MCOperand &Src = Inst.getOperand(Src0Idx);
+ if (!Src.isReg() || Src.getReg() != LDS_DIRECT)
+ return true;
+
+ // lds_direct is specified as src0. Check additional limitations.
+ return (Desc.TSFlags & SIInstrFlags::SDWA) == 0 && !IsRevOpcode(Opcode);
+}
+
+SMLoc AMDGPUAsmParser::getFlatOffsetLoc(const OperandVector &Operands) const {
+ for (unsigned i = 1, e = Operands.size(); i != e; ++i) {
+ AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]);
+ if (Op.isFlatOffset())
+ return Op.getStartLoc();
+ }
+ return getLoc();
+}
+
+bool AMDGPUAsmParser::validateFlatOffset(const MCInst &Inst,
+ const OperandVector &Operands) {
+ uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags;
+ if ((TSFlags & SIInstrFlags::FLAT) == 0)
+ return true;
+
+ auto Opcode = Inst.getOpcode();
+ auto OpNum = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::offset);
+ assert(OpNum != -1);
+
+ const auto &Op = Inst.getOperand(OpNum);
+ if (!hasFlatOffsets() && Op.getImm() != 0) {
+ Error(getFlatOffsetLoc(Operands),
+ "flat offset modifier is not supported on this GPU");
+ return false;
+ }
+
+ // Address offset is 12-bit signed for GFX10, 13-bit for GFX9.
+ // For FLAT segment the offset must be positive;
+ // MSB is ignored and forced to zero.
+ unsigned OffsetSize = isGFX9() ? 13 : 12;
+ if (TSFlags & SIInstrFlags::IsNonFlatSeg) {
+ if (!isIntN(OffsetSize, Op.getImm())) {
+ Error(getFlatOffsetLoc(Operands),
+ isGFX9() ? "expected a 13-bit signed offset" :
+ "expected a 12-bit signed offset");
+ return false;
+ }
+ } else {
+ if (!isUIntN(OffsetSize - 1, Op.getImm())) {
+ Error(getFlatOffsetLoc(Operands),
+ isGFX9() ? "expected a 12-bit unsigned offset" :
+ "expected an 11-bit unsigned offset");
+ return false;
+ }
+ }
+
+ return true;
+}
+
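+// SOP2/SOPC instructions can encode at most one unique 32-bit literal
+// constant; the same literal may be referenced by both source operands.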
+bool AMDGPUAsmParser::validateSOPLiteral(const MCInst &Inst) const {
+ unsigned Opcode = Inst.getOpcode();
+ const MCInstrDesc &Desc = MII.get(Opcode);
+ if (!(Desc.TSFlags & (SIInstrFlags::SOP2 | SIInstrFlags::SOPC)))
+ return true;
+
+ const int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
+ const int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
+
+ const int OpIndices[] = { Src0Idx, Src1Idx };
+
+ unsigned NumLiterals = 0;
+ uint32_t LiteralValue;
+
+ for (int OpIdx : OpIndices) {
+ if (OpIdx == -1) break;
+
+ const MCOperand &MO = Inst.getOperand(OpIdx);
+ if (MO.isImm() &&
+        // Exclude special imm operands (like those used by s_set_gpr_idx_on)
+ AMDGPU::isSISrcOperand(Desc, OpIdx) &&
+ !isInlineConstant(Inst, OpIdx)) {
+ uint32_t Value = static_cast<uint32_t>(MO.getImm());
+ if (NumLiterals == 0 || LiteralValue != Value) {
+ LiteralValue = Value;
+ ++NumLiterals;
+ }
+ }
+ }
+
+ return NumLiterals <= 1;
+}
+
+bool AMDGPUAsmParser::validateOpSel(const MCInst &Inst) {
+ const unsigned Opc = Inst.getOpcode();
+ if (Opc == AMDGPU::V_PERMLANE16_B32_gfx10 ||
+ Opc == AMDGPU::V_PERMLANEX16_B32_gfx10) {
+ int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel);
+ unsigned OpSel = Inst.getOperand(OpSelIdx).getImm();
+
+ if (OpSel & ~3)
+ return false;
+ }
+ return true;
+}
+
+// Check if VCC register matches wavefront size
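+// (in wave32 mode the condition code occupies a single SGPR, so vcc_lo is
+// expected; in wave64 mode the full vcc register pair is expected).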
+bool AMDGPUAsmParser::validateVccOperand(unsigned Reg) const {
+ auto FB = getFeatureBits();
+ return (FB[AMDGPU::FeatureWavefrontSize64] && Reg == AMDGPU::VCC) ||
+ (FB[AMDGPU::FeatureWavefrontSize32] && Reg == AMDGPU::VCC_LO);
+}
+
+// VOP3 literal is only allowed in GFX10+ and only one can be used
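+// (see FeatureVOP3Literal); on earlier targets VOP3 sources must be inline
+// constants or registers.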
+bool AMDGPUAsmParser::validateVOP3Literal(const MCInst &Inst) const {
+ unsigned Opcode = Inst.getOpcode();
+ const MCInstrDesc &Desc = MII.get(Opcode);
+ if (!(Desc.TSFlags & (SIInstrFlags::VOP3 | SIInstrFlags::VOP3P)))
+ return true;
+
+ const int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
+ const int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
+ const int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
+
+ const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx };
+
+ unsigned NumLiterals = 0;
+ uint32_t LiteralValue;
+
+ for (int OpIdx : OpIndices) {
+ if (OpIdx == -1) break;
+
+ const MCOperand &MO = Inst.getOperand(OpIdx);
+ if (!MO.isImm() || !AMDGPU::isSISrcOperand(Desc, OpIdx))
+ continue;
+
+ if (!isInlineConstant(Inst, OpIdx)) {
+ uint32_t Value = static_cast<uint32_t>(MO.getImm());
+ if (NumLiterals == 0 || LiteralValue != Value) {
+ LiteralValue = Value;
+ ++NumLiterals;
+ }
+ }
+ }
+
+ return !NumLiterals ||
+ (NumLiterals == 1 && getFeatureBits()[AMDGPU::FeatureVOP3Literal]);
+}
+
bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
- const SMLoc &IDLoc) {
+ const SMLoc &IDLoc,
+ const OperandVector &Operands) {
+ if (!validateLdsDirect(Inst)) {
+ Error(IDLoc,
+ "invalid use of lds_direct");
+ return false;
+ }
+ if (!validateSOPLiteral(Inst)) {
+ Error(IDLoc,
+ "only one literal operand is allowed");
+ return false;
+ }
+ if (!validateVOP3Literal(Inst)) {
+ Error(IDLoc,
+ "invalid literal operand");
+ return false;
+ }
if (!validateConstantBusLimitations(Inst)) {
Error(IDLoc,
"invalid operand (violates constant bus restrictions)");
@@ -2478,17 +3323,31 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
"integer clamping is not supported on this GPU");
return false;
}
+ if (!validateOpSel(Inst)) {
+ Error(IDLoc,
+ "invalid op_sel operand");
+ return false;
+ }
// For MUBUF/MTBUF d16 is a part of opcode, so there is nothing to validate.
if (!validateMIMGD16(Inst)) {
Error(IDLoc,
"d16 modifier is not supported on this GPU");
return false;
}
+ if (!validateMIMGDim(Inst)) {
+ Error(IDLoc, "dim modifier is required on this GPU");
+ return false;
+ }
if (!validateMIMGDataSize(Inst)) {
Error(IDLoc,
"image data size does not match dmask and tfe");
return false;
}
+ if (!validateMIMGAddrSize(Inst)) {
+ Error(IDLoc,
+ "image address size does not match dim and a16");
+ return false;
+ }
if (!validateMIMGAtomicDMask(Inst)) {
Error(IDLoc,
"invalid atomic image dmask");
@@ -2499,11 +3358,15 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
"invalid image_gather dmask: only one bit must be set");
return false;
}
+ if (!validateFlatOffset(Inst, Operands)) {
+ return false;
+ }
return true;
}
-static std::string AMDGPUMnemonicSpellCheck(StringRef S, uint64_t FBS,
+static std::string AMDGPUMnemonicSpellCheck(StringRef S,
+ const FeatureBitset &FBS,
unsigned VariantID = 0);
bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
@@ -2538,7 +3401,7 @@ bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
switch (Result) {
default: break;
case Match_Success:
- if (!validateInstruction(Inst, IDLoc)) {
+ if (!validateInstruction(Inst, IDLoc, Operands)) {
return true;
}
Inst.setLoc(IDLoc);
@@ -2549,7 +3412,7 @@ bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
return Error(IDLoc, "instruction not supported on this GPU");
case Match_MnemonicFail: {
- uint64_t FBS = ComputeAvailableFeatures(getSTI().getFeatureBits());
+ FeatureBitset FBS = ComputeAvailableFeatures(getSTI().getFeatureBits());
std::string Suggestion = AMDGPUMnemonicSpellCheck(
((AMDGPUOperand &)*Operands[0]).getToken(), FBS);
return Error(IDLoc, "invalid instruction" + Suggestion,
@@ -2632,32 +3495,39 @@ bool AMDGPUAsmParser::OutOfRangeError(SMRange Range) {
bool AMDGPUAsmParser::calculateGPRBlocks(
const FeatureBitset &Features, bool VCCUsed, bool FlatScrUsed,
- bool XNACKUsed, unsigned NextFreeVGPR, SMRange VGPRRange,
- unsigned NextFreeSGPR, SMRange SGPRRange, unsigned &VGPRBlocks,
- unsigned &SGPRBlocks) {
+ bool XNACKUsed, Optional<bool> EnableWavefrontSize32, unsigned NextFreeVGPR,
+ SMRange VGPRRange, unsigned NextFreeSGPR, SMRange SGPRRange,
+ unsigned &VGPRBlocks, unsigned &SGPRBlocks) {
// TODO(scott.linder): These calculations are duplicated from
// AMDGPUAsmPrinter::getSIProgramInfo and could be unified.
IsaVersion Version = getIsaVersion(getSTI().getCPU());
unsigned NumVGPRs = NextFreeVGPR;
unsigned NumSGPRs = NextFreeSGPR;
- unsigned MaxAddressableNumSGPRs = IsaInfo::getAddressableNumSGPRs(&getSTI());
- if (Version.Major >= 8 && !Features.test(FeatureSGPRInitBug) &&
- NumSGPRs > MaxAddressableNumSGPRs)
- return OutOfRangeError(SGPRRange);
+ if (Version.Major >= 10)
+ NumSGPRs = 0;
+ else {
+ unsigned MaxAddressableNumSGPRs =
+ IsaInfo::getAddressableNumSGPRs(&getSTI());
- NumSGPRs +=
- IsaInfo::getNumExtraSGPRs(&getSTI(), VCCUsed, FlatScrUsed, XNACKUsed);
+ if (Version.Major >= 8 && !Features.test(FeatureSGPRInitBug) &&
+ NumSGPRs > MaxAddressableNumSGPRs)
+ return OutOfRangeError(SGPRRange);
- if ((Version.Major <= 7 || Features.test(FeatureSGPRInitBug)) &&
- NumSGPRs > MaxAddressableNumSGPRs)
- return OutOfRangeError(SGPRRange);
+ NumSGPRs +=
+ IsaInfo::getNumExtraSGPRs(&getSTI(), VCCUsed, FlatScrUsed, XNACKUsed);
- if (Features.test(FeatureSGPRInitBug))
- NumSGPRs = IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
+ if ((Version.Major <= 7 || Features.test(FeatureSGPRInitBug)) &&
+ NumSGPRs > MaxAddressableNumSGPRs)
+ return OutOfRangeError(SGPRRange);
- VGPRBlocks = IsaInfo::getNumVGPRBlocks(&getSTI(), NumVGPRs);
+ if (Features.test(FeatureSGPRInitBug))
+ NumSGPRs = IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
+ }
+
+ VGPRBlocks =
+ IsaInfo::getNumVGPRBlocks(&getSTI(), NumVGPRs, EnableWavefrontSize32);
SGPRBlocks = IsaInfo::getNumSGPRBlocks(&getSTI(), NumSGPRs);
return false;
@@ -2674,7 +3544,7 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
if (getParser().parseIdentifier(KernelName))
return true;
- kernel_descriptor_t KD = getDefaultAmdhsaKernelDescriptor();
+ kernel_descriptor_t KD = getDefaultAmdhsaKernelDescriptor(&getSTI());
StringSet<> Seen;
@@ -2688,6 +3558,7 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
bool ReserveVCC = true;
bool ReserveFlatScr = true;
bool ReserveXNACK = hasXNACK();
+ Optional<bool> EnableWavefrontSize32;
while (true) {
while (getLexer().is(AsmToken::EndOfStatement))
@@ -2736,37 +3607,45 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
PARSE_BITS_ENTRY(KD.kernel_code_properties,
KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER,
Val, ValRange);
- UserSGPRCount++;
+ UserSGPRCount += 4;
} else if (ID == ".amdhsa_user_sgpr_dispatch_ptr") {
PARSE_BITS_ENTRY(KD.kernel_code_properties,
KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR, Val,
ValRange);
- UserSGPRCount++;
+ UserSGPRCount += 2;
} else if (ID == ".amdhsa_user_sgpr_queue_ptr") {
PARSE_BITS_ENTRY(KD.kernel_code_properties,
KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR, Val,
ValRange);
- UserSGPRCount++;
+ UserSGPRCount += 2;
} else if (ID == ".amdhsa_user_sgpr_kernarg_segment_ptr") {
PARSE_BITS_ENTRY(KD.kernel_code_properties,
KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR,
Val, ValRange);
- UserSGPRCount++;
+ UserSGPRCount += 2;
} else if (ID == ".amdhsa_user_sgpr_dispatch_id") {
PARSE_BITS_ENTRY(KD.kernel_code_properties,
KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID, Val,
ValRange);
- UserSGPRCount++;
+ UserSGPRCount += 2;
} else if (ID == ".amdhsa_user_sgpr_flat_scratch_init") {
PARSE_BITS_ENTRY(KD.kernel_code_properties,
KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT, Val,
ValRange);
- UserSGPRCount++;
+ UserSGPRCount += 2;
} else if (ID == ".amdhsa_user_sgpr_private_segment_size") {
PARSE_BITS_ENTRY(KD.kernel_code_properties,
KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE,
Val, ValRange);
- UserSGPRCount++;
+ UserSGPRCount += 1;
+ } else if (ID == ".amdhsa_wavefront_size32") {
+ if (IVersion.Major < 10)
+ return getParser().Error(IDRange.Start, "directive requires gfx10+",
+ IDRange);
+ EnableWavefrontSize32 = Val;
+ PARSE_BITS_ENTRY(KD.kernel_code_properties,
+ KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32,
+ Val, ValRange);
} else if (ID == ".amdhsa_system_sgpr_private_segment_wavefront_offset") {
PARSE_BITS_ENTRY(
KD.compute_pgm_rsrc2,
@@ -2841,6 +3720,24 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
IDRange);
PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_FP16_OVFL, Val,
ValRange);
+ } else if (ID == ".amdhsa_workgroup_processor_mode") {
+ if (IVersion.Major < 10)
+ return getParser().Error(IDRange.Start, "directive requires gfx10+",
+ IDRange);
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_WGP_MODE, Val,
+ ValRange);
+ } else if (ID == ".amdhsa_memory_ordered") {
+ if (IVersion.Major < 10)
+ return getParser().Error(IDRange.Start, "directive requires gfx10+",
+ IDRange);
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_MEM_ORDERED, Val,
+ ValRange);
+ } else if (ID == ".amdhsa_forward_progress") {
+ if (IVersion.Major < 10)
+ return getParser().Error(IDRange.Start, "directive requires gfx10+",
+ IDRange);
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_FWD_PROGRESS, Val,
+ ValRange);
} else if (ID == ".amdhsa_exception_fp_ieee_invalid_op") {
PARSE_BITS_ENTRY(
KD.compute_pgm_rsrc2,
@@ -2888,8 +3785,9 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
unsigned VGPRBlocks;
unsigned SGPRBlocks;
if (calculateGPRBlocks(getFeatureBits(), ReserveVCC, ReserveFlatScr,
- ReserveXNACK, NextFreeVGPR, VGPRRange, NextFreeSGPR,
- SGPRRange, VGPRBlocks, SGPRBlocks))
+ ReserveXNACK, EnableWavefrontSize32, NextFreeVGPR,
+ VGPRRange, NextFreeSGPR, SGPRRange, VGPRBlocks,
+ SGPRBlocks))
return true;
if (!isUInt<COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT_WIDTH>(
@@ -2994,6 +3892,46 @@ bool AMDGPUAsmParser::ParseAMDKernelCodeTValue(StringRef ID,
return TokError(Err.str());
}
Lex();
+
+ if (ID == "enable_wavefront_size32") {
+ if (Header.code_properties & AMD_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32) {
+ if (!isGFX10())
+ return TokError("enable_wavefront_size32=1 is only allowed on GFX10+");
+ if (!getFeatureBits()[AMDGPU::FeatureWavefrontSize32])
+ return TokError("enable_wavefront_size32=1 requires +WavefrontSize32");
+ } else {
+ if (!getFeatureBits()[AMDGPU::FeatureWavefrontSize64])
+ return TokError("enable_wavefront_size32=0 requires +WavefrontSize64");
+ }
+ }
+
+ if (ID == "wavefront_size") {
+ if (Header.wavefront_size == 5) {
+ if (!isGFX10())
+ return TokError("wavefront_size=5 is only allowed on GFX10+");
+ if (!getFeatureBits()[AMDGPU::FeatureWavefrontSize32])
+ return TokError("wavefront_size=5 requires +WavefrontSize32");
+ } else if (Header.wavefront_size == 6) {
+ if (!getFeatureBits()[AMDGPU::FeatureWavefrontSize64])
+ return TokError("wavefront_size=6 requires +WavefrontSize64");
+ }
+ }
+
+ if (ID == "enable_wgp_mode") {
+ if (G_00B848_WGP_MODE(Header.compute_pgm_resource_registers) && !isGFX10())
+ return TokError("enable_wgp_mode=1 is only allowed on GFX10+");
+ }
+
+ if (ID == "enable_mem_ordered") {
+ if (G_00B848_MEM_ORDERED(Header.compute_pgm_resource_registers) && !isGFX10())
+ return TokError("enable_mem_ordered=1 is only allowed on GFX10+");
+ }
+
+ if (ID == "enable_fwd_progress") {
+ if (G_00B848_FWD_PROGRESS(Header.compute_pgm_resource_registers) && !isGFX10())
+ return TokError("enable_fwd_progress=1 is only allowed on GFX10+");
+ }
+
return false;
}
@@ -3081,14 +4019,35 @@ bool AMDGPUAsmParser::ParseDirectiveHSAMetadata() {
}
std::string HSAMetadataString;
- raw_string_ostream YamlStream(HSAMetadataString);
+ if (ParseToEndDirective(AssemblerDirectiveBegin, AssemblerDirectiveEnd,
+ HSAMetadataString))
+ return true;
+
+ if (IsaInfo::hasCodeObjectV3(&getSTI())) {
+ if (!getTargetStreamer().EmitHSAMetadataV3(HSAMetadataString))
+ return Error(getParser().getTok().getLoc(), "invalid HSA metadata");
+ } else {
+ if (!getTargetStreamer().EmitHSAMetadataV2(HSAMetadataString))
+ return Error(getParser().getTok().getLoc(), "invalid HSA metadata");
+ }
+
+ return false;
+}
+
+/// Common code to parse out a block of text (typically YAML) between start and
+/// end directives.
+bool AMDGPUAsmParser::ParseToEndDirective(const char *AssemblerDirectiveBegin,
+ const char *AssemblerDirectiveEnd,
+ std::string &CollectString) {
+
+ raw_string_ostream CollectStream(CollectString);
getLexer().setSkipSpace(false);
bool FoundEnd = false;
while (!getLexer().is(AsmToken::Eof)) {
while (getLexer().is(AsmToken::Space)) {
- YamlStream << getLexer().getTok().getString();
+ CollectStream << getLexer().getTok().getString();
Lex();
}
@@ -3101,8 +4060,8 @@ bool AMDGPUAsmParser::ParseDirectiveHSAMetadata() {
}
}
- YamlStream << Parser.parseStringToEndOfStatement()
- << getContext().getAsmInfo()->getSeparatorString();
+ CollectStream << Parser.parseStringToEndOfStatement()
+ << getContext().getAsmInfo()->getSeparatorString();
Parser.eatToEndOfStatement();
}
@@ -3111,22 +4070,27 @@ bool AMDGPUAsmParser::ParseDirectiveHSAMetadata() {
if (getLexer().is(AsmToken::Eof) && !FoundEnd) {
return TokError(Twine("expected directive ") +
- Twine(HSAMD::AssemblerDirectiveEnd) + Twine(" not found"));
+ Twine(AssemblerDirectiveEnd) + Twine(" not found"));
}
- YamlStream.flush();
+ CollectStream.flush();
+ return false;
+}
- if (IsaInfo::hasCodeObjectV3(&getSTI())) {
- if (!getTargetStreamer().EmitHSAMetadataV3(HSAMetadataString))
- return Error(getParser().getTok().getLoc(), "invalid HSA metadata");
- } else {
- if (!getTargetStreamer().EmitHSAMetadataV2(HSAMetadataString))
- return Error(getParser().getTok().getLoc(), "invalid HSA metadata");
- }
+/// Parse the assembler directive for new MsgPack-format PAL metadata.
+bool AMDGPUAsmParser::ParseDirectivePALMetadataBegin() {
+ std::string String;
+ if (ParseToEndDirective(AMDGPU::PALMD::AssemblerDirectiveBegin,
+ AMDGPU::PALMD::AssemblerDirectiveEnd, String))
+ return true;
+ auto PALMetadata = getTargetStreamer().getPALMetadata();
+ if (!PALMetadata->setFromString(String))
+ return Error(getParser().getTok().getLoc(), "invalid PAL metadata");
return false;
}
+/// Parse the assembler directive for old linear-format PAL metadata.
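+/// The body is a flat, comma-separated list of register/value pairs.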
bool AMDGPUAsmParser::ParseDirectivePALMetadata() {
if (getSTI().getTargetTriple().getOS() != Triple::AMDPAL) {
return Error(getParser().getTok().getLoc(),
@@ -3134,19 +4098,82 @@ bool AMDGPUAsmParser::ParseDirectivePALMetadata() {
"not available on non-amdpal OSes")).str());
}
- PALMD::Metadata PALMetadata;
+ auto PALMetadata = getTargetStreamer().getPALMetadata();
+ PALMetadata->setLegacy();
for (;;) {
- uint32_t Value;
+ uint32_t Key, Value;
+ if (ParseAsAbsoluteExpression(Key)) {
+ return TokError(Twine("invalid value in ") +
+ Twine(PALMD::AssemblerDirective));
+ }
+ if (getLexer().isNot(AsmToken::Comma)) {
+ return TokError(Twine("expected an even number of values in ") +
+ Twine(PALMD::AssemblerDirective));
+ }
+ Lex();
if (ParseAsAbsoluteExpression(Value)) {
return TokError(Twine("invalid value in ") +
Twine(PALMD::AssemblerDirective));
}
- PALMetadata.push_back(Value);
+ PALMetadata->setRegister(Key, Value);
if (getLexer().isNot(AsmToken::Comma))
break;
Lex();
}
- getTargetStreamer().EmitPALMetadata(PALMetadata);
+ return false;
+}
+
+/// ParseDirectiveAMDGPULDS
+/// ::= .amdgpu_lds identifier ',' size_expression [',' align_expression]
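+///
+/// For example, `.amdgpu_lds my_lds_sym, 64, 16` (symbol name chosen here
+/// only for illustration) reserves 64 bytes of LDS aligned to 16 bytes.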
+bool AMDGPUAsmParser::ParseDirectiveAMDGPULDS() {
+ if (getParser().checkForValidSection())
+ return true;
+
+ StringRef Name;
+ SMLoc NameLoc = getLexer().getLoc();
+ if (getParser().parseIdentifier(Name))
+ return TokError("expected identifier in directive");
+
+ MCSymbol *Symbol = getContext().getOrCreateSymbol(Name);
+ if (parseToken(AsmToken::Comma, "expected ','"))
+ return true;
+
+ unsigned LocalMemorySize = AMDGPU::IsaInfo::getLocalMemorySize(&getSTI());
+
+ int64_t Size;
+ SMLoc SizeLoc = getLexer().getLoc();
+ if (getParser().parseAbsoluteExpression(Size))
+ return true;
+ if (Size < 0)
+ return Error(SizeLoc, "size must be non-negative");
+ if (Size > LocalMemorySize)
+ return Error(SizeLoc, "size is too large");
+
+ int64_t Align = 4;
+ if (getLexer().is(AsmToken::Comma)) {
+ Lex();
+ SMLoc AlignLoc = getLexer().getLoc();
+ if (getParser().parseAbsoluteExpression(Align))
+ return true;
+ if (Align < 0 || !isPowerOf2_64(Align))
+ return Error(AlignLoc, "alignment must be a power of two");
+
+ // Alignment larger than the size of LDS is possible in theory, as long
+  // as the linker manages to place the symbol at address 0, but we do want
+ // to make sure the alignment fits nicely into a 32-bit integer.
+ if (Align >= 1u << 31)
+ return Error(AlignLoc, "alignment is too large");
+ }
+
+ if (parseToken(AsmToken::EndOfStatement,
+ "unexpected token in '.amdgpu_lds' directive"))
+ return true;
+
+ Symbol->redefineIfPossible();
+ if (!Symbol->isUndefined())
+ return Error(NameLoc, "invalid symbol redefinition");
+
+ getTargetStreamer().emitAMDGPULDS(Symbol, Size, Align);
return false;
}
@@ -3183,6 +4210,12 @@ bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) {
return ParseDirectiveHSAMetadata();
}
+ if (IDVal == ".amdgpu_lds")
+ return ParseDirectiveAMDGPULDS();
+
+ if (IDVal == PALMD::AssemblerDirectiveBegin)
+ return ParseDirectivePALMetadataBegin();
+
if (IDVal == PALMD::AssemblerDirective)
return ParseDirectivePALMetadata();
@@ -3195,21 +4228,36 @@ bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI,
for (MCRegAliasIterator R(AMDGPU::TTMP12_TTMP13_TTMP14_TTMP15, &MRI, true);
R.isValid(); ++R) {
if (*R == RegNo)
- return isGFX9();
+ return isGFX9() || isGFX10();
+ }
+
+ // GFX10 has 2 more SGPRs 104 and 105.
+ for (MCRegAliasIterator R(AMDGPU::SGPR104_SGPR105, &MRI, true);
+ R.isValid(); ++R) {
+ if (*R == RegNo)
+ return hasSGPR104_SGPR105();
}
switch (RegNo) {
+ case AMDGPU::SRC_SHARED_BASE:
+ case AMDGPU::SRC_SHARED_LIMIT:
+ case AMDGPU::SRC_PRIVATE_BASE:
+ case AMDGPU::SRC_PRIVATE_LIMIT:
+ case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
+ return !isCI() && !isSI() && !isVI();
case AMDGPU::TBA:
case AMDGPU::TBA_LO:
case AMDGPU::TBA_HI:
case AMDGPU::TMA:
case AMDGPU::TMA_LO:
case AMDGPU::TMA_HI:
- return !isGFX9();
+ return !isGFX9() && !isGFX10();
case AMDGPU::XNACK_MASK:
case AMDGPU::XNACK_MASK_LO:
case AMDGPU::XNACK_MASK_HI:
- return !isCI() && !isSI() && hasXNACK();
+ return !isCI() && !isSI() && !isGFX10() && hasXNACK();
+ case AMDGPU::SGPR_NULL:
+ return isGFX10();
default:
break;
}
@@ -3217,8 +4265,10 @@ bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI,
if (isCI())
return true;
- if (isSI()) {
- // No flat_scr
+ if (isSI() || isGFX10()) {
+ // No flat_scr on SI.
+ // On GFX10 flat scratch is not a valid register operand and can only be
+ // accessed with s_setreg/s_getreg.
switch (RegNo) {
case AMDGPU::FLAT_SCR:
case AMDGPU::FLAT_SCR_LO:
@@ -3234,14 +4284,15 @@ bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI,
for (MCRegAliasIterator R(AMDGPU::SGPR102_SGPR103, &MRI, true);
R.isValid(); ++R) {
if (*R == RegNo)
- return false;
+ return hasSGPR102_SGPR103();
}
return true;
}
OperandMatchResultTy
-AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
+AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic,
+ OperandMode Mode) {
// Try to parse with a custom parser
OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic);
@@ -3255,28 +4306,36 @@ AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
getLexer().is(AsmToken::EndOfStatement))
return ResTy;
- ResTy = parseRegOrImm(Operands);
+ if (Mode == OperandMode_NSA && getLexer().is(AsmToken::LBrac)) {
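+    // GFX10 MIMG instructions may use the NSA (non-sequential address)
+    // encoding, in which the address VGPRs are written as a bracketed
+    // list, e.g. [v4, v7, v1].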
+ unsigned Prefix = Operands.size();
+ SMLoc LBraceLoc = getTok().getLoc();
+ Parser.Lex(); // eat the '['
- if (ResTy == MatchOperand_Success)
- return ResTy;
+ for (;;) {
+ ResTy = parseReg(Operands);
+ if (ResTy != MatchOperand_Success)
+ return ResTy;
- const auto &Tok = Parser.getTok();
- SMLoc S = Tok.getLoc();
+ if (getLexer().is(AsmToken::RBrac))
+ break;
- const MCExpr *Expr = nullptr;
- if (!Parser.parseExpression(Expr)) {
- Operands.push_back(AMDGPUOperand::CreateExpr(this, Expr, S));
- return MatchOperand_Success;
- }
+ if (getLexer().isNot(AsmToken::Comma))
+ return MatchOperand_ParseFail;
+ Parser.Lex();
+ }
- // Possibly this is an instruction flag like 'gds'.
- if (Tok.getKind() == AsmToken::Identifier) {
- Operands.push_back(AMDGPUOperand::CreateToken(this, Tok.getString(), S));
- Parser.Lex();
+ if (Operands.size() - Prefix > 1) {
+ Operands.insert(Operands.begin() + Prefix,
+ AMDGPUOperand::CreateToken(this, "[", LBraceLoc));
+ Operands.push_back(AMDGPUOperand::CreateToken(this, "]",
+ getTok().getLoc()));
+ }
+
+ Parser.Lex(); // eat the ']'
return MatchOperand_Success;
}
- return MatchOperand_NoMatch;
+ return parseRegOrImm(Operands);
}
StringRef AMDGPUAsmParser::parseMnemonicSuffix(StringRef Name) {
@@ -3308,8 +4367,13 @@ bool AMDGPUAsmParser::ParseInstruction(ParseInstructionInfo &Info,
Name = parseMnemonicSuffix(Name);
Operands.push_back(AMDGPUOperand::CreateToken(this, Name, NameLoc));
+ bool IsMIMG = Name.startswith("image_");
+
while (!getLexer().is(AsmToken::EndOfStatement)) {
- OperandMatchResultTy Res = parseOperand(Operands, Name);
+ OperandMode Mode = OperandMode_Default;
+ if (IsMIMG && isGFX10() && Operands.size() == 2)
+ Mode = OperandMode_NSA;
+ OperandMatchResultTy Res = parseOperand(Operands, Name, Mode);
// Eat the comma or space if there is one.
if (getLexer().is(AsmToken::Comma))
@@ -3318,12 +4382,14 @@ bool AMDGPUAsmParser::ParseInstruction(ParseInstructionInfo &Info,
switch (Res) {
case MatchOperand_Success: break;
case MatchOperand_ParseFail:
+ // FIXME: use real operand location rather than the current location.
Error(getLexer().getLoc(), "failed parsing operand.");
while (!getLexer().is(AsmToken::EndOfStatement)) {
Parser.Lex();
}
return true;
case MatchOperand_NoMatch:
+ // FIXME: use real operand location rather than the current location.
Error(getLexer().getLoc(), "not a valid operand.");
while (!getLexer().is(AsmToken::EndOfStatement)) {
Parser.Lex();
@@ -3340,46 +4406,19 @@ bool AMDGPUAsmParser::ParseInstruction(ParseInstructionInfo &Info,
//===----------------------------------------------------------------------===//
OperandMatchResultTy
-AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, int64_t &Int) {
- switch(getLexer().getKind()) {
- default: return MatchOperand_NoMatch;
- case AsmToken::Identifier: {
- StringRef Name = Parser.getTok().getString();
- if (!Name.equals(Prefix)) {
- return MatchOperand_NoMatch;
- }
-
- Parser.Lex();
- if (getLexer().isNot(AsmToken::Colon))
- return MatchOperand_ParseFail;
-
- Parser.Lex();
-
- bool IsMinus = false;
- if (getLexer().getKind() == AsmToken::Minus) {
- Parser.Lex();
- IsMinus = true;
- }
+AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, int64_t &IntVal) {
- if (getLexer().isNot(AsmToken::Integer))
- return MatchOperand_ParseFail;
-
- if (getParser().parseAbsoluteExpression(Int))
- return MatchOperand_ParseFail;
+ if (!trySkipId(Prefix, AsmToken::Colon))
+ return MatchOperand_NoMatch;
- if (IsMinus)
- Int = -Int;
- break;
- }
- }
- return MatchOperand_Success;
+ return parseExpr(IntVal) ? MatchOperand_Success : MatchOperand_ParseFail;
}
OperandMatchResultTy
AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, OperandVector &Operands,
AMDGPUOperand::ImmTy ImmTy,
bool (*ConvertResult)(int64_t&)) {
- SMLoc S = Parser.getTok().getLoc();
+ SMLoc S = getLoc();
int64_t Value = 0;
OperandMatchResultTy Res = parseIntWithPrefix(Prefix, Value);
@@ -3387,59 +4426,55 @@ AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, OperandVector &Operands,
return Res;
if (ConvertResult && !ConvertResult(Value)) {
- return MatchOperand_ParseFail;
+ Error(S, "invalid " + StringRef(Prefix) + " value.");
}
Operands.push_back(AMDGPUOperand::CreateImm(this, Value, S, ImmTy));
return MatchOperand_Success;
}
-OperandMatchResultTy AMDGPUAsmParser::parseOperandArrayWithPrefix(
- const char *Prefix,
- OperandVector &Operands,
- AMDGPUOperand::ImmTy ImmTy,
- bool (*ConvertResult)(int64_t&)) {
- StringRef Name = Parser.getTok().getString();
- if (!Name.equals(Prefix))
+OperandMatchResultTy
+AMDGPUAsmParser::parseOperandArrayWithPrefix(const char *Prefix,
+ OperandVector &Operands,
+ AMDGPUOperand::ImmTy ImmTy,
+ bool (*ConvertResult)(int64_t&)) {
+ SMLoc S = getLoc();
+ if (!trySkipId(Prefix, AsmToken::Colon))
return MatchOperand_NoMatch;
- Parser.Lex();
- if (getLexer().isNot(AsmToken::Colon))
- return MatchOperand_ParseFail;
-
- Parser.Lex();
- if (getLexer().isNot(AsmToken::LBrac))
+ if (!skipToken(AsmToken::LBrac, "expected a left square bracket"))
return MatchOperand_ParseFail;
- Parser.Lex();
unsigned Val = 0;
- SMLoc S = Parser.getTok().getLoc();
+ const unsigned MaxSize = 4;
// FIXME: How to verify the number of elements matches the number of src
// operands?
- for (int I = 0; I < 4; ++I) {
- if (I != 0) {
- if (getLexer().is(AsmToken::RBrac))
- break;
+ for (int I = 0; ; ++I) {
+ int64_t Op;
+ SMLoc Loc = getLoc();
+ if (!parseExpr(Op))
+ return MatchOperand_ParseFail;
- if (getLexer().isNot(AsmToken::Comma))
- return MatchOperand_ParseFail;
- Parser.Lex();
+ if (Op != 0 && Op != 1) {
+ Error(Loc, "invalid " + StringRef(Prefix) + " value.");
+ return MatchOperand_ParseFail;
}
- if (getLexer().isNot(AsmToken::Integer))
- return MatchOperand_ParseFail;
+ Val |= (Op << I);
- int64_t Op;
- if (getParser().parseAbsoluteExpression(Op))
+ if (trySkipToken(AsmToken::RBrac))
+ break;
+
+ if (I + 1 == MaxSize) {
+ Error(getLoc(), "expected a closing square bracket");
return MatchOperand_ParseFail;
+ }
- if (Op != 0 && Op != 1)
+ if (!skipToken(AsmToken::Comma, "expected a comma"))
return MatchOperand_ParseFail;
- Val |= (Op << I);
}
- Parser.Lex();
Operands.push_back(AMDGPUOperand::CreateImm(this, Val, S, ImmTy));
return MatchOperand_Success;
}
@@ -3459,7 +4494,7 @@ AMDGPUAsmParser::parseNamedBit(const char *Name, OperandVector &Operands,
if (Tok == Name) {
if (Tok == "r128" && isGFX9())
Error(S, "r128 modifier is not supported on this GPU");
- if (Tok == "a16" && !isGFX9())
+ if (Tok == "a16" && !isGFX9() && !isGFX10())
Error(S, "a16 modifier is not supported on this GPU");
Bit = 1;
Parser.Lex();
@@ -3476,6 +4511,9 @@ AMDGPUAsmParser::parseNamedBit(const char *Name, OperandVector &Operands,
}
}
+ if (!isGFX10() && ImmTy == AMDGPUOperand::ImmTyDLC)
+ return MatchOperand_ParseFail;
+
Operands.push_back(AMDGPUOperand::CreateImm(this, Bit, S, ImmTy));
return MatchOperand_Success;
}
@@ -3616,7 +4654,8 @@ void AMDGPUAsmParser::cvtDSImpl(MCInst &Inst, const OperandVector &Operands,
}
AMDGPUOperand::ImmTy OffsetType =
- (Inst.getOpcode() == AMDGPU::DS_SWIZZLE_B32_si ||
+ (Inst.getOpcode() == AMDGPU::DS_SWIZZLE_B32_gfx10 ||
+ Inst.getOpcode() == AMDGPU::DS_SWIZZLE_B32_gfx6_gfx7 ||
Inst.getOpcode() == AMDGPU::DS_SWIZZLE_B32_vi) ? AMDGPUOperand::ImmTySwizzle :
AMDGPUOperand::ImmTyOffset;
@@ -3716,20 +4755,18 @@ encodeCnt(
}
bool AMDGPUAsmParser::parseCnt(int64_t &IntVal) {
- StringRef CntName = Parser.getTok().getString();
- int64_t CntVal;
- Parser.Lex();
- if (getLexer().isNot(AsmToken::LParen))
- return true;
+ SMLoc CntLoc = getLoc();
+ StringRef CntName = getTokenStr();
- Parser.Lex();
- if (getLexer().isNot(AsmToken::Integer))
- return true;
+ if (!skipToken(AsmToken::Identifier, "expected a counter name") ||
+ !skipToken(AsmToken::LParen, "expected a left parenthesis"))
+ return false;
- SMLoc ValLoc = Parser.getTok().getLoc();
- if (getParser().parseAbsoluteExpression(CntVal))
- return true;
+ int64_t CntVal;
+ SMLoc ValLoc = getLoc();
+ if (!parseExpr(CntVal))
+ return false;
AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(getSTI().getCPU());
@@ -3742,265 +4779,240 @@ bool AMDGPUAsmParser::parseCnt(int64_t &IntVal) {
Failed = encodeCnt(ISA, IntVal, CntVal, Sat, encodeExpcnt, decodeExpcnt);
} else if (CntName == "lgkmcnt" || CntName == "lgkmcnt_sat") {
Failed = encodeCnt(ISA, IntVal, CntVal, Sat, encodeLgkmcnt, decodeLgkmcnt);
+ } else {
+ Error(CntLoc, "invalid counter name " + CntName);
+ return false;
}
if (Failed) {
Error(ValLoc, "too large value for " + CntName);
- return true;
+ return false;
}
- if (getLexer().isNot(AsmToken::RParen)) {
- return true;
- }
+ if (!skipToken(AsmToken::RParen, "expected a closing parenthesis"))
+ return false;
- Parser.Lex();
- if (getLexer().is(AsmToken::Amp) || getLexer().is(AsmToken::Comma)) {
- const AsmToken NextToken = getLexer().peekTok();
- if (NextToken.is(AsmToken::Identifier)) {
- Parser.Lex();
+ if (trySkipToken(AsmToken::Amp) || trySkipToken(AsmToken::Comma)) {
+ if (isToken(AsmToken::EndOfStatement)) {
+ Error(getLoc(), "expected a counter name");
+ return false;
}
}
- return false;
+ return true;
}
OperandMatchResultTy
AMDGPUAsmParser::parseSWaitCntOps(OperandVector &Operands) {
AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(getSTI().getCPU());
int64_t Waitcnt = getWaitcntBitMask(ISA);
- SMLoc S = Parser.getTok().getLoc();
-
- switch(getLexer().getKind()) {
- default: return MatchOperand_ParseFail;
- case AsmToken::Integer:
- // The operand can be an integer value.
- if (getParser().parseAbsoluteExpression(Waitcnt))
- return MatchOperand_ParseFail;
- break;
+ SMLoc S = getLoc();
- case AsmToken::Identifier:
- do {
- if (parseCnt(Waitcnt))
- return MatchOperand_ParseFail;
- } while(getLexer().isNot(AsmToken::EndOfStatement));
- break;
+  // If the parse failed, do not return an error code
+  // to avoid excessive error messages.
+ if (isToken(AsmToken::Identifier) && peekToken().is(AsmToken::LParen)) {
+ while (parseCnt(Waitcnt) && !isToken(AsmToken::EndOfStatement));
+ } else {
+ parseExpr(Waitcnt);
}
+
Operands.push_back(AMDGPUOperand::CreateImm(this, Waitcnt, S));
return MatchOperand_Success;
}
-bool AMDGPUAsmParser::parseHwregConstruct(OperandInfoTy &HwReg, int64_t &Offset,
- int64_t &Width) {
- using namespace llvm::AMDGPU::Hwreg;
+bool
+AMDGPUOperand::isSWaitCnt() const {
+ return isImm();
+}
- if (Parser.getTok().getString() != "hwreg")
- return true;
- Parser.Lex();
+//===----------------------------------------------------------------------===//
+// hwreg
+//===----------------------------------------------------------------------===//
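+//
+// Accepts either a raw 16-bit immediate or the symbolic form
+// hwreg(<name or id>[, <bit offset>, <bit width>]),
+// e.g. hwreg(HW_REG_MODE, 0, 32).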
- if (getLexer().isNot(AsmToken::LParen))
- return true;
- Parser.Lex();
+bool
+AMDGPUAsmParser::parseHwregBody(OperandInfoTy &HwReg,
+ int64_t &Offset,
+ int64_t &Width) {
+ using namespace llvm::AMDGPU::Hwreg;
- if (getLexer().is(AsmToken::Identifier)) {
+ // The register may be specified by name or using a numeric code
+ if (isToken(AsmToken::Identifier) &&
+ (HwReg.Id = getHwregId(getTokenStr())) >= 0) {
HwReg.IsSymbolic = true;
- HwReg.Id = ID_UNKNOWN_;
- const StringRef tok = Parser.getTok().getString();
- int Last = ID_SYMBOLIC_LAST_;
- if (isSI() || isCI() || isVI())
- Last = ID_SYMBOLIC_FIRST_GFX9_;
- for (int i = ID_SYMBOLIC_FIRST_; i < Last; ++i) {
- if (tok == IdSymbolic[i]) {
- HwReg.Id = i;
- break;
- }
- }
- Parser.Lex();
- } else {
- HwReg.IsSymbolic = false;
- if (getLexer().isNot(AsmToken::Integer))
- return true;
- if (getParser().parseAbsoluteExpression(HwReg.Id))
- return true;
- }
-
- if (getLexer().is(AsmToken::RParen)) {
- Parser.Lex();
+ lex(); // skip message name
+ } else if (!parseExpr(HwReg.Id)) {
return false;
}
- // optional params
- if (getLexer().isNot(AsmToken::Comma))
- return true;
- Parser.Lex();
-
- if (getLexer().isNot(AsmToken::Integer))
- return true;
- if (getParser().parseAbsoluteExpression(Offset))
+ if (trySkipToken(AsmToken::RParen))
return true;
- if (getLexer().isNot(AsmToken::Comma))
- return true;
- Parser.Lex();
+ // parse optional params
+ return
+ skipToken(AsmToken::Comma, "expected a comma or a closing parenthesis") &&
+ parseExpr(Offset) &&
+ skipToken(AsmToken::Comma, "expected a comma") &&
+ parseExpr(Width) &&
+ skipToken(AsmToken::RParen, "expected a closing parenthesis");
+}
- if (getLexer().isNot(AsmToken::Integer))
- return true;
- if (getParser().parseAbsoluteExpression(Width))
- return true;
+bool
+AMDGPUAsmParser::validateHwreg(const OperandInfoTy &HwReg,
+ const int64_t Offset,
+ const int64_t Width,
+ const SMLoc Loc) {
- if (getLexer().isNot(AsmToken::RParen))
- return true;
- Parser.Lex();
+ using namespace llvm::AMDGPU::Hwreg;
- return false;
+ if (HwReg.IsSymbolic && !isValidHwreg(HwReg.Id, getSTI())) {
+ Error(Loc, "specified hardware register is not supported on this GPU");
+ return false;
+ } else if (!isValidHwreg(HwReg.Id)) {
+ Error(Loc, "invalid code of hardware register: only 6-bit values are legal");
+ return false;
+ } else if (!isValidHwregOffset(Offset)) {
+ Error(Loc, "invalid bit offset: only 5-bit values are legal");
+ return false;
+ } else if (!isValidHwregWidth(Width)) {
+ Error(Loc, "invalid bitfield width: only values from 1 to 32 are legal");
+ return false;
+ }
+ return true;
}
-OperandMatchResultTy AMDGPUAsmParser::parseHwreg(OperandVector &Operands) {
+OperandMatchResultTy
+AMDGPUAsmParser::parseHwreg(OperandVector &Operands) {
using namespace llvm::AMDGPU::Hwreg;
- int64_t Imm16Val = 0;
- SMLoc S = Parser.getTok().getLoc();
-
- switch(getLexer().getKind()) {
- default: return MatchOperand_NoMatch;
- case AsmToken::Integer:
- // The operand can be an integer value.
- if (getParser().parseAbsoluteExpression(Imm16Val))
- return MatchOperand_NoMatch;
- if (Imm16Val < 0 || !isUInt<16>(Imm16Val)) {
- Error(S, "invalid immediate: only 16-bit values are legal");
- // Do not return error code, but create an imm operand anyway and proceed
- // to the next operand, if any. That avoids unneccessary error messages.
- }
- break;
-
- case AsmToken::Identifier: {
- OperandInfoTy HwReg(ID_UNKNOWN_);
- int64_t Offset = OFFSET_DEFAULT_;
- int64_t Width = WIDTH_M1_DEFAULT_ + 1;
- if (parseHwregConstruct(HwReg, Offset, Width))
- return MatchOperand_ParseFail;
- if (HwReg.Id < 0 || !isUInt<ID_WIDTH_>(HwReg.Id)) {
- if (HwReg.IsSymbolic)
- Error(S, "invalid symbolic name of hardware register");
- else
- Error(S, "invalid code of hardware register: only 6-bit values are legal");
- }
- if (Offset < 0 || !isUInt<OFFSET_WIDTH_>(Offset))
- Error(S, "invalid bit offset: only 5-bit values are legal");
- if ((Width-1) < 0 || !isUInt<WIDTH_M1_WIDTH_>(Width-1))
- Error(S, "invalid bitfield width: only values from 1 to 32 are legal");
- Imm16Val = (HwReg.Id << ID_SHIFT_) | (Offset << OFFSET_SHIFT_) | ((Width-1) << WIDTH_M1_SHIFT_);
- }
- break;
+ int64_t ImmVal = 0;
+ SMLoc Loc = getLoc();
+
+  // If the parse failed, do not return an error code
+  // to avoid excessive error messages.
+ if (trySkipId("hwreg", AsmToken::LParen)) {
+ OperandInfoTy HwReg(ID_UNKNOWN_);
+ int64_t Offset = OFFSET_DEFAULT_;
+ int64_t Width = WIDTH_DEFAULT_;
+ if (parseHwregBody(HwReg, Offset, Width) &&
+ validateHwreg(HwReg, Offset, Width, Loc)) {
+ ImmVal = encodeHwreg(HwReg.Id, Offset, Width);
+ }
+ } else if (parseExpr(ImmVal)) {
+ if (ImmVal < 0 || !isUInt<16>(ImmVal))
+ Error(Loc, "invalid immediate: only 16-bit values are legal");
}
- Operands.push_back(AMDGPUOperand::CreateImm(this, Imm16Val, S, AMDGPUOperand::ImmTyHwreg));
- return MatchOperand_Success;
-}
-bool AMDGPUOperand::isSWaitCnt() const {
- return isImm();
+ Operands.push_back(AMDGPUOperand::CreateImm(this, ImmVal, Loc, AMDGPUOperand::ImmTyHwreg));
+ return MatchOperand_Success;
}
bool AMDGPUOperand::isHwreg() const {
return isImmTy(ImmTyHwreg);
}
-bool AMDGPUAsmParser::parseSendMsgConstruct(OperandInfoTy &Msg, OperandInfoTy &Operation, int64_t &StreamId) {
+//===----------------------------------------------------------------------===//
+// sendmsg
+//===----------------------------------------------------------------------===//
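+//
+// Accepts either a raw 16-bit immediate or the symbolic form
+// sendmsg(<msg>[, <operation>[, <stream id>]]),
+// e.g. sendmsg(MSG_GS, GS_OP_EMIT, 0).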
+
+bool
+AMDGPUAsmParser::parseSendMsgBody(OperandInfoTy &Msg,
+ OperandInfoTy &Op,
+ OperandInfoTy &Stream) {
using namespace llvm::AMDGPU::SendMsg;
- if (Parser.getTok().getString() != "sendmsg")
- return true;
- Parser.Lex();
+ if (isToken(AsmToken::Identifier) && (Msg.Id = getMsgId(getTokenStr())) >= 0) {
+ Msg.IsSymbolic = true;
+ lex(); // skip message name
+ } else if (!parseExpr(Msg.Id)) {
+ return false;
+ }
- if (getLexer().isNot(AsmToken::LParen))
- return true;
- Parser.Lex();
+ if (trySkipToken(AsmToken::Comma)) {
+ Op.IsDefined = true;
+ if (isToken(AsmToken::Identifier) &&
+ (Op.Id = getMsgOpId(Msg.Id, getTokenStr())) >= 0) {
+ lex(); // skip operation name
+ } else if (!parseExpr(Op.Id)) {
+ return false;
+ }
- if (getLexer().is(AsmToken::Identifier)) {
- Msg.IsSymbolic = true;
- Msg.Id = ID_UNKNOWN_;
- const std::string tok = Parser.getTok().getString();
- for (int i = ID_GAPS_FIRST_; i < ID_GAPS_LAST_; ++i) {
- switch(i) {
- default: continue; // Omit gaps.
- case ID_INTERRUPT: case ID_GS: case ID_GS_DONE: case ID_SYSMSG: break;
- }
- if (tok == IdSymbolic[i]) {
- Msg.Id = i;
- break;
- }
+ if (trySkipToken(AsmToken::Comma)) {
+ Stream.IsDefined = true;
+ if (!parseExpr(Stream.Id))
+ return false;
}
- Parser.Lex();
- } else {
- Msg.IsSymbolic = false;
- if (getLexer().isNot(AsmToken::Integer))
- return true;
- if (getParser().parseAbsoluteExpression(Msg.Id))
- return true;
- if (getLexer().is(AsmToken::Integer))
- if (getParser().parseAbsoluteExpression(Msg.Id))
- Msg.Id = ID_UNKNOWN_;
}
- if (Msg.Id == ID_UNKNOWN_) // Don't know how to parse the rest.
- return false;
- if (!(Msg.Id == ID_GS || Msg.Id == ID_GS_DONE || Msg.Id == ID_SYSMSG)) {
- if (getLexer().isNot(AsmToken::RParen))
- return true;
- Parser.Lex();
+ return skipToken(AsmToken::RParen, "expected a closing parenthesis");
+}
+
+bool
+AMDGPUAsmParser::validateSendMsg(const OperandInfoTy &Msg,
+ const OperandInfoTy &Op,
+ const OperandInfoTy &Stream,
+ const SMLoc S) {
+ using namespace llvm::AMDGPU::SendMsg;
+
+  // Validation strictness depends on whether the message is specified
+  // in a symbolic or in a numeric form. In the latter case
+  // only the encoding possibility is checked.
+ bool Strict = Msg.IsSymbolic;
+
+ if (!isValidMsgId(Msg.Id, getSTI(), Strict)) {
+ Error(S, "invalid message id");
+ return false;
+ } else if (Strict && (msgRequiresOp(Msg.Id) != Op.IsDefined)) {
+ Error(S, Op.IsDefined ?
+ "message does not support operations" :
+ "missing message operation");
+ return false;
+ } else if (!isValidMsgOp(Msg.Id, Op.Id, Strict)) {
+ Error(S, "invalid operation id");
+ return false;
+ } else if (Strict && !msgSupportsStream(Msg.Id, Op.Id) && Stream.IsDefined) {
+ Error(S, "message operation does not support streams");
+ return false;
+ } else if (!isValidMsgStream(Msg.Id, Op.Id, Stream.Id, Strict)) {
+ Error(S, "invalid message stream id");
return false;
}
+ return true;
+}
- if (getLexer().isNot(AsmToken::Comma))
- return true;
- Parser.Lex();
+OperandMatchResultTy
+AMDGPUAsmParser::parseSendMsgOp(OperandVector &Operands) {
+ using namespace llvm::AMDGPU::SendMsg;
- assert(Msg.Id == ID_GS || Msg.Id == ID_GS_DONE || Msg.Id == ID_SYSMSG);
- Operation.Id = ID_UNKNOWN_;
- if (getLexer().is(AsmToken::Identifier)) {
- Operation.IsSymbolic = true;
- const char* const *S = (Msg.Id == ID_SYSMSG) ? OpSysSymbolic : OpGsSymbolic;
- const int F = (Msg.Id == ID_SYSMSG) ? OP_SYS_FIRST_ : OP_GS_FIRST_;
- const int L = (Msg.Id == ID_SYSMSG) ? OP_SYS_LAST_ : OP_GS_LAST_;
- const StringRef Tok = Parser.getTok().getString();
- for (int i = F; i < L; ++i) {
- if (Tok == S[i]) {
- Operation.Id = i;
- break;
- }
+ int64_t ImmVal = 0;
+ SMLoc Loc = getLoc();
+
+  // If the parse failed, do not return an error code
+  // to avoid excessive error messages.
+ if (trySkipId("sendmsg", AsmToken::LParen)) {
+ OperandInfoTy Msg(ID_UNKNOWN_);
+ OperandInfoTy Op(OP_NONE_);
+ OperandInfoTy Stream(STREAM_ID_NONE_);
+ if (parseSendMsgBody(Msg, Op, Stream) &&
+ validateSendMsg(Msg, Op, Stream, Loc)) {
+ ImmVal = encodeMsg(Msg.Id, Op.Id, Stream.Id);
}
- Parser.Lex();
- } else {
- Operation.IsSymbolic = false;
- if (getLexer().isNot(AsmToken::Integer))
- return true;
- if (getParser().parseAbsoluteExpression(Operation.Id))
- return true;
+ } else if (parseExpr(ImmVal)) {
+ if (ImmVal < 0 || !isUInt<16>(ImmVal))
+ Error(Loc, "invalid immediate: only 16-bit values are legal");
}
- if ((Msg.Id == ID_GS || Msg.Id == ID_GS_DONE) && Operation.Id != OP_GS_NOP) {
- // Stream id is optional.
- if (getLexer().is(AsmToken::RParen)) {
- Parser.Lex();
- return false;
- }
-
- if (getLexer().isNot(AsmToken::Comma))
- return true;
- Parser.Lex();
-
- if (getLexer().isNot(AsmToken::Integer))
- return true;
- if (getParser().parseAbsoluteExpression(StreamId))
- return true;
- }
+ Operands.push_back(AMDGPUOperand::CreateImm(this, ImmVal, Loc, AMDGPUOperand::ImmTySendMsg));
+ return MatchOperand_Success;
+}
- if (getLexer().isNot(AsmToken::RParen))
- return true;
- Parser.Lex();
- return false;
+bool AMDGPUOperand::isSendMsg() const {
+ return isImmTy(ImmTySendMsg);
}
+//===----------------------------------------------------------------------===//
+// v_interp
+//===----------------------------------------------------------------------===//
+
OperandMatchResultTy AMDGPUAsmParser::parseInterpSlot(OperandVector &Operands) {
if (getLexer().getKind() != AsmToken::Identifier)
return MatchOperand_NoMatch;
@@ -4062,6 +5074,10 @@ OperandMatchResultTy AMDGPUAsmParser::parseInterpAttr(OperandVector &Operands) {
return MatchOperand_Success;
}
+//===----------------------------------------------------------------------===//
+// exp
+//===----------------------------------------------------------------------===//
+
void AMDGPUAsmParser::errorExpTgt() {
Error(Parser.getTok().getLoc(), "invalid exp target");
}
@@ -4094,13 +5110,18 @@ OperandMatchResultTy AMDGPUAsmParser::parseExpTgtImpl(StringRef Str,
if (Str.getAsInteger(10, Val))
return MatchOperand_ParseFail;
- if (Val > 3)
+ if (Val > 4 || (Val == 4 && !isGFX10()))
errorExpTgt();
Val += 12;
return MatchOperand_Success;
}
+ if (isGFX10() && Str == "prim") {
+ Val = 20;
+ return MatchOperand_Success;
+ }
+
if (Str.startswith("param")) {
Str = Str.drop_front(5);
if (Str.getAsInteger(10, Val))
@@ -4141,98 +5162,39 @@ OperandMatchResultTy AMDGPUAsmParser::parseExpTgt(OperandVector &Operands) {
return MatchOperand_Success;
}
-OperandMatchResultTy
-AMDGPUAsmParser::parseSendMsgOp(OperandVector &Operands) {
- using namespace llvm::AMDGPU::SendMsg;
-
- int64_t Imm16Val = 0;
- SMLoc S = Parser.getTok().getLoc();
+//===----------------------------------------------------------------------===//
+// parser helpers
+//===----------------------------------------------------------------------===//
- switch(getLexer().getKind()) {
- default:
- return MatchOperand_NoMatch;
- case AsmToken::Integer:
- // The operand can be an integer value.
- if (getParser().parseAbsoluteExpression(Imm16Val))
- return MatchOperand_NoMatch;
- if (Imm16Val < 0 || !isUInt<16>(Imm16Val)) {
- Error(S, "invalid immediate: only 16-bit values are legal");
- // Do not return error code, but create an imm operand anyway and proceed
- // to the next operand, if any. That avoids unneccessary error messages.
- }
- break;
- case AsmToken::Identifier: {
- OperandInfoTy Msg(ID_UNKNOWN_);
- OperandInfoTy Operation(OP_UNKNOWN_);
- int64_t StreamId = STREAM_ID_DEFAULT_;
- if (parseSendMsgConstruct(Msg, Operation, StreamId))
- return MatchOperand_ParseFail;
- do {
- // Validate and encode message ID.
- if (! ((ID_INTERRUPT <= Msg.Id && Msg.Id <= ID_GS_DONE)
- || Msg.Id == ID_SYSMSG)) {
- if (Msg.IsSymbolic)
- Error(S, "invalid/unsupported symbolic name of message");
- else
- Error(S, "invalid/unsupported code of message");
- break;
- }
- Imm16Val = (Msg.Id << ID_SHIFT_);
- // Validate and encode operation ID.
- if (Msg.Id == ID_GS || Msg.Id == ID_GS_DONE) {
- if (! (OP_GS_FIRST_ <= Operation.Id && Operation.Id < OP_GS_LAST_)) {
- if (Operation.IsSymbolic)
- Error(S, "invalid symbolic name of GS_OP");
- else
- Error(S, "invalid code of GS_OP: only 2-bit values are legal");
- break;
- }
- if (Operation.Id == OP_GS_NOP
- && Msg.Id != ID_GS_DONE) {
- Error(S, "invalid GS_OP: NOP is for GS_DONE only");
- break;
- }
- Imm16Val |= (Operation.Id << OP_SHIFT_);
- }
- if (Msg.Id == ID_SYSMSG) {
- if (! (OP_SYS_FIRST_ <= Operation.Id && Operation.Id < OP_SYS_LAST_)) {
- if (Operation.IsSymbolic)
- Error(S, "invalid/unsupported symbolic name of SYSMSG_OP");
- else
- Error(S, "invalid/unsupported code of SYSMSG_OP");
- break;
- }
- Imm16Val |= (Operation.Id << OP_SHIFT_);
- }
- // Validate and encode stream ID.
- if ((Msg.Id == ID_GS || Msg.Id == ID_GS_DONE) && Operation.Id != OP_GS_NOP) {
- if (! (STREAM_ID_FIRST_ <= StreamId && StreamId < STREAM_ID_LAST_)) {
- Error(S, "invalid stream id: only 2-bit values are legal");
- break;
- }
- Imm16Val |= (StreamId << STREAM_ID_SHIFT_);
- }
- } while (false);
- }
- break;
- }
- Operands.push_back(AMDGPUOperand::CreateImm(this, Imm16Val, S, AMDGPUOperand::ImmTySendMsg));
- return MatchOperand_Success;
+bool
+AMDGPUAsmParser::isId(const AsmToken &Token, const StringRef Id) const {
+ return Token.is(AsmToken::Identifier) && Token.getString() == Id;
}
-bool AMDGPUOperand::isSendMsg() const {
- return isImmTy(ImmTySendMsg);
+bool
+AMDGPUAsmParser::isId(const StringRef Id) const {
+ return isId(getToken(), Id);
}
-//===----------------------------------------------------------------------===//
-// parser helpers
-//===----------------------------------------------------------------------===//
+bool
+AMDGPUAsmParser::isToken(const AsmToken::TokenKind Kind) const {
+ return getTokenKind() == Kind;
+}
bool
AMDGPUAsmParser::trySkipId(const StringRef Id) {
- if (getLexer().getKind() == AsmToken::Identifier &&
- Parser.getTok().getString() == Id) {
- Parser.Lex();
+ if (isId(Id)) {
+ lex();
+ return true;
+ }
+ return false;
+}
+
+bool
+AMDGPUAsmParser::trySkipId(const StringRef Id, const AsmToken::TokenKind Kind) {
+ if (isId(Id) && peekToken().is(Kind)) {
+ lex();
+ lex();
return true;
}
return false;
@@ -4240,8 +5202,8 @@ AMDGPUAsmParser::trySkipId(const StringRef Id) {
bool
AMDGPUAsmParser::trySkipToken(const AsmToken::TokenKind Kind) {
- if (getLexer().getKind() == Kind) {
- Parser.Lex();
+ if (isToken(Kind)) {
+ lex();
return true;
}
return false;
@@ -4251,7 +5213,7 @@ bool
AMDGPUAsmParser::skipToken(const AsmToken::TokenKind Kind,
const StringRef ErrMsg) {
if (!trySkipToken(Kind)) {
- Error(Parser.getTok().getLoc(), ErrMsg);
+ Error(getLoc(), ErrMsg);
return false;
}
return true;
@@ -4264,17 +5226,54 @@ AMDGPUAsmParser::parseExpr(int64_t &Imm) {
bool
AMDGPUAsmParser::parseString(StringRef &Val, const StringRef ErrMsg) {
- SMLoc S = Parser.getTok().getLoc();
- if (getLexer().getKind() == AsmToken::String) {
- Val = Parser.getTok().getStringContents();
- Parser.Lex();
+ if (isToken(AsmToken::String)) {
+ Val = getToken().getStringContents();
+ lex();
return true;
} else {
- Error(S, ErrMsg);
+ Error(getLoc(), ErrMsg);
return false;
}
}
+AsmToken
+AMDGPUAsmParser::getToken() const {
+ return Parser.getTok();
+}
+
+AsmToken
+AMDGPUAsmParser::peekToken() {
+ return getLexer().peekTok();
+}
+
+void
+AMDGPUAsmParser::peekTokens(MutableArrayRef<AsmToken> Tokens) {
+ auto TokCount = getLexer().peekTokens(Tokens);
+
+ for (auto Idx = TokCount; Idx < Tokens.size(); ++Idx)
+ Tokens[Idx] = AsmToken(AsmToken::Error, "");
+}
+
+AsmToken::TokenKind
+AMDGPUAsmParser::getTokenKind() const {
+ return getLexer().getKind();
+}
+
+SMLoc
+AMDGPUAsmParser::getLoc() const {
+ return getToken().getLoc();
+}
+
+StringRef
+AMDGPUAsmParser::getTokenStr() const {
+ return getToken().getString();
+}
+
+void
+AMDGPUAsmParser::lex() {
+ Parser.Lex();
+}
+
//===----------------------------------------------------------------------===//
// swizzle
//===----------------------------------------------------------------------===//
@@ -4322,8 +5321,8 @@ AMDGPUAsmParser::parseSwizzleQuadPerm(int64_t &Imm) {
if (parseSwizzleOperands(LANE_NUM, Lane, 0, LANE_MAX,
"expected a 2-bit lane id")) {
Imm = QUAD_PERM_ENC;
- for (auto i = 0; i < LANE_NUM; ++i) {
- Imm |= Lane[i] << (LANE_SHIFT * i);
+ for (unsigned I = 0; I < LANE_NUM; ++I) {
+ Imm |= Lane[I] << (LANE_SHIFT * I);
}
return true;
}
@@ -4519,6 +5518,88 @@ AMDGPUOperand::isSwizzle() const {
}
//===----------------------------------------------------------------------===//
+// VGPR Index Mode
+//===----------------------------------------------------------------------===//
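+//
+// Accepts either a raw 4-bit immediate or the symbolic form used by
+// s_set_gpr_idx_on, e.g. gpr_idx(SRC0,DST); the recognized mode names
+// are SRC0, SRC1, SRC2 and DST.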
+
+int64_t AMDGPUAsmParser::parseGPRIdxMacro() {
+
+ using namespace llvm::AMDGPU::VGPRIndexMode;
+
+ if (trySkipToken(AsmToken::RParen)) {
+ return OFF;
+ }
+
+ int64_t Imm = 0;
+
+ while (true) {
+ unsigned Mode = 0;
+ SMLoc S = Parser.getTok().getLoc();
+
+ for (unsigned ModeId = ID_MIN; ModeId <= ID_MAX; ++ModeId) {
+ if (trySkipId(IdSymbolic[ModeId])) {
+ Mode = 1 << ModeId;
+ break;
+ }
+ }
+
+ if (Mode == 0) {
+      Error(S, (Imm == 0) ?
+ "expected a VGPR index mode or a closing parenthesis" :
+ "expected a VGPR index mode");
+ break;
+ }
+
+ if (Imm & Mode) {
+ Error(S, "duplicate VGPR index mode");
+ break;
+ }
+ Imm |= Mode;
+
+ if (trySkipToken(AsmToken::RParen))
+ break;
+ if (!skipToken(AsmToken::Comma,
+ "expected a comma or a closing parenthesis"))
+ break;
+ }
+
+ return Imm;
+}
+
+OperandMatchResultTy
+AMDGPUAsmParser::parseGPRIdxMode(OperandVector &Operands) {
+
+ int64_t Imm = 0;
+ SMLoc S = Parser.getTok().getLoc();
+
+ if (getLexer().getKind() == AsmToken::Identifier &&
+ Parser.getTok().getString() == "gpr_idx" &&
+ getLexer().peekTok().is(AsmToken::LParen)) {
+
+ Parser.Lex();
+ Parser.Lex();
+
+    // If the parse failed, trigger an error but do not return an error code
+    // to avoid excessive error messages.
+ Imm = parseGPRIdxMacro();
+
+ } else {
+ if (getParser().parseAbsoluteExpression(Imm))
+ return MatchOperand_NoMatch;
+ if (Imm < 0 || !isUInt<4>(Imm)) {
+ Error(S, "invalid immediate: only 4-bit values are legal");
+ }
+ }
+
+ Operands.push_back(
+ AMDGPUOperand::CreateImm(this, Imm, S, AMDGPUOperand::ImmTyGprIdxMode));
+ return MatchOperand_Success;
+}
+
+bool AMDGPUOperand::isGPRIdxMode() const {
+ return isImmTy(ImmTyGprIdxMode);
+}
+
+//===----------------------------------------------------------------------===//
// sopp branch targets
//===----------------------------------------------------------------------===//
@@ -4546,9 +5627,22 @@ AMDGPUAsmParser::parseSOppBrTarget(OperandVector &Operands) {
}
//===----------------------------------------------------------------------===//
+// Boolean holding registers
+//===----------------------------------------------------------------------===//
+
+OperandMatchResultTy
+AMDGPUAsmParser::parseBoolReg(OperandVector &Operands) {
+ return parseReg(Operands);
+}
+
+//===----------------------------------------------------------------------===//
// mubuf
//===----------------------------------------------------------------------===//
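+// The dlc (device level coherent) bit is a GFX10-only cache policy
+// modifier; it defaults to 0 when not specified.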
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultDLC() const {
+ return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyDLC);
+}
+
AMDGPUOperand::Ptr AMDGPUAsmParser::defaultGLC() const {
return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyGLC);
}
@@ -4566,13 +5660,19 @@ void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst,
bool HasLdsModifier = false;
OptionalImmIndexMap OptionalIdx;
assert(IsAtomicReturn ? IsAtomic : true);
+ unsigned FirstOperandIdx = 1;
- for (unsigned i = 1, e = Operands.size(); i != e; ++i) {
+ for (unsigned i = FirstOperandIdx, e = Operands.size(); i != e; ++i) {
AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]);
// Add the register arguments
if (Op.isReg()) {
Op.addRegOperands(Inst, 1);
+ // Insert a tied src for atomic return dst.
+ // This cannot be postponed as subsequent calls to
+ // addImmOperands rely on correct number of MC operands.
+ if (IsAtomicReturn && i == FirstOperandIdx)
+ Op.addRegOperands(Inst, 1);
continue;
}
@@ -4582,7 +5682,7 @@ void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst,
continue;
}
- HasLdsModifier = Op.isLDS();
+ HasLdsModifier |= Op.isLDS();
// Handle tokens like 'offen' which are sometimes hard-coded into the
// asm string. There are no MCInst operands for these.
@@ -4610,12 +5710,6 @@ void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst,
}
}
- // Copy $vdata_in operand and insert as $vdata for MUBUF_Atomic RTN insns.
- if (IsAtomicReturn) {
- MCInst::iterator I = Inst.begin(); // $vdata_in is always at the beginning.
- Inst.insert(I, *I);
- }
-
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset);
if (!IsAtomic) { // glc is hard-coded.
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC);
@@ -4625,6 +5719,9 @@ void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst,
if (!IsLdsOpcode) { // tfe is not legal with lds opcodes
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE);
}
+
+ if (isGFX10())
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDLC);
}
void AMDGPUAsmParser::cvtMtbuf(MCInst &Inst, const OperandVector &Operands) {
@@ -4662,6 +5759,9 @@ void AMDGPUAsmParser::cvtMtbuf(MCInst &Inst, const OperandVector &Operands) {
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE);
+
+ if (isGFX10())
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDLC);
}
//===----------------------------------------------------------------------===//
@@ -4692,19 +5792,26 @@ void AMDGPUAsmParser::cvtMIMG(MCInst &Inst, const OperandVector &Operands,
Op.addRegOperands(Inst, 1);
} else if (Op.isImmModifier()) {
OptionalIdx[Op.getImmTy()] = I;
- } else {
+ } else if (!Op.isToken()) {
llvm_unreachable("unexpected operand type");
}
}
+ bool IsGFX10 = isGFX10();
+
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDMask);
+ if (IsGFX10)
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDim, -1);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyUNorm);
+ if (IsGFX10)
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDLC);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyR128A16);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyLWE);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDA);
+ if (!IsGFX10)
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDA);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyD16);
}
@@ -4742,11 +5849,7 @@ AMDGPUOperand::Ptr AMDGPUAsmParser::defaultSMRDLiteralOffset() const {
return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyOffset);
}
-AMDGPUOperand::Ptr AMDGPUAsmParser::defaultOffsetU12() const {
- return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyOffset);
-}
-
-AMDGPUOperand::Ptr AMDGPUAsmParser::defaultOffsetS13() const {
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultFlatOffset() const {
return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyOffset);
}
@@ -4801,7 +5904,8 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = {
{"lds", AMDGPUOperand::ImmTyLDS, true, nullptr},
{"offset", AMDGPUOperand::ImmTyOffset, false, nullptr},
{"inst_offset", AMDGPUOperand::ImmTyInstOffset, false, nullptr},
- {"dfmt", AMDGPUOperand::ImmTyFORMAT, false, nullptr},
+ {"dlc", AMDGPUOperand::ImmTyDLC, true, nullptr},
+ {"format", AMDGPUOperand::ImmTyFORMAT, false, nullptr},
{"glc", AMDGPUOperand::ImmTyGLC, true, nullptr},
{"slc", AMDGPUOperand::ImmTySLC, true, nullptr},
{"tfe", AMDGPUOperand::ImmTyTFE, true, nullptr},
@@ -4816,9 +5920,11 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = {
{"lwe", AMDGPUOperand::ImmTyLWE, true, nullptr},
{"d16", AMDGPUOperand::ImmTyD16, true, nullptr},
{"dmask", AMDGPUOperand::ImmTyDMask, false, nullptr},
+ {"dim", AMDGPUOperand::ImmTyDim, false, nullptr},
{"row_mask", AMDGPUOperand::ImmTyDppRowMask, false, nullptr},
{"bank_mask", AMDGPUOperand::ImmTyDppBankMask, false, nullptr},
{"bound_ctrl", AMDGPUOperand::ImmTyDppBoundCtrl, false, ConvertBoundCtrl},
+ {"fi", AMDGPUOperand::ImmTyDppFi, false, nullptr},
{"dst_sel", AMDGPUOperand::ImmTySdwaDstSel, false, nullptr},
{"src0_sel", AMDGPUOperand::ImmTySdwaSrc0Sel, false, nullptr},
{"src1_sel", AMDGPUOperand::ImmTySdwaSrc1Sel, false, nullptr},
@@ -4828,7 +5934,10 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = {
{"op_sel", AMDGPUOperand::ImmTyOpSel, false, nullptr},
{"op_sel_hi", AMDGPUOperand::ImmTyOpSelHi, false, nullptr},
{"neg_lo", AMDGPUOperand::ImmTyNegLo, false, nullptr},
- {"neg_hi", AMDGPUOperand::ImmTyNegHi, false, nullptr}
+ {"neg_hi", AMDGPUOperand::ImmTyNegHi, false, nullptr},
+ {"blgp", AMDGPUOperand::ImmTyBLGP, false, nullptr},
+ {"cbsz", AMDGPUOperand::ImmTyCBSZ, false, nullptr},
+ {"abid", AMDGPUOperand::ImmTyABID, false, nullptr}
};
OperandMatchResultTy AMDGPUAsmParser::parseOptionalOperand(OperandVector &Operands) {
@@ -4884,7 +5993,9 @@ OperandMatchResultTy AMDGPUAsmParser::parseOptionalOpr(OperandVector &Operands)
Op.Type == AMDGPUOperand::ImmTyNegHi) {
res = parseOperandArrayWithPrefix(Op.Name, Operands, Op.Type,
Op.ConvertResult);
- } else if (Op.Type == AMDGPUOperand::ImmTyFORMAT) {
+ } else if (Op.Type == AMDGPUOperand::ImmTyDim) {
+ res = parseDim(Operands);
+ } else if (Op.Type == AMDGPUOperand::ImmTyFORMAT && !isGFX10()) {
res = parseDfmtNfmt(Operands);
} else {
res = parseIntWithPrefix(Op.Name, Operands, Op.Type, Op.ConvertResult);
@@ -4964,7 +6075,7 @@ void AMDGPUAsmParser::cvtVOP3Interp(MCInst &Inst, const OperandVector &Operands)
} else if (Op.isInterpSlot() ||
Op.isInterpAttr() ||
Op.isAttrChan()) {
- Inst.addOperand(MCOperand::createImm(Op.Imm.Val));
+ Inst.addOperand(MCOperand::createImm(Op.getImm()));
} else if (Op.isImmModifier()) {
OptionalIdx[Op.getImmTy()] = I;
} else {
@@ -5029,14 +6140,17 @@ void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands,
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI);
}
- // Special case v_mac_{f16, f32} and v_fmac_f32 (gfx906):
+ // Special case v_mac_{f16, f32} and v_fmac_{f16, f32} (gfx906/gfx10+):
// they have a src2 register operand that is tied to the dst operand;
// we don't allow modifiers for this operand in the assembler, so src2_modifiers
// should be 0.
- if (Opc == AMDGPU::V_MAC_F32_e64_si ||
+ if (Opc == AMDGPU::V_MAC_F32_e64_gfx6_gfx7 ||
+ Opc == AMDGPU::V_MAC_F32_e64_gfx10 ||
Opc == AMDGPU::V_MAC_F32_e64_vi ||
Opc == AMDGPU::V_MAC_F16_e64_vi ||
- Opc == AMDGPU::V_FMAC_F32_e64_vi) {
+ Opc == AMDGPU::V_FMAC_F32_e64_gfx10 ||
+ Opc == AMDGPU::V_FMAC_F32_e64_vi ||
+ Opc == AMDGPU::V_FMAC_F16_e64_gfx10) {
auto it = Inst.begin();
std::advance(it, AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2_modifiers));
it = Inst.insert(it, MCOperand::createImm(0)); // no modifiers for src2
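A small sketch of the splice performed above, assuming only that the src2_modifiers slot sits at a known index: a zero immediate is inserted at that position so the tied src2 carries no modifiers. std::list stands in for MCInst here; this is illustrative, not the LLVM API.

#include <cassert>
#include <iterator>
#include <list>

// Insert a literal 0 at Src2ModsIdx, shifting later operands right.
void insertZeroSrc2Mods(std::list<int> &Operands, int Src2ModsIdx) {
  auto It = Operands.begin();
  std::advance(It, Src2ModsIdx);
  Operands.insert(It, 0); // no modifiers allowed on the tied src2
}

int main() {
  std::list<int> Ops = {10, 20, 30}; // placeholder operands
  insertZeroSrc2Mods(Ops, 2);
  assert((Ops == std::list<int>{10, 20, 0, 30}));
  return 0;
}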
@@ -5137,6 +6251,10 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst,
// dpp
//===----------------------------------------------------------------------===//
+bool AMDGPUOperand::isDPP8() const {
+ return isImmTy(ImmTyDPP8);
+}
+
bool AMDGPUOperand::isDPPCtrl() const {
using namespace AMDGPU::DPP;
@@ -5154,13 +6272,27 @@ bool AMDGPUOperand::isDPPCtrl() const {
(Imm == DppCtrl::ROW_MIRROR) ||
(Imm == DppCtrl::ROW_HALF_MIRROR) ||
(Imm == DppCtrl::BCAST15) ||
- (Imm == DppCtrl::BCAST31);
+ (Imm == DppCtrl::BCAST31) ||
+ (Imm >= DppCtrl::ROW_SHARE_FIRST && Imm <= DppCtrl::ROW_SHARE_LAST) ||
+ (Imm >= DppCtrl::ROW_XMASK_FIRST && Imm <= DppCtrl::ROW_XMASK_LAST);
}
return false;
}
-bool AMDGPUOperand::isGPRIdxMode() const {
- return isImm() && isUInt<4>(getImm());
+//===----------------------------------------------------------------------===//
+// mAI
+//===----------------------------------------------------------------------===//
+
+bool AMDGPUOperand::isBLGP() const {
+ return isImm() && getImmTy() == ImmTyBLGP && isUInt<3>(getImm());
+}
+
+bool AMDGPUOperand::isCBSZ() const {
+ return isImm() && getImmTy() == ImmTyCBSZ && isUInt<3>(getImm());
+}
+
+bool AMDGPUOperand::isABID() const {
+ return isImm() && getImmTy() == ImmTyABID && isUInt<4>(getImm());
}
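The new MFMA ("mAI") modifiers above are plain bounded immediates: blgp and cbsz take 3-bit values, abid a 4-bit value. A one-line sketch of the shared range check, same shape as the isUInt<N>(getImm()) predicates used above:

#include <cassert>
#include <cstdint>

// Generic unsigned-width check.
constexpr bool fitsUnsigned(int64_t Imm, unsigned Bits) {
  return Imm >= 0 && Imm < (int64_t(1) << Bits);
}

int main() {
  assert(fitsUnsigned(7, 3) && !fitsUnsigned(8, 3));   // blgp / cbsz
  assert(fitsUnsigned(15, 4) && !fitsUnsigned(16, 4)); // abid
  return 0;
}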
bool AMDGPUOperand::isS16Imm() const {
@@ -5171,6 +6303,108 @@ bool AMDGPUOperand::isU16Imm() const {
return isImm() && isUInt<16>(getImm());
}
+OperandMatchResultTy AMDGPUAsmParser::parseDim(OperandVector &Operands) {
+ if (!isGFX10())
+ return MatchOperand_NoMatch;
+
+ SMLoc S = Parser.getTok().getLoc();
+
+ if (getLexer().isNot(AsmToken::Identifier))
+ return MatchOperand_NoMatch;
+ if (getLexer().getTok().getString() != "dim")
+ return MatchOperand_NoMatch;
+
+ Parser.Lex();
+ if (getLexer().isNot(AsmToken::Colon))
+ return MatchOperand_ParseFail;
+
+ Parser.Lex();
+
+ // We want to allow "dim:1D" etc., but the initial 1 is tokenized as an
+ // integer.
+ std::string Token;
+ if (getLexer().is(AsmToken::Integer)) {
+ SMLoc Loc = getLexer().getTok().getEndLoc();
+ Token = getLexer().getTok().getString();
+ Parser.Lex();
+ if (getLexer().getTok().getLoc() != Loc)
+ return MatchOperand_ParseFail;
+ }
+ if (getLexer().isNot(AsmToken::Identifier))
+ return MatchOperand_ParseFail;
+ Token += getLexer().getTok().getString();
+
+ StringRef DimId = Token;
+ if (DimId.startswith("SQ_RSRC_IMG_"))
+ DimId = DimId.substr(12);
+
+ const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfoByAsmSuffix(DimId);
+ if (!DimInfo)
+ return MatchOperand_ParseFail;
+
+ Parser.Lex();
+
+ Operands.push_back(AMDGPUOperand::CreateImm(this, DimInfo->Encoding, S,
+ AMDGPUOperand::ImmTyDim));
+ return MatchOperand_Success;
+}
+
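The token handling in parseDim above exists because a suffix such as "1D" lexes as an integer token immediately followed by an identifier token, and the two are only merged when there is no gap between them. A standalone sketch of that stitching rule, with hypothetical token begin/end offsets rather than the LLVM lexer API:

#include <cassert>
#include <optional>
#include <string>

struct Tok { std::string Text; unsigned Begin; unsigned End; }; // hypothetical token

// Join "1" + "D" into "1D", but reject "1 D" (a gap means separate operands).
std::optional<std::string> joinDimSuffix(const Tok *IntTok, const Tok &IdentTok) {
  std::string Suffix;
  if (IntTok) {
    if (IntTok->End != IdentTok.Begin)
      return std::nullopt;
    Suffix = IntTok->Text;
  }
  Suffix += IdentTok.Text;
  return Suffix;
}

int main() {
  Tok One{"1", 10, 11}, D{"D", 11, 12}, SpacedD{"D", 13, 14};
  assert(*joinDimSuffix(&One, D) == "1D");
  assert(!joinDimSuffix(&One, SpacedD));
  assert(*joinDimSuffix(nullptr, D) == "D"); // e.g. dim:CUBE has no leading digit
  return 0;
}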
+OperandMatchResultTy AMDGPUAsmParser::parseDPP8(OperandVector &Operands) {
+ SMLoc S = Parser.getTok().getLoc();
+ StringRef Prefix;
+
+ if (getLexer().getKind() == AsmToken::Identifier) {
+ Prefix = Parser.getTok().getString();
+ } else {
+ return MatchOperand_NoMatch;
+ }
+
+ if (Prefix != "dpp8")
+ return parseDPPCtrl(Operands);
+ if (!isGFX10())
+ return MatchOperand_NoMatch;
+
+ // dpp8:[%d,%d,%d,%d,%d,%d,%d,%d]
+
+ int64_t Sels[8];
+
+ Parser.Lex();
+ if (getLexer().isNot(AsmToken::Colon))
+ return MatchOperand_ParseFail;
+
+ Parser.Lex();
+ if (getLexer().isNot(AsmToken::LBrac))
+ return MatchOperand_ParseFail;
+
+ Parser.Lex();
+ if (getParser().parseAbsoluteExpression(Sels[0]))
+ return MatchOperand_ParseFail;
+ if (0 > Sels[0] || 7 < Sels[0])
+ return MatchOperand_ParseFail;
+
+ for (size_t i = 1; i < 8; ++i) {
+ if (getLexer().isNot(AsmToken::Comma))
+ return MatchOperand_ParseFail;
+
+ Parser.Lex();
+ if (getParser().parseAbsoluteExpression(Sels[i]))
+ return MatchOperand_ParseFail;
+ if (0 > Sels[i] || 7 < Sels[i])
+ return MatchOperand_ParseFail;
+ }
+
+ if (getLexer().isNot(AsmToken::RBrac))
+ return MatchOperand_ParseFail;
+ Parser.Lex();
+
+ unsigned DPP8 = 0;
+ for (size_t i = 0; i < 8; ++i)
+ DPP8 |= (Sels[i] << (i * 3));
+
+ Operands.push_back(AMDGPUOperand::CreateImm(this, DPP8, S, AMDGPUOperand::ImmTyDPP8));
+ return MatchOperand_Success;
+}
+
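For reference, the dpp8:[...] selects parsed above pack into a single immediate with 3 bits per lane select, select i occupying bits [3*i+2 : 3*i]. A minimal sketch of that packing (standalone illustration, not the parser code itself):

#include <array>
#include <cassert>
#include <cstdint>
#include <optional>

// Pack eight 3-bit lane selects into one 24-bit dpp8 immediate.
std::optional<uint32_t> packDpp8(const std::array<int64_t, 8> &Sels) {
  uint32_t Imm = 0;
  for (size_t I = 0; I < Sels.size(); ++I) {
    if (Sels[I] < 0 || Sels[I] > 7) // each select addresses a lane within a group of 8
      return std::nullopt;
    Imm |= uint32_t(Sels[I]) << (3 * I);
  }
  return Imm;
}

int main() {
  // dpp8:[7,6,5,4,3,2,1,0] reverses the lanes within each group of eight.
  assert(*packDpp8({7, 6, 5, 4, 3, 2, 1, 0}) == 01234567u); // in octal, digit i (LSB first) is select i
  assert(!packDpp8({0, 0, 0, 0, 0, 0, 0, 8}));              // out of range
  return 0;
}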
OperandMatchResultTy
AMDGPUAsmParser::parseDPPCtrl(OperandVector &Operands) {
using namespace AMDGPU::DPP;
@@ -5201,10 +6435,21 @@ AMDGPUAsmParser::parseDPPCtrl(OperandVector &Operands) {
&& Prefix != "wave_rol"
&& Prefix != "wave_shr"
&& Prefix != "wave_ror"
- && Prefix != "row_bcast") {
+ && Prefix != "row_bcast"
+ && Prefix != "row_share"
+ && Prefix != "row_xmask") {
return MatchOperand_NoMatch;
}
+ if (!isGFX10() && (Prefix == "row_share" || Prefix == "row_xmask"))
+ return MatchOperand_NoMatch;
+
+ if (!isVI() && !isGFX9() &&
+ (Prefix == "wave_shl" || Prefix == "wave_shr" ||
+ Prefix == "wave_rol" || Prefix == "wave_ror" ||
+ Prefix == "row_bcast"))
+ return MatchOperand_NoMatch;
+
Parser.Lex();
if (getLexer().isNot(AsmToken::Colon))
return MatchOperand_ParseFail;
@@ -5262,6 +6507,10 @@ AMDGPUAsmParser::parseDPPCtrl(OperandVector &Operands) {
} else {
return MatchOperand_ParseFail;
}
+ } else if (Prefix == "row_share" && 0 <= Int && Int <= 15) {
+ Int |= DppCtrl::ROW_SHARE_FIRST;
+ } else if (Prefix == "row_xmask" && 0 <= Int && Int <= 15) {
+ Int |= DppCtrl::ROW_XMASK_FIRST;
} else {
return MatchOperand_ParseFail;
}
@@ -5276,6 +6525,10 @@ AMDGPUOperand::Ptr AMDGPUAsmParser::defaultRowMask() const {
return AMDGPUOperand::CreateImm(this, 0xf, SMLoc(), AMDGPUOperand::ImmTyDppRowMask);
}
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultEndpgmImmOperands() const {
+ return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyEndpgm);
+}
+
AMDGPUOperand::Ptr AMDGPUAsmParser::defaultBankMask() const {
return AMDGPUOperand::CreateImm(this, 0xf, SMLoc(), AMDGPUOperand::ImmTyDppBankMask);
}
@@ -5284,7 +6537,11 @@ AMDGPUOperand::Ptr AMDGPUAsmParser::defaultBoundCtrl() const {
return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyDppBoundCtrl);
}
-void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands) {
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultFI() const {
+ return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyDppFi);
+}
+
+void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands, bool IsDPP8) {
OptionalImmIndexMap OptionalIdx;
unsigned I = 1;
@@ -5293,6 +6550,7 @@ void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands) {
((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1);
}
+ int Fi = 0;
for (unsigned E = Operands.size(); I != E; ++I) {
auto TiedTo = Desc.getOperandConstraint(Inst.getNumOperands(),
MCOI::TIED_TO);
@@ -5303,25 +6561,49 @@ void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands) {
}
AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
// Add the register arguments
- if (Op.isReg() && Op.Reg.RegNo == AMDGPU::VCC) {
+ if (Op.isReg() && validateVccOperand(Op.getReg())) {
// VOP2b (v_add_u32, v_sub_u32, ...) dpp forms use the "vcc" token.
// Skip it.
continue;
- } if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) {
- Op.addRegWithFPInputModsOperands(Inst, 2);
- } else if (Op.isDPPCtrl()) {
- Op.addImmOperands(Inst, 1);
- } else if (Op.isImm()) {
- // Handle optional arguments
- OptionalIdx[Op.getImmTy()] = I;
+ }
+
+ if (IsDPP8) {
+ if (Op.isDPP8()) {
+ Op.addImmOperands(Inst, 1);
+ } else if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) {
+ Op.addRegWithFPInputModsOperands(Inst, 2);
+ } else if (Op.isFI()) {
+ Fi = Op.getImm();
+ } else if (Op.isReg()) {
+ Op.addRegOperands(Inst, 1);
+ } else {
+ llvm_unreachable("Invalid operand type");
+ }
} else {
- llvm_unreachable("Invalid operand type");
+ if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) {
+ Op.addRegWithFPInputModsOperands(Inst, 2);
+ } else if (Op.isDPPCtrl()) {
+ Op.addImmOperands(Inst, 1);
+ } else if (Op.isImm()) {
+ // Handle optional arguments
+ OptionalIdx[Op.getImmTy()] = I;
+ } else {
+ llvm_unreachable("Invalid operand type");
+ }
}
}
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppRowMask, 0xf);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppBankMask, 0xf);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppBoundCtrl);
+ if (IsDPP8) {
+ using namespace llvm::AMDGPU::DPP;
+    Inst.addOperand(MCOperand::createImm(Fi ? DPP8_FI_1 : DPP8_FI_0));
+ } else {
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppRowMask, 0xf);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppBankMask, 0xf);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppBoundCtrl);
+ if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::fi) != -1) {
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppFi);
+ }
+ }
}
//===----------------------------------------------------------------------===//
@@ -5422,7 +6704,8 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands,
for (unsigned E = Operands.size(); I != E; ++I) {
AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
- if (skipVcc && !skippedVcc && Op.isReg() && Op.Reg.RegNo == AMDGPU::VCC) {
+ if (skipVcc && !skippedVcc && Op.isReg() &&
+ (Op.getReg() == AMDGPU::VCC || Op.getReg() == AMDGPU::VCC_LO)) {
// VOP2b (v_add_u32, v_sub_u32, ...) sdwa forms use the "vcc" token as dst.
// Skip it if it's 2nd (e.g. v_add_i32_sdwa v1, vcc, v2, v3)
// or 4th (v_addc_u32_sdwa v1, vcc, v2, v3, vcc) operand.
@@ -5448,7 +6731,8 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands,
skippedVcc = false;
}
- if (Inst.getOpcode() != AMDGPU::V_NOP_sdwa_gfx9 &&
+ if (Inst.getOpcode() != AMDGPU::V_NOP_sdwa_gfx10 &&
+ Inst.getOpcode() != AMDGPU::V_NOP_sdwa_gfx9 &&
Inst.getOpcode() != AMDGPU::V_NOP_sdwa_vi) {
// v_nop_sdwa_vi/gfx9/gfx10 have no optional sdwa arguments
switch (BasicInstType) {
@@ -5474,7 +6758,8 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands,
break;
case SIInstrFlags::VOPC:
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0);
+ if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::clamp) != -1)
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, SdwaSel::DWORD);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc1Sel, SdwaSel::DWORD);
break;
@@ -5495,6 +6780,22 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands,
}
}
+//===----------------------------------------------------------------------===//
+// mAI
+//===----------------------------------------------------------------------===//
+
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultBLGP() const {
+ return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyBLGP);
+}
+
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultCBSZ() const {
+ return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyCBSZ);
+}
+
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultABID() const {
+ return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyABID);
+}
+
/// Force static initialization.
extern "C" void LLVMInitializeAMDGPUAsmParser() {
RegisterMCAsmParser<AMDGPUAsmParser> A(getTheAMDGPUTarget());
@@ -5552,3 +6853,28 @@ unsigned AMDGPUAsmParser::validateTargetOperandClass(MCParsedAsmOperand &Op,
return Match_InvalidOperand;
}
}
+
+//===----------------------------------------------------------------------===//
+// endpgm
+//===----------------------------------------------------------------------===//
+
+OperandMatchResultTy AMDGPUAsmParser::parseEndpgmOp(OperandVector &Operands) {
+ SMLoc S = Parser.getTok().getLoc();
+ int64_t Imm = 0;
+
+ if (!parseExpr(Imm)) {
+    // The operand is optional; if not present, default to 0.
+ Imm = 0;
+ }
+
+ if (!isUInt<16>(Imm)) {
+ Error(S, "expected a 16-bit value");
+ return MatchOperand_ParseFail;
+ }
+
+ Operands.push_back(
+ AMDGPUOperand::CreateImm(this, Imm, S, AMDGPUOperand::ImmTyEndpgm));
+ return MatchOperand_Success;
+}
+
+bool AMDGPUOperand::isEndpgm() const { return isImmTy(ImmTyEndpgm); }
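A compact sketch of the optional-immediate rule implemented above for s_endpgm: an absent operand defaults to 0, and an explicit value must fit in 16 bits (standalone illustration, not the parser API):

#include <cassert>
#include <cstdint>
#include <optional>

// Returns the encoded value, or nullopt for "expected a 16-bit value".
std::optional<uint16_t> endpgmImm(std::optional<int64_t> Parsed) {
  int64_t Imm = Parsed.value_or(0); // operand omitted -> default 0
  if (Imm < 0 || Imm > 0xFFFF)
    return std::nullopt;
  return static_cast<uint16_t>(Imm);
}

int main() {
  assert(*endpgmImm(std::nullopt) == 0);
  assert(*endpgmImm(42) == 42);
  assert(!endpgmImm(0x10000));
  return 0;
}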
diff --git a/lib/Target/AMDGPU/BUFInstructions.td b/lib/Target/AMDGPU/BUFInstructions.td
index 51c2abeac2ff..62a19d848af2 100644
--- a/lib/Target/AMDGPU/BUFInstructions.td
+++ b/lib/Target/AMDGPU/BUFInstructions.td
@@ -1,37 +1,22 @@
//===-- BUFInstructions.td - Buffer Instruction Definitions ---------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
def MUBUFAddr32 : ComplexPattern<i64, 9, "SelectMUBUFAddr32">;
-def MUBUFAddr64 : ComplexPattern<i64, 7, "SelectMUBUFAddr64">;
+def MUBUFAddr64 : ComplexPattern<i64, 8, "SelectMUBUFAddr64">;
def MUBUFAddr64Atomic : ComplexPattern<i64, 5, "SelectMUBUFAddr64">;
def MUBUFScratchOffen : ComplexPattern<i64, 4, "SelectMUBUFScratchOffen", [], [SDNPWantParent]>;
def MUBUFScratchOffset : ComplexPattern<i64, 3, "SelectMUBUFScratchOffset", [], [SDNPWantParent], 20>;
-def MUBUFOffset : ComplexPattern<i64, 6, "SelectMUBUFOffset">;
+def MUBUFOffset : ComplexPattern<i64, 7, "SelectMUBUFOffset">;
def MUBUFOffsetNoGLC : ComplexPattern<i64, 3, "SelectMUBUFOffset">;
def MUBUFOffsetAtomic : ComplexPattern<i64, 4, "SelectMUBUFOffset">;
-class MubufLoad <SDPatternOperator op> : PatFrag <
- (ops node:$ptr), (op node:$ptr), [{
- auto const AS = cast<MemSDNode>(N)->getAddressSpace();
- return AS == AMDGPUAS::GLOBAL_ADDRESS ||
- AS == AMDGPUAS::CONSTANT_ADDRESS;
-}]>;
-
-def mubuf_load : MubufLoad <load>;
-def mubuf_az_extloadi8 : MubufLoad <az_extloadi8>;
-def mubuf_sextloadi8 : MubufLoad <sextloadi8>;
-def mubuf_az_extloadi16 : MubufLoad <az_extloadi16>;
-def mubuf_sextloadi16 : MubufLoad <sextloadi16>;
-def mubuf_load_atomic : MubufLoad <atomic_load>;
-
def BUFAddrKind {
int Offset = 0;
int OffEn = 1;
@@ -97,7 +82,9 @@ class MTBUF_Pseudo <string opName, dag outs, dag ins,
bits<1> has_vdata = 1;
bits<1> has_vaddr = 1;
bits<1> has_glc = 1;
+ bits<1> has_dlc = 1;
bits<1> glc_value = 0; // the value for glc if no such operand
+ bits<1> dlc_value = 0; // the value for dlc if no such operand
bits<1> has_srsrc = 1;
bits<1> has_soffset = 1;
bits<1> has_offset = 1;
@@ -120,6 +107,7 @@ class MTBUF_Real <MTBUF_Pseudo ps> :
bits<12> offset;
bits<1> glc;
+ bits<1> dlc;
bits<7> format;
bits<8> vaddr;
bits<8> vdata;
@@ -138,17 +126,17 @@ class getMTBUFInsDA<list<RegisterClass> vdataList,
RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList));
dag InsNoData = !if(!empty(vaddrList),
(ins SReg_128:$srsrc, SCSrc_b32:$soffset,
- offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe),
+ offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe, DLC:$dlc),
(ins vaddrClass:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset,
- offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe)
+ offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe, DLC:$dlc)
);
dag InsData = !if(!empty(vaddrList),
(ins vdataClass:$vdata, SReg_128:$srsrc,
SCSrc_b32:$soffset, offset:$offset, FORMAT:$format, GLC:$glc,
- SLC:$slc, TFE:$tfe),
+ SLC:$slc, TFE:$tfe, DLC:$dlc),
(ins vdataClass:$vdata, vaddrClass:$vaddr, SReg_128:$srsrc,
SCSrc_b32:$soffset, offset:$offset, FORMAT:$format, GLC:$glc,
- SLC:$slc, TFE:$tfe)
+ SLC:$slc, TFE:$tfe, DLC:$dlc)
);
dag ret = !if(!empty(vdataList), InsNoData, InsData);
}
@@ -199,7 +187,7 @@ class MTBUF_Load_Pseudo <string opName,
: MTBUF_Pseudo<opName,
(outs vdataClass:$vdata),
getMTBUFIns<addrKindCopy>.ret,
- " $vdata, " # getMTBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe",
+ " $vdata, " # getMTBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe$dlc",
pattern>,
MTBUF_SetupAddr<addrKindCopy> {
let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret;
@@ -214,13 +202,13 @@ multiclass MTBUF_Pseudo_Loads<string opName, RegisterClass vdataClass,
def _OFFSET : MTBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass,
[(set load_vt:$vdata,
(ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i8:$format,
- i1:$glc, i1:$slc, i1:$tfe)))]>,
+ i1:$glc, i1:$slc, i1:$tfe, i1:$dlc)))]>,
MTBUFAddr64Table<0, NAME>;
def _ADDR64 : MTBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, vdataClass,
[(set load_vt:$vdata,
(ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset,
- i8:$format, i1:$glc, i1:$slc, i1:$tfe)))]>,
+ i8:$format, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc)))]>,
MTBUFAddr64Table<1, NAME>;
def _OFFEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
@@ -245,7 +233,7 @@ class MTBUF_Store_Pseudo <string opName,
: MTBUF_Pseudo<opName,
(outs),
getMTBUFIns<addrKindCopy, [vdataClassCopy]>.ret,
- " $vdata, " # getMTBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe",
+ " $vdata, " # getMTBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe$dlc",
pattern>,
MTBUF_SetupAddr<addrKindCopy> {
let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret;
@@ -260,13 +248,13 @@ multiclass MTBUF_Pseudo_Stores<string opName, RegisterClass vdataClass,
def _OFFSET : MTBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass,
[(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset,
i16:$offset, i8:$format, i1:$glc,
- i1:$slc, i1:$tfe))]>,
+ i1:$slc, i1:$tfe, i1:$dlc))]>,
MTBUFAddr64Table<0, NAME>;
def _ADDR64 : MTBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, vdataClass,
[(st store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
i16:$offset, i8:$format, i1:$glc,
- i1:$slc, i1:$tfe))]>,
+ i1:$slc, i1:$tfe, i1:$dlc))]>,
MTBUFAddr64Table<1, NAME>;
def _OFFEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
@@ -324,7 +312,9 @@ class MUBUF_Pseudo <string opName, dag outs, dag ins,
bits<1> has_vdata = 1;
bits<1> has_vaddr = 1;
bits<1> has_glc = 1;
+ bits<1> has_dlc = 1;
bits<1> glc_value = 0; // the value for glc if no such operand
+ bits<1> dlc_value = 0; // the value for dlc if no such operand
bits<1> has_srsrc = 1;
bits<1> has_soffset = 1;
bits<1> has_offset = 1;
@@ -333,7 +323,7 @@ class MUBUF_Pseudo <string opName, dag outs, dag ins,
bits<4> dwords = 0;
}
-class MUBUF_Real <bits<7> op, MUBUF_Pseudo ps> :
+class MUBUF_Real <MUBUF_Pseudo ps> :
InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []> {
let isPseudo = 0;
@@ -348,6 +338,7 @@ class MUBUF_Real <bits<7> op, MUBUF_Pseudo ps> :
bits<12> offset;
bits<1> glc;
+ bits<1> dlc;
bits<8> vaddr;
bits<8> vdata;
bits<7> srsrc;
@@ -358,7 +349,7 @@ class MUBUF_Real <bits<7> op, MUBUF_Pseudo ps> :
// For cache invalidation instructions.
-class MUBUF_Invalidate <string opName, SDPatternOperator node> :
+class MUBUF_Invalidate <string opName, SDPatternOperator node = null_frag> :
MUBUF_Pseudo<opName, (outs), (ins), "", [(node)]> {
let AsmMatchConverter = "";
@@ -373,7 +364,9 @@ class MUBUF_Invalidate <string opName, SDPatternOperator node> :
let has_vdata = 0;
let has_vaddr = 0;
let has_glc = 0;
+ let has_dlc = 0;
let glc_value = 0;
+ let dlc_value = 0;
let has_srsrc = 0;
let has_soffset = 0;
let has_offset = 0;
@@ -400,7 +393,7 @@ class getMUBUFInsDA<list<RegisterClass> vdataList,
);
dag ret = !con(
!if(!empty(vdataList), InsNoData, InsData),
- !if(isLds, (ins), (ins TFE:$tfe))
+ !if(isLds, (ins DLC:$dlc), (ins TFE:$tfe, DLC:$dlc))
);
}
@@ -460,7 +453,7 @@ class MUBUF_Load_Pseudo <string opName,
!con(getMUBUFIns<addrKindCopy, [], isLds>.ret,
!if(HasTiedDest, (ins vdataClass:$vdata_in), (ins))),
" $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$glc$slc" #
- !if(isLds, " lds", "$tfe"),
+ !if(isLds, " lds", "$tfe") # "$dlc",
pattern>,
MUBUF_SetupAddr<addrKindCopy> {
let PseudoInstr = opName # !if(isLds, "_lds", "") #
@@ -477,6 +470,24 @@ class MUBUF_Load_Pseudo <string opName,
let dwords = getMUBUFDwords<vdataClass>.ret;
}
+class MUBUF_Offset_Load_Pat <Instruction inst, ValueType load_vt = i32, SDPatternOperator ld = null_frag> : Pat <
+ (load_vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))),
+ (load_vt (inst v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))
+>;
+
+class MUBUF_Addr64_Load_Pat <Instruction inst,
+ ValueType load_vt = i32,
+ SDPatternOperator ld = null_frag> : Pat <
+ (load_vt (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))),
+ (load_vt (inst i64:$vaddr, v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))
+>;
+
+multiclass MUBUF_Pseudo_Load_Pats<string BaseInst, ValueType load_vt = i32, SDPatternOperator ld = null_frag> {
+ def : MUBUF_Offset_Load_Pat<!cast<Instruction>(BaseInst#"_OFFSET"), load_vt, ld>;
+ def : MUBUF_Addr64_Load_Pat<!cast<Instruction>(BaseInst#"_ADDR64"), load_vt, ld>;
+}
+
// FIXME: tfe can't be an operand because it needs an N+1 register class
// dest register, which requires a separate opcode.
multiclass MUBUF_Pseudo_Loads<string opName, RegisterClass vdataClass,
@@ -485,20 +496,10 @@ multiclass MUBUF_Pseudo_Loads<string opName, RegisterClass vdataClass,
bit TiedDest = 0,
bit isLds = 0> {
- def _OFFSET : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass,
- TiedDest, isLds,
- !if(isLds,
- [],
- [(set load_vt:$vdata,
- (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe)))])>,
+ def _OFFSET : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass, TiedDest, isLds>,
MUBUFAddr64Table<0, NAME # !if(isLds, "_LDS", "")>;
- def _ADDR64 : MUBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, vdataClass,
- TiedDest, isLds,
- !if(isLds,
- [],
- [(set load_vt:$vdata,
- (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe)))])>,
+ def _ADDR64 : MUBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, TiedDest, isLds>,
MUBUFAddr64Table<1, NAME # !if(isLds, "_LDS", "")>;
def _OFFEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, TiedDest, isLds>;
@@ -531,7 +532,7 @@ class MUBUF_Store_Pseudo <string opName,
: MUBUF_Pseudo<opName,
(outs),
getMUBUFIns<addrKindCopy, [vdataClassCopy]>.ret,
- " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe",
+ " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe$dlc",
pattern>,
MUBUF_SetupAddr<addrKindCopy> {
let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret;
@@ -547,12 +548,12 @@ multiclass MUBUF_Pseudo_Stores<string opName, RegisterClass vdataClass,
def _OFFSET : MUBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass,
[(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset,
- i16:$offset, i1:$glc, i1:$slc, i1:$tfe))]>,
+ i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))]>,
MUBUFAddr64Table<0, NAME>;
def _ADDR64 : MUBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, vdataClass,
[(st store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
- i16:$offset, i1:$glc, i1:$slc, i1:$tfe))]>,
+ i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))]>,
MUBUFAddr64Table<1, NAME>;
def _OFFEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
@@ -638,6 +639,7 @@ class MUBUF_Atomic_Pseudo<string opName,
let hasSideEffects = 1;
let DisableWQM = 1;
let has_glc = 0;
+ let has_dlc = 0;
let has_tfe = 0;
let maybeAtomic = 1;
}
@@ -656,6 +658,7 @@ class MUBUF_AtomicNoRet_Pseudo<string opName, int addrKind,
AtomicNoRet<opName # "_" # getAddrName<addrKindCopy>.ret, 0> {
let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret;
let glc_value = 0;
+ let dlc_value = 0;
let AsmMatchConverter = "cvtMubufAtomic";
}
@@ -673,6 +676,7 @@ class MUBUF_AtomicRet_Pseudo<string opName, int addrKind,
AtomicNoRet<opName # "_" # getAddrName<addrKindCopy>.ret, 1> {
let PseudoInstr = opName # "_rtn_" # getAddrName<addrKindCopy>.ret;
let glc_value = 1;
+ let dlc_value = 0;
let Constraints = "$vdata = $vdata_in";
let DisableEncoding = "$vdata_in";
let AsmMatchConverter = "cvtMubufAtomicReturn";
@@ -681,34 +685,53 @@ class MUBUF_AtomicRet_Pseudo<string opName, int addrKind,
multiclass MUBUF_Pseudo_Atomics_NO_RTN <string opName,
RegisterClass vdataClass,
ValueType vdataType,
- SDPatternOperator atomic> {
+ SDPatternOperator atomic,
+ bit isFP = getIsFP<vdataType>.ret> {
+ let FPAtomic = isFP in
def _OFFSET : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass>,
MUBUFAddr64Table <0, NAME>;
+
+ let FPAtomic = isFP in
def _ADDR64 : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass>,
MUBUFAddr64Table <1, NAME>;
+
+ let FPAtomic = isFP in
def _OFFEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
+
+ let FPAtomic = isFP in
def _IDXEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
+
+ let FPAtomic = isFP in
def _BOTHEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
}
multiclass MUBUF_Pseudo_Atomics_RTN <string opName,
RegisterClass vdataClass,
ValueType vdataType,
- SDPatternOperator atomic> {
+ SDPatternOperator atomic,
+ bit isFP = getIsFP<vdataType>.ret> {
+ let FPAtomic = isFP in
def _OFFSET_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass,
[(set vdataType:$vdata,
(atomic (MUBUFOffsetAtomic v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$slc),
vdataType:$vdata_in))]>,
MUBUFAddr64Table <0, NAME # "_RTN">;
+ let FPAtomic = isFP in
def _ADDR64_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass,
[(set vdataType:$vdata,
(atomic (MUBUFAddr64Atomic v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$slc),
vdataType:$vdata_in))]>,
MUBUFAddr64Table <1, NAME # "_RTN">;
+ let FPAtomic = isFP in
def _OFFEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
+
+ let FPAtomic = isFP in
def _IDXEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
+
+ let FPAtomic = isFP in
def _BOTHEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
}
@@ -804,34 +827,45 @@ let SubtargetPredicate = HasPackedD16VMem, D16Buf = 1 in {
} // End HasPackedD16VMem.
defm BUFFER_LOAD_UBYTE : MUBUF_Pseudo_Loads_Lds <
- "buffer_load_ubyte", VGPR_32, i32, mubuf_az_extloadi8
+ "buffer_load_ubyte", VGPR_32, i32
>;
defm BUFFER_LOAD_SBYTE : MUBUF_Pseudo_Loads_Lds <
- "buffer_load_sbyte", VGPR_32, i32, mubuf_sextloadi8
+ "buffer_load_sbyte", VGPR_32, i32
>;
defm BUFFER_LOAD_USHORT : MUBUF_Pseudo_Loads_Lds <
- "buffer_load_ushort", VGPR_32, i32, mubuf_az_extloadi16
+ "buffer_load_ushort", VGPR_32, i32
>;
defm BUFFER_LOAD_SSHORT : MUBUF_Pseudo_Loads_Lds <
- "buffer_load_sshort", VGPR_32, i32, mubuf_sextloadi16
+ "buffer_load_sshort", VGPR_32, i32
>;
defm BUFFER_LOAD_DWORD : MUBUF_Pseudo_Loads_Lds <
- "buffer_load_dword", VGPR_32, i32, mubuf_load
+ "buffer_load_dword", VGPR_32, i32
>;
defm BUFFER_LOAD_DWORDX2 : MUBUF_Pseudo_Loads <
- "buffer_load_dwordx2", VReg_64, v2i32, mubuf_load
+ "buffer_load_dwordx2", VReg_64, v2i32
>;
defm BUFFER_LOAD_DWORDX3 : MUBUF_Pseudo_Loads <
- "buffer_load_dwordx3", VReg_96, untyped, mubuf_load
+ "buffer_load_dwordx3", VReg_96, v3i32
>;
defm BUFFER_LOAD_DWORDX4 : MUBUF_Pseudo_Loads <
- "buffer_load_dwordx4", VReg_128, v4i32, mubuf_load
+ "buffer_load_dwordx4", VReg_128, v4i32
>;
+defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i32, extloadi8_global>;
+defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i32, zextloadi8_global>;
+defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_SBYTE", i32, sextloadi8_global>;
+defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_USHORT", i32, extloadi16_global>;
+defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_USHORT", i32, zextloadi16_global>;
+defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_SSHORT", i32, sextloadi16_global>;
+defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORD", i32, load_global>;
+defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX2", v2i32, load_global>;
+defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX3", v3i32, load_global>;
+defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX4", v4i32, load_global>;
+
// This is not described in AMD documentation,
// but 'lds' versions of these opcodes are available
// in at least GFX8+ chips. See Bug 37653.
-let SubtargetPredicate = isVI in {
+let SubtargetPredicate = isGFX8GFX9 in {
defm BUFFER_LOAD_DWORDX2_LDS : MUBUF_Pseudo_Loads <
"buffer_load_dwordx2", VReg_64, v2i32, null_frag, 0, 1
>;
@@ -856,7 +890,7 @@ defm BUFFER_STORE_DWORDX2 : MUBUF_Pseudo_Stores <
"buffer_store_dwordx2", VReg_64, v2i32, store_global
>;
defm BUFFER_STORE_DWORDX3 : MUBUF_Pseudo_Stores <
- "buffer_store_dwordx3", VReg_96, untyped, store_global
+ "buffer_store_dwordx3", VReg_96, v3i32, store_global
>;
defm BUFFER_STORE_DWORDX4 : MUBUF_Pseudo_Stores <
"buffer_store_dwordx4", VReg_128, v4i32, store_global
@@ -940,11 +974,11 @@ defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Pseudo_Atomics <
"buffer_atomic_dec_x2", VReg_64, i64, atomic_dec_global
>;
-let SubtargetPredicate = isVI in {
+let SubtargetPredicate = isGFX8GFX9 in {
def BUFFER_STORE_LDS_DWORD : MUBUF_Pseudo_Store_Lds <"buffer_store_lds_dword">;
}
-let SubtargetPredicate = isSI in { // isn't on CI & VI
+let SubtargetPredicate = isGFX6 in { // isn't on CI & VI
/*
defm BUFFER_ATOMIC_RSUB : MUBUF_Pseudo_Atomics <"buffer_atomic_rsub">;
defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Pseudo_Atomics <"buffer_atomic_fcmpswap">;
@@ -1006,17 +1040,28 @@ defm BUFFER_STORE_FORMAT_D16_HI_X : MUBUF_Pseudo_Stores <
def BUFFER_WBINVL1 : MUBUF_Invalidate <"buffer_wbinvl1",
int_amdgcn_buffer_wbinvl1>;
+let SubtargetPredicate = HasAtomicFaddInsts in {
+
+defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_NO_RTN <
+ "buffer_atomic_add_f32", VGPR_32, f32, atomic_add_global
+>;
+defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_NO_RTN <
+ "buffer_atomic_pk_add_f16", VGPR_32, v2f16, atomic_add_global
+>;
+
+} // End SubtargetPredicate = HasAtomicFaddInsts
+
//===----------------------------------------------------------------------===//
// MTBUF Instructions
//===----------------------------------------------------------------------===//
defm TBUFFER_LOAD_FORMAT_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_x", VGPR_32>;
defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_xy", VReg_64>;
-defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyz", VReg_128>;
+defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyz", VReg_96>;
defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyzw", VReg_128>;
defm TBUFFER_STORE_FORMAT_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_x", VGPR_32>;
defm TBUFFER_STORE_FORMAT_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_xy", VReg_64>;
-defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyz", VReg_128>;
+defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyz", VReg_96>;
defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyzw", VReg_128>;
let SubtargetPredicate = HasUnpackedD16VMem, D16Buf = 1 in {
@@ -1041,19 +1086,21 @@ let SubtargetPredicate = HasPackedD16VMem, D16Buf = 1 in {
defm TBUFFER_STORE_FORMAT_D16_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyzw", VReg_64>;
} // End HasPackedD16VMem.
-let SubtargetPredicate = isCIVI in {
+let SubtargetPredicate = isGFX7Plus in {
//===----------------------------------------------------------------------===//
// Instruction definitions for CI and newer.
//===----------------------------------------------------------------------===//
-// Remaining instructions:
-// BUFFER_LOAD_DWORDX3
-// BUFFER_STORE_DWORDX3
def BUFFER_WBINVL1_VOL : MUBUF_Invalidate <"buffer_wbinvl1_vol",
int_amdgcn_buffer_wbinvl1_vol>;
-} // End let SubtargetPredicate = isCIVI
+} // End let SubtargetPredicate = isGFX7Plus
+
+let SubtargetPredicate = isGFX10Plus in {
+ def BUFFER_GL0_INV : MUBUF_Invalidate<"buffer_gl0_inv">;
+ def BUFFER_GL1_INV : MUBUF_Invalidate<"buffer_gl1_inv">;
+} // End SubtargetPredicate = isGFX10Plus
//===----------------------------------------------------------------------===//
// MUBUF Patterns
@@ -1067,6 +1114,10 @@ def extract_slc : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant((N->getZExtValue() >> 1) & 1, SDLoc(N), MVT::i8);
}]>;
+def extract_dlc : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant((N->getZExtValue() >> 2) & 1, SDLoc(N), MVT::i8);
+}]>;
+
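The extract_* node transforms above slice individual cache-control bits out of the intrinsic cachepolicy immediate: slc is bit 1 and, new for gfx10, dlc is bit 2; glc is assumed here to occupy bit 0, consistent with those extractors. A small sketch of the decoding:

#include <cassert>
#include <cstdint>

struct CachePolicy { bool GLC, SLC, DLC; };

// Mirror of the extract_glc / extract_slc / extract_dlc transforms
// (bit 0 for glc is an assumption inferred from the >>1 and >>2 shifts above).
CachePolicy decodeCachePolicy(uint64_t Imm) {
  return { (Imm & 1) != 0, ((Imm >> 1) & 1) != 0, ((Imm >> 2) & 1) != 0 };
}

int main() {
  CachePolicy CP = decodeCachePolicy(0b101); // glc | dlc
  assert(CP.GLC && !CP.SLC && CP.DLC);
  return 0;
}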
//===----------------------------------------------------------------------===//
// buffer_load/store_format patterns
//===----------------------------------------------------------------------===//
@@ -1077,21 +1128,21 @@ multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
(vt (name v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset,
imm:$cachepolicy, 0)),
(!cast<MUBUF_Pseudo>(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset),
- (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
+ (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
>;
def : GCNPat<
(vt (name v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset,
imm:$cachepolicy, 0)),
(!cast<MUBUF_Pseudo>(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset),
- (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
+ (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
>;
def : GCNPat<
(vt (name v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset,
imm:$cachepolicy, imm)),
(!cast<MUBUF_Pseudo>(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset),
- (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
+ (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
>;
def : GCNPat<
@@ -1100,7 +1151,7 @@ multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
(!cast<MUBUF_Pseudo>(opcode # _BOTHEN)
(REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
$rsrc, $soffset, (as_i16imm $offset),
- (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
+ (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
>;
}
@@ -1108,6 +1159,8 @@ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, f32, "BUFFER_LOAD_FORMAT_X">
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, i32, "BUFFER_LOAD_FORMAT_X">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v2f32, "BUFFER_LOAD_FORMAT_XY">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v2i32, "BUFFER_LOAD_FORMAT_XY">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v3f32, "BUFFER_LOAD_FORMAT_XYZ">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v3i32, "BUFFER_LOAD_FORMAT_XYZ">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v4f32, "BUFFER_LOAD_FORMAT_XYZW">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v4i32, "BUFFER_LOAD_FORMAT_XYZW">;
@@ -1131,8 +1184,14 @@ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, f32, "BUFFER_LOAD_DWORD">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, i32, "BUFFER_LOAD_DWORD">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2f32, "BUFFER_LOAD_DWORDX2">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2i32, "BUFFER_LOAD_DWORDX2">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v3f32, "BUFFER_LOAD_DWORDX3">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v3i32, "BUFFER_LOAD_DWORDX3">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4f32, "BUFFER_LOAD_DWORDX4">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4i32, "BUFFER_LOAD_DWORDX4">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_byte, i32, "BUFFER_LOAD_SBYTE">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_short, i32, "BUFFER_LOAD_SSHORT">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_ubyte, i32, "BUFFER_LOAD_UBYTE">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_ushort, i32, "BUFFER_LOAD_USHORT">;
multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
string opcode> {
@@ -1140,21 +1199,23 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
(name vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset,
imm:$cachepolicy, 0),
(!cast<MUBUF_Pseudo>(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset, (as_i16imm $offset),
- (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
+ (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
>;
def : GCNPat<
(name vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset,
imm:$cachepolicy, 0),
(!cast<MUBUF_Pseudo>(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset,
- (as_i16imm $offset), (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
+ (as_i16imm $offset), (extract_glc $cachepolicy),
+ (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
>;
def : GCNPat<
(name vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset,
imm:$cachepolicy, imm),
(!cast<MUBUF_Pseudo>(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset,
- (as_i16imm $offset), (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
+ (as_i16imm $offset), (extract_glc $cachepolicy),
+ (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
>;
def : GCNPat<
@@ -1163,8 +1224,8 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
(!cast<MUBUF_Pseudo>(opcode # _BOTHEN_exact)
$vdata,
(REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
- $rsrc, $soffset, (as_i16imm $offset),
- (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
+ $rsrc, $soffset, (as_i16imm $offset), (extract_glc $cachepolicy),
+ (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
>;
}
@@ -1172,6 +1233,8 @@ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, f32, "BUFFER_STORE_FORMAT_
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, i32, "BUFFER_STORE_FORMAT_X">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v2f32, "BUFFER_STORE_FORMAT_XY">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v2i32, "BUFFER_STORE_FORMAT_XY">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v3f32, "BUFFER_STORE_FORMAT_XYZ">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v3i32, "BUFFER_STORE_FORMAT_XYZ">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v4f32, "BUFFER_STORE_FORMAT_XYZW">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v4i32, "BUFFER_STORE_FORMAT_XYZW">;
@@ -1195,42 +1258,47 @@ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, f32, "BUFFER_STORE_DWORD">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, i32, "BUFFER_STORE_DWORD">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2f32, "BUFFER_STORE_DWORDX2">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2i32, "BUFFER_STORE_DWORDX2">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v3f32, "BUFFER_STORE_DWORDX3">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v3i32, "BUFFER_STORE_DWORDX3">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4f32, "BUFFER_STORE_DWORDX4">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4i32, "BUFFER_STORE_DWORDX4">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_byte, i32, "BUFFER_STORE_BYTE">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_short, i32, "BUFFER_STORE_SHORT">;
//===----------------------------------------------------------------------===//
// buffer_atomic patterns
//===----------------------------------------------------------------------===//
-multiclass BufferAtomicPatterns<SDPatternOperator name, string opcode> {
+multiclass BufferAtomicPatterns<SDPatternOperator name, ValueType vt,
+ string opcode> {
def : GCNPat<
- (name i32:$vdata_in, v4i32:$rsrc, 0,
+ (vt (name vt:$vdata_in, v4i32:$rsrc, 0,
0, i32:$soffset, imm:$offset,
- imm:$cachepolicy, 0),
+ imm:$cachepolicy, 0)),
(!cast<MUBUF_Pseudo>(opcode # _OFFSET_RTN) $vdata_in, $rsrc, $soffset,
(as_i16imm $offset), (extract_slc $cachepolicy))
>;
def : GCNPat<
- (name i32:$vdata_in, v4i32:$rsrc, i32:$vindex,
+ (vt (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex,
0, i32:$soffset, imm:$offset,
- imm:$cachepolicy, imm),
+ imm:$cachepolicy, imm)),
(!cast<MUBUF_Pseudo>(opcode # _IDXEN_RTN) $vdata_in, $vindex, $rsrc, $soffset,
(as_i16imm $offset), (extract_slc $cachepolicy))
>;
def : GCNPat<
- (name i32:$vdata_in, v4i32:$rsrc, 0,
+ (vt (name vt:$vdata_in, v4i32:$rsrc, 0,
i32:$voffset, i32:$soffset, imm:$offset,
- imm:$cachepolicy, 0),
+ imm:$cachepolicy, 0)),
(!cast<MUBUF_Pseudo>(opcode # _OFFEN_RTN) $vdata_in, $voffset, $rsrc, $soffset,
(as_i16imm $offset), (extract_slc $cachepolicy))
>;
def : GCNPat<
- (name i32:$vdata_in, v4i32:$rsrc, i32:$vindex,
+ (vt (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex,
i32:$voffset, i32:$soffset, imm:$offset,
- imm:$cachepolicy, imm),
+ imm:$cachepolicy, imm)),
(!cast<MUBUF_Pseudo>(opcode # _BOTHEN_RTN)
$vdata_in,
(REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
@@ -1238,16 +1306,66 @@ multiclass BufferAtomicPatterns<SDPatternOperator name, string opcode> {
>;
}
-defm : BufferAtomicPatterns<SIbuffer_atomic_swap, "BUFFER_ATOMIC_SWAP">;
-defm : BufferAtomicPatterns<SIbuffer_atomic_add, "BUFFER_ATOMIC_ADD">;
-defm : BufferAtomicPatterns<SIbuffer_atomic_sub, "BUFFER_ATOMIC_SUB">;
-defm : BufferAtomicPatterns<SIbuffer_atomic_smin, "BUFFER_ATOMIC_SMIN">;
-defm : BufferAtomicPatterns<SIbuffer_atomic_umin, "BUFFER_ATOMIC_UMIN">;
-defm : BufferAtomicPatterns<SIbuffer_atomic_smax, "BUFFER_ATOMIC_SMAX">;
-defm : BufferAtomicPatterns<SIbuffer_atomic_umax, "BUFFER_ATOMIC_UMAX">;
-defm : BufferAtomicPatterns<SIbuffer_atomic_and, "BUFFER_ATOMIC_AND">;
-defm : BufferAtomicPatterns<SIbuffer_atomic_or, "BUFFER_ATOMIC_OR">;
-defm : BufferAtomicPatterns<SIbuffer_atomic_xor, "BUFFER_ATOMIC_XOR">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_swap, i32, "BUFFER_ATOMIC_SWAP">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_add, i32, "BUFFER_ATOMIC_ADD">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_sub, i32, "BUFFER_ATOMIC_SUB">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_smin, i32, "BUFFER_ATOMIC_SMIN">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_umin, i32, "BUFFER_ATOMIC_UMIN">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_smax, i32, "BUFFER_ATOMIC_SMAX">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_umax, i32, "BUFFER_ATOMIC_UMAX">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_and, i32, "BUFFER_ATOMIC_AND">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_or, i32, "BUFFER_ATOMIC_OR">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_xor, i32, "BUFFER_ATOMIC_XOR">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_swap, i64, "BUFFER_ATOMIC_SWAP_X2">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_add, i64, "BUFFER_ATOMIC_ADD_X2">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_sub, i64, "BUFFER_ATOMIC_SUB_X2">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_smin, i64, "BUFFER_ATOMIC_SMIN_X2">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_umin, i64, "BUFFER_ATOMIC_UMIN_X2">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_smax, i64, "BUFFER_ATOMIC_SMAX_X2">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_umax, i64, "BUFFER_ATOMIC_UMAX_X2">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_and, i64, "BUFFER_ATOMIC_AND_X2">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_or, i64, "BUFFER_ATOMIC_OR_X2">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_xor, i64, "BUFFER_ATOMIC_XOR_X2">;
+
+multiclass BufferAtomicPatterns_NO_RTN<SDPatternOperator name, ValueType vt,
+ string opcode> {
+ def : GCNPat<
+ (name vt:$vdata_in, v4i32:$rsrc, 0,
+ 0, i32:$soffset, imm:$offset,
+ imm:$cachepolicy, 0),
+ (!cast<MUBUF_Pseudo>(opcode # _OFFSET) $vdata_in, $rsrc, $soffset,
+ (as_i16imm $offset), (extract_slc $cachepolicy))
+ >;
+
+ def : GCNPat<
+ (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex,
+ 0, i32:$soffset, imm:$offset,
+ imm:$cachepolicy, imm),
+ (!cast<MUBUF_Pseudo>(opcode # _IDXEN) $vdata_in, $vindex, $rsrc, $soffset,
+ (as_i16imm $offset), (extract_slc $cachepolicy))
+ >;
+
+ def : GCNPat<
+ (name vt:$vdata_in, v4i32:$rsrc, 0,
+ i32:$voffset, i32:$soffset, imm:$offset,
+ imm:$cachepolicy, 0),
+ (!cast<MUBUF_Pseudo>(opcode # _OFFEN) $vdata_in, $voffset, $rsrc, $soffset,
+ (as_i16imm $offset), (extract_slc $cachepolicy))
+ >;
+
+ def : GCNPat<
+ (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex,
+ i32:$voffset, i32:$soffset, imm:$offset,
+ imm:$cachepolicy, imm),
+ (!cast<MUBUF_Pseudo>(opcode # _BOTHEN)
+ $vdata_in,
+ (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
+ $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy))
+ >;
+}
+
+defm : BufferAtomicPatterns_NO_RTN<SIbuffer_atomic_fadd, f32, "BUFFER_ATOMIC_ADD_F32">;
+defm : BufferAtomicPatterns_NO_RTN<SIbuffer_atomic_pk_fadd, v2f16, "BUFFER_ATOMIC_PK_ADD_F16">;
def : GCNPat<
(SIbuffer_atomic_cmpswap
@@ -1298,12 +1416,11 @@ def : GCNPat<
sub0)
>;
-
class MUBUFLoad_PatternADDR64 <MUBUF_Pseudo Instr_ADDR64, ValueType vt,
PatFrag constant_ld> : GCNPat <
(vt (constant_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
- i16:$offset, i1:$glc, i1:$slc, i1:$tfe))),
- (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, $glc, $slc, $tfe)
+ i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))),
+ (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, $glc, $slc, $tfe, $dlc)
>;
multiclass MUBUFLoad_Atomic_Pattern <MUBUF_Pseudo Instr_ADDR64, MUBUF_Pseudo Instr_OFFSET,
@@ -1311,43 +1428,47 @@ multiclass MUBUFLoad_Atomic_Pattern <MUBUF_Pseudo Instr_ADDR64, MUBUF_Pseudo Ins
def : GCNPat <
(vt (atomic_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
i16:$offset, i1:$slc))),
- (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, 0, $slc, 0)
+ (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, 0, $slc, 0, 0)
>;
def : GCNPat <
(vt (atomic_ld (MUBUFOffsetNoGLC v4i32:$rsrc, i32:$soffset, i16:$offset))),
- (Instr_OFFSET $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0)
+ (Instr_OFFSET $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0, 0)
>;
}
-let SubtargetPredicate = isSICI in {
+let SubtargetPredicate = isGFX6GFX7 in {
def : MUBUFLoad_PatternADDR64 <BUFFER_LOAD_SBYTE_ADDR64, i32, sextloadi8_constant>;
-def : MUBUFLoad_PatternADDR64 <BUFFER_LOAD_UBYTE_ADDR64, i32, az_extloadi8_constant>;
+def : MUBUFLoad_PatternADDR64 <BUFFER_LOAD_UBYTE_ADDR64, i32, extloadi8_constant>;
+def : MUBUFLoad_PatternADDR64 <BUFFER_LOAD_UBYTE_ADDR64, i32, zextloadi8_constant>;
def : MUBUFLoad_PatternADDR64 <BUFFER_LOAD_SSHORT_ADDR64, i32, sextloadi16_constant>;
-def : MUBUFLoad_PatternADDR64 <BUFFER_LOAD_USHORT_ADDR64, i32, az_extloadi16_constant>;
+def : MUBUFLoad_PatternADDR64 <BUFFER_LOAD_USHORT_ADDR64, i32, extloadi16_constant>;
+def : MUBUFLoad_PatternADDR64 <BUFFER_LOAD_USHORT_ADDR64, i32, zextloadi16_constant>;
-defm : MUBUFLoad_Atomic_Pattern <BUFFER_LOAD_DWORD_ADDR64, BUFFER_LOAD_DWORD_OFFSET, i32, mubuf_load_atomic>;
-defm : MUBUFLoad_Atomic_Pattern <BUFFER_LOAD_DWORDX2_ADDR64, BUFFER_LOAD_DWORDX2_OFFSET, i64, mubuf_load_atomic>;
-} // End SubtargetPredicate = isSICI
+defm : MUBUFLoad_Atomic_Pattern <BUFFER_LOAD_DWORD_ADDR64, BUFFER_LOAD_DWORD_OFFSET, i32, atomic_load_32_global>;
+defm : MUBUFLoad_Atomic_Pattern <BUFFER_LOAD_DWORDX2_ADDR64, BUFFER_LOAD_DWORDX2_OFFSET, i64, atomic_load_64_global>;
+} // End SubtargetPredicate = isGFX6GFX7
multiclass MUBUFLoad_Pattern <MUBUF_Pseudo Instr_OFFSET, ValueType vt,
PatFrag ld> {
def : GCNPat <
(vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset,
- i16:$offset, i1:$glc, i1:$slc, i1:$tfe))),
- (Instr_OFFSET $srsrc, $soffset, $offset, $glc, $slc, $tfe)
+ i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))),
+ (Instr_OFFSET $srsrc, $soffset, $offset, $glc, $slc, $tfe, $dlc)
>;
}
let OtherPredicates = [Has16BitInsts] in {
defm : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_OFFSET, i16, sextloadi8_constant>;
-defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_OFFSET, i16, az_extloadi8_constant>;
-defm : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_OFFSET, i16, mubuf_sextloadi8>;
-defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_OFFSET, i16, mubuf_az_extloadi8>;
+defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_OFFSET, i16, extloadi8_constant>;
+defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_OFFSET, i16, zextloadi8_constant>;
+defm : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_OFFSET, i16, sextloadi8_global>;
+defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_OFFSET, i16, extloadi8_global>;
+defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_OFFSET, i16, zextloadi8_global>;
-defm : MUBUFLoad_Pattern <BUFFER_LOAD_USHORT_OFFSET, i16, mubuf_load>;
+defm : MUBUFLoad_Pattern <BUFFER_LOAD_USHORT_OFFSET, i16, load_global>;
} // End OtherPredicates = [Has16BitInsts]
@@ -1357,111 +1478,79 @@ multiclass MUBUFScratchLoadPat <MUBUF_Pseudo InstrOffen,
def : GCNPat <
(vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr,
i32:$soffset, u16imm:$offset))),
- (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0)
+ (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, 0)
>;
def : GCNPat <
(vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset))),
- (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0)
+ (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, 0)
>;
}
// XXX - Is it possible to have a complex pattern in a PatFrag?
-multiclass MUBUFScratchLoadPat_Hi16 <MUBUF_Pseudo InstrOffen,
+multiclass MUBUFScratchLoadPat_D16 <MUBUF_Pseudo InstrOffen,
MUBUF_Pseudo InstrOffset,
- ValueType vt, PatFrag ld> {
- def : GCNPat <
- (build_vector vt:$lo, (vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr,
- i32:$soffset, u16imm:$offset)))),
- (v2i16 (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, $lo))
- >;
-
- def : GCNPat <
- (build_vector f16:$lo, (f16 (bitconvert (vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr,
- i32:$soffset, u16imm:$offset)))))),
- (v2f16 (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, $lo))
- >;
-
-
- def : GCNPat <
- (build_vector vt:$lo, (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset)))),
- (v2i16 (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, $lo))
- >;
-
- def : GCNPat <
- (build_vector f16:$lo, (f16 (bitconvert (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset)))))),
- (v2f16 (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, $lo))
- >;
-}
-
-multiclass MUBUFScratchLoadPat_Lo16 <MUBUF_Pseudo InstrOffen,
- MUBUF_Pseudo InstrOffset,
- ValueType vt, PatFrag ld> {
- def : GCNPat <
- (build_vector (vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr,
- i32:$soffset, u16imm:$offset))),
- (vt (Hi16Elt vt:$hi))),
- (v2i16 (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, $hi))
- >;
-
+ ValueType vt, PatFrag ld_frag> {
def : GCNPat <
- (build_vector (f16 (bitconvert (vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr,
- i32:$soffset, u16imm:$offset))))),
- (f16 (Hi16Elt f16:$hi))),
- (v2f16 (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, $hi))
+ (ld_frag (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, i32:$soffset, u16imm:$offset), vt:$in),
+ (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, 0, $in)
>;
def : GCNPat <
- (build_vector (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset))),
- (vt (Hi16Elt vt:$hi))),
- (v2i16 (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, $hi))
- >;
-
- def : GCNPat <
- (build_vector (f16 (bitconvert (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset))))),
- (f16 (Hi16Elt f16:$hi))),
- (v2f16 (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, $hi))
+ (ld_frag (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset), vt:$in),
+ (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, 0, $in)
>;
}
defm : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, BUFFER_LOAD_SBYTE_OFFSET, i32, sextloadi8_private>;
-defm : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, BUFFER_LOAD_UBYTE_OFFSET, i32, az_extloadi8_private>;
+defm : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, BUFFER_LOAD_UBYTE_OFFSET, i32, extloadi8_private>;
+defm : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, BUFFER_LOAD_UBYTE_OFFSET, i32, zextloadi8_private>;
defm : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, BUFFER_LOAD_SBYTE_OFFSET, i16, sextloadi8_private>;
-defm : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, BUFFER_LOAD_UBYTE_OFFSET, i16, az_extloadi8_private>;
+defm : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, BUFFER_LOAD_UBYTE_OFFSET, i16, extloadi8_private>;
+defm : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, BUFFER_LOAD_UBYTE_OFFSET, i16, zextloadi8_private>;
defm : MUBUFScratchLoadPat <BUFFER_LOAD_SSHORT_OFFEN, BUFFER_LOAD_SSHORT_OFFSET, i32, sextloadi16_private>;
-defm : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, BUFFER_LOAD_USHORT_OFFSET, i32, az_extloadi16_private>;
+defm : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, BUFFER_LOAD_USHORT_OFFSET, i32, extloadi16_private>;
+defm : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, BUFFER_LOAD_USHORT_OFFSET, i32, zextloadi16_private>;
defm : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, BUFFER_LOAD_USHORT_OFFSET, i16, load_private>;
defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORD_OFFEN, BUFFER_LOAD_DWORD_OFFSET, i32, load_private>;
defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX2_OFFEN, BUFFER_LOAD_DWORDX2_OFFSET, v2i32, load_private>;
+defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX3_OFFEN, BUFFER_LOAD_DWORDX3_OFFSET, v3i32, load_private>;
defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX4_OFFEN, BUFFER_LOAD_DWORDX4_OFFSET, v4i32, load_private>;
let OtherPredicates = [D16PreservesUnusedBits] in {
-defm : MUBUFScratchLoadPat_Hi16<BUFFER_LOAD_SHORT_D16_HI_OFFEN, BUFFER_LOAD_SHORT_D16_HI_OFFSET, i16, load_private>;
-defm : MUBUFScratchLoadPat_Hi16<BUFFER_LOAD_UBYTE_D16_HI_OFFEN, BUFFER_LOAD_UBYTE_D16_HI_OFFSET, i16, az_extloadi8_private>;
-defm : MUBUFScratchLoadPat_Hi16<BUFFER_LOAD_SBYTE_D16_HI_OFFEN, BUFFER_LOAD_SBYTE_D16_HI_OFFSET, i16, sextloadi8_private>;
+defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SHORT_D16_HI_OFFEN, BUFFER_LOAD_SHORT_D16_HI_OFFSET, v2i16, load_d16_hi_private>;
+defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_UBYTE_D16_HI_OFFEN, BUFFER_LOAD_UBYTE_D16_HI_OFFSET, v2i16, az_extloadi8_d16_hi_private>;
+defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SBYTE_D16_HI_OFFEN, BUFFER_LOAD_SBYTE_D16_HI_OFFSET, v2i16, sextloadi8_d16_hi_private>;
+defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SHORT_D16_HI_OFFEN, BUFFER_LOAD_SHORT_D16_HI_OFFSET, v2f16, load_d16_hi_private>;
+defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_UBYTE_D16_HI_OFFEN, BUFFER_LOAD_UBYTE_D16_HI_OFFSET, v2f16, az_extloadi8_d16_hi_private>;
+defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SBYTE_D16_HI_OFFEN, BUFFER_LOAD_SBYTE_D16_HI_OFFSET, v2f16, sextloadi8_d16_hi_private>;
-defm : MUBUFScratchLoadPat_Lo16<BUFFER_LOAD_SHORT_D16_OFFEN, BUFFER_LOAD_SHORT_D16_OFFSET, i16, load_private>;
-defm : MUBUFScratchLoadPat_Lo16<BUFFER_LOAD_UBYTE_D16_OFFEN, BUFFER_LOAD_UBYTE_D16_OFFSET, i16, az_extloadi8_private>;
-defm : MUBUFScratchLoadPat_Lo16<BUFFER_LOAD_SBYTE_D16_OFFEN, BUFFER_LOAD_SBYTE_D16_OFFSET, i16, sextloadi8_private>;
+defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SHORT_D16_OFFEN, BUFFER_LOAD_SHORT_D16_OFFSET, v2i16, load_d16_lo_private>;
+defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_UBYTE_D16_OFFEN, BUFFER_LOAD_UBYTE_D16_OFFSET, v2i16, az_extloadi8_d16_lo_private>;
+defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SBYTE_D16_OFFEN, BUFFER_LOAD_SBYTE_D16_OFFSET, v2i16, sextloadi8_d16_lo_private>;
+defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SHORT_D16_OFFEN, BUFFER_LOAD_SHORT_D16_OFFSET, v2f16, load_d16_lo_private>;
+defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_UBYTE_D16_OFFEN, BUFFER_LOAD_UBYTE_D16_OFFSET, v2f16, az_extloadi8_d16_lo_private>;
+defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SBYTE_D16_OFFEN, BUFFER_LOAD_SBYTE_D16_OFFSET, v2f16, sextloadi8_d16_lo_private>;
}
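The *_D16 and *_D16_HI patterns above hand the previous contents of the destination register to the instruction as $in, so the 16-bit half that the load does not write is carried through unchanged; that is exactly what the D16PreservesUnusedBits predicate guards. As a rough scalar model only (plain C++, not LLVM code, names invented for illustration):

#include <cassert>
#include <cstdint>

// Scalar model of a *_D16_HI load: the loaded 16 bits land in the high half
// of the 32-bit destination and the low half of the previous value ($in) is
// preserved; a plain *_D16 load is the mirror image (low half written).
static uint32_t loadShortD16Hi(uint16_t Loaded, uint32_t In) {
  return (uint32_t(Loaded) << 16) | (In & 0xffffu);
}

static uint32_t loadShortD16Lo(uint16_t Loaded, uint32_t In) {
  return (In & 0xffff0000u) | uint32_t(Loaded);
}

int main() {
  assert(loadShortD16Hi(0xbeef, 0x1111cafe) == 0xbeefcafeu);
  assert(loadShortD16Lo(0xbeef, 0xcafe1111) == 0xcafebeefu);
  return 0;
}

The DS D16 read patterns further down in this diff follow the same shape: the fragment takes the incoming vector value and the instruction ties it to its destination.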
+
multiclass MUBUFStore_Atomic_Pattern <MUBUF_Pseudo Instr_ADDR64, MUBUF_Pseudo Instr_OFFSET,
ValueType vt, PatFrag atomic_st> {
  // Store follows the atomic op convention, so the address comes first.
def : GCNPat <
(atomic_st (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
i16:$offset, i1:$slc), vt:$val),
- (Instr_ADDR64 $val, $vaddr, $srsrc, $soffset, $offset, 0, $slc, 0)
+ (Instr_ADDR64 $val, $vaddr, $srsrc, $soffset, $offset, 0, $slc, 0, 0)
>;
def : GCNPat <
(atomic_st (MUBUFOffsetNoGLC v4i32:$rsrc, i32:$soffset, i16:$offset), vt:$val),
- (Instr_OFFSET $val, $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0)
+ (Instr_OFFSET $val, $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0, 0)
>;
}
-let SubtargetPredicate = isSICI in {
+let SubtargetPredicate = isGFX6GFX7 in {
defm : MUBUFStore_Atomic_Pattern <BUFFER_STORE_DWORD_ADDR64, BUFFER_STORE_DWORD_OFFSET, i32, store_atomic_global>;
defm : MUBUFStore_Atomic_Pattern <BUFFER_STORE_DWORDX2_ADDR64, BUFFER_STORE_DWORDX2_OFFSET, i64, store_atomic_global>;
-} // End Predicates = isSICI
+} // End SubtargetPredicate = isGFX6GFX7
multiclass MUBUFStore_Pattern <MUBUF_Pseudo Instr_OFFSET, ValueType vt,
@@ -1469,8 +1558,8 @@ multiclass MUBUFStore_Pattern <MUBUF_Pseudo Instr_OFFSET, ValueType vt,
def : GCNPat <
(st vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset,
- i16:$offset, i1:$glc, i1:$slc, i1:$tfe)),
- (Instr_OFFSET $vdata, $srsrc, $soffset, $offset, $glc, $slc, $tfe)
+ i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc)),
+ (Instr_OFFSET $vdata, $srsrc, $soffset, $offset, $glc, $slc, $tfe, $dlc)
>;
}
@@ -1479,17 +1568,18 @@ defm : MUBUFStore_Pattern <BUFFER_STORE_SHORT_OFFSET, i16, store_global>;
multiclass MUBUFScratchStorePat <MUBUF_Pseudo InstrOffen,
MUBUF_Pseudo InstrOffset,
- ValueType vt, PatFrag st> {
+ ValueType vt, PatFrag st,
+ RegisterClass rc = VGPR_32> {
def : GCNPat <
(st vt:$value, (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr,
i32:$soffset, u16imm:$offset)),
- (InstrOffen $value, $vaddr, $srsrc, $soffset, $offset, 0, 0, 0)
+ (InstrOffen rc:$value, $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, 0)
>;
def : GCNPat <
(st vt:$value, (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset,
u16imm:$offset)),
- (InstrOffset $value, $srsrc, $soffset, $offset, 0, 0, 0)
+ (InstrOffset rc:$value, $srsrc, $soffset, $offset, 0, 0, 0, 0)
>;
}
@@ -1498,8 +1588,9 @@ defm : MUBUFScratchStorePat <BUFFER_STORE_SHORT_OFFEN, BUFFER_STORE_SHORT_OFFSET
defm : MUBUFScratchStorePat <BUFFER_STORE_BYTE_OFFEN, BUFFER_STORE_BYTE_OFFSET, i16, truncstorei8_private>;
defm : MUBUFScratchStorePat <BUFFER_STORE_SHORT_OFFEN, BUFFER_STORE_SHORT_OFFSET, i16, store_private>;
defm : MUBUFScratchStorePat <BUFFER_STORE_DWORD_OFFEN, BUFFER_STORE_DWORD_OFFSET, i32, store_private>;
-defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX2_OFFEN, BUFFER_STORE_DWORDX2_OFFSET, v2i32, store_private>;
-defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, BUFFER_STORE_DWORDX4_OFFSET, v4i32, store_private>;
+defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX2_OFFEN, BUFFER_STORE_DWORDX2_OFFSET, v2i32, store_private, VReg_64>;
+defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX3_OFFEN, BUFFER_STORE_DWORDX3_OFFSET, v3i32, store_private, VReg_96>;
+defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, BUFFER_STORE_DWORDX4_OFFSET, v4i32, store_private, VReg_128>;
let OtherPredicates = [D16PreservesUnusedBits] in {
@@ -1526,7 +1617,7 @@ multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
imm:$format, imm:$cachepolicy, 0)),
(!cast<MTBUF_Pseudo>(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset),
(as_i8imm $format),
- (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
+ (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
>;
def : GCNPat<
@@ -1534,7 +1625,7 @@ multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
imm:$format, imm:$cachepolicy, imm)),
(!cast<MTBUF_Pseudo>(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset),
(as_i8imm $format),
- (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
+ (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
>;
def : GCNPat<
@@ -1542,7 +1633,7 @@ multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
imm:$format, imm:$cachepolicy, 0)),
(!cast<MTBUF_Pseudo>(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset),
(as_i8imm $format),
- (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
+ (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
>;
def : GCNPat<
@@ -1552,15 +1643,17 @@ multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
(REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
$rsrc, $soffset, (as_i16imm $offset),
(as_i8imm $format),
- (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
+ (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
>;
}
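Each of the rewritten patterns in this multiclass (and in the MUBUF and MTBUF store variants below) now pulls a dlc bit out of the same cachepolicy immediate that already carried glc and slc. A minimal sketch of that extraction, assuming the conventional layout with glc in bit 0, slc in bit 1 and dlc in bit 2 — the bit assignments are an assumption here, since the extract_glc/extract_slc/extract_dlc definitions are not part of this hunk:

#include <cassert>
#include <cstdint>

// Assumed cachepolicy layout (glc = bit 0, slc = bit 1, dlc = bit 2); the
// real extract_* node transforms live elsewhere in the backend.
struct CachePolicyBits {
  bool Glc, Slc, Dlc;
};

static CachePolicyBits extractCachePolicy(uint32_t CachePolicy) {
  return {(CachePolicy & 1) != 0,         // extract_glc
          ((CachePolicy >> 1) & 1) != 0,  // extract_slc
          ((CachePolicy >> 2) & 1) != 0}; // extract_dlc (new on GFX10)
}

int main() {
  CachePolicyBits B = extractCachePolicy(5); // glc | dlc
  assert(B.Glc && !B.Slc && B.Dlc);
  return 0;
}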
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, i32, "TBUFFER_LOAD_FORMAT_X">;
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v2i32, "TBUFFER_LOAD_FORMAT_XY">;
+defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v3i32, "TBUFFER_LOAD_FORMAT_XYZ">;
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v4i32, "TBUFFER_LOAD_FORMAT_XYZW">;
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, f32, "TBUFFER_LOAD_FORMAT_X">;
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v2f32, "TBUFFER_LOAD_FORMAT_XY">;
+defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v3f32, "TBUFFER_LOAD_FORMAT_XYZ">;
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v4f32, "TBUFFER_LOAD_FORMAT_XYZW">;
let SubtargetPredicate = HasUnpackedD16VMem in {
@@ -1582,7 +1675,7 @@ multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
imm:$format, imm:$cachepolicy, 0),
(!cast<MTBUF_Pseudo>(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset,
(as_i16imm $offset), (as_i8imm $format),
- (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
+ (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
>;
def : GCNPat<
@@ -1590,7 +1683,7 @@ multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
imm:$format, imm:$cachepolicy, imm),
(!cast<MTBUF_Pseudo>(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset,
(as_i16imm $offset), (as_i8imm $format),
- (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
+ (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
>;
def : GCNPat<
@@ -1598,7 +1691,7 @@ multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
imm:$format, imm:$cachepolicy, 0),
(!cast<MTBUF_Pseudo>(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset,
(as_i16imm $offset), (as_i8imm $format),
- (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
+ (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
>;
def : GCNPat<
@@ -1608,17 +1701,17 @@ multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
$vdata,
(REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
$rsrc, $soffset, (as_i16imm $offset), (as_i8imm $format),
- (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
+ (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
>;
}
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, i32, "TBUFFER_STORE_FORMAT_X">;
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v2i32, "TBUFFER_STORE_FORMAT_XY">;
-defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_x3, v4i32, "TBUFFER_STORE_FORMAT_XYZ">;
+defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v3i32, "TBUFFER_STORE_FORMAT_XYZ">;
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v4i32, "TBUFFER_STORE_FORMAT_XYZW">;
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, f32, "TBUFFER_STORE_FORMAT_X">;
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v2f32, "TBUFFER_STORE_FORMAT_XY">;
-defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_x3, v4f32, "TBUFFER_STORE_FORMAT_XYZ">;
+defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v3f32, "TBUFFER_STORE_FORMAT_XYZ">;
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v4f32, "TBUFFER_STORE_FORMAT_XYZW">;
let SubtargetPredicate = HasUnpackedD16VMem in {
@@ -1634,28 +1727,22 @@ let SubtargetPredicate = HasPackedD16VMem in {
} // End HasPackedD16VMem.
//===----------------------------------------------------------------------===//
-// Target instructions, move to the appropriate target TD file
+// Target-specific instruction encodings.
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
-// SI
+// Base ENC_MUBUF for GFX6, GFX7, GFX10.
//===----------------------------------------------------------------------===//
-class MUBUF_Real_si <bits<7> op, MUBUF_Pseudo ps> :
- MUBUF_Real<op, ps>,
- Enc64,
- SIMCInstr<ps.PseudoInstr, SIEncodingFamily.SI> {
- let AssemblerPredicate=isSICI;
- let DecoderNamespace="SICI";
-
+class Base_MUBUF_Real_gfx6_gfx7_gfx10<bits<7> op, MUBUF_Pseudo ps, int ef> :
+ MUBUF_Real<ps>, Enc64, SIMCInstr<ps.PseudoInstr, ef> {
let Inst{11-0} = !if(ps.has_offset, offset, ?);
let Inst{12} = ps.offen;
let Inst{13} = ps.idxen;
let Inst{14} = !if(ps.has_glc, glc, ps.glc_value);
- let Inst{15} = ps.addr64;
let Inst{16} = !if(ps.lds, 1, 0);
let Inst{24-18} = op;
- let Inst{31-26} = 0x38; //encoding
+ let Inst{31-26} = 0x38;
let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?);
let Inst{47-40} = !if(ps.has_vdata, vdata, ?);
let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?);
@@ -1664,125 +1751,250 @@ class MUBUF_Real_si <bits<7> op, MUBUF_Pseudo ps> :
let Inst{63-56} = !if(ps.has_soffset, soffset, ?);
}
-multiclass MUBUF_Real_AllAddr_si<bits<7> op> {
- def _OFFSET_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>;
- def _ADDR64_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_ADDR64")>;
- def _OFFEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>;
- def _IDXEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>;
- def _BOTHEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>;
-}
-
-multiclass MUBUF_Real_AllAddr_Lds_si<bits<7> op> {
-
- def _OFFSET_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>,
- MUBUFLdsTable<0, NAME # "_OFFSET_si">;
- def _ADDR64_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_ADDR64")>,
- MUBUFLdsTable<0, NAME # "_ADDR64_si">;
- def _OFFEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>,
- MUBUFLdsTable<0, NAME # "_OFFEN_si">;
- def _IDXEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>,
- MUBUFLdsTable<0, NAME # "_IDXEN_si">;
- def _BOTHEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>,
- MUBUFLdsTable<0, NAME # "_BOTHEN_si">;
-
- def _LDS_OFFSET_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFSET")>,
- MUBUFLdsTable<1, NAME # "_OFFSET_si">;
- def _LDS_ADDR64_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_ADDR64")>,
- MUBUFLdsTable<1, NAME # "_ADDR64_si">;
- def _LDS_OFFEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFEN")>,
- MUBUFLdsTable<1, NAME # "_OFFEN_si">;
- def _LDS_IDXEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_IDXEN")>,
- MUBUFLdsTable<1, NAME # "_IDXEN_si">;
- def _LDS_BOTHEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_BOTHEN")>,
- MUBUFLdsTable<1, NAME # "_BOTHEN_si">;
-}
-
-multiclass MUBUF_Real_Atomic_si<bits<7> op> : MUBUF_Real_AllAddr_si<op> {
- def _OFFSET_RTN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET_RTN")>;
- def _ADDR64_RTN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_ADDR64_RTN")>;
- def _OFFEN_RTN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN_RTN")>;
- def _IDXEN_RTN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN_RTN")>;
- def _BOTHEN_RTN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN_RTN")>;
-}
-
-defm BUFFER_LOAD_FORMAT_X : MUBUF_Real_AllAddr_Lds_si <0x00>;
-defm BUFFER_LOAD_FORMAT_XY : MUBUF_Real_AllAddr_si <0x01>;
-defm BUFFER_LOAD_FORMAT_XYZ : MUBUF_Real_AllAddr_si <0x02>;
-defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Real_AllAddr_si <0x03>;
-defm BUFFER_STORE_FORMAT_X : MUBUF_Real_AllAddr_si <0x04>;
-defm BUFFER_STORE_FORMAT_XY : MUBUF_Real_AllAddr_si <0x05>;
-defm BUFFER_STORE_FORMAT_XYZ : MUBUF_Real_AllAddr_si <0x06>;
-defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Real_AllAddr_si <0x07>;
-defm BUFFER_LOAD_UBYTE : MUBUF_Real_AllAddr_Lds_si <0x08>;
-defm BUFFER_LOAD_SBYTE : MUBUF_Real_AllAddr_Lds_si <0x09>;
-defm BUFFER_LOAD_USHORT : MUBUF_Real_AllAddr_Lds_si <0x0a>;
-defm BUFFER_LOAD_SSHORT : MUBUF_Real_AllAddr_Lds_si <0x0b>;
-defm BUFFER_LOAD_DWORD : MUBUF_Real_AllAddr_Lds_si <0x0c>;
-defm BUFFER_LOAD_DWORDX2 : MUBUF_Real_AllAddr_si <0x0d>;
-defm BUFFER_LOAD_DWORDX4 : MUBUF_Real_AllAddr_si <0x0e>;
-defm BUFFER_LOAD_DWORDX3 : MUBUF_Real_AllAddr_si <0x0f>;
-defm BUFFER_STORE_BYTE : MUBUF_Real_AllAddr_si <0x18>;
-defm BUFFER_STORE_SHORT : MUBUF_Real_AllAddr_si <0x1a>;
-defm BUFFER_STORE_DWORD : MUBUF_Real_AllAddr_si <0x1c>;
-defm BUFFER_STORE_DWORDX2 : MUBUF_Real_AllAddr_si <0x1d>;
-defm BUFFER_STORE_DWORDX4 : MUBUF_Real_AllAddr_si <0x1e>;
-defm BUFFER_STORE_DWORDX3 : MUBUF_Real_AllAddr_si <0x1f>;
-
-defm BUFFER_ATOMIC_SWAP : MUBUF_Real_Atomic_si <0x30>;
-defm BUFFER_ATOMIC_CMPSWAP : MUBUF_Real_Atomic_si <0x31>;
-defm BUFFER_ATOMIC_ADD : MUBUF_Real_Atomic_si <0x32>;
-defm BUFFER_ATOMIC_SUB : MUBUF_Real_Atomic_si <0x33>;
-//defm BUFFER_ATOMIC_RSUB : MUBUF_Real_Atomic_si <0x34>; // isn't on CI & VI
-defm BUFFER_ATOMIC_SMIN : MUBUF_Real_Atomic_si <0x35>;
-defm BUFFER_ATOMIC_UMIN : MUBUF_Real_Atomic_si <0x36>;
-defm BUFFER_ATOMIC_SMAX : MUBUF_Real_Atomic_si <0x37>;
-defm BUFFER_ATOMIC_UMAX : MUBUF_Real_Atomic_si <0x38>;
-defm BUFFER_ATOMIC_AND : MUBUF_Real_Atomic_si <0x39>;
-defm BUFFER_ATOMIC_OR : MUBUF_Real_Atomic_si <0x3a>;
-defm BUFFER_ATOMIC_XOR : MUBUF_Real_Atomic_si <0x3b>;
-defm BUFFER_ATOMIC_INC : MUBUF_Real_Atomic_si <0x3c>;
-defm BUFFER_ATOMIC_DEC : MUBUF_Real_Atomic_si <0x3d>;
-
-//defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Real_Atomic_si <0x3e>; // isn't on VI
-//defm BUFFER_ATOMIC_FMIN : MUBUF_Real_Atomic_si <0x3f>; // isn't on VI
-//defm BUFFER_ATOMIC_FMAX : MUBUF_Real_Atomic_si <0x40>; // isn't on VI
-defm BUFFER_ATOMIC_SWAP_X2 : MUBUF_Real_Atomic_si <0x50>;
-defm BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_Real_Atomic_si <0x51>;
-defm BUFFER_ATOMIC_ADD_X2 : MUBUF_Real_Atomic_si <0x52>;
-defm BUFFER_ATOMIC_SUB_X2 : MUBUF_Real_Atomic_si <0x53>;
-//defm BUFFER_ATOMIC_RSUB_X2 : MUBUF_Real_Atomic_si <0x54>; // isn't on CI & VI
-defm BUFFER_ATOMIC_SMIN_X2 : MUBUF_Real_Atomic_si <0x55>;
-defm BUFFER_ATOMIC_UMIN_X2 : MUBUF_Real_Atomic_si <0x56>;
-defm BUFFER_ATOMIC_SMAX_X2 : MUBUF_Real_Atomic_si <0x57>;
-defm BUFFER_ATOMIC_UMAX_X2 : MUBUF_Real_Atomic_si <0x58>;
-defm BUFFER_ATOMIC_AND_X2 : MUBUF_Real_Atomic_si <0x59>;
-defm BUFFER_ATOMIC_OR_X2 : MUBUF_Real_Atomic_si <0x5a>;
-defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Real_Atomic_si <0x5b>;
-defm BUFFER_ATOMIC_INC_X2 : MUBUF_Real_Atomic_si <0x5c>;
-defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Real_Atomic_si <0x5d>;
-// FIXME: Need to handle hazard for BUFFER_ATOMIC_FCMPSWAP_X2 on CI.
-//defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Real_Atomic_si <0x5e">; // isn't on VI
-//defm BUFFER_ATOMIC_FMIN_X2 : MUBUF_Real_Atomic_si <0x5f>; // isn't on VI
-//defm BUFFER_ATOMIC_FMAX_X2 : MUBUF_Real_Atomic_si <0x60>; // isn't on VI
-
-def BUFFER_WBINVL1_SC_si : MUBUF_Real_si <0x70, BUFFER_WBINVL1_SC>;
-def BUFFER_WBINVL1_si : MUBUF_Real_si <0x71, BUFFER_WBINVL1>;
-
-class MTBUF_Real_si <bits<3> op, MTBUF_Pseudo ps> :
- MTBUF_Real<ps>,
- Enc64,
- SIMCInstr<ps.PseudoInstr, SIEncodingFamily.SI> {
- let AssemblerPredicate=isSICI;
- let DecoderNamespace="SICI";
+class MUBUF_Real_gfx10<bits<8> op, MUBUF_Pseudo ps> :
+ Base_MUBUF_Real_gfx6_gfx7_gfx10<op{6-0}, ps, SIEncodingFamily.GFX10> {
+ let Inst{15} = !if(ps.has_dlc, dlc, ps.dlc_value);
+ let Inst{25} = op{7};
+}
+
+class MUBUF_Real_gfx6_gfx7<bits<8> op, MUBUF_Pseudo ps> :
+ Base_MUBUF_Real_gfx6_gfx7_gfx10<op{6-0}, ps, SIEncodingFamily.SI> {
+ let Inst{15} = ps.addr64;
+}
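The two Real subclasses differ only in the bits the shared base leaves open: GFX6/GFX7 keep addr64 in bit 15, while GFX10 reuses bit 15 for dlc and widens the opcode to eight bits by routing op{7} into bit 25. A standalone C++ sketch packing the fields exactly as the let Inst{...} assignments above spell out (illustration only, not the TableGen-generated encoder; slc, tfe and anything else outside this excerpt is omitted):

#include <cstdint>
#include <cstdio>

// Field values as they appear in the TableGen operands above; register
// numbers are the raw 8-bit operand encodings.
struct MUBUFFields {
  unsigned Offset;                       // 12-bit immediate offset
  bool Offen, Idxen, Glc, Lds;
  unsigned Op;                           // 8-bit opcode; op{7} is GFX10-only
  unsigned VAddr, VData, SRsrc, SOffset;
  bool Addr64;                           // GFX6/GFX7 only
  bool Dlc;                              // GFX10 only
};

static uint64_t encodeCommon(const MUBUFFields &F) {
  uint64_t Inst = 0;
  Inst |= uint64_t(F.Offset & 0xfff);            // Inst{11-0}
  Inst |= uint64_t(F.Offen) << 12;               // Inst{12}
  Inst |= uint64_t(F.Idxen) << 13;               // Inst{13}
  Inst |= uint64_t(F.Glc) << 14;                 // Inst{14}
  Inst |= uint64_t(F.Lds) << 16;                 // Inst{16}
  Inst |= uint64_t(F.Op & 0x7f) << 18;           // Inst{24-18} = op{6-0}
  Inst |= uint64_t(0x38) << 26;                  // Inst{31-26}: ENC_MUBUF
  Inst |= uint64_t(F.VAddr & 0xff) << 32;        // Inst{39-32}
  Inst |= uint64_t(F.VData & 0xff) << 40;        // Inst{47-40}
  Inst |= uint64_t((F.SRsrc >> 2) & 0x1f) << 48; // Inst{52-48} = srsrc{6-2}
  Inst |= uint64_t(F.SOffset & 0xff) << 56;      // Inst{63-56}
  return Inst;
}

static uint64_t encodeGfx6Gfx7(const MUBUFFields &F) {
  return encodeCommon(F) | uint64_t(F.Addr64) << 15;  // Inst{15} = addr64
}

static uint64_t encodeGfx10(const MUBUFFields &F) {
  return encodeCommon(F) | uint64_t(F.Dlc) << 15      // Inst{15} = dlc
         | uint64_t((F.Op >> 7) & 1) << 25;           // Inst{25} = op{7}
}

int main() {
  MUBUFFields F = {/*Offset=*/16,    false, false, false, false,
                   /*Op=*/0x0c,      /*VAddr=*/0, /*VData=*/1,
                   /*SRsrc=*/8,      /*SOffset=*/0,
                   /*Addr64=*/false, /*Dlc=*/false};
  std::printf("gfx6/7 word: 0x%016llx\ngfx10 word:  0x%016llx\n",
              (unsigned long long)encodeGfx6Gfx7(F),
              (unsigned long long)encodeGfx10(F));
  return 0;
}

With an opcode at or above 0x80 — like the GFX10-only D16 format loads declared below — only the GFX10 variant can set the extra op{7} bit.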
+//===----------------------------------------------------------------------===//
+// MUBUF - GFX10.
+//===----------------------------------------------------------------------===//
+
+let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
+ multiclass MUBUF_Real_gfx10_with_name<bits<8> op, string opName,
+ string asmName> {
+ def _gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(opName)> {
+ MUBUF_Pseudo ps = !cast<MUBUF_Pseudo>(opName);
+ let AsmString = asmName # ps.AsmOperands;
+ }
+ }
+ multiclass MUBUF_Real_AllAddr_gfx10<bits<8> op> {
+ def _BOTHEN_gfx10 :
+ MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>;
+ def _IDXEN_gfx10 :
+ MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>;
+ def _OFFEN_gfx10 :
+ MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>;
+ def _OFFSET_gfx10 :
+ MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>;
+ }
+ multiclass MUBUF_Real_AllAddr_Lds_gfx10<bits<8> op> {
+ def _OFFSET_gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>,
+ MUBUFLdsTable<0, NAME # "_OFFSET_gfx10">;
+ def _OFFEN_gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>,
+ MUBUFLdsTable<0, NAME # "_OFFEN_gfx10">;
+ def _IDXEN_gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>,
+ MUBUFLdsTable<0, NAME # "_IDXEN_gfx10">;
+ def _BOTHEN_gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>,
+ MUBUFLdsTable<0, NAME # "_BOTHEN_gfx10">;
+
+ def _LDS_OFFSET_gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFSET")>,
+ MUBUFLdsTable<1, NAME # "_OFFSET_gfx10">;
+ def _LDS_OFFEN_gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFEN")>,
+ MUBUFLdsTable<1, NAME # "_OFFEN_gfx10">;
+ def _LDS_IDXEN_gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_IDXEN")>,
+ MUBUFLdsTable<1, NAME # "_IDXEN_gfx10">;
+ def _LDS_BOTHEN_gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_BOTHEN")>,
+ MUBUFLdsTable<1, NAME # "_BOTHEN_gfx10">;
+ }
+ multiclass MUBUF_Real_Atomics_gfx10<bits<8> op> :
+ MUBUF_Real_AllAddr_gfx10<op> {
+ def _BOTHEN_RTN_gfx10 :
+ MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN_RTN")>;
+ def _IDXEN_RTN_gfx10 :
+ MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN_RTN")>;
+ def _OFFEN_RTN_gfx10 :
+ MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN_RTN")>;
+ def _OFFSET_RTN_gfx10 :
+ MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET_RTN")>;
+ }
+} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10"
+
+defm BUFFER_STORE_BYTE_D16_HI : MUBUF_Real_AllAddr_gfx10<0x019>;
+defm BUFFER_STORE_SHORT_D16_HI : MUBUF_Real_AllAddr_gfx10<0x01b>;
+defm BUFFER_LOAD_UBYTE_D16 : MUBUF_Real_AllAddr_gfx10<0x020>;
+defm BUFFER_LOAD_UBYTE_D16_HI : MUBUF_Real_AllAddr_gfx10<0x021>;
+defm BUFFER_LOAD_SBYTE_D16 : MUBUF_Real_AllAddr_gfx10<0x022>;
+defm BUFFER_LOAD_SBYTE_D16_HI : MUBUF_Real_AllAddr_gfx10<0x023>;
+defm BUFFER_LOAD_SHORT_D16 : MUBUF_Real_AllAddr_gfx10<0x024>;
+defm BUFFER_LOAD_SHORT_D16_HI : MUBUF_Real_AllAddr_gfx10<0x025>;
+// FIXME-GFX10: Add the following instructions:
+//defm BUFFER_LOAD_FORMAT_D16_HI_X : MUBUF_Real_AllAddr_gfx10<0x026>;
+//defm BUFFER_STORE_FORMAT_D16_HI_X : MUBUF_Real_AllAddr_gfx10<0x027>;
+defm BUFFER_LOAD_FORMAT_D16_X : MUBUF_Real_AllAddr_gfx10<0x080>;
+defm BUFFER_LOAD_FORMAT_D16_XY : MUBUF_Real_AllAddr_gfx10<0x081>;
+defm BUFFER_LOAD_FORMAT_D16_XYZ : MUBUF_Real_AllAddr_gfx10<0x082>;
+defm BUFFER_LOAD_FORMAT_D16_XYZW : MUBUF_Real_AllAddr_gfx10<0x083>;
+defm BUFFER_STORE_FORMAT_D16_X : MUBUF_Real_AllAddr_gfx10<0x084>;
+defm BUFFER_STORE_FORMAT_D16_XY : MUBUF_Real_AllAddr_gfx10<0x085>;
+defm BUFFER_STORE_FORMAT_D16_XYZ : MUBUF_Real_AllAddr_gfx10<0x086>;
+defm BUFFER_STORE_FORMAT_D16_XYZW : MUBUF_Real_AllAddr_gfx10<0x087>;
+
+def BUFFER_GL0_INV_gfx10 :
+ MUBUF_Real_gfx10<0x071, BUFFER_GL0_INV>;
+def BUFFER_GL1_INV_gfx10 :
+ MUBUF_Real_gfx10<0x072, BUFFER_GL1_INV>;
+
+//===----------------------------------------------------------------------===//
+// MUBUF - GFX6, GFX7, GFX10.
+//===----------------------------------------------------------------------===//
+
+let AssemblerPredicate = isGFX6, DecoderNamespace = "GFX6" in {
+ multiclass MUBUF_Real_gfx6<bits<8> op> {
+ def _gfx6 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME)>;
+ }
+} // End AssemblerPredicate = isGFX6, DecoderNamespace = "GFX6"
+
+let AssemblerPredicate = isGFX7Only, DecoderNamespace = "GFX7" in {
+ multiclass MUBUF_Real_gfx7<bits<8> op> {
+ def _gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME)>;
+ }
+} // End AssemblerPredicate = isGFX7Only, DecoderNamespace = "GFX7"
+
+let AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" in {
+ multiclass MUBUF_Real_AllAddr_gfx6_gfx7<bits<8> op> {
+ def _ADDR64_gfx6_gfx7 :
+ MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_ADDR64")>;
+ def _BOTHEN_gfx6_gfx7 :
+ MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>;
+ def _IDXEN_gfx6_gfx7 :
+ MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>;
+ def _OFFEN_gfx6_gfx7 :
+ MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>;
+ def _OFFSET_gfx6_gfx7 :
+ MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>;
+ }
+ multiclass MUBUF_Real_AllAddr_Lds_gfx6_gfx7<bits<8> op> {
+ def _OFFSET_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>,
+ MUBUFLdsTable<0, NAME # "_OFFSET_gfx6_gfx7">;
+ def _ADDR64_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_ADDR64")>,
+ MUBUFLdsTable<0, NAME # "_ADDR64_gfx6_gfx7">;
+ def _OFFEN_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>,
+ MUBUFLdsTable<0, NAME # "_OFFEN_gfx6_gfx7">;
+ def _IDXEN_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>,
+ MUBUFLdsTable<0, NAME # "_IDXEN_gfx6_gfx7">;
+ def _BOTHEN_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>,
+ MUBUFLdsTable<0, NAME # "_BOTHEN_gfx6_gfx7">;
+
+ def _LDS_OFFSET_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFSET")>,
+ MUBUFLdsTable<1, NAME # "_OFFSET_gfx6_gfx7">;
+ def _LDS_ADDR64_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_ADDR64")>,
+ MUBUFLdsTable<1, NAME # "_ADDR64_gfx6_gfx7">;
+ def _LDS_OFFEN_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFEN")>,
+ MUBUFLdsTable<1, NAME # "_OFFEN_gfx6_gfx7">;
+ def _LDS_IDXEN_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_IDXEN")>,
+ MUBUFLdsTable<1, NAME # "_IDXEN_gfx6_gfx7">;
+ def _LDS_BOTHEN_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_BOTHEN")>,
+ MUBUFLdsTable<1, NAME # "_BOTHEN_gfx6_gfx7">;
+ }
+ multiclass MUBUF_Real_Atomics_gfx6_gfx7<bits<8> op> :
+ MUBUF_Real_AllAddr_gfx6_gfx7<op> {
+ def _ADDR64_RTN_gfx6_gfx7 :
+ MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_ADDR64_RTN")>;
+ def _BOTHEN_RTN_gfx6_gfx7 :
+ MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN_RTN")>;
+ def _IDXEN_RTN_gfx6_gfx7 :
+ MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN_RTN")>;
+ def _OFFEN_RTN_gfx6_gfx7 :
+ MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN_RTN")>;
+ def _OFFSET_RTN_gfx6_gfx7 :
+ MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET_RTN")>;
+ }
+} // End AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7"
+
+multiclass MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<bits<8> op> :
+ MUBUF_Real_AllAddr_gfx6_gfx7<op>, MUBUF_Real_AllAddr_gfx10<op>;
+
+multiclass MUBUF_Real_AllAddr_Lds_gfx6_gfx7_gfx10<bits<8> op> :
+ MUBUF_Real_AllAddr_Lds_gfx6_gfx7<op>, MUBUF_Real_AllAddr_Lds_gfx10<op>;
+
+multiclass MUBUF_Real_Atomics_gfx6_gfx7_gfx10<bits<8> op> :
+ MUBUF_Real_Atomics_gfx6_gfx7<op>, MUBUF_Real_Atomics_gfx10<op>;
+
+// FIXME-GFX6: The following instructions are available only on GFX6.
+//defm BUFFER_ATOMIC_RSUB : MUBUF_Real_Atomics_gfx6 <0x034>;
+//defm BUFFER_ATOMIC_RSUB_X2 : MUBUF_Real_Atomics_gfx6 <0x054>;
+
+defm BUFFER_LOAD_FORMAT_X : MUBUF_Real_AllAddr_Lds_gfx6_gfx7_gfx10<0x000>;
+defm BUFFER_LOAD_FORMAT_XY : MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x001>;
+defm BUFFER_LOAD_FORMAT_XYZ : MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x002>;
+defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x003>;
+defm BUFFER_STORE_FORMAT_X : MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x004>;
+defm BUFFER_STORE_FORMAT_XY : MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x005>;
+defm BUFFER_STORE_FORMAT_XYZ : MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x006>;
+defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x007>;
+defm BUFFER_LOAD_UBYTE : MUBUF_Real_AllAddr_Lds_gfx6_gfx7_gfx10<0x008>;
+defm BUFFER_LOAD_SBYTE : MUBUF_Real_AllAddr_Lds_gfx6_gfx7_gfx10<0x009>;
+defm BUFFER_LOAD_USHORT : MUBUF_Real_AllAddr_Lds_gfx6_gfx7_gfx10<0x00a>;
+defm BUFFER_LOAD_SSHORT : MUBUF_Real_AllAddr_Lds_gfx6_gfx7_gfx10<0x00b>;
+defm BUFFER_LOAD_DWORD : MUBUF_Real_AllAddr_Lds_gfx6_gfx7_gfx10<0x00c>;
+defm BUFFER_LOAD_DWORDX2 : MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x00d>;
+defm BUFFER_LOAD_DWORDX4 : MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x00e>;
+defm BUFFER_LOAD_DWORDX3 : MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x00f>;
+defm BUFFER_STORE_BYTE : MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x018>;
+defm BUFFER_STORE_SHORT : MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x01a>;
+defm BUFFER_STORE_DWORD : MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x01c>;
+defm BUFFER_STORE_DWORDX2 : MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x01d>;
+defm BUFFER_STORE_DWORDX4 : MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x01e>;
+defm BUFFER_STORE_DWORDX3 : MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x01f>;
+
+defm BUFFER_ATOMIC_SWAP : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x030>;
+defm BUFFER_ATOMIC_CMPSWAP : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x031>;
+defm BUFFER_ATOMIC_ADD : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x032>;
+defm BUFFER_ATOMIC_SUB : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x033>;
+defm BUFFER_ATOMIC_SMIN : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x035>;
+defm BUFFER_ATOMIC_UMIN : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x036>;
+defm BUFFER_ATOMIC_SMAX : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x037>;
+defm BUFFER_ATOMIC_UMAX : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x038>;
+defm BUFFER_ATOMIC_AND : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x039>;
+defm BUFFER_ATOMIC_OR : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x03a>;
+defm BUFFER_ATOMIC_XOR : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x03b>;
+defm BUFFER_ATOMIC_INC : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x03c>;
+defm BUFFER_ATOMIC_DEC : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x03d>;
+// FIXME-GFX6-GFX7-GFX10: Add the following instructions:
+//defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x03e>;
+//defm BUFFER_ATOMIC_FMIN : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x03f>;
+//defm BUFFER_ATOMIC_FMAX : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x040>;
+defm BUFFER_ATOMIC_SWAP_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x050>;
+defm BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x051>;
+defm BUFFER_ATOMIC_ADD_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x052>;
+defm BUFFER_ATOMIC_SUB_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x053>;
+defm BUFFER_ATOMIC_SMIN_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x055>;
+defm BUFFER_ATOMIC_UMIN_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x056>;
+defm BUFFER_ATOMIC_SMAX_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x057>;
+defm BUFFER_ATOMIC_UMAX_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x058>;
+defm BUFFER_ATOMIC_AND_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x059>;
+defm BUFFER_ATOMIC_OR_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05a>;
+defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05b>;
+defm BUFFER_ATOMIC_INC_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05c>;
+defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05d>;
+// FIXME-GFX7: Need to handle hazard for BUFFER_ATOMIC_FCMPSWAP_X2 on GFX7.
+// FIXME-GFX6-GFX7-GFX10: Add the following instructions:
+//defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05e>;
+//defm BUFFER_ATOMIC_FMIN_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05f>;
+//defm BUFFER_ATOMIC_FMAX_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x060>;
+
+defm BUFFER_WBINVL1_SC : MUBUF_Real_gfx6<0x070>;
+defm BUFFER_WBINVL1_VOL : MUBUF_Real_gfx7<0x070>;
+def BUFFER_WBINVL1_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<0x071, BUFFER_WBINVL1>;
+
+//===----------------------------------------------------------------------===//
+// Base ENC_MTBUF for GFX6, GFX7, GFX10.
+//===----------------------------------------------------------------------===//
+
+class Base_MTBUF_Real_gfx6_gfx7_gfx10<bits<3> op, MTBUF_Pseudo ps, int ef> :
+ MTBUF_Real<ps>, Enc64, SIMCInstr<ps.PseudoInstr, ef> {
let Inst{11-0} = !if(ps.has_offset, offset, ?);
let Inst{12} = ps.offen;
let Inst{13} = ps.idxen;
let Inst{14} = !if(ps.has_glc, glc, ps.glc_value);
- let Inst{15} = ps.addr64;
let Inst{18-16} = op;
- let Inst{22-19} = dfmt;
- let Inst{25-23} = nfmt;
let Inst{31-26} = 0x3a; //encoding
let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?);
let Inst{47-40} = !if(ps.has_vdata, vdata, ?);
@@ -1792,47 +2004,87 @@ class MTBUF_Real_si <bits<3> op, MTBUF_Pseudo ps> :
let Inst{63-56} = !if(ps.has_soffset, soffset, ?);
}
-multiclass MTBUF_Real_AllAddr_si<bits<3> op> {
- def _OFFSET_si : MTBUF_Real_si <op, !cast<MTBUF_Pseudo>(NAME#"_OFFSET")>;
- def _ADDR64_si : MTBUF_Real_si <op, !cast<MTBUF_Pseudo>(NAME#"_ADDR64")>;
- def _OFFEN_si : MTBUF_Real_si <op, !cast<MTBUF_Pseudo>(NAME#"_OFFEN")>;
- def _IDXEN_si : MTBUF_Real_si <op, !cast<MTBUF_Pseudo>(NAME#"_IDXEN")>;
- def _BOTHEN_si : MTBUF_Real_si <op, !cast<MTBUF_Pseudo>(NAME#"_BOTHEN")>;
-}
+//===----------------------------------------------------------------------===//
+// MTBUF - GFX10.
+//===----------------------------------------------------------------------===//
+
+class MTBUF_Real_gfx10<bits<4> op, MTBUF_Pseudo ps> :
+ Base_MTBUF_Real_gfx6_gfx7_gfx10<op{2-0}, ps, SIEncodingFamily.GFX10> {
+ let Inst{15} = !if(ps.has_dlc, dlc, ps.dlc_value);
+ let Inst{25-19} = format;
+ let Inst{53} = op{3};
+}
+
+let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
+ multiclass MTBUF_Real_AllAddr_gfx10<bits<4> op> {
+ def _BOTHEN_gfx10 :
+ MTBUF_Real_gfx10<op, !cast<MTBUF_Pseudo>(NAME#"_BOTHEN")>;
+ def _IDXEN_gfx10 :
+ MTBUF_Real_gfx10<op, !cast<MTBUF_Pseudo>(NAME#"_IDXEN")>;
+ def _OFFEN_gfx10 :
+ MTBUF_Real_gfx10<op, !cast<MTBUF_Pseudo>(NAME#"_OFFEN")>;
+ def _OFFSET_gfx10 :
+ MTBUF_Real_gfx10<op, !cast<MTBUF_Pseudo>(NAME#"_OFFSET")>;
+ }
+} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10"
-defm TBUFFER_LOAD_FORMAT_X : MTBUF_Real_AllAddr_si <0>;
-defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Real_AllAddr_si <1>;
-defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Real_AllAddr_si <2>;
-defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Real_AllAddr_si <3>;
-defm TBUFFER_STORE_FORMAT_X : MTBUF_Real_AllAddr_si <4>;
-defm TBUFFER_STORE_FORMAT_XY : MTBUF_Real_AllAddr_si <5>;
-defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Real_AllAddr_si <6>;
-defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Real_AllAddr_si <7>;
+defm TBUFFER_LOAD_FORMAT_D16_X : MTBUF_Real_AllAddr_gfx10<0x008>;
+defm TBUFFER_LOAD_FORMAT_D16_XY : MTBUF_Real_AllAddr_gfx10<0x009>;
+defm TBUFFER_LOAD_FORMAT_D16_XYZ : MTBUF_Real_AllAddr_gfx10<0x00a>;
+defm TBUFFER_LOAD_FORMAT_D16_XYZW : MTBUF_Real_AllAddr_gfx10<0x00b>;
+defm TBUFFER_STORE_FORMAT_D16_X : MTBUF_Real_AllAddr_gfx10<0x00c>;
+defm TBUFFER_STORE_FORMAT_D16_XY : MTBUF_Real_AllAddr_gfx10<0x00d>;
+defm TBUFFER_STORE_FORMAT_D16_XYZ : MTBUF_Real_AllAddr_gfx10<0x00e>;
+defm TBUFFER_STORE_FORMAT_D16_XYZW : MTBUF_Real_AllAddr_gfx10<0x00f>;
//===----------------------------------------------------------------------===//
-// CI
-// MTBUF - GFX6, GFX7.
+// MTBUF - GFX6, GFX7, GFX10.
//===----------------------------------------------------------------------===//
-class MUBUF_Real_ci <bits<7> op, MUBUF_Pseudo ps> :
- MUBUF_Real_si<op, ps> {
- let AssemblerPredicate=isCIOnly;
- let DecoderNamespace="CI";
+class MTBUF_Real_gfx6_gfx7<bits<4> op, MTBUF_Pseudo ps> :
+ Base_MTBUF_Real_gfx6_gfx7_gfx10<op{2-0}, ps, SIEncodingFamily.SI> {
+ let Inst{15} = ps.addr64;
+ let Inst{22-19} = dfmt;
+ let Inst{25-23} = nfmt;
}
-def BUFFER_WBINVL1_VOL_ci : MUBUF_Real_ci <0x70, BUFFER_WBINVL1_VOL>;
+let AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" in {
+ multiclass MTBUF_Real_AllAddr_gfx6_gfx7<bits<4> op> {
+ def _ADDR64_gfx6_gfx7 :
+ MTBUF_Real_gfx6_gfx7<op, !cast<MTBUF_Pseudo>(NAME#"_ADDR64")>;
+ def _BOTHEN_gfx6_gfx7 :
+ MTBUF_Real_gfx6_gfx7<op, !cast<MTBUF_Pseudo>(NAME#"_BOTHEN")>;
+ def _IDXEN_gfx6_gfx7 :
+ MTBUF_Real_gfx6_gfx7<op, !cast<MTBUF_Pseudo>(NAME#"_IDXEN")>;
+ def _OFFEN_gfx6_gfx7 :
+ MTBUF_Real_gfx6_gfx7<op, !cast<MTBUF_Pseudo>(NAME#"_OFFEN")>;
+ def _OFFSET_gfx6_gfx7 :
+ MTBUF_Real_gfx6_gfx7<op, !cast<MTBUF_Pseudo>(NAME#"_OFFSET")>;
+ }
+} // End AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7"
+
+multiclass MTBUF_Real_AllAddr_gfx6_gfx7_gfx10<bits<4> op> :
+ MTBUF_Real_AllAddr_gfx6_gfx7<op>, MTBUF_Real_AllAddr_gfx10<op>;
+defm TBUFFER_LOAD_FORMAT_X : MTBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x000>;
+defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x001>;
+defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x002>;
+defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x003>;
+defm TBUFFER_STORE_FORMAT_X : MTBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x004>;
+defm TBUFFER_STORE_FORMAT_XY : MTBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x005>;
+defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x006>;
+defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x007>;
//===----------------------------------------------------------------------===//
-// VI
+// GFX8, GFX9 (VI).
//===----------------------------------------------------------------------===//
class MUBUF_Real_vi <bits<7> op, MUBUF_Pseudo ps> :
- MUBUF_Real<op, ps>,
+ MUBUF_Real<ps>,
Enc64,
SIMCInstr<ps.PseudoInstr, SIEncodingFamily.VI> {
- let AssemblerPredicate=isVI;
- let DecoderNamespace="VI";
+ let AssemblerPredicate = isGFX8GFX9;
+ let DecoderNamespace = "GFX8";
let Inst{11-0} = !if(ps.has_offset, offset, ?);
let Inst{12} = ps.offen;
@@ -1878,7 +2130,7 @@ multiclass MUBUF_Real_AllAddr_Lds_vi<bits<7> op> {
}
class MUBUF_Real_gfx80 <bits<7> op, MUBUF_Pseudo ps> :
- MUBUF_Real<op, ps>,
+ MUBUF_Real<ps>,
Enc64,
SIMCInstr<ps.PseudoInstr, SIEncodingFamily.GFX80> {
let AssemblerPredicate=HasUnpackedD16VMem;
@@ -2002,12 +2254,19 @@ def BUFFER_STORE_LDS_DWORD_vi : MUBUF_Real_vi <0x3d, BUFFER_STORE_LDS_DWORD>;
def BUFFER_WBINVL1_vi : MUBUF_Real_vi <0x3e, BUFFER_WBINVL1>;
def BUFFER_WBINVL1_VOL_vi : MUBUF_Real_vi <0x3f, BUFFER_WBINVL1_VOL>;
+let SubtargetPredicate = HasAtomicFaddInsts in {
+
+defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Real_AllAddr_vi <0x4d>;
+defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Real_AllAddr_vi <0x4e>;
+
+} // End SubtargetPredicate = HasAtomicFaddInsts
+
class MTBUF_Real_vi <bits<4> op, MTBUF_Pseudo ps> :
MTBUF_Real<ps>,
Enc64,
SIMCInstr<ps.PseudoInstr, SIEncodingFamily.VI> {
- let AssemblerPredicate=isVI;
- let DecoderNamespace="VI";
+ let AssemblerPredicate = isGFX8GFX9;
+ let DecoderNamespace = "GFX8";
let Inst{11-0} = !if(ps.has_offset, offset, ?);
let Inst{12} = ps.offen;
diff --git a/lib/Target/AMDGPU/CaymanInstructions.td b/lib/Target/AMDGPU/CaymanInstructions.td
index ae40c6387982..1a526675164a 100644
--- a/lib/Target/AMDGPU/CaymanInstructions.td
+++ b/lib/Target/AMDGPU/CaymanInstructions.td
@@ -1,9 +1,8 @@
//===-- CaymanInstructions.td - CM Instruction defs -------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/DSInstructions.td b/lib/Target/AMDGPU/DSInstructions.td
index 31d2ebef481d..c52eaaa3fdc5 100644
--- a/lib/Target/AMDGPU/DSInstructions.td
+++ b/lib/Target/AMDGPU/DSInstructions.td
@@ -1,9 +1,8 @@
//===-- DSInstructions.td - DS Instruction Definitions --------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -11,8 +10,6 @@ class DS_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> patt
InstSI <outs, ins, "", pattern>,
SIMCInstr <opName, SIEncodingFamily.NONE> {
- let SubtargetPredicate = isGCN;
-
let LGKM_CNT = 1;
let DS = 1;
let Size = 8;
@@ -21,6 +18,7 @@ class DS_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> patt
  // Most instructions load and store data, so set this as the default.
let mayLoad = 1;
let mayStore = 1;
+ let maybeAtomic = 1;
let hasSideEffects = 0;
let SchedRW = [WriteLDS];
@@ -40,6 +38,8 @@ class DS_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> patt
bits<1> has_data0 = 1;
bits<1> has_data1 = 1;
+ bits<1> has_gws_data0 = 0; // data0 is encoded as addr
+
bits<1> has_offset = 1; // has "offset" that should be split to offset0,1
bits<1> has_offset0 = 1;
bits<1> has_offset1 = 1;
@@ -61,6 +61,7 @@ class DS_Real <DS_Pseudo ds> :
// copy relevant pseudo op flags
let SubtargetPredicate = ds.SubtargetPredicate;
+ let OtherPredicates = ds.OtherPredicates;
let AsmMatchConverter = ds.AsmMatchConverter;
// encoding fields
@@ -322,7 +323,7 @@ class DS_GWS_1D <string opName>
: DS_GWS<opName,
(ins VGPR_32:$data0, offset:$offset, gds:$gds), "$data0$offset gds"> {
- let has_data0 = 1;
+ let has_gws_data0 = 1;
}
class DS_VOID <string opName> : DS_Pseudo<opName,
@@ -469,11 +470,15 @@ defm DS_WRXCHG_RTN_B64 : DS_1A1D_RET_mc<"ds_wrxchg_rtn_b64", VReg_64>;
defm DS_WRXCHG2_RTN_B64 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2_rtn_b64", VReg_128, VReg_64>;
defm DS_WRXCHG2ST64_RTN_B64 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2st64_rtn_b64", VReg_128, VReg_64>;
-def DS_GWS_INIT : DS_GWS_1D<"ds_gws_init">;
+let isConvergent = 1, usesCustomInserter = 1 in {
+def DS_GWS_INIT : DS_GWS_1D<"ds_gws_init"> {
+ let mayLoad = 0;
+}
def DS_GWS_SEMA_V : DS_GWS_0D<"ds_gws_sema_v">;
def DS_GWS_SEMA_BR : DS_GWS_1D<"ds_gws_sema_br">;
def DS_GWS_SEMA_P : DS_GWS_0D<"ds_gws_sema_p">;
def DS_GWS_BARRIER : DS_GWS_1D<"ds_gws_barrier">;
+}
def DS_ADD_SRC2_U32 : DS_1A<"ds_add_src2_u32">;
def DS_SUB_SRC2_U32 : DS_1A<"ds_sub_src2_u32">;
@@ -550,12 +555,14 @@ def DS_ORDERED_COUNT : DS_1A_RET_GDS<"ds_ordered_count">;
// Instruction definitions for CI and newer.
//===----------------------------------------------------------------------===//
-let SubtargetPredicate = isCIVI in {
+let SubtargetPredicate = isGFX7Plus in {
defm DS_WRAP_RTN_B32 : DS_1A2D_RET_mc<"ds_wrap_rtn_b32", VGPR_32>;
defm DS_CONDXCHG32_RTN_B64 : DS_1A1D_RET_mc<"ds_condxchg32_rtn_b64", VReg_64>;
+let isConvergent = 1, usesCustomInserter = 1 in {
def DS_GWS_SEMA_RELEASE_ALL : DS_GWS_0D<"ds_gws_sema_release_all">;
+}
let mayStore = 0 in {
defm DS_READ_B96 : DS_1A_RET_mc<"ds_read_b96", VReg_96>;
@@ -569,13 +576,13 @@ defm DS_WRITE_B128 : DS_1A1D_NORET_mc<"ds_write_b128", VReg_128>;
def DS_NOP : DS_VOID<"ds_nop">;
-} // let SubtargetPredicate = isCIVI
+} // let SubtargetPredicate = isGFX7Plus
//===----------------------------------------------------------------------===//
// Instruction definitions for VI and newer.
//===----------------------------------------------------------------------===//
-let SubtargetPredicate = isVI in {
+let SubtargetPredicate = isGFX8Plus in {
let Uses = [EXEC] in {
def DS_PERMUTE_B32 : DS_1A1D_PERMUTE <"ds_permute_b32",
@@ -586,7 +593,7 @@ def DS_BPERMUTE_B32 : DS_1A1D_PERMUTE <"ds_bpermute_b32",
def DS_ADD_SRC2_F32 : DS_1A<"ds_add_src2_f32">;
-} // let SubtargetPredicate = isVI
+} // let SubtargetPredicate = isGFX8Plus
//===----------------------------------------------------------------------===//
// DS Patterns
@@ -597,9 +604,9 @@ def : GCNPat <
(DS_SWIZZLE_B32 $src, (as_i16imm $offset16), (i1 0))
>;
-class DSReadPat <DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat <
+class DSReadPat <DS_Pseudo inst, ValueType vt, PatFrag frag, int gds=0> : GCNPat <
(vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))),
- (inst $ptr, (as_i16imm $offset), (i1 0))
+ (inst $ptr, (as_i16imm $offset), (i1 gds))
>;
multiclass DSReadPat_mc<DS_Pseudo inst, ValueType vt, string frag> {
@@ -613,38 +620,21 @@ multiclass DSReadPat_mc<DS_Pseudo inst, ValueType vt, string frag> {
}
}
-
-multiclass DSReadPat_Hi16 <DS_Pseudo inst, PatFrag frag, ValueType vt = i16> {
- def : GCNPat <
- (build_vector vt:$lo, (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset)))),
- (v2i16 (inst $ptr, (as_i16imm $offset), (i1 0), $lo))
- >;
-
- def : GCNPat <
- (build_vector f16:$lo, (f16 (bitconvert (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset)))))),
- (v2f16 (inst $ptr, (as_i16imm $offset), (i1 0), $lo))
- >;
-}
-
-multiclass DSReadPat_Lo16 <DS_Pseudo inst, PatFrag frag, ValueType vt = i16> {
- def : GCNPat <
- (build_vector (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))), (vt (Hi16Elt vt:$hi))),
- (v2i16 (inst $ptr, (as_i16imm $offset), 0, $hi))
- >;
-
- def : GCNPat <
- (build_vector (f16 (bitconvert (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))))), (f16 (Hi16Elt f16:$hi))),
- (v2f16 (inst $ptr, (as_i16imm $offset), 0, $hi))
- >;
-}
+class DSReadPat_D16 <DS_Pseudo inst, PatFrag frag, ValueType vt> : GCNPat <
+ (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$in),
+ (inst $ptr, (as_i16imm $offset), (i1 0), $in)
+>;
defm : DSReadPat_mc <DS_READ_I8, i32, "sextloadi8_local">;
-defm : DSReadPat_mc <DS_READ_U8, i32, "az_extloadi8_local">;
defm : DSReadPat_mc <DS_READ_I8, i16, "sextloadi8_local">;
-defm : DSReadPat_mc <DS_READ_U8, i16, "az_extloadi8_local">;
+defm : DSReadPat_mc <DS_READ_U8, i32, "extloadi8_local">;
+defm : DSReadPat_mc <DS_READ_U8, i32, "zextloadi8_local">;
+defm : DSReadPat_mc <DS_READ_U8, i16, "extloadi8_local">;
+defm : DSReadPat_mc <DS_READ_U8, i16, "zextloadi8_local">;
defm : DSReadPat_mc <DS_READ_I16, i32, "sextloadi16_local">;
defm : DSReadPat_mc <DS_READ_I16, i32, "sextloadi16_local">;
-defm : DSReadPat_mc <DS_READ_U16, i32, "az_extloadi16_local">;
+defm : DSReadPat_mc <DS_READ_U16, i32, "extloadi16_local">;
+defm : DSReadPat_mc <DS_READ_U16, i32, "zextloadi16_local">;
defm : DSReadPat_mc <DS_READ_U16, i16, "load_local">;
defm : DSReadPat_mc <DS_READ_B32, i32, "load_local">;
defm : DSReadPat_mc <DS_READ_B32, i32, "atomic_load_32_local">;
@@ -658,21 +648,24 @@ defm : DSReadPat_mc <DS_READ_B128, v4i32, "load_align16_local">;
} // End AddedComplexity = 100
let OtherPredicates = [D16PreservesUnusedBits] in {
-let AddedComplexity = 100 in {
-defm : DSReadPat_Hi16<DS_READ_U16_D16_HI, load_local>;
-defm : DSReadPat_Hi16<DS_READ_U8_D16_HI, az_extloadi8_local>;
-defm : DSReadPat_Hi16<DS_READ_I8_D16_HI, sextloadi8_local>;
-
-defm : DSReadPat_Lo16<DS_READ_U16_D16, load_local>;
-defm : DSReadPat_Lo16<DS_READ_U8_D16, az_extloadi8_local>;
-defm : DSReadPat_Lo16<DS_READ_I8_D16, sextloadi8_local>;
-
-}
+def : DSReadPat_D16<DS_READ_U16_D16_HI, load_d16_hi_local, v2i16>;
+def : DSReadPat_D16<DS_READ_U16_D16_HI, load_d16_hi_local, v2f16>;
+def : DSReadPat_D16<DS_READ_U8_D16_HI, az_extloadi8_d16_hi_local, v2i16>;
+def : DSReadPat_D16<DS_READ_U8_D16_HI, az_extloadi8_d16_hi_local, v2f16>;
+def : DSReadPat_D16<DS_READ_I8_D16_HI, sextloadi8_d16_hi_local, v2i16>;
+def : DSReadPat_D16<DS_READ_I8_D16_HI, sextloadi8_d16_hi_local, v2f16>;
+
+def : DSReadPat_D16<DS_READ_U16_D16, load_d16_lo_local, v2i16>;
+def : DSReadPat_D16<DS_READ_U16_D16, load_d16_lo_local, v2f16>;
+def : DSReadPat_D16<DS_READ_U8_D16, az_extloadi8_d16_lo_local, v2i16>;
+def : DSReadPat_D16<DS_READ_U8_D16, az_extloadi8_d16_lo_local, v2f16>;
+def : DSReadPat_D16<DS_READ_I8_D16, sextloadi8_d16_lo_local, v2i16>;
+def : DSReadPat_D16<DS_READ_I8_D16, sextloadi8_d16_lo_local, v2f16>;
}
-class DSWritePat <DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat <
+class DSWritePat <DS_Pseudo inst, ValueType vt, PatFrag frag, int gds=0> : GCNPat <
(frag vt:$value, (DS1Addr1Offset i32:$ptr, i32:$offset)),
- (inst $ptr, $value, (as_i16imm $offset), (i1 0))
+ (inst $ptr, $value, (as_i16imm $offset), (i1 gds))
>;
multiclass DSWritePat_mc <DS_Pseudo inst, ValueType vt, string frag> {
@@ -730,7 +723,7 @@ class DS64Bit4ByteAlignedWritePat<DS_Pseudo inst, PatFrag frag> : GCNPat<
// v2i32 loads are split into i32 loads on SI during lowering, due to a bug
// related to bounds checking.
-let OtherPredicates = [LDSRequiresM0Init, isCIVI] in {
+let OtherPredicates = [LDSRequiresM0Init, isGFX7Plus] in {
def : DS64Bit4ByteAlignedReadPat<DS_READ2_B32, load_local_m0>;
def : DS64Bit4ByteAlignedWritePat<DS_WRITE2_B32, store_local_m0>;
}
@@ -747,260 +740,313 @@ defm : DSWritePat_mc <DS_WRITE_B64, v2i32, "store_align8_local">;
defm : DSWritePat_mc <DS_WRITE_B128, v4i32, "store_align16_local">;
} // End AddedComplexity = 100
-class DSAtomicRetPat<DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat <
+class DSAtomicRetPat<DS_Pseudo inst, ValueType vt, PatFrag frag, bit gds=0> : GCNPat <
(frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value),
- (inst $ptr, $value, (as_i16imm $offset), (i1 0))
+ (inst $ptr, $value, (as_i16imm $offset), (i1 gds))
>;
multiclass DSAtomicRetPat_mc<DS_Pseudo inst, ValueType vt, string frag> {
let OtherPredicates = [LDSRequiresM0Init] in {
- def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_m0")>;
+ def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_local_m0")>;
}
let OtherPredicates = [NotLDSRequiresM0Init] in {
def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt,
- !cast<PatFrag>(frag)>;
+ !cast<PatFrag>(frag#"_local")>;
}
+
+ def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_region_m0"), 1>;
}
-class DSAtomicCmpXChg<DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat <
+class DSAtomicCmpXChg<DS_Pseudo inst, ValueType vt, PatFrag frag, bit gds=0> : GCNPat <
(frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$cmp, vt:$swap),
- (inst $ptr, $cmp, $swap, (as_i16imm $offset), (i1 0))
+ (inst $ptr, $cmp, $swap, (as_i16imm $offset), (i1 gds))
>;
multiclass DSAtomicCmpXChg_mc<DS_Pseudo inst, ValueType vt, string frag> {
let OtherPredicates = [LDSRequiresM0Init] in {
- def : DSAtomicCmpXChg<inst, vt, !cast<PatFrag>(frag#"_m0")>;
+ def : DSAtomicCmpXChg<inst, vt, !cast<PatFrag>(frag#"_local_m0")>;
}
let OtherPredicates = [NotLDSRequiresM0Init] in {
def : DSAtomicCmpXChg<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt,
- !cast<PatFrag>(frag)>;
+ !cast<PatFrag>(frag#"_local")>;
}
+
+ def : DSAtomicCmpXChg<inst, vt, !cast<PatFrag>(frag#"_region_m0"), 1>;
}
// 32-bit atomics.
-defm : DSAtomicRetPat_mc<DS_WRXCHG_RTN_B32, i32, "atomic_swap_local">;
-defm : DSAtomicRetPat_mc<DS_ADD_RTN_U32, i32, "atomic_load_add_local">;
-defm : DSAtomicRetPat_mc<DS_SUB_RTN_U32, i32, "atomic_load_sub_local">;
-defm : DSAtomicRetPat_mc<DS_INC_RTN_U32, i32, "atomic_inc_local">;
-defm : DSAtomicRetPat_mc<DS_DEC_RTN_U32, i32, "atomic_dec_local">;
-defm : DSAtomicRetPat_mc<DS_AND_RTN_B32, i32, "atomic_load_and_local">;
-defm : DSAtomicRetPat_mc<DS_OR_RTN_B32, i32, "atomic_load_or_local">;
-defm : DSAtomicRetPat_mc<DS_XOR_RTN_B32, i32, "atomic_load_xor_local">;
-defm : DSAtomicRetPat_mc<DS_MIN_RTN_I32, i32, "atomic_load_min_local">;
-defm : DSAtomicRetPat_mc<DS_MAX_RTN_I32, i32, "atomic_load_max_local">;
-defm : DSAtomicRetPat_mc<DS_MIN_RTN_U32, i32, "atomic_load_umin_local">;
-defm : DSAtomicRetPat_mc<DS_MAX_RTN_U32, i32, "atomic_load_umax_local">;
-defm : DSAtomicCmpXChg_mc<DS_CMPST_RTN_B32, i32, "atomic_cmp_swap_local">;
-defm : DSAtomicRetPat_mc<DS_MIN_RTN_F32, f32, "atomic_load_fmin_local">;
-defm : DSAtomicRetPat_mc<DS_MAX_RTN_F32, f32, "atomic_load_fmax_local">;
-defm : DSAtomicRetPat_mc<DS_ADD_RTN_F32, f32, "atomic_load_fadd_local">;
+defm : DSAtomicRetPat_mc<DS_WRXCHG_RTN_B32, i32, "atomic_swap">;
+defm : DSAtomicRetPat_mc<DS_ADD_RTN_U32, i32, "atomic_load_add">;
+defm : DSAtomicRetPat_mc<DS_SUB_RTN_U32, i32, "atomic_load_sub">;
+defm : DSAtomicRetPat_mc<DS_INC_RTN_U32, i32, "atomic_inc">;
+defm : DSAtomicRetPat_mc<DS_DEC_RTN_U32, i32, "atomic_dec">;
+defm : DSAtomicRetPat_mc<DS_AND_RTN_B32, i32, "atomic_load_and">;
+defm : DSAtomicRetPat_mc<DS_OR_RTN_B32, i32, "atomic_load_or">;
+defm : DSAtomicRetPat_mc<DS_XOR_RTN_B32, i32, "atomic_load_xor">;
+defm : DSAtomicRetPat_mc<DS_MIN_RTN_I32, i32, "atomic_load_min">;
+defm : DSAtomicRetPat_mc<DS_MAX_RTN_I32, i32, "atomic_load_max">;
+defm : DSAtomicRetPat_mc<DS_MIN_RTN_U32, i32, "atomic_load_umin">;
+defm : DSAtomicRetPat_mc<DS_MAX_RTN_U32, i32, "atomic_load_umax">;
+defm : DSAtomicCmpXChg_mc<DS_CMPST_RTN_B32, i32, "atomic_cmp_swap">;
+defm : DSAtomicRetPat_mc<DS_MIN_RTN_F32, f32, "atomic_load_fmin">;
+defm : DSAtomicRetPat_mc<DS_MAX_RTN_F32, f32, "atomic_load_fmax">;
+defm : DSAtomicRetPat_mc<DS_ADD_RTN_F32, f32, "atomic_load_fadd">;
// 64-bit atomics.
-defm : DSAtomicRetPat_mc<DS_WRXCHG_RTN_B64, i64, "atomic_swap_local">;
-defm : DSAtomicRetPat_mc<DS_ADD_RTN_U64, i64, "atomic_load_add_local">;
-defm : DSAtomicRetPat_mc<DS_SUB_RTN_U64, i64, "atomic_load_sub_local">;
-defm : DSAtomicRetPat_mc<DS_INC_RTN_U64, i64, "atomic_inc_local">;
-defm : DSAtomicRetPat_mc<DS_DEC_RTN_U64, i64, "atomic_dec_local">;
-defm : DSAtomicRetPat_mc<DS_AND_RTN_B64, i64, "atomic_load_and_local">;
-defm : DSAtomicRetPat_mc<DS_OR_RTN_B64, i64, "atomic_load_or_local">;
-defm : DSAtomicRetPat_mc<DS_XOR_RTN_B64, i64, "atomic_load_xor_local">;
-defm : DSAtomicRetPat_mc<DS_MIN_RTN_I64, i64, "atomic_load_min_local">;
-defm : DSAtomicRetPat_mc<DS_MAX_RTN_I64, i64, "atomic_load_max_local">;
-defm : DSAtomicRetPat_mc<DS_MIN_RTN_U64, i64, "atomic_load_umin_local">;
-defm : DSAtomicRetPat_mc<DS_MAX_RTN_U64, i64, "atomic_load_umax_local">;
-
-defm : DSAtomicCmpXChg_mc<DS_CMPST_RTN_B64, i64, "atomic_cmp_swap_local">;
+defm : DSAtomicRetPat_mc<DS_WRXCHG_RTN_B64, i64, "atomic_swap">;
+defm : DSAtomicRetPat_mc<DS_ADD_RTN_U64, i64, "atomic_load_add">;
+defm : DSAtomicRetPat_mc<DS_SUB_RTN_U64, i64, "atomic_load_sub">;
+defm : DSAtomicRetPat_mc<DS_INC_RTN_U64, i64, "atomic_inc">;
+defm : DSAtomicRetPat_mc<DS_DEC_RTN_U64, i64, "atomic_dec">;
+defm : DSAtomicRetPat_mc<DS_AND_RTN_B64, i64, "atomic_load_and">;
+defm : DSAtomicRetPat_mc<DS_OR_RTN_B64, i64, "atomic_load_or">;
+defm : DSAtomicRetPat_mc<DS_XOR_RTN_B64, i64, "atomic_load_xor">;
+defm : DSAtomicRetPat_mc<DS_MIN_RTN_I64, i64, "atomic_load_min">;
+defm : DSAtomicRetPat_mc<DS_MAX_RTN_I64, i64, "atomic_load_max">;
+defm : DSAtomicRetPat_mc<DS_MIN_RTN_U64, i64, "atomic_load_umin">;
+defm : DSAtomicRetPat_mc<DS_MAX_RTN_U64, i64, "atomic_load_umax">;
+
+defm : DSAtomicCmpXChg_mc<DS_CMPST_RTN_B64, i64, "atomic_cmp_swap">;
+
+def : Pat <
+ (SIds_ordered_count i32:$value, i16:$offset),
+ (DS_ORDERED_COUNT $value, (as_i16imm $offset))
+>;
//===----------------------------------------------------------------------===//
-// Real instructions
+// Target-specific instruction encodings.
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
-// SIInstructions.td
+// Base ENC_DS for GFX6, GFX7, GFX10.
//===----------------------------------------------------------------------===//
-class DS_Real_si <bits<8> op, DS_Pseudo ds> :
- DS_Real <ds>,
- SIMCInstr <ds.Mnemonic, SIEncodingFamily.SI> {
- let AssemblerPredicates=[isSICI];
- let DecoderNamespace="SICI";
+class Base_DS_Real_gfx6_gfx7_gfx10<bits<8> op, DS_Pseudo ps, int ef> :
+ DS_Real<ps>, SIMCInstr <ps.Mnemonic, ef> {
- // encoding
- let Inst{7-0} = !if(ds.has_offset0, offset0, 0);
- let Inst{15-8} = !if(ds.has_offset1, offset1, 0);
- let Inst{17} = !if(ds.has_gds, gds, ds.gdsValue);
+ let Inst{7-0} = !if(ps.has_offset0, offset0, 0);
+ let Inst{15-8} = !if(ps.has_offset1, offset1, 0);
+ let Inst{17} = !if(ps.has_gds, gds, ps.gdsValue);
let Inst{25-18} = op;
- let Inst{31-26} = 0x36; // ds prefix
- let Inst{39-32} = !if(ds.has_addr, addr, 0);
- let Inst{47-40} = !if(ds.has_data0, data0, 0);
- let Inst{55-48} = !if(ds.has_data1, data1, 0);
- let Inst{63-56} = !if(ds.has_vdst, vdst, 0);
+ let Inst{31-26} = 0x36;
+ let Inst{39-32} = !if(ps.has_addr, addr, !if(ps.has_gws_data0, data0, 0));
+ let Inst{47-40} = !if(ps.has_data0, data0, 0);
+ let Inst{55-48} = !if(ps.has_data1, data1, 0);
+ let Inst{63-56} = !if(ps.has_vdst, vdst, 0);
}
-def DS_ADD_U32_si : DS_Real_si<0x0, DS_ADD_U32>;
-def DS_SUB_U32_si : DS_Real_si<0x1, DS_SUB_U32>;
-def DS_RSUB_U32_si : DS_Real_si<0x2, DS_RSUB_U32>;
-def DS_INC_U32_si : DS_Real_si<0x3, DS_INC_U32>;
-def DS_DEC_U32_si : DS_Real_si<0x4, DS_DEC_U32>;
-def DS_MIN_I32_si : DS_Real_si<0x5, DS_MIN_I32>;
-def DS_MAX_I32_si : DS_Real_si<0x6, DS_MAX_I32>;
-def DS_MIN_U32_si : DS_Real_si<0x7, DS_MIN_U32>;
-def DS_MAX_U32_si : DS_Real_si<0x8, DS_MAX_U32>;
-def DS_AND_B32_si : DS_Real_si<0x9, DS_AND_B32>;
-def DS_OR_B32_si : DS_Real_si<0xa, DS_OR_B32>;
-def DS_XOR_B32_si : DS_Real_si<0xb, DS_XOR_B32>;
-def DS_MSKOR_B32_si : DS_Real_si<0xc, DS_MSKOR_B32>;
-def DS_WRITE_B32_si : DS_Real_si<0xd, DS_WRITE_B32>;
-def DS_WRITE2_B32_si : DS_Real_si<0xe, DS_WRITE2_B32>;
-def DS_WRITE2ST64_B32_si : DS_Real_si<0xf, DS_WRITE2ST64_B32>;
-def DS_CMPST_B32_si : DS_Real_si<0x10, DS_CMPST_B32>;
-def DS_CMPST_F32_si : DS_Real_si<0x11, DS_CMPST_F32>;
-def DS_MIN_F32_si : DS_Real_si<0x12, DS_MIN_F32>;
-def DS_MAX_F32_si : DS_Real_si<0x13, DS_MAX_F32>;
-def DS_NOP_si : DS_Real_si<0x14, DS_NOP>;
-def DS_GWS_INIT_si : DS_Real_si<0x19, DS_GWS_INIT>;
-def DS_GWS_SEMA_V_si : DS_Real_si<0x1a, DS_GWS_SEMA_V>;
-def DS_GWS_SEMA_BR_si : DS_Real_si<0x1b, DS_GWS_SEMA_BR>;
-def DS_GWS_SEMA_P_si : DS_Real_si<0x1c, DS_GWS_SEMA_P>;
-def DS_GWS_BARRIER_si : DS_Real_si<0x1d, DS_GWS_BARRIER>;
-def DS_WRITE_B8_si : DS_Real_si<0x1e, DS_WRITE_B8>;
-def DS_WRITE_B16_si : DS_Real_si<0x1f, DS_WRITE_B16>;
-def DS_ADD_RTN_U32_si : DS_Real_si<0x20, DS_ADD_RTN_U32>;
-def DS_SUB_RTN_U32_si : DS_Real_si<0x21, DS_SUB_RTN_U32>;
-def DS_RSUB_RTN_U32_si : DS_Real_si<0x22, DS_RSUB_RTN_U32>;
-def DS_INC_RTN_U32_si : DS_Real_si<0x23, DS_INC_RTN_U32>;
-def DS_DEC_RTN_U32_si : DS_Real_si<0x24, DS_DEC_RTN_U32>;
-def DS_MIN_RTN_I32_si : DS_Real_si<0x25, DS_MIN_RTN_I32>;
-def DS_MAX_RTN_I32_si : DS_Real_si<0x26, DS_MAX_RTN_I32>;
-def DS_MIN_RTN_U32_si : DS_Real_si<0x27, DS_MIN_RTN_U32>;
-def DS_MAX_RTN_U32_si : DS_Real_si<0x28, DS_MAX_RTN_U32>;
-def DS_AND_RTN_B32_si : DS_Real_si<0x29, DS_AND_RTN_B32>;
-def DS_OR_RTN_B32_si : DS_Real_si<0x2a, DS_OR_RTN_B32>;
-def DS_XOR_RTN_B32_si : DS_Real_si<0x2b, DS_XOR_RTN_B32>;
-def DS_MSKOR_RTN_B32_si : DS_Real_si<0x2c, DS_MSKOR_RTN_B32>;
-def DS_WRXCHG_RTN_B32_si : DS_Real_si<0x2d, DS_WRXCHG_RTN_B32>;
-def DS_WRXCHG2_RTN_B32_si : DS_Real_si<0x2e, DS_WRXCHG2_RTN_B32>;
-def DS_WRXCHG2ST64_RTN_B32_si : DS_Real_si<0x2f, DS_WRXCHG2ST64_RTN_B32>;
-def DS_CMPST_RTN_B32_si : DS_Real_si<0x30, DS_CMPST_RTN_B32>;
-def DS_CMPST_RTN_F32_si : DS_Real_si<0x31, DS_CMPST_RTN_F32>;
-def DS_MIN_RTN_F32_si : DS_Real_si<0x32, DS_MIN_RTN_F32>;
-def DS_MAX_RTN_F32_si : DS_Real_si<0x33, DS_MAX_RTN_F32>;
-
-// These instruction are CI/VI only
-def DS_WRAP_RTN_B32_si : DS_Real_si<0x34, DS_WRAP_RTN_B32>;
-def DS_CONDXCHG32_RTN_B64_si : DS_Real_si<0x7e, DS_CONDXCHG32_RTN_B64>;
-def DS_GWS_SEMA_RELEASE_ALL_si : DS_Real_si<0x18, DS_GWS_SEMA_RELEASE_ALL>;
-
-def DS_SWIZZLE_B32_si : DS_Real_si<0x35, DS_SWIZZLE_B32>;
-def DS_READ_B32_si : DS_Real_si<0x36, DS_READ_B32>;
-def DS_READ2_B32_si : DS_Real_si<0x37, DS_READ2_B32>;
-def DS_READ2ST64_B32_si : DS_Real_si<0x38, DS_READ2ST64_B32>;
-def DS_READ_I8_si : DS_Real_si<0x39, DS_READ_I8>;
-def DS_READ_U8_si : DS_Real_si<0x3a, DS_READ_U8>;
-def DS_READ_I16_si : DS_Real_si<0x3b, DS_READ_I16>;
-def DS_READ_U16_si : DS_Real_si<0x3c, DS_READ_U16>;
-def DS_CONSUME_si : DS_Real_si<0x3d, DS_CONSUME>;
-def DS_APPEND_si : DS_Real_si<0x3e, DS_APPEND>;
-def DS_ORDERED_COUNT_si : DS_Real_si<0x3f, DS_ORDERED_COUNT>;
-def DS_ADD_U64_si : DS_Real_si<0x40, DS_ADD_U64>;
-def DS_SUB_U64_si : DS_Real_si<0x41, DS_SUB_U64>;
-def DS_RSUB_U64_si : DS_Real_si<0x42, DS_RSUB_U64>;
-def DS_INC_U64_si : DS_Real_si<0x43, DS_INC_U64>;
-def DS_DEC_U64_si : DS_Real_si<0x44, DS_DEC_U64>;
-def DS_MIN_I64_si : DS_Real_si<0x45, DS_MIN_I64>;
-def DS_MAX_I64_si : DS_Real_si<0x46, DS_MAX_I64>;
-def DS_MIN_U64_si : DS_Real_si<0x47, DS_MIN_U64>;
-def DS_MAX_U64_si : DS_Real_si<0x48, DS_MAX_U64>;
-def DS_AND_B64_si : DS_Real_si<0x49, DS_AND_B64>;
-def DS_OR_B64_si : DS_Real_si<0x4a, DS_OR_B64>;
-def DS_XOR_B64_si : DS_Real_si<0x4b, DS_XOR_B64>;
-def DS_MSKOR_B64_si : DS_Real_si<0x4c, DS_MSKOR_B64>;
-def DS_WRITE_B64_si : DS_Real_si<0x4d, DS_WRITE_B64>;
-def DS_WRITE2_B64_si : DS_Real_si<0x4E, DS_WRITE2_B64>;
-def DS_WRITE2ST64_B64_si : DS_Real_si<0x4f, DS_WRITE2ST64_B64>;
-def DS_CMPST_B64_si : DS_Real_si<0x50, DS_CMPST_B64>;
-def DS_CMPST_F64_si : DS_Real_si<0x51, DS_CMPST_F64>;
-def DS_MIN_F64_si : DS_Real_si<0x52, DS_MIN_F64>;
-def DS_MAX_F64_si : DS_Real_si<0x53, DS_MAX_F64>;
-
-def DS_ADD_RTN_U64_si : DS_Real_si<0x60, DS_ADD_RTN_U64>;
-def DS_SUB_RTN_U64_si : DS_Real_si<0x61, DS_SUB_RTN_U64>;
-def DS_RSUB_RTN_U64_si : DS_Real_si<0x62, DS_RSUB_RTN_U64>;
-def DS_INC_RTN_U64_si : DS_Real_si<0x63, DS_INC_RTN_U64>;
-def DS_DEC_RTN_U64_si : DS_Real_si<0x64, DS_DEC_RTN_U64>;
-def DS_MIN_RTN_I64_si : DS_Real_si<0x65, DS_MIN_RTN_I64>;
-def DS_MAX_RTN_I64_si : DS_Real_si<0x66, DS_MAX_RTN_I64>;
-def DS_MIN_RTN_U64_si : DS_Real_si<0x67, DS_MIN_RTN_U64>;
-def DS_MAX_RTN_U64_si : DS_Real_si<0x68, DS_MAX_RTN_U64>;
-def DS_AND_RTN_B64_si : DS_Real_si<0x69, DS_AND_RTN_B64>;
-def DS_OR_RTN_B64_si : DS_Real_si<0x6a, DS_OR_RTN_B64>;
-def DS_XOR_RTN_B64_si : DS_Real_si<0x6b, DS_XOR_RTN_B64>;
-def DS_MSKOR_RTN_B64_si : DS_Real_si<0x6c, DS_MSKOR_RTN_B64>;
-def DS_WRXCHG_RTN_B64_si : DS_Real_si<0x6d, DS_WRXCHG_RTN_B64>;
-def DS_WRXCHG2_RTN_B64_si : DS_Real_si<0x6e, DS_WRXCHG2_RTN_B64>;
-def DS_WRXCHG2ST64_RTN_B64_si : DS_Real_si<0x6f, DS_WRXCHG2ST64_RTN_B64>;
-def DS_CMPST_RTN_B64_si : DS_Real_si<0x70, DS_CMPST_RTN_B64>;
-def DS_CMPST_RTN_F64_si : DS_Real_si<0x71, DS_CMPST_RTN_F64>;
-def DS_MIN_RTN_F64_si : DS_Real_si<0x72, DS_MIN_RTN_F64>;
-def DS_MAX_RTN_F64_si : DS_Real_si<0x73, DS_MAX_RTN_F64>;
-
-def DS_READ_B64_si : DS_Real_si<0x76, DS_READ_B64>;
-def DS_READ2_B64_si : DS_Real_si<0x77, DS_READ2_B64>;
-def DS_READ2ST64_B64_si : DS_Real_si<0x78, DS_READ2ST64_B64>;
-
-def DS_ADD_SRC2_U32_si : DS_Real_si<0x80, DS_ADD_SRC2_U32>;
-def DS_SUB_SRC2_U32_si : DS_Real_si<0x81, DS_SUB_SRC2_U32>;
-def DS_RSUB_SRC2_U32_si : DS_Real_si<0x82, DS_RSUB_SRC2_U32>;
-def DS_INC_SRC2_U32_si : DS_Real_si<0x83, DS_INC_SRC2_U32>;
-def DS_DEC_SRC2_U32_si : DS_Real_si<0x84, DS_DEC_SRC2_U32>;
-def DS_MIN_SRC2_I32_si : DS_Real_si<0x85, DS_MIN_SRC2_I32>;
-def DS_MAX_SRC2_I32_si : DS_Real_si<0x86, DS_MAX_SRC2_I32>;
-def DS_MIN_SRC2_U32_si : DS_Real_si<0x87, DS_MIN_SRC2_U32>;
-def DS_MAX_SRC2_U32_si : DS_Real_si<0x88, DS_MAX_SRC2_U32>;
-def DS_AND_SRC2_B32_si : DS_Real_si<0x89, DS_AND_SRC2_B32>;
-def DS_OR_SRC2_B32_si : DS_Real_si<0x8a, DS_OR_SRC2_B32>;
-def DS_XOR_SRC2_B32_si : DS_Real_si<0x8b, DS_XOR_SRC2_B32>;
-def DS_WRITE_SRC2_B32_si : DS_Real_si<0x8d, DS_WRITE_SRC2_B32>;
-
-def DS_MIN_SRC2_F32_si : DS_Real_si<0x92, DS_MIN_SRC2_F32>;
-def DS_MAX_SRC2_F32_si : DS_Real_si<0x93, DS_MAX_SRC2_F32>;
-
-def DS_ADD_SRC2_U64_si : DS_Real_si<0xc0, DS_ADD_SRC2_U64>;
-def DS_SUB_SRC2_U64_si : DS_Real_si<0xc1, DS_SUB_SRC2_U64>;
-def DS_RSUB_SRC2_U64_si : DS_Real_si<0xc2, DS_RSUB_SRC2_U64>;
-def DS_INC_SRC2_U64_si : DS_Real_si<0xc3, DS_INC_SRC2_U64>;
-def DS_DEC_SRC2_U64_si : DS_Real_si<0xc4, DS_DEC_SRC2_U64>;
-def DS_MIN_SRC2_I64_si : DS_Real_si<0xc5, DS_MIN_SRC2_I64>;
-def DS_MAX_SRC2_I64_si : DS_Real_si<0xc6, DS_MAX_SRC2_I64>;
-def DS_MIN_SRC2_U64_si : DS_Real_si<0xc7, DS_MIN_SRC2_U64>;
-def DS_MAX_SRC2_U64_si : DS_Real_si<0xc8, DS_MAX_SRC2_U64>;
-def DS_AND_SRC2_B64_si : DS_Real_si<0xc9, DS_AND_SRC2_B64>;
-def DS_OR_SRC2_B64_si : DS_Real_si<0xca, DS_OR_SRC2_B64>;
-def DS_XOR_SRC2_B64_si : DS_Real_si<0xcb, DS_XOR_SRC2_B64>;
-def DS_WRITE_SRC2_B64_si : DS_Real_si<0xcd, DS_WRITE_SRC2_B64>;
-
-def DS_MIN_SRC2_F64_si : DS_Real_si<0xd2, DS_MIN_SRC2_F64>;
-def DS_MAX_SRC2_F64_si : DS_Real_si<0xd3, DS_MAX_SRC2_F64>;
-def DS_WRITE_B96_si : DS_Real_si<0xde, DS_WRITE_B96>;
-def DS_WRITE_B128_si : DS_Real_si<0xdf, DS_WRITE_B128>;
-def DS_READ_B96_si : DS_Real_si<0xfe, DS_READ_B96>;
-def DS_READ_B128_si : DS_Real_si<0xff, DS_READ_B128>;
+//===----------------------------------------------------------------------===//
+// GFX10.
+//===----------------------------------------------------------------------===//
+
+let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
+ multiclass DS_Real_gfx10<bits<8> op> {
+ def _gfx10 : Base_DS_Real_gfx6_gfx7_gfx10<op, !cast<DS_Pseudo>(NAME),
+ SIEncodingFamily.GFX10>;
+ }
+} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10"
+
+defm DS_ADD_F32 : DS_Real_gfx10<0x015>;
+defm DS_ADD_RTN_F32 : DS_Real_gfx10<0x055>;
+defm DS_ADD_SRC2_F32 : DS_Real_gfx10<0x095>;
+defm DS_WRITE_B8_D16_HI : DS_Real_gfx10<0x0a0>;
+defm DS_WRITE_B16_D16_HI : DS_Real_gfx10<0x0a1>;
+defm DS_READ_U8_D16 : DS_Real_gfx10<0x0a2>;
+defm DS_READ_U8_D16_HI : DS_Real_gfx10<0x0a3>;
+defm DS_READ_I8_D16 : DS_Real_gfx10<0x0a4>;
+defm DS_READ_I8_D16_HI : DS_Real_gfx10<0x0a5>;
+defm DS_READ_U16_D16 : DS_Real_gfx10<0x0a6>;
+defm DS_READ_U16_D16_HI : DS_Real_gfx10<0x0a7>;
+defm DS_WRITE_ADDTID_B32 : DS_Real_gfx10<0x0b0>;
+defm DS_READ_ADDTID_B32 : DS_Real_gfx10<0x0b1>;
+defm DS_PERMUTE_B32 : DS_Real_gfx10<0x0b2>;
+defm DS_BPERMUTE_B32 : DS_Real_gfx10<0x0b3>;
+
+//===----------------------------------------------------------------------===//
+// GFX7, GFX10.
+//===----------------------------------------------------------------------===//
+
+let AssemblerPredicate = isGFX7Only, DecoderNamespace = "GFX7" in {
+ multiclass DS_Real_gfx7<bits<8> op> {
+ def _gfx7 : Base_DS_Real_gfx6_gfx7_gfx10<op, !cast<DS_Pseudo>(NAME),
+ SIEncodingFamily.SI>;
+ }
+} // End AssemblerPredicate = isGFX7Only, DecoderNamespace = "GFX7"
+
+multiclass DS_Real_gfx7_gfx10<bits<8> op> :
+ DS_Real_gfx7<op>, DS_Real_gfx10<op>;
+
+// FIXME-GFX7: Add tests when upstreaming this part.
+defm DS_GWS_SEMA_RELEASE_ALL : DS_Real_gfx7_gfx10<0x018>;
+defm DS_WRAP_RTN_B32 : DS_Real_gfx7_gfx10<0x034>;
+defm DS_CONDXCHG32_RTN_B64 : DS_Real_gfx7_gfx10<0x07e>;
+defm DS_WRITE_B96 : DS_Real_gfx7_gfx10<0x0de>;
+defm DS_WRITE_B128 : DS_Real_gfx7_gfx10<0x0df>;
+defm DS_READ_B96 : DS_Real_gfx7_gfx10<0x0fe>;
+defm DS_READ_B128 : DS_Real_gfx7_gfx10<0x0ff>;
+
+//===----------------------------------------------------------------------===//
+// GFX6, GFX7, GFX10.
+//===----------------------------------------------------------------------===//
+
+let AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" in {
+ multiclass DS_Real_gfx6_gfx7<bits<8> op> {
+ def _gfx6_gfx7 : Base_DS_Real_gfx6_gfx7_gfx10<op, !cast<DS_Pseudo>(NAME),
+ SIEncodingFamily.SI>;
+ }
+} // End AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7"
+
+multiclass DS_Real_gfx6_gfx7_gfx10<bits<8> op> :
+ DS_Real_gfx6_gfx7<op>, DS_Real_gfx10<op>;
+
+defm DS_ADD_U32 : DS_Real_gfx6_gfx7_gfx10<0x000>;
+defm DS_SUB_U32 : DS_Real_gfx6_gfx7_gfx10<0x001>;
+defm DS_RSUB_U32 : DS_Real_gfx6_gfx7_gfx10<0x002>;
+defm DS_INC_U32 : DS_Real_gfx6_gfx7_gfx10<0x003>;
+defm DS_DEC_U32 : DS_Real_gfx6_gfx7_gfx10<0x004>;
+defm DS_MIN_I32 : DS_Real_gfx6_gfx7_gfx10<0x005>;
+defm DS_MAX_I32 : DS_Real_gfx6_gfx7_gfx10<0x006>;
+defm DS_MIN_U32 : DS_Real_gfx6_gfx7_gfx10<0x007>;
+defm DS_MAX_U32 : DS_Real_gfx6_gfx7_gfx10<0x008>;
+defm DS_AND_B32 : DS_Real_gfx6_gfx7_gfx10<0x009>;
+defm DS_OR_B32 : DS_Real_gfx6_gfx7_gfx10<0x00a>;
+defm DS_XOR_B32 : DS_Real_gfx6_gfx7_gfx10<0x00b>;
+defm DS_MSKOR_B32 : DS_Real_gfx6_gfx7_gfx10<0x00c>;
+defm DS_WRITE_B32 : DS_Real_gfx6_gfx7_gfx10<0x00d>;
+defm DS_WRITE2_B32 : DS_Real_gfx6_gfx7_gfx10<0x00e>;
+defm DS_WRITE2ST64_B32 : DS_Real_gfx6_gfx7_gfx10<0x00f>;
+defm DS_CMPST_B32 : DS_Real_gfx6_gfx7_gfx10<0x010>;
+defm DS_CMPST_F32 : DS_Real_gfx6_gfx7_gfx10<0x011>;
+defm DS_MIN_F32 : DS_Real_gfx6_gfx7_gfx10<0x012>;
+defm DS_MAX_F32 : DS_Real_gfx6_gfx7_gfx10<0x013>;
+defm DS_NOP : DS_Real_gfx6_gfx7_gfx10<0x014>;
+defm DS_GWS_INIT : DS_Real_gfx6_gfx7_gfx10<0x019>;
+defm DS_GWS_SEMA_V : DS_Real_gfx6_gfx7_gfx10<0x01a>;
+defm DS_GWS_SEMA_BR : DS_Real_gfx6_gfx7_gfx10<0x01b>;
+defm DS_GWS_SEMA_P : DS_Real_gfx6_gfx7_gfx10<0x01c>;
+defm DS_GWS_BARRIER : DS_Real_gfx6_gfx7_gfx10<0x01d>;
+defm DS_WRITE_B8 : DS_Real_gfx6_gfx7_gfx10<0x01e>;
+defm DS_WRITE_B16 : DS_Real_gfx6_gfx7_gfx10<0x01f>;
+defm DS_ADD_RTN_U32 : DS_Real_gfx6_gfx7_gfx10<0x020>;
+defm DS_SUB_RTN_U32 : DS_Real_gfx6_gfx7_gfx10<0x021>;
+defm DS_RSUB_RTN_U32 : DS_Real_gfx6_gfx7_gfx10<0x022>;
+defm DS_INC_RTN_U32 : DS_Real_gfx6_gfx7_gfx10<0x023>;
+defm DS_DEC_RTN_U32 : DS_Real_gfx6_gfx7_gfx10<0x024>;
+defm DS_MIN_RTN_I32 : DS_Real_gfx6_gfx7_gfx10<0x025>;
+defm DS_MAX_RTN_I32 : DS_Real_gfx6_gfx7_gfx10<0x026>;
+defm DS_MIN_RTN_U32 : DS_Real_gfx6_gfx7_gfx10<0x027>;
+defm DS_MAX_RTN_U32 : DS_Real_gfx6_gfx7_gfx10<0x028>;
+defm DS_AND_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x029>;
+defm DS_OR_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x02a>;
+defm DS_XOR_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x02b>;
+defm DS_MSKOR_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x02c>;
+defm DS_WRXCHG_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x02d>;
+defm DS_WRXCHG2_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x02e>;
+defm DS_WRXCHG2ST64_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x02f>;
+defm DS_CMPST_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x030>;
+defm DS_CMPST_RTN_F32 : DS_Real_gfx6_gfx7_gfx10<0x031>;
+defm DS_MIN_RTN_F32 : DS_Real_gfx6_gfx7_gfx10<0x032>;
+defm DS_MAX_RTN_F32 : DS_Real_gfx6_gfx7_gfx10<0x033>;
+defm DS_SWIZZLE_B32 : DS_Real_gfx6_gfx7_gfx10<0x035>;
+defm DS_READ_B32 : DS_Real_gfx6_gfx7_gfx10<0x036>;
+defm DS_READ2_B32 : DS_Real_gfx6_gfx7_gfx10<0x037>;
+defm DS_READ2ST64_B32 : DS_Real_gfx6_gfx7_gfx10<0x038>;
+defm DS_READ_I8 : DS_Real_gfx6_gfx7_gfx10<0x039>;
+defm DS_READ_U8 : DS_Real_gfx6_gfx7_gfx10<0x03a>;
+defm DS_READ_I16 : DS_Real_gfx6_gfx7_gfx10<0x03b>;
+defm DS_READ_U16 : DS_Real_gfx6_gfx7_gfx10<0x03c>;
+defm DS_CONSUME : DS_Real_gfx6_gfx7_gfx10<0x03d>;
+defm DS_APPEND : DS_Real_gfx6_gfx7_gfx10<0x03e>;
+defm DS_ORDERED_COUNT : DS_Real_gfx6_gfx7_gfx10<0x03f>;
+defm DS_ADD_U64 : DS_Real_gfx6_gfx7_gfx10<0x040>;
+defm DS_SUB_U64 : DS_Real_gfx6_gfx7_gfx10<0x041>;
+defm DS_RSUB_U64 : DS_Real_gfx6_gfx7_gfx10<0x042>;
+defm DS_INC_U64 : DS_Real_gfx6_gfx7_gfx10<0x043>;
+defm DS_DEC_U64 : DS_Real_gfx6_gfx7_gfx10<0x044>;
+defm DS_MIN_I64 : DS_Real_gfx6_gfx7_gfx10<0x045>;
+defm DS_MAX_I64 : DS_Real_gfx6_gfx7_gfx10<0x046>;
+defm DS_MIN_U64 : DS_Real_gfx6_gfx7_gfx10<0x047>;
+defm DS_MAX_U64 : DS_Real_gfx6_gfx7_gfx10<0x048>;
+defm DS_AND_B64 : DS_Real_gfx6_gfx7_gfx10<0x049>;
+defm DS_OR_B64 : DS_Real_gfx6_gfx7_gfx10<0x04a>;
+defm DS_XOR_B64 : DS_Real_gfx6_gfx7_gfx10<0x04b>;
+defm DS_MSKOR_B64 : DS_Real_gfx6_gfx7_gfx10<0x04c>;
+defm DS_WRITE_B64 : DS_Real_gfx6_gfx7_gfx10<0x04d>;
+defm DS_WRITE2_B64 : DS_Real_gfx6_gfx7_gfx10<0x04e>;
+defm DS_WRITE2ST64_B64 : DS_Real_gfx6_gfx7_gfx10<0x04f>;
+defm DS_CMPST_B64 : DS_Real_gfx6_gfx7_gfx10<0x050>;
+defm DS_CMPST_F64 : DS_Real_gfx6_gfx7_gfx10<0x051>;
+defm DS_MIN_F64 : DS_Real_gfx6_gfx7_gfx10<0x052>;
+defm DS_MAX_F64 : DS_Real_gfx6_gfx7_gfx10<0x053>;
+defm DS_ADD_RTN_U64 : DS_Real_gfx6_gfx7_gfx10<0x060>;
+defm DS_SUB_RTN_U64 : DS_Real_gfx6_gfx7_gfx10<0x061>;
+defm DS_RSUB_RTN_U64 : DS_Real_gfx6_gfx7_gfx10<0x062>;
+defm DS_INC_RTN_U64 : DS_Real_gfx6_gfx7_gfx10<0x063>;
+defm DS_DEC_RTN_U64 : DS_Real_gfx6_gfx7_gfx10<0x064>;
+defm DS_MIN_RTN_I64 : DS_Real_gfx6_gfx7_gfx10<0x065>;
+defm DS_MAX_RTN_I64 : DS_Real_gfx6_gfx7_gfx10<0x066>;
+defm DS_MIN_RTN_U64 : DS_Real_gfx6_gfx7_gfx10<0x067>;
+defm DS_MAX_RTN_U64 : DS_Real_gfx6_gfx7_gfx10<0x068>;
+defm DS_AND_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x069>;
+defm DS_OR_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x06a>;
+defm DS_XOR_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x06b>;
+defm DS_MSKOR_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x06c>;
+defm DS_WRXCHG_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x06d>;
+defm DS_WRXCHG2_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x06e>;
+defm DS_WRXCHG2ST64_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x06f>;
+defm DS_CMPST_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x070>;
+defm DS_CMPST_RTN_F64 : DS_Real_gfx6_gfx7_gfx10<0x071>;
+defm DS_MIN_RTN_F64 : DS_Real_gfx6_gfx7_gfx10<0x072>;
+defm DS_MAX_RTN_F64 : DS_Real_gfx6_gfx7_gfx10<0x073>;
+defm DS_READ_B64 : DS_Real_gfx6_gfx7_gfx10<0x076>;
+defm DS_READ2_B64 : DS_Real_gfx6_gfx7_gfx10<0x077>;
+defm DS_READ2ST64_B64 : DS_Real_gfx6_gfx7_gfx10<0x078>;
+defm DS_ADD_SRC2_U32 : DS_Real_gfx6_gfx7_gfx10<0x080>;
+defm DS_SUB_SRC2_U32 : DS_Real_gfx6_gfx7_gfx10<0x081>;
+defm DS_RSUB_SRC2_U32 : DS_Real_gfx6_gfx7_gfx10<0x082>;
+defm DS_INC_SRC2_U32 : DS_Real_gfx6_gfx7_gfx10<0x083>;
+defm DS_DEC_SRC2_U32 : DS_Real_gfx6_gfx7_gfx10<0x084>;
+defm DS_MIN_SRC2_I32 : DS_Real_gfx6_gfx7_gfx10<0x085>;
+defm DS_MAX_SRC2_I32 : DS_Real_gfx6_gfx7_gfx10<0x086>;
+defm DS_MIN_SRC2_U32 : DS_Real_gfx6_gfx7_gfx10<0x087>;
+defm DS_MAX_SRC2_U32 : DS_Real_gfx6_gfx7_gfx10<0x088>;
+defm DS_AND_SRC2_B32 : DS_Real_gfx6_gfx7_gfx10<0x089>;
+defm DS_OR_SRC2_B32 : DS_Real_gfx6_gfx7_gfx10<0x08a>;
+defm DS_XOR_SRC2_B32 : DS_Real_gfx6_gfx7_gfx10<0x08b>;
+defm DS_WRITE_SRC2_B32 : DS_Real_gfx6_gfx7_gfx10<0x08d>;
+defm DS_MIN_SRC2_F32 : DS_Real_gfx6_gfx7_gfx10<0x092>;
+defm DS_MAX_SRC2_F32 : DS_Real_gfx6_gfx7_gfx10<0x093>;
+defm DS_ADD_SRC2_U64 : DS_Real_gfx6_gfx7_gfx10<0x0c0>;
+defm DS_SUB_SRC2_U64 : DS_Real_gfx6_gfx7_gfx10<0x0c1>;
+defm DS_RSUB_SRC2_U64 : DS_Real_gfx6_gfx7_gfx10<0x0c2>;
+defm DS_INC_SRC2_U64 : DS_Real_gfx6_gfx7_gfx10<0x0c3>;
+defm DS_DEC_SRC2_U64 : DS_Real_gfx6_gfx7_gfx10<0x0c4>;
+defm DS_MIN_SRC2_I64 : DS_Real_gfx6_gfx7_gfx10<0x0c5>;
+defm DS_MAX_SRC2_I64 : DS_Real_gfx6_gfx7_gfx10<0x0c6>;
+defm DS_MIN_SRC2_U64 : DS_Real_gfx6_gfx7_gfx10<0x0c7>;
+defm DS_MAX_SRC2_U64 : DS_Real_gfx6_gfx7_gfx10<0x0c8>;
+defm DS_AND_SRC2_B64 : DS_Real_gfx6_gfx7_gfx10<0x0c9>;
+defm DS_OR_SRC2_B64 : DS_Real_gfx6_gfx7_gfx10<0x0ca>;
+defm DS_XOR_SRC2_B64 : DS_Real_gfx6_gfx7_gfx10<0x0cb>;
+defm DS_WRITE_SRC2_B64 : DS_Real_gfx6_gfx7_gfx10<0x0cd>;
+defm DS_MIN_SRC2_F64 : DS_Real_gfx6_gfx7_gfx10<0x0d2>;
+defm DS_MAX_SRC2_F64 : DS_Real_gfx6_gfx7_gfx10<0x0d3>;
//===----------------------------------------------------------------------===//
-// VIInstructions.td
+// GFX8, GFX9 (VI).
//===----------------------------------------------------------------------===//
class DS_Real_vi <bits<8> op, DS_Pseudo ds> :
DS_Real <ds>,
SIMCInstr <ds.Mnemonic, SIEncodingFamily.VI> {
- let AssemblerPredicates = [isVI];
- let DecoderNamespace="VI";
+ let AssemblerPredicates = [isGFX8GFX9];
+ let DecoderNamespace = "GFX8";
// encoding
let Inst{7-0} = !if(ds.has_offset0, offset0, 0);
@@ -1008,7 +1054,7 @@ class DS_Real_vi <bits<8> op, DS_Pseudo ds> :
let Inst{16} = !if(ds.has_gds, gds, ds.gdsValue);
let Inst{24-17} = op;
let Inst{31-26} = 0x36; // ds prefix
- let Inst{39-32} = !if(ds.has_addr, addr, 0);
+ let Inst{39-32} = !if(ds.has_addr, addr, !if(ds.has_gws_data0, data0, 0));
let Inst{47-40} = !if(ds.has_data0, data0, 0);
let Inst{55-48} = !if(ds.has_data1, data1, 0);
let Inst{63-56} = !if(ds.has_vdst, vdst, 0);
diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index f3de903f21b2..4ec4be9bc485 100644
--- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -1,9 +1,8 @@
//===- AMDGPUDisassembler.cpp - Disassembler for AMDGPU ISA ---------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -22,13 +21,14 @@
#include "AMDGPURegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIDefines.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "TargetInfo/AMDGPUTargetInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm-c/Disassembler.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDisassembler/MCDisassembler.h"
#include "llvm/MC/MCExpr.h"
@@ -52,8 +52,22 @@ using namespace llvm;
#define DEBUG_TYPE "amdgpu-disassembler"
+#define SGPR_MAX (isGFX10() ? AMDGPU::EncValues::SGPR_MAX_GFX10 \
+ : AMDGPU::EncValues::SGPR_MAX_SI)
+
using DecodeStatus = llvm::MCDisassembler::DecodeStatus;
+AMDGPUDisassembler::AMDGPUDisassembler(const MCSubtargetInfo &STI,
+ MCContext &Ctx,
+ MCInstrInfo const *MCII) :
+ MCDisassembler(STI, Ctx), MCII(MCII), MRI(*Ctx.getRegisterInfo()),
+ TargetMaxInstBytes(Ctx.getAsmInfo()->getMaxInstLength(&STI)) {
+
+ // ToDo: AMDGPUDisassembler supports only VI ISA.
+ if (!STI.getFeatureBits()[AMDGPU::FeatureGCN3Encoding] && !isGFX10())
+ report_fatal_error("Disassembly not yet supported for subtarget");
+}
+
inline static MCDisassembler::DecodeStatus
addOperand(MCInst &Inst, const MCOperand& Opnd) {
Inst.addOperand(Opnd);
@@ -77,6 +91,8 @@ static DecodeStatus decodeSoppBrTarget(MCInst &Inst, unsigned Imm,
uint64_t Addr, const void *Decoder) {
auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
+ // Our branches take a simm16, but we need two extra bits to account for the
+ // factor of 4.
APInt SignedOffset(18, Imm * 4, true);
int64_t Offset = (SignedOffset.sext(64) + 4 + Addr).getSExtValue();
@@ -85,6 +101,12 @@ static DecodeStatus decodeSoppBrTarget(MCInst &Inst, unsigned Imm,
return addOperand(Inst, MCOperand::createImm(Imm));
}
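The comment added to decodeSoppBrTarget spells out the branch arithmetic: the field is a signed 16-bit word offset, so it is scaled by 4 into an 18-bit signed byte offset and taken relative to the next instruction (the branch address plus 4). A minimal sketch of the same computation with plain integers (the helper name is ours):

#include <cstdint>

// Sketch: resolve a SOPP branch target. Imm is the raw simm16 field taken
// from the encoding; Addr is the address of the branch instruction itself.
uint64_t soppBranchTarget(uint16_t Imm, uint64_t Addr) {
  int64_t WordOff = int16_t(Imm); // sign-extend the 16-bit word offset
  int64_t ByteOff = WordOff * 4;  // dword granularity, hence the 2 extra bits
  return Addr + 4 + ByteOff;      // relative to the next instruction
}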
+static DecodeStatus decodeBoolReg(MCInst &Inst, unsigned Val,
+ uint64_t Addr, const void *Decoder) {
+ auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
+ return addOperand(Inst, DAsm->decodeBoolReg(Val));
+}
+
#define DECODE_OPERAND(StaticDecoderName, DecoderName) \
static DecodeStatus StaticDecoderName(MCInst &Inst, \
unsigned Imm, \
@@ -98,6 +120,7 @@ static DecodeStatus StaticDecoderName(MCInst &Inst, \
DECODE_OPERAND(Decode##RegClass##RegisterClass, decodeOperand_##RegClass)
DECODE_OPERAND_REG(VGPR_32)
+DECODE_OPERAND_REG(VRegOrLds_32)
DECODE_OPERAND_REG(VS_32)
DECODE_OPERAND_REG(VS_64)
DECODE_OPERAND_REG(VS_128)
@@ -109,12 +132,20 @@ DECODE_OPERAND_REG(VReg_128)
DECODE_OPERAND_REG(SReg_32)
DECODE_OPERAND_REG(SReg_32_XM0_XEXEC)
DECODE_OPERAND_REG(SReg_32_XEXEC_HI)
+DECODE_OPERAND_REG(SRegOrLds_32)
DECODE_OPERAND_REG(SReg_64)
DECODE_OPERAND_REG(SReg_64_XEXEC)
DECODE_OPERAND_REG(SReg_128)
DECODE_OPERAND_REG(SReg_256)
DECODE_OPERAND_REG(SReg_512)
+DECODE_OPERAND_REG(AGPR_32)
+DECODE_OPERAND_REG(AReg_128)
+DECODE_OPERAND_REG(AReg_512)
+DECODE_OPERAND_REG(AReg_1024)
+DECODE_OPERAND_REG(AV_32)
+DECODE_OPERAND_REG(AV_64)
+
static DecodeStatus decodeOperand_VSrc16(MCInst &Inst,
unsigned Imm,
uint64_t Addr,
@@ -131,6 +162,62 @@ static DecodeStatus decodeOperand_VSrcV216(MCInst &Inst,
return addOperand(Inst, DAsm->decodeOperand_VSrcV216(Imm));
}
+static DecodeStatus decodeOperand_VS_16(MCInst &Inst,
+ unsigned Imm,
+ uint64_t Addr,
+ const void *Decoder) {
+ auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
+ return addOperand(Inst, DAsm->decodeOperand_VSrc16(Imm));
+}
+
+static DecodeStatus decodeOperand_VS_32(MCInst &Inst,
+ unsigned Imm,
+ uint64_t Addr,
+ const void *Decoder) {
+ auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
+ return addOperand(Inst, DAsm->decodeOperand_VS_32(Imm));
+}
+
+static DecodeStatus decodeOperand_AReg_128(MCInst &Inst,
+ unsigned Imm,
+ uint64_t Addr,
+ const void *Decoder) {
+ auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
+ return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW128, Imm | 512));
+}
+
+static DecodeStatus decodeOperand_AReg_512(MCInst &Inst,
+ unsigned Imm,
+ uint64_t Addr,
+ const void *Decoder) {
+ auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
+ return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW512, Imm | 512));
+}
+
+static DecodeStatus decodeOperand_AReg_1024(MCInst &Inst,
+ unsigned Imm,
+ uint64_t Addr,
+ const void *Decoder) {
+ auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
+ return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW1024, Imm | 512));
+}
+
+static DecodeStatus decodeOperand_SReg_32(MCInst &Inst,
+ unsigned Imm,
+ uint64_t Addr,
+ const void *Decoder) {
+ auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
+ return addOperand(Inst, DAsm->decodeOperand_SReg_32(Imm));
+}
+
+static DecodeStatus decodeOperand_VGPR_32(MCInst &Inst,
+ unsigned Imm,
+ uint64_t Addr,
+ const void *Decoder) {
+ auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
+ return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW32, Imm));
+}
+
#define DECODE_SDWA(DecName) \
DECODE_OPERAND(decodeSDWA##DecName, decodeSDWA##DecName)
@@ -168,6 +255,16 @@ DecodeStatus AMDGPUDisassembler::tryDecodeInst(const uint8_t* Table,
return MCDisassembler::Fail;
}
+static bool isValidDPP8(const MCInst &MI) {
+ using namespace llvm::AMDGPU::DPP;
+ int FiIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::fi);
+ assert(FiIdx != -1);
+ if ((unsigned)FiIdx >= MI.getNumOperands())
+ return false;
+ unsigned Fi = MI.getOperand(FiIdx).getImm();
+ return Fi == DPP8_FI_0 || Fi == DPP8_FI_1;
+}
+
DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
ArrayRef<uint8_t> Bytes_,
uint64_t Address,
@@ -176,11 +273,7 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
CommentStream = &CS;
bool IsSDWA = false;
- // ToDo: AMDGPUDisassembler supports only VI ISA.
- if (!STI.getFeatureBits()[AMDGPU::FeatureGCN3Encoding])
- report_fatal_error("Disassembly not yet supported for subtarget");
-
- const unsigned MaxInstBytesNum = (std::min)((size_t)8, Bytes_.size());
+ unsigned MaxInstBytesNum = std::min((size_t)TargetMaxInstBytes, Bytes_.size());
Bytes = Bytes_.slice(0, MaxInstBytesNum);
DecodeStatus Res = MCDisassembler::Fail;
@@ -192,6 +285,13 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
// encodings
if (Bytes.size() >= 8) {
const uint64_t QW = eatBytes<uint64_t>(Bytes);
+
+ Res = tryDecodeInst(DecoderTableDPP864, MI, QW, Address);
+ if (Res && convertDPP8Inst(MI) == MCDisassembler::Success)
+ break;
+
+ MI = MCInst(); // clear
+
Res = tryDecodeInst(DecoderTableDPP64, MI, QW, Address);
if (Res) break;
@@ -201,6 +301,18 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
Res = tryDecodeInst(DecoderTableSDWA964, MI, QW, Address);
if (Res) { IsSDWA = true; break; }
+ Res = tryDecodeInst(DecoderTableSDWA1064, MI, QW, Address);
+ if (Res) { IsSDWA = true; break; }
+
+ // Some GFX9 subtargets repurposed the v_mad_mix_f32, v_mad_mixlo_f16 and
+ // v_mad_mixhi_f16 for FMA variants. Try to decode using this special
+ // table first so we print the correct name.
+
+ if (STI.getFeatureBits()[AMDGPU::FeatureFmaMixInsts]) {
+ Res = tryDecodeInst(DecoderTableGFX9_DL64, MI, QW, Address);
+ if (Res) break;
+ }
+
if (STI.getFeatureBits()[AMDGPU::FeatureUnpackedD16VMem]) {
Res = tryDecodeInst(DecoderTableGFX80_UNPACKED64, MI, QW, Address);
if (Res)
@@ -223,7 +335,7 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
// Try decode 32-bit instruction
if (Bytes.size() < 4) break;
const uint32_t DW = eatBytes<uint32_t>(Bytes);
- Res = tryDecodeInst(DecoderTableVI32, MI, DW, Address);
+ Res = tryDecodeInst(DecoderTableGFX832, MI, DW, Address);
if (Res) break;
Res = tryDecodeInst(DecoderTableAMDGPU32, MI, DW, Address);
@@ -232,33 +344,84 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
Res = tryDecodeInst(DecoderTableGFX932, MI, DW, Address);
if (Res) break;
+ Res = tryDecodeInst(DecoderTableGFX1032, MI, DW, Address);
+ if (Res) break;
+
if (Bytes.size() < 4) break;
const uint64_t QW = ((uint64_t)eatBytes<uint32_t>(Bytes) << 32) | DW;
- Res = tryDecodeInst(DecoderTableVI64, MI, QW, Address);
+ Res = tryDecodeInst(DecoderTableGFX864, MI, QW, Address);
if (Res) break;
Res = tryDecodeInst(DecoderTableAMDGPU64, MI, QW, Address);
if (Res) break;
Res = tryDecodeInst(DecoderTableGFX964, MI, QW, Address);
+ if (Res) break;
+
+ Res = tryDecodeInst(DecoderTableGFX1064, MI, QW, Address);
} while (false);
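The decode loop above is ordered: each candidate decoder table is tried in turn and the first match wins, which is why the special table for the repurposed v_mad_mix/FMA-mix opcodes has to be consulted before the generic GFX9 table so the correct mnemonic is printed. A generic sketch of that try-in-priority-order pattern (the types and names are stand-ins, not the real tryDecodeInst API):

#include <cstdint>
#include <functional>
#include <vector>

// Sketch of the try-in-order pattern: 'Decoder' stands in for tryDecodeInst
// bound to one decoder table; earlier entries take precedence.
using Decoder = std::function<bool(uint64_t /*Bits*/)>;

bool decodeFirstMatch(const std::vector<Decoder> &Tables, uint64_t Bits) {
  for (const auto &Try : Tables)
    if (Try(Bits))
      return true; // first table that accepts the bits wins
  return false;
}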
+ if (Res && (MaxInstBytesNum - Bytes.size()) == 12 && (!HasLiteral ||
+ !(MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3))) {
+ MaxInstBytesNum = 8;
+ Bytes = Bytes_.slice(0, MaxInstBytesNum);
+ eatBytes<uint64_t>(Bytes);
+ }
+
if (Res && (MI.getOpcode() == AMDGPU::V_MAC_F32_e64_vi ||
- MI.getOpcode() == AMDGPU::V_MAC_F32_e64_si ||
+ MI.getOpcode() == AMDGPU::V_MAC_F32_e64_gfx6_gfx7 ||
+ MI.getOpcode() == AMDGPU::V_MAC_F32_e64_gfx10 ||
MI.getOpcode() == AMDGPU::V_MAC_F16_e64_vi ||
- MI.getOpcode() == AMDGPU::V_FMAC_F32_e64_vi)) {
+ MI.getOpcode() == AMDGPU::V_FMAC_F32_e64_vi ||
+ MI.getOpcode() == AMDGPU::V_FMAC_F32_e64_gfx10 ||
+ MI.getOpcode() == AMDGPU::V_FMAC_F16_e64_gfx10)) {
// Insert dummy unused src2_modifiers.
insertNamedMCOperand(MI, MCOperand::createImm(0),
AMDGPU::OpName::src2_modifiers);
}
if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::MIMG)) {
- Res = convertMIMGInst(MI);
+ int VAddr0Idx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
+ int RsrcIdx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
+ unsigned NSAArgs = RsrcIdx - VAddr0Idx - 1;
+ if (VAddr0Idx >= 0 && NSAArgs > 0) {
+ unsigned NSAWords = (NSAArgs + 3) / 4;
+ if (Bytes.size() < 4 * NSAWords) {
+ Res = MCDisassembler::Fail;
+ } else {
+ for (unsigned i = 0; i < NSAArgs; ++i) {
+ MI.insert(MI.begin() + VAddr0Idx + 1 + i,
+ decodeOperand_VGPR_32(Bytes[i]));
+ }
+ Bytes = Bytes.slice(4 * NSAWords);
+ }
+ }
+
+ if (Res)
+ Res = convertMIMGInst(MI);
}
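For gfx10 NSA (non-sequential address) image instructions, the extra address registers are encoded one byte per VGPR in trailing dwords after the base encoding, and the byte count is rounded up to whole dwords before they are consumed, as in the block above. A small standalone sketch of that rounding and byte layout (names are illustrative):

#include <cstdint>
#include <vector>

// Sketch: how many trailing dwords the extra NSA address registers occupy,
// and which byte of that trailer names each VGPR.
unsigned nsaTrailingDwords(unsigned NSAArgs) {
  return (NSAArgs + 3) / 4; // one byte per register, rounded up to dwords
}

std::vector<uint8_t> nsaVgprNumbers(const uint8_t *Trailer, unsigned NSAArgs) {
  // Byte i holds the VGPR number of extra address operand i; padding bytes
  // in the last dword are ignored.
  return std::vector<uint8_t>(Trailer, Trailer + NSAArgs);
}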
if (Res && IsSDWA)
Res = convertSDWAInst(MI);
+ int VDstIn_Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+ AMDGPU::OpName::vdst_in);
+ if (VDstIn_Idx != -1) {
+ int Tied = MCII->get(MI.getOpcode()).getOperandConstraint(VDstIn_Idx,
+ MCOI::OperandConstraint::TIED_TO);
+ if (Tied != -1 && (MI.getNumOperands() <= (unsigned)VDstIn_Idx ||
+ !MI.getOperand(VDstIn_Idx).isReg() ||
+ MI.getOperand(VDstIn_Idx).getReg() != MI.getOperand(Tied).getReg())) {
+ if (MI.getNumOperands() > (unsigned)VDstIn_Idx)
+ MI.erase(&MI.getOperand(VDstIn_Idx));
+ insertNamedMCOperand(MI,
+ MCOperand::createReg(MI.getOperand(Tied).getReg()),
+ AMDGPU::OpName::vdst_in);
+ }
+ }
+
// if the opcode was not recognized we'll assume a Size of 4 bytes
// (unless there are fewer bytes left)
Size = Res ? (MaxInstBytesNum - Bytes.size())
@@ -267,7 +430,8 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
}
DecodeStatus AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const {
- if (STI.getFeatureBits()[AMDGPU::FeatureGFX9]) {
+ if (STI.getFeatureBits()[AMDGPU::FeatureGFX9] ||
+ STI.getFeatureBits()[AMDGPU::FeatureGFX10]) {
if (AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sdst) != -1)
// VOPC - insert clamp
insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::clamp);
@@ -285,9 +449,27 @@ DecodeStatus AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const {
return MCDisassembler::Success;
}
-// Note that MIMG format provides no information about VADDR size.
-// Consequently, decoded instructions always show address
-// as if it has 1 dword, which could be not really so.
+DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const {
+ unsigned Opc = MI.getOpcode();
+ unsigned DescNumOps = MCII->get(Opc).getNumOperands();
+
+ // Insert dummy unused src modifiers.
+ if (MI.getNumOperands() < DescNumOps &&
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers) != -1)
+ insertNamedMCOperand(MI, MCOperand::createImm(0),
+ AMDGPU::OpName::src0_modifiers);
+
+ if (MI.getNumOperands() < DescNumOps &&
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers) != -1)
+ insertNamedMCOperand(MI, MCOperand::createImm(0),
+ AMDGPU::OpName::src1_modifiers);
+
+ return isValidDPP8(MI) ? MCDisassembler::Success : MCDisassembler::SoftFail;
+}
+
+// Note that before gfx10, the MIMG encoding provided no information about
+// VADDR size. Consequently, decoded instructions always show the address as
+// if it were a single dword, which may not actually be the case.
DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
int VDstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
@@ -295,7 +477,8 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
int VDataIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
AMDGPU::OpName::vdata);
-
+ int VAddr0Idx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
int DMaskIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
AMDGPU::OpName::dmask);
@@ -308,16 +491,42 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
assert(DMaskIdx != -1);
assert(TFEIdx != -1);
+ const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
bool IsAtomic = (VDstIdx != -1);
bool IsGather4 = MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::Gather4;
- unsigned DMask = MI.getOperand(DMaskIdx).getImm() & 0xf;
- if (DMask == 0)
- return MCDisassembler::Success;
+ bool IsNSA = false;
+ unsigned AddrSize = Info->VAddrDwords;
+
+ if (STI.getFeatureBits()[AMDGPU::FeatureGFX10]) {
+ unsigned DimIdx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dim);
+ const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
+ AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
+ const AMDGPU::MIMGDimInfo *Dim =
+ AMDGPU::getMIMGDimInfoByEncoding(MI.getOperand(DimIdx).getImm());
+
+ AddrSize = BaseOpcode->NumExtraArgs +
+ (BaseOpcode->Gradients ? Dim->NumGradients : 0) +
+ (BaseOpcode->Coordinates ? Dim->NumCoords : 0) +
+ (BaseOpcode->LodOrClampOrMip ? 1 : 0);
+ IsNSA = Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA;
+ if (!IsNSA) {
+ if (AddrSize > 8)
+ AddrSize = 16;
+ else if (AddrSize > 4)
+ AddrSize = 8;
+ } else {
+ if (AddrSize > Info->VAddrDwords) {
+ // The NSA encoding does not contain enough operands for the combination
+ // of base opcode / dimension. Should this be an error?
+ return MCDisassembler::Success;
+ }
+ }
+ }
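On gfx10 the expected address operand count is computed from the base opcode and dimension (extra arguments, gradients, coordinates, optional LOD/clamp/mip) and, for the contiguous non-NSA encoding, rounded up to the next available register-tuple size. A hedged sketch of that rounding rule as it appears above:

// Sketch: round a computed MIMG address dword count up to the register-tuple
// sizes available to the contiguous (non-NSA) gfx10 encoding.
unsigned roundMIMGAddrDwords(unsigned AddrSize) {
  if (AddrSize > 8)
    return 16;      // widest address tuple
  if (AddrSize > 4)
    return 8;
  return AddrSize;  // 1..4 dwords are representable directly
}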
- unsigned DstSize = IsGather4 ? 4 : countPopulation(DMask);
- if (DstSize == 1)
- return MCDisassembler::Success;
+ unsigned DMask = MI.getOperand(DMaskIdx).getImm() & 0xf;
+ unsigned DstSize = IsGather4 ? 4 : std::max(countPopulation(DMask), 1u);
bool D16 = D16Idx >= 0 && MI.getOperand(D16Idx).getImm();
if (D16 && AMDGPU::hasPackedD16(STI)) {
@@ -328,44 +537,64 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
if (MI.getOperand(TFEIdx).getImm())
return MCDisassembler::Success;
- int NewOpcode = -1;
+ if (DstSize == Info->VDataDwords && AddrSize == Info->VAddrDwords)
+ return MCDisassembler::Success;
+
+ int NewOpcode =
+ AMDGPU::getMIMGOpcode(Info->BaseOpcode, Info->MIMGEncoding, DstSize, AddrSize);
+ if (NewOpcode == -1)
+ return MCDisassembler::Success;
- if (IsGather4) {
- if (D16 && AMDGPU::hasPackedD16(STI))
- NewOpcode = AMDGPU::getMaskedMIMGOp(MI.getOpcode(), 2);
- else
+ // Widen the register to the correct number of enabled channels.
+ unsigned NewVdata = AMDGPU::NoRegister;
+ if (DstSize != Info->VDataDwords) {
+ auto DataRCID = MCII->get(NewOpcode).OpInfo[VDataIdx].RegClass;
+
+ // Get first subregister of VData
+ unsigned Vdata0 = MI.getOperand(VDataIdx).getReg();
+ unsigned VdataSub0 = MRI.getSubReg(Vdata0, AMDGPU::sub0);
+ Vdata0 = (VdataSub0 != 0)? VdataSub0 : Vdata0;
+
+ NewVdata = MRI.getMatchingSuperReg(Vdata0, AMDGPU::sub0,
+ &MRI.getRegClass(DataRCID));
+ if (NewVdata == AMDGPU::NoRegister) {
+ // It's possible to encode this such that the low register + enabled
+ // components exceeds the register count.
return MCDisassembler::Success;
- } else {
- NewOpcode = AMDGPU::getMaskedMIMGOp(MI.getOpcode(), DstSize);
- if (NewOpcode == -1)
+ }
+ }
+
+ unsigned NewVAddr0 = AMDGPU::NoRegister;
+ if (STI.getFeatureBits()[AMDGPU::FeatureGFX10] && !IsNSA &&
+ AddrSize != Info->VAddrDwords) {
+ unsigned VAddr0 = MI.getOperand(VAddr0Idx).getReg();
+ unsigned VAddrSub0 = MRI.getSubReg(VAddr0, AMDGPU::sub0);
+ VAddr0 = (VAddrSub0 != 0) ? VAddrSub0 : VAddr0;
+
+ auto AddrRCID = MCII->get(NewOpcode).OpInfo[VAddr0Idx].RegClass;
+ NewVAddr0 = MRI.getMatchingSuperReg(VAddr0, AMDGPU::sub0,
+ &MRI.getRegClass(AddrRCID));
+ if (NewVAddr0 == AMDGPU::NoRegister)
return MCDisassembler::Success;
}
- auto RCID = MCII->get(NewOpcode).OpInfo[VDataIdx].RegClass;
+ MI.setOpcode(NewOpcode);
- // Get first subregister of VData
- unsigned Vdata0 = MI.getOperand(VDataIdx).getReg();
- unsigned VdataSub0 = MRI.getSubReg(Vdata0, AMDGPU::sub0);
- Vdata0 = (VdataSub0 != 0)? VdataSub0 : Vdata0;
+ if (NewVdata != AMDGPU::NoRegister) {
+ MI.getOperand(VDataIdx) = MCOperand::createReg(NewVdata);
- // Widen the register to the correct number of enabled channels.
- auto NewVdata = MRI.getMatchingSuperReg(Vdata0, AMDGPU::sub0,
- &MRI.getRegClass(RCID));
- if (NewVdata == AMDGPU::NoRegister) {
- // It's possible to encode this such that the low register + enabled
- // components exceeds the register count.
- return MCDisassembler::Success;
+ if (IsAtomic) {
+ // Atomic operations have an additional operand (a copy of data)
+ MI.getOperand(VDstIdx) = MCOperand::createReg(NewVdata);
+ }
}
- MI.setOpcode(NewOpcode);
- // vaddr will be always appear as a single VGPR. This will look different than
- // how it is usually emitted because the number of register components is not
- // in the instruction encoding.
- MI.getOperand(VDataIdx) = MCOperand::createReg(NewVdata);
-
- if (IsAtomic) {
- // Atomic operations have an additional operand (a copy of data)
- MI.getOperand(VDstIdx) = MCOperand::createReg(NewVdata);
+ if (NewVAddr0 != AMDGPU::NoRegister) {
+ MI.getOperand(VAddr0Idx) = MCOperand::createReg(NewVAddr0);
+ } else if (IsNSA) {
+ assert(AddrSize <= Info->VAddrDwords);
+ MI.erase(MI.begin() + VAddr0Idx + AddrSize,
+ MI.begin() + VAddr0Idx + Info->VAddrDwords);
}
return MCDisassembler::Success;
@@ -470,6 +699,34 @@ MCOperand AMDGPUDisassembler::decodeOperand_VGPR_32(unsigned Val) const {
return createRegOperand(AMDGPU::VGPR_32RegClassID, Val);
}
+MCOperand AMDGPUDisassembler::decodeOperand_VRegOrLds_32(unsigned Val) const {
+ return decodeSrcOp(OPW32, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_AGPR_32(unsigned Val) const {
+ return createRegOperand(AMDGPU::AGPR_32RegClassID, Val & 255);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_AReg_128(unsigned Val) const {
+ return createRegOperand(AMDGPU::AReg_128RegClassID, Val & 255);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_AReg_512(unsigned Val) const {
+ return createRegOperand(AMDGPU::AReg_512RegClassID, Val & 255);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_AReg_1024(unsigned Val) const {
+ return createRegOperand(AMDGPU::AReg_1024RegClassID, Val & 255);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_AV_32(unsigned Val) const {
+ return decodeSrcOp(OPW32, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_AV_64(unsigned Val) const {
+ return decodeSrcOp(OPW64, Val);
+}
+
MCOperand AMDGPUDisassembler::decodeOperand_VReg_64(unsigned Val) const {
return createRegOperand(AMDGPU::VReg_64RegClassID, Val);
}
@@ -482,6 +739,14 @@ MCOperand AMDGPUDisassembler::decodeOperand_VReg_128(unsigned Val) const {
return createRegOperand(AMDGPU::VReg_128RegClassID, Val);
}
+MCOperand AMDGPUDisassembler::decodeOperand_VReg_256(unsigned Val) const {
+ return createRegOperand(AMDGPU::VReg_256RegClassID, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_VReg_512(unsigned Val) const {
+ return createRegOperand(AMDGPU::VReg_512RegClassID, Val);
+}
+
MCOperand AMDGPUDisassembler::decodeOperand_SReg_32(unsigned Val) const {
// table-gen generated disassembler doesn't care about operand types
// leaving only the register class so SSrc_32 operand turns into SReg_32
@@ -501,6 +766,13 @@ MCOperand AMDGPUDisassembler::decodeOperand_SReg_32_XEXEC_HI(
return decodeOperand_SReg_32(Val);
}
+MCOperand AMDGPUDisassembler::decodeOperand_SRegOrLds_32(unsigned Val) const {
+ // table-gen generated disassembler doesn't care about operand types
+ // leaving only the register class so SSrc_32 operand turns into SReg_32
+ // and therefore we accept immediates and literals here as well
+ return decodeSrcOp(OPW32, Val);
+}
+
MCOperand AMDGPUDisassembler::decodeOperand_SReg_64(unsigned Val) const {
return decodeSrcOp(OPW64, Val);
}
@@ -628,6 +900,9 @@ MCOperand AMDGPUDisassembler::decodeFPImmed(OpWidthTy Width, unsigned Imm) {
// ToDo: case 248: 1/(2*PI) - is allowed only on VI
switch (Width) {
case OPW32:
+ case OPW128: // splat constants
+ case OPW512:
+ case OPW1024:
return MCOperand::createImm(getInlineImmVal32(Imm));
case OPW64:
return MCOperand::createImm(getInlineImmVal64(Imm));
@@ -654,6 +929,24 @@ unsigned AMDGPUDisassembler::getVgprClassId(const OpWidthTy Width) const {
}
}
+unsigned AMDGPUDisassembler::getAgprClassId(const OpWidthTy Width) const {
+ using namespace AMDGPU;
+
+ assert(OPW_FIRST_ <= Width && Width < OPW_LAST_);
+ switch (Width) {
+ default: // fallthrough
+ case OPW32:
+ case OPW16:
+ case OPWV216:
+ return AGPR_32RegClassID;
+ case OPW64: return AReg_64RegClassID;
+ case OPW128: return AReg_128RegClassID;
+ case OPW512: return AReg_512RegClassID;
+ case OPW1024: return AReg_1024RegClassID;
+ }
+}
+
unsigned AMDGPUDisassembler::getSgprClassId(const OpWidthTy Width) const {
using namespace AMDGPU;
@@ -691,8 +984,10 @@ unsigned AMDGPUDisassembler::getTtmpClassId(const OpWidthTy Width) const {
int AMDGPUDisassembler::getTTmpIdx(unsigned Val) const {
using namespace AMDGPU::EncValues;
- unsigned TTmpMin = isGFX9() ? TTMP_GFX9_MIN : TTMP_VI_MIN;
- unsigned TTmpMax = isGFX9() ? TTMP_GFX9_MAX : TTMP_VI_MAX;
+ unsigned TTmpMin =
+ (isGFX9() || isGFX10()) ? TTMP_GFX9_GFX10_MIN : TTMP_VI_MIN;
+ unsigned TTmpMax =
+ (isGFX9() || isGFX10()) ? TTMP_GFX9_GFX10_MAX : TTMP_VI_MAX;
return (TTmpMin <= Val && Val <= TTmpMax)? Val - TTmpMin : -1;
}
@@ -700,10 +995,14 @@ int AMDGPUDisassembler::getTTmpIdx(unsigned Val) const {
MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val) const {
using namespace AMDGPU::EncValues;
- assert(Val < 512); // enum9
+ assert(Val < 1024); // enum10
+
+ bool IsAGPR = Val & 512;
+ Val &= 511;
if (VGPR_MIN <= Val && Val <= VGPR_MAX) {
- return createRegOperand(getVgprClassId(Width), Val - VGPR_MIN);
+ return createRegOperand(IsAGPR ? getAgprClassId(Width)
+ : getVgprClassId(Width), Val - VGPR_MIN);
}
if (Val <= SGPR_MAX) {
assert(SGPR_MIN == 0); // "SGPR_MIN <= Val" is always true and causes compilation warning.
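The widened source-operand field handled here is 10 bits: bit 9 selects the accumulator (AGPR) register file, and the low 9 bits keep their previous meaning, with the VGPR range mapping to v0..v255 (or a0..a255 when the AGPR bit is set). A rough standalone sketch of that split (the range constants are illustrative, not the real EncValues definitions):

#include <cassert>
#include <utility>

// Sketch: split a 10-bit source-operand value into (isAGPR, register index).
// Assumes the usual layout where the low 9 bits use 256..511 for VGPRs.
std::pair<bool, unsigned> splitSrcOp(unsigned Val) {
  assert(Val < 1024 && "10-bit operand field");
  bool IsAGPR = (Val & 512) != 0; // bit 9: accumulator register file
  Val &= 511;                     // low 9 bits: legacy operand value
  const unsigned VGPRMin = 256, VGPRMax = 511;
  if (Val >= VGPRMin && Val <= VGPRMax)
    return {IsAGPR, Val - VGPRMin}; // v<N>, or a<N> when the AGPR bit is set
  return {false, Val};              // SGPR / special register / immediate
}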
@@ -765,23 +1064,23 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg32(unsigned Val) const {
case 105: return createRegOperand(XNACK_MASK_HI);
case 106: return createRegOperand(VCC_LO);
case 107: return createRegOperand(VCC_HI);
- case 108: assert(!isGFX9()); return createRegOperand(TBA_LO);
- case 109: assert(!isGFX9()); return createRegOperand(TBA_HI);
- case 110: assert(!isGFX9()); return createRegOperand(TMA_LO);
- case 111: assert(!isGFX9()); return createRegOperand(TMA_HI);
+ case 108: return createRegOperand(TBA_LO);
+ case 109: return createRegOperand(TBA_HI);
+ case 110: return createRegOperand(TMA_LO);
+ case 111: return createRegOperand(TMA_HI);
case 124: return createRegOperand(M0);
+ case 125: return createRegOperand(SGPR_NULL);
case 126: return createRegOperand(EXEC_LO);
case 127: return createRegOperand(EXEC_HI);
case 235: return createRegOperand(SRC_SHARED_BASE);
case 236: return createRegOperand(SRC_SHARED_LIMIT);
case 237: return createRegOperand(SRC_PRIVATE_BASE);
case 238: return createRegOperand(SRC_PRIVATE_LIMIT);
- // TODO: SRC_POPS_EXITING_WAVE_ID
- // ToDo: no support for vccz register
- case 251: break;
- // ToDo: no support for execz register
- case 252: break;
- case 253: return createRegOperand(SCC);
+ case 239: return createRegOperand(SRC_POPS_EXITING_WAVE_ID);
+ case 251: return createRegOperand(SRC_VCCZ);
+ case 252: return createRegOperand(SRC_EXECZ);
+ case 253: return createRegOperand(SRC_SCC);
+ case 254: return createRegOperand(LDS_DIRECT);
default: break;
}
return errOperand(Val, "unknown operand encoding " + Twine(Val));
@@ -794,9 +1093,17 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg64(unsigned Val) const {
case 102: return createRegOperand(FLAT_SCR);
case 104: return createRegOperand(XNACK_MASK);
case 106: return createRegOperand(VCC);
- case 108: assert(!isGFX9()); return createRegOperand(TBA);
- case 110: assert(!isGFX9()); return createRegOperand(TMA);
+ case 108: return createRegOperand(TBA);
+ case 110: return createRegOperand(TMA);
case 126: return createRegOperand(EXEC);
+ case 235: return createRegOperand(SRC_SHARED_BASE);
+ case 236: return createRegOperand(SRC_SHARED_LIMIT);
+ case 237: return createRegOperand(SRC_PRIVATE_BASE);
+ case 238: return createRegOperand(SRC_PRIVATE_LIMIT);
+ case 239: return createRegOperand(SRC_POPS_EXITING_WAVE_ID);
+ case 251: return createRegOperand(SRC_VCCZ);
+ case 252: return createRegOperand(SRC_EXECZ);
+ case 253: return createRegOperand(SRC_SCC);
default: break;
}
return errOperand(Val, "unknown operand encoding " + Twine(Val));
@@ -807,16 +1114,18 @@ MCOperand AMDGPUDisassembler::decodeSDWASrc(const OpWidthTy Width,
using namespace AMDGPU::SDWA;
using namespace AMDGPU::EncValues;
- if (STI.getFeatureBits()[AMDGPU::FeatureGFX9]) {
- // XXX: static_cast<int> is needed to avoid stupid warning:
+ if (STI.getFeatureBits()[AMDGPU::FeatureGFX9] ||
+ STI.getFeatureBits()[AMDGPU::FeatureGFX10]) {
+ // XXX: cast to int is needed to avoid stupid warning:
// compare with unsigned is always true
- if (SDWA9EncValues::SRC_VGPR_MIN <= static_cast<int>(Val) &&
+ if (int(SDWA9EncValues::SRC_VGPR_MIN) <= int(Val) &&
Val <= SDWA9EncValues::SRC_VGPR_MAX) {
return createRegOperand(getVgprClassId(Width),
Val - SDWA9EncValues::SRC_VGPR_MIN);
}
if (SDWA9EncValues::SRC_SGPR_MIN <= Val &&
- Val <= SDWA9EncValues::SRC_SGPR_MAX) {
+ Val <= (isGFX10() ? SDWA9EncValues::SRC_SGPR_MAX_GFX10
+ : SDWA9EncValues::SRC_SGPR_MAX_SI)) {
return createSRegOperand(getSgprClassId(Width),
Val - SDWA9EncValues::SRC_SGPR_MIN);
}
@@ -852,24 +1161,34 @@ MCOperand AMDGPUDisassembler::decodeSDWASrc32(unsigned Val) const {
MCOperand AMDGPUDisassembler::decodeSDWAVopcDst(unsigned Val) const {
using namespace AMDGPU::SDWA;
- assert(STI.getFeatureBits()[AMDGPU::FeatureGFX9] &&
- "SDWAVopcDst should be present only on GFX9");
+ assert((STI.getFeatureBits()[AMDGPU::FeatureGFX9] ||
+ STI.getFeatureBits()[AMDGPU::FeatureGFX10]) &&
+ "SDWAVopcDst should be present only on GFX9+");
+
+ bool IsWave64 = STI.getFeatureBits()[AMDGPU::FeatureWavefrontSize64];
+
if (Val & SDWA9EncValues::VOPC_DST_VCC_MASK) {
Val &= SDWA9EncValues::VOPC_DST_SGPR_MASK;
int TTmpIdx = getTTmpIdx(Val);
if (TTmpIdx >= 0) {
return createSRegOperand(getTtmpClassId(OPW64), TTmpIdx);
- } else if (Val > AMDGPU::EncValues::SGPR_MAX) {
- return decodeSpecialReg64(Val);
+ } else if (Val > SGPR_MAX) {
+ return IsWave64 ? decodeSpecialReg64(Val)
+ : decodeSpecialReg32(Val);
} else {
- return createSRegOperand(getSgprClassId(OPW64), Val);
+ return createSRegOperand(getSgprClassId(IsWave64 ? OPW64 : OPW32), Val);
}
} else {
- return createRegOperand(AMDGPU::VCC);
+ return createRegOperand(IsWave64 ? AMDGPU::VCC : AMDGPU::VCC_LO);
}
}
+MCOperand AMDGPUDisassembler::decodeBoolReg(unsigned Val) const {
+ return STI.getFeatureBits()[AMDGPU::FeatureWavefrontSize64] ?
+ decodeOperand_SReg_64(Val) : decodeOperand_SReg_32(Val);
+}
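decodeSDWAVopcDst and decodeBoolReg both hinge on the wavefront size: a lane mask needs a 64-bit SGPR pair on wave64 but only a single 32-bit SGPR on wave32, and the implicit compare destination is VCC versus VCC_LO accordingly. A tiny sketch of that selection, with stand-in names rather than MC register classes:

#include <string>

// Sketch: wave64 lane masks span a 64-bit SGPR pair and the implicit VOPC
// destination is vcc; wave32 uses a single 32-bit SGPR and vcc_lo.
unsigned boolRegDwords(bool IsWave64) {
  return IsWave64 ? 2 : 1; // one bit per lane, packed into SGPRs
}

std::string implicitVopcDst(bool IsWave64) {
  return IsWave64 ? "vcc" : "vcc_lo";
}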
+
bool AMDGPUDisassembler::isVI() const {
return STI.getFeatureBits()[AMDGPU::FeatureVolcanicIslands];
}
@@ -878,6 +1197,10 @@ bool AMDGPUDisassembler::isGFX9() const {
return STI.getFeatureBits()[AMDGPU::FeatureGFX9];
}
+bool AMDGPUDisassembler::isGFX10() const {
+ return STI.getFeatureBits()[AMDGPU::FeatureGFX10];
+}
+
//===----------------------------------------------------------------------===//
// AMDGPUSymbolizer
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index 75cfc5e11282..c5eaba615c2a 100644
--- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -1,9 +1,8 @@
//===- AMDGPUDisassembler.hpp - Disassembler for AMDGPU ISA -----*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -42,15 +41,14 @@ class AMDGPUDisassembler : public MCDisassembler {
private:
std::unique_ptr<MCInstrInfo const> const MCII;
const MCRegisterInfo &MRI;
+ const unsigned TargetMaxInstBytes;
mutable ArrayRef<uint8_t> Bytes;
mutable uint32_t Literal;
mutable bool HasLiteral;
public:
AMDGPUDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx,
- MCInstrInfo const *MCII) :
- MCDisassembler(STI, Ctx), MCII(MCII), MRI(*Ctx.getRegisterInfo()) {}
-
+ MCInstrInfo const *MCII);
~AMDGPUDisassembler() override = default;
DecodeStatus getInstruction(MCInst &MI, uint64_t &Size,
@@ -69,9 +67,12 @@ public:
uint64_t Address) const;
DecodeStatus convertSDWAInst(MCInst &MI) const;
+ DecodeStatus convertDPP8Inst(MCInst &MI) const;
DecodeStatus convertMIMGInst(MCInst &MI) const;
MCOperand decodeOperand_VGPR_32(unsigned Val) const;
+ MCOperand decodeOperand_VRegOrLds_32(unsigned Val) const;
+
MCOperand decodeOperand_VS_32(unsigned Val) const;
MCOperand decodeOperand_VS_64(unsigned Val) const;
MCOperand decodeOperand_VS_128(unsigned Val) const;
@@ -81,22 +82,33 @@ public:
MCOperand decodeOperand_VReg_64(unsigned Val) const;
MCOperand decodeOperand_VReg_96(unsigned Val) const;
MCOperand decodeOperand_VReg_128(unsigned Val) const;
+ MCOperand decodeOperand_VReg_256(unsigned Val) const;
+ MCOperand decodeOperand_VReg_512(unsigned Val) const;
MCOperand decodeOperand_SReg_32(unsigned Val) const;
MCOperand decodeOperand_SReg_32_XM0_XEXEC(unsigned Val) const;
MCOperand decodeOperand_SReg_32_XEXEC_HI(unsigned Val) const;
+ MCOperand decodeOperand_SRegOrLds_32(unsigned Val) const;
MCOperand decodeOperand_SReg_64(unsigned Val) const;
MCOperand decodeOperand_SReg_64_XEXEC(unsigned Val) const;
MCOperand decodeOperand_SReg_128(unsigned Val) const;
MCOperand decodeOperand_SReg_256(unsigned Val) const;
MCOperand decodeOperand_SReg_512(unsigned Val) const;
+ MCOperand decodeOperand_AGPR_32(unsigned Val) const;
+ MCOperand decodeOperand_AReg_128(unsigned Val) const;
+ MCOperand decodeOperand_AReg_512(unsigned Val) const;
+ MCOperand decodeOperand_AReg_1024(unsigned Val) const;
+ MCOperand decodeOperand_AV_32(unsigned Val) const;
+ MCOperand decodeOperand_AV_64(unsigned Val) const;
+
enum OpWidthTy {
OPW32,
OPW64,
OPW128,
OPW256,
OPW512,
+ OPW1024,
OPW16,
OPWV216,
OPW_LAST_,
@@ -104,6 +116,7 @@ public:
};
unsigned getVgprClassId(const OpWidthTy Width) const;
+ unsigned getAgprClassId(const OpWidthTy Width) const;
unsigned getSgprClassId(const OpWidthTy Width) const;
unsigned getTtmpClassId(const OpWidthTy Width) const;
@@ -121,11 +134,14 @@ public:
MCOperand decodeSDWASrc32(unsigned Val) const;
MCOperand decodeSDWAVopcDst(unsigned Val) const;
+ MCOperand decodeBoolReg(unsigned Val) const;
+
int getTTmpIdx(unsigned Val) const;
bool isVI() const;
bool isGFX9() const;
- };
+ bool isGFX10() const;
+};
//===----------------------------------------------------------------------===//
// AMDGPUSymbolizer
diff --git a/lib/Target/AMDGPU/EvergreenInstructions.td b/lib/Target/AMDGPU/EvergreenInstructions.td
index 944f4ffe598d..0550092ce1d6 100644
--- a/lib/Target/AMDGPU/EvergreenInstructions.td
+++ b/lib/Target/AMDGPU/EvergreenInstructions.td
@@ -1,9 +1,8 @@
//===-- EvergreenInstructions.td - EG Instruction defs ----*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/FLATInstructions.td b/lib/Target/AMDGPU/FLATInstructions.td
index 44040d352e6a..889f60dae920 100644
--- a/lib/Target/AMDGPU/FLATInstructions.td
+++ b/lib/Target/AMDGPU/FLATInstructions.td
@@ -1,17 +1,16 @@
//===-- FLATInstructions.td - FLAT Instruction Definitions ----------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-def FLATAtomic : ComplexPattern<i64, 3, "SelectFlatAtomic", [], [], -10>;
-def FLATOffset : ComplexPattern<i64, 3, "SelectFlatOffset<false>", [], [], -10>;
+def FLATAtomic : ComplexPattern<i64, 3, "SelectFlatAtomic", [], [SDNPWantRoot], -10>;
+def FLATOffset : ComplexPattern<i64, 3, "SelectFlatOffset<false>", [], [SDNPWantRoot], -10>;
-def FLATOffsetSigned : ComplexPattern<i64, 3, "SelectFlatOffset<true>", [], [], -10>;
-def FLATSignedAtomic : ComplexPattern<i64, 3, "SelectFlatAtomicSigned", [], [], -10>;
+def FLATOffsetSigned : ComplexPattern<i64, 3, "SelectFlatOffset<true>", [], [SDNPWantRoot], -10>;
+def FLATSignedAtomic : ComplexPattern<i64, 3, "SelectFlatAtomicSigned", [], [SDNPWantRoot], -10>;
//===----------------------------------------------------------------------===//
// FLAT classes
@@ -52,6 +51,8 @@ class FLAT_Pseudo<string opName, dag outs, dag ins,
bits<1> has_data = 1;
bits<1> has_glc = 1;
bits<1> glcValue = 0;
+ bits<1> has_dlc = 1;
+ bits<1> dlcValue = 0;
let SubtargetPredicate = !if(is_flat_global, HasFlatGlobalInsts,
!if(is_flat_scratch, HasFlatScratchInsts, HasFlatAddressSpace));
@@ -64,6 +65,8 @@ class FLAT_Pseudo<string opName, dag outs, dag ins,
// and are not considered done until both have been decremented.
let VM_CNT = 1;
let LGKM_CNT = !if(!or(is_flat_global, is_flat_scratch), 0, 1);
+
+ let IsNonFlatSeg = !if(!or(is_flat_global, is_flat_scratch), 1, 0);
}
class FLAT_Real <bits<7> op, FLAT_Pseudo ps> :
@@ -87,6 +90,7 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps> :
bits<1> slc;
bits<1> glc;
+ bits<1> dlc;
// Only valid on gfx9
bits<1> lds = 0; // XXX - What does this actually do?
@@ -131,18 +135,16 @@ class GlobalSaddrTable <bit is_saddr, string Name = ""> {
// saddr is 32-bit (which isn't handled here yet).
class FLAT_Load_Pseudo <string opName, RegisterClass regClass,
bit HasTiedOutput = 0,
- bit HasSignedOffset = 0, bit HasSaddr = 0, bit EnableSaddr = 0> : FLAT_Pseudo<
+ bit HasSaddr = 0, bit EnableSaddr = 0> : FLAT_Pseudo<
opName,
(outs regClass:$vdst),
!con(
!con(
- !con(
- !con((ins VReg_64:$vaddr),
- !if(EnableSaddr, (ins SReg_64:$saddr), (ins))),
- (ins !if(HasSignedOffset,offset_s13,offset_u12):$offset)),
- (ins GLC:$glc, SLC:$slc)),
- !if(HasTiedOutput, (ins regClass:$vdst_in), (ins))),
- " $vdst, $vaddr"#!if(HasSaddr, !if(EnableSaddr, ", $saddr", ", off"), "")#"$offset$glc$slc"> {
+ !con((ins VReg_64:$vaddr),
+ !if(EnableSaddr, (ins SReg_64:$saddr), (ins))),
+ (ins flat_offset:$offset, GLC:$glc, SLC:$slc, DLC:$dlc)),
+ !if(HasTiedOutput, (ins regClass:$vdst_in), (ins))),
+ " $vdst, $vaddr"#!if(HasSaddr, !if(EnableSaddr, ", $saddr", ", off"), "")#"$offset$glc$slc$dlc"> {
let has_data = 0;
let mayLoad = 1;
let has_saddr = HasSaddr;
@@ -155,16 +157,14 @@ class FLAT_Load_Pseudo <string opName, RegisterClass regClass,
}
class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass,
- bit HasSignedOffset = 0, bit HasSaddr = 0, bit EnableSaddr = 0> : FLAT_Pseudo<
+ bit HasSaddr = 0, bit EnableSaddr = 0> : FLAT_Pseudo<
opName,
(outs),
!con(
- !con(
- !con((ins VReg_64:$vaddr, vdataClass:$vdata),
- !if(EnableSaddr, (ins SReg_64:$saddr), (ins))),
- (ins !if(HasSignedOffset,offset_s13,offset_u12):$offset)),
- (ins GLC:$glc, SLC:$slc)),
- " $vaddr, $vdata"#!if(HasSaddr, !if(EnableSaddr, ", $saddr", ", off"), "")#"$offset$glc$slc"> {
+ !con((ins VReg_64:$vaddr, vdataClass:$vdata),
+ !if(EnableSaddr, (ins SReg_64:$saddr), (ins))),
+ (ins flat_offset:$offset, GLC:$glc, SLC:$slc, DLC:$dlc)),
+ " $vaddr, $vdata"#!if(HasSaddr, !if(EnableSaddr, ", $saddr", ", off"), "")#"$offset$glc$slc$dlc"> {
let mayLoad = 0;
let mayStore = 1;
let has_vdst = 0;
@@ -176,18 +176,18 @@ class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass,
multiclass FLAT_Global_Load_Pseudo<string opName, RegisterClass regClass, bit HasTiedInput = 0> {
let is_flat_global = 1 in {
- def "" : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1, 1>,
+ def "" : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1>,
GlobalSaddrTable<0, opName>;
- def _SADDR : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1, 1, 1>,
+ def _SADDR : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1, 1>,
GlobalSaddrTable<1, opName>;
}
}
multiclass FLAT_Global_Store_Pseudo<string opName, RegisterClass regClass> {
let is_flat_global = 1 in {
- def "" : FLAT_Store_Pseudo<opName, regClass, 1, 1>,
+ def "" : FLAT_Store_Pseudo<opName, regClass, 1>,
GlobalSaddrTable<0, opName>;
- def _SADDR : FLAT_Store_Pseudo<opName, regClass, 1, 1, 1>,
+ def _SADDR : FLAT_Store_Pseudo<opName, regClass, 1, 1>,
GlobalSaddrTable<1, opName>;
}
}
@@ -197,9 +197,9 @@ class FLAT_Scratch_Load_Pseudo <string opName, RegisterClass regClass,
opName,
(outs regClass:$vdst),
!if(EnableSaddr,
- (ins SReg_32_XEXEC_HI:$saddr, offset_s13:$offset, GLC:$glc, SLC:$slc),
- (ins VGPR_32:$vaddr, offset_s13:$offset, GLC:$glc, SLC:$slc)),
- " $vdst, "#!if(EnableSaddr, "off", "$vaddr")#!if(EnableSaddr, ", $saddr", ", off")#"$offset$glc$slc"> {
+ (ins SReg_32_XEXEC_HI:$saddr, flat_offset:$offset, GLC:$glc, SLC:$slc, DLC:$dlc),
+ (ins VGPR_32:$vaddr, flat_offset:$offset, GLC:$glc, SLC:$slc, DLC:$dlc)),
+ " $vdst, "#!if(EnableSaddr, "off", "$vaddr")#!if(EnableSaddr, ", $saddr", ", off")#"$offset$glc$slc$dlc"> {
let has_data = 0;
let mayLoad = 1;
let has_saddr = 1;
@@ -213,9 +213,9 @@ class FLAT_Scratch_Store_Pseudo <string opName, RegisterClass vdataClass, bit En
opName,
(outs),
!if(EnableSaddr,
- (ins vdataClass:$vdata, SReg_32_XEXEC_HI:$saddr, offset_s13:$offset, GLC:$glc, SLC:$slc),
- (ins vdataClass:$vdata, VGPR_32:$vaddr, offset_s13:$offset, GLC:$glc, SLC:$slc)),
- " "#!if(EnableSaddr, "off", "$vaddr")#", $vdata, "#!if(EnableSaddr, "$saddr", "off")#"$offset$glc$slc"> {
+ (ins vdataClass:$vdata, SReg_32_XEXEC_HI:$saddr, flat_offset:$offset, GLC:$glc, SLC:$slc, DLC:$dlc),
+ (ins vdataClass:$vdata, VGPR_32:$vaddr, flat_offset:$offset, GLC:$glc, SLC:$slc, DLC:$dlc)),
+ " "#!if(EnableSaddr, "off", "$vaddr")#", $vdata, "#!if(EnableSaddr, "$saddr", "off")#"$offset$glc$slc$dlc"> {
let mayLoad = 0;
let mayStore = 1;
let has_vdst = 0;
@@ -247,6 +247,8 @@ class FLAT_AtomicNoRet_Pseudo<string opName, dag outs, dag ins,
let mayStore = 1;
let has_glc = 0;
let glcValue = 0;
+ let has_dlc = 0;
+ let dlcValue = 0;
let has_vdst = 0;
let maybeAtomic = 1;
}
@@ -257,6 +259,7 @@ class FLAT_AtomicRet_Pseudo<string opName, dag outs, dag ins,
let hasPostISelHook = 1;
let has_vdst = 1;
let glcValue = 1;
+ let dlcValue = 0;
let PseudoInstr = NAME # "_RTN";
}
@@ -266,24 +269,28 @@ multiclass FLAT_Atomic_Pseudo<
ValueType vt,
SDPatternOperator atomic = null_frag,
ValueType data_vt = vt,
- RegisterClass data_rc = vdst_rc> {
+ RegisterClass data_rc = vdst_rc,
+ bit isFP = getIsFP<data_vt>.ret> {
def "" : FLAT_AtomicNoRet_Pseudo <opName,
(outs),
- (ins VReg_64:$vaddr, data_rc:$vdata, offset_u12:$offset, SLC:$slc),
+ (ins VReg_64:$vaddr, data_rc:$vdata, flat_offset:$offset, SLC:$slc),
" $vaddr, $vdata$offset$slc">,
GlobalSaddrTable<0, opName>,
AtomicNoRet <opName, 0> {
let PseudoInstr = NAME;
+ let FPAtomic = isFP;
}
def _RTN : FLAT_AtomicRet_Pseudo <opName,
(outs vdst_rc:$vdst),
- (ins VReg_64:$vaddr, data_rc:$vdata, offset_u12:$offset, SLC:$slc),
+ (ins VReg_64:$vaddr, data_rc:$vdata, flat_offset:$offset, SLC:$slc),
" $vdst, $vaddr, $vdata$offset glc$slc",
[(set vt:$vdst,
(atomic (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc), data_vt:$vdata))]>,
GlobalSaddrTable<0, opName#"_rtn">,
- AtomicNoRet <opName, 1>;
+ AtomicNoRet <opName, 1> {
+ let FPAtomic = isFP;
+ }
}
multiclass FLAT_Global_Atomic_Pseudo_NO_RTN<
@@ -292,27 +299,30 @@ multiclass FLAT_Global_Atomic_Pseudo_NO_RTN<
ValueType vt,
SDPatternOperator atomic = null_frag,
ValueType data_vt = vt,
- RegisterClass data_rc = vdst_rc> {
+ RegisterClass data_rc = vdst_rc,
+ bit isFP = getIsFP<data_vt>.ret> {
def "" : FLAT_AtomicNoRet_Pseudo <opName,
(outs),
- (ins VReg_64:$vaddr, data_rc:$vdata, offset_s13:$offset, SLC:$slc),
+ (ins VReg_64:$vaddr, data_rc:$vdata, flat_offset:$offset, SLC:$slc),
" $vaddr, $vdata, off$offset$slc">,
GlobalSaddrTable<0, opName>,
AtomicNoRet <opName, 0> {
let has_saddr = 1;
let PseudoInstr = NAME;
+ let FPAtomic = isFP;
}
def _SADDR : FLAT_AtomicNoRet_Pseudo <opName,
(outs),
- (ins VReg_64:$vaddr, data_rc:$vdata, SReg_64:$saddr, offset_s13:$offset, SLC:$slc),
+ (ins VReg_64:$vaddr, data_rc:$vdata, SReg_64:$saddr, flat_offset:$offset, SLC:$slc),
" $vaddr, $vdata, $saddr$offset$slc">,
GlobalSaddrTable<1, opName>,
AtomicNoRet <opName#"_saddr", 0> {
let has_saddr = 1;
let enabled_saddr = 1;
let PseudoInstr = NAME#"_SADDR";
+ let FPAtomic = isFP;
}
}
@@ -322,28 +332,31 @@ multiclass FLAT_Global_Atomic_Pseudo_RTN<
ValueType vt,
SDPatternOperator atomic = null_frag,
ValueType data_vt = vt,
- RegisterClass data_rc = vdst_rc> {
+ RegisterClass data_rc = vdst_rc,
+ bit isFP = getIsFP<data_vt>.ret> {
def _RTN : FLAT_AtomicRet_Pseudo <opName,
(outs vdst_rc:$vdst),
- (ins VReg_64:$vaddr, data_rc:$vdata, offset_s13:$offset, SLC:$slc),
+ (ins VReg_64:$vaddr, data_rc:$vdata, flat_offset:$offset, SLC:$slc),
" $vdst, $vaddr, $vdata, off$offset glc$slc",
[(set vt:$vdst,
(atomic (FLATSignedAtomic i64:$vaddr, i16:$offset, i1:$slc), data_vt:$vdata))]>,
GlobalSaddrTable<0, opName#"_rtn">,
AtomicNoRet <opName, 1> {
let has_saddr = 1;
+ let FPAtomic = isFP;
}
def _SADDR_RTN : FLAT_AtomicRet_Pseudo <opName,
(outs vdst_rc:$vdst),
- (ins VReg_64:$vaddr, data_rc:$vdata, SReg_64:$saddr, offset_s13:$offset, SLC:$slc),
+ (ins VReg_64:$vaddr, data_rc:$vdata, SReg_64:$saddr, flat_offset:$offset, SLC:$slc),
" $vdst, $vaddr, $vdata, $saddr$offset glc$slc">,
GlobalSaddrTable<1, opName#"_rtn">,
AtomicNoRet <opName#"_saddr", 1> {
let has_saddr = 1;
let enabled_saddr = 1;
let PseudoInstr = NAME#"_SADDR_RTN";
+ let FPAtomic = isFP;
}
}
@@ -491,7 +504,8 @@ defm FLAT_ATOMIC_INC_X2 : FLAT_Atomic_Pseudo <"flat_atomic_inc_x2",
defm FLAT_ATOMIC_DEC_X2 : FLAT_Atomic_Pseudo <"flat_atomic_dec_x2",
VReg_64, i64, atomic_dec_flat>;
-let SubtargetPredicate = isCI in { // CI Only flat instructions : FIXME Only?
+// GFX7-, GFX10-only flat instructions.
+let SubtargetPredicate = isGFX7GFX10 in {
defm FLAT_ATOMIC_FCMPSWAP : FLAT_Atomic_Pseudo <"flat_atomic_fcmpswap",
VGPR_32, f32, null_frag, v2f32, VReg_64>;
@@ -511,7 +525,7 @@ defm FLAT_ATOMIC_FMIN_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fmin_x2",
defm FLAT_ATOMIC_FMAX_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fmax_x2",
VReg_64, f64>;
-} // End SubtargetPredicate = isCI
+} // End SubtargetPredicate = isGFX7GFX10
let SubtargetPredicate = HasFlatGlobalInsts in {
defm GLOBAL_LOAD_UBYTE : FLAT_Global_Load_Pseudo <"global_load_ubyte", VGPR_32>;
@@ -654,6 +668,32 @@ defm SCRATCH_STORE_SHORT_D16_HI : FLAT_Scratch_Store_Pseudo <"scratch_store_shor
} // End SubtargetPredicate = HasFlatScratchInsts
+let SubtargetPredicate = isGFX10Plus, is_flat_global = 1 in {
+ defm GLOBAL_ATOMIC_FCMPSWAP :
+ FLAT_Global_Atomic_Pseudo<"global_atomic_fcmpswap", VGPR_32, f32>;
+ defm GLOBAL_ATOMIC_FMIN :
+ FLAT_Global_Atomic_Pseudo<"global_atomic_fmin", VGPR_32, f32>;
+ defm GLOBAL_ATOMIC_FMAX :
+ FLAT_Global_Atomic_Pseudo<"global_atomic_fmax", VGPR_32, f32>;
+ defm GLOBAL_ATOMIC_FCMPSWAP_X2 :
+ FLAT_Global_Atomic_Pseudo<"global_atomic_fcmpswap_x2", VReg_64, f64>;
+ defm GLOBAL_ATOMIC_FMIN_X2 :
+ FLAT_Global_Atomic_Pseudo<"global_atomic_fmin_x2", VReg_64, f64>;
+ defm GLOBAL_ATOMIC_FMAX_X2 :
+ FLAT_Global_Atomic_Pseudo<"global_atomic_fmax_x2", VReg_64, f64>;
+} // End SubtargetPredicate = isGFX10Plus, is_flat_global = 1
+
+let SubtargetPredicate = HasAtomicFaddInsts, is_flat_global = 1 in {
+
+defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Atomic_Pseudo_NO_RTN <
+ "global_atomic_add_f32", VGPR_32, f32, atomic_add_global
+>;
+defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_NO_RTN <
+ "global_atomic_pk_add_f16", VGPR_32, v2f16, atomic_add_global
+>;
+
+} // End SubtargetPredicate = HasAtomicFaddInsts
+
//===----------------------------------------------------------------------===//
// Flat Patterns
//===----------------------------------------------------------------------===//
@@ -661,89 +701,51 @@ defm SCRATCH_STORE_SHORT_D16_HI : FLAT_Scratch_Store_Pseudo <"scratch_store_shor
// Patterns for global loads with no offset.
class FlatLoadPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
(vt (node (FLATOffset i64:$vaddr, i16:$offset, i1:$slc))),
- (inst $vaddr, $offset, 0, $slc)
+ (inst $vaddr, $offset, 0, 0, $slc)
>;
-multiclass FlatLoadPat_Hi16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt = i16> {
- def : GCNPat <
- (build_vector vt:$elt0, (vt (node (FLATOffset i64:$vaddr, i16:$offset, i1:$slc)))),
- (v2i16 (inst $vaddr, $offset, 0, $slc, $elt0))
- >;
-
- def : GCNPat <
- (build_vector f16:$elt0, (f16 (bitconvert (vt (node (FLATOffset i64:$vaddr, i16:$offset, i1:$slc)))))),
- (v2f16 (inst $vaddr, $offset, 0, $slc, $elt0))
- >;
-}
-
-multiclass FlatSignedLoadPat_Hi16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt = i16> {
- def : GCNPat <
- (build_vector vt:$elt0, (vt (node (FLATOffsetSigned i64:$vaddr, i16:$offset, i1:$slc)))),
- (v2i16 (inst $vaddr, $offset, 0, $slc, $elt0))
- >;
-
- def : GCNPat <
- (build_vector f16:$elt0, (f16 (bitconvert (vt (node (FLATOffsetSigned i64:$vaddr, i16:$offset, i1:$slc)))))),
- (v2f16 (inst $vaddr, $offset, 0, $slc, $elt0))
- >;
-}
-
-multiclass FlatLoadPat_Lo16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt = i16> {
- def : GCNPat <
- (build_vector (vt (node (FLATOffset i64:$vaddr, i16:$offset, i1:$slc))), (vt (Hi16Elt vt:$hi))),
- (v2i16 (inst $vaddr, $offset, 0, $slc, $hi))
- >;
-
- def : GCNPat <
- (build_vector (f16 (bitconvert (vt (node (FLATOffset i64:$vaddr, i16:$offset, i1:$slc))))), (f16 (Hi16Elt f16:$hi))),
- (v2f16 (inst $vaddr, $offset, 0, $slc, $hi))
- >;
-}
-
-multiclass FlatSignedLoadPat_Lo16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt = i16> {
- def : GCNPat <
- (build_vector (vt (node (FLATOffsetSigned i64:$vaddr, i16:$offset, i1:$slc))), (vt (Hi16Elt vt:$hi))),
- (v2i16 (inst $vaddr, $offset, 0, $slc, $hi))
- >;
+class FlatLoadPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (node (FLATOffset (i64 VReg_64:$vaddr), i16:$offset, i1:$slc), vt:$in),
+ (inst $vaddr, $offset, 0, 0, $slc, $in)
+>;
- def : GCNPat <
- (build_vector (f16 (bitconvert (vt (node (FLATOffsetSigned i64:$vaddr, i16:$offset, i1:$slc))))), (f16 (Hi16Elt f16:$hi))),
- (v2f16 (inst $vaddr, $offset, 0, $slc, $hi))
- >;
-}
+class FlatSignedLoadPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (node (FLATOffsetSigned (i64 VReg_64:$vaddr), i16:$offset, i1:$slc), vt:$in),
+ (inst $vaddr, $offset, 0, 0, $slc, $in)
+>;
class FlatLoadAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (vt (node (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc))),
- (inst $vaddr, $offset, 0, $slc)
+ (vt (node (FLATAtomic (i64 VReg_64:$vaddr), i16:$offset, i1:$slc))),
+ (inst $vaddr, $offset, 0, 0, $slc)
>;
class FlatLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (vt (node (FLATOffsetSigned i64:$vaddr, i16:$offset, i1:$slc))),
- (inst $vaddr, $offset, 0, $slc)
+ (vt (node (FLATOffsetSigned (i64 VReg_64:$vaddr), i16:$offset, i1:$slc))),
+ (inst $vaddr, $offset, 0, 0, $slc)
>;
-class FlatStorePat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+class FlatStorePat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt, RegisterClass rc = VGPR_32> : GCNPat <
(node vt:$data, (FLATOffset i64:$vaddr, i16:$offset, i1:$slc)),
- (inst $vaddr, $data, $offset, 0, $slc)
+ (inst $vaddr, rc:$data, $offset, 0, 0, $slc)
>;
-class FlatStoreSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+class FlatStoreSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt, RegisterClass rc = VGPR_32> : GCNPat <
(node vt:$data, (FLATOffsetSigned i64:$vaddr, i16:$offset, i1:$slc)),
- (inst $vaddr, $data, $offset, 0, $slc)
+ (inst $vaddr, rc:$data, $offset, 0, 0, $slc)
>;
-class FlatStoreAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+class FlatStoreAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt, RegisterClass rc = VGPR_32> : GCNPat <
// atomic store follows atomic binop convention so the address comes
// first.
(node (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc), vt:$data),
- (inst $vaddr, $data, $offset, 0, $slc)
+ (inst $vaddr, rc:$data, $offset, 0, 0, $slc)
>;
-class FlatStoreSignedAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+class FlatStoreSignedAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt, RegisterClass rc = VGPR_32> : GCNPat <
// atomic store follows atomic binop convention so the address comes
// first.
(node (FLATSignedAtomic i64:$vaddr, i16:$offset, i1:$slc), vt:$data),
- (inst $vaddr, $data, $offset, 0, $slc)
+ (inst $vaddr, rc:$data, $offset, 0, 0, $slc)
>;
class FlatAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt,
@@ -752,6 +754,11 @@ class FlatAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt,
(inst $vaddr, $data, $offset, $slc)
>;
+class FlatAtomicPatNoRtn <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (node (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc), vt:$data),
+ (inst $vaddr, $data, $offset, $slc)
+>;
+
class FlatSignedAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt,
ValueType data_vt = vt> : GCNPat <
(vt (node (FLATSignedAtomic i64:$vaddr, i16:$offset, i1:$slc), data_vt:$data)),
@@ -760,28 +767,33 @@ class FlatSignedAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType v
let OtherPredicates = [HasFlatAddressSpace] in {
-def : FlatLoadPat <FLAT_LOAD_UBYTE, az_extloadi8_flat, i32>;
+def : FlatLoadPat <FLAT_LOAD_UBYTE, extloadi8_flat, i32>;
+def : FlatLoadPat <FLAT_LOAD_UBYTE, zextloadi8_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_UBYTE, az_extloadi8_flat, i16>;
+def : FlatLoadPat <FLAT_LOAD_UBYTE, extloadi8_flat, i16>;
+def : FlatLoadPat <FLAT_LOAD_UBYTE, zextloadi8_flat, i16>;
def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_flat, i16>;
-def : FlatLoadPat <FLAT_LOAD_USHORT, az_extloadi16_flat, i32>;
+def : FlatLoadPat <FLAT_LOAD_USHORT, extloadi16_flat, i32>;
+def : FlatLoadPat <FLAT_LOAD_USHORT, zextloadi16_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_USHORT, load_flat, i16>;
def : FlatLoadPat <FLAT_LOAD_SSHORT, sextloadi16_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_DWORD, load_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_DWORDX2, load_flat, v2i32>;
+def : FlatLoadPat <FLAT_LOAD_DWORDX3, load_flat, v3i32>;
def : FlatLoadPat <FLAT_LOAD_DWORDX4, load_flat, v4i32>;
-def : FlatLoadAtomicPat <FLAT_LOAD_DWORD, atomic_load_flat, i32>;
-def : FlatLoadAtomicPat <FLAT_LOAD_DWORDX2, atomic_load_flat, i64>;
+def : FlatLoadAtomicPat <FLAT_LOAD_DWORD, atomic_load_32_flat, i32>;
+def : FlatLoadAtomicPat <FLAT_LOAD_DWORDX2, atomic_load_64_flat, i64>;
def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_flat, i32>;
def : FlatStorePat <FLAT_STORE_SHORT, truncstorei16_flat, i32>;
def : FlatStorePat <FLAT_STORE_DWORD, store_flat, i32>;
-def : FlatStorePat <FLAT_STORE_DWORDX2, store_flat, v2i32>;
-def : FlatStorePat <FLAT_STORE_DWORDX4, store_flat, v4i32>;
+def : FlatStorePat <FLAT_STORE_DWORDX2, store_flat, v2i32, VReg_64>;
+def : FlatStorePat <FLAT_STORE_DWORDX3, store_flat, v3i32, VReg_96>;
+def : FlatStorePat <FLAT_STORE_DWORDX4, store_flat, v4i32, VReg_128>;
-def : FlatStoreAtomicPat <FLAT_STORE_DWORD, atomic_store_flat, i32>;
-def : FlatStoreAtomicPat <FLAT_STORE_DWORDX2, atomic_store_flat, i64>;
+def : FlatStoreAtomicPat <FLAT_STORE_DWORD, atomic_store_flat_32, i32>;
+def : FlatStoreAtomicPat <FLAT_STORE_DWORDX2, atomic_store_flat_64, i64, VReg_64>;
def : FlatAtomicPat <FLAT_ATOMIC_ADD_RTN, atomic_add_global, i32>;
def : FlatAtomicPat <FLAT_ATOMIC_SUB_RTN, atomic_sub_global, i32>;
@@ -818,62 +830,77 @@ let OtherPredicates = [D16PreservesUnusedBits] in {
def : FlatStorePat <FLAT_STORE_SHORT_D16_HI, truncstorei16_hi16_flat, i32>;
def : FlatStorePat <FLAT_STORE_BYTE_D16_HI, truncstorei8_hi16_flat, i32>;
-let AddedComplexity = 3 in {
-defm : FlatLoadPat_Hi16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_flat>;
-defm : FlatLoadPat_Hi16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_flat>;
-defm : FlatLoadPat_Hi16 <FLAT_LOAD_SHORT_D16_HI, load_flat>;
-}
-
-let AddedComplexity = 9 in {
-defm : FlatLoadPat_Lo16 <FLAT_LOAD_UBYTE_D16, az_extloadi8_flat>;
-defm : FlatLoadPat_Lo16 <FLAT_LOAD_SBYTE_D16, sextloadi8_flat>;
-defm : FlatLoadPat_Lo16 <FLAT_LOAD_SHORT_D16, load_flat>;
-}
+def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_flat, v2i16>;
+def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_flat, v2f16>;
+def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_flat, v2i16>;
+def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_flat, v2f16>;
+def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16_HI, load_d16_hi_flat, v2i16>;
+def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16_HI, load_d16_hi_flat, v2f16>;
+
+def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16, az_extloadi8_d16_lo_flat, v2i16>;
+def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16, az_extloadi8_d16_lo_flat, v2f16>;
+def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16, sextloadi8_d16_lo_flat, v2i16>;
+def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16, sextloadi8_d16_lo_flat, v2f16>;
+def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2i16>;
+def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2f16>;
}
} // End OtherPredicates = [HasFlatAddressSpace]
+def atomic_fadd_global : global_binary_atomic_op_frag<SIglobal_atomic_fadd>;
+def atomic_pk_fadd_global : global_binary_atomic_op_frag<SIglobal_atomic_pk_fadd>;
+
let OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10 in {
-def : FlatLoadSignedPat <GLOBAL_LOAD_UBYTE, az_extloadi8_global, i32>;
+def : FlatLoadSignedPat <GLOBAL_LOAD_UBYTE, extloadi8_global, i32>;
+def : FlatLoadSignedPat <GLOBAL_LOAD_UBYTE, zextloadi8_global, i32>;
def : FlatLoadSignedPat <GLOBAL_LOAD_SBYTE, sextloadi8_global, i32>;
-def : FlatLoadSignedPat <GLOBAL_LOAD_UBYTE, az_extloadi8_global, i16>;
+def : FlatLoadSignedPat <GLOBAL_LOAD_UBYTE, extloadi8_global, i16>;
+def : FlatLoadSignedPat <GLOBAL_LOAD_UBYTE, zextloadi8_global, i16>;
def : FlatLoadSignedPat <GLOBAL_LOAD_SBYTE, sextloadi8_global, i16>;
-def : FlatLoadSignedPat <GLOBAL_LOAD_USHORT, az_extloadi16_global, i32>;
+def : FlatLoadSignedPat <GLOBAL_LOAD_USHORT, extloadi16_global, i32>;
+def : FlatLoadSignedPat <GLOBAL_LOAD_USHORT, zextloadi16_global, i32>;
def : FlatLoadSignedPat <GLOBAL_LOAD_SSHORT, sextloadi16_global, i32>;
def : FlatLoadSignedPat <GLOBAL_LOAD_USHORT, load_global, i16>;
def : FlatLoadSignedPat <GLOBAL_LOAD_DWORD, load_global, i32>;
def : FlatLoadSignedPat <GLOBAL_LOAD_DWORDX2, load_global, v2i32>;
+def : FlatLoadSignedPat <GLOBAL_LOAD_DWORDX3, load_global, v3i32>;
def : FlatLoadSignedPat <GLOBAL_LOAD_DWORDX4, load_global, v4i32>;
-def : FlatLoadAtomicPat <GLOBAL_LOAD_DWORD, atomic_load_global, i32>;
-def : FlatLoadAtomicPat <GLOBAL_LOAD_DWORDX2, atomic_load_global, i64>;
+def : FlatLoadAtomicPat <GLOBAL_LOAD_DWORD, atomic_load_32_global, i32>;
+def : FlatLoadAtomicPat <GLOBAL_LOAD_DWORDX2, atomic_load_64_global, i64>;
-def : FlatStoreSignedPat <GLOBAL_STORE_BYTE, truncstorei8_global, i32>;
-def : FlatStoreSignedPat <GLOBAL_STORE_BYTE, truncstorei8_global, i16>;
-def : FlatStoreSignedPat <GLOBAL_STORE_SHORT, truncstorei16_global, i32>;
-def : FlatStoreSignedPat <GLOBAL_STORE_SHORT, store_global, i16>;
-def : FlatStoreSignedPat <GLOBAL_STORE_DWORD, store_global, i32>;
-def : FlatStoreSignedPat <GLOBAL_STORE_DWORDX2, store_global, v2i32>;
-def : FlatStoreSignedPat <GLOBAL_STORE_DWORDX4, store_global, v4i32>;
+def : FlatStoreSignedPat <GLOBAL_STORE_BYTE, truncstorei8_global, i32, VGPR_32>;
+def : FlatStoreSignedPat <GLOBAL_STORE_BYTE, truncstorei8_global, i16, VGPR_32>;
+def : FlatStoreSignedPat <GLOBAL_STORE_SHORT, truncstorei16_global, i32, VGPR_32>;
+def : FlatStoreSignedPat <GLOBAL_STORE_SHORT, store_global, i16, VGPR_32>;
+def : FlatStoreSignedPat <GLOBAL_STORE_DWORD, store_global, i32, VGPR_32>;
+def : FlatStoreSignedPat <GLOBAL_STORE_DWORDX2, store_global, v2i32, VReg_64>;
+def : FlatStoreSignedPat <GLOBAL_STORE_DWORDX3, store_global, v3i32, VReg_96>;
+def : FlatStoreSignedPat <GLOBAL_STORE_DWORDX4, store_global, v4i32, VReg_128>;
let OtherPredicates = [D16PreservesUnusedBits] in {
def : FlatStoreSignedPat <GLOBAL_STORE_SHORT_D16_HI, truncstorei16_hi16_global, i32>;
def : FlatStoreSignedPat <GLOBAL_STORE_BYTE_D16_HI, truncstorei8_hi16_global, i32>;
-defm : FlatSignedLoadPat_Hi16 <GLOBAL_LOAD_UBYTE_D16_HI, az_extloadi8_global>;
-defm : FlatSignedLoadPat_Hi16 <GLOBAL_LOAD_SBYTE_D16_HI, sextloadi8_global>;
-defm : FlatSignedLoadPat_Hi16 <GLOBAL_LOAD_SHORT_D16_HI, load_global>;
-
-defm : FlatSignedLoadPat_Lo16 <GLOBAL_LOAD_UBYTE_D16, az_extloadi8_global>;
-defm : FlatSignedLoadPat_Lo16 <GLOBAL_LOAD_SBYTE_D16, sextloadi8_global>;
-defm : FlatSignedLoadPat_Lo16 <GLOBAL_LOAD_SHORT_D16, load_global>;
-
+def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_global, v2i16>;
+def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_global, v2f16>;
+def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_global, v2i16>;
+def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_global, v2f16>;
+def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SHORT_D16_HI, load_d16_hi_global, v2i16>;
+def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SHORT_D16_HI, load_d16_hi_global, v2f16>;
+
+def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_UBYTE_D16, az_extloadi8_d16_lo_global, v2i16>;
+def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_UBYTE_D16, az_extloadi8_d16_lo_global, v2f16>;
+def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SBYTE_D16, sextloadi8_d16_lo_global, v2i16>;
+def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SBYTE_D16, sextloadi8_d16_lo_global, v2f16>;
+def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SHORT_D16, load_d16_lo_global, v2i16>;
+def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SHORT_D16, load_d16_lo_global, v2f16>;
}
def : FlatStoreSignedAtomicPat <GLOBAL_STORE_DWORD, store_atomic_global, i32>;
-def : FlatStoreSignedAtomicPat <GLOBAL_STORE_DWORDX2, store_atomic_global, i64>;
+def : FlatStoreSignedAtomicPat <GLOBAL_STORE_DWORDX2, store_atomic_global, i64, VReg_64>;
def : FlatSignedAtomicPat <GLOBAL_ATOMIC_ADD_RTN, atomic_add_global, i32>;
def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SUB_RTN, atomic_sub_global, i32>;
@@ -903,7 +930,10 @@ def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SWAP_X2_RTN, atomic_swap_global, i64>;
def : FlatSignedAtomicPat <GLOBAL_ATOMIC_CMPSWAP_X2_RTN, AMDGPUatomic_cmp_swap_global, i64, v2i64>;
def : FlatSignedAtomicPat <GLOBAL_ATOMIC_XOR_X2_RTN, atomic_xor_global, i64>;
-} // End OtherPredicates = [HasFlatGlobalInsts]
+def : FlatAtomicPatNoRtn <GLOBAL_ATOMIC_ADD_F32, atomic_fadd_global, f32>;
+def : FlatAtomicPatNoRtn <GLOBAL_ATOMIC_PK_ADD_F16, atomic_pk_fadd_global, v2f16>;
+
+} // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10
//===----------------------------------------------------------------------===//
@@ -917,8 +947,8 @@ def : FlatSignedAtomicPat <GLOBAL_ATOMIC_XOR_X2_RTN, atomic_xor_global, i64>;
class FLAT_Real_ci <bits<7> op, FLAT_Pseudo ps> :
FLAT_Real <op, ps>,
SIMCInstr <ps.PseudoInstr, SIEncodingFamily.SI> {
- let AssemblerPredicate = isCIOnly;
- let DecoderNamespace="CI";
+ let AssemblerPredicate = isGFX7Only;
+ let DecoderNamespace="GFX7";
}
def FLAT_LOAD_UBYTE_ci : FLAT_Real_ci <0x8, FLAT_LOAD_UBYTE>;
@@ -985,8 +1015,8 @@ defm FLAT_ATOMIC_FMAX_X2 : FLAT_Real_Atomics_ci <0x60, FLAT_ATOMIC_FMAX_X2
class FLAT_Real_vi <bits<7> op, FLAT_Pseudo ps> :
FLAT_Real <op, ps>,
SIMCInstr <ps.PseudoInstr, SIEncodingFamily.VI> {
- let AssemblerPredicate = isVI;
- let DecoderNamespace="VI";
+ let AssemblerPredicate = isGFX8GFX9;
+ let DecoderNamespace = "GFX8";
}
multiclass FLAT_Real_AllAddr_vi<bits<7> op> {
@@ -1133,3 +1163,200 @@ defm SCRATCH_STORE_DWORD : FLAT_Real_AllAddr_vi <0x1c>;
defm SCRATCH_STORE_DWORDX2 : FLAT_Real_AllAddr_vi <0x1d>;
defm SCRATCH_STORE_DWORDX3 : FLAT_Real_AllAddr_vi <0x1e>;
defm SCRATCH_STORE_DWORDX4 : FLAT_Real_AllAddr_vi <0x1f>;
+
+
+//===----------------------------------------------------------------------===//
+// GFX10.
+//===----------------------------------------------------------------------===//
+
+class FLAT_Real_gfx10<bits<7> op, FLAT_Pseudo ps> :
+ FLAT_Real<op, ps>, SIMCInstr<ps.PseudoInstr, SIEncodingFamily.GFX10> {
+ let AssemblerPredicate = isGFX10Plus;
+ let DecoderNamespace = "GFX10";
+
+ let Inst{11-0} = {offset{12}, offset{10-0}};
+ let Inst{12} = !if(ps.has_dlc, dlc, ps.dlcValue);
+ let Inst{54-48} = !if(ps.has_saddr, !if(ps.enabled_saddr, saddr, 0x7d), 0x7d);
+ let Inst{55} = 0;
+}
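// Illustrative sketch (hypothetical C++ helper, not part of this patch) of how
// the GFX10 FLAT offset field above is assembled from a 13-bit offset operand:
// bit 12 of the operand goes to Inst[11], bits 10..0 go to Inst[10:0], and
// operand bit 11 is not encoded, mirroring
//   Inst{11-0} = {offset{12}, offset{10-0}};
// The function name and types below are made up for the example.
#include <cstdint>

static uint32_t packGfx10FlatOffsetField(int32_t Offset) {
  uint32_t Bits = static_cast<uint32_t>(Offset) & 0x1FFFu; // 13-bit operand
  uint32_t Bit12 = (Bits >> 12) & 0x1u;                    // offset{12}
  uint32_t Low11 = Bits & 0x7FFu;                          // offset{10-0}
  return (Bit12 << 11) | Low11;                            // Inst{11-0}
}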
+
+
+multiclass FLAT_Real_Base_gfx10<bits<7> op> {
+ def _gfx10 :
+ FLAT_Real_gfx10<op, !cast<FLAT_Pseudo>(NAME)>;
+}
+
+multiclass FLAT_Real_RTN_gfx10<bits<7> op> {
+ def _RTN_gfx10 :
+ FLAT_Real_gfx10<op, !cast<FLAT_Pseudo>(NAME#"_RTN")>;
+}
+
+multiclass FLAT_Real_SADDR_gfx10<bits<7> op> {
+ def _SADDR_gfx10 :
+ FLAT_Real_gfx10<op, !cast<FLAT_Pseudo>(NAME#"_SADDR")>;
+}
+
+multiclass FLAT_Real_SADDR_RTN_gfx10<bits<7> op> {
+ def _SADDR_RTN_gfx10 :
+ FLAT_Real_gfx10<op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN")>;
+}
+
+
+multiclass FLAT_Real_AllAddr_gfx10<bits<7> op> :
+ FLAT_Real_Base_gfx10<op>,
+ FLAT_Real_SADDR_gfx10<op>;
+
+multiclass FLAT_Real_Atomics_gfx10<bits<7> op> :
+ FLAT_Real_Base_gfx10<op>,
+ FLAT_Real_RTN_gfx10<op>;
+
+multiclass FLAT_Real_GlblAtomics_gfx10<bits<7> op> :
+ FLAT_Real_AllAddr_gfx10<op>,
+ FLAT_Real_RTN_gfx10<op>,
+ FLAT_Real_SADDR_RTN_gfx10<op>;
+
+
+// ENC_FLAT.
+defm FLAT_LOAD_UBYTE : FLAT_Real_Base_gfx10<0x008>;
+defm FLAT_LOAD_SBYTE : FLAT_Real_Base_gfx10<0x009>;
+defm FLAT_LOAD_USHORT : FLAT_Real_Base_gfx10<0x00a>;
+defm FLAT_LOAD_SSHORT : FLAT_Real_Base_gfx10<0x00b>;
+defm FLAT_LOAD_DWORD : FLAT_Real_Base_gfx10<0x00c>;
+defm FLAT_LOAD_DWORDX2 : FLAT_Real_Base_gfx10<0x00d>;
+defm FLAT_LOAD_DWORDX4 : FLAT_Real_Base_gfx10<0x00e>;
+defm FLAT_LOAD_DWORDX3 : FLAT_Real_Base_gfx10<0x00f>;
+defm FLAT_STORE_BYTE : FLAT_Real_Base_gfx10<0x018>;
+defm FLAT_STORE_BYTE_D16_HI : FLAT_Real_Base_gfx10<0x019>;
+defm FLAT_STORE_SHORT : FLAT_Real_Base_gfx10<0x01a>;
+defm FLAT_STORE_SHORT_D16_HI : FLAT_Real_Base_gfx10<0x01b>;
+defm FLAT_STORE_DWORD : FLAT_Real_Base_gfx10<0x01c>;
+defm FLAT_STORE_DWORDX2 : FLAT_Real_Base_gfx10<0x01d>;
+defm FLAT_STORE_DWORDX4 : FLAT_Real_Base_gfx10<0x01e>;
+defm FLAT_STORE_DWORDX3 : FLAT_Real_Base_gfx10<0x01f>;
+defm FLAT_LOAD_UBYTE_D16 : FLAT_Real_Base_gfx10<0x020>;
+defm FLAT_LOAD_UBYTE_D16_HI : FLAT_Real_Base_gfx10<0x021>;
+defm FLAT_LOAD_SBYTE_D16 : FLAT_Real_Base_gfx10<0x022>;
+defm FLAT_LOAD_SBYTE_D16_HI : FLAT_Real_Base_gfx10<0x023>;
+defm FLAT_LOAD_SHORT_D16 : FLAT_Real_Base_gfx10<0x024>;
+defm FLAT_LOAD_SHORT_D16_HI : FLAT_Real_Base_gfx10<0x025>;
+defm FLAT_ATOMIC_SWAP : FLAT_Real_Atomics_gfx10<0x030>;
+defm FLAT_ATOMIC_CMPSWAP : FLAT_Real_Atomics_gfx10<0x031>;
+defm FLAT_ATOMIC_ADD : FLAT_Real_Atomics_gfx10<0x032>;
+defm FLAT_ATOMIC_SUB : FLAT_Real_Atomics_gfx10<0x033>;
+defm FLAT_ATOMIC_SMIN : FLAT_Real_Atomics_gfx10<0x035>;
+defm FLAT_ATOMIC_UMIN : FLAT_Real_Atomics_gfx10<0x036>;
+defm FLAT_ATOMIC_SMAX : FLAT_Real_Atomics_gfx10<0x037>;
+defm FLAT_ATOMIC_UMAX : FLAT_Real_Atomics_gfx10<0x038>;
+defm FLAT_ATOMIC_AND : FLAT_Real_Atomics_gfx10<0x039>;
+defm FLAT_ATOMIC_OR : FLAT_Real_Atomics_gfx10<0x03a>;
+defm FLAT_ATOMIC_XOR : FLAT_Real_Atomics_gfx10<0x03b>;
+defm FLAT_ATOMIC_INC : FLAT_Real_Atomics_gfx10<0x03c>;
+defm FLAT_ATOMIC_DEC : FLAT_Real_Atomics_gfx10<0x03d>;
+defm FLAT_ATOMIC_FCMPSWAP : FLAT_Real_Atomics_gfx10<0x03e>;
+defm FLAT_ATOMIC_FMIN : FLAT_Real_Atomics_gfx10<0x03f>;
+defm FLAT_ATOMIC_FMAX : FLAT_Real_Atomics_gfx10<0x040>;
+defm FLAT_ATOMIC_SWAP_X2 : FLAT_Real_Atomics_gfx10<0x050>;
+defm FLAT_ATOMIC_CMPSWAP_X2 : FLAT_Real_Atomics_gfx10<0x051>;
+defm FLAT_ATOMIC_ADD_X2 : FLAT_Real_Atomics_gfx10<0x052>;
+defm FLAT_ATOMIC_SUB_X2 : FLAT_Real_Atomics_gfx10<0x053>;
+defm FLAT_ATOMIC_SMIN_X2 : FLAT_Real_Atomics_gfx10<0x055>;
+defm FLAT_ATOMIC_UMIN_X2 : FLAT_Real_Atomics_gfx10<0x056>;
+defm FLAT_ATOMIC_SMAX_X2 : FLAT_Real_Atomics_gfx10<0x057>;
+defm FLAT_ATOMIC_UMAX_X2 : FLAT_Real_Atomics_gfx10<0x058>;
+defm FLAT_ATOMIC_AND_X2 : FLAT_Real_Atomics_gfx10<0x059>;
+defm FLAT_ATOMIC_OR_X2 : FLAT_Real_Atomics_gfx10<0x05a>;
+defm FLAT_ATOMIC_XOR_X2 : FLAT_Real_Atomics_gfx10<0x05b>;
+defm FLAT_ATOMIC_INC_X2 : FLAT_Real_Atomics_gfx10<0x05c>;
+defm FLAT_ATOMIC_DEC_X2 : FLAT_Real_Atomics_gfx10<0x05d>;
+defm FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_Real_Atomics_gfx10<0x05e>;
+defm FLAT_ATOMIC_FMIN_X2 : FLAT_Real_Atomics_gfx10<0x05f>;
+defm FLAT_ATOMIC_FMAX_X2 : FLAT_Real_Atomics_gfx10<0x060>;
+
+
+// ENC_FLAT_GLBL.
+defm GLOBAL_LOAD_UBYTE : FLAT_Real_AllAddr_gfx10<0x008>;
+defm GLOBAL_LOAD_SBYTE : FLAT_Real_AllAddr_gfx10<0x009>;
+defm GLOBAL_LOAD_USHORT : FLAT_Real_AllAddr_gfx10<0x00a>;
+defm GLOBAL_LOAD_SSHORT : FLAT_Real_AllAddr_gfx10<0x00b>;
+defm GLOBAL_LOAD_DWORD : FLAT_Real_AllAddr_gfx10<0x00c>;
+defm GLOBAL_LOAD_DWORDX2 : FLAT_Real_AllAddr_gfx10<0x00d>;
+defm GLOBAL_LOAD_DWORDX4 : FLAT_Real_AllAddr_gfx10<0x00e>;
+defm GLOBAL_LOAD_DWORDX3 : FLAT_Real_AllAddr_gfx10<0x00f>;
+defm GLOBAL_STORE_BYTE : FLAT_Real_AllAddr_gfx10<0x018>;
+defm GLOBAL_STORE_BYTE_D16_HI : FLAT_Real_AllAddr_gfx10<0x019>;
+defm GLOBAL_STORE_SHORT : FLAT_Real_AllAddr_gfx10<0x01a>;
+defm GLOBAL_STORE_SHORT_D16_HI : FLAT_Real_AllAddr_gfx10<0x01b>;
+defm GLOBAL_STORE_DWORD : FLAT_Real_AllAddr_gfx10<0x01c>;
+defm GLOBAL_STORE_DWORDX2 : FLAT_Real_AllAddr_gfx10<0x01d>;
+defm GLOBAL_STORE_DWORDX4 : FLAT_Real_AllAddr_gfx10<0x01e>;
+defm GLOBAL_STORE_DWORDX3 : FLAT_Real_AllAddr_gfx10<0x01f>;
+defm GLOBAL_LOAD_UBYTE_D16 : FLAT_Real_AllAddr_gfx10<0x020>;
+defm GLOBAL_LOAD_UBYTE_D16_HI : FLAT_Real_AllAddr_gfx10<0x021>;
+defm GLOBAL_LOAD_SBYTE_D16 : FLAT_Real_AllAddr_gfx10<0x022>;
+defm GLOBAL_LOAD_SBYTE_D16_HI : FLAT_Real_AllAddr_gfx10<0x023>;
+defm GLOBAL_LOAD_SHORT_D16 : FLAT_Real_AllAddr_gfx10<0x024>;
+defm GLOBAL_LOAD_SHORT_D16_HI : FLAT_Real_AllAddr_gfx10<0x025>;
+defm GLOBAL_ATOMIC_SWAP : FLAT_Real_GlblAtomics_gfx10<0x030>;
+defm GLOBAL_ATOMIC_CMPSWAP : FLAT_Real_GlblAtomics_gfx10<0x031>;
+defm GLOBAL_ATOMIC_ADD : FLAT_Real_GlblAtomics_gfx10<0x032>;
+defm GLOBAL_ATOMIC_SUB : FLAT_Real_GlblAtomics_gfx10<0x033>;
+defm GLOBAL_ATOMIC_SMIN : FLAT_Real_GlblAtomics_gfx10<0x035>;
+defm GLOBAL_ATOMIC_UMIN : FLAT_Real_GlblAtomics_gfx10<0x036>;
+defm GLOBAL_ATOMIC_SMAX : FLAT_Real_GlblAtomics_gfx10<0x037>;
+defm GLOBAL_ATOMIC_UMAX : FLAT_Real_GlblAtomics_gfx10<0x038>;
+defm GLOBAL_ATOMIC_AND : FLAT_Real_GlblAtomics_gfx10<0x039>;
+defm GLOBAL_ATOMIC_OR : FLAT_Real_GlblAtomics_gfx10<0x03a>;
+defm GLOBAL_ATOMIC_XOR : FLAT_Real_GlblAtomics_gfx10<0x03b>;
+defm GLOBAL_ATOMIC_INC : FLAT_Real_GlblAtomics_gfx10<0x03c>;
+defm GLOBAL_ATOMIC_DEC : FLAT_Real_GlblAtomics_gfx10<0x03d>;
+defm GLOBAL_ATOMIC_FCMPSWAP : FLAT_Real_GlblAtomics_gfx10<0x03e>;
+defm GLOBAL_ATOMIC_FMIN : FLAT_Real_GlblAtomics_gfx10<0x03f>;
+defm GLOBAL_ATOMIC_FMAX : FLAT_Real_GlblAtomics_gfx10<0x040>;
+defm GLOBAL_ATOMIC_SWAP_X2 : FLAT_Real_GlblAtomics_gfx10<0x050>;
+defm GLOBAL_ATOMIC_CMPSWAP_X2 : FLAT_Real_GlblAtomics_gfx10<0x051>;
+defm GLOBAL_ATOMIC_ADD_X2 : FLAT_Real_GlblAtomics_gfx10<0x052>;
+defm GLOBAL_ATOMIC_SUB_X2 : FLAT_Real_GlblAtomics_gfx10<0x053>;
+defm GLOBAL_ATOMIC_SMIN_X2 : FLAT_Real_GlblAtomics_gfx10<0x055>;
+defm GLOBAL_ATOMIC_UMIN_X2 : FLAT_Real_GlblAtomics_gfx10<0x056>;
+defm GLOBAL_ATOMIC_SMAX_X2 : FLAT_Real_GlblAtomics_gfx10<0x057>;
+defm GLOBAL_ATOMIC_UMAX_X2 : FLAT_Real_GlblAtomics_gfx10<0x058>;
+defm GLOBAL_ATOMIC_AND_X2 : FLAT_Real_GlblAtomics_gfx10<0x059>;
+defm GLOBAL_ATOMIC_OR_X2 : FLAT_Real_GlblAtomics_gfx10<0x05a>;
+defm GLOBAL_ATOMIC_XOR_X2 : FLAT_Real_GlblAtomics_gfx10<0x05b>;
+defm GLOBAL_ATOMIC_INC_X2 : FLAT_Real_GlblAtomics_gfx10<0x05c>;
+defm GLOBAL_ATOMIC_DEC_X2 : FLAT_Real_GlblAtomics_gfx10<0x05d>;
+defm GLOBAL_ATOMIC_FCMPSWAP_X2 : FLAT_Real_GlblAtomics_gfx10<0x05e>;
+defm GLOBAL_ATOMIC_FMIN_X2 : FLAT_Real_GlblAtomics_gfx10<0x05f>;
+defm GLOBAL_ATOMIC_FMAX_X2 : FLAT_Real_GlblAtomics_gfx10<0x060>;
+
+
+// ENC_FLAT_SCRATCH.
+defm SCRATCH_LOAD_UBYTE : FLAT_Real_AllAddr_gfx10<0x008>;
+defm SCRATCH_LOAD_SBYTE : FLAT_Real_AllAddr_gfx10<0x009>;
+defm SCRATCH_LOAD_USHORT : FLAT_Real_AllAddr_gfx10<0x00a>;
+defm SCRATCH_LOAD_SSHORT : FLAT_Real_AllAddr_gfx10<0x00b>;
+defm SCRATCH_LOAD_DWORD : FLAT_Real_AllAddr_gfx10<0x00c>;
+defm SCRATCH_LOAD_DWORDX2 : FLAT_Real_AllAddr_gfx10<0x00d>;
+defm SCRATCH_LOAD_DWORDX4 : FLAT_Real_AllAddr_gfx10<0x00e>;
+defm SCRATCH_LOAD_DWORDX3 : FLAT_Real_AllAddr_gfx10<0x00f>;
+defm SCRATCH_STORE_BYTE : FLAT_Real_AllAddr_gfx10<0x018>;
+defm SCRATCH_STORE_BYTE_D16_HI : FLAT_Real_AllAddr_gfx10<0x019>;
+defm SCRATCH_STORE_SHORT : FLAT_Real_AllAddr_gfx10<0x01a>;
+defm SCRATCH_STORE_SHORT_D16_HI : FLAT_Real_AllAddr_gfx10<0x01b>;
+defm SCRATCH_STORE_DWORD : FLAT_Real_AllAddr_gfx10<0x01c>;
+defm SCRATCH_STORE_DWORDX2 : FLAT_Real_AllAddr_gfx10<0x01d>;
+defm SCRATCH_STORE_DWORDX4 : FLAT_Real_AllAddr_gfx10<0x01e>;
+defm SCRATCH_STORE_DWORDX3 : FLAT_Real_AllAddr_gfx10<0x01f>;
+defm SCRATCH_LOAD_UBYTE_D16 : FLAT_Real_AllAddr_gfx10<0x020>;
+defm SCRATCH_LOAD_UBYTE_D16_HI : FLAT_Real_AllAddr_gfx10<0x021>;
+defm SCRATCH_LOAD_SBYTE_D16 : FLAT_Real_AllAddr_gfx10<0x022>;
+defm SCRATCH_LOAD_SBYTE_D16_HI : FLAT_Real_AllAddr_gfx10<0x023>;
+defm SCRATCH_LOAD_SHORT_D16 : FLAT_Real_AllAddr_gfx10<0x024>;
+defm SCRATCH_LOAD_SHORT_D16_HI : FLAT_Real_AllAddr_gfx10<0x025>;
+
+let SubtargetPredicate = HasAtomicFaddInsts in {
+
+defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Real_AllAddr_vi <0x04d>;
+defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Real_AllAddr_vi <0x04e>;
+
+} // End SubtargetPredicate = HasAtomicFaddInsts
diff --git a/lib/Target/AMDGPU/GCNDPPCombine.cpp b/lib/Target/AMDGPU/GCNDPPCombine.cpp
index 56071d0d2374..e1845e2e8e87 100644
--- a/lib/Target/AMDGPU/GCNDPPCombine.cpp
+++ b/lib/Target/AMDGPU/GCNDPPCombine.cpp
@@ -1,37 +1,40 @@
//=======- GCNDPPCombine.cpp - optimization for DPP instructions ---==========//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// The pass combines V_MOV_B32_dpp instruction with its VALU uses as a DPP src0
-// operand.If any of the use instruction cannot be combined with the mov the
+// operand. If any of the use instructions cannot be combined with the mov, the
// whole sequence is reverted.
//
// $old = ...
// $dpp_value = V_MOV_B32_dpp $old, $vgpr_to_be_read_from_other_lane,
-// dpp_controls..., $bound_ctrl
-// $res = VALU $dpp_value, ...
+// dpp_controls..., $row_mask, $bank_mask, $bound_ctrl
+// $res = VALU $dpp_value [, src1]
//
// to
//
-// $res = VALU_DPP $folded_old, $vgpr_to_be_read_from_other_lane, ...,
-// dpp_controls..., $folded_bound_ctrl
+// $res = VALU_DPP $combined_old, $vgpr_to_be_read_from_other_lane, [src1,]
+// dpp_controls..., $row_mask, $bank_mask, $combined_bound_ctrl
//
// Combining rules :
//
-// $bound_ctrl is DPP_BOUND_ZERO, $old is any
-// $bound_ctrl is DPP_BOUND_OFF, $old is 0
+// if $row_mask and $bank_mask are fully enabled (0xF) and
+// $bound_ctrl==DPP_BOUND_ZERO or $old==0
+// -> $combined_old = undef,
+// $combined_bound_ctrl = DPP_BOUND_ZERO
//
-// ->$folded_old = undef, $folded_bound_ctrl = DPP_BOUND_ZERO
-// $bound_ctrl is DPP_BOUND_OFF, $old is undef
+// if the VALU op is binary and
+// $bound_ctrl==DPP_BOUND_OFF and
+// $old==identity value (immediate) for the VALU op
+// -> $combined_old = src1,
+// $combined_bound_ctrl = DPP_BOUND_OFF
//
-// ->$folded_old = undef, $folded_bound_ctrl = DPP_BOUND_OFF
-// $bound_ctrl is DPP_BOUND_OFF, $old is foldable
+// Otherwise cancel.
//
-// ->$folded_old = folded value, $folded_bound_ctrl = DPP_BOUND_OFF
+// The mov_dpp instruction should reside in the same BB as all its uses
//===----------------------------------------------------------------------===//
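// Illustrative sketch (hypothetical helper, not part of this pass) of the
// combining rules documented above; all names below are made up for the
// example and the real decision is made on MachineInstr operands.
enum class DPPCombineKind { None, OldUndefBoundCtrlZero, OldFromSrc1 };

struct DPPMovState {
  unsigned RowMask;     // 4-bit row_mask on the V_MOV_B32_dpp
  unsigned BankMask;    // 4-bit bank_mask on the V_MOV_B32_dpp
  bool BoundCtrlZero;   // bound_ctrl:0 set on the V_MOV_B32_dpp
  bool OldIsZero;       // $old is the immediate 0
  bool OldIsIdentity;   // $old is the identity value of the (binary) VALU op
};

static DPPCombineKind classifyDPPCombine(const DPPMovState &S) {
  const bool MaskAllLanes = S.RowMask == 0xF && S.BankMask == 0xF;
  if (MaskAllLanes && (S.BoundCtrlZero || S.OldIsZero))
    return DPPCombineKind::OldUndefBoundCtrlZero; // $combined_old = undef, bctrl = 0
  if (!S.BoundCtrlZero && S.OldIsIdentity)
    return DPPCombineKind::OldFromSrc1;           // $combined_old = src1, bctrl = off
  return DPPCombineKind::None;                    // otherwise cancel
}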
#include "AMDGPU.h"
@@ -67,20 +70,16 @@ class GCNDPPCombine : public MachineFunctionPass {
MachineOperand *getOldOpndValue(MachineOperand &OldOpnd) const;
- RegSubRegPair foldOldOpnd(MachineInstr &OrigMI,
- RegSubRegPair OldOpndVGPR,
- MachineOperand &OldOpndValue) const;
-
MachineInstr *createDPPInst(MachineInstr &OrigMI,
MachineInstr &MovMI,
- RegSubRegPair OldOpndVGPR,
+ RegSubRegPair CombOldVGPR,
MachineOperand *OldOpnd,
- bool BoundCtrlZero) const;
+ bool CombBCZ) const;
MachineInstr *createDPPInst(MachineInstr &OrigMI,
MachineInstr &MovMI,
- RegSubRegPair OldOpndVGPR,
- bool BoundCtrlZero) const;
+ RegSubRegPair CombOldVGPR,
+ bool CombBCZ) const;
bool hasNoImmOrEqual(MachineInstr &MI,
unsigned OpndName,
@@ -153,8 +152,8 @@ MachineOperand *GCNDPPCombine::getOldOpndValue(MachineOperand &OldOpnd) const {
MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
MachineInstr &MovMI,
- RegSubRegPair OldOpndVGPR,
- bool BoundCtrlZero) const {
+ RegSubRegPair CombOldVGPR,
+ bool CombBCZ) const {
assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp);
assert(TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst)->getReg() ==
TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)->getReg());
@@ -178,9 +177,15 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
const int OldIdx = AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::old);
if (OldIdx != -1) {
assert(OldIdx == NumOperands);
- assert(isOfRegClass(OldOpndVGPR, AMDGPU::VGPR_32RegClass, *MRI));
- DPPInst.addReg(OldOpndVGPR.Reg, 0, OldOpndVGPR.SubReg);
+ assert(isOfRegClass(CombOldVGPR, AMDGPU::VGPR_32RegClass, *MRI));
+ DPPInst.addReg(CombOldVGPR.Reg, 0, CombOldVGPR.SubReg);
++NumOperands;
+ } else {
+ // TODO: this discards MAC/FMA instructions for now; add support for them later
+ LLVM_DEBUG(dbgs() << " failed: no old operand in DPP instruction,"
+ " TBD\n");
+ Fail = true;
+ break;
}
if (auto *Mod0 = TII->getNamedOperand(OrigMI,
@@ -199,6 +204,7 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
break;
}
DPPInst.add(*Src0);
+ DPPInst->getOperand(NumOperands).setIsKill(false);
++NumOperands;
if (auto *Mod1 = TII->getNamedOperand(OrigMI,
@@ -231,7 +237,7 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl));
DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask));
DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask));
- DPPInst.addImm(BoundCtrlZero ? 1 : 0);
+ DPPInst.addImm(CombBCZ ? 1 : 0);
} while (false);
if (Fail) {
@@ -242,64 +248,81 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
return DPPInst.getInstr();
}
-GCNDPPCombine::RegSubRegPair
-GCNDPPCombine::foldOldOpnd(MachineInstr &OrigMI,
- RegSubRegPair OldOpndVGPR,
- MachineOperand &OldOpndValue) const {
- assert(OldOpndValue.isImm());
- switch (OrigMI.getOpcode()) {
+static bool isIdentityValue(unsigned OrigMIOp, MachineOperand *OldOpnd) {
+ assert(OldOpnd->isImm());
+ switch (OrigMIOp) {
default: break;
+ case AMDGPU::V_ADD_U32_e32:
+ case AMDGPU::V_ADD_U32_e64:
+ case AMDGPU::V_ADD_I32_e32:
+ case AMDGPU::V_ADD_I32_e64:
+ case AMDGPU::V_OR_B32_e32:
+ case AMDGPU::V_OR_B32_e64:
+ case AMDGPU::V_SUBREV_U32_e32:
+ case AMDGPU::V_SUBREV_U32_e64:
+ case AMDGPU::V_SUBREV_I32_e32:
+ case AMDGPU::V_SUBREV_I32_e64:
case AMDGPU::V_MAX_U32_e32:
- if (OldOpndValue.getImm() == std::numeric_limits<uint32_t>::max())
- return OldOpndVGPR;
+ case AMDGPU::V_MAX_U32_e64:
+ case AMDGPU::V_XOR_B32_e32:
+ case AMDGPU::V_XOR_B32_e64:
+ if (OldOpnd->getImm() == 0)
+ return true;
break;
- case AMDGPU::V_MAX_I32_e32:
- if (OldOpndValue.getImm() == std::numeric_limits<int32_t>::max())
- return OldOpndVGPR;
+ case AMDGPU::V_AND_B32_e32:
+ case AMDGPU::V_AND_B32_e64:
+ case AMDGPU::V_MIN_U32_e32:
+ case AMDGPU::V_MIN_U32_e64:
+ if (static_cast<uint32_t>(OldOpnd->getImm()) ==
+ std::numeric_limits<uint32_t>::max())
+ return true;
break;
case AMDGPU::V_MIN_I32_e32:
- if (OldOpndValue.getImm() == std::numeric_limits<int32_t>::min())
- return OldOpndVGPR;
+ case AMDGPU::V_MIN_I32_e64:
+ if (static_cast<int32_t>(OldOpnd->getImm()) ==
+ std::numeric_limits<int32_t>::max())
+ return true;
+ break;
+ case AMDGPU::V_MAX_I32_e32:
+ case AMDGPU::V_MAX_I32_e64:
+ if (static_cast<int32_t>(OldOpnd->getImm()) ==
+ std::numeric_limits<int32_t>::min())
+ return true;
break;
-
case AMDGPU::V_MUL_I32_I24_e32:
+ case AMDGPU::V_MUL_I32_I24_e64:
case AMDGPU::V_MUL_U32_U24_e32:
- if (OldOpndValue.getImm() == 1) {
- auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
- assert(Src1 && Src1->isReg());
- return getRegSubRegPair(*Src1);
- }
+ case AMDGPU::V_MUL_U32_U24_e64:
+ if (OldOpnd->getImm() == 1)
+ return true;
break;
}
- return RegSubRegPair();
+ return false;
}
-// Cases to combine:
-// $bound_ctrl is DPP_BOUND_ZERO, $old is any
-// $bound_ctrl is DPP_BOUND_OFF, $old is 0
-// -> $old = undef, $bound_ctrl = DPP_BOUND_ZERO
-
-// $bound_ctrl is DPP_BOUND_OFF, $old is undef
-// -> $old = undef, $bound_ctrl = DPP_BOUND_OFF
-
-// $bound_ctrl is DPP_BOUND_OFF, $old is foldable
-// -> $old = folded value, $bound_ctrl = DPP_BOUND_OFF
-
MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
MachineInstr &MovMI,
- RegSubRegPair OldOpndVGPR,
+ RegSubRegPair CombOldVGPR,
MachineOperand *OldOpndValue,
- bool BoundCtrlZero) const {
- assert(OldOpndVGPR.Reg);
- if (!BoundCtrlZero && OldOpndValue) {
- assert(OldOpndValue->isImm());
- OldOpndVGPR = foldOldOpnd(OrigMI, OldOpndVGPR, *OldOpndValue);
- if (!OldOpndVGPR.Reg) {
- LLVM_DEBUG(dbgs() << " failed: old immediate cannot be folded\n");
+ bool CombBCZ) const {
+ assert(CombOldVGPR.Reg);
+ if (!CombBCZ && OldOpndValue && OldOpndValue->isImm()) {
+ auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
+ if (!Src1 || !Src1->isReg()) {
+ LLVM_DEBUG(dbgs() << " failed: no src1 or it isn't a register\n");
+ return nullptr;
+ }
+ if (!isIdentityValue(OrigMI.getOpcode(), OldOpndValue)) {
+ LLVM_DEBUG(dbgs() << " failed: old immediate isn't an identity\n");
+ return nullptr;
+ }
+ CombOldVGPR = getRegSubRegPair(*Src1);
+ if (!isOfRegClass(CombOldVGPR, AMDGPU::VGPR_32RegClass, *MRI)) {
+ LLVM_DEBUG(dbgs() << " failed: src1 isn't a VGPR32 register\n");
return nullptr;
}
}
- return createDPPInst(OrigMI, MovMI, OldOpndVGPR, BoundCtrlZero);
+ return createDPPInst(OrigMI, MovMI, CombOldVGPR, CombBCZ);
}
// returns true if MI doesn't have OpndName immediate operand or the
@@ -316,31 +339,64 @@ bool GCNDPPCombine::hasNoImmOrEqual(MachineInstr &MI, unsigned OpndName,
bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp);
+ LLVM_DEBUG(dbgs() << "\nDPP combine: " << MovMI);
+
+ auto *DstOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst);
+ assert(DstOpnd && DstOpnd->isReg());
+ auto DPPMovReg = DstOpnd->getReg();
+ if (execMayBeModifiedBeforeAnyUse(*MRI, DPPMovReg, MovMI)) {
+ LLVM_DEBUG(dbgs() << " failed: EXEC mask should remain the same"
+ " for all uses\n");
+ return false;
+ }
+
+ auto *RowMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask);
+ assert(RowMaskOpnd && RowMaskOpnd->isImm());
+ auto *BankMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask);
+ assert(BankMaskOpnd && BankMaskOpnd->isImm());
+ const bool MaskAllLanes = RowMaskOpnd->getImm() == 0xF &&
+ BankMaskOpnd->getImm() == 0xF;
+
auto *BCZOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bound_ctrl);
assert(BCZOpnd && BCZOpnd->isImm());
- bool BoundCtrlZero = 0 != BCZOpnd->getImm();
-
- LLVM_DEBUG(dbgs() << "\nDPP combine: " << MovMI);
+ bool BoundCtrlZero = BCZOpnd->getImm();
auto *OldOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::old);
assert(OldOpnd && OldOpnd->isReg());
- auto OldOpndVGPR = getRegSubRegPair(*OldOpnd);
- auto *OldOpndValue = getOldOpndValue(*OldOpnd);
+
+ auto * const OldOpndValue = getOldOpndValue(*OldOpnd);
+ // OldOpndValue is either undef (IMPLICIT_DEF), an immediate, or something else.
+ // We could use: assert(!OldOpndValue || OldOpndValue->isImm())
+ // but the third option is kept to distinguish undef from a non-immediate
+ // so that the IMPLICIT_DEF instruction can be reused later.
assert(!OldOpndValue || OldOpndValue->isImm() || OldOpndValue == OldOpnd);
- if (OldOpndValue) {
- if (BoundCtrlZero) {
- OldOpndVGPR.Reg = AMDGPU::NoRegister; // should be undef, ignore old opnd
- OldOpndValue = nullptr;
- } else {
- if (!OldOpndValue->isImm()) {
- LLVM_DEBUG(dbgs() << " failed: old operand isn't an imm or undef\n");
- return false;
- }
- if (OldOpndValue->getImm() == 0) {
- OldOpndVGPR.Reg = AMDGPU::NoRegister; // should be undef
- OldOpndValue = nullptr;
- BoundCtrlZero = true;
+
+ bool CombBCZ = false;
+
+ if (MaskAllLanes && BoundCtrlZero) { // [1]
+ CombBCZ = true;
+ } else {
+ if (!OldOpndValue || !OldOpndValue->isImm()) {
+ LLVM_DEBUG(dbgs() << " failed: the DPP mov isn't combinable\n");
+ return false;
+ }
+
+ if (OldOpndValue->getParent()->getParent() != MovMI.getParent()) {
+ LLVM_DEBUG(dbgs() <<
+ " failed: old reg def and mov should be in the same BB\n");
+ return false;
+ }
+
+ if (OldOpndValue->getImm() == 0) {
+ if (MaskAllLanes) {
+ assert(!BoundCtrlZero); // by check [1]
+ CombBCZ = true;
}
+ } else if (BoundCtrlZero) {
+ assert(!MaskAllLanes); // by check [1]
+ LLVM_DEBUG(dbgs() <<
+ " failed: old!=0 and bctrl:0 and not all lanes isn't combinable\n");
+ return false;
}
}
@@ -348,25 +404,28 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
if (!OldOpndValue)
dbgs() << "undef";
else
- dbgs() << OldOpndValue->getImm();
- dbgs() << ", bound_ctrl=" << BoundCtrlZero << '\n');
-
- std::vector<MachineInstr*> OrigMIs, DPPMIs;
- if (!OldOpndVGPR.Reg) { // OldOpndVGPR = undef
- OldOpndVGPR = RegSubRegPair(
+ dbgs() << *OldOpndValue;
+ dbgs() << ", bound_ctrl=" << CombBCZ << '\n');
+
+ SmallVector<MachineInstr*, 4> OrigMIs, DPPMIs;
+ auto CombOldVGPR = getRegSubRegPair(*OldOpnd);
+ // try to reuse the previous old reg if it's undefined (IMPLICIT_DEF)
+ if (CombBCZ && OldOpndValue) { // CombOldVGPR should be undef
+ CombOldVGPR = RegSubRegPair(
MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass));
auto UndefInst = BuildMI(*MovMI.getParent(), MovMI, MovMI.getDebugLoc(),
- TII->get(AMDGPU::IMPLICIT_DEF), OldOpndVGPR.Reg);
+ TII->get(AMDGPU::IMPLICIT_DEF), CombOldVGPR.Reg);
DPPMIs.push_back(UndefInst.getInstr());
}
OrigMIs.push_back(&MovMI);
bool Rollback = true;
- for (auto &Use : MRI->use_nodbg_operands(
- TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst)->getReg())) {
+ for (auto &Use : MRI->use_nodbg_operands(DPPMovReg)) {
Rollback = true;
auto &OrigMI = *Use.getParent();
+ LLVM_DEBUG(dbgs() << " try: " << OrigMI);
+
auto OrigOp = OrigMI.getOpcode();
if (TII->isVOP3(OrigOp)) {
if (!TII->hasVALU32BitEncoding(OrigOp)) {
@@ -389,8 +448,8 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
LLVM_DEBUG(dbgs() << " combining: " << OrigMI);
if (&Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)) {
- if (auto *DPPInst = createDPPInst(OrigMI, MovMI, OldOpndVGPR,
- OldOpndValue, BoundCtrlZero)) {
+ if (auto *DPPInst = createDPPInst(OrigMI, MovMI, CombOldVGPR,
+ OldOpndValue, CombBCZ)) {
DPPMIs.push_back(DPPInst);
Rollback = false;
}
@@ -401,8 +460,8 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
BB->insert(OrigMI, NewMI);
if (TII->commuteInstruction(*NewMI)) {
LLVM_DEBUG(dbgs() << " commuted: " << *NewMI);
- if (auto *DPPInst = createDPPInst(*NewMI, MovMI, OldOpndVGPR,
- OldOpndValue, BoundCtrlZero)) {
+ if (auto *DPPInst = createDPPInst(*NewMI, MovMI, CombOldVGPR,
+ OldOpndValue, CombBCZ)) {
DPPMIs.push_back(DPPInst);
Rollback = false;
}
diff --git a/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index c6396de89c4f..885239e2faed 100644
--- a/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -1,9 +1,8 @@
//===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -21,6 +20,7 @@
#include "llvm/ADT/iterator_range.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/MC/MCInstrDesc.h"
@@ -38,6 +38,7 @@ using namespace llvm;
//===----------------------------------------------------------------------===//
GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
+ IsHazardRecognizerMode(false),
CurrCycleInstr(nullptr),
MF(MF),
ST(MF.getSubtarget<GCNSubtarget>()),
@@ -45,7 +46,8 @@ GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
TRI(TII.getRegisterInfo()),
ClauseUses(TRI.getNumRegUnits()),
ClauseDefs(TRI.getNumRegUnits()) {
- MaxLookAhead = 5;
+ MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 18 : 5;
+ TSchedModel.init(&ST);
}
void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
@@ -88,18 +90,38 @@ static bool isSMovRel(unsigned Opcode) {
}
}
-static bool isSendMsgTraceDataOrGDS(const MachineInstr &MI) {
+static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
+ const MachineInstr &MI) {
+ if (TII.isAlwaysGDS(MI.getOpcode()))
+ return true;
+
switch (MI.getOpcode()) {
case AMDGPU::S_SENDMSG:
case AMDGPU::S_SENDMSGHALT:
case AMDGPU::S_TTRACEDATA:
return true;
+ // These DS opcodes don't support GDS.
+ case AMDGPU::DS_NOP:
+ case AMDGPU::DS_PERMUTE_B32:
+ case AMDGPU::DS_BPERMUTE_B32:
+ return false;
default:
- // TODO: GDS
+ if (TII.isDS(MI.getOpcode())) {
+ int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+ AMDGPU::OpName::gds);
+ if (MI.getOperand(GDS).getImm())
+ return true;
+ }
return false;
}
}
+static bool isPermlane(const MachineInstr &MI) {
+ unsigned Opcode = MI.getOpcode();
+ return Opcode == AMDGPU::V_PERMLANE16_B32 ||
+ Opcode == AMDGPU::V_PERMLANEX16_B32;
+}
+
static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
AMDGPU::OpName::simm16);
@@ -109,6 +131,8 @@ static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
ScheduleHazardRecognizer::HazardType
GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
MachineInstr *MI = SU->getInstr();
+ if (MI->isBundle())
+ return NoHazard;
if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
return NoopHazard;
@@ -119,6 +143,15 @@ GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
&& checkVMEMHazards(MI) > 0)
return NoopHazard;
+ if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
+ return NoopHazard;
+
+ if (checkFPAtomicToDenormModeHazard(MI) > 0)
+ return NoopHazard;
+
+ if (ST.hasNoDataDepHazard())
+ return NoHazard;
+
if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
return NoopHazard;
@@ -145,10 +178,16 @@ GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
checkReadM0Hazards(MI) > 0)
return NoopHazard;
- if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(*MI) &&
+ if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI) &&
checkReadM0Hazards(MI) > 0)
return NoopHazard;
+ if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
+ return NoopHazard;
+
+ if ((MI->mayLoad() || MI->mayStore()) && checkMAILdStHazards(MI) > 0)
+ return NoopHazard;
+
if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
return NoopHazard;
@@ -158,22 +197,74 @@ GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
return NoHazard;
}
+static void insertNoopInBundle(MachineInstr *MI, const SIInstrInfo &TII) {
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
+ .addImm(0);
+}
+
+void GCNHazardRecognizer::processBundle() {
+ MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
+ MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
+ // Check bundled MachineInstrs for hazards.
+ for (; MI != E && MI->isInsideBundle(); ++MI) {
+ CurrCycleInstr = &*MI;
+ unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);
+
+ if (IsHazardRecognizerMode)
+ fixHazards(CurrCycleInstr);
+
+ for (unsigned i = 0; i < WaitStates; ++i)
+ insertNoopInBundle(CurrCycleInstr, TII);
+
+ // It's unnecessary to track more than MaxLookAhead instructions. Since we
+ // include the bundled MI directly after, only add a maximum of
+ // (MaxLookAhead - 1) noops to EmittedInstrs.
+ for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
+ EmittedInstrs.push_front(nullptr);
+
+ EmittedInstrs.push_front(CurrCycleInstr);
+ EmittedInstrs.resize(MaxLookAhead);
+ }
+ CurrCycleInstr = nullptr;
+}
+
unsigned GCNHazardRecognizer::PreEmitNoops(SUnit *SU) {
- return PreEmitNoops(SU->getInstr());
+ IsHazardRecognizerMode = false;
+ return PreEmitNoopsCommon(SU->getInstr());
}
unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
+ IsHazardRecognizerMode = true;
+ CurrCycleInstr = MI;
+ unsigned W = PreEmitNoopsCommon(MI);
+ fixHazards(MI);
+ CurrCycleInstr = nullptr;
+ return W;
+}
+
+unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
+ if (MI->isBundle())
+ return 0;
+
int WaitStates = std::max(0, checkAnyInstHazards(MI));
if (SIInstrInfo::isSMRD(*MI))
return std::max(WaitStates, checkSMRDHazards(MI));
- if (SIInstrInfo::isVALU(*MI))
- WaitStates = std::max(WaitStates, checkVALUHazards(MI));
-
if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
+ if (ST.hasNSAtoVMEMBug())
+ WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));
+
+ WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));
+
+ if (ST.hasNoDataDepHazard())
+ return WaitStates;
+
+ if (SIInstrInfo::isVALU(*MI))
+ WaitStates = std::max(WaitStates, checkVALUHazards(MI));
+
if (SIInstrInfo::isDPP(*MI))
WaitStates = std::max(WaitStates, checkDPPHazards(MI));
@@ -199,9 +290,15 @@ unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
isSMovRel(MI->getOpcode())))
return std::max(WaitStates, checkReadM0Hazards(MI));
- if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(*MI))
+ if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI))
return std::max(WaitStates, checkReadM0Hazards(MI));
+ if (SIInstrInfo::isMAI(*MI))
+ return std::max(WaitStates, checkMAIHazards(MI));
+
+ if (MI->mayLoad() || MI->mayStore())
+ return std::max(WaitStates, checkMAILdStHazards(MI));
+
return WaitStates;
}
@@ -218,10 +315,14 @@ void GCNHazardRecognizer::AdvanceCycle() {
// Do not track non-instructions which do not affect the wait states.
// If included, these instructions can lead to buffer overflow such that
// detectable hazards are missed.
- if (CurrCycleInstr->getOpcode() == AMDGPU::IMPLICIT_DEF)
+ if (CurrCycleInstr->isImplicitDef() || CurrCycleInstr->isDebugInstr() ||
+ CurrCycleInstr->isKill())
return;
- else if (CurrCycleInstr->isDebugInstr())
+
+ if (CurrCycleInstr->isBundle()) {
+ processBundle();
return;
+ }
unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
@@ -252,41 +353,112 @@ void GCNHazardRecognizer::RecedeCycle() {
// Helper Functions
//===----------------------------------------------------------------------===//
-int GCNHazardRecognizer::getWaitStatesSince(
- function_ref<bool(MachineInstr *)> IsHazard) {
+typedef function_ref<bool(MachineInstr *, int WaitStates)> IsExpiredFn;
+
+// Returns the minimum number of wait states since \p I, walking all predecessors.
+// Only scans as long as \p IsExpired does not return true.
+// Can only be run in hazard recognizer mode.
+static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
+ MachineBasicBlock *MBB,
+ MachineBasicBlock::reverse_instr_iterator I,
+ int WaitStates,
+ IsExpiredFn IsExpired,
+ DenseSet<const MachineBasicBlock *> &Visited) {
+ for (auto E = MBB->instr_rend(); I != E; ++I) {
+ // Don't add WaitStates for parent BUNDLE instructions.
+ if (I->isBundle())
+ continue;
+
+ if (IsHazard(&*I))
+ return WaitStates;
+
+ if (I->isInlineAsm() || I->isImplicitDef() || I->isDebugInstr())
+ continue;
+
+ WaitStates += SIInstrInfo::getNumWaitStates(*I);
+
+ if (IsExpired(&*I, WaitStates))
+ return std::numeric_limits<int>::max();
+ }
+
+ int MinWaitStates = WaitStates;
+ bool Found = false;
+ for (MachineBasicBlock *Pred : MBB->predecessors()) {
+ if (!Visited.insert(Pred).second)
+ continue;
+
+ int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(),
+ WaitStates, IsExpired, Visited);
+
+ if (W == std::numeric_limits<int>::max())
+ continue;
+
+ MinWaitStates = Found ? std::min(MinWaitStates, W) : W;
+ if (IsExpired(nullptr, MinWaitStates))
+ return MinWaitStates;
+
+ Found = true;
+ }
+
+ if (Found)
+ return MinWaitStates;
+
+ return std::numeric_limits<int>::max();
+}
+
+static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
+ MachineInstr *MI,
+ IsExpiredFn IsExpired) {
+ DenseSet<const MachineBasicBlock *> Visited;
+ return getWaitStatesSince(IsHazard, MI->getParent(),
+ std::next(MI->getReverseIterator()),
+ 0, IsExpired, Visited);
+}
+
+int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
+ if (IsHazardRecognizerMode) {
+ auto IsExpiredFn = [Limit] (MachineInstr *, int WaitStates) {
+ return WaitStates >= Limit;
+ };
+ return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
+ }
+
int WaitStates = 0;
for (MachineInstr *MI : EmittedInstrs) {
if (MI) {
if (IsHazard(MI))
return WaitStates;
- unsigned Opcode = MI->getOpcode();
- if (Opcode == AMDGPU::INLINEASM)
+ if (MI->isInlineAsm())
continue;
}
++WaitStates;
+
+ if (WaitStates >= Limit)
+ break;
}
return std::numeric_limits<int>::max();
}
-int GCNHazardRecognizer::getWaitStatesSinceDef(
- unsigned Reg, function_ref<bool(MachineInstr *)> IsHazardDef) {
+int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
+ IsHazardFn IsHazardDef,
+ int Limit) {
const SIRegisterInfo *TRI = ST.getRegisterInfo();
auto IsHazardFn = [IsHazardDef, TRI, Reg] (MachineInstr *MI) {
return IsHazardDef(MI) && MI->modifiesRegister(Reg, TRI);
};
- return getWaitStatesSince(IsHazardFn);
+ return getWaitStatesSince(IsHazardFn, Limit);
}
-int GCNHazardRecognizer::getWaitStatesSinceSetReg(
- function_ref<bool(MachineInstr *)> IsHazard) {
+int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
+ int Limit) {
auto IsHazardFn = [IsHazard] (MachineInstr *MI) {
return isSSetReg(MI->getOpcode()) && IsHazard(MI);
};
- return getWaitStatesSince(IsHazardFn);
+ return getWaitStatesSince(IsHazardFn, Limit);
}
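The helpers above are used by the hazard checks below in one recurring pattern: the per-hazard wait-state constant is passed as the scan limit, and the caller computes the constant minus the returned value, which the surrounding std::max calls clamp to zero. A minimal standalone sketch of that pattern, with invented names (FakeInstr, waitStatesSince, neededNops) and no LLVM dependencies, purely for illustration:

#include <algorithm>
#include <functional>
#include <limits>
#include <vector>

struct FakeInstr { bool Hazard; int WaitStates; };   // stand-in for MachineInstr

// Walk the already-emitted instructions (newest first) until the hazard is
// found or Limit wait states have accumulated, mirroring the bounded scan.
static int waitStatesSince(const std::vector<FakeInstr> &History,
                           const std::function<bool(const FakeInstr &)> &IsHazard,
                           int Limit) {
  int WaitStates = 0;
  for (const FakeInstr &I : History) {
    if (IsHazard(I))
      return WaitStates;
    WaitStates += I.WaitStates;
    if (WaitStates >= Limit)   // anything older can no longer require NOPs
      break;
  }
  return std::numeric_limits<int>::max();   // no hazard within range
}

// Callers subtract the result from the hazard's wait-state constant and clamp.
int neededNops(const std::vector<FakeInstr> &History, int RequiredWaitStates) {
  auto IsHazard = [](const FakeInstr &I) { return I.Hazard; };
  return std::max(0, RequiredWaitStates -
                         waitStatesSince(History, IsHazard, RequiredWaitStates));
}

Passing the constant as the limit is what makes the new predecessor-walking search affordable: once enough wait states have elapsed, no older instruction can still require NOPs.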
//===----------------------------------------------------------------------===//
@@ -328,9 +500,9 @@ int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
// instructions in this group may return out of order and/or may be
// replayed (i.e. the same instruction issued more than once).
//
- // In order to handle these situations correctly we need to make sure
- // that when a clause has more than one instruction, no instruction in the
- // clause writes to a register that is read another instruction in the clause
+ // In order to handle these situations correctly we need to make sure that
+ // when a clause has more than one instruction, no instruction in the clause
+ // writes to a register that is read by another instruction in the clause
  // (including itself). If we encounter this situation, we need to break the
  // clause by inserting a non-SMEM instruction.
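The rule described in the comment above can be stated compactly: across a soft clause, the registers written must stay disjoint from the registers read. The following is a standalone sketch of that check using plain std::bitset rather than the recognizer's register-unit BitVector bookkeeping (ClauseUses); ClauseState, wouldBreakClause and the NumRegUnits size are invented for illustration.

#include <bitset>
#include <cstddef>

constexpr std::size_t NumRegUnits = 1024;   // placeholder register-unit count

struct ClauseState {
  std::bitset<NumRegUnits> Uses, Defs;      // accumulated over the clause

  // True if adding an instruction with these uses/defs would make some clause
  // member write a register that another member (or itself) reads, i.e. the
  // clause must first be broken by a non-clause instruction.
  bool wouldBreakClause(const std::bitset<NumRegUnits> &NewUses,
                        const std::bitset<NumRegUnits> &NewDefs) const {
    return (NewDefs & (Uses | NewUses)).any() || (NewUses & Defs).any();
  }

  void add(const std::bitset<NumRegUnits> &NewUses,
           const std::bitset<NumRegUnits> &NewDefs) {
    Uses |= NewUses;
    Defs |= NewDefs;
  }
};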
@@ -363,13 +535,12 @@ int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
}
int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
int WaitStatesNeeded = 0;
WaitStatesNeeded = checkSoftClauseHazards(SMRD);
// This SMRD hazard only affects SI.
- if (ST.getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS)
+ if (!ST.hasSMRDReadVALUDefHazard())
return WaitStatesNeeded;
// A read of an SGPR by SMRD instruction requires 4 wait states when the
@@ -384,7 +555,8 @@ int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
if (!Use.isReg())
continue;
int WaitStatesNeededForUse =
- SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn);
+ SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
+ SmrdSgprWaitStates);
WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
// This fixes what appears to be undocumented hardware behavior in SI where
@@ -397,7 +569,8 @@ int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
if (IsBufferSMRD) {
int WaitStatesNeededForUse =
SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
- IsBufferHazardDefFn);
+ IsBufferHazardDefFn,
+ SmrdSgprWaitStates);
WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
}
}
@@ -406,7 +579,7 @@ int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
}
int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
- if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
+ if (!ST.hasVMEMReadSGPRVALUDefHazard())
return 0;
int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
@@ -415,13 +588,13 @@ int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
// SGPR was written by a VALU Instruction.
const int VmemSgprWaitStates = 5;
auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); };
-
for (const MachineOperand &Use : VMEM->uses()) {
if (!Use.isReg() || TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
continue;
int WaitStatesNeededForUse =
- VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn);
+ VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
+ VmemSgprWaitStates);
WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
}
return WaitStatesNeeded;
@@ -441,13 +614,16 @@ int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
continue;
int WaitStatesNeededForUse =
- DppVgprWaitStates - getWaitStatesSinceDef(Use.getReg());
+ DppVgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
+ [](MachineInstr *) { return true; },
+ DppVgprWaitStates);
WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
}
WaitStatesNeeded = std::max(
WaitStatesNeeded,
- DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn));
+ DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
+ DppExecWaitStates));
return WaitStatesNeeded;
}
@@ -459,7 +635,8 @@ int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
// instruction.
const int DivFMasWaitStates = 4;
auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };
- int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn);
+ int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
+ DivFMasWaitStates);
return DivFMasWaitStates - WaitStatesNeeded;
}
@@ -472,7 +649,7 @@ int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
auto IsHazardFn = [TII, GetRegHWReg] (MachineInstr *MI) {
return GetRegHWReg == getHWReg(TII, *MI);
};
- int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn);
+ int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);
return GetRegWaitStates - WaitStatesNeeded;
}
@@ -481,12 +658,11 @@ int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
const SIInstrInfo *TII = ST.getInstrInfo();
unsigned HWReg = getHWReg(TII, *SetRegInstr);
- const int SetRegWaitStates =
- ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ? 1 : 2;
+ const int SetRegWaitStates = ST.getSetRegWaitStates();
auto IsHazardFn = [TII, HWReg] (MachineInstr *MI) {
return HWReg == getHWReg(TII, *MI);
};
- int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn);
+ int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
return SetRegWaitStates - WaitStatesNeeded;
}
@@ -557,7 +733,7 @@ int GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
TRI->regsOverlap(MI->getOperand(DataIdx).getReg(), Reg);
};
int WaitStatesNeededForDef =
- VALUWaitStates - getWaitStatesSince(IsHazardFn);
+ VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
return WaitStatesNeeded;
@@ -622,12 +798,13 @@ int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
};
const int RWLaneWaitStates = 4;
- int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn);
+ int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
+ RWLaneWaitStates);
return RWLaneWaitStates - WaitStatesSince;
}
int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
- if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
+ if (!ST.hasRFEHazards())
return 0;
const SIInstrInfo *TII = ST.getInstrInfo();
@@ -637,7 +814,7 @@ int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
auto IsHazardFn = [TII] (MachineInstr *MI) {
return getHWReg(TII, *MI) == AMDGPU::Hwreg::ID_TRAPSTS;
};
- int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn);
+ int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
return RFEWaitStates - WaitStatesNeeded;
}
@@ -661,7 +838,8 @@ int GCNHazardRecognizer::checkAnyInstHazards(MachineInstr *MI) {
return MI->getOpcode() == AMDGPU::S_MOV_FED_B32;
};
int WaitStatesNeededForUse =
- MovFedWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardFn);
+ MovFedWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardFn,
+ MovFedWaitStates);
WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
}
@@ -674,5 +852,557 @@ int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
auto IsHazardFn = [TII] (MachineInstr *MI) {
return TII->isSALU(*MI);
};
- return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn);
+ return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn,
+ SMovRelWaitStates);
+}
+
+void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
+ fixVMEMtoScalarWriteHazards(MI);
+ fixVcmpxPermlaneHazards(MI);
+ fixSMEMtoVectorWriteHazards(MI);
+ fixVcmpxExecWARHazard(MI);
+ fixLdsBranchVmemWARHazard(MI);
+}
+
+bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
+ if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
+ return false;
+
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ auto IsHazardFn = [TII] (MachineInstr *MI) {
+ return TII->isVOPC(*MI);
+ };
+
+ auto IsExpiredFn = [] (MachineInstr *MI, int) {
+ if (!MI)
+ return false;
+ unsigned Opc = MI->getOpcode();
+ return SIInstrInfo::isVALU(*MI) &&
+ Opc != AMDGPU::V_NOP_e32 &&
+ Opc != AMDGPU::V_NOP_e64 &&
+ Opc != AMDGPU::V_NOP_sdwa;
+ };
+
+ if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
+ std::numeric_limits<int>::max())
+ return false;
+
+ // V_NOP will be discarded by SQ.
+  // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
+ // which is always a VGPR and available.
+ auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
+ unsigned Reg = Src0->getReg();
+ bool IsUndef = Src0->isUndef();
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+ TII->get(AMDGPU::V_MOV_B32_e32))
+ .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
+ .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);
+
+ return true;
+}
+
+bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
+ if (!ST.hasVMEMtoScalarWriteHazard())
+ return false;
+
+ if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
+ return false;
+
+ if (MI->getNumDefs() == 0)
+ return false;
+
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+
+ auto IsHazardFn = [TRI, MI] (MachineInstr *I) {
+ if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isDS(*I) &&
+ !SIInstrInfo::isFLAT(*I))
+ return false;
+
+ for (const MachineOperand &Def : MI->defs()) {
+ MachineOperand *Op = I->findRegisterUseOperand(Def.getReg(), false, TRI);
+ if (!Op)
+ continue;
+ return true;
+ }
+ return false;
+ };
+
+ auto IsExpiredFn = [] (MachineInstr *MI, int) {
+ return MI && (SIInstrInfo::isVALU(*MI) ||
+ (MI->getOpcode() == AMDGPU::S_WAITCNT &&
+ !MI->getOperand(0).getImm()));
+ };
+
+ if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
+ std::numeric_limits<int>::max())
+ return false;
+
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
+ return true;
+}
+
+bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
+ if (!ST.hasSMEMtoVectorWriteHazard())
+ return false;
+
+ if (!SIInstrInfo::isVALU(*MI))
+ return false;
+
+ unsigned SDSTName;
+ switch (MI->getOpcode()) {
+ case AMDGPU::V_READLANE_B32:
+ case AMDGPU::V_READFIRSTLANE_B32:
+ SDSTName = AMDGPU::OpName::vdst;
+ break;
+ default:
+ SDSTName = AMDGPU::OpName::sdst;
+ break;
+ }
+
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
+ const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
+ if (!SDST) {
+ for (const auto &MO : MI->implicit_operands()) {
+ if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg()))) {
+ SDST = &MO;
+ break;
+ }
+ }
+ }
+
+ if (!SDST)
+ return false;
+
+ const unsigned SDSTReg = SDST->getReg();
+ auto IsHazardFn = [SDSTReg, TRI] (MachineInstr *I) {
+ return SIInstrInfo::isSMRD(*I) && I->readsRegister(SDSTReg, TRI);
+ };
+
+ auto IsExpiredFn = [TII, IV] (MachineInstr *MI, int) {
+ if (MI) {
+ if (TII->isSALU(*MI)) {
+ switch (MI->getOpcode()) {
+ case AMDGPU::S_SETVSKIP:
+ case AMDGPU::S_VERSION:
+ case AMDGPU::S_WAITCNT_VSCNT:
+ case AMDGPU::S_WAITCNT_VMCNT:
+ case AMDGPU::S_WAITCNT_EXPCNT:
+          // These instructions cannot mitigate the hazard.
+ return false;
+ case AMDGPU::S_WAITCNT_LGKMCNT:
+ // Reducing lgkmcnt count to 0 always mitigates the hazard.
+ return (MI->getOperand(1).getImm() == 0) &&
+ (MI->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
+ case AMDGPU::S_WAITCNT: {
+ const int64_t Imm = MI->getOperand(0).getImm();
+ AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
+ return (Decoded.LgkmCnt == 0);
+ }
+ default:
+ // SOPP instructions cannot mitigate the hazard.
+ if (TII->isSOPP(*MI))
+ return false;
+ // At this point the SALU can be assumed to mitigate the hazard
+ // because either:
+ // (a) it is independent of the at risk SMEM (breaking chain),
+ // or
+ // (b) it is dependent on the SMEM, in which case an appropriate
+ // s_waitcnt lgkmcnt _must_ exist between it and the at risk
+ // SMEM instruction.
+ return true;
+ }
+ }
+ }
+ return false;
+ };
+
+ if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
+ std::numeric_limits<int>::max())
+ return false;
+
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+ TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
+ .addImm(0);
+ return true;
+}
+
+bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
+ if (!ST.hasVcmpxExecWARHazard() || !SIInstrInfo::isVALU(*MI))
+ return false;
+
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
+ return false;
+
+ auto IsHazardFn = [TRI] (MachineInstr *I) {
+ if (SIInstrInfo::isVALU(*I))
+ return false;
+ return I->readsRegister(AMDGPU::EXEC, TRI);
+ };
+
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ auto IsExpiredFn = [TII, TRI] (MachineInstr *MI, int) {
+ if (!MI)
+ return false;
+ if (SIInstrInfo::isVALU(*MI)) {
+ if (TII->getNamedOperand(*MI, AMDGPU::OpName::sdst))
+ return true;
+ for (auto MO : MI->implicit_operands())
+ if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg())))
+ return true;
+ }
+ if (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
+ (MI->getOperand(0).getImm() & 0xfffe) == 0xfffe)
+ return true;
+ return false;
+ };
+
+ if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
+ std::numeric_limits<int>::max())
+ return false;
+
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+ TII->get(AMDGPU::S_WAITCNT_DEPCTR))
+ .addImm(0xfffe);
+ return true;
+}
+
+bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
+ if (!ST.hasLdsBranchVmemWARHazard())
+ return false;
+
+ auto IsHazardInst = [] (const MachineInstr *MI) {
+ if (SIInstrInfo::isDS(*MI))
+ return 1;
+ if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isSegmentSpecificFLAT(*MI))
+ return 2;
+ return 0;
+ };
+
+ auto InstType = IsHazardInst(MI);
+ if (!InstType)
+ return false;
+
+ auto IsExpiredFn = [&IsHazardInst] (MachineInstr *I, int) {
+ return I && (IsHazardInst(I) ||
+ (I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
+ I->getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
+ !I->getOperand(1).getImm()));
+ };
+
+ auto IsHazardFn = [InstType, &IsHazardInst] (MachineInstr *I) {
+ if (!I->isBranch())
+ return false;
+
+ auto IsHazardFn = [InstType, IsHazardInst] (MachineInstr *I) {
+ auto InstType2 = IsHazardInst(I);
+ return InstType2 && InstType != InstType2;
+ };
+
+ auto IsExpiredFn = [InstType, &IsHazardInst] (MachineInstr *I, int) {
+ if (!I)
+ return false;
+
+ auto InstType2 = IsHazardInst(I);
+ if (InstType == InstType2)
+ return true;
+
+ return I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
+ I->getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
+ !I->getOperand(1).getImm();
+ };
+
+ return ::getWaitStatesSince(IsHazardFn, I, IsExpiredFn) !=
+ std::numeric_limits<int>::max();
+ };
+
+ if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
+ std::numeric_limits<int>::max())
+ return false;
+
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+ TII->get(AMDGPU::S_WAITCNT_VSCNT))
+ .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
+ .addImm(0);
+
+ return true;
+}
+
+int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
+ int NSAtoVMEMWaitStates = 1;
+
+ if (!ST.hasNSAtoVMEMBug())
+ return 0;
+
+ if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
+ return 0;
+
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
+ if (!Offset || (Offset->getImm() & 6) == 0)
+ return 0;
+
+ auto IsHazardFn = [TII] (MachineInstr *I) {
+ if (!SIInstrInfo::isMIMG(*I))
+ return false;
+ const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I->getOpcode());
+ return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
+ TII->getInstSizeInBytes(*I) >= 16;
+ };
+
+ return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
+}
+
+int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
+ int FPAtomicToDenormModeWaitStates = 3;
+
+ if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
+ return 0;
+
+ auto IsHazardFn = [] (MachineInstr *I) {
+ if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isFLAT(*I))
+ return false;
+ return SIInstrInfo::isFPAtomic(*I);
+ };
+
+ auto IsExpiredFn = [] (MachineInstr *MI, int WaitStates) {
+ if (WaitStates >= 3 || SIInstrInfo::isVALU(*MI))
+ return true;
+
+ switch (MI->getOpcode()) {
+ case AMDGPU::S_WAITCNT:
+ case AMDGPU::S_WAITCNT_VSCNT:
+ case AMDGPU::S_WAITCNT_VMCNT:
+ case AMDGPU::S_WAITCNT_EXPCNT:
+ case AMDGPU::S_WAITCNT_LGKMCNT:
+ case AMDGPU::S_WAITCNT_IDLE:
+ return true;
+ default:
+ break;
+ }
+
+ return false;
+ };
+
+
+ return FPAtomicToDenormModeWaitStates -
+ ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
+}
+
+int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
+ assert(SIInstrInfo::isMAI(*MI));
+
+ int WaitStatesNeeded = 0;
+ unsigned Opc = MI->getOpcode();
+
+ auto IsVALUFn = [] (MachineInstr *MI) {
+ return SIInstrInfo::isVALU(*MI);
+ };
+
+ if (Opc != AMDGPU::V_ACCVGPR_READ_B32) { // MFMA or v_accvgpr_write
+ const int LegacyVALUWritesVGPRWaitStates = 2;
+ const int VALUWritesExecWaitStates = 4;
+ const int MaxWaitStates = 4;
+
+ int WaitStatesNeededForUse = VALUWritesExecWaitStates -
+ getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
+ WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+
+ if (WaitStatesNeeded < MaxWaitStates) {
+ for (const MachineOperand &Use : MI->explicit_uses()) {
+ const int MaxWaitStates = 2;
+
+ if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
+ continue;
+
+ int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
+ getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
+ WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+
+ if (WaitStatesNeeded == MaxWaitStates)
+ break;
+ }
+ }
+ }
+
+ auto IsMFMAFn = [] (MachineInstr *MI) {
+ return SIInstrInfo::isMAI(*MI) &&
+ MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32 &&
+ MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32;
+ };
+
+ for (const MachineOperand &Op : MI->explicit_operands()) {
+ if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
+ continue;
+
+ if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32)
+ continue;
+
+ const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
+ const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
+ const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
+ const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
+ const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
+ const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
+ const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
+ const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
+ const int MaxWaitStates = 18;
+ unsigned Reg = Op.getReg();
+ unsigned HazardDefLatency = 0;
+
+ auto IsOverlappedMFMAFn = [Reg, &IsMFMAFn, &HazardDefLatency, this]
+ (MachineInstr *MI) {
+ if (!IsMFMAFn(MI))
+ return false;
+ unsigned DstReg = MI->getOperand(0).getReg();
+ if (DstReg == Reg)
+ return false;
+ HazardDefLatency = std::max(HazardDefLatency,
+ TSchedModel.computeInstrLatency(MI));
+ return TRI.regsOverlap(DstReg, Reg);
+ };
+
+ int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
+ MaxWaitStates);
+ int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
+ int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
+ int OpNo = MI->getOperandNo(&Op);
+ if (OpNo == SrcCIdx) {
+ NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
+ } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32) {
+ switch (HazardDefLatency) {
+ case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
+ break;
+ case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
+ break;
+ case 16: LLVM_FALLTHROUGH;
+ default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
+ break;
+ }
+ } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32) {
+ switch (HazardDefLatency) {
+ case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
+ break;
+ case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
+ break;
+ case 16: LLVM_FALLTHROUGH;
+ default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
+ break;
+ }
+ }
+
+ int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
+ WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+
+ if (WaitStatesNeeded == MaxWaitStates)
+ return WaitStatesNeeded; // Early exit.
+
+ auto IsAccVgprWriteFn = [Reg, this] (MachineInstr *MI) {
+ if (MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32)
+ return false;
+ unsigned DstReg = MI->getOperand(0).getReg();
+ return TRI.regsOverlap(Reg, DstReg);
+ };
+
+ const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
+ const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
+ const int AccVGPRWriteAccVgprReadWaitStates = 3;
+ NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
+ if (OpNo == SrcCIdx)
+ NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
+ else if (Opc == AMDGPU::V_ACCVGPR_READ_B32)
+ NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
+
+ WaitStatesNeededForUse = NeedWaitStates -
+ getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
+ WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+
+ if (WaitStatesNeeded == MaxWaitStates)
+ return WaitStatesNeeded; // Early exit.
+ }
+
+ if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32) {
+ const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
+ const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
+ const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
+ const int MaxWaitStates = 13;
+ unsigned DstReg = MI->getOperand(0).getReg();
+ unsigned HazardDefLatency = 0;
+
+ auto IsSrcCMFMAFn = [DstReg, &IsMFMAFn, &HazardDefLatency, this]
+ (MachineInstr *MI) {
+ if (!IsMFMAFn(MI))
+ return false;
+ unsigned Reg = TII.getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
+ HazardDefLatency = std::max(HazardDefLatency,
+ TSchedModel.computeInstrLatency(MI));
+ return TRI.regsOverlap(Reg, DstReg);
+ };
+
+ int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
+ int NeedWaitStates;
+ switch (HazardDefLatency) {
+ case 2: NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
+ break;
+ case 8: NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
+ break;
+ case 16: LLVM_FALLTHROUGH;
+ default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
+ break;
+ }
+
+ int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
+ WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+ }
+
+ return WaitStatesNeeded;
+}
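For readability, the latency-to-wait-state mapping that the switches above encode for one case (an MFMA writing an AGPR that a following v_accvgpr_read reads) can be restated as a small helper. The 4x4/16x16/32x32 association is taken from the constant names in this patch; the helper itself is illustrative, not part of the code.

// Wait states required between an MFMA writing an AGPR and a v_accvgpr_read
// of that AGPR, keyed by the MFMA's def latency (2/8/16 cycles in this patch).
int mfmaWritesAgprAccVgprReadWaitStates(unsigned MfmaDefLatency) {
  switch (MfmaDefLatency) {
  case 2:  return 4;    // MFMA4x4WritesAGPRAccVgprReadWaitStates
  case 8:  return 10;   // MFMA16x16WritesAGPRAccVgprReadWaitStates
  default: return 18;   // MFMA32x32WritesAGPRAccVgprReadWaitStates (latency 16)
  }
}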
+
+int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
+ if (!ST.hasMAIInsts())
+ return 0;
+
+ int WaitStatesNeeded = 0;
+
+ auto IsAccVgprReadFn = [] (MachineInstr *MI) {
+ return MI->getOpcode() == AMDGPU::V_ACCVGPR_READ_B32;
+ };
+
+ for (const MachineOperand &Op : MI->explicit_uses()) {
+ if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
+ continue;
+
+ unsigned Reg = Op.getReg();
+
+ const int AccVgprReadLdStWaitStates = 2;
+ const int VALUWriteAccVgprReadLdStDepVALUWaitStates = 1;
+ const int MaxWaitStates = 2;
+
+ int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
+ getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
+ WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+
+ if (WaitStatesNeeded == MaxWaitStates)
+ return WaitStatesNeeded; // Early exit.
+
+ auto IsVALUAccVgprReadCheckFn = [Reg, this] (MachineInstr *MI) {
+ if (MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32)
+ return false;
+ auto IsVALUFn = [] (MachineInstr *MI) {
+ return SIInstrInfo::isVALU(*MI) && !SIInstrInfo::isMAI(*MI);
+ };
+ return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
+ std::numeric_limits<int>::max();
+ };
+
+ WaitStatesNeededForUse = VALUWriteAccVgprReadLdStDepVALUWaitStates -
+ getWaitStatesSince(IsVALUAccVgprReadCheckFn, MaxWaitStates);
+ WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+ }
+
+ return WaitStatesNeeded;
}
diff --git a/lib/Target/AMDGPU/GCNHazardRecognizer.h b/lib/Target/AMDGPU/GCNHazardRecognizer.h
index ca17e7cb6018..6aa2e70dfbfb 100644
--- a/lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ b/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -1,9 +1,8 @@
//===-- GCNHazardRecognizers.h - GCN Hazard Recognizers ---------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -17,6 +16,7 @@
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/CodeGen/TargetSchedule.h"
#include <list>
namespace llvm {
@@ -31,6 +31,13 @@ class SIRegisterInfo;
class GCNSubtarget;
class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
+public:
+ typedef function_ref<bool(MachineInstr *)> IsHazardFn;
+
+private:
+  // Distinguish whether we are called from the scheduler or the hazard recognizer.
+ bool IsHazardRecognizerMode;
+
// This variable stores the instruction that has been emitted this cycle. It
// will be added to EmittedInstrs, when AdvanceCycle() or RecedeCycle() is
// called.
@@ -40,6 +47,7 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
const GCNSubtarget &ST;
const SIInstrInfo &TII;
const SIRegisterInfo &TRI;
+ TargetSchedModel TSchedModel;
/// RegUnits of uses in the current soft memory clause.
BitVector ClauseUses;
@@ -54,11 +62,13 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
void addClauseInst(const MachineInstr &MI);
- int getWaitStatesSince(function_ref<bool(MachineInstr *)> IsHazard);
- int getWaitStatesSinceDef(unsigned Reg,
- function_ref<bool(MachineInstr *)> IsHazardDef =
- [](MachineInstr *) { return true; });
- int getWaitStatesSinceSetReg(function_ref<bool(MachineInstr *)> IsHazard);
+ // Advance over a MachineInstr bundle. Look for hazards in the bundled
+ // instructions.
+ void processBundle();
+
+ int getWaitStatesSince(IsHazardFn IsHazard, int Limit);
+ int getWaitStatesSinceDef(unsigned Reg, IsHazardFn IsHazardDef, int Limit);
+ int getWaitStatesSinceSetReg(IsHazardFn IsHazard, int Limit);
int checkSoftClauseHazards(MachineInstr *SMEM);
int checkSMRDHazards(MachineInstr *SMRD);
@@ -75,6 +85,18 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
int checkInlineAsmHazards(MachineInstr *IA);
int checkAnyInstHazards(MachineInstr *MI);
int checkReadM0Hazards(MachineInstr *SMovRel);
+ int checkNSAtoVMEMHazard(MachineInstr *MI);
+ int checkFPAtomicToDenormModeHazard(MachineInstr *MI);
+ void fixHazards(MachineInstr *MI);
+ bool fixVcmpxPermlaneHazards(MachineInstr *MI);
+ bool fixVMEMtoScalarWriteHazards(MachineInstr *MI);
+ bool fixSMEMtoVectorWriteHazards(MachineInstr *MI);
+ bool fixVcmpxExecWARHazard(MachineInstr *MI);
+ bool fixLdsBranchVmemWARHazard(MachineInstr *MI);
+
+ int checkMAIHazards(MachineInstr *MI);
+ int checkMAILdStHazards(MachineInstr *MI);
+
public:
GCNHazardRecognizer(const MachineFunction &MF);
// We can only issue one instruction per cycle.
@@ -85,6 +107,7 @@ public:
void EmitNoop() override;
unsigned PreEmitNoops(SUnit *SU) override;
unsigned PreEmitNoops(MachineInstr *) override;
+ unsigned PreEmitNoopsCommon(MachineInstr *);
void AdvanceCycle() override;
void RecedeCycle() override;
};
diff --git a/lib/Target/AMDGPU/GCNILPSched.cpp b/lib/Target/AMDGPU/GCNILPSched.cpp
index d62dc8d86781..1eb617640c32 100644
--- a/lib/Target/AMDGPU/GCNILPSched.cpp
+++ b/lib/Target/AMDGPU/GCNILPSched.cpp
@@ -1,9 +1,8 @@
//===---------------------------- GCNILPSched.cpp - -----------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
index 8e4cc391dc21..3525174223bd 100644
--- a/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
+++ b/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
@@ -1,9 +1,8 @@
//===- GCNIterativeScheduler.cpp ------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/GCNIterativeScheduler.h b/lib/Target/AMDGPU/GCNIterativeScheduler.h
index 14ef5147f32a..e6f83914af5b 100644
--- a/lib/Target/AMDGPU/GCNIterativeScheduler.h
+++ b/lib/Target/AMDGPU/GCNIterativeScheduler.h
@@ -1,9 +1,8 @@
//===- GCNIterativeScheduler.h - GCN Scheduler ------------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/GCNMinRegStrategy.cpp b/lib/Target/AMDGPU/GCNMinRegStrategy.cpp
index ec6bcae33555..c469cf290e26 100644
--- a/lib/Target/AMDGPU/GCNMinRegStrategy.cpp
+++ b/lib/Target/AMDGPU/GCNMinRegStrategy.cpp
@@ -1,9 +1,8 @@
//===- GCNMinRegStrategy.cpp ----------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/GCNNSAReassign.cpp b/lib/Target/AMDGPU/GCNNSAReassign.cpp
new file mode 100644
index 000000000000..51c4c99cfb18
--- /dev/null
+++ b/lib/Target/AMDGPU/GCNNSAReassign.cpp
@@ -0,0 +1,343 @@
+//===-- GCNNSAReassign.cpp - Reassign registers in NSA instructions -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Try to reassign registers on GFX10+ from non-sequential to sequential
+/// in NSA image instructions. Later, the SIShrinkInstructions pass will replace
+/// NSA with sequential versions where possible.
+///
+//===----------------------------------------------------------------------===//
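The property the pass looks for boils down to a simple test on the physical VGPRs assigned to an NSA instruction's address operands, sketched below with an invented name (isContiguous) and plain integers standing in for register numbers; the pass's CheckNSA below additionally filters out cases it must not touch.

#include <cstddef>
#include <vector>

// True if the address VGPRs form a consecutive run starting at the first one,
// in which case the NSA encoding can later be shrunk to the sequential form.
bool isContiguous(const std::vector<unsigned> &PhysVgprIndices) {
  for (std::size_t I = 1; I < PhysVgprIndices.size(); ++I)
    if (PhysVgprIndices[I] != PhysVgprIndices[0] + I)
      return false;
  return true;
}
// e.g. {4, 5, 6} (v4,v5,v6) -> true; {4, 7, 5} -> false.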
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/LiveRegMatrix.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/Support/MathExtras.h"
+#include <algorithm>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-nsa-reassign"
+
+STATISTIC(NumNSAInstructions,
+ "Number of NSA instructions with non-sequential address found");
+STATISTIC(NumNSAConverted,
+ "Number of NSA instructions changed to sequential");
+
+namespace {
+
+class GCNNSAReassign : public MachineFunctionPass {
+public:
+ static char ID;
+
+ GCNNSAReassign() : MachineFunctionPass(ID) {
+ initializeGCNNSAReassignPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override { return "GCN NSA Reassign"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<LiveIntervals>();
+ AU.addRequired<VirtRegMap>();
+ AU.addRequired<LiveRegMatrix>();
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+private:
+ typedef enum {
+ NOT_NSA, // Not an NSA instruction
+ FIXED, // NSA which we cannot modify
+ NON_CONTIGUOUS, // NSA with non-sequential address which we can try
+ // to optimize.
+ CONTIGUOUS // NSA with all sequential address registers
+ } NSA_Status;
+
+ const GCNSubtarget *ST;
+
+ const MachineRegisterInfo *MRI;
+
+ const SIRegisterInfo *TRI;
+
+ VirtRegMap *VRM;
+
+ LiveRegMatrix *LRM;
+
+ LiveIntervals *LIS;
+
+ unsigned MaxNumVGPRs;
+
+ const MCPhysReg *CSRegs;
+
+ NSA_Status CheckNSA(const MachineInstr &MI, bool Fast = false) const;
+
+ bool tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals,
+ unsigned StartReg) const;
+
+ bool canAssign(unsigned StartReg, unsigned NumRegs) const;
+
+ bool scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const;
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS_BEGIN(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
+INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix)
+INITIALIZE_PASS_END(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign",
+ false, false)
+
+
+char GCNNSAReassign::ID = 0;
+
+char &llvm::GCNNSAReassignID = GCNNSAReassign::ID;
+
+bool
+GCNNSAReassign::tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals,
+ unsigned StartReg) const {
+ unsigned NumRegs = Intervals.size();
+
+ for (unsigned N = 0; N < NumRegs; ++N)
+ if (VRM->hasPhys(Intervals[N]->reg))
+ LRM->unassign(*Intervals[N]);
+
+ for (unsigned N = 0; N < NumRegs; ++N)
+ if (LRM->checkInterference(*Intervals[N], StartReg + N))
+ return false;
+
+ for (unsigned N = 0; N < NumRegs; ++N)
+ LRM->assign(*Intervals[N], StartReg + N);
+
+ return true;
+}
+
+bool GCNNSAReassign::canAssign(unsigned StartReg, unsigned NumRegs) const {
+ for (unsigned N = 0; N < NumRegs; ++N) {
+ unsigned Reg = StartReg + N;
+ if (!MRI->isAllocatable(Reg))
+ return false;
+
+ for (unsigned I = 0; CSRegs[I]; ++I)
+ if (TRI->isSubRegisterEq(Reg, CSRegs[I]) &&
+ !LRM->isPhysRegUsed(CSRegs[I]))
+ return false;
+ }
+
+ return true;
+}
+
+bool
+GCNNSAReassign::scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const {
+ unsigned NumRegs = Intervals.size();
+
+ if (NumRegs > MaxNumVGPRs)
+ return false;
+ unsigned MaxReg = MaxNumVGPRs - NumRegs + AMDGPU::VGPR0;
+
+ for (unsigned Reg = AMDGPU::VGPR0; Reg <= MaxReg; ++Reg) {
+ if (!canAssign(Reg, NumRegs))
+ continue;
+
+ if (tryAssignRegisters(Intervals, Reg))
+ return true;
+ }
+
+ return false;
+}
+
+GCNNSAReassign::NSA_Status
+GCNNSAReassign::CheckNSA(const MachineInstr &MI, bool Fast) const {
+ const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
+ if (!Info || Info->MIMGEncoding != AMDGPU::MIMGEncGfx10NSA)
+ return NSA_Status::NOT_NSA;
+
+ int VAddr0Idx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
+
+ unsigned VgprBase = 0;
+ bool NSA = false;
+ for (unsigned I = 0; I < Info->VAddrDwords; ++I) {
+ const MachineOperand &Op = MI.getOperand(VAddr0Idx + I);
+ unsigned Reg = Op.getReg();
+ if (TargetRegisterInfo::isPhysicalRegister(Reg) || !VRM->isAssignedReg(Reg))
+ return NSA_Status::FIXED;
+
+ unsigned PhysReg = VRM->getPhys(Reg);
+
+ if (!Fast) {
+ if (!PhysReg)
+ return NSA_Status::FIXED;
+
+      // Bail if the address is not a VGPR32. It should be possible to extend
+      // the optimization to work with subregs of wider register tuples, but
+      // the logic to find free registers would be much more complicated, with
+      // a much lower chance of success. It seems reasonable to assume that in
+      // most cases a tuple is used because a vector variable contains
+      // different parts of an address, and it is either already consecutive
+      // or cannot be reassigned if not. If needed, it is better to rely on the
+      // register coalescer to process such address tuples.
+ if (MRI->getRegClass(Reg) != &AMDGPU::VGPR_32RegClass || Op.getSubReg())
+ return NSA_Status::FIXED;
+
+ const MachineInstr *Def = MRI->getUniqueVRegDef(Reg);
+
+ if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg)
+ return NSA_Status::FIXED;
+
+ for (auto U : MRI->use_nodbg_operands(Reg)) {
+ if (U.isImplicit())
+ return NSA_Status::FIXED;
+ const MachineInstr *UseInst = U.getParent();
+ if (UseInst->isCopy() && UseInst->getOperand(0).getReg() == PhysReg)
+ return NSA_Status::FIXED;
+ }
+
+ if (!LIS->hasInterval(Reg))
+ return NSA_Status::FIXED;
+ }
+
+ if (I == 0)
+ VgprBase = PhysReg;
+ else if (VgprBase + I != PhysReg)
+ NSA = true;
+ }
+
+ return NSA ? NSA_Status::NON_CONTIGUOUS : NSA_Status::CONTIGUOUS;
+}
+
+bool GCNNSAReassign::runOnMachineFunction(MachineFunction &MF) {
+ ST = &MF.getSubtarget<GCNSubtarget>();
+ if (ST->getGeneration() < GCNSubtarget::GFX10)
+ return false;
+
+ MRI = &MF.getRegInfo();
+ TRI = ST->getRegisterInfo();
+ VRM = &getAnalysis<VirtRegMap>();
+ LRM = &getAnalysis<LiveRegMatrix>();
+ LIS = &getAnalysis<LiveIntervals>();
+
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ MaxNumVGPRs = ST->getMaxNumVGPRs(MF);
+ MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(MFI->getOccupancy()), MaxNumVGPRs);
+ CSRegs = MRI->getCalleeSavedRegs();
+
+ using Candidate = std::pair<const MachineInstr*, bool>;
+ SmallVector<Candidate, 32> Candidates;
+ for (const MachineBasicBlock &MBB : MF) {
+ for (const MachineInstr &MI : MBB) {
+ switch (CheckNSA(MI)) {
+ default:
+ continue;
+ case NSA_Status::CONTIGUOUS:
+ Candidates.push_back(std::make_pair(&MI, true));
+ break;
+ case NSA_Status::NON_CONTIGUOUS:
+ Candidates.push_back(std::make_pair(&MI, false));
+ ++NumNSAInstructions;
+ break;
+ }
+ }
+ }
+
+ bool Changed = false;
+ for (auto &C : Candidates) {
+ if (C.second)
+ continue;
+
+ const MachineInstr *MI = C.first;
+ if (CheckNSA(*MI, true) == NSA_Status::CONTIGUOUS) {
+      // Already happens to be fixed.
+ C.second = true;
+ ++NumNSAConverted;
+ continue;
+ }
+
+ const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI->getOpcode());
+ int VAddr0Idx =
+ AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::vaddr0);
+
+ SmallVector<LiveInterval *, 16> Intervals;
+ SmallVector<unsigned, 16> OrigRegs;
+ SlotIndex MinInd, MaxInd;
+ for (unsigned I = 0; I < Info->VAddrDwords; ++I) {
+ const MachineOperand &Op = MI->getOperand(VAddr0Idx + I);
+ unsigned Reg = Op.getReg();
+ LiveInterval *LI = &LIS->getInterval(Reg);
+ if (llvm::find(Intervals, LI) != Intervals.end()) {
+        // Same register used; unable to make the addresses sequential.
+ Intervals.clear();
+ break;
+ }
+ Intervals.push_back(LI);
+ OrigRegs.push_back(VRM->getPhys(Reg));
+ MinInd = I ? std::min(MinInd, LI->beginIndex()) : LI->beginIndex();
+ MaxInd = I ? std::max(MaxInd, LI->endIndex()) : LI->endIndex();
+ }
+
+ if (Intervals.empty())
+ continue;
+
+ LLVM_DEBUG(dbgs() << "Attempting to reassign NSA: " << *MI
+ << "\tOriginal allocation:\t";
+ for(auto *LI : Intervals)
+ dbgs() << " " << llvm::printReg((VRM->getPhys(LI->reg)), TRI);
+ dbgs() << '\n');
+
+ bool Success = scavengeRegs(Intervals);
+ if (!Success) {
+ LLVM_DEBUG(dbgs() << "\tCannot reallocate.\n");
+ if (VRM->hasPhys(Intervals.back()->reg)) // Did not change allocation.
+ continue;
+ } else {
+ // Check we did not make it worse for other instructions.
+ auto I = std::lower_bound(Candidates.begin(), &C, MinInd,
+ [this](const Candidate &C, SlotIndex I) {
+ return LIS->getInstructionIndex(*C.first) < I;
+ });
+ for (auto E = Candidates.end(); Success && I != E &&
+ LIS->getInstructionIndex(*I->first) < MaxInd; ++I) {
+ if (I->second && CheckNSA(*I->first, true) < NSA_Status::CONTIGUOUS) {
+ Success = false;
+ LLVM_DEBUG(dbgs() << "\tNSA conversion conflict with " << *I->first);
+ }
+ }
+ }
+
+ if (!Success) {
+ for (unsigned I = 0; I < Info->VAddrDwords; ++I)
+ if (VRM->hasPhys(Intervals[I]->reg))
+ LRM->unassign(*Intervals[I]);
+
+ for (unsigned I = 0; I < Info->VAddrDwords; ++I)
+ LRM->assign(*Intervals[I], OrigRegs[I]);
+
+ continue;
+ }
+
+ C.second = true;
+ ++NumNSAConverted;
+ LLVM_DEBUG(dbgs() << "\tNew allocation:\t\t ["
+ << llvm::printReg((VRM->getPhys(Intervals.front()->reg)), TRI)
+ << " : "
+ << llvm::printReg((VRM->getPhys(Intervals.back()->reg)), TRI)
+ << "]\n");
+ Changed = true;
+ }
+
+ return Changed;
+}
diff --git a/lib/Target/AMDGPU/GCNProcessors.td b/lib/Target/AMDGPU/GCNProcessors.td
index b8142a4e4ff8..b926041afb2f 100644
--- a/lib/Target/AMDGPU/GCNProcessors.td
+++ b/lib/Target/AMDGPU/GCNProcessors.td
@@ -1,163 +1,185 @@
//===-- GCNProcessors.td - GCN Processor definitions ----------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// The code produced for "generic" is only useful for tests and cannot
// reasonably be expected to execute on any particular target.
def : ProcessorModel<"generic", NoSchedModel,
- [FeatureGCN, FeatureWavefrontSize64]
+ [FeatureWavefrontSize64]
>;
-//===----------------------------------------------------------------------===//
+def : ProcessorModel<"generic-hsa", NoSchedModel,
+ [FeatureWavefrontSize64, FeatureFlatAddressSpace]
+>;
+
+//===------------------------------------------------------------===//
// GCN GFX6 (Southern Islands (SI)).
-//===----------------------------------------------------------------------===//
+//===------------------------------------------------------------===//
def : ProcessorModel<"gfx600", SIFullSpeedModel,
- [FeatureISAVersion6_0_0]
+ FeatureISAVersion6_0_0.Features
>;
def : ProcessorModel<"tahiti", SIFullSpeedModel,
- [FeatureISAVersion6_0_0]
+ FeatureISAVersion6_0_0.Features
>;
def : ProcessorModel<"gfx601", SIQuarterSpeedModel,
- [FeatureISAVersion6_0_1]
+ FeatureISAVersion6_0_1.Features
>;
def : ProcessorModel<"hainan", SIQuarterSpeedModel,
- [FeatureISAVersion6_0_1]
+ FeatureISAVersion6_0_1.Features
>;
def : ProcessorModel<"oland", SIQuarterSpeedModel,
- [FeatureISAVersion6_0_1]
+ FeatureISAVersion6_0_1.Features
>;
def : ProcessorModel<"pitcairn", SIQuarterSpeedModel,
- [FeatureISAVersion6_0_1]
+ FeatureISAVersion6_0_1.Features
>;
def : ProcessorModel<"verde", SIQuarterSpeedModel,
- [FeatureISAVersion6_0_1]
+ FeatureISAVersion6_0_1.Features
>;
-//===----------------------------------------------------------------------===//
+//===------------------------------------------------------------===//
// GCN GFX7 (Sea Islands (CI)).
-//===----------------------------------------------------------------------===//
+//===------------------------------------------------------------===//
def : ProcessorModel<"gfx700", SIQuarterSpeedModel,
- [FeatureISAVersion7_0_0]
+ FeatureISAVersion7_0_0.Features
>;
def : ProcessorModel<"kaveri", SIQuarterSpeedModel,
- [FeatureISAVersion7_0_0]
+ FeatureISAVersion7_0_0.Features
>;
def : ProcessorModel<"gfx701", SIFullSpeedModel,
- [FeatureISAVersion7_0_1]
+ FeatureISAVersion7_0_1.Features
>;
def : ProcessorModel<"hawaii", SIFullSpeedModel,
- [FeatureISAVersion7_0_1]
+ FeatureISAVersion7_0_1.Features
>;
def : ProcessorModel<"gfx702", SIQuarterSpeedModel,
- [FeatureISAVersion7_0_2]
+ FeatureISAVersion7_0_2.Features
>;
def : ProcessorModel<"gfx703", SIQuarterSpeedModel,
- [FeatureISAVersion7_0_3]
+ FeatureISAVersion7_0_3.Features
>;
def : ProcessorModel<"kabini", SIQuarterSpeedModel,
- [FeatureISAVersion7_0_3]
+ FeatureISAVersion7_0_3.Features
>;
def : ProcessorModel<"mullins", SIQuarterSpeedModel,
- [FeatureISAVersion7_0_3]
+ FeatureISAVersion7_0_3.Features
>;
def : ProcessorModel<"gfx704", SIQuarterSpeedModel,
- [FeatureISAVersion7_0_4]
+ FeatureISAVersion7_0_4.Features
>;
def : ProcessorModel<"bonaire", SIQuarterSpeedModel,
- [FeatureISAVersion7_0_4]
+ FeatureISAVersion7_0_4.Features
>;
-//===----------------------------------------------------------------------===//
+//===------------------------------------------------------------===//
// GCN GFX8 (Volcanic Islands (VI)).
-//===----------------------------------------------------------------------===//
+//===------------------------------------------------------------===//
def : ProcessorModel<"gfx801", SIQuarterSpeedModel,
- [FeatureISAVersion8_0_1]
+ FeatureISAVersion8_0_1.Features
>;
def : ProcessorModel<"carrizo", SIQuarterSpeedModel,
- [FeatureISAVersion8_0_1]
+ FeatureISAVersion8_0_1.Features
>;
def : ProcessorModel<"gfx802", SIQuarterSpeedModel,
- [FeatureISAVersion8_0_2]
+ FeatureISAVersion8_0_2.Features
>;
def : ProcessorModel<"iceland", SIQuarterSpeedModel,
- [FeatureISAVersion8_0_2]
+ FeatureISAVersion8_0_2.Features
>;
def : ProcessorModel<"tonga", SIQuarterSpeedModel,
- [FeatureISAVersion8_0_2]
+ FeatureISAVersion8_0_2.Features
>;
def : ProcessorModel<"gfx803", SIQuarterSpeedModel,
- [FeatureISAVersion8_0_3]
+ FeatureISAVersion8_0_3.Features
>;
def : ProcessorModel<"fiji", SIQuarterSpeedModel,
- [FeatureISAVersion8_0_3]
+ FeatureISAVersion8_0_3.Features
>;
def : ProcessorModel<"polaris10", SIQuarterSpeedModel,
- [FeatureISAVersion8_0_3]
+ FeatureISAVersion8_0_3.Features
>;
def : ProcessorModel<"polaris11", SIQuarterSpeedModel,
- [FeatureISAVersion8_0_3]
+ FeatureISAVersion8_0_3.Features
>;
def : ProcessorModel<"gfx810", SIQuarterSpeedModel,
- [FeatureISAVersion8_1_0]
+ FeatureISAVersion8_1_0.Features
>;
def : ProcessorModel<"stoney", SIQuarterSpeedModel,
- [FeatureISAVersion8_1_0]
+ FeatureISAVersion8_1_0.Features
>;
-//===----------------------------------------------------------------------===//
+//===------------------------------------------------------------===//
// GCN GFX9.
-//===----------------------------------------------------------------------===//
+//===------------------------------------------------------------===//
def : ProcessorModel<"gfx900", SIQuarterSpeedModel,
- [FeatureISAVersion9_0_0]
+ FeatureISAVersion9_0_0.Features
>;
def : ProcessorModel<"gfx902", SIQuarterSpeedModel,
- [FeatureISAVersion9_0_2]
+ FeatureISAVersion9_0_2.Features
>;
def : ProcessorModel<"gfx904", SIQuarterSpeedModel,
- [FeatureISAVersion9_0_4]
+ FeatureISAVersion9_0_4.Features
>;
def : ProcessorModel<"gfx906", SIQuarterSpeedModel,
- [FeatureISAVersion9_0_6]
+ FeatureISAVersion9_0_6.Features
+>;
+
+def : ProcessorModel<"gfx908", SIQuarterSpeedModel,
+ FeatureISAVersion9_0_8.Features
>;
def : ProcessorModel<"gfx909", SIQuarterSpeedModel,
- [FeatureISAVersion9_0_9]
+ FeatureISAVersion9_0_9.Features
+>;
+
+//===----------------------------------------------------------------------===//
+// GCN GFX10.
+//===----------------------------------------------------------------------===//
+
+def : ProcessorModel<"gfx1010", GFX10SpeedModel,
+ FeatureISAVersion10_1_0.Features
>;
+def : ProcessorModel<"gfx1011", GFX10SpeedModel,
+ FeatureISAVersion10_1_1.Features
+>;
+
+def : ProcessorModel<"gfx1012", GFX10SpeedModel,
+ FeatureISAVersion10_1_2.Features
+>;
diff --git a/lib/Target/AMDGPU/GCNRegBankReassign.cpp b/lib/Target/AMDGPU/GCNRegBankReassign.cpp
new file mode 100644
index 000000000000..f0d47eaa4ed1
--- /dev/null
+++ b/lib/Target/AMDGPU/GCNRegBankReassign.cpp
@@ -0,0 +1,800 @@
+//===-- GCNRegBankReassign.cpp - Reassign registers after regalloc --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Try to reassign registers on GFX10+ to reduce register bank
+/// conflicts.
+///
+/// On GFX10 registers are organized in banks. VGPRs have 4 banks assigned in
+/// a round-robin fashion: v0, v4, v8... belong to bank 0. v1, v5, v9... to
+/// bank 1, etc. SGPRs have 8 banks and are allocated in pairs, so that s0:s1,
+/// s16:s17, s32:s33 are in bank 0, and s2:s3, s18:s19, s34:s35 are in bank 1, etc.
+///
+/// The shader can read one dword from each of these banks once per cycle.
+/// If an instruction has to read more register operands from the same bank,
+/// an additional cycle is needed. HW attempts to pre-load registers through
+/// input operand gathering, but a stall cycle may occur if that fails. For
+/// example, V_FMA_F32 V111 = V0 + V4 * V8 will need 3 cycles to read operands,
+/// potentially incurring 2 stall cycles.
+///
+/// The pass tries to reassign registers to reduce bank conflicts.
+///
+/// In this pass bank numbers 0-3 are VGPR banks and 4-11 are SGPR banks, so
+/// that 4 has to be subtracted from an SGPR bank number to get the real value.
+/// This also corresponds to bit numbers in bank masks used in the pass.
+///
+//===----------------------------------------------------------------------===//
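A minimal sketch of the bank mapping described above, using plain register indices (v0 -> 0, s0 -> 0) rather than the pass's physical-register handling; the +4 offset matches the pass's convention of numbering SGPR banks 4-11.

// VGPR banks repeat every 4 registers: v0, v4, v8, ... share bank 0.
unsigned vgprBank(unsigned VgprIndex) {
  return VgprIndex % 4;                    // banks 0-3
}

// SGPR banks cover register pairs and repeat every 16 registers:
// s0:s1, s16:s17, ... share bank 0; s2:s3, s18:s19, ... share bank 1.
unsigned sgprBank(unsigned SgprIndex) {
  return (SgprIndex / 2) % 8 + 4;          // banks 4-11 in this pass's numbering
}
// In the V_FMA_F32 example above, v0, v4 and v8 all map to VGPR bank 0,
// so reading the three operands takes three cycles instead of one.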
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/LiveRegMatrix.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/Support/MathExtras.h"
+
+using namespace llvm;
+
+static cl::opt<unsigned> VerifyStallCycles("amdgpu-verify-regbanks-reassign",
+ cl::desc("Verify stall cycles in the regbanks reassign pass"),
+ cl::value_desc("0|1|2"),
+ cl::init(0), cl::Hidden);
+
+#define DEBUG_TYPE "amdgpu-regbanks-reassign"
+
+#define NUM_VGPR_BANKS 4
+#define NUM_SGPR_BANKS 8
+#define NUM_BANKS (NUM_VGPR_BANKS + NUM_SGPR_BANKS)
+#define SGPR_BANK_OFFSET NUM_VGPR_BANKS
+#define VGPR_BANK_MASK 0xf
+#define SGPR_BANK_MASK 0xff0
+#define SGPR_BANK_SHIFTED_MASK (SGPR_BANK_MASK >> SGPR_BANK_OFFSET)
+
+STATISTIC(NumStallsDetected,
+ "Number of operand read stalls detected");
+STATISTIC(NumStallsRecovered,
+ "Number of operand read stalls recovered");
+
+namespace {
+
+class GCNRegBankReassign : public MachineFunctionPass {
+
+ class OperandMask {
+ public:
+ OperandMask(unsigned r, unsigned s, unsigned m)
+ : Reg(r), SubReg(s), Mask(m) {}
+ unsigned Reg;
+ unsigned SubReg;
+ unsigned Mask;
+ };
+
+ class Candidate {
+ public:
+ Candidate(MachineInstr *mi, unsigned reg, unsigned freebanks,
+ unsigned weight)
+ : MI(mi), Reg(reg), FreeBanks(freebanks), Weight(weight) {}
+
+ bool operator< (const Candidate& RHS) const { return Weight < RHS.Weight; }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ void dump(const GCNRegBankReassign *P) const {
+ MI->dump();
+ dbgs() << P->printReg(Reg) << " to banks ";
+ dumpFreeBanks(FreeBanks);
+ dbgs() << " weight " << Weight << '\n';
+ }
+#endif
+
+ MachineInstr *MI;
+ unsigned Reg;
+ unsigned FreeBanks;
+ unsigned Weight;
+ };
+
+ class CandidateList : public std::list<Candidate> {
+ public:
+    // Speed up the subsequent sort.
+ void push(const Candidate&& C) {
+ if (C.Weight) push_back(C);
+ else push_front(C);
+ }
+ };
+
+public:
+ static char ID;
+
+public:
+ GCNRegBankReassign() : MachineFunctionPass(ID) {
+ initializeGCNRegBankReassignPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override { return "GCN RegBank Reassign"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineLoopInfo>();
+ AU.addRequired<LiveIntervals>();
+ AU.addRequired<VirtRegMap>();
+ AU.addRequired<LiveRegMatrix>();
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+private:
+ const GCNSubtarget *ST;
+
+ const MachineRegisterInfo *MRI;
+
+ const SIRegisterInfo *TRI;
+
+ MachineLoopInfo *MLI;
+
+ VirtRegMap *VRM;
+
+ LiveRegMatrix *LRM;
+
+ LiveIntervals *LIS;
+
+ unsigned MaxNumVGPRs;
+
+ unsigned MaxNumSGPRs;
+
+ BitVector RegsUsed;
+
+ SmallVector<OperandMask, 8> OperandMasks;
+
+ CandidateList Candidates;
+
+ const MCPhysReg *CSRegs;
+
+ // Returns bank for a phys reg.
+ unsigned getPhysRegBank(unsigned Reg) const;
+
+ // Return a bit set for each register bank used. 4 banks for VGPRs and
+ // 8 banks for SGPRs.
+ // Registers already processed and recorded in RegsUsed are excluded.
+  // If Bank is not -1, assume Reg:SubReg belongs to that Bank.
+ unsigned getRegBankMask(unsigned Reg, unsigned SubReg, int Bank);
+
+  // Return the number of stalls in the instruction.
+  // UsedBanks has bits set for the banks used by all operands.
+  // If Reg and Bank are provided, assume Reg is placed in that Bank.
+ unsigned analyzeInst(const MachineInstr& MI, unsigned& UsedBanks,
+ unsigned Reg = AMDGPU::NoRegister, int Bank = -1);
+
+ // Return true if register is regular VGPR or SGPR or their tuples.
+ // Returns false for special registers like m0, vcc etc.
+ bool isReassignable(unsigned Reg) const;
+
+  // Check if the registers' defs are old enough that they may be pre-loaded.
+  // Returns 0 if both registers are old enough, or 1 or 2 if one or both
+  // registers are unlikely to be pre-loaded.
+ unsigned getOperandGatherWeight(const MachineInstr& MI,
+ unsigned Reg1,
+ unsigned Reg2,
+ unsigned StallCycles) const;
+
+
+ // Find all bank bits in UsedBanks where Mask can be relocated to.
+ unsigned getFreeBanks(unsigned Mask, unsigned UsedBanks) const;
+
+ // Find all bank bits in UsedBanks where Mask can be relocated to.
+ // Bank is relative to the register and not its subregister component.
+  // Returns 0 if the register is not reassignable.
+ unsigned getFreeBanks(unsigned Reg, unsigned SubReg, unsigned Mask,
+ unsigned UsedBanks) const;
+
+  // Add a candidate instruction to the work list.
+ void collectCandidates(MachineInstr& MI, unsigned UsedBanks,
+ unsigned StallCycles);
+
+  // Collect candidate instructions across the function. Returns the number of
+  // stall cycles detected. Only counts stalls if Collect is false.
+ unsigned collectCandidates(MachineFunction &MF, bool Collect = true);
+
+ // Remove all candidates that read specified register.
+ void removeCandidates(unsigned Reg);
+
+  // Compute stalls within the uses of SrcReg replaced by a register from
+  // Bank. If Bank is -1, no substitution is performed. If Collect is set,
+  // candidates are collected and added to the work list.
+ unsigned computeStallCycles(unsigned SrcReg,
+ unsigned Reg = AMDGPU::NoRegister,
+ int Bank = -1, bool Collect = false);
+
+ // Search for a register in Bank unused within LI.
+ // Returns phys reg or NoRegister.
+ unsigned scavengeReg(LiveInterval& LI, unsigned Bank) const;
+
+  // Try to reassign a candidate. Returns the number of stall cycles saved.
+ unsigned tryReassign(Candidate &C);
+
+ bool verifyCycles(MachineFunction &MF,
+ unsigned OriginalCycles, unsigned CyclesSaved);
+
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+public:
+ Printable printReg(unsigned Reg, unsigned SubReg = 0) const {
+ return Printable([Reg, SubReg, this](raw_ostream &OS) {
+ if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ OS << llvm::printReg(Reg, TRI);
+ return;
+ }
+ if (!VRM->isAssignedReg(Reg))
+ OS << "<unassigned> " << llvm::printReg(Reg, TRI);
+ else
+ OS << llvm::printReg(Reg, TRI) << '('
+ << llvm::printReg(VRM->getPhys(Reg), TRI) << ')';
+ if (SubReg)
+ OS << ':' << TRI->getSubRegIndexName(SubReg);
+ });
+ }
+
+ static Printable printBank(unsigned Bank) {
+ return Printable([Bank](raw_ostream &OS) {
+ OS << ((Bank >= SGPR_BANK_OFFSET) ? Bank - SGPR_BANK_OFFSET : Bank);
+ });
+ }
+
+ static void dumpFreeBanks(unsigned FreeBanks) {
+ for (unsigned L = 0; L < NUM_BANKS; ++L)
+ if (FreeBanks & (1 << L))
+ dbgs() << printBank(L) << ' ';
+ }
+#endif
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS_BEGIN(GCNRegBankReassign, DEBUG_TYPE, "GCN RegBank Reassign",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
+INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix)
+INITIALIZE_PASS_END(GCNRegBankReassign, DEBUG_TYPE, "GCN RegBank Reassign",
+ false, false)
+
+
+char GCNRegBankReassign::ID = 0;
+
+char &llvm::GCNRegBankReassignID = GCNRegBankReassign::ID;
+
+unsigned GCNRegBankReassign::getPhysRegBank(unsigned Reg) const {
+ assert (TargetRegisterInfo::isPhysicalRegister(Reg));
+
+ const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+ unsigned Size = TRI->getRegSizeInBits(*RC);
+ if (Size > 32)
+ Reg = TRI->getSubReg(Reg, AMDGPU::sub0);
+
+ if (TRI->hasVGPRs(RC)) {
+ Reg -= AMDGPU::VGPR0;
+ return Reg % NUM_VGPR_BANKS;
+ }
+
+ Reg = TRI->getEncodingValue(Reg) / 2;
+ return Reg % NUM_SGPR_BANKS + SGPR_BANK_OFFSET;
+}
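A minimal standalone sketch of the bank numbering computed by getPhysRegBank above, assuming the NUM_VGPR_BANKS, NUM_SGPR_BANKS and SGPR_BANK_OFFSET constants defined earlier in this file carry the values this pass uses (4, 8 and 4); the plain integer indices are hypothetical stand-ins for the hardware register numbers:

  // VGPR banks are assigned round-robin over four banks.
  constexpr unsigned NUM_VGPR_BANKS = 4;
  constexpr unsigned NUM_SGPR_BANKS = 8;
  constexpr unsigned SGPR_BANK_OFFSET = NUM_VGPR_BANKS;

  unsigned vgprBank(unsigned VGPRIdx) { return VGPRIdx % NUM_VGPR_BANKS; }

  // SGPR banks hold two consecutive registers each and are numbered after
  // the four VGPR banks.
  unsigned sgprBank(unsigned SGPRIdx) {
    return (SGPRIdx / 2) % NUM_SGPR_BANKS + SGPR_BANK_OFFSET;
  }

  // Example: v0 and v4 share bank 0; s0 and s1 share bank 4, s2 and s3 bank 5.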
+
+unsigned GCNRegBankReassign::getRegBankMask(unsigned Reg, unsigned SubReg,
+ int Bank) {
+ if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (!VRM->isAssignedReg(Reg))
+ return 0;
+
+ Reg = VRM->getPhys(Reg);
+ if (!Reg)
+ return 0;
+ if (SubReg)
+ Reg = TRI->getSubReg(Reg, SubReg);
+ }
+
+ const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+ unsigned Size = TRI->getRegSizeInBits(*RC) / 32;
+ if (Size > 1)
+ Reg = TRI->getSubReg(Reg, AMDGPU::sub0);
+
+ if (TRI->hasVGPRs(RC)) {
+ // VGPRs have 4 banks assigned in a round-robin fashion.
+ Reg -= AMDGPU::VGPR0;
+ unsigned Mask = (1 << Size) - 1;
+ unsigned Used = 0;
+ // BitVector lacks a method to extract a range of bits, so test each bit.
+ for (unsigned I = 0; I < Size; ++I)
+ if (RegsUsed.test(Reg + I))
+ Used |= 1 << I;
+ RegsUsed.set(Reg, Reg + Size);
+ Mask &= ~Used;
+ Mask <<= (Bank == -1) ? Reg % NUM_VGPR_BANKS : unsigned(Bank);
+ return (Mask | (Mask >> NUM_VGPR_BANKS)) & VGPR_BANK_MASK;
+ }
+
+ // SGPRs have 8 banks holding 2 consecutive registers each.
+ Reg = TRI->getEncodingValue(Reg) / 2;
+ unsigned StartBit = AMDGPU::VGPR_32RegClass.getNumRegs();
+ if (Reg + StartBit >= RegsUsed.size())
+ return 0;
+
+ if (Size > 1)
+ Size /= 2;
+ unsigned Mask = (1 << Size) - 1;
+ unsigned Used = 0;
+ for (unsigned I = 0; I < Size; ++I)
+ if (RegsUsed.test(StartBit + Reg + I))
+ Used |= 1 << I;
+ RegsUsed.set(StartBit + Reg, StartBit + Reg + Size);
+ Mask &= ~Used;
+ Mask <<= (Bank == -1) ? Reg % NUM_SGPR_BANKS
+ : unsigned(Bank - SGPR_BANK_OFFSET);
+ Mask = (Mask | (Mask >> NUM_SGPR_BANKS)) & SGPR_BANK_SHIFTED_MASK;
+ // Reserve 4 bank ids for VGPRs.
+ return Mask << SGPR_BANK_OFFSET;
+}
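A worked illustration of the shift-and-fold at the end of the VGPR path in getRegBankMask above; this is a hypothetical helper, assuming NUM_VGPR_BANKS is 4 and the VGPR bank mask is 0xf as used by this pass. A 64-bit tuple assigned to v3 occupies banks 3 and 0, and the fold produces exactly that wrap-around mask:

  unsigned foldExample() {
    const unsigned NUM_VGPR_BANKS = 4;
    unsigned Size = 2;                               // 64-bit tuple = two 32-bit regs
    unsigned Mask = (1u << Size) - 1;                // 0b0011
    Mask <<= 3 % NUM_VGPR_BANKS;                     // 0b11000: bank 3 plus an overflow bit
    return (Mask | (Mask >> NUM_VGPR_BANKS)) & 0xf;  // 0b1001 -> banks 3 and 0
  }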
+
+unsigned GCNRegBankReassign::analyzeInst(const MachineInstr& MI,
+ unsigned& UsedBanks,
+ unsigned Reg,
+ int Bank) {
+ unsigned StallCycles = 0;
+ UsedBanks = 0;
+
+ if (MI.isDebugValue())
+ return 0;
+
+ RegsUsed.reset();
+ OperandMasks.clear();
+ for (const auto& Op : MI.explicit_uses()) {
+ // Undef can be assigned to any register, so two vregs can be assigned
+ // the same phys reg within the same instruction.
+ if (!Op.isReg() || Op.isUndef())
+ continue;
+
+ unsigned R = Op.getReg();
+ if (TRI->hasAGPRs(TRI->getRegClassForReg(*MRI, R)))
+ continue;
+
+ unsigned ShiftedBank = Bank;
+
+ if (Bank != -1 && R == Reg && Op.getSubReg()) {
+ unsigned LM = TRI->getSubRegIndexLaneMask(Op.getSubReg()).getAsInteger();
+ if (!(LM & 1) && (Bank < NUM_VGPR_BANKS)) {
+ // If a register spans all banks we cannot shift it to avoid conflict.
+ if (countPopulation(LM) >= NUM_VGPR_BANKS)
+ continue;
+ ShiftedBank = (Bank + countTrailingZeros(LM)) % NUM_VGPR_BANKS;
+ } else if (!(LM & 3) && (Bank >= SGPR_BANK_OFFSET)) {
+ // If a register spans all banks we cannot shift it to avoid conflict.
+ if (countPopulation(LM) / 2 >= NUM_SGPR_BANKS)
+ continue;
+ ShiftedBank = SGPR_BANK_OFFSET + (Bank - SGPR_BANK_OFFSET +
+ (countTrailingZeros(LM) >> 1)) %
+ NUM_SGPR_BANKS;
+ }
+ }
+
+ unsigned Mask = getRegBankMask(R, Op.getSubReg(),
+ (Reg == R) ? ShiftedBank : -1);
+ StallCycles += countPopulation(UsedBanks & Mask);
+ UsedBanks |= Mask;
+ OperandMasks.push_back(OperandMask(Op.getReg(), Op.getSubReg(), Mask));
+ }
+
+ return StallCycles;
+}
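The stall count produced by analyzeInst above follows a simple model: each source operand whose bank mask overlaps a bank already claimed by an earlier operand of the same instruction contributes one stall. A self-contained sketch of that accounting, restricted to single 32-bit registers and using standard-library calls instead of LLVM's countPopulation:

  #include <bitset>
  #include <initializer_list>

  unsigned stallsForBanks(std::initializer_list<unsigned> Banks) {
    unsigned Used = 0, Stalls = 0;
    for (unsigned B : Banks) {
      unsigned Mask = 1u << B;
      Stalls += std::bitset<32>(Used & Mask).count(); // countPopulation in LLVM
      Used |= Mask;
    }
    return Stalls; // banks {0, 0} -> 1 stall, banks {0, 1, 2} -> 0 stalls
  }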
+
+unsigned GCNRegBankReassign::getOperandGatherWeight(const MachineInstr& MI,
+ unsigned Reg1,
+ unsigned Reg2,
+ unsigned StallCycles) const
+{
+ unsigned Defs = 0;
+ MachineBasicBlock::const_instr_iterator Def(MI.getIterator());
+ MachineBasicBlock::const_instr_iterator B(MI.getParent()->instr_begin());
+ for (unsigned S = StallCycles; S && Def != B && Defs != 3; --S) {
+ if (MI.isDebugInstr())
+ continue;
+ --Def;
+ if (Def->getOpcode() == TargetOpcode::IMPLICIT_DEF)
+ continue;
+ if (Def->modifiesRegister(Reg1, TRI))
+ Defs |= 1;
+ if (Def->modifiesRegister(Reg2, TRI))
+ Defs |= 2;
+ }
+ return countPopulation(Defs);
+}
+
+bool GCNRegBankReassign::isReassignable(unsigned Reg) const {
+ if (TargetRegisterInfo::isPhysicalRegister(Reg) || !VRM->isAssignedReg(Reg))
+ return false;
+
+ const MachineInstr *Def = MRI->getUniqueVRegDef(Reg);
+
+ unsigned PhysReg = VRM->getPhys(Reg);
+
+ if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg)
+ return false;
+
+ for (auto U : MRI->use_nodbg_operands(Reg)) {
+ if (U.isImplicit())
+ return false;
+ const MachineInstr *UseInst = U.getParent();
+ if (UseInst->isCopy() && UseInst->getOperand(0).getReg() == PhysReg)
+ return false;
+ }
+
+ const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(PhysReg);
+ if (TRI->hasVGPRs(RC))
+ return true;
+
+ unsigned Size = TRI->getRegSizeInBits(*RC);
+ if (Size > 32)
+ PhysReg = TRI->getSubReg(PhysReg, AMDGPU::sub0);
+
+ return AMDGPU::SGPR_32RegClass.contains(PhysReg);
+}
+
+unsigned GCNRegBankReassign::getFreeBanks(unsigned Mask,
+ unsigned UsedBanks) const {
+ unsigned Size = countPopulation(Mask);
+ unsigned FreeBanks = 0;
+ unsigned Bank = findFirstSet(Mask);
+
+ UsedBanks &= ~Mask;
+
+ // Find free VGPR banks
+ if ((Mask & VGPR_BANK_MASK) && (Size < NUM_VGPR_BANKS)) {
+ for (unsigned I = 0; I < NUM_VGPR_BANKS; ++I) {
+ if (Bank == I)
+ continue;
+ unsigned NewMask = ((1 << Size) - 1) << I;
+ NewMask = (NewMask | (NewMask >> NUM_VGPR_BANKS)) & VGPR_BANK_MASK;
+ if (!(UsedBanks & NewMask))
+ FreeBanks |= 1 << I;
+ }
+ return FreeBanks;
+ }
+
+ // Find free SGPR banks
+ // SGPR tuples must be aligned, so the loop step is the number of banks
+ // a tuple crosses.
+ Bank -= SGPR_BANK_OFFSET;
+ for (unsigned I = 0; I < NUM_SGPR_BANKS; I += Size) {
+ if (Bank == I)
+ continue;
+ unsigned NewMask = ((1 << Size) - 1) << I;
+ NewMask = (NewMask | (NewMask >> NUM_SGPR_BANKS)) & SGPR_BANK_SHIFTED_MASK;
+ if (!(UsedBanks & (NewMask << SGPR_BANK_OFFSET)))
+ FreeBanks |= (1 << SGPR_BANK_OFFSET) << I;
+ }
+
+ return FreeBanks;
+}
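A hypothetical VGPR example of the search above: a single 32-bit register currently in bank 2, with banks 0 and 2 taken by the instruction's operands, leaves banks 1 and 3 as reassignment targets:

  // Mask      = 0b0100  (the register lives in bank 2)
  // UsedBanks = 0b0101  (banks 0 and 2 taken); after UsedBanks &= ~Mask -> 0b0001
  // I = 0: candidate 0b0001 collides with bank 0      -> not free
  // I = 1: candidate 0b0010 is clear                  -> FreeBanks |= 0b0010
  // I = 2: skipped, this is the current bank
  // I = 3: candidate 0b1000 is clear                  -> FreeBanks |= 0b1000
  // Result: FreeBanks = 0b1010, i.e. banks 1 and 3.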
+
+unsigned GCNRegBankReassign::getFreeBanks(unsigned Reg,
+ unsigned SubReg,
+ unsigned Mask,
+ unsigned UsedBanks) const {
+ if (!isReassignable(Reg))
+ return 0;
+
+ unsigned FreeBanks = getFreeBanks(Mask, UsedBanks);
+
+ unsigned LM = TRI->getSubRegIndexLaneMask(SubReg).getAsInteger();
+ if (!(LM & 1) && (Mask & VGPR_BANK_MASK)) {
+ unsigned Shift = countTrailingZeros(LM);
+ if (Shift >= NUM_VGPR_BANKS)
+ return 0;
+ unsigned VB = FreeBanks & VGPR_BANK_MASK;
+ FreeBanks = ((VB >> Shift) | (VB << (NUM_VGPR_BANKS - Shift))) &
+ VGPR_BANK_MASK;
+ } else if (!(LM & 3) && (Mask & SGPR_BANK_MASK)) {
+ unsigned Shift = countTrailingZeros(LM) >> 1;
+ if (Shift >= NUM_SGPR_BANKS)
+ return 0;
+ unsigned SB = FreeBanks >> SGPR_BANK_OFFSET;
+ FreeBanks = ((SB >> Shift) | (SB << (NUM_SGPR_BANKS - Shift))) &
+ SGPR_BANK_SHIFTED_MASK;
+ FreeBanks <<= SGPR_BANK_OFFSET;
+ }
+
+ LLVM_DEBUG(if (FreeBanks) {
+ dbgs() << "Potential reassignments of " << printReg(Reg, SubReg)
+ << " to banks: "; dumpFreeBanks(FreeBanks);
+ dbgs() << '\n'; });
+
+ return FreeBanks;
+}
+
+void GCNRegBankReassign::collectCandidates(MachineInstr& MI,
+ unsigned UsedBanks,
+ unsigned StallCycles) {
+ LLVM_DEBUG(MI.dump());
+
+ if (!StallCycles)
+ return;
+
+ LLVM_DEBUG(dbgs() << "Stall cycles = " << StallCycles << '\n');
+
+ for (unsigned I = 0, E = OperandMasks.size(); I + 1 < E; ++I) {
+ for (unsigned J = I + 1; J != E; ++J) {
+ if (!(OperandMasks[I].Mask & OperandMasks[J].Mask))
+ continue;
+
+ unsigned Reg1 = OperandMasks[I].Reg;
+ unsigned Reg2 = OperandMasks[J].Reg;
+ unsigned SubReg1 = OperandMasks[I].SubReg;
+ unsigned SubReg2 = OperandMasks[J].SubReg;
+ unsigned Mask1 = OperandMasks[I].Mask;
+ unsigned Mask2 = OperandMasks[J].Mask;
+ unsigned Size1 = countPopulation(Mask1);
+ unsigned Size2 = countPopulation(Mask2);
+
+ LLVM_DEBUG(dbgs() << "Conflicting operands: " << printReg(Reg1, SubReg1) <<
+ " and " << printReg(Reg2, SubReg2) << '\n');
+
+ unsigned Weight = getOperandGatherWeight(MI, Reg1, Reg2, StallCycles);
+ Weight += MLI->getLoopDepth(MI.getParent()) * 10;
+
+ LLVM_DEBUG(dbgs() << "Stall weight = " << Weight << '\n');
+
+ unsigned FreeBanks1 = getFreeBanks(Reg1, SubReg1, Mask1, UsedBanks);
+ unsigned FreeBanks2 = getFreeBanks(Reg2, SubReg2, Mask2, UsedBanks);
+ if (FreeBanks1)
+ Candidates.push(Candidate(&MI, Reg1, FreeBanks1, Weight
+ + ((Size2 > Size1) ? 1 : 0)));
+ if (FreeBanks2)
+ Candidates.push(Candidate(&MI, Reg2, FreeBanks2, Weight
+ + ((Size1 > Size2) ? 1 : 0)));
+ }
+ }
+}
+
+unsigned GCNRegBankReassign::computeStallCycles(unsigned SrcReg,
+ unsigned Reg, int Bank,
+ bool Collect) {
+ unsigned TotalStallCycles = 0;
+ unsigned UsedBanks = 0;
+ SmallSet<const MachineInstr *, 16> Visited;
+
+ for (auto &MI : MRI->use_nodbg_instructions(SrcReg)) {
+ if (MI.isBundle())
+ continue;
+ if (!Visited.insert(&MI).second)
+ continue;
+ unsigned StallCycles = analyzeInst(MI, UsedBanks, Reg, Bank);
+ TotalStallCycles += StallCycles;
+ if (Collect)
+ collectCandidates(MI, UsedBanks, StallCycles);
+ }
+
+ return TotalStallCycles;
+}
+
+unsigned GCNRegBankReassign::scavengeReg(LiveInterval& LI,
+ unsigned Bank) const {
+ const TargetRegisterClass *RC = MRI->getRegClass(LI.reg);
+ unsigned MaxNumRegs = (Bank < NUM_VGPR_BANKS) ? MaxNumVGPRs
+ : MaxNumSGPRs;
+ unsigned MaxReg = MaxNumRegs + (Bank < NUM_VGPR_BANKS ? AMDGPU::VGPR0
+ : AMDGPU::SGPR0);
+
+ for (unsigned Reg : RC->getRegisters()) {
+ // Check occupancy limit.
+ if (TRI->isSubRegisterEq(Reg, MaxReg))
+ break;
+
+ if (!MRI->isAllocatable(Reg) || getPhysRegBank(Reg) != Bank)
+ continue;
+
+ for (unsigned I = 0; CSRegs[I]; ++I)
+ if (TRI->isSubRegisterEq(Reg, CSRegs[I]) &&
+ !LRM->isPhysRegUsed(CSRegs[I]))
+ return AMDGPU::NoRegister;
+
+ LLVM_DEBUG(dbgs() << "Trying register " << printReg(Reg) << '\n');
+
+ if (!LRM->checkInterference(LI, Reg))
+ return Reg;
+ }
+
+ return AMDGPU::NoRegister;
+}
+
+unsigned GCNRegBankReassign::tryReassign(Candidate &C) {
+ if (!LIS->hasInterval(C.Reg))
+ return 0;
+
+ LiveInterval &LI = LIS->getInterval(C.Reg);
+ LLVM_DEBUG(dbgs() << "Try reassign " << printReg(C.Reg) << " in "; C.MI->dump();
+ LI.dump());
+
+ // For each candidate bank, walk all instructions in the range of the live
+ // interval and check if replacing the register with one belonging to the
+ // candidate bank reduces conflicts.
+
+ unsigned OrigStalls = computeStallCycles(C.Reg);
+ LLVM_DEBUG(dbgs() << "--- Stall cycles in range = " << OrigStalls << '\n');
+ if (!OrigStalls)
+ return 0;
+
+ struct BankStall {
+ BankStall(unsigned b, unsigned s) : Bank(b), Stalls(s) {};
+ bool operator< (const BankStall &RHS) const { return Stalls > RHS.Stalls; }
+ unsigned Bank;
+ unsigned Stalls;
+ };
+ SmallVector<BankStall, 8> BankStalls;
+
+ for (int Bank = 0; Bank < NUM_BANKS; ++Bank) {
+ if (C.FreeBanks & (1 << Bank)) {
+ LLVM_DEBUG(dbgs() << "Trying bank " << printBank(Bank) << '\n');
+ unsigned Stalls = computeStallCycles(C.Reg, C.Reg, Bank);
+ if (Stalls < OrigStalls) {
+ LLVM_DEBUG(dbgs() << "With bank " << printBank(Bank) << " -> "
+ << Stalls << '\n');
+ BankStalls.push_back(BankStall((unsigned)Bank, Stalls));
+ }
+ }
+ }
+ std::sort(BankStalls.begin(), BankStalls.end());
+
+ unsigned OrigReg = VRM->getPhys(C.Reg);
+ LRM->unassign(LI);
+ while (!BankStalls.empty()) {
+ BankStall BS = BankStalls.pop_back_val();
+ unsigned Reg = scavengeReg(LI, BS.Bank);
+ if (Reg == AMDGPU::NoRegister) {
+ LLVM_DEBUG(dbgs() << "No free registers in bank " << printBank(BS.Bank)
+ << '\n');
+ continue;
+ }
+ LLVM_DEBUG(dbgs() << "Found free register " << printReg(Reg)
+ << (LRM->isPhysRegUsed(Reg) ? "" : " (new)")
+ << " in bank " << printBank(BS.Bank) << '\n');
+
+ LRM->assign(LI, Reg);
+
+ LLVM_DEBUG(dbgs() << "--- Cycles saved: " << OrigStalls - BS.Stalls << '\n');
+
+ return OrigStalls - BS.Stalls;
+ }
+ LRM->assign(LI, OrigReg);
+
+ return 0;
+}
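One detail worth noting in tryReassign above: BankStall::operator< deliberately compares with Stalls > RHS.Stalls, so after std::sort the vector runs from most to fewest stalls and pop_back_val() yields the most promising bank first. A self-contained sketch of that ordering, with std::vector standing in for SmallVector and made-up stall counts:

  #include <algorithm>
  #include <vector>

  struct BankStall {
    unsigned Bank, Stalls;
    bool operator<(const BankStall &RHS) const { return Stalls > RHS.Stalls; }
  };

  BankStall pickBestBank() {
    std::vector<BankStall> BankStalls = {{0, 3}, {1, 1}, {2, 2}};
    std::sort(BankStalls.begin(), BankStalls.end()); // stalls now ordered 3, 2, 1
    BankStall Best = BankStalls.back();              // bank 1: fewest stalls left
    BankStalls.pop_back();                           // tried first, as in the loop above
    return Best;
  }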
+
+unsigned GCNRegBankReassign::collectCandidates(MachineFunction &MF,
+ bool Collect) {
+ unsigned TotalStallCycles = 0;
+
+ for (MachineBasicBlock &MBB : MF) {
+
+ LLVM_DEBUG(if (Collect) {
+ if (MBB.getName().empty()) dbgs() << "bb." << MBB.getNumber();
+ else dbgs() << MBB.getName(); dbgs() << ":\n";
+ });
+
+ for (MachineInstr &MI : MBB.instrs()) {
+ if (MI.isBundle())
+ continue; // we analyze the instructions inside the bundle individually
+
+ unsigned UsedBanks = 0;
+ unsigned StallCycles = analyzeInst(MI, UsedBanks);
+
+ if (Collect)
+ collectCandidates(MI, UsedBanks, StallCycles);
+
+ TotalStallCycles += StallCycles;
+ }
+
+ LLVM_DEBUG(if (Collect) { dbgs() << '\n'; });
+ }
+
+ return TotalStallCycles;
+}
+
+void GCNRegBankReassign::removeCandidates(unsigned Reg) {
+ Candidates.remove_if([Reg, this](const Candidate& C) {
+ return C.MI->readsRegister(Reg, TRI);
+ });
+}
+
+bool GCNRegBankReassign::verifyCycles(MachineFunction &MF,
+ unsigned OriginalCycles,
+ unsigned CyclesSaved) {
+ unsigned StallCycles = collectCandidates(MF, false);
+ LLVM_DEBUG(dbgs() << "=== After the pass " << StallCycles
+ << " stall cycles left\n");
+ return StallCycles + CyclesSaved == OriginalCycles;
+}
+
+bool GCNRegBankReassign::runOnMachineFunction(MachineFunction &MF) {
+ ST = &MF.getSubtarget<GCNSubtarget>();
+ if (!ST->hasRegisterBanking() || skipFunction(MF.getFunction()))
+ return false;
+
+ MRI = &MF.getRegInfo();
+ TRI = ST->getRegisterInfo();
+ MLI = &getAnalysis<MachineLoopInfo>();
+ VRM = &getAnalysis<VirtRegMap>();
+ LRM = &getAnalysis<LiveRegMatrix>();
+ LIS = &getAnalysis<LiveIntervals>();
+
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ unsigned Occupancy = MFI->getOccupancy();
+ MaxNumVGPRs = ST->getMaxNumVGPRs(MF);
+ MaxNumSGPRs = ST->getMaxNumSGPRs(MF);
+ MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(Occupancy), MaxNumVGPRs);
+ MaxNumSGPRs = std::min(ST->getMaxNumSGPRs(Occupancy, true), MaxNumSGPRs);
+
+ CSRegs = MRI->getCalleeSavedRegs();
+
+ RegsUsed.resize(AMDGPU::VGPR_32RegClass.getNumRegs() +
+ TRI->getEncodingValue(AMDGPU::SGPR_NULL) / 2 + 1);
+
+ LLVM_DEBUG(dbgs() << "=== RegBanks reassign analysis on function " << MF.getName()
+ << '\n');
+
+ unsigned StallCycles = collectCandidates(MF);
+ NumStallsDetected += StallCycles;
+
+ LLVM_DEBUG(dbgs() << "=== " << StallCycles << " stall cycles detected in "
+ "function " << MF.getName() << '\n');
+
+ Candidates.sort();
+
+ LLVM_DEBUG(dbgs() << "\nCandidates:\n\n";
+ for (auto C : Candidates) C.dump(this);
+ dbgs() << "\n\n");
+
+ unsigned CyclesSaved = 0;
+ while (!Candidates.empty()) {
+ Candidate C = Candidates.back();
+ unsigned LocalCyclesSaved = tryReassign(C);
+ CyclesSaved += LocalCyclesSaved;
+
+ if (VerifyStallCycles > 1 && !verifyCycles(MF, StallCycles, CyclesSaved))
+ report_fatal_error("RegBank reassign stall cycles verification failed.");
+
+ Candidates.pop_back();
+ if (LocalCyclesSaved) {
+ removeCandidates(C.Reg);
+ computeStallCycles(C.Reg, AMDGPU::NoRegister, -1, true);
+ Candidates.sort();
+
+ LLVM_DEBUG(dbgs() << "\nCandidates:\n\n";
+ for (auto C : Candidates)
+ C.dump(this);
+ dbgs() << "\n\n");
+ }
+ }
+ NumStallsRecovered += CyclesSaved;
+
+ LLVM_DEBUG(dbgs() << "=== After the pass " << CyclesSaved
+ << " cycles saved in function " << MF.getName() << '\n');
+
+ Candidates.clear();
+
+ if (VerifyStallCycles == 1 && !verifyCycles(MF, StallCycles, CyclesSaved))
+ report_fatal_error("RegBank reassign stall cycles verification failed.");
+
+ RegsUsed.clear();
+
+ return CyclesSaved > 0;
+}
diff --git a/lib/Target/AMDGPU/GCNRegPressure.cpp b/lib/Target/AMDGPU/GCNRegPressure.cpp
index 3d8cacc4f02c..39460fbd8a84 100644
--- a/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -1,9 +1,8 @@
//===- GCNRegPressure.cpp -------------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -64,9 +63,10 @@ void llvm::printLivesAt(SlotIndex SI,
}
if (!Num) dbgs() << " <none>\n";
}
+#endif
-static bool isEqual(const GCNRPTracker::LiveRegSet &S1,
- const GCNRPTracker::LiveRegSet &S2) {
+bool llvm::isEqual(const GCNRPTracker::LiveRegSet &S1,
+ const GCNRPTracker::LiveRegSet &S2) {
if (S1.size() != S2.size())
return false;
@@ -77,7 +77,7 @@ static bool isEqual(const GCNRPTracker::LiveRegSet &S1,
}
return true;
}
-#endif
+
///////////////////////////////////////////////////////////////////////////////
// GCNRegPressure
@@ -89,7 +89,9 @@ unsigned GCNRegPressure::getRegKind(unsigned Reg,
auto STI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
return STI->isSGPRClass(RC) ?
(STI->getRegSizeInBits(*RC) == 32 ? SGPR32 : SGPR_TUPLE) :
- (STI->getRegSizeInBits(*RC) == 32 ? VGPR32 : VGPR_TUPLE);
+ STI->hasAGPRs(RC) ?
+ (STI->getRegSizeInBits(*RC) == 32 ? AGPR32 : AGPR_TUPLE) :
+ (STI->getRegSizeInBits(*RC) == 32 ? VGPR32 : VGPR_TUPLE);
}
void GCNRegPressure::inc(unsigned Reg,
@@ -110,16 +112,18 @@ void GCNRegPressure::inc(unsigned Reg,
switch (auto Kind = getRegKind(Reg, MRI)) {
case SGPR32:
case VGPR32:
+ case AGPR32:
assert(PrevMask.none() && NewMask == MaxMask);
Value[Kind] += Sign;
break;
case SGPR_TUPLE:
case VGPR_TUPLE:
+ case AGPR_TUPLE:
assert(NewMask < MaxMask || NewMask == MaxMask);
assert(PrevMask < NewMask);
- Value[Kind == SGPR_TUPLE ? SGPR32 : VGPR32] +=
+ Value[Kind == SGPR_TUPLE ? SGPR32 : Kind == AGPR_TUPLE ? AGPR32 : VGPR32] +=
Sign * (~PrevMask & NewMask).getNumLanes();
if (PrevMask.none()) {
diff --git a/lib/Target/AMDGPU/GCNRegPressure.h b/lib/Target/AMDGPU/GCNRegPressure.h
index 357d3b7b2334..e4894418b943 100644
--- a/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/lib/Target/AMDGPU/GCNRegPressure.h
@@ -1,9 +1,8 @@
//===- GCNRegPressure.h -----------------------------------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -32,6 +31,8 @@ struct GCNRegPressure {
SGPR_TUPLE,
VGPR32,
VGPR_TUPLE,
+ AGPR32,
+ AGPR_TUPLE,
TOTAL_KINDS
};
@@ -44,9 +45,10 @@ struct GCNRegPressure {
void clear() { std::fill(&Value[0], &Value[TOTAL_KINDS], 0); }
unsigned getSGPRNum() const { return Value[SGPR32]; }
- unsigned getVGPRNum() const { return Value[VGPR32]; }
+ unsigned getVGPRNum() const { return std::max(Value[VGPR32], Value[AGPR32]); }
- unsigned getVGPRTuplesWeight() const { return Value[VGPR_TUPLE]; }
+ unsigned getVGPRTuplesWeight() const { return std::max(Value[VGPR_TUPLE],
+ Value[AGPR_TUPLE]); }
unsigned getSGPRTuplesWeight() const { return Value[SGPR_TUPLE]; }
unsigned getOccupancy(const GCNSubtarget &ST) const {
@@ -191,6 +193,50 @@ GCNRPTracker::LiveRegSet getLiveRegs(SlotIndex SI,
const LiveIntervals &LIS,
const MachineRegisterInfo &MRI);
+/// Creates a map MachineInstr -> LiveRegSet.
+/// R - range of iterators over instructions.
+/// After - if true, compute the live set upon exit of each instruction,
+///         otherwise upon entry.
+/// Note: there is no entry in the map for instructions with an empty live
+///       reg set.
+/// Complexity = O(NumVirtRegs * averageLiveRangeSegmentsPerReg * lg(R))
+template <typename Range>
+DenseMap<MachineInstr*, GCNRPTracker::LiveRegSet>
+getLiveRegMap(Range &&R, bool After, LiveIntervals &LIS) {
+ std::vector<SlotIndex> Indexes;
+ Indexes.reserve(std::distance(R.begin(), R.end()));
+ auto &SII = *LIS.getSlotIndexes();
+ for (MachineInstr *I : R) {
+ auto SI = SII.getInstructionIndex(*I);
+ Indexes.push_back(After ? SI.getDeadSlot() : SI.getBaseIndex());
+ }
+ std::sort(Indexes.begin(), Indexes.end());
+
+ auto &MRI = (*R.begin())->getParent()->getParent()->getRegInfo();
+ DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> LiveRegMap;
+ SmallVector<SlotIndex, 32> LiveIdxs, SRLiveIdxs;
+ for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
+ auto Reg = TargetRegisterInfo::index2VirtReg(I);
+ if (!LIS.hasInterval(Reg))
+ continue;
+ auto &LI = LIS.getInterval(Reg);
+ LiveIdxs.clear();
+ if (!LI.findIndexesLiveAt(Indexes, std::back_inserter(LiveIdxs)))
+ continue;
+ if (!LI.hasSubRanges()) {
+ for (auto SI : LiveIdxs)
+ LiveRegMap[SII.getInstructionFromIndex(SI)][Reg] =
+ MRI.getMaxLaneMaskForVReg(Reg);
+ } else
+ for (const auto &S : LI.subranges()) {
+ // constrain search for subranges by indexes live at main range
+ SRLiveIdxs.clear();
+ S.findIndexesLiveAt(LiveIdxs, std::back_inserter(SRLiveIdxs));
+ for (auto SI : SRLiveIdxs)
+ LiveRegMap[SII.getInstructionFromIndex(SI)][Reg] |= S.LaneMask;
+ }
+ }
+ return LiveRegMap;
+}
+
inline GCNRPTracker::LiveRegSet getLiveRegsAfter(const MachineInstr &MI,
const LiveIntervals &LIS) {
return getLiveRegs(LIS.getInstructionIndex(MI).getDeadSlot(), LIS,
@@ -212,6 +258,9 @@ GCNRegPressure getRegPressure(const MachineRegisterInfo &MRI,
return Res;
}
+bool isEqual(const GCNRPTracker::LiveRegSet &S1,
+ const GCNRPTracker::LiveRegSet &S2);
+
void printLivesAt(SlotIndex SI,
const LiveIntervals &LIS,
const MachineRegisterInfo &MRI);
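A hedged usage sketch of the getLiveRegMap helper added above, mirroring how GCNScheduleDAGMILive::getBBLiveInMap uses it further down in this patch; Regions, LIS and the Starters vector are assumed locals of the caller, not part of this header:

  std::vector<MachineInstr *> Starters;
  for (auto &Rgn : Regions)
    Starters.push_back(&*skipDebugInstructionsForward(Rgn.first, Rgn.second));
  auto LiveInMap = getLiveRegMap(Starters, /*After=*/false, *LIS);
  for (MachineInstr *MI : Starters) {
    auto It = LiveInMap.find(MI);       // no entry means nothing is live here
    if (It == LiveInMap.end())
      continue;
    const GCNRPTracker::LiveRegSet &LiveBefore = It->second; // vreg -> lane mask
    (void)LiveBefore;
  }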
diff --git a/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index f09b7f6cff22..4ea990ae490e 100644
--- a/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -1,9 +1,8 @@
//===-- GCNSchedStrategy.cpp - GCN Scheduler Strategy ---------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -446,8 +445,12 @@ void GCNScheduleDAGMILive::computeBlockPressure(const MachineBasicBlock *MBB) {
RPTracker.reset(*MBB->begin(), &LiveIn);
MBBLiveIns.erase(LiveInIt);
} else {
- I = Regions[CurRegion].first;
- RPTracker.reset(*I);
+ auto &Rgn = Regions[CurRegion];
+ I = Rgn.first;
+ auto *NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second);
+ auto LRS = BBLiveInMap.lookup(NonDbgMI);
+ assert(isEqual(getLiveRegsBefore(*NonDbgMI, *LIS), LRS));
+ RPTracker.reset(*I, &LRS);
}
for ( ; ; ) {
@@ -478,6 +481,23 @@ void GCNScheduleDAGMILive::computeBlockPressure(const MachineBasicBlock *MBB) {
}
}
+DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet>
+GCNScheduleDAGMILive::getBBLiveInMap() const {
+ assert(!Regions.empty());
+ std::vector<MachineInstr *> BBStarters;
+ BBStarters.reserve(Regions.size());
+ auto I = Regions.rbegin(), E = Regions.rend();
+ auto *BB = I->first->getParent();
+ do {
+ auto *MI = &*skipDebugInstructionsForward(I->first, I->second);
+ BBStarters.push_back(MI);
+ do {
+ ++I;
+ } while (I != E && I->first->getParent() == BB);
+ } while (I != E);
+ return getLiveRegMap(BBStarters, false /*After*/, *LIS);
+}
+
void GCNScheduleDAGMILive::finalizeSchedule() {
GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl;
LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");
@@ -485,6 +505,9 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
LiveIns.resize(Regions.size());
Pressure.resize(Regions.size());
+ if (!Regions.empty())
+ BBLiveInMap = getBBLiveInMap();
+
do {
Stage++;
RegionIdx = 0;
diff --git a/lib/Target/AMDGPU/GCNSchedStrategy.h b/lib/Target/AMDGPU/GCNSchedStrategy.h
index 3ac6af89cb9b..eaf3dee9ba5d 100644
--- a/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -1,9 +1,8 @@
//===-- GCNSchedStrategy.h - GCN Scheduler Strategy -*- C++ -*-------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -27,7 +26,7 @@ class GCNSubtarget;
/// and the GenericScheduler is that GCNSchedStrategy uses different
/// heuristics to determine excess/critical pressure sets. Its goal is to
/// maximize kernel occupancy (i.e. maximum number of waves per simd).
-class GCNMaxOccupancySchedStrategy : public GenericScheduler {
+class GCNMaxOccupancySchedStrategy final : public GenericScheduler {
friend class GCNScheduleDAGMILive;
SUnit *pickNodeBidirectional(bool &IsTopNode);
@@ -60,7 +59,7 @@ public:
void setTargetOccupancy(unsigned Occ) { TargetOccupancy = Occ; }
};
-class GCNScheduleDAGMILive : public ScheduleDAGMILive {
+class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
const GCNSubtarget &ST;
@@ -78,7 +77,7 @@ class GCNScheduleDAGMILive : public ScheduleDAGMILive {
// Current region index.
size_t RegionIdx;
- // Vecor of regions recorder for later rescheduling
+ // Vector of regions recorded for later rescheduling
SmallVector<std::pair<MachineBasicBlock::iterator,
MachineBasicBlock::iterator>, 32> Regions;
@@ -91,6 +90,9 @@ class GCNScheduleDAGMILive : public ScheduleDAGMILive {
// Temporary basic block live-in cache.
DenseMap<const MachineBasicBlock*, GCNRPTracker::LiveRegSet> MBBLiveIns;
+ DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> BBLiveInMap;
+ DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> getBBLiveInMap() const;
+
// Return current region pressure.
GCNRegPressure getRealRegPressure() const;
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
index abc88c02adca..57c0ba26cc3a 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
@@ -1,9 +1,8 @@
//===-- AMDGPUAsmBackend.cpp - AMDGPU Assembler Backend -------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
/// \file
//===----------------------------------------------------------------------===//
@@ -19,8 +18,10 @@
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCValue.h"
#include "llvm/Support/TargetRegistry.h"
+#include "Utils/AMDGPUBaseInfo.h"
using namespace llvm;
+using namespace llvm::AMDGPU;
namespace {
@@ -36,17 +37,13 @@ public:
const MCSubtargetInfo *STI) const override;
bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
const MCRelaxableFragment *DF,
- const MCAsmLayout &Layout) const override {
- return false;
- }
+ const MCAsmLayout &Layout) const override;
+
void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
- MCInst &Res) const override {
- llvm_unreachable("Not implemented");
- }
+ MCInst &Res) const override;
+
bool mayNeedRelaxation(const MCInst &Inst,
- const MCSubtargetInfo &STI) const override {
- return false;
- }
+ const MCSubtargetInfo &STI) const override;
unsigned getMinimumNopSize() const override;
bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
@@ -56,6 +53,36 @@ public:
} //End anonymous namespace
+void AMDGPUAsmBackend::relaxInstruction(const MCInst &Inst,
+ const MCSubtargetInfo &STI,
+ MCInst &Res) const {
+ unsigned RelaxedOpcode = AMDGPU::getSOPPWithRelaxation(Inst.getOpcode());
+ Res.setOpcode(RelaxedOpcode);
+ Res.addOperand(Inst.getOperand(0));
+ return;
+}
+
+bool AMDGPUAsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup,
+ uint64_t Value,
+ const MCRelaxableFragment *DF,
+ const MCAsmLayout &Layout) const {
+ // If the branch target has an offset of 0x3f, it needs to be relaxed by
+ // adding an s_nop 0 immediately after the branch to effectively increment
+ // the offset, as a hardware workaround for gfx1010.
+ return (((int64_t(Value)/4)-1) == 0x3f);
+}
+
+bool AMDGPUAsmBackend::mayNeedRelaxation(const MCInst &Inst,
+ const MCSubtargetInfo &STI) const {
+ if (!STI.getFeatureBits()[AMDGPU::FeatureOffset3fBug])
+ return false;
+
+ if (AMDGPU::getSOPPWithRelaxation(Inst.getOpcode()) >= 0)
+ return true;
+
+ return false;
+}
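A self-contained illustration of the relaxation trigger above; it mirrors the arithmetic of fixupNeedsRelaxation but is not part of the backend API, and the byte values below are hypothetical fixup values:

  // A fixup value of 256 bytes encodes to exactly 0x3f and trips the gfx1010
  // workaround; 252 bytes does not.
  bool hitsGfx1010Offset3f(int64_t FixupValue) {
    return ((FixupValue / 4) - 1) == 0x3f;
  }
  // hitsGfx1010Offset3f(256) == true, hitsGfx1010Offset3f(252) == false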
+
static unsigned getFixupKindNumBytes(unsigned Kind) {
switch (Kind) {
case AMDGPU::fixup_si_sopp_br:
@@ -173,11 +200,13 @@ class ELFAMDGPUAsmBackend : public AMDGPUAsmBackend {
bool Is64Bit;
bool HasRelocationAddend;
uint8_t OSABI = ELF::ELFOSABI_NONE;
+ uint8_t ABIVersion = 0;
public:
- ELFAMDGPUAsmBackend(const Target &T, const Triple &TT) :
+ ELFAMDGPUAsmBackend(const Target &T, const Triple &TT, uint8_t ABIVersion) :
AMDGPUAsmBackend(T), Is64Bit(TT.getArch() == Triple::amdgcn),
- HasRelocationAddend(TT.getOS() == Triple::AMDHSA) {
+ HasRelocationAddend(TT.getOS() == Triple::AMDHSA),
+ ABIVersion(ABIVersion) {
switch (TT.getOS()) {
case Triple::AMDHSA:
OSABI = ELF::ELFOSABI_AMDGPU_HSA;
@@ -195,7 +224,8 @@ public:
std::unique_ptr<MCObjectTargetWriter>
createObjectTargetWriter() const override {
- return createAMDGPUELFObjectWriter(Is64Bit, OSABI, HasRelocationAddend);
+ return createAMDGPUELFObjectWriter(Is64Bit, OSABI, HasRelocationAddend,
+ ABIVersion);
}
};
@@ -206,5 +236,6 @@ MCAsmBackend *llvm::createAMDGPUAsmBackend(const Target &T,
const MCRegisterInfo &MRI,
const MCTargetOptions &Options) {
// Use 64-bit ELF for amdgcn
- return new ELFAMDGPUAsmBackend(T, STI.getTargetTriple());
+ return new ELFAMDGPUAsmBackend(T, STI.getTargetTriple(),
+ IsaInfo::hasCodeObjectV3(&STI) ? 1 : 0);
}
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
index c85a1ea5b054..6549a8d7d592 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
@@ -1,9 +1,8 @@
//===- AMDGPUELFObjectWriter.cpp - AMDGPU ELF Writer ----------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -23,7 +22,8 @@ namespace {
class AMDGPUELFObjectWriter : public MCELFObjectTargetWriter {
public:
- AMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI, bool HasRelocationAddend);
+ AMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI, bool HasRelocationAddend,
+ uint8_t ABIVersion);
protected:
unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
@@ -35,9 +35,10 @@ protected:
AMDGPUELFObjectWriter::AMDGPUELFObjectWriter(bool Is64Bit,
uint8_t OSABI,
- bool HasRelocationAddend)
+ bool HasRelocationAddend,
+ uint8_t ABIVersion)
: MCELFObjectTargetWriter(Is64Bit, OSABI, ELF::EM_AMDGPU,
- HasRelocationAddend) {}
+ HasRelocationAddend, ABIVersion) {}
unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx,
const MCValue &Target,
@@ -84,7 +85,9 @@ unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx,
std::unique_ptr<MCObjectTargetWriter>
llvm::createAMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI,
- bool HasRelocationAddend) {
+ bool HasRelocationAddend,
+ uint8_t ABIVersion) {
return llvm::make_unique<AMDGPUELFObjectWriter>(Is64Bit, OSABI,
- HasRelocationAddend);
+ HasRelocationAddend,
+ ABIVersion);
}
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp
index c627a08e7463..40437d8fa1a4 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp
@@ -1,9 +1,8 @@
//===-------- AMDGPUELFStreamer.cpp - ELF Object Output -------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h
index 41e9063a759e..9fbf53c944ef 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h
@@ -1,9 +1,8 @@
//===-------- AMDGPUELFStreamer.h - ELF Object Output -----------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h
index 20c1adfbc6b9..d49bb196ab3a 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h
@@ -1,9 +1,8 @@
//===-- AMDGPUFixupKinds.h - AMDGPU Specific Fixup Entries ------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index fab0f87dfcbe..01b53432cbb7 100644
--- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -1,9 +1,8 @@
//===-- AMDGPUInstPrinter.cpp - AMDGPU MC Inst -> ASM ---------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// \file
//===----------------------------------------------------------------------===//
@@ -72,11 +71,6 @@ void AMDGPUInstPrinter::printU16ImmDecOperand(const MCInst *MI, unsigned OpNo,
O << formatDec(MI->getOperand(OpNo).getImm() & 0xffff);
}
-void AMDGPUInstPrinter::printS13ImmDecOperand(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
- O << formatDec(SignExtend32<13>(MI->getOperand(OpNo).getImm()));
-}
-
void AMDGPUInstPrinter::printU32ImmOperand(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
@@ -123,13 +117,25 @@ void AMDGPUInstPrinter::printOffset(const MCInst *MI, unsigned OpNo,
}
}
-void AMDGPUInstPrinter::printOffsetS13(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI,
- raw_ostream &O) {
+void AMDGPUInstPrinter::printFlatOffset(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
uint16_t Imm = MI->getOperand(OpNo).getImm();
if (Imm != 0) {
O << ((OpNo == 0)? "offset:" : " offset:");
- printS13ImmDecOperand(MI, OpNo, O);
+
+ const MCInstrDesc &Desc = MII.get(MI->getOpcode());
+ bool IsFlatSeg = !(Desc.TSFlags & SIInstrFlags::IsNonFlatSeg);
+
+ if (IsFlatSeg) { // Unsigned offset
+ printU16ImmDecOperand(MI, OpNo, O);
+ } else { // Signed offset
+ if (AMDGPU::isGFX10(STI)) {
+ O << formatDec(SignExtend32<12>(MI->getOperand(OpNo).getImm()));
+ } else {
+ O << formatDec(SignExtend32<13>(MI->getOperand(OpNo).getImm()));
+ }
+ }
}
}
@@ -174,6 +180,12 @@ void AMDGPUInstPrinter::printGDS(const MCInst *MI, unsigned OpNo,
printNamedBit(MI, OpNo, O, "gds");
}
+void AMDGPUInstPrinter::printDLC(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O) {
+ if (AMDGPU::isGFX10(STI))
+ printNamedBit(MI, OpNo, O, "dlc");
+}
+
void AMDGPUInstPrinter::printGLC(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O) {
printNamedBit(MI, OpNo, O, "glc");
@@ -197,6 +209,18 @@ void AMDGPUInstPrinter::printDMask(const MCInst *MI, unsigned OpNo,
}
}
+void AMDGPUInstPrinter::printDim(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O) {
+ unsigned Dim = MI->getOperand(OpNo).getImm();
+ O << " dim:SQ_RSRC_IMG_";
+
+ const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfoByEncoding(Dim);
+ if (DimInfo)
+ O << DimInfo->AsmSuffix;
+ else
+ O << Dim;
+}
+
void AMDGPUInstPrinter::printUNorm(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O) {
printNamedBit(MI, OpNo, O, "unorm");
@@ -243,140 +267,96 @@ void AMDGPUInstPrinter::printFORMAT(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
if (unsigned Val = MI->getOperand(OpNo).getImm()) {
- O << " dfmt:" << (Val & 15);
- O << ", nfmt:" << (Val >> 4);
+ if (AMDGPU::isGFX10(STI))
+ O << " format:" << Val;
+ else {
+ O << " dfmt:" << (Val & 15);
+ O << ", nfmt:" << (Val >> 4);
+ }
}
}
void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O,
const MCRegisterInfo &MRI) {
+#if !defined(NDEBUG)
switch (RegNo) {
- case AMDGPU::VCC:
- O << "vcc";
- return;
- case AMDGPU::SCC:
- O << "scc";
- return;
- case AMDGPU::EXEC:
- O << "exec";
- return;
- case AMDGPU::M0:
- O << "m0";
- return;
- case AMDGPU::FLAT_SCR:
- O << "flat_scratch";
- return;
- case AMDGPU::XNACK_MASK:
- O << "xnack_mask";
- return;
- case AMDGPU::VCC_LO:
- O << "vcc_lo";
- return;
- case AMDGPU::VCC_HI:
- O << "vcc_hi";
- return;
- case AMDGPU::TBA_LO:
- O << "tba_lo";
- return;
- case AMDGPU::TBA_HI:
- O << "tba_hi";
- return;
- case AMDGPU::TMA_LO:
- O << "tma_lo";
- return;
- case AMDGPU::TMA_HI:
- O << "tma_hi";
- return;
- case AMDGPU::EXEC_LO:
- O << "exec_lo";
- return;
- case AMDGPU::EXEC_HI:
- O << "exec_hi";
- return;
- case AMDGPU::FLAT_SCR_LO:
- O << "flat_scratch_lo";
- return;
- case AMDGPU::FLAT_SCR_HI:
- O << "flat_scratch_hi";
- return;
- case AMDGPU::XNACK_MASK_LO:
- O << "xnack_mask_lo";
- return;
- case AMDGPU::XNACK_MASK_HI:
- O << "xnack_mask_hi";
- return;
case AMDGPU::FP_REG:
case AMDGPU::SP_REG:
case AMDGPU::SCRATCH_WAVE_OFFSET_REG:
case AMDGPU::PRIVATE_RSRC_REG:
llvm_unreachable("pseudo-register should not ever be emitted");
+ case AMDGPU::SCC:
+ llvm_unreachable("pseudo scc should not ever be emitted");
default:
break;
}
-
- // The low 8 bits of the encoding value is the register index, for both VGPRs
- // and SGPRs.
- unsigned RegIdx = MRI.getEncodingValue(RegNo) & ((1 << 8) - 1);
-
- unsigned NumRegs;
- if (MRI.getRegClass(AMDGPU::VGPR_32RegClassID).contains(RegNo)) {
- O << 'v';
- NumRegs = 1;
- } else if (MRI.getRegClass(AMDGPU::SGPR_32RegClassID).contains(RegNo)) {
- O << 's';
- NumRegs = 1;
- } else if (MRI.getRegClass(AMDGPU::VReg_64RegClassID).contains(RegNo)) {
- O <<'v';
- NumRegs = 2;
- } else if (MRI.getRegClass(AMDGPU::SGPR_64RegClassID).contains(RegNo)) {
- O << 's';
- NumRegs = 2;
- } else if (MRI.getRegClass(AMDGPU::VReg_128RegClassID).contains(RegNo)) {
- O << 'v';
- NumRegs = 4;
- } else if (MRI.getRegClass(AMDGPU::SGPR_128RegClassID).contains(RegNo)) {
- O << 's';
- NumRegs = 4;
- } else if (MRI.getRegClass(AMDGPU::VReg_96RegClassID).contains(RegNo)) {
- O << 'v';
- NumRegs = 3;
- } else if (MRI.getRegClass(AMDGPU::VReg_256RegClassID).contains(RegNo)) {
- O << 'v';
- NumRegs = 8;
- } else if (MRI.getRegClass(AMDGPU::SGPR_256RegClassID).contains(RegNo)) {
- O << 's';
- NumRegs = 8;
- } else if (MRI.getRegClass(AMDGPU::VReg_512RegClassID).contains(RegNo)) {
- O << 'v';
- NumRegs = 16;
- } else if (MRI.getRegClass(AMDGPU::SGPR_512RegClassID).contains(RegNo)) {
- O << 's';
- NumRegs = 16;
- } else {
- O << getRegisterName(RegNo);
- return;
- }
-
- if (NumRegs == 1) {
- O << RegIdx;
- return;
- }
-
- O << '[' << RegIdx << ':' << (RegIdx + NumRegs - 1) << ']';
+#endif
+
+ unsigned AltName = AMDGPU::Reg32;
+
+ if (MRI.getRegClass(AMDGPU::VReg_64RegClassID).contains(RegNo) ||
+ MRI.getRegClass(AMDGPU::SGPR_64RegClassID).contains(RegNo) ||
+ MRI.getRegClass(AMDGPU::AReg_64RegClassID).contains(RegNo))
+ AltName = AMDGPU::Reg64;
+ else if (MRI.getRegClass(AMDGPU::VReg_128RegClassID).contains(RegNo) ||
+ MRI.getRegClass(AMDGPU::SGPR_128RegClassID).contains(RegNo) ||
+ MRI.getRegClass(AMDGPU::AReg_128RegClassID).contains(RegNo))
+ AltName = AMDGPU::Reg128;
+ else if (MRI.getRegClass(AMDGPU::VReg_96RegClassID).contains(RegNo) ||
+ MRI.getRegClass(AMDGPU::SReg_96RegClassID).contains(RegNo))
+ AltName = AMDGPU::Reg96;
+ else if (MRI.getRegClass(AMDGPU::VReg_160RegClassID).contains(RegNo) ||
+ MRI.getRegClass(AMDGPU::SReg_160RegClassID).contains(RegNo))
+ AltName = AMDGPU::Reg160;
+ else if (MRI.getRegClass(AMDGPU::VReg_256RegClassID).contains(RegNo) ||
+ MRI.getRegClass(AMDGPU::SGPR_256RegClassID).contains(RegNo))
+ AltName = AMDGPU::Reg256;
+ else if (MRI.getRegClass(AMDGPU::VReg_512RegClassID).contains(RegNo) ||
+ MRI.getRegClass(AMDGPU::SGPR_512RegClassID).contains(RegNo) ||
+ MRI.getRegClass(AMDGPU::AReg_512RegClassID).contains(RegNo))
+ AltName = AMDGPU::Reg512;
+ else if (MRI.getRegClass(AMDGPU::VReg_1024RegClassID).contains(RegNo) ||
+ MRI.getRegClass(AMDGPU::SReg_1024RegClassID).contains(RegNo) ||
+ MRI.getRegClass(AMDGPU::AReg_1024RegClassID).contains(RegNo))
+ AltName = AMDGPU::Reg1024;
+
+ O << getRegisterName(RegNo, AltName);
}
void AMDGPUInstPrinter::printVOPDst(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O) {
- if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::VOP3)
- O << "_e64 ";
- else if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::DPP)
- O << "_dpp ";
- else if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::SDWA)
- O << "_sdwa ";
- else
- O << "_e32 ";
+ if (OpNo == 0) {
+ if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::VOP3)
+ O << "_e64 ";
+ else if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::DPP)
+ O << "_dpp ";
+ else if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::SDWA)
+ O << "_sdwa ";
+ else
+ O << "_e32 ";
+ }
printOperand(MI, OpNo, STI, O);
+
+ // Print default vcc/vcc_lo operand.
+ switch (MI->getOpcode()) {
+ default: break;
+
+ case AMDGPU::V_ADD_CO_CI_U32_e32_gfx10:
+ case AMDGPU::V_SUB_CO_CI_U32_e32_gfx10:
+ case AMDGPU::V_SUBREV_CO_CI_U32_e32_gfx10:
+ case AMDGPU::V_ADD_CO_CI_U32_sdwa_gfx10:
+ case AMDGPU::V_SUB_CO_CI_U32_sdwa_gfx10:
+ case AMDGPU::V_SUBREV_CO_CI_U32_sdwa_gfx10:
+ case AMDGPU::V_ADD_CO_CI_U32_dpp_gfx10:
+ case AMDGPU::V_SUB_CO_CI_U32_dpp_gfx10:
+ case AMDGPU::V_SUBREV_CO_CI_U32_dpp_gfx10:
+ case AMDGPU::V_ADD_CO_CI_U32_dpp8_gfx10:
+ case AMDGPU::V_SUB_CO_CI_U32_dpp8_gfx10:
+ case AMDGPU::V_SUBREV_CO_CI_U32_dpp8_gfx10:
+ printDefaultVccOperand(1, STI, O);
+ break;
+ }
}
void AMDGPUInstPrinter::printVINTRPDst(const MCInst *MI, unsigned OpNo,
@@ -491,7 +471,7 @@ void AMDGPUInstPrinter::printImmediate64(uint64_t Imm,
O << "-4.0";
else if (Imm == 0x3fc45f306dc9c882 &&
STI.getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm])
- O << "0.15915494";
+ O << "0.15915494309189532";
else {
assert(isUInt<32>(Imm) || Imm == 0x3fc45f306dc9c882);
@@ -501,9 +481,57 @@ void AMDGPUInstPrinter::printImmediate64(uint64_t Imm,
}
}
+void AMDGPUInstPrinter::printBLGP(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ unsigned Imm = MI->getOperand(OpNo).getImm();
+ if (!Imm)
+ return;
+
+ O << " blgp:" << Imm;
+}
+
+void AMDGPUInstPrinter::printCBSZ(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ unsigned Imm = MI->getOperand(OpNo).getImm();
+ if (!Imm)
+ return;
+
+ O << " cbsz:" << Imm;
+}
+
+void AMDGPUInstPrinter::printABID(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ unsigned Imm = MI->getOperand(OpNo).getImm();
+ if (!Imm)
+ return;
+
+ O << " abid:" << Imm;
+}
+
+void AMDGPUInstPrinter::printDefaultVccOperand(unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ if (OpNo > 0)
+ O << ", ";
+ printRegOperand(STI.getFeatureBits()[AMDGPU::FeatureWavefrontSize64] ?
+ AMDGPU::VCC : AMDGPU::VCC_LO, O, MRI);
+ if (OpNo == 0)
+ O << ", ";
+}
+
void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
+ // Print default vcc/vcc_lo operand of VOPC.
+ const MCInstrDesc &Desc = MII.get(MI->getOpcode());
+ if (OpNo == 0 && (Desc.TSFlags & SIInstrFlags::VOPC) &&
+ (Desc.hasImplicitDefOfPhysReg(AMDGPU::VCC) ||
+ Desc.hasImplicitDefOfPhysReg(AMDGPU::VCC_LO)))
+ printDefaultVccOperand(OpNo, STI, O);
+
if (OpNo >= MI->getNumOperands()) {
O << "/*Missing OP" << OpNo << "*/";
return;
@@ -513,12 +541,13 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
if (Op.isReg()) {
printRegOperand(Op.getReg(), O, MRI);
} else if (Op.isImm()) {
- const MCInstrDesc &Desc = MII.get(MI->getOpcode());
switch (Desc.OpInfo[OpNo].OperandType) {
case AMDGPU::OPERAND_REG_IMM_INT32:
case AMDGPU::OPERAND_REG_IMM_FP32:
case AMDGPU::OPERAND_REG_INLINE_C_INT32:
case AMDGPU::OPERAND_REG_INLINE_C_FP32:
+ case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
case MCOI::OPERAND_IMMEDIATE:
printImmediate32(Op.getImm(), STI, O);
break;
@@ -530,12 +559,24 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
break;
case AMDGPU::OPERAND_REG_INLINE_C_INT16:
case AMDGPU::OPERAND_REG_INLINE_C_FP16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
case AMDGPU::OPERAND_REG_IMM_INT16:
case AMDGPU::OPERAND_REG_IMM_FP16:
printImmediate16(Op.getImm(), STI, O);
break;
+ case AMDGPU::OPERAND_REG_IMM_V2INT16:
+ case AMDGPU::OPERAND_REG_IMM_V2FP16:
+ if (!isUInt<16>(Op.getImm()) &&
+ STI.getFeatureBits()[AMDGPU::FeatureVOP3Literal]) {
+ printImmediate32(Op.getImm(), STI, O);
+ break;
+ }
+ LLVM_FALLTHROUGH;
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
printImmediateV216(Op.getImm(), STI, O);
break;
case MCOI::OPERAND_UNKNOWN:
@@ -573,6 +614,29 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
} else {
O << "/*INV_OP*/";
}
+
+ // Print the default vcc/vcc_lo operand of v_cndmask_b32_e32 and VOP2b.
+ switch (MI->getOpcode()) {
+ default: break;
+
+ case AMDGPU::V_CNDMASK_B32_e32_gfx10:
+ case AMDGPU::V_ADD_CO_CI_U32_e32_gfx10:
+ case AMDGPU::V_SUB_CO_CI_U32_e32_gfx10:
+ case AMDGPU::V_SUBREV_CO_CI_U32_e32_gfx10:
+ case AMDGPU::V_ADD_CO_CI_U32_dpp_gfx10:
+ case AMDGPU::V_SUB_CO_CI_U32_dpp_gfx10:
+ case AMDGPU::V_SUBREV_CO_CI_U32_dpp_gfx10:
+ case AMDGPU::V_ADD_CO_CI_U32_dpp8_gfx10:
+ case AMDGPU::V_SUB_CO_CI_U32_dpp8_gfx10:
+ case AMDGPU::V_SUBREV_CO_CI_U32_dpp8_gfx10:
+
+ case AMDGPU::V_CNDMASK_B32_e32_gfx6_gfx7:
+ case AMDGPU::V_CNDMASK_B32_e32_vi:
+ if ((int)OpNo == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+ AMDGPU::OpName::src1))
+ printDefaultVccOperand(OpNo, STI, O);
+ break;
+ }
}
void AMDGPUInstPrinter::printOperandAndFPInputMods(const MCInst *MI,
@@ -620,6 +684,33 @@ void AMDGPUInstPrinter::printOperandAndIntInputMods(const MCInst *MI,
printOperand(MI, OpNo + 1, STI, O);
if (InputModifiers & SISrcMods::SEXT)
O << ')';
+
+ // Print default vcc/vcc_lo operand of VOP2b.
+ switch (MI->getOpcode()) {
+ default: break;
+
+ case AMDGPU::V_ADD_CO_CI_U32_sdwa_gfx10:
+ case AMDGPU::V_SUB_CO_CI_U32_sdwa_gfx10:
+ case AMDGPU::V_SUBREV_CO_CI_U32_sdwa_gfx10:
+ if ((int)OpNo + 1 == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+ AMDGPU::OpName::src1))
+ printDefaultVccOperand(OpNo, STI, O);
+ break;
+ }
+}
+
+void AMDGPUInstPrinter::printDPP8(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ if (!AMDGPU::isGFX10(STI))
+ llvm_unreachable("dpp8 is not supported on ASICs earlier than GFX10");
+
+ unsigned Imm = MI->getOperand(OpNo).getImm();
+ O << " dpp8:[" << formatDec(Imm & 0x7);
+ for (size_t i = 1; i < 8; ++i) {
+ O << ',' << formatDec((Imm >> (3 * i)) & 0x7);
+ }
+ O << ']';
}
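A round-trip sketch of the dpp8 selector layout that printDPP8 above decodes: eight 3-bit lane selectors packed least-significant first into the immediate. The encoder below is illustrative only, not an LLVM API:

  unsigned encodeDpp8(const unsigned (&Sel)[8]) {
    unsigned Imm = 0;
    for (unsigned I = 0; I < 8; ++I)
      Imm |= (Sel[I] & 0x7) << (3 * I);
    return Imm; // printDPP8 renders this back as dpp8:[Sel[0],...,Sel[7]]
  }
  // The identity selection {0,1,2,3,4,5,6,7} prints as dpp8:[0,1,2,3,4,5,6,7].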
void AMDGPUInstPrinter::printDPPCtrl(const MCInst *MI, unsigned OpNo,
@@ -647,21 +738,61 @@ void AMDGPUInstPrinter::printDPPCtrl(const MCInst *MI, unsigned OpNo,
O << " row_ror:";
printU4ImmDecOperand(MI, OpNo, O);
} else if (Imm == DppCtrl::WAVE_SHL1) {
+ if (!AMDGPU::isVI(STI) && !AMDGPU::isGFX9(STI)) {
+ O << " /* wave_shl is not supported starting from GFX10 */";
+ return;
+ }
O << " wave_shl:1";
} else if (Imm == DppCtrl::WAVE_ROL1) {
+ if (!AMDGPU::isVI(STI) && !AMDGPU::isGFX9(STI)) {
+ O << " /* wave_rol is not supported starting from GFX10 */";
+ return;
+ }
O << " wave_rol:1";
} else if (Imm == DppCtrl::WAVE_SHR1) {
+ if (!AMDGPU::isVI(STI) && !AMDGPU::isGFX9(STI)) {
+ O << " /* wave_shr is not supported starting from GFX10 */";
+ return;
+ }
O << " wave_shr:1";
} else if (Imm == DppCtrl::WAVE_ROR1) {
+ if (!AMDGPU::isVI(STI) && !AMDGPU::isGFX9(STI)) {
+ O << " /* wave_ror is not supported starting from GFX10 */";
+ return;
+ }
O << " wave_ror:1";
} else if (Imm == DppCtrl::ROW_MIRROR) {
O << " row_mirror";
} else if (Imm == DppCtrl::ROW_HALF_MIRROR) {
O << " row_half_mirror";
} else if (Imm == DppCtrl::BCAST15) {
+ if (!AMDGPU::isVI(STI) && !AMDGPU::isGFX9(STI)) {
+ O << " /* row_bcast is not supported starting from GFX10 */";
+ return;
+ }
O << " row_bcast:15";
} else if (Imm == DppCtrl::BCAST31) {
+ if (!AMDGPU::isVI(STI) && !AMDGPU::isGFX9(STI)) {
+ O << " /* row_bcast is not supported starting from GFX10 */";
+ return;
+ }
O << " row_bcast:31";
+ } else if ((Imm >= DppCtrl::ROW_SHARE_FIRST) &&
+ (Imm <= DppCtrl::ROW_SHARE_LAST)) {
+ if (!AMDGPU::isGFX10(STI)) {
+ O << " /* row_share is not supported on ASICs earlier than GFX10 */";
+ return;
+ }
+ O << " row_share:";
+ printU4ImmDecOperand(MI, OpNo, O);
+ } else if ((Imm >= DppCtrl::ROW_XMASK_FIRST) &&
+ (Imm <= DppCtrl::ROW_XMASK_LAST)) {
+ if (!AMDGPU::isGFX10(STI)) {
+ O << " /* row_xmask is not supported on ASICs earlier than GFX10 */";
+ return;
+ }
+ O << "row_xmask:";
+ printU4ImmDecOperand(MI, OpNo, O);
} else {
O << " /* Invalid dpp_ctrl value */";
}
@@ -690,6 +821,16 @@ void AMDGPUInstPrinter::printBoundCtrl(const MCInst *MI, unsigned OpNo,
}
}
+void AMDGPUInstPrinter::printFI(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ using namespace llvm::AMDGPU::DPP;
+ unsigned Imm = MI->getOperand(OpNo).getImm();
+ if (Imm == DPP_FI_1 || Imm == DPP8_FI_1) {
+ O << " fi:1";
+ }
+}
+
void AMDGPUInstPrinter::printSDWASel(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
using namespace llvm::AMDGPU::SDWA;
@@ -803,8 +944,10 @@ void AMDGPUInstPrinter::printExpTgt(const MCInst *MI, unsigned OpNo,
O << " mrtz";
else if (Tgt == 9)
O << " null";
- else if (Tgt >= 12 && Tgt <= 15)
+ else if ((Tgt >= 12 && Tgt <= 15) || (Tgt == 16 && AMDGPU::isGFX10(STI)))
O << " pos" << Tgt - 12;
+ else if (AMDGPU::isGFX10(STI) && Tgt == 20)
+ O << " prim";
else if (Tgt >= 32 && Tgt <= 63)
O << " param" << Tgt - 32;
else {
@@ -875,6 +1018,18 @@ void AMDGPUInstPrinter::printPackedModifier(const MCInst *MI,
void AMDGPUInstPrinter::printOpSel(const MCInst *MI, unsigned,
const MCSubtargetInfo &STI,
raw_ostream &O) {
+ unsigned Opc = MI->getOpcode();
+ if (Opc == AMDGPU::V_PERMLANE16_B32_gfx10 ||
+ Opc == AMDGPU::V_PERMLANEX16_B32_gfx10) {
+ auto FIN = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
+ auto BCN = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
+ unsigned FI = !!(MI->getOperand(FIN).getImm() & SISrcMods::OP_SEL_0);
+ unsigned BC = !!(MI->getOperand(BCN).getImm() & SISrcMods::OP_SEL_0);
+ if (FI || BC)
+ O << " op_sel:[" << FI << ',' << BC << ']';
+ return;
+ }
+
printPackedModifier(MI, " op_sel:[", SISrcMods::OP_SEL_0, O);
}
@@ -932,23 +1087,24 @@ void AMDGPUInstPrinter::printInterpAttrChan(const MCInst *MI, unsigned OpNum,
void AMDGPUInstPrinter::printVGPRIndexMode(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
+ using namespace llvm::AMDGPU::VGPRIndexMode;
unsigned Val = MI->getOperand(OpNo).getImm();
- if (Val == 0) {
- O << " 0";
- return;
- }
-
- if (Val & VGPRIndexMode::DST_ENABLE)
- O << " dst";
-
- if (Val & VGPRIndexMode::SRC0_ENABLE)
- O << " src0";
- if (Val & VGPRIndexMode::SRC1_ENABLE)
- O << " src1";
-
- if (Val & VGPRIndexMode::SRC2_ENABLE)
- O << " src2";
+ if ((Val & ~ENABLE_MASK) != 0) {
+ O << " " << formatHex(static_cast<uint64_t>(Val));
+ } else {
+ O << " gpr_idx(";
+ bool NeedComma = false;
+ for (unsigned ModeId = ID_MIN; ModeId <= ID_MAX; ++ModeId) {
+ if (Val & (1 << ModeId)) {
+ if (NeedComma)
+ O << ',';
+ O << IdSymbolic[ModeId];
+ NeedComma = true;
+ }
+ }
+ O << ')';
+ }
}
void AMDGPUInstPrinter::printMemOperand(const MCInst *MI, unsigned OpNo,
@@ -1010,40 +1166,29 @@ void AMDGPUInstPrinter::printSendMsg(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
using namespace llvm::AMDGPU::SendMsg;
- const unsigned SImm16 = MI->getOperand(OpNo).getImm();
- const unsigned Id = SImm16 & ID_MASK_;
- do {
- if (Id == ID_INTERRUPT) {
- if ((SImm16 & ~ID_MASK_) != 0) // Unused/unknown bits must be 0.
- break;
- O << "sendmsg(" << IdSymbolic[Id] << ')';
- return;
- }
- if (Id == ID_GS || Id == ID_GS_DONE) {
- if ((SImm16 & ~(ID_MASK_|OP_GS_MASK_|STREAM_ID_MASK_)) != 0) // Unused/unknown bits must be 0.
- break;
- const unsigned OpGs = (SImm16 & OP_GS_MASK_) >> OP_SHIFT_;
- const unsigned StreamId = (SImm16 & STREAM_ID_MASK_) >> STREAM_ID_SHIFT_;
- if (OpGs == OP_GS_NOP && Id != ID_GS_DONE) // NOP to be used for GS_DONE only.
- break;
- if (OpGs == OP_GS_NOP && StreamId != 0) // NOP does not use/define stream id bits.
- break;
- O << "sendmsg(" << IdSymbolic[Id] << ", " << OpGsSymbolic[OpGs];
- if (OpGs != OP_GS_NOP) { O << ", " << StreamId; }
- O << ')';
- return;
- }
- if (Id == ID_SYSMSG) {
- if ((SImm16 & ~(ID_MASK_|OP_SYS_MASK_)) != 0) // Unused/unknown bits must be 0.
- break;
- const unsigned OpSys = (SImm16 & OP_SYS_MASK_) >> OP_SHIFT_;
- if (! (OP_SYS_FIRST_ <= OpSys && OpSys < OP_SYS_LAST_)) // Unused/unknown.
- break;
- O << "sendmsg(" << IdSymbolic[Id] << ", " << OpSysSymbolic[OpSys] << ')';
- return;
+ const unsigned Imm16 = MI->getOperand(OpNo).getImm();
+
+ uint16_t MsgId;
+ uint16_t OpId;
+ uint16_t StreamId;
+ decodeMsg(Imm16, MsgId, OpId, StreamId);
+
+ if (isValidMsgId(MsgId, STI) &&
+ isValidMsgOp(MsgId, OpId) &&
+ isValidMsgStream(MsgId, OpId, StreamId)) {
+ O << "sendmsg(" << getMsgName(MsgId);
+ if (msgRequiresOp(MsgId)) {
+ O << ", " << getMsgOpName(MsgId, OpId);
+ if (msgSupportsStream(MsgId, OpId)) {
+ O << ", " << StreamId;
+ }
}
- } while (false);
- O << SImm16; // Unknown simm16 code.
+ O << ')';
+ } else if (encodeMsg(MsgId, OpId, StreamId) == Imm16) {
+ O << "sendmsg(" << MsgId << ", " << OpId << ", " << StreamId << ')';
+ } else {
+ O << Imm16; // Unknown imm16 code.
+ }
}
static void printSwizzleBitmask(const uint16_t AndMask,
@@ -1094,7 +1239,7 @@ void AMDGPUInstPrinter::printSwizzle(const MCInst *MI, unsigned OpNo,
if ((Imm & QUAD_PERM_ENC_MASK) == QUAD_PERM_ENC) {
O << "swizzle(" << IdSymbolic[ID_QUAD_PERM];
- for (auto i = 0; i < LANE_NUM; ++i) {
+ for (unsigned I = 0; I < LANE_NUM; ++I) {
O << ",";
O << formatDec(Imm & LANE_MASK);
Imm >>= LANE_SHIFT;
@@ -1184,32 +1329,42 @@ void AMDGPUInstPrinter::printWaitFlag(const MCInst *MI, unsigned OpNo,
void AMDGPUInstPrinter::printHwreg(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O) {
- using namespace llvm::AMDGPU::Hwreg;
+ unsigned Id;
+ unsigned Offset;
+ unsigned Width;
- unsigned SImm16 = MI->getOperand(OpNo).getImm();
- const unsigned Id = (SImm16 & ID_MASK_) >> ID_SHIFT_;
- const unsigned Offset = (SImm16 & OFFSET_MASK_) >> OFFSET_SHIFT_;
- const unsigned Width = ((SImm16 & WIDTH_M1_MASK_) >> WIDTH_M1_SHIFT_) + 1;
+ using namespace llvm::AMDGPU::Hwreg;
+ unsigned Val = MI->getOperand(OpNo).getImm();
+ decodeHwreg(Val, Id, Offset, Width);
+ StringRef HwRegName = getHwreg(Id, STI);
O << "hwreg(";
- unsigned Last = ID_SYMBOLIC_LAST_;
- if (AMDGPU::isSI(STI) || AMDGPU::isCI(STI) || AMDGPU::isVI(STI))
- Last = ID_SYMBOLIC_FIRST_GFX9_;
- if (ID_SYMBOLIC_FIRST_ <= Id && Id < Last && IdSymbolic[Id]) {
- O << IdSymbolic[Id];
+ if (!HwRegName.empty()) {
+ O << HwRegName;
} else {
O << Id;
}
- if (Width != WIDTH_M1_DEFAULT_ + 1 || Offset != OFFSET_DEFAULT_) {
+ if (Width != WIDTH_DEFAULT_ || Offset != OFFSET_DEFAULT_) {
O << ", " << Offset << ", " << Width;
}
O << ')';
}
+void AMDGPUInstPrinter::printEndpgm(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ uint16_t Imm = MI->getOperand(OpNo).getImm();
+ if (Imm == 0) {
+ return;
+ }
+
+ O << ' ' << formatDec(Imm);
+}
+
#include "AMDGPUGenAsmWriter.inc"
void R600InstPrinter::printInst(const MCInst *MI, raw_ostream &O,
- StringRef Annot, const MCSubtargetInfo &STI) {
+ StringRef Annot, const MCSubtargetInfo &STI) {
O.flush();
printInstruction(MI, O);
printAnnotation(O, Annot);
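
The printSendMsg() rewrite above moves the bit slicing into the decodeMsg()/isValidMsg*() helpers instead of open-coding masks per message id. As a rough standalone illustration of the same decomposition, assuming the pre-gfx10 field layout (message id in bits 3:0, operation in bits 6:4, stream id in bits 9:8); the masks and the sample immediate are illustrative only, not taken from this patch:

#include <cstdint>
#include <cstdio>

// Assumed field layout of the 16-bit sendmsg immediate (pre-gfx10):
//   [3:0] message id, [6:4] operation, [9:8] stream id.
static void decodeSendMsgImm(uint16_t Imm, unsigned &MsgId, unsigned &OpId,
                             unsigned &StreamId) {
  MsgId = Imm & 0xF;
  OpId = (Imm >> 4) & 0x7;
  StreamId = (Imm >> 8) & 0x3;
}

int main() {
  unsigned MsgId, OpId, StreamId;
  decodeSendMsgImm(0x0122, MsgId, OpId, StreamId); // sample value only
  std::printf("sendmsg(%u, %u, %u)\n", MsgId, OpId, StreamId); // sendmsg(2, 2, 1)
  return 0;
}
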
diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
index 0ba74ca0f3e1..b544d1ef3605 100644
--- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
@@ -1,18 +1,18 @@
//===-- AMDGPUInstPrinter.h - AMDGPU MC Inst -> ASM interface ---*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_AMDGPU_INSTPRINTER_AMDGPUINSTPRINTER_H
-#define LLVM_LIB_TARGET_AMDGPU_INSTPRINTER_AMDGPUINSTPRINTER_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUINSTPRINTER_H
+#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUINSTPRINTER_H
+#include "AMDGPUMCTargetDesc.h"
#include "llvm/MC/MCInstPrinter.h"
namespace llvm {
@@ -26,7 +26,8 @@ public:
//Autogenerated by tblgen
void printInstruction(const MCInst *MI, const MCSubtargetInfo &STI,
raw_ostream &O);
- static const char *getRegisterName(unsigned RegNo);
+ static const char *getRegisterName(unsigned RegNo,
+ unsigned AltIdx = AMDGPU::NoRegAltName);
void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot,
const MCSubtargetInfo &STI) override;
@@ -42,7 +43,6 @@ private:
void printU4ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printU8ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printU16ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printS13ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printU32ImmOperand(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
void printNamedBit(const MCInst *MI, unsigned OpNo, raw_ostream &O,
@@ -53,8 +53,8 @@ private:
void printMBUFOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printOffset(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
- void printOffsetS13(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
- raw_ostream &O);
+ void printFlatOffset(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
void printOffset0(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
@@ -68,6 +68,8 @@ private:
const MCSubtargetInfo &STI, raw_ostream &O);
void printGDS(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
+ void printDLC(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
void printGLC(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
void printSLC(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
@@ -76,6 +78,8 @@ private:
raw_ostream &O);
void printDMask(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
+ void printDim(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
void printUNorm(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
void printDA(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
@@ -112,6 +116,8 @@ private:
const MCSubtargetInfo &STI, raw_ostream &O);
void printOperandAndIntInputMods(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
+ void printDPP8(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
void printDPPCtrl(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
void printRowMask(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
@@ -120,6 +126,8 @@ private:
const MCSubtargetInfo &STI, raw_ostream &O);
void printBoundCtrl(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
+ void printFI(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
void printSDWASel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printSDWADstSel(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
@@ -150,6 +158,14 @@ private:
const MCSubtargetInfo &STI, raw_ostream &O);
void printMemOperand(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
+ void printBLGP(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printCBSZ(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printABID(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printDefaultVccOperand(unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
template <unsigned N>
@@ -214,6 +230,8 @@ protected:
const MCSubtargetInfo &STI, raw_ostream &O);
void printHwreg(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
+ void printEndpgm(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
};
class R600InstPrinter : public MCInstPrinter {
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
index 2364e7b7b5fb..9e04ab9bae93 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
@@ -1,15 +1,16 @@
//===-- MCTargetDesc/AMDGPUMCAsmInfo.cpp - Assembly Info ------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
/// \file
//===----------------------------------------------------------------------===//
#include "AMDGPUMCAsmInfo.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
using namespace llvm;
@@ -19,7 +20,10 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT) : MCAsmInfoELF() {
HasSingleParameterDotFile = false;
//===------------------------------------------------------------------===//
MinInstAlignment = 4;
- MaxInstLength = (TT.getArch() == Triple::amdgcn) ? 8 : 16;
+
+  // This is the maximum instruction encoded size for gfx10. With a known
+  // subtarget, it can usually be reduced; see getMaxInstLength() below.
+ MaxInstLength = (TT.getArch() == Triple::amdgcn) ? 20 : 16;
SeparatorString = "\n";
CommentString = ";";
PrivateLabelPrefix = "";
@@ -45,3 +49,18 @@ bool AMDGPUMCAsmInfo::shouldOmitSectionDirective(StringRef SectionName) const {
SectionName == ".hsarodata_readonly_agent" ||
MCAsmInfo::shouldOmitSectionDirective(SectionName);
}
+
+unsigned AMDGPUMCAsmInfo::getMaxInstLength(const MCSubtargetInfo *STI) const {
+ if (!STI || STI->getTargetTriple().getArch() == Triple::r600)
+ return MaxInstLength;
+
+ // Maximum for NSA encoded images
+ if (STI->getFeatureBits()[AMDGPU::FeatureNSAEncoding])
+ return 20;
+
+ // 64-bit instruction with 32-bit literal.
+ if (STI->getFeatureBits()[AMDGPU::FeatureVOP3Literal])
+ return 12;
+
+ return 8;
+}
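
The getMaxInstLength() override added above picks one of a few upper bounds from subtarget features. A rough standalone model of that decision; the boolean parameters stand in for the real FeatureBitset queries, and the per-bound comments are an interpretation of those limits rather than text from the source:

// Upper bound on the encoded size of a single instruction, in bytes.
unsigned maxInstLength(bool IsR600, bool HasNSAEncoding, bool HasVOP3Literal) {
  if (IsR600)
    return 16; // r600 keeps the constructor default
  if (HasNSAEncoding)
    return 20; // base MIMG encoding plus up to 12 bytes of NSA addresses
  if (HasVOP3Literal)
    return 12; // 8-byte VOP3 encoding plus a 32-bit literal
  return 8;    // 64-bit encoding, or a 32-bit encoding plus a 32-bit literal
}
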
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h
index 8cb33a3179cd..71e63ec27a8f 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h
@@ -1,9 +1,8 @@
//===-- MCTargetDesc/AMDGPUMCAsmInfo.h - AMDGPU MCAsm Interface -*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -28,6 +27,7 @@ class AMDGPUMCAsmInfo : public MCAsmInfoELF {
public:
explicit AMDGPUMCAsmInfo(const Triple &TT);
bool shouldOmitSectionDirective(StringRef SectionName) const override;
+ unsigned getMaxInstLength(const MCSubtargetInfo *STI) const override;
};
} // namespace llvm
#endif
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
index cae7a7a6c7e7..f3d945cc0764 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
@@ -1,9 +1,8 @@
//===-- AMDGPUCodeEmitter.cpp - AMDGPU Code Emitter interface -------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
index dcc10a032afe..62757a707890 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
@@ -1,9 +1,8 @@
//===-- AMDGPUCodeEmitter.h - AMDGPU Code Emitter interface -----*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -64,10 +63,17 @@ public:
return 0;
}
+ virtual unsigned getAVOperandEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ return 0;
+ }
+
protected:
- uint64_t computeAvailableFeatures(const FeatureBitset &FB) const;
- void verifyInstructionPredicates(const MCInst &MI,
- uint64_t AvailableFeatures) const;
+ FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const;
+ void
+ verifyInstructionPredicates(const MCInst &MI,
+ const FeatureBitset &AvailableFeatures) const;
};
} // End namespace llvm
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
index c579c7d60e16..88df64d18cc5 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
@@ -1,9 +1,8 @@
//===-- AMDGPUMCTargetDesc.cpp - AMDGPU Target Descriptions ---------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -14,13 +13,15 @@
#include "AMDGPUMCTargetDesc.h"
#include "AMDGPUELFStreamer.h"
+#include "AMDGPUInstPrinter.h"
#include "AMDGPUMCAsmInfo.h"
#include "AMDGPUTargetStreamer.h"
-#include "InstPrinter/AMDGPUInstPrinter.h"
#include "SIDefines.h"
+#include "TargetInfo/AMDGPUTargetInfo.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInstrAnalysis.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCRegisterInfo.h"
@@ -104,6 +105,35 @@ static MCStreamer *createMCStreamer(const Triple &T, MCContext &Context,
std::move(Emitter), RelaxAll);
}
+namespace {
+
+class AMDGPUMCInstrAnalysis : public MCInstrAnalysis {
+public:
+ explicit AMDGPUMCInstrAnalysis(const MCInstrInfo *Info)
+ : MCInstrAnalysis(Info) {}
+
+ bool evaluateBranch(const MCInst &Inst, uint64_t Addr, uint64_t Size,
+ uint64_t &Target) const override {
+ if (Inst.getNumOperands() == 0 || !Inst.getOperand(0).isImm() ||
+ Info->get(Inst.getOpcode()).OpInfo[0].OperandType !=
+ MCOI::OPERAND_PCREL)
+ return false;
+
+ int64_t Imm = Inst.getOperand(0).getImm();
+ // Our branches take a simm16, but we need two extra bits to account for
+ // the factor of 4.
+ APInt SignedOffset(18, Imm * 4, true);
+ Target = (SignedOffset.sext(64) + Addr + Size).getZExtValue();
+ return true;
+ }
+};
+
+} // end anonymous namespace
+
+static MCInstrAnalysis *createAMDGPUMCInstrAnalysis(const MCInstrInfo *Info) {
+ return new AMDGPUMCInstrAnalysis(Info);
+}
+
extern "C" void LLVMInitializeAMDGPUTargetMC() {
TargetRegistry::RegisterMCInstrInfo(getTheGCNTarget(), createAMDGPUMCInstrInfo);
@@ -114,6 +144,7 @@ extern "C" void LLVMInitializeAMDGPUTargetMC() {
TargetRegistry::RegisterMCRegInfo(*T, createAMDGPUMCRegisterInfo);
TargetRegistry::RegisterMCSubtargetInfo(*T, createAMDGPUMCSubtargetInfo);
TargetRegistry::RegisterMCInstPrinter(*T, createAMDGPUMCInstPrinter);
+ TargetRegistry::RegisterMCInstrAnalysis(*T, createAMDGPUMCInstrAnalysis);
TargetRegistry::RegisterMCAsmBackend(*T, createAMDGPUAsmBackend);
TargetRegistry::RegisterELFStreamer(*T, createMCStreamer);
}
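
The MCInstrAnalysis registered above is what lets MC-layer tools such as llvm-objdump resolve branch targets: the 16-bit branch immediate counts dwords past the end of the instruction, hence the multiply by 4 and the 18-bit sign extension in evaluateBranch(). A minimal sketch of the same arithmetic:

#include <cstdint>

// Target = address of the branch + its encoded size + sign-extended (Imm * 4).
static uint64_t branchTarget(uint64_t Addr, uint64_t Size, int16_t Imm) {
  return Addr + Size + static_cast<int64_t>(Imm) * 4;
}

// Example: a 4-byte s_branch at 0x1000 with Imm == -2 targets
// 0x1000 + 4 - 8 == 0xFFC.
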
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
index f3628d96d6e9..9754d31fee60 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
@@ -1,9 +1,8 @@
//===-- AMDGPUMCTargetDesc.h - AMDGPU Target Descriptions -----*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -34,9 +33,6 @@ class Target;
class Triple;
class raw_pwrite_stream;
-Target &getTheAMDGPUTarget();
-Target &getTheGCNTarget();
-
MCCodeEmitter *createR600MCCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
MCContext &Ctx);
@@ -53,7 +49,7 @@ MCAsmBackend *createAMDGPUAsmBackend(const Target &T,
std::unique_ptr<MCObjectTargetWriter>
createAMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI,
- bool HasRelocationAddend);
+ bool HasRelocationAddend, uint8_t ABIVersion);
} // End llvm namespace
#define GET_REGINFO_ENUM
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index c17fe126546c..8f11433476f4 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -1,9 +1,8 @@
//===-- AMDGPUTargetStreamer.cpp - AMDGPU Target Streamer Methods ---------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -19,7 +18,6 @@
#include "llvm/ADT/Twine.h"
#include "llvm/BinaryFormat/AMDGPUMetadataVerifier.h"
#include "llvm/BinaryFormat/ELF.h"
-#include "llvm/BinaryFormat/MsgPackTypes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Metadata.h"
@@ -52,51 +50,53 @@ bool AMDGPUTargetStreamer::EmitHSAMetadataV2(StringRef HSAMetadataString) {
}
bool AMDGPUTargetStreamer::EmitHSAMetadataV3(StringRef HSAMetadataString) {
- std::shared_ptr<msgpack::Node> HSAMetadataRoot;
- yaml::Input YIn(HSAMetadataString);
- YIn >> HSAMetadataRoot;
- if (YIn.error())
+ msgpack::Document HSAMetadataDoc;
+ if (!HSAMetadataDoc.fromYAML(HSAMetadataString))
return false;
- return EmitHSAMetadata(HSAMetadataRoot, false);
+ return EmitHSAMetadata(HSAMetadataDoc, false);
}
StringRef AMDGPUTargetStreamer::getArchNameFromElfMach(unsigned ElfMach) {
AMDGPU::GPUKind AK;
switch (ElfMach) {
- case ELF::EF_AMDGPU_MACH_R600_R600: AK = GK_R600; break;
- case ELF::EF_AMDGPU_MACH_R600_R630: AK = GK_R630; break;
- case ELF::EF_AMDGPU_MACH_R600_RS880: AK = GK_RS880; break;
- case ELF::EF_AMDGPU_MACH_R600_RV670: AK = GK_RV670; break;
- case ELF::EF_AMDGPU_MACH_R600_RV710: AK = GK_RV710; break;
- case ELF::EF_AMDGPU_MACH_R600_RV730: AK = GK_RV730; break;
- case ELF::EF_AMDGPU_MACH_R600_RV770: AK = GK_RV770; break;
- case ELF::EF_AMDGPU_MACH_R600_CEDAR: AK = GK_CEDAR; break;
- case ELF::EF_AMDGPU_MACH_R600_CYPRESS: AK = GK_CYPRESS; break;
- case ELF::EF_AMDGPU_MACH_R600_JUNIPER: AK = GK_JUNIPER; break;
- case ELF::EF_AMDGPU_MACH_R600_REDWOOD: AK = GK_REDWOOD; break;
- case ELF::EF_AMDGPU_MACH_R600_SUMO: AK = GK_SUMO; break;
- case ELF::EF_AMDGPU_MACH_R600_BARTS: AK = GK_BARTS; break;
- case ELF::EF_AMDGPU_MACH_R600_CAICOS: AK = GK_CAICOS; break;
- case ELF::EF_AMDGPU_MACH_R600_CAYMAN: AK = GK_CAYMAN; break;
- case ELF::EF_AMDGPU_MACH_R600_TURKS: AK = GK_TURKS; break;
- case ELF::EF_AMDGPU_MACH_AMDGCN_GFX600: AK = GK_GFX600; break;
- case ELF::EF_AMDGPU_MACH_AMDGCN_GFX601: AK = GK_GFX601; break;
- case ELF::EF_AMDGPU_MACH_AMDGCN_GFX700: AK = GK_GFX700; break;
- case ELF::EF_AMDGPU_MACH_AMDGCN_GFX701: AK = GK_GFX701; break;
- case ELF::EF_AMDGPU_MACH_AMDGCN_GFX702: AK = GK_GFX702; break;
- case ELF::EF_AMDGPU_MACH_AMDGCN_GFX703: AK = GK_GFX703; break;
- case ELF::EF_AMDGPU_MACH_AMDGCN_GFX704: AK = GK_GFX704; break;
- case ELF::EF_AMDGPU_MACH_AMDGCN_GFX801: AK = GK_GFX801; break;
- case ELF::EF_AMDGPU_MACH_AMDGCN_GFX802: AK = GK_GFX802; break;
- case ELF::EF_AMDGPU_MACH_AMDGCN_GFX803: AK = GK_GFX803; break;
- case ELF::EF_AMDGPU_MACH_AMDGCN_GFX810: AK = GK_GFX810; break;
- case ELF::EF_AMDGPU_MACH_AMDGCN_GFX900: AK = GK_GFX900; break;
- case ELF::EF_AMDGPU_MACH_AMDGCN_GFX902: AK = GK_GFX902; break;
- case ELF::EF_AMDGPU_MACH_AMDGCN_GFX904: AK = GK_GFX904; break;
- case ELF::EF_AMDGPU_MACH_AMDGCN_GFX906: AK = GK_GFX906; break;
- case ELF::EF_AMDGPU_MACH_AMDGCN_GFX909: AK = GK_GFX909; break;
- case ELF::EF_AMDGPU_MACH_NONE: AK = GK_NONE; break;
+ case ELF::EF_AMDGPU_MACH_R600_R600: AK = GK_R600; break;
+ case ELF::EF_AMDGPU_MACH_R600_R630: AK = GK_R630; break;
+ case ELF::EF_AMDGPU_MACH_R600_RS880: AK = GK_RS880; break;
+ case ELF::EF_AMDGPU_MACH_R600_RV670: AK = GK_RV670; break;
+ case ELF::EF_AMDGPU_MACH_R600_RV710: AK = GK_RV710; break;
+ case ELF::EF_AMDGPU_MACH_R600_RV730: AK = GK_RV730; break;
+ case ELF::EF_AMDGPU_MACH_R600_RV770: AK = GK_RV770; break;
+ case ELF::EF_AMDGPU_MACH_R600_CEDAR: AK = GK_CEDAR; break;
+ case ELF::EF_AMDGPU_MACH_R600_CYPRESS: AK = GK_CYPRESS; break;
+ case ELF::EF_AMDGPU_MACH_R600_JUNIPER: AK = GK_JUNIPER; break;
+ case ELF::EF_AMDGPU_MACH_R600_REDWOOD: AK = GK_REDWOOD; break;
+ case ELF::EF_AMDGPU_MACH_R600_SUMO: AK = GK_SUMO; break;
+ case ELF::EF_AMDGPU_MACH_R600_BARTS: AK = GK_BARTS; break;
+ case ELF::EF_AMDGPU_MACH_R600_CAICOS: AK = GK_CAICOS; break;
+ case ELF::EF_AMDGPU_MACH_R600_CAYMAN: AK = GK_CAYMAN; break;
+ case ELF::EF_AMDGPU_MACH_R600_TURKS: AK = GK_TURKS; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX600: AK = GK_GFX600; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX601: AK = GK_GFX601; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX700: AK = GK_GFX700; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX701: AK = GK_GFX701; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX702: AK = GK_GFX702; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX703: AK = GK_GFX703; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX704: AK = GK_GFX704; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX801: AK = GK_GFX801; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX802: AK = GK_GFX802; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX803: AK = GK_GFX803; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX810: AK = GK_GFX810; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX900: AK = GK_GFX900; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX902: AK = GK_GFX902; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX904: AK = GK_GFX904; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX906: AK = GK_GFX906; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX908: AK = GK_GFX908; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX909: AK = GK_GFX909; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010: AK = GK_GFX1010; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1011: AK = GK_GFX1011; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1012: AK = GK_GFX1012; break;
+ case ELF::EF_AMDGPU_MACH_NONE: AK = GK_NONE; break;
}
StringRef GPUName = getArchNameAMDGCN(AK);
@@ -142,7 +142,11 @@ unsigned AMDGPUTargetStreamer::getElfMach(StringRef GPU) {
case GK_GFX902: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX902;
case GK_GFX904: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX904;
case GK_GFX906: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX906;
+ case GK_GFX908: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX908;
case GK_GFX909: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX909;
+ case GK_GFX1010: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010;
+ case GK_GFX1011: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1011;
+ case GK_GFX1012: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1012;
case GK_NONE: return ELF::EF_AMDGPU_MACH_NONE;
}
@@ -157,6 +161,14 @@ AMDGPUTargetAsmStreamer::AMDGPUTargetAsmStreamer(MCStreamer &S,
formatted_raw_ostream &OS)
: AMDGPUTargetStreamer(S), OS(OS) { }
+// A hook for emitting stuff at the end.
+// We use it for emitting the accumulated PAL metadata as directives.
+void AMDGPUTargetAsmStreamer::finish() {
+ std::string S;
+ getPALMetadata()->toString(S);
+ OS << S;
+}
+
void AMDGPUTargetAsmStreamer::EmitDirectiveAMDGCNTarget(StringRef Target) {
OS << "\t.amdgcn_target \"" << Target << "\"\n";
}
@@ -196,6 +208,12 @@ void AMDGPUTargetAsmStreamer::EmitAMDGPUSymbolType(StringRef SymbolName,
}
}
+void AMDGPUTargetAsmStreamer::emitAMDGPULDS(MCSymbol *Symbol, unsigned Size,
+ unsigned Align) {
+ OS << "\t.amdgpu_lds " << Symbol->getName() << ", " << Size << ", " << Align
+ << '\n';
+}
+
bool AMDGPUTargetAsmStreamer::EmitISAVersion(StringRef IsaVersionString) {
OS << "\t.amd_amdgpu_isa \"" << IsaVersionString << "\"\n";
return true;
@@ -214,15 +232,14 @@ bool AMDGPUTargetAsmStreamer::EmitHSAMetadata(
}
bool AMDGPUTargetAsmStreamer::EmitHSAMetadata(
- std::shared_ptr<msgpack::Node> &HSAMetadataRoot, bool Strict) {
+ msgpack::Document &HSAMetadataDoc, bool Strict) {
V3::MetadataVerifier Verifier(Strict);
- if (!Verifier.verify(*HSAMetadataRoot))
+ if (!Verifier.verify(HSAMetadataDoc.getRoot()))
return false;
std::string HSAMetadataString;
raw_string_ostream StrOS(HSAMetadataString);
- yaml::Output YOut(StrOS);
- YOut << HSAMetadataRoot;
+ HSAMetadataDoc.toYAML(StrOS);
OS << '\t' << V3::AssemblerDirectiveBegin << '\n';
OS << StrOS.str() << '\n';
@@ -230,13 +247,10 @@ bool AMDGPUTargetAsmStreamer::EmitHSAMetadata(
return true;
}
-bool AMDGPUTargetAsmStreamer::EmitPALMetadata(
- const PALMD::Metadata &PALMetadata) {
- std::string PALMetadataString;
- if (PALMD::toString(PALMetadata, PALMetadataString))
- return false;
-
- OS << '\t' << PALMD::AssemblerDirective << PALMetadataString << '\n';
+bool AMDGPUTargetAsmStreamer::EmitCodeEnd() {
+ const uint32_t Encoded_s_code_end = 0xbf9f0000;
+ OS << "\t.p2alignl 6, " << Encoded_s_code_end << '\n';
+ OS << "\t.fill 32, 4, " << Encoded_s_code_end << '\n';
return true;
}
@@ -278,6 +292,10 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
PRINT_FIELD(OS, ".amdhsa_user_sgpr_private_segment_size", KD,
kernel_code_properties,
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE);
+ if (IVersion.Major >= 10)
+ PRINT_FIELD(OS, ".amdhsa_wavefront_size32", KD,
+ kernel_code_properties,
+ amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32);
PRINT_FIELD(
OS, ".amdhsa_system_sgpr_private_segment_wavefront_offset", KD,
compute_pgm_rsrc2,
@@ -331,6 +349,17 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
PRINT_FIELD(OS, ".amdhsa_fp16_overflow", KD,
compute_pgm_rsrc1,
amdhsa::COMPUTE_PGM_RSRC1_FP16_OVFL);
+ if (IVersion.Major >= 10) {
+ PRINT_FIELD(OS, ".amdhsa_workgroup_processor_mode", KD,
+ compute_pgm_rsrc1,
+ amdhsa::COMPUTE_PGM_RSRC1_WGP_MODE);
+ PRINT_FIELD(OS, ".amdhsa_memory_ordered", KD,
+ compute_pgm_rsrc1,
+ amdhsa::COMPUTE_PGM_RSRC1_MEM_ORDERED);
+ PRINT_FIELD(OS, ".amdhsa_forward_progress", KD,
+ compute_pgm_rsrc1,
+ amdhsa::COMPUTE_PGM_RSRC1_FWD_PROGRESS);
+ }
PRINT_FIELD(
OS, ".amdhsa_exception_fp_ieee_invalid_op", KD,
compute_pgm_rsrc2,
@@ -387,6 +416,19 @@ MCELFStreamer &AMDGPUTargetELFStreamer::getStreamer() {
return static_cast<MCELFStreamer &>(Streamer);
}
+// A hook for emitting stuff at the end.
+// We use it for emitting the accumulated PAL metadata as a .note record.
+void AMDGPUTargetELFStreamer::finish() {
+ std::string Blob;
+ const char *Vendor = getPALMetadata()->getVendor();
+ unsigned Type = getPALMetadata()->getType();
+ getPALMetadata()->toBlob(Type, Blob);
+ if (Blob.empty())
+ return;
+ EmitNote(Vendor, MCConstantExpr::create(Blob.size(), getContext()), Type,
+ [&](MCELFStreamer &OS) { OS.EmitBytes(Blob); });
+}
+
void AMDGPUTargetELFStreamer::EmitNote(
StringRef Name, const MCExpr *DescSZ, unsigned NoteType,
function_ref<void(MCELFStreamer &)> EmitDesc) {
@@ -463,6 +505,27 @@ void AMDGPUTargetELFStreamer::EmitAMDGPUSymbolType(StringRef SymbolName,
Symbol->setType(Type);
}
+void AMDGPUTargetELFStreamer::emitAMDGPULDS(MCSymbol *Symbol, unsigned Size,
+ unsigned Align) {
+ assert(isPowerOf2_32(Align));
+
+ MCSymbolELF *SymbolELF = cast<MCSymbolELF>(Symbol);
+ SymbolELF->setType(ELF::STT_OBJECT);
+
+ if (!SymbolELF->isBindingSet()) {
+ SymbolELF->setBinding(ELF::STB_GLOBAL);
+ SymbolELF->setExternal(true);
+ }
+
+ if (SymbolELF->declareCommon(Size, Align, true)) {
+ report_fatal_error("Symbol: " + Symbol->getName() +
+ " redeclared as different type");
+ }
+
+ SymbolELF->setIndex(ELF::SHN_AMDGPU_LDS);
+ SymbolELF->setSize(MCConstantExpr::create(Size, getContext()));
+}
+
bool AMDGPUTargetELFStreamer::EmitISAVersion(StringRef IsaVersionString) {
// Create two labels to mark the beginning and end of the desc field
// and a MCExpr to calculate the size of the desc field.
@@ -482,16 +545,14 @@ bool AMDGPUTargetELFStreamer::EmitISAVersion(StringRef IsaVersionString) {
return true;
}
-bool AMDGPUTargetELFStreamer::EmitHSAMetadata(
- std::shared_ptr<msgpack::Node> &HSAMetadataRoot, bool Strict) {
+bool AMDGPUTargetELFStreamer::EmitHSAMetadata(msgpack::Document &HSAMetadataDoc,
+ bool Strict) {
V3::MetadataVerifier Verifier(Strict);
- if (!Verifier.verify(*HSAMetadataRoot))
+ if (!Verifier.verify(HSAMetadataDoc.getRoot()))
return false;
std::string HSAMetadataString;
- raw_string_ostream StrOS(HSAMetadataString);
- msgpack::Writer MPWriter(StrOS);
- HSAMetadataRoot->write(MPWriter);
+ HSAMetadataDoc.writeToBlob(HSAMetadataString);
// Create two labels to mark the beginning and end of the desc field
// and a MCExpr to calculate the size of the desc field.
@@ -505,7 +566,7 @@ bool AMDGPUTargetELFStreamer::EmitHSAMetadata(
EmitNote(ElfNote::NoteNameV3, DescSZ, ELF::NT_AMDGPU_METADATA,
[&](MCELFStreamer &OS) {
OS.EmitLabel(DescBegin);
- OS.EmitBytes(StrOS.str());
+ OS.EmitBytes(HSAMetadataString);
OS.EmitLabel(DescEnd);
});
return true;
@@ -535,15 +596,15 @@ bool AMDGPUTargetELFStreamer::EmitHSAMetadata(
return true;
}
-bool AMDGPUTargetELFStreamer::EmitPALMetadata(
- const PALMD::Metadata &PALMetadata) {
- EmitNote(ElfNote::NoteNameV2,
- MCConstantExpr::create(PALMetadata.size() * sizeof(uint32_t),
- getContext()),
- ELF::NT_AMD_AMDGPU_PAL_METADATA, [&](MCELFStreamer &OS) {
- for (auto I : PALMetadata)
- OS.EmitIntValue(I, sizeof(uint32_t));
- });
+bool AMDGPUTargetELFStreamer::EmitCodeEnd() {
+ const uint32_t Encoded_s_code_end = 0xbf9f0000;
+
+ MCStreamer &OS = getStreamer();
+ OS.PushSection();
+ OS.EmitValueToAlignment(64, Encoded_s_code_end, 4);
+ for (unsigned I = 0; I < 32; ++I)
+ OS.EmitIntValue(Encoded_s_code_end, 4);
+ OS.PopSection();
return true;
}
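
Both EmitCodeEnd() implementations above pad the end of the code with the encoded s_code_end instruction (0xbf9f0000): align to a 64-byte boundary, then emit 32 more dwords. A small sketch of how many bytes that adds for a given code size, modeled on the ELF path above (the helper is hypothetical):

#include <cstdint>

// Bytes appended by EmitCodeEnd(): filler up to the next 64-byte boundary,
// plus 32 dwords (128 bytes) of the encoded s_code_end instruction.
static uint64_t codeEndPaddingBytes(uint64_t CodeSize) {
  uint64_t AlignPad = (64 - (CodeSize % 64)) % 64;
  return AlignPad + 32 * 4;
}
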
@@ -555,16 +616,25 @@ void AMDGPUTargetELFStreamer::EmitAmdhsaKernelDescriptor(
auto &Streamer = getStreamer();
auto &Context = Streamer.getContext();
+ MCSymbolELF *KernelCodeSymbol = cast<MCSymbolELF>(
+ Context.getOrCreateSymbol(Twine(KernelName)));
MCSymbolELF *KernelDescriptorSymbol = cast<MCSymbolELF>(
Context.getOrCreateSymbol(Twine(KernelName) + Twine(".kd")));
- KernelDescriptorSymbol->setBinding(ELF::STB_GLOBAL);
+
+ // Copy kernel descriptor symbol's binding, other and visibility from the
+ // kernel code symbol.
+ KernelDescriptorSymbol->setBinding(KernelCodeSymbol->getBinding());
+ KernelDescriptorSymbol->setOther(KernelCodeSymbol->getOther());
+ KernelDescriptorSymbol->setVisibility(KernelCodeSymbol->getVisibility());
+ // Kernel descriptor symbol's type and size are fixed.
KernelDescriptorSymbol->setType(ELF::STT_OBJECT);
KernelDescriptorSymbol->setSize(
MCConstantExpr::create(sizeof(KernelDescriptor), Context));
- MCSymbolELF *KernelCodeSymbol = cast<MCSymbolELF>(
- Context.getOrCreateSymbol(Twine(KernelName)));
- KernelCodeSymbol->setBinding(ELF::STB_LOCAL);
+ // The visibility of the kernel code symbol must be protected or less to allow
+ // static relocations from the kernel descriptor to be used.
+ if (KernelCodeSymbol->getVisibility() == ELF::STV_DEFAULT)
+ KernelCodeSymbol->setVisibility(ELF::STV_PROTECTED);
Streamer.EmitLabel(KernelDescriptorSymbol);
Streamer.EmitBytes(StringRef(
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
index 9a807c804f9f..683b3e363b9a 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
@@ -1,9 +1,8 @@
//===-- AMDGPUTargetStreamer.h - AMDGPU Target Streamer --------*- C++ -*--===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -11,7 +10,8 @@
#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUTARGETSTREAMER_H
#include "AMDKernelCodeT.h"
-#include "llvm/BinaryFormat/MsgPackTypes.h"
+#include "Utils/AMDGPUPALMetadata.h"
+#include "llvm/BinaryFormat/MsgPackDocument.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/AMDGPUMetadata.h"
@@ -29,12 +29,16 @@ class Module;
class Type;
class AMDGPUTargetStreamer : public MCTargetStreamer {
+ AMDGPUPALMetadata PALMetadata;
+
protected:
MCContext &getContext() const { return Streamer.getContext(); }
public:
AMDGPUTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {}
+ AMDGPUPALMetadata *getPALMetadata() { return &PALMetadata; }
+
virtual void EmitDirectiveAMDGCNTarget(StringRef Target) = 0;
virtual void EmitDirectiveHSACodeObjectVersion(uint32_t Major,
@@ -49,6 +53,9 @@ public:
virtual void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) = 0;
+ virtual void emitAMDGPULDS(MCSymbol *Symbol, unsigned Size,
+ unsigned Align) = 0;
+
/// \returns True on success, false on failure.
virtual bool EmitISAVersion(StringRef IsaVersionString) = 0;
@@ -65,14 +72,13 @@ public:
/// the \p HSAMetadata structure is updated with the correct types.
///
/// \returns True on success, false on failure.
- virtual bool EmitHSAMetadata(std::shared_ptr<msgpack::Node> &HSAMetadata,
- bool Strict) = 0;
+ virtual bool EmitHSAMetadata(msgpack::Document &HSAMetadata, bool Strict) = 0;
/// \returns True on success, false on failure.
virtual bool EmitHSAMetadata(const AMDGPU::HSAMD::Metadata &HSAMetadata) = 0;
/// \returns True on success, false on failure.
- virtual bool EmitPALMetadata(const AMDGPU::PALMD::Metadata &PALMetadata) = 0;
+ virtual bool EmitCodeEnd() = 0;
virtual void EmitAmdhsaKernelDescriptor(
const MCSubtargetInfo &STI, StringRef KernelName,
@@ -89,6 +95,8 @@ class AMDGPUTargetAsmStreamer final : public AMDGPUTargetStreamer {
public:
AMDGPUTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS);
+ void finish() override;
+
void EmitDirectiveAMDGCNTarget(StringRef Target) override;
void EmitDirectiveHSACodeObjectVersion(uint32_t Major,
@@ -102,18 +110,19 @@ public:
void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) override;
+ void emitAMDGPULDS(MCSymbol *Sym, unsigned Size, unsigned Align) override;
+
/// \returns True on success, false on failure.
bool EmitISAVersion(StringRef IsaVersionString) override;
/// \returns True on success, false on failure.
- bool EmitHSAMetadata(std::shared_ptr<msgpack::Node> &HSAMetadata,
- bool Strict) override;
+ bool EmitHSAMetadata(msgpack::Document &HSAMetadata, bool Strict) override;
/// \returns True on success, false on failure.
bool EmitHSAMetadata(const AMDGPU::HSAMD::Metadata &HSAMetadata) override;
/// \returns True on success, false on failure.
- bool EmitPALMetadata(const AMDGPU::PALMD::Metadata &PALMetadata) override;
+ bool EmitCodeEnd() override;
void EmitAmdhsaKernelDescriptor(
const MCSubtargetInfo &STI, StringRef KernelName,
@@ -133,6 +142,8 @@ public:
MCELFStreamer &getStreamer();
+ void finish() override;
+
void EmitDirectiveAMDGCNTarget(StringRef Target) override;
void EmitDirectiveHSACodeObjectVersion(uint32_t Major,
@@ -146,18 +157,19 @@ public:
void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) override;
+ void emitAMDGPULDS(MCSymbol *Sym, unsigned Size, unsigned Align) override;
+
/// \returns True on success, false on failure.
bool EmitISAVersion(StringRef IsaVersionString) override;
/// \returns True on success, false on failure.
- bool EmitHSAMetadata(std::shared_ptr<msgpack::Node> &HSAMetadata,
- bool Strict) override;
+ bool EmitHSAMetadata(msgpack::Document &HSAMetadata, bool Strict) override;
/// \returns True on success, false on failure.
bool EmitHSAMetadata(const AMDGPU::HSAMD::Metadata &HSAMetadata) override;
/// \returns True on success, false on failure.
- bool EmitPALMetadata(const AMDGPU::PALMD::Metadata &PALMetadata) override;
+ bool EmitCodeEnd() override;
void EmitAmdhsaKernelDescriptor(
const MCSubtargetInfo &STI, StringRef KernelName,
diff --git a/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
index 28d4bc1829e2..2f1f4e7a0392 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
@@ -1,9 +1,8 @@
//===- R600MCCodeEmitter.cpp - Code Emitter for R600->Cayman GPU families -===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -65,9 +64,10 @@ private:
uint64_t getBinaryCodeForInstr(const MCInst &MI,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
- uint64_t computeAvailableFeatures(const FeatureBitset &FB) const;
- void verifyInstructionPredicates(const MCInst &MI,
- uint64_t AvailableFeatures) const;
+ FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const;
+ void
+ verifyInstructionPredicates(const MCInst &MI,
+ const FeatureBitset &AvailableFeatures) const;
};
diff --git a/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp b/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp
index 1c99a708e5ac..a4809af29daa 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp
@@ -1,9 +1,8 @@
//===-- R600MCTargetDesc.cpp - R600 Target Descriptions -------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
index 36913bd04274..f8ec3c36f019 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
@@ -1,9 +1,8 @@
//===-- SIMCCodeEmitter.cpp - SI Code Emitter -----------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -14,9 +13,11 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
+#include "AMDGPURegisterInfo.h"
#include "MCTargetDesc/AMDGPUFixupKinds.h"
#include "MCTargetDesc/AMDGPUMCCodeEmitter.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIDefines.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
@@ -77,6 +78,10 @@ public:
unsigned getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const override;
+
+ unsigned getAVOperandEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const override;
};
} // end anonymous namespace
@@ -233,6 +238,8 @@ uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO,
case AMDGPU::OPERAND_REG_IMM_FP32:
case AMDGPU::OPERAND_REG_INLINE_C_INT32:
case AMDGPU::OPERAND_REG_INLINE_C_FP32:
+ case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
return getLit32Encoding(static_cast<uint32_t>(Imm), STI);
case AMDGPU::OPERAND_REG_IMM_INT64:
@@ -245,12 +252,21 @@ uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO,
case AMDGPU::OPERAND_REG_IMM_FP16:
case AMDGPU::OPERAND_REG_INLINE_C_INT16:
case AMDGPU::OPERAND_REG_INLINE_C_FP16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
// FIXME Is this correct? What do inline immediates do on SI for f16 src
// which does not have f16 support?
return getLit16Encoding(static_cast<uint16_t>(Imm), STI);
+ case AMDGPU::OPERAND_REG_IMM_V2INT16:
+ case AMDGPU::OPERAND_REG_IMM_V2FP16:
+ if (!isUInt<16>(Imm) && STI.getFeatureBits()[AMDGPU::FeatureVOP3Literal])
+ return getLit32Encoding(static_cast<uint32_t>(Imm), STI);
+ LLVM_FALLTHROUGH;
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
- case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: {
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: {
uint16_t Lo16 = static_cast<uint16_t>(Imm);
uint32_t Encoding = getLit16Encoding(Lo16, STI);
return Encoding;
@@ -274,7 +290,25 @@ void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
OS.write((uint8_t) ((Encoding >> (8 * i)) & 0xff));
}
- if (bytes > 4)
+ // NSA encoding.
+ if (AMDGPU::isGFX10(STI) && Desc.TSFlags & SIInstrFlags::MIMG) {
+ int vaddr0 = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+ AMDGPU::OpName::vaddr0);
+ int srsrc = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+ AMDGPU::OpName::srsrc);
+ assert(vaddr0 >= 0 && srsrc > vaddr0);
+ unsigned NumExtraAddrs = srsrc - vaddr0 - 1;
+ unsigned NumPadding = (-NumExtraAddrs) & 3;
+
+ for (unsigned i = 0; i < NumExtraAddrs; ++i)
+ OS.write((uint8_t)getMachineOpValue(MI, MI.getOperand(vaddr0 + 1 + i),
+ Fixups, STI));
+ for (unsigned i = 0; i < NumPadding; ++i)
+ OS.write(0);
+ }
+
+ if ((bytes > 8 && STI.getFeatureBits()[AMDGPU::FeatureVOP3Literal]) ||
+ (bytes > 4 && !STI.getFeatureBits()[AMDGPU::FeatureVOP3Literal]))
return;
// Check for additional literals in SRC0/1/2 (Op 1/2/3)
@@ -366,7 +400,7 @@ SIMCCodeEmitter::getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo,
const MCOperand &MO = MI.getOperand(OpNo);
unsigned Reg = MO.getReg();
- if (Reg != AMDGPU::VCC) {
+ if (Reg != AMDGPU::VCC && Reg != AMDGPU::VCC_LO) {
RegEnc |= MRI.getEncodingValue(Reg);
RegEnc &= SDWA9EncValues::VOPC_DST_SGPR_MASK;
RegEnc |= SDWA9EncValues::VOPC_DST_VCC_MASK;
@@ -374,10 +408,31 @@ SIMCCodeEmitter::getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo,
return RegEnc;
}
+unsigned
+SIMCCodeEmitter::getAVOperandEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ unsigned Reg = MI.getOperand(OpNo).getReg();
+ uint64_t Enc = MRI.getEncodingValue(Reg);
+
+ // VGPR and AGPR have the same encoding, but SrcA and SrcB operands of mfma
+ // instructions use acc[0:1] modifier bits to distinguish. These bits are
+ // encoded as a virtual 9th bit of the register for these operands.
+ if (MRI.getRegClass(AMDGPU::AGPR_32RegClassID).contains(Reg) ||
+ MRI.getRegClass(AMDGPU::AReg_64RegClassID).contains(Reg))
+ Enc |= 512;
+
+ return Enc;
+}
+
static bool needsPCRel(const MCExpr *Expr) {
switch (Expr->getKind()) {
- case MCExpr::SymbolRef:
- return true;
+ case MCExpr::SymbolRef: {
+ auto *SE = cast<MCSymbolRefExpr>(Expr);
+ MCSymbolRefExpr::VariantKind Kind = SE->getKind();
+ return Kind != MCSymbolRefExpr::VK_AMDGPU_ABS32_LO &&
+ Kind != MCSymbolRefExpr::VK_AMDGPU_ABS32_HI;
+ }
case MCExpr::Binary: {
auto *BE = cast<MCBinaryExpr>(Expr);
if (BE->getOpcode() == MCBinaryExpr::Sub)
@@ -416,7 +471,13 @@ uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI,
Kind = FK_PCRel_4;
else
Kind = FK_Data_4;
- Fixups.push_back(MCFixup::create(4, MO.getExpr(), Kind, MI.getLoc()));
+
+ const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
+ uint32_t Offset = Desc.getSize();
+ assert(Offset == 4 || Offset == 8);
+
+ Fixups.push_back(
+ MCFixup::create(Offset, MO.getExpr(), Kind, MI.getLoc()));
}
// Figure out the operand number, needed for isSrcOperand check
@@ -429,7 +490,8 @@ uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI,
const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
if (AMDGPU::isSISrcOperand(Desc, OpNo)) {
uint32_t Enc = getLitEncoding(MO, Desc.OpInfo[OpNo], STI);
- if (Enc != ~0U && (Enc != 255 || Desc.getSize() == 4))
+ if (Enc != ~0U &&
+ (Enc != 255 || Desc.getSize() == 4 || Desc.getSize() == 8))
return Enc;
} else if (MO.isImm())
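
The NSA block added to encodeInstruction() above appends one encoding byte per extra address register after the main MIMG encoding and pads them to a dword boundary; (-NumExtraAddrs) & 3 is the usual trick for the number of zero bytes needed to reach the next multiple of four. A tiny illustration:

// Zero bytes needed to pad N extra NSA address bytes out to a 4-byte boundary.
static unsigned nsaPaddingBytes(unsigned NumExtraAddrs) {
  return (-NumExtraAddrs) & 3u;
}

// nsaPaddingBytes(1) == 3, nsaPaddingBytes(2) == 2,
// nsaPaddingBytes(3) == 1, nsaPaddingBytes(4) == 0
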
diff --git a/lib/Target/AMDGPU/MIMGInstructions.td b/lib/Target/AMDGPU/MIMGInstructions.td
index 1c68dbd78e75..4735e6cb2446 100644
--- a/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/lib/Target/AMDGPU/MIMGInstructions.td
@@ -1,9 +1,8 @@
//===-- MIMGInstructions.td - MIMG Instruction Definitions ----------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -12,10 +11,14 @@
//
// - MIMGEncGfx6: encoding introduced with gfx6 (obsoleted for atomics in gfx8)
// - MIMGEncGfx8: encoding introduced with gfx8 for atomics
+// - MIMGEncGfx10Default: gfx10 default (non-NSA) encoding
+// - MIMGEncGfx10NSA: gfx10 NSA encoding
class MIMGEncoding;
def MIMGEncGfx6 : MIMGEncoding;
def MIMGEncGfx8 : MIMGEncoding;
+def MIMGEncGfx10Default : MIMGEncoding;
+def MIMGEncGfx10NSA : MIMGEncoding;
def MIMGEncoding : GenericEnum {
let FilterClass = "MIMGEncoding";
@@ -60,13 +63,28 @@ def MIMGDim : GenericEnum {
def MIMGDimInfoTable : GenericTable {
let FilterClass = "AMDGPUDimProps";
let CppTypeName = "MIMGDimInfo";
- let Fields = ["Dim", "NumCoords", "NumGradients", "DA"];
+ let Fields = ["Dim", "NumCoords", "NumGradients", "DA", "Encoding", "AsmSuffix"];
GenericEnum TypeOf_Dim = MIMGDim;
let PrimaryKey = ["Dim"];
let PrimaryKeyName = "getMIMGDimInfo";
}
+def getMIMGDimInfoByEncoding : SearchIndex {
+ let Table = MIMGDimInfoTable;
+ let Key = ["Encoding"];
+}
+
+def getMIMGDimInfoByAsmSuffix : SearchIndex {
+ let Table = MIMGDimInfoTable;
+ let Key = ["AsmSuffix"];
+}
+
+class mimg <bits<8> si_gfx10, bits<8> vi = si_gfx10> {
+ field bits<8> SI_GFX10 = si_gfx10;
+ field bits<8> VI = vi;
+}
+
class MIMGLZMapping<MIMGBaseOpcode l, MIMGBaseOpcode lz> {
MIMGBaseOpcode L = l;
MIMGBaseOpcode LZ = lz;
@@ -83,12 +101,23 @@ def MIMGLZMappingTable : GenericTable {
let PrimaryKeyName = "getMIMGLZMappingInfo";
}
-class mimg <bits<7> si, bits<7> vi = si> {
- field bits<7> SI = si;
- field bits<7> VI = vi;
+class MIMGMIPMapping<MIMGBaseOpcode mip, MIMGBaseOpcode nonmip> {
+ MIMGBaseOpcode MIP = mip;
+ MIMGBaseOpcode NONMIP = nonmip;
}
-class MIMG <dag outs, string dns = "">
+def MIMGMIPMappingTable : GenericTable {
+ let FilterClass = "MIMGMIPMapping";
+ let CppTypeName = "MIMGMIPMappingInfo";
+ let Fields = ["MIP", "NONMIP"];
+ GenericEnum TypeOf_MIP = MIMGBaseOpcode;
+ GenericEnum TypeOf_NONMIP = MIMGBaseOpcode;
+
+ let PrimaryKey = ["MIP"];
+ let PrimaryKeyName = "getMIMGMIPMappingInfo";
+}
+
+class MIMG_Base <dag outs, string dns = "">
: InstSI <outs, (ins), "", []> {
let VM_CNT = 1;
@@ -97,20 +126,24 @@ class MIMG <dag outs, string dns = "">
let Uses = [EXEC];
let mayLoad = 1;
let mayStore = 0;
- let hasPostISelHook = 1;
let SchedRW = [WriteVMEM];
let UseNamedOperandTable = 1;
let hasSideEffects = 0; // XXX ????
- let SubtargetPredicate = isGCN;
let DecoderNamespace = dns;
let isAsmParserOnly = !if(!eq(dns,""), 1, 0);
- let AsmMatchConverter = "cvtMIMG";
let usesCustomInserter = 1;
+}
+
+class MIMG <dag outs, string dns = "">
+ : MIMG_Base <outs, dns> {
+
+ let hasPostISelHook = 1;
+ let AsmMatchConverter = "cvtMIMG";
Instruction Opcode = !cast<Instruction>(NAME);
MIMGBaseOpcode BaseOpcode;
- MIMGEncoding MIMGEncoding = MIMGEncGfx6;
+ MIMGEncoding MIMGEncoding;
bits<8> VDataDwords;
bits<8> VAddrDwords;
}
@@ -131,15 +164,66 @@ def getMIMGInfo : SearchIndex {
let Key = ["Opcode"];
}
-class MIMG_NoSampler_Helper <bits<7> op, string asm,
+// This is a separate class so that TableGen memoizes the computations.
+class MIMGNSAHelper<int num_addrs> {
+ list<string> AddrAsmNames =
+ !foldl([]<string>, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], lhs, i,
+ !if(!lt(i, num_addrs), !listconcat(lhs, ["vaddr"#!size(lhs)]), lhs));
+ dag AddrIns = !dag(ins, !foreach(arg, AddrAsmNames, VGPR_32), AddrAsmNames);
+ string AddrAsm = "[" # !foldl("$" # !head(AddrAsmNames), !tail(AddrAsmNames), lhs, rhs,
+ lhs # ", $" # rhs) # "]";
+
+ int NSA = !if(!le(num_addrs, 1), ?,
+ !if(!le(num_addrs, 5), 1,
+ !if(!le(num_addrs, 9), 2,
+ !if(!le(num_addrs, 13), 3, ?))));
+}
+
+// Base class of all pre-gfx10 MIMG instructions.
+class MIMG_gfx6789<bits<8> op, dag outs, string dns = "">
+ : MIMG<outs, dns>, MIMGe_gfx6789<op> {
+ let SubtargetPredicate = isGFX6GFX7GFX8GFX9;
+ let AssemblerPredicates = [isGFX6GFX7GFX8GFX9];
+
+ let MIMGEncoding = MIMGEncGfx6;
+
+ let d16 = !if(BaseOpcode.HasD16, ?, 0);
+}
+
+// Base class of all non-NSA gfx10 MIMG instructions.
+class MIMG_gfx10<int op, dag outs, string dns = "">
+ : MIMG<outs, dns>, MIMGe_gfx10<op> {
+ let SubtargetPredicate = isGFX10Plus;
+ let AssemblerPredicates = [isGFX10Plus];
+
+ let MIMGEncoding = MIMGEncGfx10Default;
+
+ let d16 = !if(BaseOpcode.HasD16, ?, 0);
+ let nsa = 0;
+}
+
+// Base class for all NSA MIMG instructions. Note that 1-dword addresses always
+// use non-NSA variants.
+class MIMG_nsa_gfx10<int op, dag outs, int num_addrs, string dns="">
+ : MIMG<outs, dns>, MIMGe_gfx10<op> {
+ let SubtargetPredicate = isGFX10Plus;
+ let AssemblerPredicates = [isGFX10Plus];
+
+ let MIMGEncoding = MIMGEncGfx10NSA;
+
+ MIMGNSAHelper nsah = MIMGNSAHelper<num_addrs>;
+ dag AddrIns = nsah.AddrIns;
+ string AddrAsm = nsah.AddrAsm;
+
+ let d16 = !if(BaseOpcode.HasD16, ?, 0);
+ let nsa = nsah.NSA;
+}
+
+class MIMG_NoSampler_Helper <bits<8> op, string asm,
RegisterClass dst_rc,
RegisterClass addr_rc,
string dns="">
- : MIMG <(outs dst_rc:$vdata), dns>,
- MIMGe<op> {
- let ssamp = 0;
- let d16 = !if(BaseOpcode.HasD16, ?, 0);
-
+ : MIMG_gfx6789 <op, (outs dst_rc:$vdata), dns> {
let InOperandList = !con((ins addr_rc:$vaddr, SReg_256:$srsrc,
DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc,
R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da),
@@ -148,23 +232,66 @@ class MIMG_NoSampler_Helper <bits<7> op, string asm,
#!if(BaseOpcode.HasD16, "$d16", "");
}
-multiclass MIMG_NoSampler_Src_Helper <bits<7> op, string asm,
+class MIMG_NoSampler_gfx10<int op, string opcode,
+ RegisterClass DataRC, RegisterClass AddrRC,
+ string dns="">
+ : MIMG_gfx10<op, (outs DataRC:$vdata), dns> {
+ let InOperandList = !con((ins AddrRC:$vaddr0, SReg_256:$srsrc, DMask:$dmask,
+ Dim:$dim, UNorm:$unorm, DLC:$dlc, GLC:$glc,
+ SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe),
+ !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
+ let AsmString = opcode#" $vdata, $vaddr0, $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$tfe$lwe"
+ #!if(BaseOpcode.HasD16, "$d16", "");
+}
+
+class MIMG_NoSampler_nsa_gfx10<int op, string opcode,
+ RegisterClass DataRC, int num_addrs,
+ string dns="">
+ : MIMG_nsa_gfx10<op, (outs DataRC:$vdata), num_addrs, dns> {
+ let InOperandList = !con(AddrIns,
+ (ins SReg_256:$srsrc, DMask:$dmask,
+ Dim:$dim, UNorm:$unorm, DLC:$dlc, GLC:$glc,
+ SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe),
+ !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
+ let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$tfe$lwe"
+ #!if(BaseOpcode.HasD16, "$d16", "");
+}
+
+multiclass MIMG_NoSampler_Src_Helper <bits<8> op, string asm,
RegisterClass dst_rc,
bit enableDisasm> {
- let VAddrDwords = 1 in
- def NAME # _V1 : MIMG_NoSampler_Helper <op, asm, dst_rc, VGPR_32,
- !if(enableDisasm, "AMDGPU", "")>;
- let VAddrDwords = 2 in
- def NAME # _V2 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_64>;
- let VAddrDwords = 3 in
- def NAME # _V3 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_96>;
- let VAddrDwords = 4 in
- def NAME # _V4 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_128>;
-}
-
-multiclass MIMG_NoSampler <bits<7> op, string asm, bit has_d16, bit mip = 0,
+ let ssamp = 0 in {
+ let VAddrDwords = 1 in {
+ def _V1 : MIMG_NoSampler_Helper <op, asm, dst_rc, VGPR_32,
+ !if(enableDisasm, "AMDGPU", "")>;
+ def _V1_gfx10 : MIMG_NoSampler_gfx10<op, asm, dst_rc, VGPR_32,
+ !if(enableDisasm, "AMDGPU", "")>;
+ }
+
+ let VAddrDwords = 2 in {
+ def _V2 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_64>;
+ def _V2_gfx10 : MIMG_NoSampler_gfx10<op, asm, dst_rc, VReg_64>;
+ def _V2_nsa_gfx10 : MIMG_NoSampler_nsa_gfx10<op, asm, dst_rc, 2>;
+ }
+
+ let VAddrDwords = 3 in {
+ def _V3 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_96>;
+ def _V3_gfx10 : MIMG_NoSampler_gfx10<op, asm, dst_rc, VReg_96>;
+ def _V3_nsa_gfx10 : MIMG_NoSampler_nsa_gfx10<op, asm, dst_rc, 3>;
+ }
+
+ let VAddrDwords = 4 in {
+ def _V4 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_128>;
+ def _V4_gfx10 : MIMG_NoSampler_gfx10<op, asm, dst_rc, VReg_128>;
+ def _V4_nsa_gfx10 : MIMG_NoSampler_nsa_gfx10<op, asm, dst_rc, 4,
+ !if(enableDisasm, "AMDGPU", "")>;
+ }
+ }
+}
+
+multiclass MIMG_NoSampler <bits<8> op, string asm, bit has_d16, bit mip = 0,
bit isResInfo = 0> {
- def "" : MIMGBaseOpcode {
+ def "" : MIMGBaseOpcode, PredicateControl {
let Coordinates = !if(isResInfo, 0, 1);
let LodOrClampOrMip = mip;
let HasD16 = has_d16;
@@ -180,26 +307,16 @@ multiclass MIMG_NoSampler <bits<7> op, string asm, bit has_d16, bit mip = 0,
defm _V3 : MIMG_NoSampler_Src_Helper <op, asm, VReg_96, 0>;
let VDataDwords = 4 in
defm _V4 : MIMG_NoSampler_Src_Helper <op, asm, VReg_128, 0>;
- let VDataDwords = 8 in
- defm _V8 : MIMG_NoSampler_Src_Helper <op, asm, VReg_256, 0>;
+ let VDataDwords = 5 in
+ defm _V5 : MIMG_NoSampler_Src_Helper <op, asm, VReg_160, 0>;
}
}
-class MIMG_Store_Helper <bits<7> op, string asm,
+class MIMG_Store_Helper <bits<8> op, string asm,
RegisterClass data_rc,
RegisterClass addr_rc,
string dns = "">
- : MIMG <(outs), dns>,
- MIMGe<op> {
- let ssamp = 0;
- let d16 = !if(BaseOpcode.HasD16, ?, 0);
-
- let mayLoad = 0;
- let mayStore = 1;
- let hasSideEffects = 0;
- let hasPostISelHook = 0;
- let DisableWQM = 1;
-
+ : MIMG_gfx6789<op, (outs), dns> {
let InOperandList = !con((ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc,
DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc,
R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da),
@@ -208,21 +325,63 @@ class MIMG_Store_Helper <bits<7> op, string asm,
#!if(BaseOpcode.HasD16, "$d16", "");
}
-multiclass MIMG_Store_Addr_Helper <bits<7> op, string asm,
+class MIMG_Store_gfx10<int op, string opcode,
+ RegisterClass DataRC, RegisterClass AddrRC,
+ string dns="">
+ : MIMG_gfx10<op, (outs), dns> {
+ let InOperandList = !con((ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256:$srsrc,
+ DMask:$dmask, Dim:$dim, UNorm:$unorm, DLC:$dlc,
+ GLC:$glc, SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe),
+ !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
+ let AsmString = opcode#" $vdata, $vaddr0, $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$tfe$lwe"
+ #!if(BaseOpcode.HasD16, "$d16", "");
+}
+
+class MIMG_Store_nsa_gfx10<int op, string opcode,
+ RegisterClass DataRC, int num_addrs,
+ string dns="">
+ : MIMG_nsa_gfx10<op, (outs), num_addrs, dns> {
+ let InOperandList = !con((ins DataRC:$vdata),
+ AddrIns,
+ (ins SReg_256:$srsrc, DMask:$dmask,
+ Dim:$dim, UNorm:$unorm, DLC:$dlc, GLC:$glc,
+ SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe),
+ !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
+ let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$tfe$lwe"
+ #!if(BaseOpcode.HasD16, "$d16", "");
+}
+
+multiclass MIMG_Store_Addr_Helper <int op, string asm,
RegisterClass data_rc,
bit enableDisasm> {
- let VAddrDwords = 1 in
- def NAME # _V1 : MIMG_Store_Helper <op, asm, data_rc, VGPR_32,
- !if(enableDisasm, "AMDGPU", "")>;
- let VAddrDwords = 2 in
- def NAME # _V2 : MIMG_Store_Helper <op, asm, data_rc, VReg_64>;
- let VAddrDwords = 3 in
- def NAME # _V3 : MIMG_Store_Helper <op, asm, data_rc, VReg_96>;
- let VAddrDwords = 4 in
- def NAME # _V4 : MIMG_Store_Helper <op, asm, data_rc, VReg_128>;
-}
-
-multiclass MIMG_Store <bits<7> op, string asm, bit has_d16, bit mip = 0> {
+ let mayLoad = 0, mayStore = 1, hasSideEffects = 0, hasPostISelHook = 0,
+ DisableWQM = 1, ssamp = 0 in {
+ let VAddrDwords = 1 in {
+ def _V1 : MIMG_Store_Helper <op, asm, data_rc, VGPR_32,
+ !if(enableDisasm, "AMDGPU", "")>;
+ def _V1_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VGPR_32,
+ !if(enableDisasm, "AMDGPU", "")>;
+ }
+ let VAddrDwords = 2 in {
+ def _V2 : MIMG_Store_Helper <op, asm, data_rc, VReg_64>;
+ def _V2_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VReg_64>;
+ def _V2_nsa_gfx10 : MIMG_Store_nsa_gfx10 <op, asm, data_rc, 2>;
+ }
+ let VAddrDwords = 3 in {
+ def _V3 : MIMG_Store_Helper <op, asm, data_rc, VReg_96>;
+ def _V3_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VReg_96>;
+ def _V3_nsa_gfx10 : MIMG_Store_nsa_gfx10 <op, asm, data_rc, 3>;
+ }
+ let VAddrDwords = 4 in {
+ def _V4 : MIMG_Store_Helper <op, asm, data_rc, VReg_128>;
+ def _V4_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VReg_128>;
+ def _V4_nsa_gfx10 : MIMG_Store_nsa_gfx10 <op, asm, data_rc, 4,
+ !if(enableDisasm, "AMDGPU", "")>;
+ }
+ }
+}
+
+multiclass MIMG_Store <bits<8> op, string asm, bit has_d16, bit mip = 0> {
def "" : MIMGBaseOpcode {
let Store = 1;
let LodOrClampOrMip = mip;
@@ -241,15 +400,9 @@ multiclass MIMG_Store <bits<7> op, string asm, bit has_d16, bit mip = 0> {
}
}
-class MIMG_Atomic_Helper <string asm, RegisterClass data_rc,
- RegisterClass addr_rc, string dns="",
- bit enableDasm = 0>
- : MIMG <(outs data_rc:$vdst), !if(enableDasm, dns, "")> {
- let mayLoad = 1;
- let mayStore = 1;
- let hasSideEffects = 1; // FIXME: Remove this
- let hasPostISelHook = 0;
- let DisableWQM = 1;
+class MIMG_Atomic_gfx6789_base <bits<8> op, string asm, RegisterClass data_rc,
+ RegisterClass addr_rc, string dns="">
+ : MIMG_gfx6789 <op, (outs data_rc:$vdst), dns> {
let Constraints = "$vdst = $vdata";
let AsmMatchConverter = "cvtMIMGAtomic";
@@ -259,39 +412,80 @@ class MIMG_Atomic_Helper <string asm, RegisterClass data_rc,
let AsmString = asm#" $vdst, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da";
}
-multiclass MIMG_Atomic_Helper_m <mimg op, string asm, RegisterClass data_rc,
- RegisterClass addr_rc, bit enableDasm = 0> {
- let ssamp = 0, d16 = 0 in {
- def _si : MIMG_Atomic_Helper<asm, data_rc, addr_rc, "SICI", enableDasm>,
- SIMCInstr<NAME, SIEncodingFamily.SI>,
- MIMGe<op.SI> {
- let AssemblerPredicates = [isSICI];
- let DisableDecoder = DisableSIDecoder;
- }
+class MIMG_Atomic_si<mimg op, string asm, RegisterClass data_rc,
+ RegisterClass addr_rc, bit enableDasm = 0>
+ : MIMG_Atomic_gfx6789_base<op.SI_GFX10, asm, data_rc, addr_rc,
+ !if(enableDasm, "GFX6GFX7", "")> {
+ let AssemblerPredicates = [isGFX6GFX7];
+}
- def _vi : MIMG_Atomic_Helper<asm, data_rc, addr_rc, "VI", enableDasm>,
- SIMCInstr<NAME, SIEncodingFamily.VI>,
- MIMGe<op.VI> {
- let AssemblerPredicates = [isVI];
- let DisableDecoder = DisableVIDecoder;
- let MIMGEncoding = MIMGEncGfx8;
- }
- }
+class MIMG_Atomic_vi<mimg op, string asm, RegisterClass data_rc,
+ RegisterClass addr_rc, bit enableDasm = 0>
+ : MIMG_Atomic_gfx6789_base<op.VI, asm, data_rc, addr_rc, !if(enableDasm, "GFX8", "")> {
+ let AssemblerPredicates = [isGFX8GFX9];
+ let MIMGEncoding = MIMGEncGfx8;
+}
+
+class MIMG_Atomic_gfx10<mimg op, string opcode,
+ RegisterClass DataRC, RegisterClass AddrRC,
+ bit enableDisasm = 0>
+ : MIMG_gfx10<!cast<int>(op.SI_GFX10), (outs DataRC:$vdst),
+ !if(enableDisasm, "AMDGPU", "")> {
+ let Constraints = "$vdst = $vdata";
+ let AsmMatchConverter = "cvtMIMGAtomic";
+
+ let InOperandList = (ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256:$srsrc,
+ DMask:$dmask, Dim:$dim, UNorm:$unorm, DLC:$dlc,
+ GLC:$glc, SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe);
+ let AsmString = opcode#" $vdst, $vaddr0, $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$tfe$lwe";
+}
+
+class MIMG_Atomic_nsa_gfx10<mimg op, string opcode,
+ RegisterClass DataRC, int num_addrs,
+ bit enableDisasm = 0>
+ : MIMG_nsa_gfx10<!cast<int>(op.SI_GFX10), (outs DataRC:$vdst), num_addrs,
+ !if(enableDisasm, "AMDGPU", "")> {
+ let Constraints = "$vdst = $vdata";
+ let AsmMatchConverter = "cvtMIMGAtomic";
+
+ let InOperandList = !con((ins DataRC:$vdata),
+ AddrIns,
+ (ins SReg_256:$srsrc, DMask:$dmask,
+ Dim:$dim, UNorm:$unorm, DLC:$dlc, GLC:$glc,
+ SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe));
+ let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$tfe$lwe";
}
multiclass MIMG_Atomic_Addr_Helper_m <mimg op, string asm,
RegisterClass data_rc,
bit enableDasm = 0> {
- // _V* variants have different address size, but the size is not encoded.
- // So only one variant can be disassembled. V1 looks the safest to decode.
- let VAddrDwords = 1 in
- defm _V1 : MIMG_Atomic_Helper_m <op, asm, data_rc, VGPR_32, enableDasm>;
- let VAddrDwords = 2 in
- defm _V2 : MIMG_Atomic_Helper_m <op, asm, data_rc, VReg_64>;
- let VAddrDwords = 3 in
- defm _V3 : MIMG_Atomic_Helper_m <op, asm, data_rc, VReg_96>;
- let VAddrDwords = 4 in
- defm _V4 : MIMG_Atomic_Helper_m <op, asm, data_rc, VReg_128>;
+ let hasSideEffects = 1, // FIXME: remove this
+ mayLoad = 1, mayStore = 1, hasPostISelHook = 0, DisableWQM = 1,
+ ssamp = 0 in {
+ let VAddrDwords = 1 in {
+ def _V1_si : MIMG_Atomic_si <op, asm, data_rc, VGPR_32, enableDasm>;
+ def _V1_vi : MIMG_Atomic_vi <op, asm, data_rc, VGPR_32, enableDasm>;
+ def _V1_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VGPR_32, enableDasm>;
+ }
+ let VAddrDwords = 2 in {
+ def _V2_si : MIMG_Atomic_si <op, asm, data_rc, VReg_64, 0>;
+ def _V2_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_64, 0>;
+ def _V2_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_64, 0>;
+ def _V2_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 2, 0>;
+ }
+ let VAddrDwords = 3 in {
+ def _V3_si : MIMG_Atomic_si <op, asm, data_rc, VReg_96, 0>;
+ def _V3_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_96, 0>;
+ def _V3_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_96, 0>;
+ def _V3_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 3, 0>;
+ }
+ let VAddrDwords = 4 in {
+ def _V4_si : MIMG_Atomic_si <op, asm, data_rc, VReg_128, 0>;
+ def _V4_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_128, 0>;
+ def _V4_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_128, 0>;
+ def _V4_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 4, enableDasm>;
+ }
+ }
}
multiclass MIMG_Atomic <mimg op, string asm, bit isCmpSwap = 0> { // 64-bit atomics
@@ -311,12 +505,9 @@ multiclass MIMG_Atomic <mimg op, string asm, bit isCmpSwap = 0> { // 64-bit atom
}
}
-class MIMG_Sampler_Helper <bits<7> op, string asm, RegisterClass dst_rc,
+class MIMG_Sampler_Helper <bits<8> op, string asm, RegisterClass dst_rc,
RegisterClass src_rc, string dns="">
- : MIMG <(outs dst_rc:$vdata), dns>,
- MIMGe<op> {
- let d16 = !if(BaseOpcode.HasD16, ?, 0);
-
+ : MIMG_gfx6789 <op, (outs dst_rc:$vdata), dns> {
let InOperandList = !con((ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp,
DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc,
R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da),
@@ -325,6 +516,33 @@ class MIMG_Sampler_Helper <bits<7> op, string asm, RegisterClass dst_rc,
#!if(BaseOpcode.HasD16, "$d16", "");
}
+class MIMG_Sampler_gfx10<int op, string opcode,
+ RegisterClass DataRC, RegisterClass AddrRC,
+ string dns="">
+ : MIMG_gfx10<op, (outs DataRC:$vdata), dns> {
+ let InOperandList = !con((ins AddrRC:$vaddr0, SReg_256:$srsrc, SReg_128:$ssamp,
+ DMask:$dmask, Dim:$dim, UNorm:$unorm, DLC:$dlc,
+ GLC:$glc, SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe),
+ !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
+ let AsmString = opcode#" $vdata, $vaddr0, $srsrc, $ssamp$dmask$dim$unorm"
+ #"$dlc$glc$slc$r128$tfe$lwe"
+ #!if(BaseOpcode.HasD16, "$d16", "");
+}
+
+class MIMG_Sampler_nsa_gfx10<int op, string opcode,
+ RegisterClass DataRC, int num_addrs,
+ string dns="">
+ : MIMG_nsa_gfx10<op, (outs DataRC:$vdata), num_addrs, dns> {
+ let InOperandList = !con(AddrIns,
+ (ins SReg_256:$srsrc, SReg_128:$ssamp, DMask:$dmask,
+ Dim:$dim, UNorm:$unorm, DLC:$dlc, GLC:$glc,
+ SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe),
+ !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
+ let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc, $ssamp$dmask$dim$unorm"
+ #"$dlc$glc$slc$r128$tfe$lwe"
+ #!if(BaseOpcode.HasD16, "$d16", "");
+}
+
class MIMGAddrSize<int dw, bit enable_disasm> {
int NumWords = dw;
@@ -341,6 +559,11 @@ class MIMGAddrSize<int dw, bit enable_disasm> {
bit Disassemble = enable_disasm;
}
+// Return whether x is in lst.
+class isIntInList<int x, list<int> lst> {
+ bit ret = !foldl(0, lst, lhs, y, !or(lhs, !eq(x, y)));
+}
+
// Return whether a value inside the range [min, max] (endpoints inclusive)
// is in the given list.
class isRangeInList<int min, int max, list<int> lst> {
@@ -376,16 +599,41 @@ class MIMG_Sampler_AddrSizes<AMDGPUSampleVariant sample> {
!listconcat(lhs.List, [MIMGAddrSize<dw, !empty(lhs.List)>]),
!if(!eq(dw, 3), 3, !add(dw, 1))>, // we still need _V4 for codegen w/ 3 dwords
lhs)).List;
-}
-multiclass MIMG_Sampler_Src_Helper <bits<7> op, string asm,
+ // For NSA, generate machine instructions for all possible numbers of words
+ // except 1 (which is already covered by the non-NSA case).
+ // The disassembler defaults to the largest number of arguments among the
+ // variants with the same number of NSA words, and custom code then derives
+ // the exact variant based on the sample variant and the image dimension.
+ list<MIMGAddrSize> NSAInstrs =
+ !foldl([]<MIMGAddrSize>, [[12, 11, 10], [9, 8, 7, 6], [5, 4, 3, 2]], prev, nsa_group,
+ !listconcat(prev,
+ !foldl([]<MIMGAddrSize>, nsa_group, lhs, dw,
+ !if(isIntInList<dw, AllNumAddrWords>.ret,
+ !listconcat(lhs, [MIMGAddrSize<dw, !empty(lhs)>]),
+ lhs))));
+}
+
+multiclass MIMG_Sampler_Src_Helper <bits<8> op, string asm,
AMDGPUSampleVariant sample, RegisterClass dst_rc,
bit enableDisasm = 0> {
foreach addr = MIMG_Sampler_AddrSizes<sample>.MachineInstrs in {
- let VAddrDwords = addr.NumWords in
- def _V # addr.NumWords
- : MIMG_Sampler_Helper <op, asm, dst_rc, addr.RegClass,
- !if(!and(enableDisasm, addr.Disassemble), "AMDGPU", "")>;
+ let VAddrDwords = addr.NumWords in {
+ def _V # addr.NumWords
+ : MIMG_Sampler_Helper <op, asm, dst_rc, addr.RegClass,
+ !if(!and(enableDisasm, addr.Disassemble), "AMDGPU", "")>;
+ def _V # addr.NumWords # _gfx10
+ : MIMG_Sampler_gfx10 <op, asm, dst_rc, addr.RegClass,
+ !if(!and(enableDisasm, addr.Disassemble), "AMDGPU", "")>;
+ }
+ }
+
+ foreach addr = MIMG_Sampler_AddrSizes<sample>.NSAInstrs in {
+ let VAddrDwords = addr.NumWords in {
+ def _V # addr.NumWords # _nsa_gfx10
+ : MIMG_Sampler_nsa_gfx10<op, asm, dst_rc, addr.NumWords,
+ !if(!and(enableDisasm, addr.Disassemble), "AMDGPU", "")>;
+ }
}
}
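For reference, the NSAInstrs fold added in this hunk is equivalent to the following C++ sketch. AddrSize and buildNSAInstrs are illustrative names only (not part of the tree); the sketch assumes the NSA register-group buckets [12-10], [9-6], [5-2] spelled out in the TableGen above.

#include <vector>

// Hypothetical mirror of the TableGen MIMGAddrSize record.
struct AddrSize {
  int NumWords;
  bool Disassemble; // first size emitted in its NSA bucket carries the decoder default
};

// Sketch of the NSAInstrs fold: keep only the sizes present in AllNumAddrWords
// (the isIntInList check), walking the three NSA buckets in order and letting
// the first surviving size of each bucket get the disassembler default.
std::vector<AddrSize> buildNSAInstrs(const std::vector<int> &AllNumAddrWords) {
  const std::vector<std::vector<int>> Buckets = {
      {12, 11, 10}, {9, 8, 7, 6}, {5, 4, 3, 2}};
  std::vector<AddrSize> Result;
  for (const auto &Bucket : Buckets) {
    bool First = true;                    // mirrors !empty(lhs)
    for (int DW : Bucket) {
      bool Present = false;               // mirrors isIntInList<dw, AllNumAddrWords>.ret
      for (int Y : AllNumAddrWords)
        Present |= (DW == Y);
      if (!Present)
        continue;
      Result.push_back({DW, First});
      First = false;
    }
  }
  return Result;
}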
@@ -397,7 +645,7 @@ class MIMG_Sampler_BaseOpcode<AMDGPUSampleVariant sample>
let LodOrClampOrMip = !ne(sample.LodOrClamp, "");
}
-multiclass MIMG_Sampler <bits<7> op, AMDGPUSampleVariant sample, bit wqm = 0,
+multiclass MIMG_Sampler <bits<8> op, AMDGPUSampleVariant sample, bit wqm = 0,
bit isGetLod = 0,
string asm = "image_sample"#sample.LowerCaseMod> {
def "" : MIMG_Sampler_BaseOpcode<sample> {
@@ -414,15 +662,15 @@ multiclass MIMG_Sampler <bits<7> op, AMDGPUSampleVariant sample, bit wqm = 0,
defm _V3 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_96>;
let VDataDwords = 4 in
defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_128>;
- let VDataDwords = 8 in
- defm _V8 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_256>;
+ let VDataDwords = 5 in
+ defm _V5 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_160>;
}
}
-multiclass MIMG_Sampler_WQM <bits<7> op, AMDGPUSampleVariant sample>
+multiclass MIMG_Sampler_WQM <bits<8> op, AMDGPUSampleVariant sample>
: MIMG_Sampler<op, sample, 1>;
-multiclass MIMG_Gather <bits<7> op, AMDGPUSampleVariant sample, bit wqm = 0,
+multiclass MIMG_Gather <bits<8> op, AMDGPUSampleVariant sample, bit wqm = 0,
string asm = "image_gather4"#sample.LowerCaseMod> {
def "" : MIMG_Sampler_BaseOpcode<sample> {
let HasD16 = 1;
@@ -435,12 +683,12 @@ multiclass MIMG_Gather <bits<7> op, AMDGPUSampleVariant sample, bit wqm = 0,
defm _V2 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_64>; /* for packed D16 only */
let VDataDwords = 4 in
defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_128, 1>;
- let VDataDwords = 8 in
- defm _V8 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_256>;
+ let VDataDwords = 5 in
+ defm _V5 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_160>;
}
}
-multiclass MIMG_Gather_WQM <bits<7> op, AMDGPUSampleVariant sample>
+multiclass MIMG_Gather_WQM <bits<8> op, AMDGPUSampleVariant sample>
: MIMG_Gather<op, sample, 1>;
//===----------------------------------------------------------------------===//
@@ -473,9 +721,11 @@ defm IMAGE_ATOMIC_OR : MIMG_Atomic <mimg<0x19>, "image_atomic_or">;
defm IMAGE_ATOMIC_XOR : MIMG_Atomic <mimg<0x1a>, "image_atomic_xor">;
defm IMAGE_ATOMIC_INC : MIMG_Atomic <mimg<0x1b>, "image_atomic_inc">;
defm IMAGE_ATOMIC_DEC : MIMG_Atomic <mimg<0x1c>, "image_atomic_dec">;
+//let FPAtomic = 1 in {
//def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"image_atomic_fcmpswap", 0x0000001d, 1>; -- not on VI
//def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"image_atomic_fmin", 0x0000001e>; -- not on VI
//def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"image_atomic_fmax", 0x0000001f>; -- not on VI
+//} // End let FPAtomic = 1
defm IMAGE_SAMPLE : MIMG_Sampler_WQM <0x00000020, AMDGPUSample>;
defm IMAGE_SAMPLE_CL : MIMG_Sampler_WQM <0x00000021, AMDGPUSample_cl>;
defm IMAGE_SAMPLE_D : MIMG_Sampler <0x00000022, AMDGPUSample_d>;
@@ -581,3 +831,7 @@ def : MIMGLZMapping<IMAGE_GATHER4_L, IMAGE_GATHER4_LZ>;
def : MIMGLZMapping<IMAGE_GATHER4_C_L, IMAGE_GATHER4_C_LZ>;
def : MIMGLZMapping<IMAGE_GATHER4_L_O, IMAGE_GATHER4_LZ_O>;
def : MIMGLZMapping<IMAGE_GATHER4_C_L_O, IMAGE_GATHER4_C_LZ_O>;
+
+// MIP to NONMIP Optimization Mapping
+def : MIMGMIPMapping<IMAGE_LOAD_MIP, IMAGE_LOAD>;
+def : MIMGMIPMapping<IMAGE_STORE_MIP, IMAGE_STORE>;
diff --git a/lib/Target/AMDGPU/R600.td b/lib/Target/AMDGPU/R600.td
index 5c9c1c1ed504..1d11da969474 100644
--- a/lib/Target/AMDGPU/R600.td
+++ b/lib/Target/AMDGPU/R600.td
@@ -1,9 +1,8 @@
//===-- R600.td - R600 Tablegen files ----------------------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/R600AsmPrinter.cpp b/lib/Target/AMDGPU/R600AsmPrinter.cpp
index 68f8c30775b8..3fb18862fca8 100644
--- a/lib/Target/AMDGPU/R600AsmPrinter.cpp
+++ b/lib/Target/AMDGPU/R600AsmPrinter.cpp
@@ -1,9 +1,8 @@
//===-- R600AsmPrinter.cpp - R600 Assembly printer -----------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/R600AsmPrinter.h b/lib/Target/AMDGPU/R600AsmPrinter.h
index 079fc707b03c..0da9526d716e 100644
--- a/lib/Target/AMDGPU/R600AsmPrinter.h
+++ b/lib/Target/AMDGPU/R600AsmPrinter.h
@@ -1,9 +1,8 @@
//===-- R600AsmPrinter.h - Print R600 assembly code -------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/R600ClauseMergePass.cpp b/lib/Target/AMDGPU/R600ClauseMergePass.cpp
index 0c62d6a4b3d9..290a960ae901 100644
--- a/lib/Target/AMDGPU/R600ClauseMergePass.cpp
+++ b/lib/Target/AMDGPU/R600ClauseMergePass.cpp
@@ -1,9 +1,8 @@
//===-- R600ClauseMergePass - Merge consecutive CF_ALU -------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp b/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
index a19020276f35..8098b81d1ea2 100644
--- a/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
+++ b/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
@@ -1,9 +1,8 @@
//===- R600ControlFlowFinalizer.cpp - Finalize Control Flow Inst ----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/R600Defines.h b/lib/Target/AMDGPU/R600Defines.h
index 0d33d82e8e0f..d72534908dcf 100644
--- a/lib/Target/AMDGPU/R600Defines.h
+++ b/lib/Target/AMDGPU/R600Defines.h
@@ -1,9 +1,8 @@
//===-- R600Defines.h - R600 Helper Macros ----------------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
/// \file
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp b/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp
index 679cf18d2c20..b97e3c8b8dd7 100644
--- a/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp
+++ b/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp
@@ -1,9 +1,8 @@
//===-- R600EmitClauseMarkers.cpp - Emit CF_ALU ---------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp b/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
index b924ff019dd1..c6e8a060d8a0 100644
--- a/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
+++ b/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
@@ -1,9 +1,8 @@
//===- R600ExpandSpecialInstrs.cpp - Expand special instructions ----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/R600FrameLowering.cpp b/lib/Target/AMDGPU/R600FrameLowering.cpp
index 37787b3c5f72..d9aa9ebe878d 100644
--- a/lib/Target/AMDGPU/R600FrameLowering.cpp
+++ b/lib/Target/AMDGPU/R600FrameLowering.cpp
@@ -1,9 +1,8 @@
//===----------------------- R600FrameLowering.cpp ------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//==-----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/R600FrameLowering.h b/lib/Target/AMDGPU/R600FrameLowering.h
index fe367d73682f..950e238f4979 100644
--- a/lib/Target/AMDGPU/R600FrameLowering.h
+++ b/lib/Target/AMDGPU/R600FrameLowering.h
@@ -1,9 +1,8 @@
//===--------------------- R600FrameLowering.h ------------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp
index e2a0f05d2b34..f80a53ba1dc6 100644
--- a/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -1,9 +1,8 @@
//===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -1240,11 +1239,13 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
+ const bool TruncatingStore = StoreNode->isTruncatingStore();
+
// Neither LOCAL nor PRIVATE can do vectors at the moment
- if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS) &&
+ if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS ||
+ TruncatingStore) &&
VT.isVector()) {
- if ((AS == AMDGPUAS::PRIVATE_ADDRESS) &&
- StoreNode->isTruncatingStore()) {
+ if ((AS == AMDGPUAS::PRIVATE_ADDRESS) && TruncatingStore) {
// Add an extra level of chain to isolate this vector
SDValue NewChain = DAG.getNode(AMDGPUISD::DUMMY_CHAIN, DL, MVT::Other, Chain);
// TODO: can the chain be replaced without creating a new store?
@@ -1260,7 +1261,8 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
unsigned Align = StoreNode->getAlignment();
if (Align < MemVT.getStoreSize() &&
- !allowsMisalignedMemoryAccesses(MemVT, AS, Align, nullptr)) {
+ !allowsMisalignedMemoryAccesses(
+ MemVT, AS, Align, StoreNode->getMemOperand()->getFlags(), nullptr)) {
return expandUnalignedStore(StoreNode, DAG);
}
@@ -1270,7 +1272,7 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
if (AS == AMDGPUAS::GLOBAL_ADDRESS) {
// It is beneficial to create MSKOR here instead of combiner to avoid
// artificial dependencies introduced by RMW
- if (StoreNode->isTruncatingStore()) {
+ if (TruncatingStore) {
assert(VT.bitsLE(MVT::i32));
SDValue MaskConstant;
if (MemVT == MVT::i8) {
@@ -1310,8 +1312,8 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
// Convert pointer from byte address to dword address.
Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, PtrVT, DWordAddr);
- if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
- llvm_unreachable("Truncated and indexed stores not supported yet");
+ if (StoreNode->isIndexed()) {
+ llvm_unreachable("Indexed stores not supported yet");
} else {
Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
}
@@ -1662,10 +1664,9 @@ bool R600TargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
return true;
}
-bool R600TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
- unsigned AddrSpace,
- unsigned Align,
- bool *IsFast) const {
+bool R600TargetLowering::allowsMisalignedMemoryAccesses(
+ EVT VT, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags,
+ bool *IsFast) const {
if (IsFast)
*IsFast = false;
@@ -1713,6 +1714,12 @@ static SDValue CompactSwizzlableVector(
if (NewBldVec[i].isUndef())
continue;
+ // Fix spurious warning with gcc 7.3 -O3
+ // warning: array subscript is above array bounds [-Warray-bounds]
+ // if (NewBldVec[i] == NewBldVec[j]) {
+ // ~~~~~~~~~~~^
+ if (i >= 4)
+ continue;
for (unsigned j = 0; j < i; j++) {
if (NewBldVec[i] == NewBldVec[j]) {
NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
diff --git a/lib/Target/AMDGPU/R600ISelLowering.h b/lib/Target/AMDGPU/R600ISelLowering.h
index 767c3c7bd5bf..b560da8e91d9 100644
--- a/lib/Target/AMDGPU/R600ISelLowering.h
+++ b/lib/Target/AMDGPU/R600ISelLowering.h
@@ -1,9 +1,8 @@
//===-- R600ISelLowering.h - R600 DAG Lowering Interface -*- C++ -*--------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -50,9 +49,10 @@ public:
bool canMergeStoresTo(unsigned AS, EVT MemVT,
const SelectionDAG &DAG) const override;
- bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS,
- unsigned Align,
- bool *IsFast) const override;
+ bool allowsMisalignedMemoryAccesses(
+ EVT VT, unsigned AS, unsigned Align,
+ MachineMemOperand::Flags Flags = MachineMemOperand::MONone,
+ bool *IsFast = nullptr) const override;
private:
unsigned Gen;
diff --git a/lib/Target/AMDGPU/R600InstrFormats.td b/lib/Target/AMDGPU/R600InstrFormats.td
index 687a9affa138..f62e6313b148 100644
--- a/lib/Target/AMDGPU/R600InstrFormats.td
+++ b/lib/Target/AMDGPU/R600InstrFormats.td
@@ -1,9 +1,8 @@
//===-- R600InstrFormats.td - R600 Instruction Encodings ------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/R600InstrInfo.cpp b/lib/Target/AMDGPU/R600InstrInfo.cpp
index 9cc3e5f3c314..d9e839fe2035 100644
--- a/lib/Target/AMDGPU/R600InstrInfo.cpp
+++ b/lib/Target/AMDGPU/R600InstrInfo.cpp
@@ -1,9 +1,8 @@
//===-- R600InstrInfo.cpp - R600 Instruction Information ------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -402,6 +401,7 @@ Swizzle(std::vector<std::pair<int, unsigned>> Src,
}
static unsigned getTransSwizzle(R600InstrInfo::BankSwizzle Swz, unsigned Op) {
+ assert(Op < 3 && "Out of range swizzle index");
switch (Swz) {
case R600InstrInfo::ALU_VEC_012_SCL_210: {
unsigned Cycles[3] = { 2, 1, 0};
diff --git a/lib/Target/AMDGPU/R600InstrInfo.h b/lib/Target/AMDGPU/R600InstrInfo.h
index e6e34dc125f4..00d96c9676aa 100644
--- a/lib/Target/AMDGPU/R600InstrInfo.h
+++ b/lib/Target/AMDGPU/R600InstrInfo.h
@@ -1,9 +1,8 @@
//===-- R600InstrInfo.h - R600 Instruction Info Interface -------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/R600Instructions.td b/lib/Target/AMDGPU/R600Instructions.td
index 10e873755222..f40eece859ee 100644
--- a/lib/Target/AMDGPU/R600Instructions.td
+++ b/lib/Target/AMDGPU/R600Instructions.td
@@ -1,9 +1,8 @@
//===-- R600Instructions.td - R600 Instruction defs -------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -296,6 +295,34 @@ class VTX_READ <string name, dag outs, list<dag> pattern>
let VTXInst = 1;
}
+// FIXME: Deprecated.
+class LocalLoad <SDPatternOperator op> : LoadFrag <op>, LocalAddress;
+
+class AZExtLoadBase <SDPatternOperator ld_node>: PatFrag<(ops node:$ptr),
+ (ld_node node:$ptr), [{
+ LoadSDNode *L = cast<LoadSDNode>(N);
+ return L->getExtensionType() == ISD::ZEXTLOAD ||
+ L->getExtensionType() == ISD::EXTLOAD;
+}]>;
+
+def az_extload : AZExtLoadBase <unindexedload>;
+
+def az_extloadi8 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{
+ return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+
+def az_extloadi16 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{
+ return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+
+def az_extloadi32 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{
+ return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
+// FIXME: These are deprecated
+def az_extloadi8_local : LocalLoad <az_extloadi8>;
+def az_extloadi16_local : LocalLoad <az_extloadi16>;
+
class LoadParamFrag <PatFrag load_type> : PatFrag <
(ops node:$ptr), (load_type node:$ptr),
[{ return isConstantLoad(cast<LoadSDNode>(N), 0) ||
diff --git a/lib/Target/AMDGPU/R600MachineFunctionInfo.cpp b/lib/Target/AMDGPU/R600MachineFunctionInfo.cpp
index 3ca319c6c6c2..65011a9eadf8 100644
--- a/lib/Target/AMDGPU/R600MachineFunctionInfo.cpp
+++ b/lib/Target/AMDGPU/R600MachineFunctionInfo.cpp
@@ -1,9 +1,8 @@
//===-- R600MachineFunctionInfo.cpp - R600 Machine Function Info-*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
/// \file
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/R600MachineFunctionInfo.h b/lib/Target/AMDGPU/R600MachineFunctionInfo.h
index 29ac0920f997..6a5ac9023329 100644
--- a/lib/Target/AMDGPU/R600MachineFunctionInfo.h
+++ b/lib/Target/AMDGPU/R600MachineFunctionInfo.h
@@ -1,9 +1,8 @@
//===-- R600MachineFunctionInfo.h - R600 Machine Function Info ----*- C++ -*-=//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/R600MachineScheduler.cpp b/lib/Target/AMDGPU/R600MachineScheduler.cpp
index 7769a35aadce..34267a909b5e 100644
--- a/lib/Target/AMDGPU/R600MachineScheduler.cpp
+++ b/lib/Target/AMDGPU/R600MachineScheduler.cpp
@@ -1,9 +1,8 @@
//===-- R600MachineScheduler.cpp - R600 Scheduler Interface -*- C++ -*-----===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/R600MachineScheduler.h b/lib/Target/AMDGPU/R600MachineScheduler.h
index 8a9a8d3d1e23..bc66f2ef5907 100644
--- a/lib/Target/AMDGPU/R600MachineScheduler.h
+++ b/lib/Target/AMDGPU/R600MachineScheduler.h
@@ -1,9 +1,8 @@
//===-- R600MachineScheduler.h - R600 Scheduler Interface -*- C++ -*-------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp b/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp
index 7de5e2c9577d..1fe92d2269d3 100644
--- a/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp
+++ b/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp
@@ -1,9 +1,8 @@
//===- R600OpenCLImageTypeLoweringPass.cpp ------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
index 692451cb8fe0..9f1cb6582b5c 100644
--- a/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
+++ b/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
@@ -1,9 +1,8 @@
//===- R600MergeVectorRegisters.cpp ---------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -57,17 +56,12 @@ using namespace llvm;
#define DEBUG_TYPE "vec-merger"
-static bool
-isImplicitlyDef(MachineRegisterInfo &MRI, unsigned Reg) {
- for (MachineRegisterInfo::def_instr_iterator It = MRI.def_instr_begin(Reg),
- E = MRI.def_instr_end(); It != E; ++It) {
- return (*It).isImplicitDef();
- }
- if (MRI.isReserved(Reg)) {
+static bool isImplicitlyDef(MachineRegisterInfo &MRI, unsigned Reg) {
+ assert(MRI.isSSA());
+ if (TargetRegisterInfo::isPhysicalRegister(Reg))
return false;
- }
- llvm_unreachable("Reg without a def");
- return false;
+ const MachineInstr *MI = MRI.getUniqueVRegDef(Reg);
+ return MI && MI->isImplicitDef();
}
namespace {
diff --git a/lib/Target/AMDGPU/R600Packetizer.cpp b/lib/Target/AMDGPU/R600Packetizer.cpp
index 612c62b514fd..df200baf11c1 100644
--- a/lib/Target/AMDGPU/R600Packetizer.cpp
+++ b/lib/Target/AMDGPU/R600Packetizer.cpp
@@ -1,9 +1,8 @@
//===----- R600Packetizer.cpp - VLIW packetizer ---------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -187,8 +186,8 @@ public:
    // Do MII and MIJ share the same pred_sel?
int OpI = TII->getOperandIdx(MII->getOpcode(), R600::OpName::pred_sel),
OpJ = TII->getOperandIdx(MIJ->getOpcode(), R600::OpName::pred_sel);
- unsigned PredI = (OpI > -1)?MII->getOperand(OpI).getReg():0,
- PredJ = (OpJ > -1)?MIJ->getOperand(OpJ).getReg():0;
+ Register PredI = (OpI > -1)?MII->getOperand(OpI).getReg() : Register(),
+ PredJ = (OpJ > -1)?MIJ->getOperand(OpJ).getReg() : Register();
if (PredI != PredJ)
return false;
if (SUJ->isSucc(SUI)) {
diff --git a/lib/Target/AMDGPU/R600Processors.td b/lib/Target/AMDGPU/R600Processors.td
index f39b3dc1bfd4..fff884e4848e 100644
--- a/lib/Target/AMDGPU/R600Processors.td
+++ b/lib/Target/AMDGPU/R600Processors.td
@@ -1,9 +1,8 @@
//===-- R600Processors.td - R600 Processor definitions --------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -41,23 +40,24 @@ def FeatureCFALUBug : SubtargetFeature<"cfalubug",
"GPU has CF_ALU bug"
>;
-class R600SubtargetFeatureGeneration <string Value,
+class R600SubtargetFeatureGeneration <string Value, string FeatureName,
list<SubtargetFeature> Implies> :
- SubtargetFeatureGeneration <Value, "R600Subtarget", Implies>;
+ SubtargetFeatureGeneration <Value, FeatureName, "R600Subtarget", Implies>;
-def FeatureR600 : R600SubtargetFeatureGeneration<"R600",
+def FeatureR600 : R600SubtargetFeatureGeneration<"R600", "r600",
[FeatureR600ALUInst, FeatureFetchLimit8, FeatureLocalMemorySize0]
>;
-def FeatureR700 : R600SubtargetFeatureGeneration<"R700",
+def FeatureR700 : R600SubtargetFeatureGeneration<"R700", "r700",
[FeatureFetchLimit16, FeatureLocalMemorySize0]
>;
-def FeatureEvergreen : R600SubtargetFeatureGeneration<"EVERGREEN",
+def FeatureEvergreen : R600SubtargetFeatureGeneration<"EVERGREEN", "evergreen",
[FeatureFetchLimit16, FeatureLocalMemorySize32768]
>;
def FeatureNorthernIslands : R600SubtargetFeatureGeneration<"NORTHERN_ISLANDS",
+ "northern-islands",
[FeatureFetchLimit16, FeatureWavefrontSize64,
FeatureLocalMemorySize32768]
>;
diff --git a/lib/Target/AMDGPU/R600RegisterInfo.cpp b/lib/Target/AMDGPU/R600RegisterInfo.cpp
index 38933e7616a0..685df74490fe 100644
--- a/lib/Target/AMDGPU/R600RegisterInfo.cpp
+++ b/lib/Target/AMDGPU/R600RegisterInfo.cpp
@@ -1,9 +1,8 @@
//===-- R600RegisterInfo.cpp - R600 Register Information ------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -68,7 +67,7 @@ const MCPhysReg *R600RegisterInfo::getCalleeSavedRegs(
return &CalleeSavedReg;
}
-unsigned R600RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+Register R600RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
return R600::NoRegister;
}
diff --git a/lib/Target/AMDGPU/R600RegisterInfo.h b/lib/Target/AMDGPU/R600RegisterInfo.h
index c4c77172b299..9378b70ca580 100644
--- a/lib/Target/AMDGPU/R600RegisterInfo.h
+++ b/lib/Target/AMDGPU/R600RegisterInfo.h
@@ -1,9 +1,8 @@
//===-- R600RegisterInfo.h - R600 Register Info Interface ------*- C++ -*--===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -27,7 +26,7 @@ struct R600RegisterInfo final : public R600GenRegisterInfo {
BitVector getReservedRegs(const MachineFunction &MF) const override;
const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
- unsigned getFrameRegister(const MachineFunction &MF) const override;
+ Register getFrameRegister(const MachineFunction &MF) const override;
/// get the HW encoding for a register's channel.
unsigned getHWRegChan(unsigned reg) const;
diff --git a/lib/Target/AMDGPU/R600Schedule.td b/lib/Target/AMDGPU/R600Schedule.td
index 70fb46c1a7d6..c998fe848193 100644
--- a/lib/Target/AMDGPU/R600Schedule.td
+++ b/lib/Target/AMDGPU/R600Schedule.td
@@ -1,9 +1,8 @@
//===-- R600Schedule.td - R600 Scheduling definitions ------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/R700Instructions.td b/lib/Target/AMDGPU/R700Instructions.td
index 613a0d729bb3..9c9a03209ec2 100644
--- a/lib/Target/AMDGPU/R700Instructions.td
+++ b/lib/Target/AMDGPU/R700Instructions.td
@@ -1,9 +1,8 @@
//===-- R700Instructions.td - R700 Instruction defs -------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/SIAddIMGInit.cpp b/lib/Target/AMDGPU/SIAddIMGInit.cpp
index 69cafef4a351..f8094e35816c 100644
--- a/lib/Target/AMDGPU/SIAddIMGInit.cpp
+++ b/lib/Target/AMDGPU/SIAddIMGInit.cpp
@@ -1,9 +1,8 @@
//===-- SIAddIMGInit.cpp - Add any required IMG inits ---------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
index 98e9ea662324..b764ca7d7061 100644
--- a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
+++ b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
@@ -1,9 +1,8 @@
//===- SIAnnotateControlFlow.cpp ------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -13,12 +12,13 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
@@ -38,6 +38,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
#include <utility>
@@ -56,13 +57,13 @@ class SIAnnotateControlFlow : public FunctionPass {
Type *Boolean;
Type *Void;
- Type *Int64;
+ Type *IntMask;
Type *ReturnStruct;
ConstantInt *BoolTrue;
ConstantInt *BoolFalse;
UndefValue *BoolUndef;
- Constant *Int64Zero;
+ Constant *IntMaskZero;
Function *If;
Function *Else;
@@ -75,6 +76,8 @@ class SIAnnotateControlFlow : public FunctionPass {
LoopInfo *LI;
+ void initialize(Module &M, const GCNSubtarget &ST);
+
bool isUniform(BranchInst *T);
bool isTopOfStack(BasicBlock *BB);
@@ -104,8 +107,6 @@ public:
SIAnnotateControlFlow() : FunctionPass(ID) {}
- bool doInitialization(Module &M) override;
-
bool runOnFunction(Function &F) override;
StringRef getPassName() const override { return "SI annotate control flow"; }
@@ -115,6 +116,7 @@ public:
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<LegacyDivergenceAnalysis>();
AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetPassConfig>();
FunctionPass::getAnalysisUsage(AU);
}
};
@@ -125,31 +127,34 @@ INITIALIZE_PASS_BEGIN(SIAnnotateControlFlow, DEBUG_TYPE,
"Annotate SI Control Flow", false, false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(SIAnnotateControlFlow, DEBUG_TYPE,
"Annotate SI Control Flow", false, false)
char SIAnnotateControlFlow::ID = 0;
/// Initialize all the types and constants used in the pass
-bool SIAnnotateControlFlow::doInitialization(Module &M) {
+void SIAnnotateControlFlow::initialize(Module &M, const GCNSubtarget &ST) {
LLVMContext &Context = M.getContext();
Void = Type::getVoidTy(Context);
Boolean = Type::getInt1Ty(Context);
- Int64 = Type::getInt64Ty(Context);
- ReturnStruct = StructType::get(Boolean, Int64);
+ IntMask = ST.isWave32() ? Type::getInt32Ty(Context)
+ : Type::getInt64Ty(Context);
+ ReturnStruct = StructType::get(Boolean, IntMask);
BoolTrue = ConstantInt::getTrue(Context);
BoolFalse = ConstantInt::getFalse(Context);
BoolUndef = UndefValue::get(Boolean);
- Int64Zero = ConstantInt::get(Int64, 0);
-
- If = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if);
- Else = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_else);
- IfBreak = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if_break);
- Loop = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_loop);
- EndCf = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_end_cf);
- return false;
+ IntMaskZero = ConstantInt::get(IntMask, 0);
+
+ If = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if, { IntMask });
+ Else = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_else,
+ { IntMask, IntMask });
+ IfBreak = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if_break,
+ { IntMask, IntMask });
+ Loop = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_loop, { IntMask });
+ EndCf = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_end_cf, { IntMask });
}
/// Is the branch condition uniform or did the StructurizeCFG pass
@@ -259,14 +264,23 @@ void SIAnnotateControlFlow::handleLoop(BranchInst *Term) {
return;
BasicBlock *Target = Term->getSuccessor(1);
- PHINode *Broken = PHINode::Create(Int64, 0, "phi.broken", &Target->front());
+ PHINode *Broken = PHINode::Create(IntMask, 0, "phi.broken", &Target->front());
Value *Cond = Term->getCondition();
Term->setCondition(BoolTrue);
Value *Arg = handleLoopCondition(Cond, Broken, L, Term);
- for (BasicBlock *Pred : predecessors(Target))
- Broken->addIncoming(Pred == BB ? Arg : Int64Zero, Pred);
+ for (BasicBlock *Pred : predecessors(Target)) {
+ Value *PHIValue = IntMaskZero;
+ if (Pred == BB) // Remember the value of the previous iteration.
+ PHIValue = Arg;
+ // If the backedge from Pred to Target could be executed before the exit
+ // of the loop at BB, it should not reset or change "Broken", which keeps
+ // track of the number of threads that have exited the loop at BB.
+ else if (L->contains(Pred) && DT->dominates(Pred, BB))
+ PHIValue = Broken;
+ Broken->addIncoming(PHIValue, Pred);
+ }
Term->setCondition(CallInst::Create(Loop, Arg, "", Term));
@@ -308,6 +322,10 @@ bool SIAnnotateControlFlow::runOnFunction(Function &F) {
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
DA = &getAnalysis<LegacyDivergenceAnalysis>();
+ TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
+ const TargetMachine &TM = TPC.getTM<TargetMachine>();
+
+ initialize(*F.getParent(), TM.getSubtarget<GCNSubtarget>(F));
for (df_iterator<BasicBlock *> I = df_begin(&F.getEntryBlock()),
E = df_end(&F.getEntryBlock()); I != E; ++I) {
diff --git a/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp b/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp
deleted file mode 100644
index 7e884ad93a23..000000000000
--- a/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp
+++ /dev/null
@@ -1,97 +0,0 @@
-//===--- SIDebuggerInsertNops.cpp - Inserts nops for debugger usage -------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// Inserts one nop instruction for each high level source statement for
-/// debugger usage.
-///
-/// Tools, such as a debugger, need to pause execution based on user input (i.e.
-/// breakpoint). In order to do this, one nop instruction is inserted before the
-/// first isa instruction of each high level source statement. Further, the
-/// debugger may replace nop instructions with trap instructions based on user
-/// input.
-//
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPUSubtarget.h"
-#include "SIInstrInfo.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "si-debugger-insert-nops"
-#define PASS_NAME "SI Debugger Insert Nops"
-
-namespace {
-
-class SIDebuggerInsertNops : public MachineFunctionPass {
-public:
- static char ID;
-
- SIDebuggerInsertNops() : MachineFunctionPass(ID) { }
- StringRef getPassName() const override { return PASS_NAME; }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- MachineFunctionPass::getAnalysisUsage(AU);
- }
-
- bool runOnMachineFunction(MachineFunction &MF) override;
-};
-
-} // anonymous namespace
-
-INITIALIZE_PASS(SIDebuggerInsertNops, DEBUG_TYPE, PASS_NAME, false, false)
-
-char SIDebuggerInsertNops::ID = 0;
-char &llvm::SIDebuggerInsertNopsID = SIDebuggerInsertNops::ID;
-
-FunctionPass *llvm::createSIDebuggerInsertNopsPass() {
- return new SIDebuggerInsertNops();
-}
-
-bool SIDebuggerInsertNops::runOnMachineFunction(MachineFunction &MF) {
- // Skip this pass if "amdgpu-debugger-insert-nops" attribute was not
- // specified.
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- if (!ST.debuggerInsertNops())
- return false;
-
- // Skip machine functions without debug info.
- if (!MF.getMMI().hasDebugInfo())
- return false;
-
- // Target instruction info.
- const SIInstrInfo *TII = ST.getInstrInfo();
-
- // Set containing line numbers that have nop inserted.
- DenseSet<unsigned> NopInserted;
-
- for (auto &MBB : MF) {
- for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
- // Skip debug instructions and instructions without location.
- if (MI->isDebugInstr() || !MI->getDebugLoc())
- continue;
-
- // Insert nop instruction if line number does not have nop inserted.
- auto DL = MI->getDebugLoc();
- if (NopInserted.find(DL.getLine()) == NopInserted.end()) {
- BuildMI(MBB, *MI, DL, TII->get(AMDGPU::S_NOP))
- .addImm(0);
- NopInserted.insert(DL.getLine());
- }
- }
- }
-
- return true;
-}
diff --git a/lib/Target/AMDGPU/SIDefines.h b/lib/Target/AMDGPU/SIDefines.h
index 7f6abc34cff3..a0e1ec6ac235 100644
--- a/lib/Target/AMDGPU/SIDefines.h
+++ b/lib/Target/AMDGPU/SIDefines.h
@@ -1,9 +1,8 @@
//===-- SIDefines.h - SI Helper Macros ----------------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
/// \file
//===----------------------------------------------------------------------===//
@@ -90,13 +89,22 @@ enum : uint64_t {
// Is a D16 buffer instruction.
D16Buf = UINT64_C(1) << 50,
+ // FLAT instruction accesses FLAT_GLBL or FLAT_SCRATCH segment.
+ IsNonFlatSeg = UINT64_C(1) << 51,
+
// Uses floating point double precision rounding mode
- FPDPRounding = UINT64_C(1) << 51
+ FPDPRounding = UINT64_C(1) << 52,
+
+ // Instruction is FP atomic.
+ FPAtomic = UINT64_C(1) << 53,
+
+ // Is an MFMA instruction.
+ IsMAI = UINT64_C(1) << 54
};
// v_cmp_class_* etc. use a 10-bit mask for what operation is checked.
// The result is true if any of these tests are true.
-enum ClassFlags {
+enum ClassFlags : unsigned {
S_NAN = 1 << 0, // Signaling NaN
Q_NAN = 1 << 1, // Quiet NaN
N_INFINITY = 1 << 2, // Negative infinity
@@ -111,7 +119,7 @@ enum ClassFlags {
}
namespace AMDGPU {
- enum OperandType {
+ enum OperandType : unsigned {
/// Operands with register or 32-bit immediate
OPERAND_REG_IMM_INT32 = MCOI::OPERAND_FIRST_TARGET,
OPERAND_REG_IMM_INT64,
@@ -119,6 +127,8 @@ namespace AMDGPU {
OPERAND_REG_IMM_FP32,
OPERAND_REG_IMM_FP64,
OPERAND_REG_IMM_FP16,
+ OPERAND_REG_IMM_V2FP16,
+ OPERAND_REG_IMM_V2INT16,
/// Operands with register or inline constant
OPERAND_REG_INLINE_C_INT16,
@@ -130,11 +140,22 @@ namespace AMDGPU {
OPERAND_REG_INLINE_C_V2FP16,
OPERAND_REG_INLINE_C_V2INT16,
+ /// Operands with an AccVGPR register or inline constant
+ OPERAND_REG_INLINE_AC_INT16,
+ OPERAND_REG_INLINE_AC_INT32,
+ OPERAND_REG_INLINE_AC_FP16,
+ OPERAND_REG_INLINE_AC_FP32,
+ OPERAND_REG_INLINE_AC_V2FP16,
+ OPERAND_REG_INLINE_AC_V2INT16,
+
OPERAND_REG_IMM_FIRST = OPERAND_REG_IMM_INT32,
- OPERAND_REG_IMM_LAST = OPERAND_REG_IMM_FP16,
+ OPERAND_REG_IMM_LAST = OPERAND_REG_IMM_V2INT16,
OPERAND_REG_INLINE_C_FIRST = OPERAND_REG_INLINE_C_INT16,
- OPERAND_REG_INLINE_C_LAST = OPERAND_REG_INLINE_C_V2INT16,
+ OPERAND_REG_INLINE_C_LAST = OPERAND_REG_INLINE_AC_V2INT16,
+
+ OPERAND_REG_INLINE_AC_FIRST = OPERAND_REG_INLINE_AC_INT16,
+ OPERAND_REG_INLINE_AC_LAST = OPERAND_REG_INLINE_AC_V2INT16,
OPERAND_SRC_FIRST = OPERAND_REG_IMM_INT32,
OPERAND_SRC_LAST = OPERAND_REG_INLINE_C_LAST,
@@ -151,17 +172,10 @@ namespace AMDGPU {
};
}
-namespace SIStackID {
-enum StackTypes : uint8_t {
- SCRATCH = 0,
- SGPR_SPILL = 1
-};
-}
-
// Input operand modifiers bit-masks
// NEG and SEXT share the same bit-mask because they can't be set simultaneously.
namespace SISrcMods {
- enum {
+ enum : unsigned {
NEG = 1 << 0, // Floating-point negate modifier
ABS = 1 << 1, // Floating-point absolute modifier
SEXT = 1 << 0, // Integer sign-extend modifier
@@ -173,7 +187,7 @@ namespace SISrcMods {
}
namespace SIOutMods {
- enum {
+ enum : unsigned {
NONE = 0,
MUL2 = 1,
MUL4 = 2,
@@ -181,17 +195,33 @@ namespace SIOutMods {
};
}
+namespace AMDGPU {
namespace VGPRIndexMode {
- enum {
- SRC0_ENABLE = 1 << 0,
- SRC1_ENABLE = 1 << 1,
- SRC2_ENABLE = 1 << 2,
- DST_ENABLE = 1 << 3
- };
-}
+
+enum Id : unsigned { // id of symbolic names
+ ID_SRC0 = 0,
+ ID_SRC1,
+ ID_SRC2,
+ ID_DST,
+
+ ID_MIN = ID_SRC0,
+ ID_MAX = ID_DST
+};
+
+enum EncBits : unsigned {
+ OFF = 0,
+ SRC0_ENABLE = 1 << ID_SRC0,
+ SRC1_ENABLE = 1 << ID_SRC1,
+ SRC2_ENABLE = 1 << ID_SRC2,
+ DST_ENABLE = 1 << ID_DST,
+ ENABLE_MASK = SRC0_ENABLE | SRC1_ENABLE | SRC2_ENABLE | DST_ENABLE
+};
+
+} // namespace VGPRIndexMode
+} // namespace AMDGPU
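Editor's note: splitting VGPRIndexMode into Id and EncBits keeps the symbolic operand index and its enable-bit encoding consistent by construction, since each enable bit is simply 1 << id. A standalone sketch (not part of the patch, assumes SIDefines.h is included):

  static unsigned encodeVGPRIndexMode(const unsigned SymbolicIds[], unsigned N) {
    using namespace llvm::AMDGPU::VGPRIndexMode;
    unsigned Enc = OFF;
    for (unsigned I = 0; I < N; ++I)
      Enc |= 1u << SymbolicIds[I];   // ID_SRC0..ID_DST map to their enable bits
    return Enc & ENABLE_MASK;
  }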
namespace AMDGPUAsmVariants {
- enum {
+ enum : unsigned {
DEFAULT = 0,
VOP3 = 1,
SDWA = 2,
@@ -203,13 +233,14 @@ namespace AMDGPUAsmVariants {
namespace AMDGPU {
namespace EncValues { // Encoding values of enum9/8/7 operands
-enum {
+enum : unsigned {
SGPR_MIN = 0,
- SGPR_MAX = 101,
+ SGPR_MAX_SI = 101,
+ SGPR_MAX_GFX10 = 105,
TTMP_VI_MIN = 112,
TTMP_VI_MAX = 123,
- TTMP_GFX9_MIN = 108,
- TTMP_GFX9_MAX = 123,
+ TTMP_GFX9_GFX10_MIN = 108,
+ TTMP_GFX9_GFX10_MAX = 123,
INLINE_INTEGER_C_MIN = 128,
INLINE_INTEGER_C_POSITIVE_MAX = 192, // 64
INLINE_INTEGER_C_MAX = 208,
@@ -231,6 +262,8 @@ enum Id { // Message ID, width(4) [3:0].
ID_INTERRUPT = 1,
ID_GS,
ID_GS_DONE,
+ ID_GS_ALLOC_REQ = 9,
+ ID_GET_DOORBELL = 10,
ID_SYSMSG = 15,
ID_GAPS_LAST_, // Indicate that sequence has gaps.
ID_GAPS_FIRST_ = ID_INTERRUPT,
@@ -242,27 +275,28 @@ enum Id { // Message ID, width(4) [3:0].
enum Op { // Both GS and SYS operation IDs.
OP_UNKNOWN_ = -1,
OP_SHIFT_ = 4,
- // width(2) [5:4]
+ OP_NONE_ = 0,
+ // Bits used for operation encoding
+ OP_WIDTH_ = 3,
+ OP_MASK_ = (((1 << OP_WIDTH_) - 1) << OP_SHIFT_),
+ // GS operations are encoded in bits 5:4
OP_GS_NOP = 0,
OP_GS_CUT,
OP_GS_EMIT,
OP_GS_EMIT_CUT,
OP_GS_LAST_,
OP_GS_FIRST_ = OP_GS_NOP,
- OP_GS_WIDTH_ = 2,
- OP_GS_MASK_ = (((1 << OP_GS_WIDTH_) - 1) << OP_SHIFT_),
- // width(3) [6:4]
+ // SYS operations are encoded in bits 6:4
OP_SYS_ECC_ERR_INTERRUPT = 1,
OP_SYS_REG_RD,
OP_SYS_HOST_TRAP_ACK,
OP_SYS_TTRACE_PC,
OP_SYS_LAST_,
OP_SYS_FIRST_ = OP_SYS_ECC_ERR_INTERRUPT,
- OP_SYS_WIDTH_ = 3,
- OP_SYS_MASK_ = (((1 << OP_SYS_WIDTH_) - 1) << OP_SHIFT_)
};
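Editor's note: with the per-kind width/mask constants replaced by the common 3-bit OP_WIDTH_/OP_MASK_, GS and SYS operation ids are decoded from the s_sendmsg immediate through the same field. A minimal sketch, assuming SIDefines.h is included; decodeMsgOp is an illustrative name, not an API added here:

  static unsigned decodeMsgOp(unsigned SImm16) {
    using namespace llvm::AMDGPU::SendMsg;
    return (SImm16 & OP_MASK_) >> OP_SHIFT_;   // operation id, bits 6:4
  }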
-enum StreamId { // Stream ID, (2) [9:8].
+enum StreamId : unsigned { // Stream ID, (2) [9:8].
+ STREAM_ID_NONE_ = 0,
STREAM_ID_DEFAULT_ = 0,
STREAM_ID_LAST_ = 4,
STREAM_ID_FIRST_ = STREAM_ID_DEFAULT_,
@@ -287,23 +321,34 @@ enum Id { // HwRegCode, (6) [5:0]
ID_IB_STS = 7,
ID_MEM_BASES = 15,
ID_SYMBOLIC_FIRST_GFX9_ = ID_MEM_BASES,
- ID_SYMBOLIC_LAST_ = 16,
+ ID_TBA_LO = 16,
+ ID_SYMBOLIC_FIRST_GFX10_ = ID_TBA_LO,
+ ID_TBA_HI = 17,
+ ID_TMA_LO = 18,
+ ID_TMA_HI = 19,
+ ID_FLAT_SCR_LO = 20,
+ ID_FLAT_SCR_HI = 21,
+ ID_XNACK_MASK = 22,
+ ID_POPS_PACKER = 25,
+ ID_SYMBOLIC_LAST_ = 26,
ID_SHIFT_ = 0,
ID_WIDTH_ = 6,
ID_MASK_ = (((1 << ID_WIDTH_) - 1) << ID_SHIFT_)
};
-enum Offset { // Offset, (5) [10:6]
+enum Offset : unsigned { // Offset, (5) [10:6]
OFFSET_DEFAULT_ = 0,
OFFSET_SHIFT_ = 6,
OFFSET_WIDTH_ = 5,
OFFSET_MASK_ = (((1 << OFFSET_WIDTH_) - 1) << OFFSET_SHIFT_),
+ OFFSET_MEM_VIOL = 8,
+
OFFSET_SRC_SHARED_BASE = 16,
OFFSET_SRC_PRIVATE_BASE = 0
};
-enum WidthMinusOne { // WidthMinusOne, (5) [15:11]
+enum WidthMinusOne : unsigned { // WidthMinusOne, (5) [15:11]
WIDTH_M1_DEFAULT_ = 31,
WIDTH_M1_SHIFT_ = 11,
WIDTH_M1_WIDTH_ = 5,
@@ -313,11 +358,16 @@ enum WidthMinusOne { // WidthMinusOne, (5) [15:11]
WIDTH_M1_SRC_PRIVATE_BASE = 15
};
+// Some values from WidthMinusOne mapped into Width domain.
+enum Width : unsigned {
+ WIDTH_DEFAULT_ = WIDTH_M1_DEFAULT_ + 1,
+};
+
} // namespace Hwreg
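Editor's note: the Hwreg fields combine into the 16-bit s_getreg/s_setreg immediate as id | (offset << 6) | ((width - 1) << 11). A minimal sketch, assuming SIDefines.h is included; encodeHwreg is an illustrative helper, not part of the patch:

  static unsigned encodeHwreg(unsigned Id, unsigned Offset, unsigned Width) {
    using namespace llvm::AMDGPU::Hwreg;
    return (Id << ID_SHIFT_) | (Offset << OFFSET_SHIFT_) |
           ((Width - 1) << WIDTH_M1_SHIFT_);
  }
  // For example, reading the whole of the newly exposed XNACK_MASK register:
  //   encodeHwreg(ID_XNACK_MASK, OFFSET_DEFAULT_, WIDTH_DEFAULT_)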
namespace Swizzle { // Encoding of swizzle macro used in ds_swizzle_b32.
-enum Id { // id of symbolic names
+enum Id : unsigned { // id of symbolic names
ID_QUAD_PERM = 0,
ID_BITMASK_PERM,
ID_SWAP,
@@ -325,7 +375,7 @@ enum Id { // id of symbolic names
ID_BROADCAST
};
-enum EncBits {
+enum EncBits : unsigned {
// swizzle mode encodings
@@ -357,7 +407,7 @@ enum EncBits {
namespace SDWA {
-enum SdwaSel {
+enum SdwaSel : unsigned {
BYTE_0 = 0,
BYTE_1 = 1,
BYTE_2 = 2,
@@ -367,13 +417,13 @@ enum SdwaSel {
DWORD = 6,
};
-enum DstUnused {
+enum DstUnused : unsigned {
UNUSED_PAD = 0,
UNUSED_SEXT = 1,
UNUSED_PRESERVE = 2,
};
-enum SDWA9EncValues{
+enum SDWA9EncValues : unsigned {
SRC_SGPR_MASK = 0x100,
SRC_VGPR_MASK = 0xFF,
VOPC_DST_VCC_MASK = 0x80,
@@ -382,7 +432,8 @@ enum SDWA9EncValues{
SRC_VGPR_MIN = 0,
SRC_VGPR_MAX = 255,
SRC_SGPR_MIN = 256,
- SRC_SGPR_MAX = 357,
+ SRC_SGPR_MAX_SI = 357,
+ SRC_SGPR_MAX_GFX10 = 361,
SRC_TTMP_MIN = 364,
SRC_TTMP_MAX = 379,
};
@@ -391,7 +442,7 @@ enum SDWA9EncValues{
namespace DPP {
-enum DppCtrl {
+enum DppCtrl : unsigned {
QUAD_PERM_FIRST = 0,
QUAD_PERM_LAST = 0xFF,
DPP_UNUSED1 = 0x100,
@@ -422,7 +473,20 @@ enum DppCtrl {
ROW_HALF_MIRROR = 0x141,
BCAST15 = 0x142,
BCAST31 = 0x143,
- DPP_LAST = BCAST31
+ DPP_UNUSED8_FIRST = 0x144,
+ DPP_UNUSED8_LAST = 0x14F,
+ ROW_SHARE_FIRST = 0x150,
+ ROW_SHARE_LAST = 0x15F,
+ ROW_XMASK_FIRST = 0x160,
+ ROW_XMASK_LAST = 0x16F,
+ DPP_LAST = ROW_XMASK_LAST
+};
+
+enum DppFiMode {
+ DPP_FI_0 = 0,
+ DPP_FI_1 = 1,
+ DPP8_FI_0 = 0xE9,
+ DPP8_FI_1 = 0xEA,
};
} // namespace DPP
@@ -505,6 +569,15 @@ enum DppCtrl {
#define S_00B848_IEEE_MODE(x) (((x) & 0x1) << 23)
#define G_00B848_IEEE_MODE(x) (((x) >> 23) & 0x1)
#define C_00B848_IEEE_MODE 0xFF7FFFFF
+#define S_00B848_WGP_MODE(x) (((x) & 0x1) << 29)
+#define G_00B848_WGP_MODE(x) (((x) >> 29) & 0x1)
+#define C_00B848_WGP_MODE 0xDFFFFFFF
+#define S_00B848_MEM_ORDERED(x) (((x) & 0x1) << 30)
+#define G_00B848_MEM_ORDERED(x) (((x) >> 30) & 0x1)
+#define C_00B848_MEM_ORDERED 0xBFFFFFFF
+#define S_00B848_FWD_PROGRESS(x) (((x) & 0x1) << 31)
+#define G_00B848_FWD_PROGRESS(x) (((x) >> 31) & 0x1)
+#define C_00B848_FWD_PROGRESS 0x7FFFFFFF
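Editor's note: the new WGP_MODE / MEM_ORDERED / FWD_PROGRESS definitions follow the existing S_/G_/C_ convention in this header: S_ places a value into the field, G_ extracts it, and C_ is the AND-mask that clears it. A small usage sketch (Rsrc1 is a placeholder value, not from the patch; assumes this header is included):

  uint32_t Rsrc1 = 0;
  Rsrc1 |= S_00B848_MEM_ORDERED(1);               // set the field
  bool MemOrdered = G_00B848_MEM_ORDERED(Rsrc1);  // read it back (true here)
  Rsrc1 &= C_00B848_MEM_ORDERED;                  // clear it again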
// Helpers for setting FLOAT_MODE
@@ -535,6 +608,15 @@ enum DppCtrl {
#define R_0286E8_SPI_TMPRING_SIZE 0x0286E8
#define S_0286E8_WAVESIZE(x) (((x) & 0x1FFF) << 12)
+#define R_028B54_VGT_SHADER_STAGES_EN 0x028B54
+#define S_028B54_HS_W32_EN(x) (((x) & 0x1) << 21)
+#define S_028B54_GS_W32_EN(x) (((x) & 0x1) << 22)
+#define S_028B54_VS_W32_EN(x) (((x) & 0x1) << 23)
+#define R_0286D8_SPI_PS_IN_CONTROL 0x0286D8
+#define S_0286D8_PS_W32_EN(x) (((x) & 0x1) << 15)
+#define R_00B800_COMPUTE_DISPATCH_INITIATOR 0x00B800
+#define S_00B800_CS_W32_EN(x) (((x) & 0x1) << 15)
+
#define R_SPILLED_SGPRS 0x4
#define R_SPILLED_VGPRS 0x8
} // End namespace llvm
diff --git a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index 809f5bab4693..624953963cf4 100644
--- a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -1,9 +1,8 @@
//===- SIFixSGPRCopies.cpp - Remove potential VGPR => SGPR copies ---------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -104,7 +103,7 @@ using namespace llvm;
static cl::opt<bool> EnableM0Merge(
"amdgpu-enable-merge-m0",
cl::desc("Merge and hoist M0 initializations"),
- cl::init(false));
+ cl::init(true));
namespace {
@@ -144,14 +143,15 @@ FunctionPass *llvm::createSIFixSGPRCopiesPass() {
return new SIFixSGPRCopies();
}
-static bool hasVGPROperands(const MachineInstr &MI, const SIRegisterInfo *TRI) {
+static bool hasVectorOperands(const MachineInstr &MI,
+ const SIRegisterInfo *TRI) {
const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
if (!MI.getOperand(i).isReg() ||
!TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg()))
continue;
- if (TRI->hasVGPRs(MRI.getRegClass(MI.getOperand(i).getReg())))
+ if (TRI->hasVectorRegisters(MRI.getRegClass(MI.getOperand(i).getReg())))
return true;
}
return false;
@@ -184,14 +184,14 @@ static bool isVGPRToSGPRCopy(const TargetRegisterClass *SrcRC,
const TargetRegisterClass *DstRC,
const SIRegisterInfo &TRI) {
return SrcRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(DstRC) &&
- TRI.hasVGPRs(SrcRC);
+ TRI.hasVectorRegisters(SrcRC);
}
static bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC,
const TargetRegisterClass *DstRC,
const SIRegisterInfo &TRI) {
return DstRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(SrcRC) &&
- TRI.hasVGPRs(DstRC);
+ TRI.hasVectorRegisters(DstRC);
}
static bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI,
@@ -278,6 +278,7 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI,
// VGPRz = REG_SEQUENCE VGPRx, sub0
MI.getOperand(0).setReg(CopyUse.getOperand(0).getReg());
+ bool IsAGPR = TRI->hasAGPRs(DstRC);
for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) {
unsigned SrcReg = MI.getOperand(I).getReg();
@@ -296,6 +297,17 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI,
TmpReg)
.add(MI.getOperand(I));
+ if (IsAGPR) {
+ const TargetRegisterClass *NewSrcRC = TRI->getEquivalentAGPRClass(SrcRC);
+ unsigned TmpAReg = MRI.createVirtualRegister(NewSrcRC);
+ unsigned Opc = NewSrcRC == &AMDGPU::AGPR_32RegClass ?
+ AMDGPU::V_ACCVGPR_WRITE_B32 : AMDGPU::COPY;
+ BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(Opc),
+ TmpAReg)
+ .addReg(TmpReg, RegState::Kill);
+ TmpReg = TmpAReg;
+ }
+
MI.getOperand(I).setReg(TmpReg);
}
@@ -440,18 +452,32 @@ static bool isReachable(const MachineInstr *From,
(const MachineBasicBlock *MBB) { return MBB == MBBFrom; });
}
+// Return the first non-prologue instruction in the block.
+static MachineBasicBlock::iterator
+getFirstNonPrologue(MachineBasicBlock *MBB, const TargetInstrInfo *TII) {
+ MachineBasicBlock::iterator I = MBB->getFirstNonPHI();
+ while (I != MBB->end() && TII->isBasicBlockPrologue(*I))
+ ++I;
+
+ return I;
+}
+
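Editor's note: the helper matters because a hoisted M0 initialization may no longer be placed at getFirstNonPHI(): the target can mark instructions at the top of a block as part of the block prologue (for example exec-mask setup), and an init spliced above them would be misplaced. A standalone mirror of the scan in plain C++, so it compiles on its own (FakeInstr is a stand-in for MachineInstr):

  #include <vector>
  struct FakeInstr { bool IsPHI; bool IsPrologue; };
  static std::vector<FakeInstr>::iterator
  firstNonPrologue(std::vector<FakeInstr> &Block) {
    auto I = Block.begin();
    while (I != Block.end() && I->IsPHI)        // MBB->getFirstNonPHI()
      ++I;
    while (I != Block.end() && I->IsPrologue)   // TII->isBasicBlockPrologue(*I)
      ++I;
    return I;
  }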
// Hoist and merge identical SGPR initializations into a common predecessor.
// This is intended to combine M0 initializations, but can work with any
// SGPR. A VGPR cannot be processed since we cannot guarantee vector
// execution.
static bool hoistAndMergeSGPRInits(unsigned Reg,
const MachineRegisterInfo &MRI,
- MachineDominatorTree &MDT) {
+ MachineDominatorTree &MDT,
+ const TargetInstrInfo *TII) {
// List of inits by immediate value.
using InitListMap = std::map<unsigned, std::list<MachineInstr *>>;
InitListMap Inits;
// List of clobbering instructions.
SmallVector<MachineInstr*, 8> Clobbers;
+ // List of instructions marked for deletion.
+ SmallSet<MachineInstr*, 8> MergedInstrs;
+
bool Changed = false;
for (auto &MI : MRI.def_instructions(Reg)) {
@@ -480,8 +506,8 @@ static bool hoistAndMergeSGPRInits(unsigned Reg,
MachineInstr *MI2 = *I2;
// Check any possible interference
- auto intereferes = [&](MachineBasicBlock::iterator From,
- MachineBasicBlock::iterator To) -> bool {
+ auto interferes = [&](MachineBasicBlock::iterator From,
+ MachineBasicBlock::iterator To) -> bool {
assert(MDT.dominates(&*To, &*From));
@@ -513,23 +539,23 @@ static bool hoistAndMergeSGPRInits(unsigned Reg,
};
if (MDT.dominates(MI1, MI2)) {
- if (!intereferes(MI2, MI1)) {
+ if (!interferes(MI2, MI1)) {
LLVM_DEBUG(dbgs()
<< "Erasing from "
<< printMBBReference(*MI2->getParent()) << " " << *MI2);
- MI2->eraseFromParent();
- Defs.erase(I2++);
+ MergedInstrs.insert(MI2);
Changed = true;
+ ++I2;
continue;
}
} else if (MDT.dominates(MI2, MI1)) {
- if (!intereferes(MI1, MI2)) {
+ if (!interferes(MI1, MI2)) {
LLVM_DEBUG(dbgs()
<< "Erasing from "
<< printMBBReference(*MI1->getParent()) << " " << *MI1);
- MI1->eraseFromParent();
- Defs.erase(I1++);
+ MergedInstrs.insert(MI1);
Changed = true;
+ ++I1;
break;
}
} else {
@@ -540,8 +566,8 @@ static bool hoistAndMergeSGPRInits(unsigned Reg,
continue;
}
- MachineBasicBlock::iterator I = MBB->getFirstNonPHI();
- if (!intereferes(MI1, I) && !intereferes(MI2, I)) {
+ MachineBasicBlock::iterator I = getFirstNonPrologue(MBB, TII);
+ if (!interferes(MI1, I) && !interferes(MI2, I)) {
LLVM_DEBUG(dbgs()
<< "Erasing from "
<< printMBBReference(*MI1->getParent()) << " " << *MI1
@@ -549,9 +575,9 @@ static bool hoistAndMergeSGPRInits(unsigned Reg,
<< printMBBReference(*MI2->getParent()) << " to "
<< printMBBReference(*I->getParent()) << " " << *MI2);
I->getParent()->splice(I, MI2->getParent(), MI2);
- MI1->eraseFromParent();
- Defs.erase(I1++);
+ MergedInstrs.insert(MI1);
Changed = true;
+ ++I1;
break;
}
}
@@ -561,6 +587,9 @@ static bool hoistAndMergeSGPRInits(unsigned Reg,
}
}
+ for (auto MI : MergedInstrs)
+ MI->removeFromParent();
+
if (Changed)
MRI.clearKillFlags(Reg);
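Editor's note: switching from erasing instructions inside the nested walk to collecting them in MergedInstrs and unlinking afterwards is the usual defer-the-erase pattern: the def lists being iterated stay valid until the walk finishes. A plain-C++ illustration of the same defer-then-unlink idea (not AMDGPU-specific):

  #include <iterator>
  #include <list>
  #include <set>
  #include <string>

  static void eraseDuplicates(std::list<std::string> &Work) {
    std::set<const std::string *> ToErase;   // plays the role of MergedInstrs
    std::set<std::string> Seen;
    for (const std::string &S : Work)
      if (!Seen.insert(S).second)
        ToErase.insert(&S);                  // defer the unlink; keep iterating
    for (auto I = Work.begin(); I != Work.end();)
      I = ToErase.count(&*I) ? Work.erase(I) : std::next(I);
  }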
@@ -679,11 +708,12 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
LLVM_DEBUG(dbgs() << "Fixing PHI: " << MI);
TII->moveToVALU(MI, MDT);
}
+
break;
}
case AMDGPU::REG_SEQUENCE:
- if (TRI->hasVGPRs(TII->getOpRegClass(MI, 0)) ||
- !hasVGPROperands(MI, TRI)) {
+ if (TRI->hasVectorRegisters(TII->getOpRegClass(MI, 0)) ||
+ !hasVectorOperands(MI, TRI)) {
foldVGPRCopyIntoRegSequence(MI, TRI, TII, MRI);
continue;
}
@@ -698,7 +728,8 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
Src0RC = MRI.getRegClass(MI.getOperand(1).getReg());
Src1RC = MRI.getRegClass(MI.getOperand(2).getReg());
if (TRI->isSGPRClass(DstRC) &&
- (TRI->hasVGPRs(Src0RC) || TRI->hasVGPRs(Src1RC))) {
+ (TRI->hasVectorRegisters(Src0RC) ||
+ TRI->hasVectorRegisters(Src1RC))) {
LLVM_DEBUG(dbgs() << " Fixing INSERT_SUBREG: " << MI);
TII->moveToVALU(MI, MDT);
}
@@ -709,7 +740,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
}
if (MF.getTarget().getOptLevel() > CodeGenOpt::None && EnableM0Merge)
- hoistAndMergeSGPRInits(AMDGPU::M0, MRI, *MDT);
+ hoistAndMergeSGPRInits(AMDGPU::M0, MRI, *MDT, TII);
return true;
}
diff --git a/lib/Target/AMDGPU/SIFixVGPRCopies.cpp b/lib/Target/AMDGPU/SIFixVGPRCopies.cpp
index 15ba78edf919..29484668a01d 100644
--- a/lib/Target/AMDGPU/SIFixVGPRCopies.cpp
+++ b/lib/Target/AMDGPU/SIFixVGPRCopies.cpp
@@ -1,9 +1,8 @@
//===-- SIFixVGPRCopies.cpp - Fix VGPR Copies after regalloc --------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/SIFixWWMLiveness.cpp b/lib/Target/AMDGPU/SIFixWWMLiveness.cpp
deleted file mode 100644
index 7761418c5336..000000000000
--- a/lib/Target/AMDGPU/SIFixWWMLiveness.cpp
+++ /dev/null
@@ -1,418 +0,0 @@
-//===-- SIFixWWMLiveness.cpp - Fix WWM live intervals ---------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// Computations in WWM can overwrite values in inactive channels for
-/// variables that the register allocator thinks are dead. This pass adds fake
-/// uses of those variables to their def(s) to make sure that they aren't
-/// overwritten.
-///
-/// As an example, consider this snippet:
-/// %vgpr0 = V_MOV_B32_e32 0.0
-/// if (...) {
-/// %vgpr1 = ...
-/// %vgpr2 = WWM killed %vgpr1
-/// ... = killed %vgpr2
-/// %vgpr0 = V_MOV_B32_e32 1.0
-/// }
-/// ... = %vgpr0
-///
-/// The live intervals of %vgpr0 don't overlap with those of %vgpr1. Normally,
-/// we can safely allocate %vgpr0 and %vgpr1 in the same register, since
-/// writing %vgpr1 would only write to channels that would be clobbered by the
-/// second write to %vgpr0 anyways. But if %vgpr1 is written with WWM enabled,
-/// it would clobber even the inactive channels for which the if-condition is
-/// false, for which %vgpr0 is supposed to be 0. This pass adds an implicit use
-/// of %vgpr0 to its def to make sure they aren't allocated to the
-/// same register.
-///
-/// In general, we need to figure out what registers might have their inactive
-/// channels which are eventually used accidentally clobbered by a WWM
-/// instruction. We do that by spotting three separate cases of registers:
-///
-/// 1. A "then phi": the value resulting from phi elimination of a phi node at
-/// the end of an if..endif. If there is WWM code in the "then", then we
-/// make the def at the end of the "then" branch a partial def by adding an
-/// implicit use of the register.
-///
-/// 2. A "loop exit register": a value written inside a loop but used outside the
-/// loop, where there is WWM code inside the loop (the case in the example
-/// above). We add an implicit_def of the register in the loop pre-header,
-/// and make the original def a partial def by adding an implicit use of the
-/// register.
-///
-/// 3. A "loop exit phi": the value resulting from phi elimination of a phi node
-/// in a loop header. If there is WWM code inside the loop, then we make all
-/// defs inside the loop partial defs by adding an implicit use of the
-/// register on each one.
-///
-/// Note that we do not need to consider an if..else..endif phi. We only need to
-/// consider non-uniform control flow, and control flow structurization would
-/// have transformed a non-uniform if..else..endif into two if..endifs.
-///
-/// The analysis to detect these cases relies on a property of the MIR
-/// arising from this pass running straight after PHIElimination and before any
-/// coalescing: that any virtual register with more than one definition must be
-/// the new register added to lower a phi node by PHIElimination.
-///
-/// FIXME: We should detect whether a register in one of the above categories is
-/// already live at the WWM code before deciding to add the implicit uses to
-/// synthesize its liveness.
-///
-/// FIXME: I believe this whole scheme may be flawed due to the possibility of
-/// the register allocator doing live interval splitting.
-///
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
-#include "SIInstrInfo.h"
-#include "SIRegisterInfo.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/SparseBitVector.h"
-#include "llvm/CodeGen/LiveIntervals.h"
-#include "llvm/CodeGen/MachineDominators.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineLoopInfo.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/TargetRegisterInfo.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "si-fix-wwm-liveness"
-
-namespace {
-
-class SIFixWWMLiveness : public MachineFunctionPass {
-private:
- MachineDominatorTree *DomTree;
- MachineLoopInfo *LoopInfo;
- LiveIntervals *LIS = nullptr;
- const SIInstrInfo *TII;
- const SIRegisterInfo *TRI;
- MachineRegisterInfo *MRI;
-
- std::vector<MachineInstr *> WWMs;
- std::vector<MachineOperand *> ThenDefs;
- std::vector<std::pair<MachineOperand *, MachineLoop *>> LoopExitDefs;
- std::vector<std::pair<MachineOperand *, MachineLoop *>> LoopPhiDefs;
-
-public:
- static char ID;
-
- SIFixWWMLiveness() : MachineFunctionPass(ID) {
- initializeSIFixWWMLivenessPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnMachineFunction(MachineFunction &MF) override;
-
- StringRef getPassName() const override { return "SI Fix WWM Liveness"; }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequiredID(MachineDominatorsID);
- AU.addRequiredID(MachineLoopInfoID);
- // Should preserve the same set that TwoAddressInstructions does.
- AU.addPreserved<SlotIndexes>();
- AU.addPreserved<LiveIntervals>();
- AU.addPreservedID(LiveVariablesID);
- AU.addPreservedID(MachineLoopInfoID);
- AU.addPreservedID(MachineDominatorsID);
- AU.setPreservesCFG();
- MachineFunctionPass::getAnalysisUsage(AU);
- }
-
-private:
- void processDef(MachineOperand &DefOpnd);
- bool processThenDef(MachineOperand *DefOpnd);
- bool processLoopExitDef(MachineOperand *DefOpnd, MachineLoop *Loop);
- bool processLoopPhiDef(MachineOperand *DefOpnd, MachineLoop *Loop);
-};
-
-} // End anonymous namespace.
-
-INITIALIZE_PASS_BEGIN(SIFixWWMLiveness, DEBUG_TYPE,
- "SI fix WWM liveness", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
-INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
-INITIALIZE_PASS_END(SIFixWWMLiveness, DEBUG_TYPE,
- "SI fix WWM liveness", false, false)
-
-char SIFixWWMLiveness::ID = 0;
-
-char &llvm::SIFixWWMLivenessID = SIFixWWMLiveness::ID;
-
-FunctionPass *llvm::createSIFixWWMLivenessPass() {
- return new SIFixWWMLiveness();
-}
-
-bool SIFixWWMLiveness::runOnMachineFunction(MachineFunction &MF) {
- LLVM_DEBUG(dbgs() << "SIFixWWMLiveness: function " << MF.getName() << "\n");
- bool Modified = false;
-
- // This doesn't actually need LiveIntervals, but we can preserve them.
- LIS = getAnalysisIfAvailable<LiveIntervals>();
-
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
-
- TII = ST.getInstrInfo();
- TRI = &TII->getRegisterInfo();
- MRI = &MF.getRegInfo();
-
- DomTree = &getAnalysis<MachineDominatorTree>();
- LoopInfo = &getAnalysis<MachineLoopInfo>();
-
- // Scan the function to find the WWM sections and the candidate registers for
- // having liveness modified.
- for (MachineBasicBlock &MBB : MF) {
- for (MachineInstr &MI : MBB) {
- if (MI.getOpcode() == AMDGPU::EXIT_WWM)
- WWMs.push_back(&MI);
- else {
- for (MachineOperand &DefOpnd : MI.defs()) {
- if (DefOpnd.isReg()) {
- unsigned Reg = DefOpnd.getReg();
- if (TRI->isVGPR(*MRI, Reg))
- processDef(DefOpnd);
- }
- }
- }
- }
- }
- if (!WWMs.empty()) {
- // Synthesize liveness over WWM sections as required.
- for (auto ThenDef : ThenDefs)
- Modified |= processThenDef(ThenDef);
- for (auto LoopExitDef : LoopExitDefs)
- Modified |= processLoopExitDef(LoopExitDef.first, LoopExitDef.second);
- for (auto LoopPhiDef : LoopPhiDefs)
- Modified |= processLoopPhiDef(LoopPhiDef.first, LoopPhiDef.second);
- }
-
- WWMs.clear();
- ThenDefs.clear();
- LoopExitDefs.clear();
- LoopPhiDefs.clear();
-
- return Modified;
-}
-
-// During the function scan, process an operand that defines a VGPR.
-// This categorizes the register and puts it in the appropriate list for later
-// use when processing a WWM section.
-void SIFixWWMLiveness::processDef(MachineOperand &DefOpnd) {
- unsigned Reg = DefOpnd.getReg();
- // Get all the defining instructions. For convenience, make Defs[0] the def
- // we are on now.
- SmallVector<const MachineInstr *, 4> Defs;
- Defs.push_back(DefOpnd.getParent());
- for (auto &MI : MRI->def_instructions(Reg)) {
- if (&MI != DefOpnd.getParent())
- Defs.push_back(&MI);
- }
- // Check whether this def dominates all the others. If not, ignore this def.
- // Either it is going to be processed when the scan encounters its other def
- // that dominates all defs, or there is no def that dominates all others.
- // The latter case is an eliminated phi from an if..else..endif or similar,
- // which must be for uniform control flow so can be ignored.
- // Because this pass runs shortly after PHIElimination, we assume that any
- // multi-def register is a lowered phi, and thus has each def in a separate
- // basic block.
- for (unsigned I = 1; I != Defs.size(); ++I) {
- if (!DomTree->dominates(Defs[0]->getParent(), Defs[I]->getParent()))
- return;
- }
- // Check for the case of an if..endif lowered phi: It has two defs, one
- // dominates the other, and there is a single use in a successor of the
- // dominant def.
- // Later we will spot any WWM code inside
- // the "then" clause and turn the second def into a partial def so its
- // liveness goes through the WWM code in the "then" clause.
- if (Defs.size() == 2) {
- auto DomDefBlock = Defs[0]->getParent();
- if (DomDefBlock->succ_size() == 2 && MRI->hasOneUse(Reg)) {
- auto UseBlock = MRI->use_begin(Reg)->getParent()->getParent();
- for (auto Succ : DomDefBlock->successors()) {
- if (Succ == UseBlock) {
- LLVM_DEBUG(dbgs() << printReg(Reg, TRI) << " is a then phi reg\n");
- ThenDefs.push_back(&DefOpnd);
- return;
- }
- }
- }
- }
- // Check for the case of a non-lowered-phi register (single def) that exits
- // a loop, that is, it has a use that is outside a loop that the def is
- // inside. We find the outermost loop that the def is inside but a use is
- // outside. Later we will spot any WWM code inside that loop and then make
- // the def a partial def so its liveness goes round the loop and through the
- // WWM code.
- if (Defs.size() == 1) {
- auto Loop = LoopInfo->getLoopFor(Defs[0]->getParent());
- if (!Loop)
- return;
- bool IsLoopExit = false;
- for (auto &Use : MRI->use_instructions(Reg)) {
- auto UseBlock = Use.getParent();
- if (Loop->contains(UseBlock))
- continue;
- IsLoopExit = true;
- while (auto Parent = Loop->getParentLoop()) {
- if (Parent->contains(UseBlock))
- break;
- Loop = Parent;
- }
- }
- if (!IsLoopExit)
- return;
- LLVM_DEBUG(dbgs() << printReg(Reg, TRI)
- << " is a loop exit reg with loop header at "
- << "bb." << Loop->getHeader()->getNumber() << "\n");
- LoopExitDefs.push_back(std::pair<MachineOperand *, MachineLoop *>(
- &DefOpnd, Loop));
- return;
- }
- // Check for the case of a lowered single-preheader-loop phi, that is, a
- // multi-def register where the dominating def is in the loop pre-header and
- // all other defs are in backedges. Later we will spot any WWM code inside
- // that loop and then make the backedge defs partial defs so the liveness
- // goes through the WWM code.
- // Note that we are ignoring multi-preheader loops on the basis that the
- // structurizer does not allow that for non-uniform loops.
- // There must be a single use in the loop header.
- if (!MRI->hasOneUse(Reg))
- return;
- auto UseBlock = MRI->use_begin(Reg)->getParent()->getParent();
- auto Loop = LoopInfo->getLoopFor(UseBlock);
- if (!Loop || Loop->getHeader() != UseBlock
- || Loop->contains(Defs[0]->getParent())) {
- LLVM_DEBUG(dbgs() << printReg(Reg, TRI)
- << " is multi-def but single use not in loop header\n");
- return;
- }
- for (unsigned I = 1; I != Defs.size(); ++I) {
- if (!Loop->contains(Defs[I]->getParent()))
- return;
- }
- LLVM_DEBUG(dbgs() << printReg(Reg, TRI)
- << " is a loop phi reg with loop header at "
- << "bb." << Loop->getHeader()->getNumber() << "\n");
- LoopPhiDefs.push_back(
- std::pair<MachineOperand *, MachineLoop *>(&DefOpnd, Loop));
-}
-
-// Process a then phi def: It has two defs, one dominates the other, and there
-// is a single use in a successor of the dominant def. Here we spot any WWM
-// code inside the "then" clause and turn the second def into a partial def so
-// its liveness goes through the WWM code in the "then" clause.
-bool SIFixWWMLiveness::processThenDef(MachineOperand *DefOpnd) {
- LLVM_DEBUG(dbgs() << "Processing then def: " << *DefOpnd->getParent());
- if (DefOpnd->getParent()->getOpcode() == TargetOpcode::IMPLICIT_DEF) {
- // Ignore if dominating def is undef.
- LLVM_DEBUG(dbgs() << " ignoring as dominating def is undef\n");
- return false;
- }
- unsigned Reg = DefOpnd->getReg();
- // Get the use block, which is the endif block.
- auto UseBlock = MRI->use_instr_begin(Reg)->getParent();
- // Check whether there is WWM code inside the then branch. The WWM code must
- // be dominated by the if but not dominated by the endif.
- bool ContainsWWM = false;
- for (auto WWM : WWMs) {
- if (DomTree->dominates(DefOpnd->getParent()->getParent(), WWM->getParent())
- && !DomTree->dominates(UseBlock, WWM->getParent())) {
- LLVM_DEBUG(dbgs() << " contains WWM: " << *WWM);
- ContainsWWM = true;
- break;
- }
- }
- if (!ContainsWWM)
- return false;
- // Get the other def.
- MachineInstr *OtherDef = nullptr;
- for (auto &MI : MRI->def_instructions(Reg)) {
- if (&MI != DefOpnd->getParent())
- OtherDef = &MI;
- }
- // Make it a partial def.
- OtherDef->addOperand(MachineOperand::CreateReg(Reg, false, /*isImp=*/true));
- LLVM_DEBUG(dbgs() << *OtherDef);
- return true;
-}
-
-// Process a loop exit def, that is, a register with a single use in a loop
-// that has a use outside the loop. Here we spot any WWM code inside that loop
-// and then make the def a partial def so its liveness goes round the loop and
-// through the WWM code.
-bool SIFixWWMLiveness::processLoopExitDef(MachineOperand *DefOpnd,
- MachineLoop *Loop) {
- LLVM_DEBUG(dbgs() << "Processing loop exit def: " << *DefOpnd->getParent());
- // Check whether there is WWM code inside the loop.
- bool ContainsWWM = false;
- for (auto WWM : WWMs) {
- if (Loop->contains(WWM->getParent())) {
- LLVM_DEBUG(dbgs() << " contains WWM: " << *WWM);
- ContainsWWM = true;
- break;
- }
- }
- if (!ContainsWWM)
- return false;
- unsigned Reg = DefOpnd->getReg();
- // Add a new implicit_def in loop preheader(s).
- for (auto Pred : Loop->getHeader()->predecessors()) {
- if (!Loop->contains(Pred)) {
- auto ImplicitDef = BuildMI(*Pred, Pred->getFirstTerminator(), DebugLoc(),
- TII->get(TargetOpcode::IMPLICIT_DEF), Reg);
- LLVM_DEBUG(dbgs() << *ImplicitDef);
- (void)ImplicitDef;
- }
- }
- // Make the original def partial.
- DefOpnd->getParent()->addOperand(MachineOperand::CreateReg(
- Reg, false, /*isImp=*/true));
- LLVM_DEBUG(dbgs() << *DefOpnd->getParent());
- return true;
-}
-
-// Process a loop phi def, that is, a multi-def register where the dominating
-// def is in the loop pre-header and all other defs are in backedges. Here we
-// spot any WWM code inside that loop and then make the backedge defs partial
-// defs so the liveness goes through the WWM code.
-bool SIFixWWMLiveness::processLoopPhiDef(MachineOperand *DefOpnd,
- MachineLoop *Loop) {
- LLVM_DEBUG(dbgs() << "Processing loop phi def: " << *DefOpnd->getParent());
- // Check whether there is WWM code inside the loop.
- bool ContainsWWM = false;
- for (auto WWM : WWMs) {
- if (Loop->contains(WWM->getParent())) {
- LLVM_DEBUG(dbgs() << " contains WWM: " << *WWM);
- ContainsWWM = true;
- break;
- }
- }
- if (!ContainsWWM)
- return false;
- unsigned Reg = DefOpnd->getReg();
- // Remove kill mark from uses.
- for (auto &Use : MRI->use_operands(Reg))
- Use.setIsKill(false);
- // Make all defs except the dominating one partial defs.
- SmallVector<MachineInstr *, 4> Defs;
- for (auto &Def : MRI->def_instructions(Reg))
- Defs.push_back(&Def);
- for (auto Def : Defs) {
- if (DefOpnd->getParent() == Def)
- continue;
- Def->addOperand(MachineOperand::CreateReg(Reg, false, /*isImp=*/true));
- LLVM_DEBUG(dbgs() << *Def);
- }
- return true;
-}
-
diff --git a/lib/Target/AMDGPU/SIFixupVectorISel.cpp b/lib/Target/AMDGPU/SIFixupVectorISel.cpp
index ee39eb04d831..5b834c8de13a 100644
--- a/lib/Target/AMDGPU/SIFixupVectorISel.cpp
+++ b/lib/Target/AMDGPU/SIFixupVectorISel.cpp
@@ -1,9 +1,8 @@
//===-- SIFixupVectorISel.cpp - Fixup post ISel vector issues -------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
/// \file
/// SIFixupVectorISel pass cleans up post ISEL Vector issues.
@@ -198,6 +197,11 @@ static bool fixupGlobalSaddr(MachineBasicBlock &MBB,
// Atomics dont have a GLC, so omit the field if not there.
if (Glc)
NewGlob->addOperand(MF, *Glc);
+
+ MachineOperand *DLC = TII->getNamedOperand(MI, AMDGPU::OpName::dlc);
+ if (DLC)
+ NewGlob->addOperand(MF, *DLC);
+
NewGlob->addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::slc));
// _D16 instructions have a vdst_in operand, copy it in.
MachineOperand *VDstInOp = TII->getNamedOperand(MI,
diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp
index f4e866958369..74d77d328019 100644
--- a/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1,9 +1,8 @@
//===-- SIFoldOperands.cpp - Fold operands --- ----------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
/// \file
//===----------------------------------------------------------------------===//
@@ -51,7 +50,7 @@ struct FoldCandidate {
} else if (FoldOp->isFI()) {
FrameIndexToFold = FoldOp->getIndex();
} else {
- assert(FoldOp->isReg());
+ assert(FoldOp->isReg() || FoldOp->isGlobal());
OpToFold = FoldOp;
}
}
@@ -68,6 +67,8 @@ struct FoldCandidate {
return Kind == MachineOperand::MO_Register;
}
+ bool isGlobal() const { return Kind == MachineOperand::MO_GlobalAddress; }
+
bool isCommuted() const {
return Commuted;
}
@@ -88,10 +89,11 @@ public:
const SIInstrInfo *TII;
const SIRegisterInfo *TRI;
const GCNSubtarget *ST;
+ const SIMachineFunctionInfo *MFI;
void foldOperand(MachineOperand &OpToFold,
MachineInstr *UseMI,
- unsigned UseOpIdx,
+ int UseOpIdx,
SmallVectorImpl<FoldCandidate> &FoldList,
SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;
@@ -160,19 +162,34 @@ static bool isInlineConstantIfFolded(const SIInstrInfo *TII,
}
}
+// TODO: Add a heuristic for when the frame index might not fit in the
+// addressing mode immediate offset, to avoid materializing it in loops.
+static bool frameIndexMayFold(const SIInstrInfo *TII,
+ const MachineInstr &UseMI,
+ int OpNo,
+ const MachineOperand &OpToFold) {
+ return OpToFold.isFI() &&
+ (TII->isMUBUF(UseMI) || TII->isFLATScratch(UseMI)) &&
+ OpNo == AMDGPU::getNamedOperandIdx(UseMI.getOpcode(), AMDGPU::OpName::vaddr);
+}
+
FunctionPass *llvm::createSIFoldOperandsPass() {
return new SIFoldOperands();
}
static bool updateOperand(FoldCandidate &Fold,
const SIInstrInfo &TII,
- const TargetRegisterInfo &TRI) {
+ const TargetRegisterInfo &TRI,
+ const GCNSubtarget &ST) {
MachineInstr *MI = Fold.UseMI;
MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
assert(Old.isReg());
if (Fold.isImm()) {
- if (MI->getDesc().TSFlags & SIInstrFlags::IsPacked) {
+ if (MI->getDesc().TSFlags & SIInstrFlags::IsPacked &&
+ !(MI->getDesc().TSFlags & SIInstrFlags::IsMAI) &&
+ AMDGPU::isInlinableLiteralV216(static_cast<uint16_t>(Fold.ImmToFold),
+ ST.hasInv2PiInlineImm())) {
// Set op_sel/op_sel_hi on this operand or bail out if op_sel is
// already set.
unsigned Opcode = MI->getOpcode();
@@ -190,77 +207,94 @@ static bool updateOperand(FoldCandidate &Fold,
unsigned Val = Mod.getImm();
if ((Val & SISrcMods::OP_SEL_0) || !(Val & SISrcMods::OP_SEL_1))
return false;
- // If upper part is all zero we do not need op_sel_hi.
- if (!isUInt<16>(Fold.ImmToFold)) {
- if (!(Fold.ImmToFold & 0xffff)) {
- Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0);
+      // Only apply the following transformation if that operand requires
+ // a packed immediate.
+ switch (TII.get(Opcode).OpInfo[OpNo].OperandType) {
+ case AMDGPU::OPERAND_REG_IMM_V2FP16:
+ case AMDGPU::OPERAND_REG_IMM_V2INT16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
+ // If upper part is all zero we do not need op_sel_hi.
+ if (!isUInt<16>(Fold.ImmToFold)) {
+ if (!(Fold.ImmToFold & 0xffff)) {
+ Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0);
+ Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
+ Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff);
+ return true;
+ }
Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
- Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff);
+ Old.ChangeToImmediate(Fold.ImmToFold & 0xffff);
return true;
}
- Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
+ break;
+ default:
+ break;
}
}
+ }
- if (Fold.needsShrink()) {
- MachineBasicBlock *MBB = MI->getParent();
- auto Liveness = MBB->computeRegisterLiveness(&TRI, AMDGPU::VCC, MI);
- if (Liveness != MachineBasicBlock::LQR_Dead)
- return false;
-
- MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
- int Op32 = Fold.getShrinkOpcode();
- MachineOperand &Dst0 = MI->getOperand(0);
- MachineOperand &Dst1 = MI->getOperand(1);
- assert(Dst0.isDef() && Dst1.isDef());
-
- bool HaveNonDbgCarryUse = !MRI.use_nodbg_empty(Dst1.getReg());
+ if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
+ MachineBasicBlock *MBB = MI->getParent();
+ auto Liveness = MBB->computeRegisterLiveness(&TRI, AMDGPU::VCC, MI);
+ if (Liveness != MachineBasicBlock::LQR_Dead)
+ return false;
- const TargetRegisterClass *Dst0RC = MRI.getRegClass(Dst0.getReg());
- unsigned NewReg0 = MRI.createVirtualRegister(Dst0RC);
- const TargetRegisterClass *Dst1RC = MRI.getRegClass(Dst1.getReg());
- unsigned NewReg1 = MRI.createVirtualRegister(Dst1RC);
+ MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+ int Op32 = Fold.getShrinkOpcode();
+ MachineOperand &Dst0 = MI->getOperand(0);
+ MachineOperand &Dst1 = MI->getOperand(1);
+ assert(Dst0.isDef() && Dst1.isDef());
- MachineInstr *Inst32 = TII.buildShrunkInst(*MI, Op32);
+ bool HaveNonDbgCarryUse = !MRI.use_nodbg_empty(Dst1.getReg());
- if (HaveNonDbgCarryUse) {
- BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), Dst1.getReg())
- .addReg(AMDGPU::VCC, RegState::Kill);
- }
+ const TargetRegisterClass *Dst0RC = MRI.getRegClass(Dst0.getReg());
+ unsigned NewReg0 = MRI.createVirtualRegister(Dst0RC);
- // Keep the old instruction around to avoid breaking iterators, but
- // replace the outputs with dummy registers.
- Dst0.setReg(NewReg0);
- Dst1.setReg(NewReg1);
+ MachineInstr *Inst32 = TII.buildShrunkInst(*MI, Op32);
- if (Fold.isCommuted())
- TII.commuteInstruction(*Inst32, false);
- return true;
+ if (HaveNonDbgCarryUse) {
+ BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), Dst1.getReg())
+ .addReg(AMDGPU::VCC, RegState::Kill);
}
- Old.ChangeToImmediate(Fold.ImmToFold);
+ // Keep the old instruction around to avoid breaking iterators, but
+ // replace it with a dummy instruction to remove uses.
+ //
+    // FIXME: To avoid this, we should not invert how this pass looks at
+    // operands; it should track the set of foldable movs instead of searching
+    // for uses while looking at a use.
+ Dst0.setReg(NewReg0);
+ for (unsigned I = MI->getNumOperands() - 1; I > 0; --I)
+ MI->RemoveOperand(I);
+ MI->setDesc(TII.get(AMDGPU::IMPLICIT_DEF));
+
+ if (Fold.isCommuted())
+ TII.commuteInstruction(*Inst32, false);
return true;
}
assert(!Fold.needsShrink() && "not handled");
- if (Fold.isFI()) {
- Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
+ if (Fold.isImm()) {
+ Old.ChangeToImmediate(Fold.ImmToFold);
return true;
}
- MachineOperand *New = Fold.OpToFold;
- if (TargetRegisterInfo::isVirtualRegister(Old.getReg()) &&
- TargetRegisterInfo::isVirtualRegister(New->getReg())) {
- Old.substVirtReg(New->getReg(), New->getSubReg(), TRI);
-
- Old.setIsUndef(New->isUndef());
+ if (Fold.isGlobal()) {
+ Old.ChangeToGA(Fold.OpToFold->getGlobal(), Fold.OpToFold->getOffset(),
+ Fold.OpToFold->getTargetFlags());
return true;
}
- // FIXME: Handle physical registers.
+ if (Fold.isFI()) {
+ Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
+ return true;
+ }
- return false;
+ MachineOperand *New = Fold.OpToFold;
+ Old.substVirtReg(New->getReg(), New->getSubReg(), TRI);
+ Old.setIsUndef(New->isUndef());
+ return true;
}
static bool isUseMIInFoldList(ArrayRef<FoldCandidate> FoldList,
@@ -277,7 +311,6 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
MachineOperand *OpToFold,
const SIInstrInfo *TII) {
if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {
-
// Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
unsigned Opc = MI->getOpcode();
if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
@@ -344,7 +377,7 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
if ((Opc == AMDGPU::V_ADD_I32_e64 ||
Opc == AMDGPU::V_SUB_I32_e64 ||
Opc == AMDGPU::V_SUBREV_I32_e64) && // FIXME
- OpToFold->isImm()) {
+ (OpToFold->isImm() || OpToFold->isFI() || OpToFold->isGlobal())) {
MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
// Verify the other operand is a VGPR, otherwise we would violate the
@@ -357,7 +390,10 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
assert(MI->getOperand(1).isDef());
- int Op32 = AMDGPU::getVOPe32(Opc);
+ // Make sure to get the 32-bit version of the commuted opcode.
+ unsigned MaybeCommutedOpc = MI->getOpcode();
+ int Op32 = AMDGPU::getVOPe32(MaybeCommutedOpc);
+
FoldList.push_back(FoldCandidate(MI, CommuteOpNo, OpToFold, true,
Op32));
return true;
@@ -384,10 +420,75 @@ static bool isUseSafeToFold(const SIInstrInfo *TII,
//return !MI.hasRegisterImplicitUseOperand(UseMO.getReg());
}
+static bool tryToFoldACImm(const SIInstrInfo *TII,
+ const MachineOperand &OpToFold,
+ MachineInstr *UseMI,
+ unsigned UseOpIdx,
+ SmallVectorImpl<FoldCandidate> &FoldList) {
+ const MCInstrDesc &Desc = UseMI->getDesc();
+ const MCOperandInfo *OpInfo = Desc.OpInfo;
+ if (!OpInfo || UseOpIdx >= Desc.getNumOperands())
+ return false;
+
+ uint8_t OpTy = OpInfo[UseOpIdx].OperandType;
+ if (OpTy < AMDGPU::OPERAND_REG_INLINE_AC_FIRST ||
+ OpTy > AMDGPU::OPERAND_REG_INLINE_AC_LAST)
+ return false;
+
+ if (OpToFold.isImm() && TII->isInlineConstant(OpToFold, OpTy)) {
+ UseMI->getOperand(UseOpIdx).ChangeToImmediate(OpToFold.getImm());
+ return true;
+ }
+
+ if (!OpToFold.isReg())
+ return false;
+
+ unsigned UseReg = OpToFold.getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(UseReg))
+ return false;
+
+ if (llvm::find_if(FoldList, [UseMI](const FoldCandidate &FC) {
+ return FC.UseMI == UseMI; }) != FoldList.end())
+ return false;
+
+ MachineRegisterInfo &MRI = UseMI->getParent()->getParent()->getRegInfo();
+ const MachineInstr *Def = MRI.getUniqueVRegDef(UseReg);
+ if (!Def || !Def->isRegSequence())
+ return false;
+
+ int64_t Imm;
+ MachineOperand *Op;
+ for (unsigned I = 1, E = Def->getNumExplicitOperands(); I < E; I += 2) {
+ const MachineOperand &Sub = Def->getOperand(I);
+ if (!Sub.isReg() || Sub.getSubReg())
+ return false;
+ MachineInstr *SubDef = MRI.getUniqueVRegDef(Sub.getReg());
+ while (SubDef && !SubDef->isMoveImmediate() &&
+ !SubDef->getOperand(1).isImm() && TII->isFoldableCopy(*SubDef))
+ SubDef = MRI.getUniqueVRegDef(SubDef->getOperand(1).getReg());
+ if (!SubDef || !SubDef->isMoveImmediate() || !SubDef->getOperand(1).isImm())
+ return false;
+ Op = &SubDef->getOperand(1);
+ auto SubImm = Op->getImm();
+ if (I == 1) {
+ if (!TII->isInlineConstant(SubDef->getOperand(1), OpTy))
+ return false;
+
+ Imm = SubImm;
+ continue;
+ }
+ if (Imm != SubImm)
+ return false; // Can only fold splat constants
+ }
+
+ FoldList.push_back(FoldCandidate(UseMI, UseOpIdx, Op));
+ return true;
+}
+
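Editor's note: tryToFoldACImm only folds a REG_SEQUENCE into a single AccVGPR inline operand when every sub-register was materialized from the same immediate (a splat), because one inline constant has to stand in for all lanes of the tuple. A standalone sketch of that splat rule (illustrative only, not the patch's code):

  #include <cstdint>
  #include <vector>
  static bool isFoldableSplat(const std::vector<int64_t> &SubImms) {
    if (SubImms.empty())
      return false;
    for (int64_t V : SubImms)
      if (V != SubImms.front())
        return false;   // mixed values cannot become one inline constant
    return true;        // caller must still check inline-constant legality
  }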
void SIFoldOperands::foldOperand(
MachineOperand &OpToFold,
MachineInstr *UseMI,
- unsigned UseOpIdx,
+ int UseOpIdx,
SmallVectorImpl<FoldCandidate> &FoldList,
SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
@@ -420,11 +521,18 @@ void SIFoldOperands::foldOperand(
unsigned RegSeqDstReg = UseMI->getOperand(0).getReg();
unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();
+ MachineRegisterInfo::use_iterator Next;
for (MachineRegisterInfo::use_iterator
RSUse = MRI->use_begin(RegSeqDstReg), RSE = MRI->use_end();
- RSUse != RSE; ++RSUse) {
+ RSUse != RSE; RSUse = Next) {
+ Next = std::next(RSUse);
MachineInstr *RSUseMI = RSUse->getParent();
+
+ if (tryToFoldACImm(TII, UseMI->getOperand(0), RSUseMI,
+ RSUse.getOperandNo(), FoldList))
+ continue;
+
if (RSUse->getSubReg() != RegSeqDstSubReg)
continue;
@@ -435,10 +543,32 @@ void SIFoldOperands::foldOperand(
return;
}
+ if (tryToFoldACImm(TII, OpToFold, UseMI, UseOpIdx, FoldList))
+ return;
- bool FoldingImm = OpToFold.isImm();
+ if (frameIndexMayFold(TII, *UseMI, UseOpIdx, OpToFold)) {
+ // Sanity check that this is a stack access.
+ // FIXME: Should probably use stack pseudos before frame lowering.
+ MachineOperand *SOff = TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);
+ if (!SOff->isReg() || (SOff->getReg() != MFI->getScratchWaveOffsetReg() &&
+ SOff->getReg() != MFI->getStackPtrOffsetReg()))
+ return;
+
+ if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() !=
+ MFI->getScratchRSrcReg())
+ return;
- if (FoldingImm && UseMI->isCopy()) {
+ // A frame index will resolve to a positive constant, so it should always be
+ // safe to fold the addressing mode, even pre-GFX9.
+ UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getIndex());
+ SOff->setReg(MFI->getStackPtrOffsetReg());
+ return;
+ }
+
+ bool FoldingImmLike =
+ OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
+
+ if (FoldingImmLike && UseMI->isCopy()) {
unsigned DestReg = UseMI->getOperand(0).getReg();
const TargetRegisterClass *DestRC
= TargetRegisterInfo::isVirtualRegister(DestReg) ?
@@ -449,7 +579,7 @@ void SIFoldOperands::foldOperand(
if (TargetRegisterInfo::isVirtualRegister(DestReg) &&
TargetRegisterInfo::isVirtualRegister(SrcReg)) {
const TargetRegisterClass * SrcRC = MRI->getRegClass(SrcReg);
- if (TRI->isSGPRClass(SrcRC) && TRI->hasVGPRs(DestRC)) {
+ if (TRI->isSGPRClass(SrcRC) && TRI->hasVectorRegisters(DestRC)) {
MachineRegisterInfo::use_iterator NextUse;
SmallVector<FoldCandidate, 4> CopyUses;
for (MachineRegisterInfo::use_iterator
@@ -467,6 +597,14 @@ void SIFoldOperands::foldOperand(
}
}
+ if (DestRC == &AMDGPU::AGPR_32RegClass &&
+ TII->isInlineConstant(OpToFold, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
+ UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32));
+ UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
+ CopiesToReplace.push_back(UseMI);
+ return;
+ }
+
// In order to fold immediates into copies, we need to change the
// copy to a MOV.
@@ -479,18 +617,71 @@ void SIFoldOperands::foldOperand(
} else {
if (UseMI->isCopy() && OpToFold.isReg() &&
TargetRegisterInfo::isVirtualRegister(UseMI->getOperand(0).getReg()) &&
- TargetRegisterInfo::isVirtualRegister(UseMI->getOperand(1).getReg()) &&
- TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) &&
- TRI->isVGPR(*MRI, UseMI->getOperand(1).getReg()) &&
+ TRI->isVectorRegister(*MRI, UseMI->getOperand(0).getReg()) &&
+ TRI->isVectorRegister(*MRI, UseMI->getOperand(1).getReg()) &&
!UseMI->getOperand(1).getSubReg()) {
+ unsigned Size = TII->getOpSize(*UseMI, 1);
UseMI->getOperand(1).setReg(OpToFold.getReg());
UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
UseMI->getOperand(1).setIsKill(false);
CopiesToReplace.push_back(UseMI);
OpToFold.setIsKill(false);
+ if (Size != 4)
+ return;
+ if (TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) &&
+ TRI->isVGPR(*MRI, UseMI->getOperand(1).getReg()))
+ UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32));
+ else if (TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) &&
+ TRI->isAGPR(*MRI, UseMI->getOperand(1).getReg()))
+ UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_READ_B32));
return;
}
+ unsigned UseOpc = UseMI->getOpcode();
+ if (UseOpc == AMDGPU::V_READFIRSTLANE_B32 ||
+ (UseOpc == AMDGPU::V_READLANE_B32 &&
+ (int)UseOpIdx ==
+ AMDGPU::getNamedOperandIdx(UseOpc, AMDGPU::OpName::src0))) {
+ // %vgpr = V_MOV_B32 imm
+ // %sgpr = V_READFIRSTLANE_B32 %vgpr
+ // =>
+ // %sgpr = S_MOV_B32 imm
+ if (FoldingImmLike) {
+ if (execMayBeModifiedBeforeUse(*MRI,
+ UseMI->getOperand(UseOpIdx).getReg(),
+ *OpToFold.getParent(),
+ *UseMI))
+ return;
+
+ UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32));
+
+ // FIXME: ChangeToImmediate should clear subreg
+ UseMI->getOperand(1).setSubReg(0);
+ if (OpToFold.isImm())
+ UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
+ else
+ UseMI->getOperand(1).ChangeToFrameIndex(OpToFold.getIndex());
+ UseMI->RemoveOperand(2); // Remove exec read (or src1 for readlane)
+ return;
+ }
+
+ if (OpToFold.isReg() && TRI->isSGPRReg(*MRI, OpToFold.getReg())) {
+ if (execMayBeModifiedBeforeUse(*MRI,
+ UseMI->getOperand(UseOpIdx).getReg(),
+ *OpToFold.getParent(),
+ *UseMI))
+ return;
+
+ // %vgpr = COPY %sgpr0
+ // %sgpr1 = V_READFIRSTLANE_B32 %vgpr
+ // =>
+ // %sgpr1 = COPY %sgpr0
+ UseMI->setDesc(TII->get(AMDGPU::COPY));
+ UseMI->RemoveOperand(2); // Remove exec read (or src1 for readlane)
+ return;
+ }
+ }
+
const MCInstrDesc &UseDesc = UseMI->getDesc();
// Don't fold into target independent nodes. Target independent opcodes
@@ -501,7 +692,7 @@ void SIFoldOperands::foldOperand(
return;
}
- if (!FoldingImm) {
+ if (!FoldingImmLike) {
tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);
// FIXME: We could try to change the instruction from 64-bit to 32-bit
@@ -515,14 +706,10 @@ void SIFoldOperands::foldOperand(
const TargetRegisterClass *FoldRC =
TRI->getRegClass(FoldDesc.OpInfo[0].RegClass);
-
// Split 64-bit constants into 32-bits for folding.
if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(FoldRC->getID()) == 64) {
unsigned UseReg = UseOp.getReg();
- const TargetRegisterClass *UseRC
- = TargetRegisterInfo::isVirtualRegister(UseReg) ?
- MRI->getRegClass(UseReg) :
- TRI->getPhysRegClass(UseReg);
+ const TargetRegisterClass *UseRC = MRI->getRegClass(UseReg);
if (AMDGPU::getRegBitWidth(UseRC->getID()) != 64)
return;
@@ -763,14 +950,23 @@ static bool tryFoldInst(const SIInstrInfo *TII,
Opc == AMDGPU::V_CNDMASK_B64_PSEUDO) {
const MachineOperand *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
const MachineOperand *Src1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src1);
- if (Src1->isIdenticalTo(*Src0)) {
+ int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
+ int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
+ if (Src1->isIdenticalTo(*Src0) &&
+ (Src1ModIdx == -1 || !MI->getOperand(Src1ModIdx).getImm()) &&
+ (Src0ModIdx == -1 || !MI->getOperand(Src0ModIdx).getImm())) {
LLVM_DEBUG(dbgs() << "Folded " << *MI << " into ");
+ auto &NewDesc =
+ TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY : getMovOpc(false));
int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
if (Src2Idx != -1)
MI->RemoveOperand(Src2Idx);
MI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
- mutateCopyOp(*MI, TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY
- : getMovOpc(false)));
+ if (Src1ModIdx != -1)
+ MI->RemoveOperand(Src1ModIdx);
+ if (Src0ModIdx != -1)
+ MI->RemoveOperand(Src0ModIdx);
+ mutateCopyOp(*MI, NewDesc);
LLVM_DEBUG(dbgs() << *MI << '\n');
return true;
}
@@ -788,7 +984,7 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
SmallVector<FoldCandidate, 4> FoldList;
MachineOperand &Dst = MI.getOperand(0);
- bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();
+ bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
if (FoldingImm) {
unsigned NumLiteralUses = 0;
MachineOperand *NonInlineUse = nullptr;
@@ -840,6 +1036,9 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
// in some cases. A better heuristic is needed.
if (isInlineConstantIfFolded(TII, *UseMI, OpNo, OpToFold)) {
foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace);
+ } else if (frameIndexMayFold(TII, *UseMI, OpNo, OpToFold)) {
+ foldOperand(OpToFold, UseMI, OpNo, FoldList,
+ CopiesToReplace);
} else {
if (++NumLiteralUses == 1) {
NonInlineUse = &*Use;
@@ -874,7 +1073,7 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
Copy->addImplicitDefUseOperands(*MF);
for (FoldCandidate &Fold : FoldList) {
- if (updateOperand(Fold, *TII, *TRI)) {
+ if (updateOperand(Fold, *TII, *TRI, *ST)) {
// Clear kill flags.
if (Fold.isReg()) {
assert(Fold.OpToFold && Fold.OpToFold->isReg());
@@ -926,7 +1125,8 @@ const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const {
// Having a 0 op_sel_hi would require swizzling the output in the source
// instruction, which we can't do.
- unsigned UnsetMods = (Op == AMDGPU::V_PK_MAX_F16) ? SISrcMods::OP_SEL_1 : 0;
+ unsigned UnsetMods = (Op == AMDGPU::V_PK_MAX_F16) ? SISrcMods::OP_SEL_1
+ : 0u;
if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)
return nullptr;
return Src0;
@@ -1105,13 +1305,13 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
ST = &MF.getSubtarget<GCNSubtarget>();
TII = ST->getInstrInfo();
TRI = &TII->getRegisterInfo();
-
- const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ MFI = MF.getInfo<SIMachineFunctionInfo>();
// omod is ignored by hardware if IEEE bit is enabled. omod also does not
// correctly handle signed zeros.
//
- bool IsIEEEMode = ST->enableIEEEBit(MF);
+ // FIXME: Also need to check strictfp
+ bool IsIEEEMode = MFI->getMode().IEEE;
bool HasNSZ = MFI->hasNoSignedZerosFPMath();
for (MachineBasicBlock *MBB : depth_first(&MF)) {
@@ -1132,7 +1332,8 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
}
MachineOperand &OpToFold = MI.getOperand(1);
- bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();
+ bool FoldingImm =
+ OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
// FIXME: We could also be folding things like TargetIndexes.
if (!FoldingImm && !OpToFold.isReg())
diff --git a/lib/Target/AMDGPU/SIFormMemoryClauses.cpp b/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
index aa976d5141f8..f3c9ad63a80a 100644
--- a/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
+++ b/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
@@ -1,9 +1,8 @@
//===-- SIFormMemoryClauses.cpp -------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -119,6 +118,17 @@ static bool isValidClauseInst(const MachineInstr &MI, bool IsVMEMClause) {
return false;
if (!IsVMEMClause && !isSMEMClauseInst(MI))
return false;
+  // If this is a load instruction where the result has been coalesced with
+  // an operand, then we cannot clause it.
+ for (const MachineOperand &ResMO : MI.defs()) {
+ unsigned ResReg = ResMO.getReg();
+ for (const MachineOperand &MO : MI.uses()) {
+ if (!MO.isReg() || MO.isDef())
+ continue;
+ if (MO.getReg() == ResReg)
+ return false;
+ }
+ break; // Only check the first def.
+ }
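Editor's note: the new restriction rejects a memory instruction whose (first) result register also appears among its uses, which can happen once a tied operand such as vdst_in has been coalesced; inside a clause such an instruction would read a register the clause is in the middle of defining. A standalone sketch of the check in plain C++ (illustrative only):

  #include <vector>
  struct RegOp { unsigned Reg; bool IsDef; };
  static bool firstDefAliasesAUse(const std::vector<RegOp> &Ops) {
    for (const RegOp &Def : Ops) {
      if (!Def.IsDef)
        continue;
      for (const RegOp &Use : Ops)
        if (!Use.IsDef && Use.Reg == Def.Reg)
          return true;
      return false;   // mirrors the patch: only the first def is checked
    }
    return false;
  }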
return true;
}
@@ -309,6 +319,8 @@ bool SIFormMemoryClauses::runOnMachineFunction(MachineFunction &MF) {
MaxVGPRs = TRI->getAllocatableSet(MF, &AMDGPU::VGPR_32RegClass).count();
MaxSGPRs = TRI->getAllocatableSet(MF, &AMDGPU::SGPR_32RegClass).count();
+ unsigned FuncMaxClause = AMDGPU::getIntegerAttribute(
+ MF.getFunction(), "amdgpu-max-memory-clause", MaxClause);
for (MachineBasicBlock &MBB : MF) {
MachineBasicBlock::instr_iterator Next;
@@ -329,7 +341,7 @@ bool SIFormMemoryClauses::runOnMachineFunction(MachineFunction &MF) {
continue;
unsigned Length = 1;
- for ( ; Next != E && Length < MaxClause; ++Next) {
+ for ( ; Next != E && Length < FuncMaxClause; ++Next) {
if (!isValidClauseInst(*Next, IsVMEM))
break;
diff --git a/lib/Target/AMDGPU/SIFrameLowering.cpp b/lib/Target/AMDGPU/SIFrameLowering.cpp
index e4633c88e18f..feab6bed2603 100644
--- a/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -1,9 +1,8 @@
//===----------------------- SIFrameLowering.cpp --------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//==-----------------------------------------------------------------------===//
@@ -22,6 +21,8 @@
using namespace llvm;
+#define DEBUG_TYPE "frame-info"
+
static ArrayRef<MCPhysReg> getAllSGPR128(const GCNSubtarget &ST,
const MachineFunction &MF) {
@@ -35,6 +36,150 @@ static ArrayRef<MCPhysReg> getAllSGPRs(const GCNSubtarget &ST,
ST.getMaxNumSGPRs(MF));
}
+// Find a scratch register that we can use at the start of the prologue to
+// re-align the stack pointer. We avoid using callee-save registers since they
+// may appear to be free when this is called from canUseAsPrologue (during
+// shrink wrapping), but then no longer be free when this is called from
+// emitPrologue.
+//
+// FIXME: This is a bit conservative, since in the above case we could use one
+// of the callee-save registers as a scratch temp to re-align the stack pointer,
+// but we would then have to make sure that we were in fact saving at least one
+// callee-save register in the prologue, which is additional complexity that
+// doesn't seem worth the benefit.
+static unsigned findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI,
+ LivePhysRegs &LiveRegs,
+ const TargetRegisterClass &RC,
+ bool Unused = false) {
+ // Mark callee saved registers as used so we will not choose them.
+ const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
+ for (unsigned i = 0; CSRegs[i]; ++i)
+ LiveRegs.addReg(CSRegs[i]);
+
+ if (Unused) {
+ // We are looking for a register that can be used throughout the entire
+ // function, so any use is unacceptable.
+ for (unsigned Reg : RC) {
+ if (!MRI.isPhysRegUsed(Reg) && LiveRegs.available(MRI, Reg))
+ return Reg;
+ }
+ } else {
+ for (unsigned Reg : RC) {
+ if (LiveRegs.available(MRI, Reg))
+ return Reg;
+ }
+ }
+
+ // If we require an unused register, this is called from contexts where
+ // failure is an option and the caller has an alternative plan. In other
+ // contexts, this must succeed.
+ if (!Unused)
+ report_fatal_error("failed to find free scratch register");
+
+ return AMDGPU::NoRegister;
+}
+
+static MCPhysReg findUnusedSGPRNonCalleeSaved(MachineRegisterInfo &MRI) {
+ LivePhysRegs LiveRegs;
+ LiveRegs.init(*MRI.getTargetRegisterInfo());
+ return findScratchNonCalleeSaveRegister(
+ MRI, LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass, true);
+}
+
+// We need to emit stack operations specially here because the frame register
+// used differs from the one getFrameRegister would return for the rest of
+// the function.
+static void buildPrologSpill(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ const SIInstrInfo *TII, unsigned SpillReg,
+ unsigned ScratchRsrcReg, unsigned SPReg, int FI) {
+ MachineFunction *MF = MBB.getParent();
+ MachineFrameInfo &MFI = MF->getFrameInfo();
+
+ int64_t Offset = MFI.getObjectOffset(FI);
+
+ MachineMemOperand *MMO = MF->getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore, 4,
+ MFI.getObjectAlignment(FI));
+
+ if (isUInt<12>(Offset)) {
+ BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFSET))
+ .addReg(SpillReg, RegState::Kill)
+ .addReg(ScratchRsrcReg)
+ .addReg(SPReg)
+ .addImm(Offset)
+ .addImm(0) // glc
+ .addImm(0) // slc
+ .addImm(0) // tfe
+ .addImm(0) // dlc
+ .addMemOperand(MMO);
+ return;
+ }
+
+ MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister(
+ MF->getRegInfo(), LiveRegs, AMDGPU::VGPR_32RegClass);
+
+ BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), OffsetReg)
+ .addImm(Offset);
+
+ BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFEN))
+ .addReg(SpillReg, RegState::Kill)
+ .addReg(OffsetReg, RegState::Kill)
+ .addReg(ScratchRsrcReg)
+ .addReg(SPReg)
+ .addImm(0)
+ .addImm(0) // glc
+ .addImm(0) // slc
+ .addImm(0) // tfe
+ .addImm(0) // dlc
+ .addMemOperand(MMO);
+}
+
+static void buildEpilogReload(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ const SIInstrInfo *TII, unsigned SpillReg,
+ unsigned ScratchRsrcReg, unsigned SPReg, int FI) {
+ MachineFunction *MF = MBB.getParent();
+ MachineFrameInfo &MFI = MF->getFrameInfo();
+ int64_t Offset = MFI.getObjectOffset(FI);
+
+ MachineMemOperand *MMO = MF->getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad, 4,
+ MFI.getObjectAlignment(FI));
+
+ if (isUInt<12>(Offset)) {
+ BuildMI(MBB, I, DebugLoc(),
+ TII->get(AMDGPU::BUFFER_LOAD_DWORD_OFFSET), SpillReg)
+ .addReg(ScratchRsrcReg)
+ .addReg(SPReg)
+ .addImm(Offset)
+ .addImm(0) // glc
+ .addImm(0) // slc
+ .addImm(0) // tfe
+ .addImm(0) // dlc
+ .addMemOperand(MMO);
+ return;
+ }
+
+ MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister(
+ MF->getRegInfo(), LiveRegs, AMDGPU::VGPR_32RegClass);
+
+ BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), OffsetReg)
+ .addImm(Offset);
+
+ BuildMI(MBB, I, DebugLoc(),
+ TII->get(AMDGPU::BUFFER_LOAD_DWORD_OFFEN), SpillReg)
+ .addReg(OffsetReg, RegState::Kill)
+ .addReg(ScratchRsrcReg)
+ .addReg(SPReg)
+ .addImm(0)
+ .addImm(0) // glc
+ .addImm(0) // slc
+ .addImm(0) // tfe
+ .addImm(0) // dlc
+ .addMemOperand(MMO);
+}
+
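Both helpers above pick between two MUBUF forms: when the frame object's byte offset fits in the 12-bit unsigned immediate field, the _OFFSET form is emitted directly; otherwise the offset is first materialized into a scratch VGPR and the _OFFEN form is used. A minimal sketch of that decision, assuming only the 12-bit width implied by the isUInt<12> check; the helper name is invented for illustration:

    #include <cstdint>

    // Mirrors the isUInt<12>(Offset) test above: offsets in [0, 4096) can be
    // encoded as an immediate, larger ones need the register-offset (_OFFEN)
    // form with the offset placed in a VGPR first.
    static bool fitsInMUBUFImmOffset(int64_t Offset) {
      return Offset >= 0 && Offset < (int64_t(1) << 12);
    }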
void SIFrameLowering::emitFlatScratchInit(const GCNSubtarget &ST,
MachineFunction &MF,
MachineBasicBlock &MBB) const {
@@ -71,6 +216,24 @@ void SIFrameLowering::emitFlatScratchInit(const GCNSubtarget &ST,
// Do a 64-bit pointer add.
if (ST.flatScratchIsPointer()) {
+ if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
+ .addReg(FlatScrInitLo)
+ .addReg(ScratchWaveOffsetReg);
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), FlatScrInitHi)
+ .addReg(FlatScrInitHi)
+ .addImm(0);
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)).
+ addReg(FlatScrInitLo).
+ addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_LO |
+ (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)).
+ addReg(FlatScrInitHi).
+ addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_HI |
+ (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
+ return;
+ }
+
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO)
.addReg(FlatScrInitLo)
.addReg(ScratchWaveOffsetReg);
@@ -81,6 +244,8 @@ void SIFrameLowering::emitFlatScratchInit(const GCNSubtarget &ST,
return;
}
+ assert(ST.getGeneration() < AMDGPUSubtarget::GFX10);
+
// Copy the size in bytes.
BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO)
.addReg(FlatScrInitHi, RegState::Kill);
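On GFX10 the hunk above programs FLAT_SCR_LO/HI through s_setreg rather than writing the FLAT_SCR register pair directly. A small sketch of how the immediate operand is assembled, assuming the usual hwreg field layout (register id in bits [5:0], starting bit offset in [10:6], width minus one in [15:11]); the shift values here are an assumption, and only the WIDTH_M1_SHIFT_ name is taken from the patch:

    #include <cstdint>

    // Illustrative only: build an s_setreg immediate selecting a hardware
    // register id, a starting bit offset, and a field width. Writing all 32
    // bits of FLAT_SCR_LO/HI uses offset 0 and width 32, i.e. (31 << 11),
    // which matches the (31 << WIDTH_M1_SHIFT_) expression above.
    static uint16_t hwregImm(unsigned Id, unsigned Offset, unsigned Width) {
      return uint16_t(Id | (Offset << 6) | ((Width - 1) << 11));
    }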
@@ -145,34 +310,30 @@ unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg(
return ScratchRsrcReg;
}
-// Shift down registers reserved for the scratch wave offset and stack pointer
-// SGPRs.
-std::pair<unsigned, unsigned>
+// Shift down registers reserved for the scratch wave offset.
+std::pair<unsigned, bool>
SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg(
- const GCNSubtarget &ST,
- const SIInstrInfo *TII,
- const SIRegisterInfo *TRI,
- SIMachineFunctionInfo *MFI,
- MachineFunction &MF) const {
+ const GCNSubtarget &ST, const SIInstrInfo *TII, const SIRegisterInfo *TRI,
+ SIMachineFunctionInfo *MFI, MachineFunction &MF) const {
MachineRegisterInfo &MRI = MF.getRegInfo();
unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
+ assert(MFI->isEntryFunction());
+
// No replacement necessary.
if (ScratchWaveOffsetReg == AMDGPU::NoRegister ||
- !MRI.isPhysRegUsed(ScratchWaveOffsetReg)) {
- assert(MFI->getStackPtrOffsetReg() == AMDGPU::SP_REG);
- return std::make_pair(AMDGPU::NoRegister, AMDGPU::NoRegister);
+ (!hasFP(MF) && !MRI.isPhysRegUsed(ScratchWaveOffsetReg))) {
+ return std::make_pair(AMDGPU::NoRegister, false);
}
- unsigned SPReg = MFI->getStackPtrOffsetReg();
if (ST.hasSGPRInitBug())
- return std::make_pair(ScratchWaveOffsetReg, SPReg);
+ return std::make_pair(ScratchWaveOffsetReg, false);
unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
ArrayRef<MCPhysReg> AllSGPRs = getAllSGPRs(ST, MF);
if (NumPreloaded > AllSGPRs.size())
- return std::make_pair(ScratchWaveOffsetReg, SPReg);
+ return std::make_pair(ScratchWaveOffsetReg, false);
AllSGPRs = AllSGPRs.slice(NumPreloaded);
@@ -193,10 +354,11 @@ SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg(
unsigned ReservedRegCount = 13;
if (AllSGPRs.size() < ReservedRegCount)
- return std::make_pair(ScratchWaveOffsetReg, SPReg);
+ return std::make_pair(ScratchWaveOffsetReg, false);
bool HandledScratchWaveOffsetReg =
ScratchWaveOffsetReg != TRI->reservedPrivateSegmentWaveByteOffsetReg(MF);
+ bool FPAdjusted = false;
for (MCPhysReg Reg : AllSGPRs.drop_back(ReservedRegCount)) {
// Pick the first unallocated SGPR. Be careful not to pick an alias of the
@@ -206,24 +368,25 @@ SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg(
HandledScratchWaveOffsetReg = true;
MRI.replaceRegWith(ScratchWaveOffsetReg, Reg);
+ if (MFI->getScratchWaveOffsetReg() == MFI->getStackPtrOffsetReg()) {
+ assert(!hasFP(MF));
+ MFI->setStackPtrOffsetReg(Reg);
+ }
+
MFI->setScratchWaveOffsetReg(Reg);
+ MFI->setFrameOffsetReg(Reg);
ScratchWaveOffsetReg = Reg;
+ FPAdjusted = true;
break;
}
}
}
- return std::make_pair(ScratchWaveOffsetReg, SPReg);
+ return std::make_pair(ScratchWaveOffsetReg, FPAdjusted);
}
void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
- // Emit debugger prologue if "amdgpu-debugger-emit-prologue" attribute was
- // specified.
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- if (ST.debuggerEmitPrologue())
- emitDebuggerPrologue(MF, MBB);
-
assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
@@ -234,6 +397,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
// FIXME: We should be cleaning up these unused SGPR spill frame indices
// somewhere.
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo *TRI = &TII->getRegisterInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -251,38 +415,13 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
if (MFI->hasFlatScratchInit())
emitFlatScratchInit(ST, MF, MBB);
- unsigned SPReg = MFI->getStackPtrOffsetReg();
- if (SPReg != AMDGPU::SP_REG) {
- assert(MRI.isReserved(SPReg) && "SPReg used but not reserved");
-
- DebugLoc DL;
- const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
- int64_t StackSize = FrameInfo.getStackSize();
-
- if (StackSize == 0) {
- BuildMI(MBB, MBB.begin(), DL, TII->get(AMDGPU::COPY), SPReg)
- .addReg(MFI->getScratchWaveOffsetReg());
- } else {
- BuildMI(MBB, MBB.begin(), DL, TII->get(AMDGPU::S_ADD_U32), SPReg)
- .addReg(MFI->getScratchWaveOffsetReg())
- .addImm(StackSize * ST.getWavefrontSize());
- }
- }
-
unsigned ScratchRsrcReg
= getReservedPrivateSegmentBufferReg(ST, TII, TRI, MFI, MF);
unsigned ScratchWaveOffsetReg;
- std::tie(ScratchWaveOffsetReg, SPReg)
- = getReservedPrivateSegmentWaveByteOffsetReg(ST, TII, TRI, MFI, MF);
-
- // It's possible to have uses of only ScratchWaveOffsetReg without
- // ScratchRsrcReg if it's only used for the initialization of flat_scratch,
- // but the inverse is not true.
- if (ScratchWaveOffsetReg == AMDGPU::NoRegister) {
- assert(ScratchRsrcReg == AMDGPU::NoRegister);
- return;
- }
+ bool FPAdjusted;
+ std::tie(ScratchWaveOffsetReg, FPAdjusted) =
+ getReservedPrivateSegmentWaveByteOffsetReg(ST, TII, TRI, MFI, MF);
// We need to insert initialization of the scratch resource descriptor.
unsigned PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
@@ -294,18 +433,19 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
}
- bool OffsetRegUsed = MRI.isPhysRegUsed(ScratchWaveOffsetReg);
+ bool OffsetRegUsed = ScratchWaveOffsetReg != AMDGPU::NoRegister &&
+ MRI.isPhysRegUsed(ScratchWaveOffsetReg);
bool ResourceRegUsed = ScratchRsrcReg != AMDGPU::NoRegister &&
MRI.isPhysRegUsed(ScratchRsrcReg);
+ // FIXME: Hack to not crash in situations which emitted an error.
+ if (PreloadedScratchWaveOffsetReg == AMDGPU::NoRegister)
+ return;
+
// We added live-ins during argument lowering, but since they were not used
// they were deleted. We're adding the uses now, so add them back.
- if (OffsetRegUsed) {
- assert(PreloadedScratchWaveOffsetReg != AMDGPU::NoRegister &&
- "scratch wave offset input is required");
- MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
- MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
- }
+ MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
+ MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
if (ResourceRegUsed && PreloadedPrivateBufferReg != AMDGPU::NoRegister) {
assert(ST.isAmdHsaOrMesa(F) || ST.isMesaGfxShader(F));
@@ -318,7 +458,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
if (&OtherBB == &MBB)
continue;
- if (OffsetRegUsed)
+ if (OffsetRegUsed || FPAdjusted)
OtherBB.addLiveIn(ScratchWaveOffsetReg);
if (ResourceRegUsed)
@@ -346,11 +486,16 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
.addReg(PreloadedPrivateBufferReg, RegState::Kill);
}
- if (OffsetRegUsed &&
- PreloadedScratchWaveOffsetReg != ScratchWaveOffsetReg) {
+ unsigned SPReg = MFI->getStackPtrOffsetReg();
+ assert(SPReg != AMDGPU::SP_REG);
+
+ // FIXME: Remove the isPhysRegUsed checks
+ const bool HasFP = hasFP(MF);
+
+ if (HasFP || OffsetRegUsed) {
+ assert(ScratchWaveOffsetReg);
BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
- .addReg(PreloadedScratchWaveOffsetReg,
- MRI.isPhysRegUsed(ScratchWaveOffsetReg) ? 0 : RegState::Kill);
+ .addReg(PreloadedScratchWaveOffsetReg, HasFP ? RegState::Kill : 0);
}
if (CopyBuffer && !CopyBufferFirst) {
@@ -358,9 +503,26 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
.addReg(PreloadedPrivateBufferReg, RegState::Kill);
}
- if (ResourceRegUsed)
+ if (ResourceRegUsed) {
emitEntryFunctionScratchSetup(ST, MF, MBB, MFI, I,
PreloadedPrivateBufferReg, ScratchRsrcReg);
+ }
+
+ if (HasFP) {
+ DebugLoc DL;
+ const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+ int64_t StackSize = FrameInfo.getStackSize();
+
+ // On kernel entry, the private scratch wave offset is the SP value.
+ if (StackSize == 0) {
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), SPReg)
+ .addReg(MFI->getScratchWaveOffsetReg());
+ } else {
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), SPReg)
+ .addReg(MFI->getScratchWaveOffsetReg())
+ .addImm(StackSize * ST.getWavefrontSize());
+ }
+ }
}
// Emit scratch setup code for AMDPAL or Mesa, assuming ResourceRegUsed is set.
@@ -405,7 +567,7 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST,
}
}
MF.getRegInfo().addLiveIn(GitPtrLo);
- MF.front().addLiveIn(GitPtrLo);
+ MBB.addLiveIn(GitPtrLo);
BuildMI(MBB, I, DL, SMovB32, RsrcLo)
.addReg(GitPtrLo)
.addReg(ScratchRsrcReg, RegState::ImplicitDefine);
@@ -421,12 +583,15 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST,
MachineMemOperand::MOLoad |
MachineMemOperand::MOInvariant |
MachineMemOperand::MODereferenceable,
- 0, 0);
+ 16, 4);
unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
+ const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
+ unsigned EncodedOffset = AMDGPU::getSMRDEncodedOffset(Subtarget, Offset);
BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg)
.addReg(Rsrc01)
- .addImm(Offset) // offset
+ .addImm(EncodedOffset) // offset
.addImm(0) // glc
+ .addImm(0) // dlc
.addReg(ScratchRsrcReg, RegState::ImplicitDefine)
.addMemOperand(MMO);
return;
@@ -462,13 +627,17 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST,
MachineMemOperand::MOLoad |
MachineMemOperand::MOInvariant |
MachineMemOperand::MODereferenceable,
- 0, 0);
+ 8, 4);
BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01)
.addReg(MFI->getImplicitBufferPtrUserSGPR())
.addImm(0) // offset
.addImm(0) // glc
+ .addImm(0) // dlc
.addMemOperand(MMO)
.addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+
+ MF.getRegInfo().addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
+ MBB.addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
}
} else {
unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
@@ -494,38 +663,14 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST,
}
}
-// Find a scratch register that we can use at the start of the prologue to
-// re-align the stack pointer. We avoid using callee-save registers since they
-// may appear to be free when this is called from canUseAsPrologue (during
-// shrink wrapping), but then no longer be free when this is called from
-// emitPrologue.
-//
-// FIXME: This is a bit conservative, since in the above case we could use one
-// of the callee-save registers as a scratch temp to re-align the stack pointer,
-// but we would then have to make sure that we were in fact saving at least one
-// callee-save register in the prologue, which is additional complexity that
-// doesn't seem worth the benefit.
-static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock &MBB) {
- MachineFunction *MF = MBB.getParent();
-
- const GCNSubtarget &Subtarget = MF->getSubtarget<GCNSubtarget>();
- const SIRegisterInfo &TRI = *Subtarget.getRegisterInfo();
- LivePhysRegs LiveRegs(TRI);
- LiveRegs.addLiveIns(MBB);
-
- // Mark callee saved registers as used so we will not choose them.
- const MCPhysReg *CSRegs = TRI.getCalleeSavedRegs(MF);
- for (unsigned i = 0; CSRegs[i]; ++i)
- LiveRegs.addReg(CSRegs[i]);
-
- MachineRegisterInfo &MRI = MF->getRegInfo();
-
- for (unsigned Reg : AMDGPU::SReg_32_XM0RegClass) {
- if (LiveRegs.available(MRI, Reg))
- return Reg;
+bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
+ switch (ID) {
+ case TargetStackID::Default:
+ case TargetStackID::NoAlloc:
+ case TargetStackID::SGPRSpill:
+ return true;
}
-
- return AMDGPU::NoRegister;
+ llvm_unreachable("Invalid TargetStackID::Value");
}
void SIFrameLowering::emitPrologue(MachineFunction &MF,
@@ -537,31 +682,105 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
}
const MachineFrameInfo &MFI = MF.getFrameInfo();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo &TRI = TII->getRegisterInfo();
unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
unsigned FramePtrReg = FuncInfo->getFrameOffsetReg();
+ LivePhysRegs LiveRegs;
MachineBasicBlock::iterator MBBI = MBB.begin();
DebugLoc DL;
- // XXX - Is this the right predicate?
-
- bool NeedFP = hasFP(MF);
+ bool HasFP = false;
uint32_t NumBytes = MFI.getStackSize();
uint32_t RoundedSize = NumBytes;
- const bool NeedsRealignment = TRI.needsStackRealignment(MF);
+ // To avoid clobbering VGPRs in lanes that weren't active on function entry,
+ // turn on all lanes before doing the spill to memory.
+ unsigned ScratchExecCopy = AMDGPU::NoRegister;
+
+ // Emit the copy if we need an FP, and are using a free SGPR to save it.
+ if (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister) {
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FuncInfo->SGPRForFPSaveRestoreCopy)
+ .addReg(FramePtrReg)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
+ : FuncInfo->getSGPRSpillVGPRs()) {
+ if (!Reg.FI.hasValue())
+ continue;
+
+ if (ScratchExecCopy == AMDGPU::NoRegister) {
+ if (LiveRegs.empty()) {
+ LiveRegs.init(TRI);
+ LiveRegs.addLiveIns(MBB);
+ if (FuncInfo->SGPRForFPSaveRestoreCopy)
+ LiveRegs.removeReg(FuncInfo->SGPRForFPSaveRestoreCopy);
+ }
+
+ ScratchExecCopy
+ = findScratchNonCalleeSaveRegister(MRI, LiveRegs,
+ *TRI.getWaveMaskRegClass());
+ assert(FuncInfo->SGPRForFPSaveRestoreCopy != ScratchExecCopy);
+
+ const unsigned OrSaveExec = ST.isWave32() ?
+ AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
+ BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec),
+ ScratchExecCopy)
+ .addImm(-1);
+ }
- if (NeedsRealignment) {
- assert(NeedFP);
+ buildPrologSpill(LiveRegs, MBB, MBBI, TII, Reg.VGPR,
+ FuncInfo->getScratchRSrcReg(),
+ StackPtrReg,
+ Reg.FI.getValue());
+ }
+
+ if (ScratchExecCopy != AMDGPU::NoRegister) {
+ // FIXME: Split block and make terminator.
+ unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+ unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+ BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
+ .addReg(ScratchExecCopy, RegState::Kill);
+ LiveRegs.addReg(ScratchExecCopy);
+ }
+
+
+ if (FuncInfo->FramePointerSaveIndex) {
+ const int FI = FuncInfo->FramePointerSaveIndex.getValue();
+ assert(!MFI.isDeadObjectIndex(FI) &&
+ MFI.getStackID(FI) == TargetStackID::SGPRSpill);
+ ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill
+ = FuncInfo->getSGPRToVGPRSpills(FI);
+ assert(Spill.size() == 1);
+
+ // Save FP before setting it up.
+ // FIXME: This should respect spillSGPRToVGPR.
+ BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
+ Spill[0].VGPR)
+ .addReg(FramePtrReg)
+ .addImm(Spill[0].Lane)
+ .addReg(Spill[0].VGPR, RegState::Undef);
+ }
+
+ if (TRI.needsStackRealignment(MF)) {
+ HasFP = true;
const unsigned Alignment = MFI.getMaxAlignment();
RoundedSize += Alignment;
+ if (LiveRegs.empty()) {
+ LiveRegs.init(TRI);
+ LiveRegs.addLiveIns(MBB);
+ LiveRegs.addReg(FuncInfo->SGPRForFPSaveRestoreCopy);
+ }
- unsigned ScratchSPReg = findScratchNonCalleeSaveRegister(MBB);
- assert(ScratchSPReg != AMDGPU::NoRegister);
+ unsigned ScratchSPReg = findScratchNonCalleeSaveRegister(
+ MRI, LiveRegs, AMDGPU::SReg_32_XM0RegClass);
+ assert(ScratchSPReg != AMDGPU::NoRegister &&
+ ScratchSPReg != FuncInfo->SGPRForFPSaveRestoreCopy);
// s_add_u32 tmp_reg, s32, NumBytes
// s_and_b32 s32, tmp_reg, 0b111...0000
@@ -574,7 +793,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
.addImm(-Alignment * ST.getWavefrontSize())
.setMIFlag(MachineInstr::FrameSetup);
FuncInfo->setIsStackRealigned(true);
- } else if (NeedFP) {
+ } else if ((HasFP = hasFP(MF))) {
// If we need a base pointer, set it up here. It's whatever the value of
// the stack pointer is at this point. Any variable size objects will be
// allocated after this, so we can still use the base pointer to reference
@@ -584,21 +803,20 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
.setMIFlag(MachineInstr::FrameSetup);
}
- if (RoundedSize != 0 && hasSP(MF)) {
+ if (HasFP && RoundedSize != 0) {
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg)
.addReg(StackPtrReg)
.addImm(RoundedSize * ST.getWavefrontSize())
.setMIFlag(MachineInstr::FrameSetup);
}
- for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
- : FuncInfo->getSGPRSpillVGPRs()) {
- if (!Reg.FI.hasValue())
- continue;
- TII->storeRegToStackSlot(MBB, MBBI, Reg.VGPR, true,
- Reg.FI.getValue(), &AMDGPU::VGPR_32RegClass,
- &TII->getRegisterInfo());
- }
+ assert((!HasFP || (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister ||
+ FuncInfo->FramePointerSaveIndex)) &&
+ "Needed to save FP but didn't save it anywhere");
+
+ assert((HasFP || (FuncInfo->SGPRForFPSaveRestoreCopy == AMDGPU::NoRegister &&
+ !FuncInfo->FramePointerSaveIndex)) &&
+ "Saved FP but didn't need it");
}
void SIFrameLowering::emitEpilogue(MachineFunction &MF,
@@ -609,39 +827,87 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
+ LivePhysRegs LiveRegs;
+ DebugLoc DL;
+
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ uint32_t NumBytes = MFI.getStackSize();
+ uint32_t RoundedSize = FuncInfo->isStackRealigned() ?
+ NumBytes + MFI.getMaxAlignment() : NumBytes;
+
+ if (RoundedSize != 0 && hasFP(MF)) {
+ const unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg)
+ .addReg(StackPtrReg)
+ .addImm(RoundedSize * ST.getWavefrontSize())
+ .setMIFlag(MachineInstr::FrameDestroy);
+ }
+
+ if (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister) {
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FuncInfo->getFrameOffsetReg())
+ .addReg(FuncInfo->SGPRForFPSaveRestoreCopy)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ if (FuncInfo->FramePointerSaveIndex) {
+ const int FI = FuncInfo->FramePointerSaveIndex.getValue();
+
+ assert(!MF.getFrameInfo().isDeadObjectIndex(FI) &&
+ MF.getFrameInfo().getStackID(FI) == TargetStackID::SGPRSpill);
+
+ ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill
+ = FuncInfo->getSGPRToVGPRSpills(FI);
+ assert(Spill.size() == 1);
+ BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
+ FuncInfo->getFrameOffsetReg())
+ .addReg(Spill[0].VGPR)
+ .addImm(Spill[0].Lane);
+ }
+ unsigned ScratchExecCopy = AMDGPU::NoRegister;
for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
: FuncInfo->getSGPRSpillVGPRs()) {
if (!Reg.FI.hasValue())
continue;
- TII->loadRegFromStackSlot(MBB, MBBI, Reg.VGPR,
- Reg.FI.getValue(), &AMDGPU::VGPR_32RegClass,
- &TII->getRegisterInfo());
- }
- unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
- if (StackPtrReg == AMDGPU::NoRegister)
- return;
+ const SIRegisterInfo &TRI = TII->getRegisterInfo();
+ if (ScratchExecCopy == AMDGPU::NoRegister) {
+ // See emitPrologue
+ if (LiveRegs.empty()) {
+ LiveRegs.init(*ST.getRegisterInfo());
+ LiveRegs.addLiveOuts(MBB);
+ LiveRegs.stepBackward(*MBBI);
+ }
- const MachineFrameInfo &MFI = MF.getFrameInfo();
- uint32_t NumBytes = MFI.getStackSize();
+ ScratchExecCopy = findScratchNonCalleeSaveRegister(
+ MRI, LiveRegs, *TRI.getWaveMaskRegClass());
+ LiveRegs.removeReg(ScratchExecCopy);
- DebugLoc DL;
+ const unsigned OrSaveExec =
+ ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
- // FIXME: Clarify distinction between no set SP and SP. For callee functions,
- // it's really whether we need SP to be accurate or not.
+ BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), ScratchExecCopy)
+ .addImm(-1);
+ }
- if (NumBytes != 0 && hasSP(MF)) {
- uint32_t RoundedSize = FuncInfo->isStackRealigned() ?
- NumBytes + MFI.getMaxAlignment() : NumBytes;
+ buildEpilogReload(LiveRegs, MBB, MBBI, TII, Reg.VGPR,
+ FuncInfo->getScratchRSrcReg(),
+ FuncInfo->getStackPtrOffsetReg(), Reg.FI.getValue());
+ }
- BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg)
- .addReg(StackPtrReg)
- .addImm(RoundedSize * ST.getWavefrontSize());
+ if (ScratchExecCopy != AMDGPU::NoRegister) {
+ // FIXME: Split block and make terminator.
+ unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+ unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+ BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
+ .addReg(ScratchExecCopy, RegState::Kill);
}
}
+// Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not
+// memory. They should have been removed by now.
static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
I != E; ++I) {
@@ -652,6 +918,22 @@ static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
return true;
}
+#ifndef NDEBUG
+static bool allSGPRSpillsAreDead(const MachineFrameInfo &MFI,
+ Optional<int> FramePointerSaveIndex) {
+ for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
+ I != E; ++I) {
+ if (!MFI.isDeadObjectIndex(I) &&
+ MFI.getStackID(I) == TargetStackID::SGPRSpill &&
+ FramePointerSaveIndex && I != FramePointerSaveIndex) {
+ return false;
+ }
+ }
+
+ return true;
+}
+#endif
+
int SIFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
unsigned &FrameReg) const {
const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
@@ -665,81 +947,145 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
RegScavenger *RS) const {
MachineFrameInfo &MFI = MF.getFrameInfo();
- if (!MFI.hasStackObjects())
- return;
-
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- const SIInstrInfo *TII = ST.getInstrInfo();
- const SIRegisterInfo &TRI = TII->getRegisterInfo();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
- bool AllSGPRSpilledToVGPRs = false;
-
- if (TRI.spillSGPRToVGPR() && FuncInfo->hasSpilledSGPRs()) {
- AllSGPRSpilledToVGPRs = true;
-
- // Process all SGPR spills before frame offsets are finalized. Ideally SGPRs
- // are spilled to VGPRs, in which case we can eliminate the stack usage.
- //
- // XXX - This operates under the assumption that only other SGPR spills are
- // users of the frame index. I'm not 100% sure this is correct. The
- // StackColoring pass has a comment saying a future improvement would be to
- // merging of allocas with spill slots, but for now according to
- // MachineFrameInfo isSpillSlot can't alias any other object.
- for (MachineBasicBlock &MBB : MF) {
- MachineBasicBlock::iterator Next;
- for (auto I = MBB.begin(), E = MBB.end(); I != E; I = Next) {
- MachineInstr &MI = *I;
- Next = std::next(I);
-
- if (TII->isSGPRSpill(MI)) {
- int FI = TII->getNamedOperand(MI, AMDGPU::OpName::addr)->getIndex();
- assert(MFI.getStackID(FI) == SIStackID::SGPR_SPILL);
- if (FuncInfo->allocateSGPRSpillToVGPR(MF, FI)) {
- bool Spilled = TRI.eliminateSGPRToVGPRSpillFrameIndex(MI, FI, RS);
- (void)Spilled;
- assert(Spilled && "failed to spill SGPR to VGPR when allocated");
- } else
- AllSGPRSpilledToVGPRs = false;
- }
- }
- }
- FuncInfo->removeSGPRToVGPRFrameIndices(MFI);
- }
+ FuncInfo->removeDeadFrameIndices(MFI);
+ assert(allSGPRSpillsAreDead(MFI, None) &&
+ "SGPR spill should have been removed in SILowerSGPRSpills");
// FIXME: The other checks should be redundant with allStackObjectsAreDead,
// but currently hasNonSpillStackObjects is set only from source
// allocas. Stack temps produced from legalization are not counted currently.
- if (FuncInfo->hasNonSpillStackObjects() || FuncInfo->hasSpilledVGPRs() ||
- !AllSGPRSpilledToVGPRs || !allStackObjectsAreDead(MFI)) {
+ if (!allStackObjectsAreDead(MFI)) {
assert(RS && "RegScavenger required if spilling");
- // We force this to be at offset 0 so no user object ever has 0 as an
- // address, so we may use 0 as an invalid pointer value. This is because
- // LLVM assumes 0 is an invalid pointer in address space 0. Because alloca
- // is required to be address space 0, we are forced to accept this for
- // now. Ideally we could have the stack in another address space with 0 as a
- // valid pointer, and -1 as the null value.
- //
- // This will also waste additional space when user stack objects require > 4
- // byte alignment.
- //
- // The main cost here is losing the offset for addressing modes. However
- // this also ensures we shouldn't need a register for the offset when
- // emergency scavenging.
- int ScavengeFI = MFI.CreateFixedObject(
- TRI.getSpillSize(AMDGPU::SGPR_32RegClass), 0, false);
- RS->addScavengingFrameIndex(ScavengeFI);
+ if (FuncInfo->isEntryFunction()) {
+ int ScavengeFI = MFI.CreateFixedObject(
+ TRI->getSpillSize(AMDGPU::SGPR_32RegClass), 0, false);
+ RS->addScavengingFrameIndex(ScavengeFI);
+ } else {
+ int ScavengeFI = MFI.CreateStackObject(
+ TRI->getSpillSize(AMDGPU::SGPR_32RegClass),
+ TRI->getSpillAlignment(AMDGPU::SGPR_32RegClass),
+ false);
+ RS->addScavengingFrameIndex(ScavengeFI);
+ }
}
}
-void SIFrameLowering::determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
+// Only report VGPRs to generic code.
+void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
+ BitVector &SavedVGPRs,
RegScavenger *RS) const {
+ TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS);
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ if (MFI->isEntryFunction())
+ return;
+
+ const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+
+ // Ignore the SGPRs the default implementation found.
+ SavedVGPRs.clearBitsNotInMask(TRI->getAllVGPRRegMask());
+
+ // hasFP only knows about stack objects that already exist. We're now
+ // determining the stack slots that will be created, so we have to predict
+ // them. Stack objects force FP usage with calls.
+ //
+ // Note a new VGPR CSR may be introduced if one is used for the spill, but we
+ // don't want to report it here.
+ //
+ // FIXME: Is this really hasReservedCallFrame?
+ const bool WillHaveFP =
+ FrameInfo.hasCalls() &&
+ (SavedVGPRs.any() || !allStackObjectsAreDead(FrameInfo));
+
+ // VGPRs used for SGPR spilling need to be specially inserted in the prolog,
+ // so don't allow the default insertion to handle them.
+ for (auto SSpill : MFI->getSGPRSpillVGPRs())
+ SavedVGPRs.reset(SSpill.VGPR);
+
+ const bool HasFP = WillHaveFP || hasFP(MF);
+ if (!HasFP)
+ return;
+
+ if (MFI->haveFreeLanesForSGPRSpill(MF, 1)) {
+ int NewFI = MF.getFrameInfo().CreateStackObject(4, 4, true, nullptr,
+ TargetStackID::SGPRSpill);
+
+ // If there is already a VGPR with free lanes, use it. We may already have
+ // to pay the penalty for spilling a CSR VGPR.
+ if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI))
+ llvm_unreachable("allocate SGPR spill should have worked");
+
+ MFI->FramePointerSaveIndex = NewFI;
+
+ LLVM_DEBUG(
+ auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
+ dbgs() << "Spilling FP to " << printReg(Spill.VGPR, TRI)
+ << ':' << Spill.Lane << '\n');
+ return;
+ }
+
+ MFI->SGPRForFPSaveRestoreCopy = findUnusedSGPRNonCalleeSaved(MF.getRegInfo());
+
+ if (!MFI->SGPRForFPSaveRestoreCopy) {
+ // There's no free lane to spill, and no free register to save FP, so we're
+ // forced to spill another VGPR to use for the spill.
+ int NewFI = MF.getFrameInfo().CreateStackObject(4, 4, true, nullptr,
+ TargetStackID::SGPRSpill);
+ if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI))
+ llvm_unreachable("allocate SGPR spill should have worked");
+ MFI->FramePointerSaveIndex = NewFI;
+
+ LLVM_DEBUG(
+ auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
+ dbgs() << "FP requires fallback spill to " << printReg(Spill.VGPR, TRI)
+ << ':' << Spill.Lane << '\n';);
+ } else {
+ LLVM_DEBUG(dbgs() << "Saving FP with copy to " <<
+ printReg(MFI->SGPRForFPSaveRestoreCopy, TRI) << '\n');
+ }
+}
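The FP save logic above tries three strategies in order: reuse a VGPR spill lane that is already free, otherwise copy FP into a free non-callee-saved SGPR, and only as a last resort create a fresh SGPR-spill stack object (which forces another VGPR lane into use). A condensed restatement as a sketch with invented names, assuming only what the code above shows:

    // Illustrative pseudologic of the decision above; not an LLVM API.
    enum class FPSaveKind { ExistingVGPRLane, SGPRCopy, NewVGPRLane };

    static FPSaveKind pickFPSaveKind(bool HasFreeSpillLane, bool HasFreeSGPR) {
      if (HasFreeSpillLane)
        return FPSaveKind::ExistingVGPRLane; // haveFreeLanesForSGPRSpill()
      if (HasFreeSGPR)
        return FPSaveKind::SGPRCopy;         // SGPRForFPSaveRestoreCopy
      return FPSaveKind::NewVGPRLane;        // fallback SGPRSpill stack object
    }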
+
+void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
+ BitVector &SavedRegs,
+ RegScavenger *RS) const {
TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ if (MFI->isEntryFunction())
+ return;
+
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
// The SP is specifically managed and we don't want extra spills of it.
SavedRegs.reset(MFI->getStackPtrOffsetReg());
+ SavedRegs.clearBitsInMask(TRI->getAllVGPRRegMask());
+}
+
+bool SIFrameLowering::assignCalleeSavedSpillSlots(
+ MachineFunction &MF, const TargetRegisterInfo *TRI,
+ std::vector<CalleeSavedInfo> &CSI) const {
+ if (CSI.empty())
+ return true; // Early exit if no callee saved registers are modified!
+
+ const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+ if (!FuncInfo->SGPRForFPSaveRestoreCopy)
+ return false;
+
+ for (auto &CS : CSI) {
+ if (CS.getReg() == FuncInfo->getFrameOffsetReg()) {
+ if (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister)
+ CS.setDstReg(FuncInfo->SGPRForFPSaveRestoreCopy);
+ break;
+ }
+ }
+
+ return false;
}
MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
@@ -757,8 +1103,7 @@ MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;
- const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
- if (!TFI->hasReservedCallFrame(MF)) {
+ if (!hasReservedCallFrame(MF)) {
unsigned Align = getStackAlignment();
Amount = alignTo(Amount, Align);
@@ -777,60 +1122,25 @@ MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
return MBB.erase(I);
}
-void SIFrameLowering::emitDebuggerPrologue(MachineFunction &MF,
- MachineBasicBlock &MBB) const {
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- const SIInstrInfo *TII = ST.getInstrInfo();
- const SIRegisterInfo *TRI = &TII->getRegisterInfo();
- const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
-
- MachineBasicBlock::iterator I = MBB.begin();
- DebugLoc DL;
-
- // For each dimension:
- for (unsigned i = 0; i < 3; ++i) {
- // Get work group ID SGPR, and make it live-in again.
- unsigned WorkGroupIDSGPR = MFI->getWorkGroupIDSGPR(i);
- MF.getRegInfo().addLiveIn(WorkGroupIDSGPR);
- MBB.addLiveIn(WorkGroupIDSGPR);
-
- // Since SGPRs are spilled into VGPRs, copy work group ID SGPR to VGPR in
- // order to spill it to scratch.
- unsigned WorkGroupIDVGPR =
- MF.getRegInfo().createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), WorkGroupIDVGPR)
- .addReg(WorkGroupIDSGPR);
-
- // Spill work group ID.
- int WorkGroupIDObjectIdx = MFI->getDebuggerWorkGroupIDStackObjectIndex(i);
- TII->storeRegToStackSlot(MBB, I, WorkGroupIDVGPR, false,
- WorkGroupIDObjectIdx, &AMDGPU::VGPR_32RegClass, TRI);
-
- // Get work item ID VGPR, and make it live-in again.
- unsigned WorkItemIDVGPR = MFI->getWorkItemIDVGPR(i);
- MF.getRegInfo().addLiveIn(WorkItemIDVGPR);
- MBB.addLiveIn(WorkItemIDVGPR);
-
- // Spill work item ID.
- int WorkItemIDObjectIdx = MFI->getDebuggerWorkItemIDStackObjectIndex(i);
- TII->storeRegToStackSlot(MBB, I, WorkItemIDVGPR, false,
- WorkItemIDObjectIdx, &AMDGPU::VGPR_32RegClass, TRI);
- }
-}
-
bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
- // All stack operations are relative to the frame offset SGPR.
- // TODO: Still want to eliminate sometimes.
const MachineFrameInfo &MFI = MF.getFrameInfo();
+ if (MFI.hasCalls()) {
+ // All offsets are unsigned, so they need to be addressed in the same
+ // direction as stack growth.
+
+ // FIXME: This function is pretty broken, since it can be called before the
+ // frame layout is determined or CSR spills are inserted.
+ if (MFI.getStackSize() != 0)
+ return true;
+
+ // For the entry point, the input wave scratch offset must be copied to the
+ // API SP if there are calls.
+ if (MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction())
+ return true;
+ }
- // XXX - Is this only called after frame is finalized? Should be able to check
- // frame size.
- return MFI.hasStackObjects() && !allStackObjectsAreDead(MFI);
-}
-
-bool SIFrameLowering::hasSP(const MachineFunction &MF) const {
- const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
- // All stack operations are relative to the frame offset SGPR.
- const MachineFrameInfo &MFI = MF.getFrameInfo();
- return MFI.hasCalls() || MFI.hasVarSizedObjects() || TRI->needsStackRealignment(MF);
+ return MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() ||
+ MFI.hasStackMap() || MFI.hasPatchPoint() ||
+ MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->needsStackRealignment(MF) ||
+ MF.getTarget().Options.DisableFramePointerElim(MF);
}
diff --git a/lib/Target/AMDGPU/SIFrameLowering.h b/lib/Target/AMDGPU/SIFrameLowering.h
index 2f35b3631cdc..c644f4726e2c 100644
--- a/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/lib/Target/AMDGPU/SIFrameLowering.h
@@ -1,9 +1,8 @@
//===--------------------- SIFrameLowering.h --------------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -37,6 +36,14 @@ public:
void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
RegScavenger *RS = nullptr) const override;
+ void determineCalleeSavesSGPR(MachineFunction &MF, BitVector &SavedRegs,
+ RegScavenger *RS = nullptr) const;
+ bool
+ assignCalleeSavedSpillSlots(MachineFunction &MF,
+ const TargetRegisterInfo *TRI,
+ std::vector<CalleeSavedInfo> &CSI) const override;
+
+ bool isSupportedStackID(TargetStackID::Value ID) const override;
void processFunctionBeforeFrameFinalized(
MachineFunction &MF,
@@ -59,15 +66,9 @@ private:
SIMachineFunctionInfo *MFI,
MachineFunction &MF) const;
- std::pair<unsigned, unsigned> getReservedPrivateSegmentWaveByteOffsetReg(
- const GCNSubtarget &ST,
- const SIInstrInfo *TII,
- const SIRegisterInfo *TRI,
- SIMachineFunctionInfo *MFI,
- MachineFunction &MF) const;
-
- /// Emits debugger prologue.
- void emitDebuggerPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+ std::pair<unsigned, bool> getReservedPrivateSegmentWaveByteOffsetReg(
+ const GCNSubtarget &ST, const SIInstrInfo *TII, const SIRegisterInfo *TRI,
+ SIMachineFunctionInfo *MFI, MachineFunction &MF) const;
// Emit scratch setup code for AMDPAL or Mesa, assuming ResourceRegUsed is set.
void emitEntryFunctionScratchSetup(const GCNSubtarget &ST, MachineFunction &MF,
@@ -77,7 +78,6 @@ private:
public:
bool hasFP(const MachineFunction &MF) const override;
- bool hasSP(const MachineFunction &MF) const;
};
} // end namespace llvm
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index 0ba921647097..db0782e2bf3e 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1,9 +1,8 @@
//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -19,7 +18,6 @@
#include "SIISelLowering.h"
#include "AMDGPU.h"
-#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "SIDefines.h"
@@ -95,11 +93,10 @@ static cl::opt<bool> EnableVGPRIndexMode(
cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
cl::init(false));
-static cl::opt<unsigned> AssumeFrameIndexHighZeroBits(
- "amdgpu-frame-index-zero-bits",
- cl::desc("High bits of frame index assumed to be zero"),
- cl::init(5),
- cl::ReallyHidden);
+static cl::opt<bool> DisableLoopAlignment(
+ "amdgpu-disable-loop-alignment",
+ cl::desc("Do not align and prefetch loops"),
+ cl::init(false));
static unsigned findFirstFreeSGPR(CCState &CCInfo) {
unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
@@ -125,12 +122,18 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);
+ addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
+ addRegisterClass(MVT::v3f32, &AMDGPU::VReg_96RegClass);
+
addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass);
addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass);
addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
+ addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
+ addRegisterClass(MVT::v5f32, &AMDGPU::VReg_160RegClass);
+
addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass);
addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);
@@ -148,18 +151,27 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
}
+ if (Subtarget->hasMAIInsts()) {
+ addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
+ addRegisterClass(MVT::v32f32, &AMDGPU::VReg_1024RegClass);
+ }
+
computeRegisterProperties(Subtarget->getRegisterInfo());
// We need to custom lower vector stores from local memory
setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
+ setOperationAction(ISD::LOAD, MVT::v3i32, Custom);
setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
+ setOperationAction(ISD::LOAD, MVT::v5i32, Custom);
setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
setOperationAction(ISD::LOAD, MVT::v16i32, Custom);
setOperationAction(ISD::LOAD, MVT::i1, Custom);
setOperationAction(ISD::LOAD, MVT::v32i32, Custom);
setOperationAction(ISD::STORE, MVT::v2i32, Custom);
+ setOperationAction(ISD::STORE, MVT::v3i32, Custom);
setOperationAction(ISD::STORE, MVT::v4i32, Custom);
+ setOperationAction(ISD::STORE, MVT::v5i32, Custom);
setOperationAction(ISD::STORE, MVT::v8i32, Custom);
setOperationAction(ISD::STORE, MVT::v16i32, Custom);
setOperationAction(ISD::STORE, MVT::i1, Custom);
@@ -218,11 +230,15 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v8f16, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i16, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::v4f16, Custom);
+ setOperationAction(ISD::INTRINSIC_VOID, MVT::i16, Custom);
+ setOperationAction(ISD::INTRINSIC_VOID, MVT::i8, Custom);
setOperationAction(ISD::BRCOND, MVT::Other, Custom);
setOperationAction(ISD::BR_CC, MVT::i1, Expand);
@@ -248,8 +264,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
// We only support LOAD/STORE and vector manipulation ops for vectors
// with > 4 elements.
- for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
- MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16, MVT::v32i32 }) {
+ for (MVT VT : { MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
+ MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16,
+ MVT::v32i32, MVT::v32f32 }) {
for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
switch (Op) {
case ISD::LOAD:
@@ -323,6 +340,18 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom);
+ // Deal with vec3 vector operations when widened to vec4.
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v3i32, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v3f32, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i32, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4f32, Custom);
+
+ // Deal with vec5 vector operations when widened to vec8.
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v5i32, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v5f32, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i32, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8f32, Custom);
+
// BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
// and output demarshalling
setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
@@ -400,7 +429,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FMAXNUM_IEEE, MVT::f64, Legal);
- if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
+ if (Subtarget->haveRoundOpsF64()) {
setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
setOperationAction(ISD::FCEIL, MVT::f64, Legal);
setOperationAction(ISD::FRINT, MVT::f64, Legal);
@@ -492,7 +521,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
// F16 - VOP3 Actions.
setOperationAction(ISD::FMA, MVT::f16, Legal);
- if (!Subtarget->hasFP16Denormals())
+ if (!Subtarget->hasFP16Denormals() && STI.hasMadF16())
setOperationAction(ISD::FMAD, MVT::f16, Legal);
for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16}) {
@@ -607,6 +636,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f16, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom);
+
setOperationAction(ISD::SHL, MVT::v4i16, Custom);
setOperationAction(ISD::SRA, MVT::v4i16, Custom);
setOperationAction(ISD::SRL, MVT::v4i16, Custom);
@@ -679,6 +711,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::FCANONICALIZE);
setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
setTargetDAGCombine(ISD::ZERO_EXTEND);
+ setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
@@ -701,13 +734,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX);
setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN);
setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX);
+ setTargetDAGCombine(ISD::ATOMIC_LOAD_FADD);
setSchedulingPreference(Sched::RegPressure);
-
- // SI at least has hardware support for floating point exceptions, but no way
- // of using or handling them is implemented. They are also optional in OpenCL
- // (Section 7.3)
- setHasFloatingPointExceptions(Subtarget->hasFPExceptions());
}
const GCNSubtarget *SITargetLowering::getSubtarget() const {
@@ -910,6 +939,8 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
switch (IntrID) {
case Intrinsic::amdgcn_atomic_inc:
case Intrinsic::amdgcn_atomic_dec:
+ case Intrinsic::amdgcn_ds_ordered_add:
+ case Intrinsic::amdgcn_ds_ordered_swap:
case Intrinsic::amdgcn_ds_fadd:
case Intrinsic::amdgcn_ds_fmin:
case Intrinsic::amdgcn_ds_fmax: {
@@ -919,13 +950,75 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.align = 0;
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+ const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
+ if (!Vol->isZero())
+ Info.flags |= MachineMemOperand::MOVolatile;
+
+ return true;
+ }
+ case Intrinsic::amdgcn_buffer_atomic_fadd: {
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+ Info.opc = ISD::INTRINSIC_VOID;
+ Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
+ Info.ptrVal = MFI->getBufferPSV(
+ *MF.getSubtarget<GCNSubtarget>().getInstrInfo(),
+ CI.getArgOperand(1));
+ Info.align = 0;
+ Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+
const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4));
if (!Vol || !Vol->isZero())
Info.flags |= MachineMemOperand::MOVolatile;
return true;
}
+ case Intrinsic::amdgcn_global_atomic_fadd: {
+ Info.opc = ISD::INTRINSIC_VOID;
+ Info.memVT = MVT::getVT(CI.getOperand(0)->getType()
+ ->getPointerElementType());
+ Info.ptrVal = CI.getOperand(0);
+ Info.align = 0;
+ Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+
+ return true;
+ }
+ case Intrinsic::amdgcn_ds_append:
+ case Intrinsic::amdgcn_ds_consume: {
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::getVT(CI.getType());
+ Info.ptrVal = CI.getOperand(0);
+ Info.align = 0;
+ Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+
+ const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
+ if (!Vol->isZero())
+ Info.flags |= MachineMemOperand::MOVolatile;
+
+ return true;
+ }
+ case Intrinsic::amdgcn_ds_gws_init:
+ case Intrinsic::amdgcn_ds_gws_barrier:
+ case Intrinsic::amdgcn_ds_gws_sema_v:
+ case Intrinsic::amdgcn_ds_gws_sema_br:
+ case Intrinsic::amdgcn_ds_gws_sema_p:
+ case Intrinsic::amdgcn_ds_gws_sema_release_all: {
+ Info.opc = ISD::INTRINSIC_VOID;
+
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ Info.ptrVal =
+ MFI->getGWSPSV(*MF.getSubtarget<GCNSubtarget>().getInstrInfo());
+ // This is an abstract access, but we need to specify a type and size.
+ Info.memVT = MVT::i32;
+ Info.size = 4;
+ Info.align = 4;
+
+ Info.flags = MachineMemOperand::MOStore;
+ if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
+ Info.flags = MachineMemOperand::MOLoad;
+ return true;
+ }
default:
return false;
}
@@ -937,6 +1030,8 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
switch (II->getIntrinsicID()) {
case Intrinsic::amdgcn_atomic_inc:
case Intrinsic::amdgcn_atomic_dec:
+ case Intrinsic::amdgcn_ds_ordered_add:
+ case Intrinsic::amdgcn_ds_ordered_swap:
case Intrinsic::amdgcn_ds_fadd:
case Intrinsic::amdgcn_ds_fmin:
case Intrinsic::amdgcn_ds_fmax: {
@@ -960,6 +1055,13 @@ bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
// GFX9 added a 13-bit signed offset. When using regular flat instructions,
// the sign bit is ignored and is treated as a 12-bit unsigned offset.
+ // GFX10 shrank the signed offset to 12 bits. When using regular flat
+ // instructions, the sign bit is also ignored and the offset is treated as
+ // an 11-bit unsigned offset.
+
+ if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
+ return isUInt<11>(AM.BaseOffs) && AM.Scale == 0;
+
// Just r + i
return isUInt<12>(AM.BaseOffs) && AM.Scale == 0;
}
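A compact restatement of the offset rule described in the comments above, assuming only the bit widths used by the isUInt checks; the helper name is invented for illustration:

    #include <cstdint>

    // Regular flat accesses can fold only an unsigned immediate offset -- 12
    // bits wide before GFX10, 11 bits wide on GFX10 -- and never a scaled
    // register index.
    static bool isLegalFlatImmOffset(bool IsGFX10Plus, int64_t BaseOffs,
                                     int64_t Scale) {
      if (Scale != 0)
        return false;
      unsigned Bits = IsGFX10Plus ? 11 : 12;
      return BaseOffs >= 0 && BaseOffs < (int64_t(1) << Bits);
    }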
@@ -1030,7 +1132,8 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
return isLegalGlobalAddressingMode(AM);
if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
- AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
+ AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
+ AS == AMDGPUAS::BUFFER_FAT_POINTER) {
// If the offset isn't a multiple of 4, it probably isn't going to be
// correctly aligned.
// FIXME: Can we get the real alignment here?
@@ -1106,16 +1209,15 @@ bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
} else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
return (MemVT.getSizeInBits() <= MaxPrivateBits);
- } else if (AS == AMDGPUAS::LOCAL_ADDRESS) {
+ } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
return (MemVT.getSizeInBits() <= 2 * 32);
}
return true;
}
-bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
- unsigned AddrSpace,
- unsigned Align,
- bool *IsFast) const {
+bool SITargetLowering::allowsMisalignedMemoryAccesses(
+ EVT VT, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags,
+ bool *IsFast) const {
if (IsFast)
*IsFast = false;
@@ -1178,11 +1280,10 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
return VT.bitsGT(MVT::i32) && Align % 4 == 0;
}
-EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
- unsigned SrcAlign, bool IsMemset,
- bool ZeroMemset,
- bool MemcpyStrSrc,
- MachineFunction &MF) const {
+EVT SITargetLowering::getOptimalMemOpType(
+ uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
+ bool ZeroMemset, bool MemcpyStrSrc,
+ const AttributeList &FuncAttributes) const {
// FIXME: Should account for address space here.
// The default fallback uses the private pointer size as a guess for a type to
@@ -1201,7 +1302,8 @@ EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
static bool isFlatGlobalAddrSpace(unsigned AS) {
return AS == AMDGPUAS::GLOBAL_ADDRESS ||
AS == AMDGPUAS::FLAT_ADDRESS ||
- AS == AMDGPUAS::CONSTANT_ADDRESS;
+ AS == AMDGPUAS::CONSTANT_ADDRESS ||
+ AS > AMDGPUAS::MAX_AMDGPU_ADDRESS;
}
bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
@@ -1216,8 +1318,8 @@ bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
return I && I->getMetadata("amdgpu.noclobber");
}
-bool SITargetLowering::isCheapAddrSpaceCast(unsigned SrcAS,
- unsigned DestAS) const {
+bool SITargetLowering::isFreeAddrSpaceCast(unsigned SrcAS,
+ unsigned DestAS) const {
// Flat -> private/local is a simple truncate.
// Flat -> global is no-op
if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
@@ -1305,6 +1407,17 @@ SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
const SDLoc &SL, SDValue Val,
bool Signed,
const ISD::InputArg *Arg) const {
+ // First, if it is a widened vector, narrow it.
+ if (VT.isVector() &&
+ VT.getVectorNumElements() != MemVT.getVectorNumElements()) {
+ EVT NarrowedVT =
+ EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(),
+ VT.getVectorNumElements());
+ Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
+ DAG.getConstant(0, SL, MVT::i32));
+ }
+
+ // Then convert the vector elements or scalar value.
if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
VT.bitsLT(MemVT)) {
unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
@@ -1441,8 +1554,7 @@ static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
// First check if it's a PS input addr.
if (CallConv == CallingConv::AMDGPU_PS &&
- !Arg->Flags.isInReg() && !Arg->Flags.isByVal() && PSInputNum <= 15) {
-
+ !Arg->Flags.isInReg() && PSInputNum <= 15) {
bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
// Inconveniently only the first part of the split is marked as isSplit,
@@ -1508,7 +1620,13 @@ static void allocateSpecialEntryInputVGPRs(CCState &CCInfo,
// Try to allocate a VGPR at the end of the argument list, or, if no argument
// VGPRs are left, allocate a stack slot instead.
-static ArgDescriptor allocateVGPR32Input(CCState &CCInfo) {
+// If \p Mask is given, it indicates the bitfield position in the register.
+// If \p Arg is given, reuse it with the new \p Mask instead of allocating a new
+// register or stack slot.
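+// With the masks passed by the callers below (0x3ff, 0x3ff << 10, 0x3ff << 20),
+// the packed layout in the single VGPR is:
+//   bits  [9:0]  workitem ID X
+//   bits [19:10] workitem ID Y
+//   bits [29:20] workitem ID Z
+// e.g. the Y allocation reuses the X descriptor with Mask = 0x3ff << 10.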
+static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
+ ArgDescriptor Arg = ArgDescriptor()) {
+ if (Arg.isSet())
+ return ArgDescriptor::createArg(Arg, Mask);
+
ArrayRef<MCPhysReg> ArgVGPRs
= makeArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
@@ -1516,7 +1634,7 @@ static ArgDescriptor allocateVGPR32Input(CCState &CCInfo) {
// Spill to stack required.
int64_t Offset = CCInfo.AllocateStack(4, 4);
- return ArgDescriptor::createStack(Offset);
+ return ArgDescriptor::createStack(Offset, Mask);
}
unsigned Reg = ArgVGPRs[RegIdx];
@@ -1525,7 +1643,7 @@ static ArgDescriptor allocateVGPR32Input(CCState &CCInfo) {
MachineFunction &MF = CCInfo.getMachineFunction();
MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
- return ArgDescriptor::createRegister(Reg);
+ return ArgDescriptor::createRegister(Reg, Mask);
}
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
@@ -1557,14 +1675,21 @@ static void allocateSpecialInputVGPRs(CCState &CCInfo,
MachineFunction &MF,
const SIRegisterInfo &TRI,
SIMachineFunctionInfo &Info) {
- if (Info.hasWorkItemIDX())
- Info.setWorkItemIDX(allocateVGPR32Input(CCInfo));
+ const unsigned Mask = 0x3ff;
+ ArgDescriptor Arg;
+
+ if (Info.hasWorkItemIDX()) {
+ Arg = allocateVGPR32Input(CCInfo, Mask);
+ Info.setWorkItemIDX(Arg);
+ }
- if (Info.hasWorkItemIDY())
- Info.setWorkItemIDY(allocateVGPR32Input(CCInfo));
+ if (Info.hasWorkItemIDY()) {
+ Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
+ Info.setWorkItemIDY(Arg);
+ }
if (Info.hasWorkItemIDZ())
- Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo));
+ Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
}
static void allocateSpecialInputSGPRs(CCState &CCInfo,
@@ -1714,6 +1839,7 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM,
// should reserve the arguments and use them directly.
MachineFrameInfo &MFI = MF.getFrameInfo();
bool HasStackObjects = MFI.hasStackObjects();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
// Record that we know we have non-spill stack objects so we don't need to
// check all stack objects later.
@@ -1729,65 +1855,89 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM,
// the scratch registers to pass in.
bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- if (ST.isAmdHsaOrMesa(MF.getFunction())) {
- if (RequiresStackAccess) {
- // If we have stack objects, we unquestionably need the private buffer
- // resource. For the Code Object V2 ABI, this will be the first 4 user
- // SGPR inputs. We can reserve those and use them directly.
-
- unsigned PrivateSegmentBufferReg = Info.getPreloadedReg(
- AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
- Info.setScratchRSrcReg(PrivateSegmentBufferReg);
-
- if (MFI.hasCalls()) {
- // If we have calls, we need to keep the frame register in a register
- // that won't be clobbered by a call, so ensure it is copied somewhere.
-
- // This is not a problem for the scratch wave offset, because the same
- // registers are reserved in all functions.
-
- // FIXME: Nothing is really ensuring this is a call preserved register,
- // it's just selected from the end so it happens to be.
- unsigned ReservedOffsetReg
- = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
- Info.setScratchWaveOffsetReg(ReservedOffsetReg);
- } else {
- unsigned PrivateSegmentWaveByteOffsetReg = Info.getPreloadedReg(
- AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
- Info.setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
- }
- } else {
- unsigned ReservedBufferReg
- = TRI.reservedPrivateSegmentBufferReg(MF);
- unsigned ReservedOffsetReg
- = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
-
- // We tentatively reserve the last registers (skipping the last two
- // which may contain VCC). After register allocation, we'll replace
- // these with the ones immediately after those which were really
- // allocated. In the prologue copies will be inserted from the argument
- // to these reserved registers.
- Info.setScratchRSrcReg(ReservedBufferReg);
- Info.setScratchWaveOffsetReg(ReservedOffsetReg);
- }
+ if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
+ // If we have stack objects, we unquestionably need the private buffer
+ // resource. For the Code Object V2 ABI, this will be the first 4 user
+ // SGPR inputs. We can reserve those and use them directly.
+
+ unsigned PrivateSegmentBufferReg =
+ Info.getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
+ Info.setScratchRSrcReg(PrivateSegmentBufferReg);
} else {
unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
+    // We tentatively reserve the last registers (skipping those that may
+    // contain VCC, FLAT_SCR, and XNACK). After register allocation,
+ // we'll replace these with the ones immediately after those which were
+ // really allocated. In the prologue copies will be inserted from the
+ // argument to these reserved registers.
// Without HSA, relocations are used for the scratch pointer and the
// buffer resource setup is always inserted in the prologue. Scratch wave
// offset is still in an input SGPR.
Info.setScratchRSrcReg(ReservedBufferReg);
+ }
- if (HasStackObjects && !MFI.hasCalls()) {
- unsigned ScratchWaveOffsetReg = Info.getPreloadedReg(
- AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
- Info.setScratchWaveOffsetReg(ScratchWaveOffsetReg);
+ // hasFP should be accurate for kernels even before the frame is finalized.
+ if (ST.getFrameLowering()->hasFP(MF)) {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ // Try to use s32 as the SP, but move it if it would interfere with input
+ // arguments. This won't work with calls though.
+ //
+ // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
+ // registers.
+ if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
+ Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
} else {
- unsigned ReservedOffsetReg
- = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
+ assert(AMDGPU::isShader(MF.getFunction().getCallingConv()));
+
+ if (MFI.hasCalls())
+ report_fatal_error("call in graphics shader with too many input SGPRs");
+
+ for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
+ if (!MRI.isLiveIn(Reg)) {
+ Info.setStackPtrOffsetReg(Reg);
+ break;
+ }
+ }
+
+ if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
+ report_fatal_error("failed to find register for SP");
+ }
+
+ if (MFI.hasCalls()) {
+ Info.setScratchWaveOffsetReg(AMDGPU::SGPR33);
+ Info.setFrameOffsetReg(AMDGPU::SGPR33);
+ } else {
+ unsigned ReservedOffsetReg =
+ TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
Info.setScratchWaveOffsetReg(ReservedOffsetReg);
+ Info.setFrameOffsetReg(ReservedOffsetReg);
}
+ } else if (RequiresStackAccess) {
+ assert(!MFI.hasCalls());
+ // We know there are accesses and they will be done relative to SP, so just
+ // pin it to the input.
+ //
+ // FIXME: Should not do this if inline asm is reading/writing these
+ // registers.
+ unsigned PreloadedSP = Info.getPreloadedReg(
+ AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
+
+ Info.setStackPtrOffsetReg(PreloadedSP);
+ Info.setScratchWaveOffsetReg(PreloadedSP);
+ Info.setFrameOffsetReg(PreloadedSP);
+ } else {
+ assert(!MFI.hasCalls());
+
+ // There may not be stack access at all. There may still be spills, or
+ // access of a constant pointer (in which cases an extra copy will be
+ // emitted in the prolog).
+ unsigned ReservedOffsetReg
+ = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
+ Info.setStackPtrOffsetReg(ReservedOffsetReg);
+ Info.setScratchWaveOffsetReg(ReservedOffsetReg);
+ Info.setFrameOffsetReg(ReservedOffsetReg);
}
}
@@ -1845,7 +1995,6 @@ SDValue SITargetLowering::LowerFormalArguments(
const Function &Fn = MF.getFunction();
FunctionType *FType = MF.getFunction().getFunctionType();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
DiagnosticInfoUnsupported NoGraphicsHSA(
@@ -1854,11 +2003,6 @@ SDValue SITargetLowering::LowerFormalArguments(
return DAG.getEntryNode();
}
- // Create stack objects that are used for emitting debugger prologue if
- // "amdgpu-debugger-emit-prologue" attribute was specified.
- if (ST.debuggerEmitPrologue())
- createDebuggerPrologueStackObjects(MF);
-
SmallVector<ISD::InputArg, 16> Splits;
SmallVector<CCValAssign, 16> ArgLocs;
BitVector Skipped(Ins.size());
@@ -1869,12 +2013,6 @@ SDValue SITargetLowering::LowerFormalArguments(
bool IsKernel = AMDGPU::isKernel(CallConv);
bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
- if (!IsEntryFunc) {
- // 4 bytes are reserved at offset 0 for the emergency stack slot. Skip over
- // this when allocating argument fixed offsets.
- CCInfo.AllocateStack(4, 4);
- }
-
if (IsShader) {
processShaderInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
@@ -1975,7 +2113,8 @@ SDValue SITargetLowering::LowerFormalArguments(
auto *ParamTy =
dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
- ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+ ParamTy && (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
+ ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
// On SI local pointers are just offsets into LDS, so they are always
// less than 16-bits. On CI and newer they could potentially be
// real pointers, so we can't guarantee their size.
@@ -2002,13 +2141,14 @@ SDValue SITargetLowering::LowerFormalArguments(
Reg = MF.addLiveIn(Reg, RC);
SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
- if (Arg.Flags.isSRet() && !getSubtarget()->enableHugePrivateBuffer()) {
+ if (Arg.Flags.isSRet()) {
// The return object should be reasonably addressable.
      // FIXME: This helps when the return is a real sret. If it is an
// automatically inserted sret (i.e. CanLowerReturn returns false), an
// extra copy is inserted in SelectionDAGBuilder which obscures this.
- unsigned NumBits = 32 - AssumeFrameIndexHighZeroBits;
+ unsigned NumBits
+ = 32 - getSubtarget()->getKnownHighZeroBitsForFrameIndex();
Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
}
@@ -2126,16 +2266,13 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
SDValue ReturnAddrReg = CreateLiveInRegister(
DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
- // FIXME: Should be able to use a vreg here, but need a way to prevent it
- // from being allcoated to a CSR.
-
- SDValue PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
- MVT::i64);
-
- Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, Flag);
+ SDValue ReturnAddrVirtualReg = DAG.getRegister(
+ MF.getRegInfo().createVirtualRegister(&AMDGPU::CCR_SGPR_64RegClass),
+ MVT::i64);
+ Chain =
+ DAG.getCopyToReg(Chain, DL, ReturnAddrVirtualReg, ReturnAddrReg, Flag);
Flag = Chain.getValue(1);
-
- RetOps.push_back(PhysReturnAddrReg);
+ RetOps.push_back(ReturnAddrVirtualReg);
}
// Copy the result values into the output registers.
@@ -2295,9 +2432,6 @@ void SITargetLowering::passSpecialInputs(
AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
- AMDGPUFunctionArgInfo::WORKITEM_ID_X,
- AMDGPUFunctionArgInfo::WORKITEM_ID_Y,
- AMDGPUFunctionArgInfo::WORKITEM_ID_Z,
AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR
};
@@ -2337,6 +2471,71 @@ void SITargetLowering::passSpecialInputs(
MemOpChains.push_back(ArgStore);
}
}
+
+  // Pack workitem IDs into a single register, or pass them as-is if already
+  // packed.
+ const ArgDescriptor *OutgoingArg;
+ const TargetRegisterClass *ArgRC;
+
+ std::tie(OutgoingArg, ArgRC) =
+ CalleeArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
+ if (!OutgoingArg)
+ std::tie(OutgoingArg, ArgRC) =
+ CalleeArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
+ if (!OutgoingArg)
+ std::tie(OutgoingArg, ArgRC) =
+ CalleeArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
+ if (!OutgoingArg)
+ return;
+
+ const ArgDescriptor *IncomingArgX
+ = CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X).first;
+ const ArgDescriptor *IncomingArgY
+ = CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y).first;
+ const ArgDescriptor *IncomingArgZ
+ = CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z).first;
+
+ SDValue InputReg;
+ SDLoc SL;
+
+ // If incoming ids are not packed we need to pack them.
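+  // The packed value built here is X | (Y << 10) | (Z << 20), matching the
+  // layout used when the entry-point arguments are allocated.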
+ if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo.WorkItemIDX)
+ InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
+
+ if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo.WorkItemIDY) {
+ SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
+ Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
+ DAG.getShiftAmountConstant(10, MVT::i32, SL));
+ InputReg = InputReg.getNode() ?
+ DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y) : Y;
+ }
+
+ if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo.WorkItemIDZ) {
+ SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
+ Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
+ DAG.getShiftAmountConstant(20, MVT::i32, SL));
+ InputReg = InputReg.getNode() ?
+ DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z) : Z;
+ }
+
+ if (!InputReg.getNode()) {
+    // Workitem ids are already packed; any of the present incoming arguments
+    // will carry all of the required fields.
+ ArgDescriptor IncomingArg = ArgDescriptor::createArg(
+ IncomingArgX ? *IncomingArgX :
+ IncomingArgY ? *IncomingArgY :
+ *IncomingArgZ, ~0u);
+ InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
+ }
+
+ if (OutgoingArg->isRegister()) {
+ RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
+ } else {
+ unsigned SpecialArgOffset = CCInfo.AllocateStack(4, 4);
+ SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
+ SpecialArgOffset);
+ MemOpChains.push_back(ArgStore);
+ }
}
static bool canGuaranteeTCO(CallingConv::ID CC) {
@@ -2478,7 +2677,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
"unsupported call from graphics shader of function ");
}
- // The first 4 bytes are reserved for the callee's emergency stack slot.
if (IsTailCall) {
IsTailCall = isEligibleForTailCallOptimization(
Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
@@ -2505,9 +2703,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
- // The first 4 bytes are reserved for the callee's emergency stack slot.
- CCInfo.AllocateStack(4, 4);
-
CCInfo.AnalyzeCallOperands(Outs, AssignFn);
// Get a count of how many bytes are to be pushed on the stack.
@@ -2528,31 +2723,19 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
MachineFrameInfo &MFI = MF.getFrameInfo();
SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
- SDValue CallerSavedFP;
-
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
if (!IsSibCall) {
Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
- unsigned OffsetReg = Info->getScratchWaveOffsetReg();
+ SmallVector<SDValue, 4> CopyFromChains;
// In the HSA case, this should be an identity copy.
SDValue ScratchRSrcReg
= DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
-
- // TODO: Don't hardcode these registers and get from the callee function.
- SDValue ScratchWaveOffsetReg
- = DAG.getCopyFromReg(Chain, DL, OffsetReg, MVT::i32);
- RegsToPass.emplace_back(AMDGPU::SGPR4, ScratchWaveOffsetReg);
-
- if (!Info->isEntryFunction()) {
- // Avoid clobbering this function's FP value. In the current convention
- // callee will overwrite this, so do save/restore around the call site.
- CallerSavedFP = DAG.getCopyFromReg(Chain, DL,
- Info->getFrameOffsetReg(), MVT::i32);
- }
+ CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
+ Chain = DAG.getTokenFactor(DL, CopyFromChains);
}
SmallVector<SDValue, 8> MemOpChains;
@@ -2694,6 +2877,11 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
std::vector<SDValue> Ops;
Ops.push_back(Chain);
Ops.push_back(Callee);
+ // Add a redundant copy of the callee global which will not be legalized, as
+ // we need direct access to the callee later.
+ GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Callee);
+ const GlobalValue *GV = GSD->getGlobal();
+ Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
if (IsTailCall) {
// Each tail call may have to adjust the stack by a different amount, so
@@ -2735,12 +2923,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
Chain = Call.getValue(0);
InFlag = Call.getValue(1);
- if (CallerSavedFP) {
- SDValue FPReg = DAG.getRegister(Info->getFrameOffsetReg(), MVT::i32);
- Chain = DAG.getCopyToReg(Chain, DL, FPReg, CallerSavedFP, InFlag);
- InFlag = Chain.getValue(1);
- }
-
uint64_t CalleePopBytes = NumBytes;
Chain = DAG.getCALLSEQ_END(Chain, DAG.getTargetConstant(0, DL, MVT::i32),
DAG.getTargetConstant(CalleePopBytes, DL, MVT::i32),
@@ -2773,8 +2955,8 @@ unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
}
- if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
- Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
+ if (!Subtarget->hasFlatScrRegister() &&
+ Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
report_fatal_error(Twine("invalid register \""
+ StringRef(RegName) + "\" for subtarget."));
}
@@ -2830,6 +3012,107 @@ MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI,
return SplitBB;
}
+// Split block \p MBB at \p MI, so as to insert a loop. If \p InstInLoop is true,
+// \p MI will be the only instruction in the loop body block. Otherwise, it will
+// be the first instruction in the remainder block.
+//
+/// \returns { LoopBody, Remainder }
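+//
+// Illustrative CFG after the split (with \p InstInLoop = true):
+//   MBB -> LoopBB (contains \p MI, with a back edge to itself) -> RemainderBB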
+static std::pair<MachineBasicBlock *, MachineBasicBlock *>
+splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop) {
+ MachineFunction *MF = MBB.getParent();
+ MachineBasicBlock::iterator I(&MI);
+
+ // To insert the loop we need to split the block. Move everything after this
+ // point to a new block, and insert a new empty block between the two.
+ MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
+ MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
+ MachineFunction::iterator MBBI(MBB);
+ ++MBBI;
+
+ MF->insert(MBBI, LoopBB);
+ MF->insert(MBBI, RemainderBB);
+
+ LoopBB->addSuccessor(LoopBB);
+ LoopBB->addSuccessor(RemainderBB);
+
+ // Move the rest of the block into a new block.
+ RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
+
+ if (InstInLoop) {
+ auto Next = std::next(I);
+
+ // Move instruction to loop body.
+ LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
+
+ // Move the rest of the block.
+ RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
+ } else {
+ RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
+ }
+
+ MBB.addSuccessor(LoopBB);
+
+ return std::make_pair(LoopBB, RemainderBB);
+}
+
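+// Emit a loop that retries a GWS operation while TRAP_STS.MEM_VIOL is set.
+// Illustrative shape of the generated code (register names are placeholders and
+// the copies for the VGPR0 hack are omitted):
+//   Loop:
+//     s_setreg_imm32_b32 <TRAPSTS.MEM_VIOL>, 0
+//     <the GWS instruction>
+//     s_waitcnt 0
+//     s_getreg_b32 sN, <TRAPSTS.MEM_VIOL>
+//     s_cmp_lg_u32 sN, 0
+//     s_cbranch_scc1 Loop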
+MachineBasicBlock *
+SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+
+ MachineBasicBlock *LoopBB;
+ MachineBasicBlock *RemainderBB;
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+
+ MachineBasicBlock::iterator Prev = std::prev(MI.getIterator());
+
+ std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, *BB, true);
+
+ MachineBasicBlock::iterator I = LoopBB->end();
+ MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0);
+
+ const unsigned EncodedReg = AMDGPU::Hwreg::encodeHwreg(
+ AMDGPU::Hwreg::ID_TRAPSTS, AMDGPU::Hwreg::OFFSET_MEM_VIOL, 1);
+
+ // Clear TRAP_STS.MEM_VIOL
+ BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
+ .addImm(0)
+ .addImm(EncodedReg);
+
+ // This is a pain, but we're not allowed to have physical register live-ins
+ // yet. Insert a pair of copies if the VGPR0 hack is necessary.
+ if (Src && TargetRegisterInfo::isPhysicalRegister(Src->getReg())) {
+ unsigned Data0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(*BB, std::next(Prev), DL, TII->get(AMDGPU::COPY), Data0)
+ .add(*Src);
+
+ BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::COPY), Src->getReg())
+ .addReg(Data0);
+
+ MRI.setSimpleHint(Data0, Src->getReg());
+ }
+
+ BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_WAITCNT))
+ .addImm(0);
+
+ unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+
+ // Load and check TRAP_STS.MEM_VIOL
+ BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
+ .addImm(EncodedReg);
+
+ // FIXME: Do we need to use an isel pseudo that may clobber scc?
+ BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
+ .addReg(Reg, RegState::Kill)
+ .addImm(0);
+ BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
+ .addMBB(LoopBB);
+
+ return RemainderBB;
+}
+
// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
// wavefront. If the value is uniform and just happens to be in a VGPR, this
// will only do one iteration. In the worst case, this will loop 64 times.
@@ -2849,12 +3132,16 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
int Offset,
bool UseGPRIdxMode,
bool IsIndirectSrc) {
+ MachineFunction *MF = OrigBB.getParent();
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
MachineBasicBlock::iterator I = LoopBB.begin();
- unsigned PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
- unsigned NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ const TargetRegisterClass *BoolRC = TRI->getBoolRC();
+ unsigned PhiExec = MRI.createVirtualRegister(BoolRC);
+ unsigned NewExec = MRI.createVirtualRegister(BoolRC);
unsigned CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
- unsigned CondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned CondReg = MRI.createVirtualRegister(BoolRC);
BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
.addReg(InitReg)
@@ -2878,7 +3165,9 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
.addReg(IdxReg.getReg(), 0, IdxReg.getSubReg());
// Update EXEC, save the original EXEC value to VCC.
- BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec)
+ BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
+ : AMDGPU::S_AND_SAVEEXEC_B64),
+ NewExec)
.addReg(CondReg, RegState::Kill);
MRI.setSimpleHint(NewExec, CondReg);
@@ -2894,7 +3183,7 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
.addImm(Offset);
}
unsigned IdxMode = IsIndirectSrc ?
- VGPRIndexMode::SRC0_ENABLE : VGPRIndexMode::DST_ENABLE;
+ AMDGPU::VGPRIndexMode::SRC0_ENABLE : AMDGPU::VGPRIndexMode::DST_ENABLE;
MachineInstr *SetOn =
BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
.addReg(IdxReg, RegState::Kill)
@@ -2913,10 +3202,12 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
}
// Update EXEC, switch all done bits to 0 and all todo bits to 1.
+ unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
MachineInstr *InsertPt =
- BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
- .addReg(AMDGPU::EXEC)
- .addReg(NewExec);
+ BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
+ : AMDGPU::S_XOR_B64_term), Exec)
+ .addReg(Exec)
+ .addReg(NewExec);
// XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
// s_cbranch_scc0?
@@ -2942,38 +3233,28 @@ static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
bool UseGPRIdxMode,
bool IsIndirectSrc) {
MachineFunction *MF = MBB.getParent();
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
MachineRegisterInfo &MRI = MF->getRegInfo();
const DebugLoc &DL = MI.getDebugLoc();
MachineBasicBlock::iterator I(&MI);
+ const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
unsigned DstReg = MI.getOperand(0).getReg();
- unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
- unsigned TmpExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+ unsigned SaveExec = MRI.createVirtualRegister(BoolXExecRC);
+ unsigned TmpExec = MRI.createVirtualRegister(BoolXExecRC);
+ unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+ unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
// Save the EXEC mask
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), SaveExec)
- .addReg(AMDGPU::EXEC);
+ BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec)
+ .addReg(Exec);
- // To insert the loop we need to split the block. Move everything after this
- // point to a new block, and insert a new empty block between the two.
- MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
- MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
- MachineFunction::iterator MBBI(MBB);
- ++MBBI;
-
- MF->insert(MBBI, LoopBB);
- MF->insert(MBBI, RemainderBB);
-
- LoopBB->addSuccessor(LoopBB);
- LoopBB->addSuccessor(RemainderBB);
-
- // Move the rest of the block into a new block.
- RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
- RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
-
- MBB.addSuccessor(LoopBB);
+ MachineBasicBlock *LoopBB;
+ MachineBasicBlock *RemainderBB;
+ std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, MBB, false);
const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
@@ -2982,7 +3263,7 @@ static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
Offset, UseGPRIdxMode, IsIndirectSrc);
MachineBasicBlock::iterator First = RemainderBB->begin();
- BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
+ BuildMI(*RemainderBB, First, DL, TII->get(MovExecOpc), Exec)
.addReg(SaveExec);
return InsPt;
@@ -3025,7 +3306,7 @@ static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII,
if (UseGPRIdxMode) {
unsigned IdxMode = IsIndirectSrc ?
- VGPRIndexMode::SRC0_ENABLE : VGPRIndexMode::DST_ENABLE;
+ AMDGPU::VGPRIndexMode::SRC0_ENABLE : AMDGPU::VGPRIndexMode::DST_ENABLE;
if (Offset == 0) {
MachineInstr *SetOn =
BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
@@ -3274,6 +3555,9 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
case AMDGPU::S_ADD_U64_PSEUDO:
case AMDGPU::S_SUB_U64_PSEUDO: {
MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ const TargetRegisterClass *BoolRC = TRI->getBoolRC();
const DebugLoc &DL = MI.getDebugLoc();
MachineOperand &Dest = MI.getOperand(0);
@@ -3284,17 +3568,17 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
- Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub0,
+ Src0, BoolRC, AMDGPU::sub0,
&AMDGPU::SReg_32_XM0RegClass);
MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
- Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub1,
+ Src0, BoolRC, AMDGPU::sub1,
&AMDGPU::SReg_32_XM0RegClass);
MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
- Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub0,
+ Src1, BoolRC, AMDGPU::sub0,
&AMDGPU::SReg_32_XM0RegClass);
MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
- Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub1,
+ Src1, BoolRC, AMDGPU::sub1,
&AMDGPU::SReg_32_XM0RegClass);
bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
@@ -3330,6 +3614,14 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
MI.eraseFromParent();
return BB;
+ case AMDGPU::SI_INIT_EXEC_LO:
+ // This should be before all vector instructions.
+ BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B32),
+ AMDGPU::EXEC_LO)
+ .addImm(MI.getOperand(0).getImm());
+ MI.eraseFromParent();
+ return BB;
+
case AMDGPU::SI_INIT_EXEC_FROM_INPUT: {
// Extract the thread count from an SGPR input and set EXEC accordingly.
// Since BFM can't shift by 64, handle that case with CMP + CMOV.
@@ -3363,24 +3655,31 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
(void)Found;
// This should be before all vector instructions.
+ unsigned Mask = (getSubtarget()->getWavefrontSize() << 1) - 1;
+ bool isWave32 = getSubtarget()->isWave32();
+ unsigned Exec = isWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFE_U32), CountReg)
.addReg(InputReg)
- .addImm((MI.getOperand(1).getImm() & 0x7f) | 0x70000);
- BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFM_B64),
- AMDGPU::EXEC)
+ .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
+ BuildMI(*BB, FirstMI, DebugLoc(),
+ TII->get(isWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64),
+ Exec)
.addReg(CountReg)
.addImm(0);
BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMP_EQ_U32))
.addReg(CountReg, RegState::Kill)
- .addImm(64);
- BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMOV_B64),
- AMDGPU::EXEC)
+ .addImm(getSubtarget()->getWavefrontSize());
+ BuildMI(*BB, FirstMI, DebugLoc(),
+ TII->get(isWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
+ Exec)
.addImm(-1);
MI.eraseFromParent();
return BB;
}
case AMDGPU::GET_GROUPSTATICSIZE: {
+ assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
+ getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
DebugLoc DL = MI.getDebugLoc();
BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
.add(MI.getOperand(0))
@@ -3405,6 +3704,8 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
return splitKillBlock(MI, BB);
case AMDGPU::V_CNDMASK_B64_PSEUDO: {
MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
unsigned Dst = MI.getOperand(0).getReg();
unsigned Src0 = MI.getOperand(1).getReg();
@@ -3414,16 +3715,21 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
unsigned DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
unsigned DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- unsigned SrcCondCopy = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+ const auto *CondRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
+ unsigned SrcCondCopy = MRI.createVirtualRegister(CondRC);
BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy)
.addReg(SrcCond);
BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
+ .addImm(0)
.addReg(Src0, 0, AMDGPU::sub0)
+ .addImm(0)
.addReg(Src1, 0, AMDGPU::sub0)
.addReg(SrcCondCopy);
BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
+ .addImm(0)
.addReg(Src0, 0, AMDGPU::sub1)
+ .addImm(0)
.addReg(Src1, 0, AMDGPU::sub1)
.addReg(SrcCondCopy);
@@ -3457,40 +3763,60 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
.addReg(Info->getFrameOffsetReg(), RegState::Implicit);
return BB;
}
- case AMDGPU::SI_CALL_ISEL:
- case AMDGPU::SI_TCRETURN_ISEL: {
+ case AMDGPU::SI_CALL_ISEL: {
const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
const DebugLoc &DL = MI.getDebugLoc();
+
unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
- MachineRegisterInfo &MRI = MF->getRegInfo();
- unsigned GlobalAddrReg = MI.getOperand(0).getReg();
- MachineInstr *PCRel = MRI.getVRegDef(GlobalAddrReg);
- assert(PCRel->getOpcode() == AMDGPU::SI_PC_ADD_REL_OFFSET);
+ MachineInstrBuilder MIB;
+ MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
- const GlobalValue *G = PCRel->getOperand(1).getGlobal();
+ for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
+ MIB.add(MI.getOperand(I));
- MachineInstrBuilder MIB;
- if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
- MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg)
- .add(MI.getOperand(0))
- .addGlobalAddress(G);
- } else {
- MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_TCRETURN))
- .add(MI.getOperand(0))
- .addGlobalAddress(G);
+ MIB.cloneMemRefs(MI);
+ MI.eraseFromParent();
+ return BB;
+ }
+ case AMDGPU::V_ADD_I32_e32:
+ case AMDGPU::V_SUB_I32_e32:
+ case AMDGPU::V_SUBREV_I32_e32: {
+ // TODO: Define distinct V_*_I32_Pseudo instructions instead.
+ const DebugLoc &DL = MI.getDebugLoc();
+ unsigned Opc = MI.getOpcode();
- // There is an additional imm operand for tcreturn, but it should be in the
- // right place already.
+ bool NeedClampOperand = false;
+ if (TII->pseudoToMCOpcode(Opc) == -1) {
+ Opc = AMDGPU::getVOPe64(Opc);
+ NeedClampOperand = true;
}
- for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
- MIB.add(MI.getOperand(I));
+ auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
+ if (TII->isVOP3(*I)) {
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ I.addReg(TRI->getVCC(), RegState::Define);
+ }
+ I.add(MI.getOperand(1))
+ .add(MI.getOperand(2));
+ if (NeedClampOperand)
+ I.addImm(0); // clamp bit for e64 encoding
+
+ TII->legalizeOperands(*I);
- MIB.cloneMemRefs(MI);
MI.eraseFromParent();
return BB;
}
+ case AMDGPU::DS_GWS_INIT:
+ case AMDGPU::DS_GWS_SEMA_V:
+ case AMDGPU::DS_GWS_SEMA_BR:
+ case AMDGPU::DS_GWS_SEMA_P:
+ case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
+ case AMDGPU::DS_GWS_BARRIER:
+ if (getSubtarget()->hasGWSAutoReplay())
+ return BB;
+ return emitGWSMemViolTestLoop(MI, BB);
default:
return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
}
@@ -3617,6 +3943,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
case ISD::BRCOND: return LowerBRCOND(Op, DAG);
+ case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
case ISD::LOAD: {
SDValue Result = LowerLOAD(Op, DAG);
assert((!Result.getNode() ||
@@ -3641,10 +3968,14 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
+ case ISD::INSERT_SUBVECTOR:
+ return lowerINSERT_SUBVECTOR(Op, DAG);
case ISD::INSERT_VECTOR_ELT:
return lowerINSERT_VECTOR_ELT(Op, DAG);
case ISD::EXTRACT_VECTOR_ELT:
return lowerEXTRACT_VECTOR_ELT(Op, DAG);
+ case ISD::VECTOR_SHUFFLE:
+ return lowerVECTOR_SHUFFLE(Op, DAG);
case ISD::BUILD_VECTOR:
return lowerBUILD_VECTOR(Op, DAG);
case ISD::FP_ROUND:
@@ -3742,10 +4073,7 @@ SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI,
SDNode *N, SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
- const auto *CD = dyn_cast<ConstantSDNode>(N->getOperand(3));
- if (!CD)
- return DAG.getUNDEF(VT);
-
+ const auto *CD = cast<ConstantSDNode>(N->getOperand(3));
int CondCode = CD->getSExtValue();
if (CondCode < ICmpInst::Predicate::FIRST_ICMP_PREDICATE ||
CondCode > ICmpInst::Predicate::LAST_ICMP_PREDICATE)
@@ -3753,7 +4081,6 @@ static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI,
ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
-
SDValue LHS = N->getOperand(1);
SDValue RHS = N->getOperand(2);
@@ -3769,16 +4096,20 @@ static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI,
ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
- return DAG.getNode(AMDGPUISD::SETCC, DL, VT, LHS, RHS,
- DAG.getCondCode(CCOpcode));
+ unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
+ EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
+
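+  // E.g. on a wave32 subtarget the compare produces an i32 mask; if the
+  // intrinsic's result type is wider, it is zero-extended below.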
+ SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
+ DAG.getCondCode(CCOpcode));
+ if (VT.bitsEq(CCVT))
+ return SetCC;
+ return DAG.getZExtOrTrunc(SetCC, DL, VT);
}
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI,
SDNode *N, SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
- const auto *CD = dyn_cast<ConstantSDNode>(N->getOperand(3));
- if (!CD)
- return DAG.getUNDEF(VT);
+ const auto *CD = cast<ConstantSDNode>(N->getOperand(3));
int CondCode = CD->getSExtValue();
if (CondCode < FCmpInst::Predicate::FIRST_FCMP_PREDICATE ||
@@ -3798,8 +4129,13 @@ static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI,
FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
- return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src0,
- Src1, DAG.getCondCode(CCOpcode));
+ unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
+ EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
+ SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0,
+ Src1, DAG.getCondCode(CCOpcode));
+ if (VT.bitsEq(CCVT))
+ return SetCC;
+ return DAG.getZExtOrTrunc(SetCC, SL, VT);
}
void SITargetLowering::ReplaceNodeResults(SDNode *N,
@@ -3957,32 +4293,6 @@ unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
return 0;
}
-void SITargetLowering::createDebuggerPrologueStackObjects(
- MachineFunction &MF) const {
- // Create stack objects that are used for emitting debugger prologue.
- //
- // Debugger prologue writes work group IDs and work item IDs to scratch memory
- // at fixed location in the following format:
- // offset 0: work group ID x
- // offset 4: work group ID y
- // offset 8: work group ID z
- // offset 16: work item ID x
- // offset 20: work item ID y
- // offset 24: work item ID z
- SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
- int ObjectIdx = 0;
-
- // For each dimension:
- for (unsigned i = 0; i < 3; ++i) {
- // Create fixed stack object for work group ID.
- ObjectIdx = MF.getFrameInfo().CreateFixedObject(4, i * 4, true);
- Info->setDebuggerWorkGroupIDStackObjectIndex(i, ObjectIdx);
- // Create fixed stack object for work item ID.
- ObjectIdx = MF.getFrameInfo().CreateFixedObject(4, i * 4 + 16, true);
- Info->setDebuggerWorkItemIDStackObjectIndex(i, ObjectIdx);
- }
-}
-
bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
const Triple &TT = getTargetMachine().getTargetTriple();
return (GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
@@ -3991,7 +4301,10 @@ bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
}
bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
- return (GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
+ // FIXME: Either avoid relying on address space here or change the default
+ // address space for functions to avoid the explicit check.
+ return (GV->getValueType()->isFunctionTy() ||
+ GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
!shouldEmitFixup(GV) &&
@@ -4103,6 +4416,31 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
return Chain;
}
+SDValue SITargetLowering::LowerRETURNADDR(SDValue Op,
+ SelectionDAG &DAG) const {
+ MVT VT = Op.getSimpleValueType();
+ SDLoc DL(Op);
+  // Only a depth of zero (the current frame) is supported; return 0 otherwise.
+ if (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue() != 0)
+ return DAG.getConstant(0, DL, VT);
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ // Check for kernel and shader functions
+ if (Info->isEntryFunction())
+ return DAG.getConstant(0, DL, VT);
+
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ // There is a call to @llvm.returnaddress in this function
+ MFI.setReturnAddressIsTaken(true);
+
+ const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
+ // Get the return address reg and mark it as an implicit live-in
+  unsigned Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF),
+                              getRegClassFor(VT, Op.getNode()->isDivergent()));
+
+ return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
+}
+
SDValue SITargetLowering::getFPExtOrFPTrunc(SelectionDAG &DAG,
SDValue Op,
const SDLoc &DL,
@@ -4131,7 +4469,9 @@ SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
- bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction());
+ const MachineFunction &MF = DAG.getMachineFunction();
+ const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ bool IsIEEEMode = Info->getMode().IEEE;
  // FIXME: Assert during selection that this is only selected for
// ieee_mode. Currently a combine can produce the ieee version for non-ieee
@@ -4302,6 +4642,32 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
return DAG.getUNDEF(ASC->getValueType(0));
}
+// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
+// the small vector and inserting them into the big vector. That is better than
+// the default expansion of doing it via a stack slot. Even though the use of
+// the stack slot would be optimized away afterwards, the stack slot itself
+// remains.
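+//
+// E.g. inserting a v2f16 subvector at index 2 of a v4f16 vector becomes two
+// EXTRACT_VECTOR_ELT / INSERT_VECTOR_ELT pairs targeting elements 2 and 3.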
+SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDValue Vec = Op.getOperand(0);
+ SDValue Ins = Op.getOperand(1);
+ SDValue Idx = Op.getOperand(2);
+ EVT VecVT = Vec.getValueType();
+ EVT InsVT = Ins.getValueType();
+ EVT EltVT = VecVT.getVectorElementType();
+ unsigned InsNumElts = InsVT.getVectorNumElements();
+ unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ SDLoc SL(Op);
+
+ for (unsigned I = 0; I != InsNumElts; ++I) {
+ SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
+ DAG.getConstant(I, SL, MVT::i32));
+ Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
+ DAG.getConstant(IdxVal + I, SL, MVT::i32));
+ }
+ return Vec;
+}
+
SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
SDValue Vec = Op.getOperand(0);
@@ -4352,12 +4718,12 @@ SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
MVT IntVT = MVT::getIntegerVT(VecSize);
// Avoid stack access for dynamic indexing.
- SDValue Val = InsVal;
- if (InsVal.getValueType() == MVT::f16)
- Val = DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal);
-
// v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
- SDValue ExtVal = DAG.getNode(ISD::ZERO_EXTEND, SL, IntVT, Val);
+
+ // Create a congruent vector with the target value in each element so that
+ // the required element can be masked and ORed into the target vector.
+ SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
+ DAG.getSplatBuildVector(VecVT, SL, InsVal));
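+  // E.g. for a v4i16 insert, the splat bitcast to i64 repeats the 16-bit value
+  // in every lane, and the BFM/BFI sequence below keeps only the lane addressed
+  // by the (shifted) index.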
assert(isPowerOf2_32(EltSize));
SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
@@ -4419,6 +4785,63 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
}
+static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
+ assert(Elt % 2 == 0);
+ return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
+}
+
+SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc SL(Op);
+ EVT ResultVT = Op.getValueType();
+ ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
+
+ EVT PackVT = ResultVT.isInteger() ? MVT::v2i16 : MVT::v2f16;
+ EVT EltVT = PackVT.getVectorElementType();
+ int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
+
+ // vector_shuffle <0,1,6,7> lhs, rhs
+ // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
+ //
+ // vector_shuffle <6,7,2,3> lhs, rhs
+ // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
+ //
+ // vector_shuffle <6,7,0,1> lhs, rhs
+ // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
+
+ // Avoid scalarizing when both halves are reading from consecutive elements.
+ SmallVector<SDValue, 4> Pieces;
+ for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
+ if (elementPairIsContiguous(SVN->getMask(), I)) {
+ const int Idx = SVN->getMaskElt(I);
+ int VecIdx = Idx < SrcNumElts ? 0 : 1;
+ int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
+ SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL,
+ PackVT, SVN->getOperand(VecIdx),
+ DAG.getConstant(EltIdx, SL, MVT::i32));
+ Pieces.push_back(SubVec);
+ } else {
+ const int Idx0 = SVN->getMaskElt(I);
+ const int Idx1 = SVN->getMaskElt(I + 1);
+ int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
+ int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
+ int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
+ int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
+
+ SDValue Vec0 = SVN->getOperand(VecIdx0);
+ SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
+ Vec0, DAG.getConstant(EltIdx0, SL, MVT::i32));
+
+ SDValue Vec1 = SVN->getOperand(VecIdx1);
+ SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
+ Vec1, DAG.getConstant(EltIdx1, SL, MVT::i32));
+ Pieces.push_back(DAG.getBuildVector(PackVT, SL, { Elt0, Elt1 }));
+ }
+ }
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
+}
+
SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
SelectionDAG &DAG) const {
SDLoc SL(Op);
@@ -4512,11 +4935,18 @@ buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
// of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
// small. This requires us to add 4 to the global variable offset in order to
// compute the correct address.
- SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4,
- GAFlags);
- SDValue PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4,
- GAFlags == SIInstrInfo::MO_NONE ?
- GAFlags : GAFlags + 1);
+ unsigned LoFlags = GAFlags;
+ if (LoFlags == SIInstrInfo::MO_NONE)
+ LoFlags = SIInstrInfo::MO_REL32;
+ SDValue PtrLo =
+ DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, LoFlags);
+ SDValue PtrHi;
+ if (GAFlags == SIInstrInfo::MO_NONE) {
+ PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
+ } else {
+ PtrHi =
+ DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, GAFlags + 1);
+ }
return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
}
@@ -4525,7 +4955,10 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
SelectionDAG &DAG) const {
GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
const GlobalValue *GV = GSD->getGlobal();
- if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
+ if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
+ (!GV->hasExternalLinkage() ||
+ getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
+ getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL)) ||
GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
@@ -4533,7 +4966,12 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
SDLoc DL(GSD);
EVT PtrVT = Op.getValueType();
- // FIXME: Should not make address space based decisions here.
+ if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+ SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
+ SIInstrInfo::MO_ABS32_LO);
+ return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
+ }
+
if (shouldEmitFixup(GV))
return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
else if (shouldEmitPCReloc(GV))
@@ -4641,10 +5079,8 @@ static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,
}
static bool parseCachePolicy(SDValue CachePolicy, SelectionDAG &DAG,
- SDValue *GLC, SDValue *SLC) {
- auto CachePolicyConst = dyn_cast<ConstantSDNode>(CachePolicy.getNode());
- if (!CachePolicyConst)
- return false;
+ SDValue *GLC, SDValue *SLC, SDValue *DLC) {
+ auto CachePolicyConst = cast<ConstantSDNode>(CachePolicy.getNode());
uint64_t Value = CachePolicyConst->getZExtValue();
SDLoc DL(CachePolicy);
@@ -4656,6 +5092,10 @@ static bool parseCachePolicy(SDValue CachePolicy, SelectionDAG &DAG,
*SLC = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
Value &= ~(uint64_t)0x2;
}
+ if (DLC) {
+ *DLC = DAG.getTargetConstant((Value & 0x4) ? 1 : 0, DL, MVT::i32);
+ Value &= ~(uint64_t)0x4;
+ }
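+  // Recognized cachepolicy bits: 0 = glc, 1 = slc, 2 = dlc (gfx10+). Any
+  // leftover bit makes this return false, and callers then bail out of the
+  // custom lowering.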
return Value == 0;
}
@@ -4689,14 +5129,14 @@ static SDValue constructRetValue(SelectionDAG &DAG,
EVT CastVT = NumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, NumElts)
: AdjEltVT;
- // Special case for v8f16. Rather than add support for this, use v4i32 to
+ // Special case for v6f16. Rather than add support for this, use v3i32 to
// extract the data elements
- bool V8F16Special = false;
- if (CastVT == MVT::v8f16) {
- CastVT = MVT::v4i32;
+ bool V6F16Special = false;
+ if (NumElts == 6) {
+ CastVT = EVT::getVectorVT(Context, MVT::i32, NumElts / 2);
DMaskPop >>= 1;
ReqRetNumElts >>= 1;
- V8F16Special = true;
+ V6F16Special = true;
AdjVT = MVT::v2i32;
}
@@ -4726,7 +5166,7 @@ static SDValue constructRetValue(SelectionDAG &DAG,
PreTFCRes = BVElts[0];
}
- if (V8F16Special)
+ if (V6F16Special)
PreTFCRes = DAG.getNode(ISD::BITCAST, DL, MVT::v4f16, PreTFCRes);
if (!IsTexFail) {
@@ -4745,9 +5185,7 @@ static SDValue constructRetValue(SelectionDAG &DAG,
static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
SDValue *LWE, bool &IsTexFail) {
- auto TexFailCtrlConst = dyn_cast<ConstantSDNode>(TexFailCtrl.getNode());
- if (!TexFailCtrlConst)
- return false;
+ auto TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
uint64_t Value = TexFailCtrlConst->getZExtValue();
if (Value) {
@@ -4774,7 +5212,10 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode);
+ const AMDGPU::MIMGMIPMappingInfo *MIPMappingInfo =
+ AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode);
unsigned IntrOpcode = Intr->BaseOpcode;
+ bool IsGFX10 = Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10;
SmallVector<EVT, 3> ResultTypes(Op->value_begin(), Op->value_end());
SmallVector<EVT, 3> OrigResultTypes(Op->value_begin(), Op->value_end());
@@ -4810,9 +5251,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
}
} else {
unsigned DMaskIdx = BaseOpcode->Store ? 3 : isa<MemSDNode>(Op) ? 2 : 1;
- auto DMaskConst = dyn_cast<ConstantSDNode>(Op.getOperand(DMaskIdx));
- if (!DMaskConst)
- return Op;
+ auto DMaskConst = cast<ConstantSDNode>(Op.getOperand(DMaskIdx));
DMask = DMaskConst->getZExtValue();
DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);
@@ -4821,8 +5260,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
MVT StoreVT = VData.getSimpleValueType();
if (StoreVT.getScalarType() == MVT::f16) {
- if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS ||
- !BaseOpcode->HasD16)
+ if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
return Op; // D16 is unsupported for this instruction
IsD16 = true;
@@ -4835,8 +5273,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
// and whether packing is supported.
MVT LoadVT = ResultTypes[0].getSimpleVT();
if (LoadVT.getScalarType() == MVT::f16) {
- if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS ||
- !BaseOpcode->HasD16)
+ if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
return Op; // D16 is unsupported for this instruction
IsD16 = true;
@@ -4878,6 +5315,17 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
}
}
+  // Optimize _mip away when 'lod' is zero.
+ if (MIPMappingInfo) {
+ if (auto ConstantLod =
+ dyn_cast<ConstantSDNode>(Op.getOperand(AddrIdx+NumVAddrs-1))) {
+ if (ConstantLod->isNullValue()) {
+ IntrOpcode = MIPMappingInfo->NONMIP; // set new opcode to variant without _mip
+ NumMIVAddrs--; // remove 'lod'
+ }
+ }
+ }
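+  // E.g. an image_load_mip with a constant lod of 0 becomes a plain image_load
+  // with the trailing lod address operand dropped.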
+
// Check for 16 bit addresses and pack if true.
unsigned DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
MVT VAddrVT = Op.getOperand(DimIdx).getSimpleValueType();
@@ -4915,7 +5363,22 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
VAddrs.push_back(Op.getOperand(AddrIdx + i));
}
- SDValue VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
+ // If the register allocator cannot place the address registers contiguously
+ // without introducing moves, then using the non-sequential address encoding
+ // is always preferable, since it saves VALU instructions and is usually a
+ // wash in terms of code size or even better.
+ //
+ // However, we currently have no way of hinting to the register allocator that
+ // MIMG addresses should be placed contiguously when it is possible to do so,
+ // so force non-NSA for the common 2-address case as a heuristic.
+ //
+ // SIShrinkInstructions will convert NSA encodings to non-NSA after register
+ // allocation when possible.
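+  //
+  // E.g. with NSA a 3-dword address can be taken from arbitrary VGPRs, while
+  // the non-NSA form below first requires assembling one contiguous VGPR tuple.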
+ bool UseNSA =
+ ST->hasFeature(AMDGPU::FeatureNSAEncoding) && VAddrs.size() >= 3;
+ SDValue VAddr;
+ if (!UseNSA)
+ VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
@@ -4926,9 +5389,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
CtrlIdx = AddrIdx + NumVAddrs + 1;
} else {
auto UnormConst =
- dyn_cast<ConstantSDNode>(Op.getOperand(AddrIdx + NumVAddrs + 2));
- if (!UnormConst)
- return Op;
+ cast<ConstantSDNode>(Op.getOperand(AddrIdx + NumVAddrs + 2));
Unorm = UnormConst->getZExtValue() ? True : False;
CtrlIdx = AddrIdx + NumVAddrs + 3;
@@ -4965,9 +5426,6 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
return Undef;
}
- // Have to use a power of 2 number of dwords
- NumVDataDwords = 1 << Log2_32_Ceil(NumVDataDwords);
-
EVT NewVT = NumVDataDwords > 1 ?
EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumVDataDwords)
: MVT::f32;
@@ -4983,45 +5441,66 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
SDValue GLC;
SDValue SLC;
+ SDValue DLC;
if (BaseOpcode->Atomic) {
GLC = True; // TODO no-return optimization
- if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, nullptr, &SLC))
+ if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, nullptr, &SLC,
+ IsGFX10 ? &DLC : nullptr))
return Op;
} else {
- if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, &GLC, &SLC))
+ if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, &GLC, &SLC,
+ IsGFX10 ? &DLC : nullptr))
return Op;
}
- SmallVector<SDValue, 14> Ops;
+ SmallVector<SDValue, 26> Ops;
if (BaseOpcode->Store || BaseOpcode->Atomic)
Ops.push_back(VData); // vdata
- Ops.push_back(VAddr);
+ if (UseNSA) {
+ for (const SDValue &Addr : VAddrs)
+ Ops.push_back(Addr);
+ } else {
+ Ops.push_back(VAddr);
+ }
Ops.push_back(Op.getOperand(AddrIdx + NumVAddrs)); // rsrc
if (BaseOpcode->Sampler)
Ops.push_back(Op.getOperand(AddrIdx + NumVAddrs + 1)); // sampler
Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
+ if (IsGFX10)
+ Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
Ops.push_back(Unorm);
+ if (IsGFX10)
+ Ops.push_back(DLC);
Ops.push_back(GLC);
Ops.push_back(SLC);
Ops.push_back(IsA16 && // a16 or r128
ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False);
Ops.push_back(TFE); // tfe
Ops.push_back(LWE); // lwe
- Ops.push_back(DimInfo->DA ? True : False);
+ if (!IsGFX10)
+ Ops.push_back(DimInfo->DA ? True : False);
if (BaseOpcode->HasD16)
Ops.push_back(IsD16 ? True : False);
if (isa<MemSDNode>(Op))
Ops.push_back(Op.getOperand(0)); // chain
- int NumVAddrDwords = VAddr.getValueType().getSizeInBits() / 32;
+ int NumVAddrDwords =
+ UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
int Opcode = -1;
- if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
- Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
- NumVDataDwords, NumVAddrDwords);
- if (Opcode == -1)
- Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
+ if (IsGFX10) {
+ Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
+ UseNSA ? AMDGPU::MIMGEncGfx10NSA
+ : AMDGPU::MIMGEncGfx10Default,
NumVDataDwords, NumVAddrDwords);
+ } else {
+ if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+ Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
+ NumVDataDwords, NumVAddrDwords);
+ if (Opcode == -1)
+ Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
+ NumVDataDwords, NumVAddrDwords);
+ }
assert(Opcode != -1);
MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
@@ -5046,7 +5525,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
}
SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
- SDValue Offset, SDValue GLC,
+ SDValue Offset, SDValue GLC, SDValue DLC,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineMemOperand *MMO = MF.getMachineMemOperand(
@@ -5059,7 +5538,8 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
SDValue Ops[] = {
Rsrc,
Offset, // Offset
- GLC // glc
+ GLC,
+ DLC,
};
return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
DAG.getVTList(VT), Ops, VT, MMO);
@@ -5263,16 +5743,18 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
SDLoc(DAG.getEntryNode()),
MFI->getArgInfo().WorkItemIDZ);
- case SIIntrinsic::SI_load_const: {
- SDValue Load =
- lowerSBuffer(MVT::i32, DL, Op.getOperand(1), Op.getOperand(2),
- DAG.getTargetConstant(0, DL, MVT::i1), DAG);
- return DAG.getNode(ISD::BITCAST, DL, MVT::f32, Load);
- }
+ case Intrinsic::amdgcn_wavefrontsize:
+ return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
+ SDLoc(Op), MVT::i32);
case Intrinsic::amdgcn_s_buffer_load: {
- unsigned Cache = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
- return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
- DAG.getTargetConstant(Cache & 1, DL, MVT::i1), DAG);
+ bool IsGFX10 = Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10;
+ SDValue GLC;
+ SDValue DLC = DAG.getTargetConstant(0, DL, MVT::i1);
+ if (!parseCachePolicy(Op.getOperand(3), DAG, &GLC, nullptr,
+ IsGFX10 ? &DLC : nullptr))
+ return Op;
+ return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2), GLC, DLC,
+ DAG);
}
case Intrinsic::amdgcn_fdiv_fast:
return lowerFDIV_FAST(Op, DAG);
@@ -5295,12 +5777,70 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
Glue);
}
+ case Intrinsic::amdgcn_interp_p1_f16: {
+ SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5));
+ SDValue Glue = M0.getValue(1);
+ if (getSubtarget()->getLDSBankCount() == 16) {
+ // 16 bank LDS
+ SDValue S = DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32,
+ DAG.getConstant(2, DL, MVT::i32), // P0
+ Op.getOperand(2), // Attrchan
+ Op.getOperand(3), // Attr
+ Glue);
+ SDValue Ops[] = {
+ Op.getOperand(1), // Src0
+ Op.getOperand(2), // Attrchan
+ Op.getOperand(3), // Attr
+ DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers
+ S, // Src2 - holds two f16 values selected by high
+ DAG.getConstant(0, DL, MVT::i32), // $src2_modifiers
+ Op.getOperand(4), // high
+ DAG.getConstant(0, DL, MVT::i1), // $clamp
+ DAG.getConstant(0, DL, MVT::i32) // $omod
+ };
+ return DAG.getNode(AMDGPUISD::INTERP_P1LV_F16, DL, MVT::f32, Ops);
+ } else {
+ // 32 bank LDS
+ SDValue Ops[] = {
+ Op.getOperand(1), // Src0
+ Op.getOperand(2), // Attrchan
+ Op.getOperand(3), // Attr
+ DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers
+ Op.getOperand(4), // high
+ DAG.getConstant(0, DL, MVT::i1), // $clamp
+ DAG.getConstant(0, DL, MVT::i32), // $omod
+ Glue
+ };
+ return DAG.getNode(AMDGPUISD::INTERP_P1LL_F16, DL, MVT::f32, Ops);
+ }
+ }
+ case Intrinsic::amdgcn_interp_p2_f16: {
+ SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(6));
+ SDValue Glue = SDValue(M0.getNode(), 1);
+ SDValue Ops[] = {
+ Op.getOperand(2), // Src0
+ Op.getOperand(3), // Attrchan
+ Op.getOperand(4), // Attr
+ DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers
+ Op.getOperand(1), // Src2
+ DAG.getConstant(0, DL, MVT::i32), // $src2_modifiers
+ Op.getOperand(5), // high
+ DAG.getConstant(0, DL, MVT::i1), // $clamp
+ Glue
+ };
+ return DAG.getNode(AMDGPUISD::INTERP_P2_F16, DL, MVT::f16, Ops);
+ }
case Intrinsic::amdgcn_sin:
return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
case Intrinsic::amdgcn_cos:
return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
+ case Intrinsic::amdgcn_mul_u24:
+ return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1), Op.getOperand(2));
+ case Intrinsic::amdgcn_mul_i24:
+ return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1), Op.getOperand(2));
+
case Intrinsic::amdgcn_log_clamp: {
if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
return SDValue();
@@ -5334,10 +5874,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT,
Op.getOperand(1), Op.getOperand(2));
case Intrinsic::amdgcn_div_scale: {
- // 3rd parameter required to be a constant.
- const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3));
- if (!Param)
- return DAG.getMergeValues({ DAG.getUNDEF(VT), DAG.getUNDEF(MVT::i1) }, DL);
+ const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
// Translate to the operands expected by the machine instruction. The
// first parameter must be the same as the first instruction.
@@ -5423,6 +5960,23 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::amdgcn_fmad_ftz:
return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
Op.getOperand(2), Op.getOperand(3));
+
+ case Intrinsic::amdgcn_if_break:
+ return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
+ Op->getOperand(1), Op->getOperand(2)), 0);
+
+ case Intrinsic::amdgcn_groupstaticsize: {
+ Triple::OSType OS = getTargetMachine().getTargetTriple().getOS();
+ if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
+ return Op;
+
+ const Module *M = MF.getFunction().getParent();
+ const GlobalValue *GV =
+ M->getNamedValue(Intrinsic::getName(Intrinsic::amdgcn_groupstaticsize));
+ SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
+ SIInstrInfo::MO_ABS32_LO);
+ return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
+ }
default:
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
@@ -5438,9 +5992,99 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
SDLoc DL(Op);
switch (IntrID) {
+ case Intrinsic::amdgcn_ds_ordered_add:
+ case Intrinsic::amdgcn_ds_ordered_swap: {
+ MemSDNode *M = cast<MemSDNode>(Op);
+ SDValue Chain = M->getOperand(0);
+ SDValue M0 = M->getOperand(2);
+ SDValue Value = M->getOperand(3);
+ unsigned IndexOperand = M->getConstantOperandVal(7);
+ unsigned WaveRelease = M->getConstantOperandVal(8);
+ unsigned WaveDone = M->getConstantOperandVal(9);
+ unsigned ShaderType;
+ unsigned Instruction;
+
+ unsigned OrderedCountIndex = IndexOperand & 0x3f;
+ IndexOperand &= ~0x3f;
+ unsigned CountDw = 0;
+
+ if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
+ CountDw = (IndexOperand >> 24) & 0xf;
+ IndexOperand &= ~(0xf << 24);
+
+ if (CountDw < 1 || CountDw > 4) {
+ report_fatal_error(
+ "ds_ordered_count: dword count must be between 1 and 4");
+ }
+ }
+
+ if (IndexOperand)
+ report_fatal_error("ds_ordered_count: bad index operand");
+
+ switch (IntrID) {
+ case Intrinsic::amdgcn_ds_ordered_add:
+ Instruction = 0;
+ break;
+ case Intrinsic::amdgcn_ds_ordered_swap:
+ Instruction = 1;
+ break;
+ }
+
+ if (WaveDone && !WaveRelease)
+ report_fatal_error("ds_ordered_count: wave_done requires wave_release");
+
+ switch (DAG.getMachineFunction().getFunction().getCallingConv()) {
+ case CallingConv::AMDGPU_CS:
+ case CallingConv::AMDGPU_KERNEL:
+ ShaderType = 0;
+ break;
+ case CallingConv::AMDGPU_PS:
+ ShaderType = 1;
+ break;
+ case CallingConv::AMDGPU_VS:
+ ShaderType = 2;
+ break;
+ case CallingConv::AMDGPU_GS:
+ ShaderType = 3;
+ break;
+ default:
+ report_fatal_error("ds_ordered_count unsupported for this calling conv");
+ }
+
+ unsigned Offset0 = OrderedCountIndex << 2;
+ unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) |
+ (Instruction << 4);
+
+ if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
+ Offset1 |= (CountDw - 1) << 6;
+
+ unsigned Offset = Offset0 | (Offset1 << 8);
+
+ SDValue Ops[] = {
+ Chain,
+ Value,
+ DAG.getTargetConstant(Offset, DL, MVT::i16),
+ copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
+ };
+ return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
+ M->getVTList(), Ops, M->getMemoryVT(),
+ M->getMemOperand());
+ }
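A standalone worked example of the offset packing above, with illustrative values (ordered-count index 1, wave_release and wave_done set, ds_ordered_add from a compute shader); on gfx10 with a dword count of 1 the extra (CountDw - 1) field is zero, so the result is the same:

#include <cstdio>

int main() {
  unsigned OrderedCountIndex = 1, WaveRelease = 1, WaveDone = 1;
  unsigned ShaderType = 0;   // AMDGPU_CS / AMDGPU_KERNEL
  unsigned Instruction = 0;  // ds_ordered_add
  unsigned Offset0 = OrderedCountIndex << 2;                   // 0x4
  unsigned Offset1 = WaveRelease | (WaveDone << 1) |
                     (ShaderType << 2) | (Instruction << 4);   // 0x3
  unsigned Offset = Offset0 | (Offset1 << 8);
  std::printf("offset = 0x%x\n", Offset);                      // prints 0x304
  return 0;
}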
+ case Intrinsic::amdgcn_ds_fadd: {
+ MemSDNode *M = cast<MemSDNode>(Op);
+ unsigned Opc;
+ switch (IntrID) {
+ case Intrinsic::amdgcn_ds_fadd:
+ Opc = ISD::ATOMIC_LOAD_FADD;
+ break;
+ }
+
+ return DAG.getAtomic(Opc, SDLoc(Op), M->getMemoryVT(),
+ M->getOperand(0), M->getOperand(2), M->getOperand(3),
+ M->getMemOperand());
+ }
case Intrinsic::amdgcn_atomic_inc:
case Intrinsic::amdgcn_atomic_dec:
- case Intrinsic::amdgcn_ds_fadd:
case Intrinsic::amdgcn_ds_fmin:
case Intrinsic::amdgcn_ds_fmax: {
MemSDNode *M = cast<MemSDNode>(Op);
@@ -5452,9 +6096,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::amdgcn_atomic_dec:
Opc = AMDGPUISD::ATOMIC_DEC;
break;
- case Intrinsic::amdgcn_ds_fadd:
- Opc = AMDGPUISD::ATOMIC_LOAD_FADD;
- break;
case Intrinsic::amdgcn_ds_fmin:
Opc = AMDGPUISD::ATOMIC_LOAD_FMIN;
break;
@@ -5503,8 +6144,14 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
if (LoadVT.getScalarType() == MVT::f16)
return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
M, DAG, Ops);
- return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
- M->getMemOperand());
+
+ // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
+ if (LoadVT.getScalarType() == MVT::i8 ||
+ LoadVT.getScalarType() == MVT::i16)
+ return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);
+
+ return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
+ M->getMemOperand(), DAG);
}
case Intrinsic::amdgcn_raw_buffer_load:
case Intrinsic::amdgcn_raw_buffer_load_format: {
@@ -5531,8 +6178,14 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
if (LoadVT.getScalarType() == MVT::f16)
return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
M, DAG, Ops);
- return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
- M->getMemOperand());
+
+ // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
+ if (LoadVT.getScalarType() == MVT::i8 ||
+ LoadVT.getScalarType() == MVT::i16)
+ return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);
+
+ return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
+ M->getMemOperand(), DAG);
}
case Intrinsic::amdgcn_struct_buffer_load:
case Intrinsic::amdgcn_struct_buffer_load_format: {
@@ -5559,8 +6212,14 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
if (LoadVT.getScalarType() == MVT::f16)
return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
M, DAG, Ops);
- return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
- M->getMemOperand());
+
+ // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
+ if (LoadVT.getScalarType() == MVT::i8 ||
+ LoadVT.getScalarType() == MVT::i16)
+ return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);
+
+ return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
+ M->getMemOperand(), DAG);
}
case Intrinsic::amdgcn_tbuffer_load: {
MemSDNode *M = cast<MemSDNode>(Op);
@@ -5588,9 +6247,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
if (LoadVT.getScalarType() == MVT::f16)
return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
M, DAG, Ops);
- return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
- Op->getVTList(), Ops, LoadVT,
- M->getMemOperand());
+ return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
+ Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
+ DAG);
}
case Intrinsic::amdgcn_raw_tbuffer_load: {
MemSDNode *M = cast<MemSDNode>(Op);
@@ -5612,9 +6271,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
if (LoadVT.getScalarType() == MVT::f16)
return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
M, DAG, Ops);
- return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
- Op->getVTList(), Ops, LoadVT,
- M->getMemOperand());
+ return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
+ Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
+ DAG);
}
case Intrinsic::amdgcn_struct_tbuffer_load: {
MemSDNode *M = cast<MemSDNode>(Op);
@@ -5636,9 +6295,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
if (LoadVT.getScalarType() == MVT::f16)
return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
M, DAG, Ops);
- return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
- Op->getVTList(), Ops, LoadVT,
- M->getMemOperand());
+ return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
+ Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
+ DAG);
}
case Intrinsic::amdgcn_buffer_atomic_swap:
case Intrinsic::amdgcn_buffer_atomic_add:
@@ -5913,6 +6572,39 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
}
}
+// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
+// dwordx4 if on SI.
+SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
+ SDVTList VTList,
+ ArrayRef<SDValue> Ops, EVT MemVT,
+ MachineMemOperand *MMO,
+ SelectionDAG &DAG) const {
+ EVT VT = VTList.VTs[0];
+ EVT WidenedVT = VT;
+ EVT WidenedMemVT = MemVT;
+ if (!Subtarget->hasDwordx3LoadStores() &&
+ (WidenedVT == MVT::v3i32 || WidenedVT == MVT::v3f32)) {
+ WidenedVT = EVT::getVectorVT(*DAG.getContext(),
+ WidenedVT.getVectorElementType(), 4);
+ WidenedMemVT = EVT::getVectorVT(*DAG.getContext(),
+ WidenedMemVT.getVectorElementType(), 4);
+ MMO = DAG.getMachineFunction().getMachineMemOperand(MMO, 0, 16);
+ }
+
+ assert(VTList.NumVTs == 2);
+ SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
+
+ auto NewOp = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
+ WidenedMemVT, MMO);
+ if (WidenedVT != VT) {
+ auto Extract = DAG.getNode(
+ ISD::EXTRACT_SUBVECTOR, DL, VT, NewOp,
+ DAG.getConstant(0, DL, getVectorIdxTy(DAG.getDataLayout())));
+ NewOp = DAG.getMergeValues({ Extract, SDValue(NewOp.getNode(), 1) }, DL);
+ }
+ return NewOp;
+}
+
SDValue SITargetLowering::handleD16VData(SDValue VData,
SelectionDAG &DAG) const {
EVT StoreVT = VData.getValueType();
@@ -6129,6 +6821,12 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
MemSDNode *M = cast<MemSDNode>(Op);
+
+ // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
+ EVT VDataType = VData.getValueType().getScalarType();
+ if (VDataType == MVT::i8 || VDataType == MVT::i16)
+ return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
+
return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
M->getMemoryVT(), M->getMemOperand());
}
@@ -6155,6 +6853,12 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
MemSDNode *M = cast<MemSDNode>(Op);
+
+ // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
+ EVT VDataType = VData.getValueType().getScalarType();
+ if (VDataType == MVT::i8 || VDataType == MVT::i16)
+ return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
+
return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
M->getMemoryVT(), M->getMemOperand());
}
@@ -6181,10 +6885,63 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
MemSDNode *M = cast<MemSDNode>(Op);
+
+ // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
+ EVT VDataType = VData.getValueType().getScalarType();
+ if (VDataType == MVT::i8 || VDataType == MVT::i16)
+ return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
+
return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
M->getMemoryVT(), M->getMemOperand());
}
+ case Intrinsic::amdgcn_buffer_atomic_fadd: {
+ unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
+ unsigned IdxEn = 1;
+ if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
+ IdxEn = Idx->getZExtValue() != 0;
+ SDValue Ops[] = {
+ Chain,
+ Op.getOperand(2), // vdata
+ Op.getOperand(3), // rsrc
+ Op.getOperand(4), // vindex
+ SDValue(), // voffset -- will be set by setBufferOffsets
+ SDValue(), // soffset -- will be set by setBufferOffsets
+ SDValue(), // offset -- will be set by setBufferOffsets
+ DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy
+ DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
+ };
+ setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
+ EVT VT = Op.getOperand(2).getValueType();
+
+ auto *M = cast<MemSDNode>(Op);
+ unsigned Opcode = VT.isVector() ? AMDGPUISD::BUFFER_ATOMIC_PK_FADD
+ : AMDGPUISD::BUFFER_ATOMIC_FADD;
+
+ return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
+ M->getMemOperand());
+ }
+
+ case Intrinsic::amdgcn_global_atomic_fadd: {
+ SDValue Ops[] = {
+ Chain,
+ Op.getOperand(2), // ptr
+ Op.getOperand(3) // vdata
+ };
+ EVT VT = Op.getOperand(3).getValueType();
+
+ auto *M = cast<MemSDNode>(Op);
+ unsigned Opcode = VT.isVector() ? AMDGPUISD::ATOMIC_PK_FADD
+ : AMDGPUISD::ATOMIC_FADD;
+
+ return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
+ M->getMemOperand());
+ }
+
+ case Intrinsic::amdgcn_end_cf:
+ return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
+ Op->getOperand(2), Chain), 0);
+
default: {
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
@@ -6283,6 +7040,38 @@ void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
Offsets[2] = DAG.getConstant(0, DL, MVT::i32);
}
+// Handle 8 bit and 16 bit buffer loads
+SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
+ EVT LoadVT, SDLoc DL,
+ ArrayRef<SDValue> Ops,
+ MemSDNode *M) const {
+ EVT IntVT = LoadVT.changeTypeToInteger();
+ unsigned Opc = (LoadVT.getScalarType() == MVT::i8) ?
+ AMDGPUISD::BUFFER_LOAD_UBYTE : AMDGPUISD::BUFFER_LOAD_USHORT;
+
+ SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
+ SDValue BufferLoad = DAG.getMemIntrinsicNode(Opc, DL, ResList,
+ Ops, IntVT,
+ M->getMemOperand());
+ SDValue BufferLoadTrunc = DAG.getNode(ISD::TRUNCATE, DL,
+ LoadVT.getScalarType(), BufferLoad);
+ return DAG.getMergeValues({BufferLoadTrunc, BufferLoad.getValue(1)}, DL);
+}
+
+// Handle 8 bit and 16 bit buffer stores
+SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
+ EVT VDataType, SDLoc DL,
+ SDValue Ops[],
+ MemSDNode *M) const {
+ SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
+ Ops[1] = BufferStoreExt;
+ unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE :
+ AMDGPUISD::BUFFER_STORE_SHORT;
+ ArrayRef<SDValue> OpsRef = makeArrayRef(&Ops[0], 9);
+ return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
+ M->getMemOperand());
+}
+
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG,
ISD::LoadExtType ExtType, SDValue Op,
const SDLoc &SL, EVT VT) {
@@ -6395,8 +7184,25 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
BasePtr, RealMemVT, MMO);
+ if (!MemVT.isVector()) {
+ SDValue Ops[] = {
+ DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
+ NewLD.getValue(1)
+ };
+
+ return DAG.getMergeValues(Ops, DL);
+ }
+
+ SmallVector<SDValue, 3> Elts;
+ for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
+ SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
+ DAG.getConstant(I, DL, MVT::i32));
+
+ Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
+ }
+
SDValue Ops[] = {
- DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
+ DAG.getBuildVector(MemVT, DL, Elts),
NewLD.getValue(1)
};
@@ -6409,15 +7215,21 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
"Custom lowering for non-i32 vectors hasn't been implemented.");
- unsigned Alignment = Load->getAlignment();
- unsigned AS = Load->getAddressSpace();
if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
- AS, Alignment)) {
+ *Load->getMemOperand())) {
SDValue Ops[2];
std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
return DAG.getMergeValues(Ops, DL);
}
+ unsigned Alignment = Load->getAlignment();
+ unsigned AS = Load->getAddressSpace();
+ if (Subtarget->hasLDSMisalignedBug() &&
+ AS == AMDGPUAS::FLAT_ADDRESS &&
+ Alignment < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
+ return SplitVectorLoad(Op, DAG);
+ }
+
MachineFunction &MF = DAG.getMachineFunction();
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
// If there is a possibilty that flat instruction access scratch memory
@@ -6430,8 +7242,13 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
- if (!Op->isDivergent() && Alignment >= 4 && NumElements < 32)
- return SDValue();
+ if (!Op->isDivergent() && Alignment >= 4 && NumElements < 32) {
+ if (MemVT.isPow2VectorType())
+ return SDValue();
+ if (NumElements == 3)
+ return WidenVectorLoad(Op, DAG);
+ return SplitVectorLoad(Op, DAG);
+ }
// Non-uniform loads will be selected to MUBUF instructions, so they
// have the same legalization requirements as global and private
// loads.
@@ -6443,8 +7260,13 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
AS == AMDGPUAS::GLOBAL_ADDRESS) {
if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&
!Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load) &&
- Alignment >= 4 && NumElements < 32)
- return SDValue();
+ Alignment >= 4 && NumElements < 32) {
+ if (MemVT.isPow2VectorType())
+ return SDValue();
+ if (NumElements == 3)
+ return WidenVectorLoad(Op, DAG);
+ return SplitVectorLoad(Op, DAG);
+ }
// Non-uniform loads will be selected to MUBUF instructions, so they
// have the same legalization requirements as global and private
// loads.
@@ -6456,7 +7278,10 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
AS == AMDGPUAS::FLAT_ADDRESS) {
if (NumElements > 4)
return SplitVectorLoad(Op, DAG);
- // v4 loads are supported for private and global memory.
+ // v3 loads not supported on SI.
+ if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
+ return WidenVectorLoad(Op, DAG);
+ // v3 and v4 loads are supported for private and global memory.
return SDValue();
}
if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
@@ -6474,11 +7299,14 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
// Same as global/flat
if (NumElements > 4)
return SplitVectorLoad(Op, DAG);
+ // v3 loads not supported on SI.
+ if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
+ return WidenVectorLoad(Op, DAG);
return SDValue();
default:
llvm_unreachable("unsupported private_element_size");
}
- } else if (AS == AMDGPUAS::LOCAL_ADDRESS) {
+ } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
// Use ds_read_b128 if possible.
if (Subtarget->useDS128() && Load->getAlignment() >= 16 &&
MemVT.getStoreSize() == 16)
@@ -6794,7 +7622,7 @@ SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
SDValue Scale;
- if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+ if (!Subtarget->hasUsableDivScaleConditionOutput()) {
// Workaround a hardware bug on SI where the condition output from div_scale
// is not usable.
@@ -6856,12 +7684,18 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
assert(VT.isVector() &&
Store->getValue().getValueType().getScalarType() == MVT::i32);
- unsigned AS = Store->getAddressSpace();
if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
- AS, Store->getAlignment())) {
+ *Store->getMemOperand())) {
return expandUnalignedStore(Store, DAG);
}
+ unsigned AS = Store->getAddressSpace();
+ if (Subtarget->hasLDSMisalignedBug() &&
+ AS == AMDGPUAS::FLAT_ADDRESS &&
+ Store->getAlignment() < VT.getStoreSize() && VT.getSizeInBits() > 32) {
+ return SplitVectorStore(Op, DAG);
+ }
+
MachineFunction &MF = DAG.getMachineFunction();
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
// If there is a possibilty that flat instruction access scratch memory
@@ -6875,6 +7709,9 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
AS == AMDGPUAS::FLAT_ADDRESS) {
if (NumElements > 4)
return SplitVectorStore(Op, DAG);
+ // v3 stores not supported on SI.
+ if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
+ return SplitVectorStore(Op, DAG);
return SDValue();
} else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
switch (Subtarget->getMaxPrivateElementSize()) {
@@ -6885,16 +7722,16 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
return SplitVectorStore(Op, DAG);
return SDValue();
case 16:
- if (NumElements > 4)
+ if (NumElements > 4 || NumElements == 3)
return SplitVectorStore(Op, DAG);
return SDValue();
default:
llvm_unreachable("unsupported private_element_size");
}
- } else if (AS == AMDGPUAS::LOCAL_ADDRESS) {
+ } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
// Use ds_write_b128 if possible.
if (Subtarget->useDS128() && Store->getAlignment() >= 16 &&
- VT.getStoreSize() == 16)
+ VT.getStoreSize() == 16 && NumElements != 3)
return SDValue();
if (NumElements > 2)
@@ -6905,7 +7742,7 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
// out-of-bounds even if base + offsets is in bounds. Split vectorized
// stores here to avoid emitting ds_write2_b32. We may re-combine the
// store later in the SILoadStoreOptimizer.
- if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
+ if (!Subtarget->hasUsableDSOffset() &&
NumElements == 2 && VT.getStoreSize() == 8 &&
Store->getAlignment() < 8) {
return SplitVectorStore(Op, DAG);
@@ -7614,6 +8451,43 @@ SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
return SDValue();
}
+SDValue SITargetLowering::performSignExtendInRegCombine(SDNode *N,
+ DAGCombinerInfo &DCI)
+ const {
+ SDValue Src = N->getOperand(0);
+ auto *VTSign = cast<VTSDNode>(N->getOperand(1));
+
+ if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
+ VTSign->getVT() == MVT::i8) ||
+ (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
+ VTSign->getVT() == MVT::i16)) &&
+ Src.hasOneUse()) {
+ auto *M = cast<MemSDNode>(Src);
+ SDValue Ops[] = {
+ Src.getOperand(0), // Chain
+ Src.getOperand(1), // rsrc
+ Src.getOperand(2), // vindex
+ Src.getOperand(3), // voffset
+ Src.getOperand(4), // soffset
+ Src.getOperand(5), // offset
+ Src.getOperand(6),
+ Src.getOperand(7)
+ };
+ // replace with BUFFER_LOAD_BYTE/SHORT
+ SDVTList ResList = DCI.DAG.getVTList(MVT::i32,
+ Src.getOperand(0).getValueType());
+ unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE) ?
+ AMDGPUISD::BUFFER_LOAD_BYTE : AMDGPUISD::BUFFER_LOAD_SHORT;
+ SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(Opc, SDLoc(N),
+ ResList,
+ Ops, M->getMemoryVT(),
+ M->getMemOperand());
+ return DCI.DAG.getMergeValues({BufferLoadSignExt,
+ BufferLoadSignExt.getValue(1)}, SDLoc(N));
+ }
+ return SDValue();
+}
+
SDValue SITargetLowering::performClassCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -8013,9 +8887,12 @@ SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
if (Cmp == APFloat::cmpGreaterThan)
return SDValue();
+ const MachineFunction &MF = DAG.getMachineFunction();
+ const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+
// TODO: Check IEEE bit enabled?
EVT VT = Op0.getValueType();
- if (Subtarget->enableDX10Clamp()) {
+ if (Info->getMode().DX10Clamp) {
// If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
// hardware fmed3 behavior converting to a min.
// FIXME: Should this be allowing -0.0?
@@ -8059,10 +8936,10 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
  // Only do this if the inner op has one use since this will just increase
// register pressure for no benefit.
-
if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY &&
- !VT.isVector() && VT != MVT::f64 &&
- ((VT != MVT::f16 && VT != MVT::i16) || Subtarget->hasMin3Max3_16())) {
+ !VT.isVector() &&
+ (VT == MVT::i32 || VT == MVT::f32 ||
+ ((VT == MVT::f16 || VT == MVT::i16) && Subtarget->hasMin3Max3_16()))) {
// max(max(a, b), c) -> max3(a, b, c)
// min(min(a, b), c) -> min3(a, b, c)
if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
@@ -8149,9 +9026,12 @@ SDValue SITargetLowering::performFMed3Combine(SDNode *N,
return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
}
+ const MachineFunction &MF = DAG.getMachineFunction();
+ const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+
// FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
// handling no dx10-clamp?
- if (Subtarget->enableDX10Clamp()) {
+ if (Info->getMode().DX10Clamp) {
// If NaNs is clamped to 0, we are free to reorder the inputs.
if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
@@ -8342,8 +9222,10 @@ unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
// Only do this if we are not trying to support denormals. v_mad_f32 does not
// support denormals ever.
- if ((VT == MVT::f32 && !Subtarget->hasFP32Denormals()) ||
- (VT == MVT::f16 && !Subtarget->hasFP16Denormals()))
+ if (((VT == MVT::f32 && !Subtarget->hasFP32Denormals()) ||
+ (VT == MVT::f16 && !Subtarget->hasFP16Denormals() &&
+ getSubtarget()->hasMadF16())) &&
+ isOperationLegal(ISD::FMAD, VT))
return ISD::FMAD;
const TargetOptions &Options = DAG.getTarget().Options;
@@ -8357,6 +9239,46 @@ unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
return 0;
}
+// For a reassociatable opcode perform:
+// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
+SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
+ SelectionDAG &DAG) const {
+ EVT VT = N->getValueType(0);
+ if (VT != MVT::i32 && VT != MVT::i64)
+ return SDValue();
+
+ unsigned Opc = N->getOpcode();
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+
+ if (!(Op0->isDivergent() ^ Op1->isDivergent()))
+ return SDValue();
+
+ if (Op0->isDivergent())
+ std::swap(Op0, Op1);
+
+ if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
+ return SDValue();
+
+ SDValue Op2 = Op1.getOperand(1);
+ Op1 = Op1.getOperand(0);
+ if (!(Op1->isDivergent() ^ Op2->isDivergent()))
+ return SDValue();
+
+ if (Op1->isDivergent())
+ std::swap(Op1, Op2);
+
+ // If either operand is constant this will conflict with
+ // DAGCombiner::ReassociateOps().
+ if (DAG.isConstantIntBuildVectorOrConstantInt(Op0) ||
+ DAG.isConstantIntBuildVectorOrConstantInt(Op1))
+ return SDValue();
+
+ SDLoc SL(N);
+ SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
+ return DAG.getNode(Opc, SL, VT, Add1, Op2);
+}
+
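A tiny standalone illustration of the reassociation above, with plain integers standing in for DAG nodes; x and z play the role of the uniform (scalar) operands and y the divergent one. Rewriting add(x, add(y, z)) as add(add(x, z), y) keeps the value but lets the inner add use only uniform inputs, so it can be selected to a scalar instruction and only the outer add stays on the VALU:

#include <cstdio>

int main() {
  int x = 10, z = 32;  // uniform
  int y = 100;         // divergent
  std::printf("%d == %d\n", x + (y + z), (x + z) + y); // 142 == 142
  return 0;
}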
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL,
EVT VT,
SDValue N0, SDValue N1, SDValue N2,
@@ -8405,6 +9327,10 @@ SDValue SITargetLowering::performAddCombine(SDNode *N,
return SDValue();
}
+ if (SDValue V = reassociateScalarOps(N, DAG)) {
+ return V;
+ }
+
if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
return SDValue();
@@ -8452,14 +9378,10 @@ SDValue SITargetLowering::performSubCombine(SDNode *N,
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
- unsigned Opc = LHS.getOpcode();
- if (Opc != ISD::SUBCARRY)
- std::swap(RHS, LHS);
-
if (LHS.getOpcode() == ISD::SUBCARRY) {
// sub (subcarry x, 0, cc), y => subcarry x, y, cc
auto C = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
- if (!C || C->getZExtValue() != 0)
+ if (!C || !C->isNullValue())
return SDValue();
SDValue Args[] = { LHS.getOperand(0), RHS, LHS.getOperand(2) };
return DAG.getNode(ISD::SUBCARRY, SDLoc(N), LHS->getVTList(), Args);
@@ -8587,7 +9509,7 @@ SDValue SITargetLowering::performFMACombine(SDNode *N,
EVT VT = N->getValueType(0);
SDLoc SL(N);
- if (!Subtarget->hasDotInsts() || VT != MVT::f32)
+ if (!Subtarget->hasDot2Insts() || VT != MVT::f32)
return SDValue();
// FMA((F32)S0.x, (F32)S1. x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
@@ -8801,11 +9723,13 @@ SDValue SITargetLowering::performClampCombine(SDNode *N,
if (!CSrc)
return SDValue();
+ const MachineFunction &MF = DCI.DAG.getMachineFunction();
const APFloat &F = CSrc->getValueAPF();
APFloat Zero = APFloat::getZero(F.getSemantics());
APFloat::cmpResult Cmp0 = F.compare(Zero);
if (Cmp0 == APFloat::cmpLessThan ||
- (Cmp0 == APFloat::cmpUnordered && Subtarget->enableDX10Clamp())) {
+ (Cmp0 == APFloat::cmpUnordered &&
+ MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
}
@@ -8822,7 +9746,6 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
return SDValue();
-
switch (N->getOpcode()) {
default:
return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
@@ -8873,11 +9796,11 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
case ISD::ATOMIC_LOAD_MAX:
case ISD::ATOMIC_LOAD_UMIN:
case ISD::ATOMIC_LOAD_UMAX:
+ case ISD::ATOMIC_LOAD_FADD:
case AMDGPUISD::ATOMIC_INC:
case AMDGPUISD::ATOMIC_DEC:
- case AMDGPUISD::ATOMIC_LOAD_FADD:
case AMDGPUISD::ATOMIC_LOAD_FMIN:
- case AMDGPUISD::ATOMIC_LOAD_FMAX: // TODO: Target mem intrinsics.
+ case AMDGPUISD::ATOMIC_LOAD_FMAX: // TODO: Target mem intrinsics.
if (DCI.isBeforeLegalize())
break;
return performMemSDNodeCombine(cast<MemSDNode>(N), DCI);
@@ -8889,6 +9812,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
return performXorCombine(N, DCI);
case ISD::ZERO_EXTEND:
return performZeroExtendCombine(N, DCI);
+ case ISD::SIGN_EXTEND_INREG:
+ return performSignExtendInRegCombine(N , DCI);
case AMDGPUISD::FP_CLASS:
return performClassCombine(N, DCI);
case ISD::FCANONICALIZE:
@@ -9034,6 +9959,10 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
// Don't allow 0 dmask, as hardware assumes one channel enabled.
bool NoChannels = !NewDmask;
if (NoChannels) {
+ if (!UsesTFC) {
+      // The result has no uses and TFC is not used, so there is nothing to do.
+ return Node;
+ }
// If the original dmask has one channel - then nothing to do
if (OldBitsSet == 1)
return Node;
@@ -9205,7 +10134,8 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
break;
MVT VT = Src0.getValueType().getSimpleVT();
- const TargetRegisterClass *RC = getRegClassFor(VT);
+ const TargetRegisterClass *RC =
+ getRegClassFor(VT, Src0.getNode()->isDivergent());
MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
@@ -9238,6 +10168,24 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
Ops.push_back(ImpDef.getValue(1));
return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
}
+ case AMDGPU::V_PERMLANE16_B32:
+ case AMDGPU::V_PERMLANEX16_B32: {
+ ConstantSDNode *FI = cast<ConstantSDNode>(Node->getOperand(0));
+ ConstantSDNode *BC = cast<ConstantSDNode>(Node->getOperand(2));
+ if (!FI->getZExtValue() && !BC->getZExtValue())
+ break;
+ SDValue VDstIn = Node->getOperand(6);
+ if (VDstIn.isMachineOpcode()
+ && VDstIn.getMachineOpcode() == AMDGPU::IMPLICIT_DEF)
+ break;
+ MachineSDNode *ImpDef = DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF,
+ SDLoc(Node), MVT::i32);
+ SmallVector<SDValue, 8> Ops = { SDValue(FI, 0), Node->getOperand(1),
+ SDValue(BC, 0), Node->getOperand(3),
+ Node->getOperand(4), Node->getOperand(5),
+ SDValue(ImpDef, 0), Node->getOperand(7) };
+ return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
+ }
default:
break;
}
@@ -9256,6 +10204,36 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
if (TII->isVOP3(MI.getOpcode())) {
// Make sure constant bus requirements are respected.
TII->legalizeOperandsVOP3(MRI, MI);
+
+ // Prefer VGPRs over AGPRs in mAI instructions where possible.
+    // This saves a chain-copy of registers and better balances register
+    // use between vgpr and agpr, as agpr tuples tend to be big.
+ if (const MCOperandInfo *OpInfo = MI.getDesc().OpInfo) {
+ unsigned Opc = MI.getOpcode();
+ const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
+ for (auto I : { AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) }) {
+ if (I == -1)
+ break;
+ MachineOperand &Op = MI.getOperand(I);
+ if ((OpInfo[I].RegClass != llvm::AMDGPU::AV_64RegClassID &&
+ OpInfo[I].RegClass != llvm::AMDGPU::AV_32RegClassID) ||
+ !TargetRegisterInfo::isVirtualRegister(Op.getReg()) ||
+ !TRI->isAGPR(MRI, Op.getReg()))
+ continue;
+ auto *Src = MRI.getUniqueVRegDef(Op.getReg());
+ if (!Src || !Src->isCopy() ||
+ !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
+ continue;
+ auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
+ auto *NewRC = TRI->getEquivalentVGPRClass(RC);
+ // All uses of agpr64 and agpr32 can also accept vgpr except for
+ // v_accvgpr_read, but we do not produce agpr reads during selection,
+ // so no use checks are needed.
+ MRI.setRegClass(Op.getReg(), NewRC);
+ }
+ }
+
return;
}
@@ -9391,9 +10369,15 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
case 64:
RC = &AMDGPU::SGPR_64RegClass;
break;
+ case 96:
+ RC = &AMDGPU::SReg_96RegClass;
+ break;
case 128:
RC = &AMDGPU::SReg_128RegClass;
break;
+ case 160:
+ RC = &AMDGPU::SReg_160RegClass;
+ break;
case 256:
RC = &AMDGPU::SReg_256RegClass;
break;
@@ -9419,6 +10403,9 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
case 128:
RC = &AMDGPU::VReg_128RegClass;
break;
+ case 160:
+ RC = &AMDGPU::VReg_160RegClass;
+ break;
case 256:
RC = &AMDGPU::VReg_256RegClass;
break;
@@ -9427,6 +10414,29 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
break;
}
break;
+ case 'a':
+ switch (VT.getSizeInBits()) {
+ default:
+ return std::make_pair(0U, nullptr);
+ case 32:
+ case 16:
+ RC = &AMDGPU::AGPR_32RegClass;
+ break;
+ case 64:
+ RC = &AMDGPU::AReg_64RegClass;
+ break;
+ case 128:
+ RC = &AMDGPU::AReg_128RegClass;
+ break;
+ case 512:
+ RC = &AMDGPU::AReg_512RegClass;
+ break;
+ case 1024:
+ RC = &AMDGPU::AReg_1024RegClass;
+ // v32 types are not legal but we support them here.
+ return std::make_pair(0U, RC);
+ }
+ break;
}
// We actually support i128, i16 and f16 as inline parameters
// even if they are not reported as legal
@@ -9440,6 +10450,8 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
RC = &AMDGPU::VGPR_32RegClass;
} else if (Constraint[1] == 's') {
RC = &AMDGPU::SGPR_32RegClass;
+ } else if (Constraint[1] == 'a') {
+ RC = &AMDGPU::AGPR_32RegClass;
}
if (RC) {
@@ -9459,6 +10471,7 @@ SITargetLowering::getConstraintType(StringRef Constraint) const {
default: break;
case 's':
case 'v':
+ case 'a':
return C_RegisterClass;
}
}
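The new 'a' constraint makes accumulation registers addressable from inline assembly. A hypothetical use from C++ might look as follows; the instruction and operand choice are illustrative only and require a subtarget with AGPRs (e.g. gfx908):

float readAcc(float Acc) {
  float V;
  // "a" binds Acc to an AGPR, "=v" returns the result in a VGPR.
  __asm__ volatile("v_accvgpr_read_b32 %0, %1" : "=v"(V) : "a"(Acc));
  return V;
}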
@@ -9471,7 +10484,7 @@ SITargetLowering::getConstraintType(StringRef Constraint) const {
void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
MachineRegisterInfo &MRI = MF.getRegInfo();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
- const MachineFrameInfo &MFI = MF.getFrameInfo();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
if (Info->isEntryFunction()) {
@@ -9479,31 +10492,45 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
}
- // We have to assume the SP is needed in case there are calls in the function
- // during lowering. Calls are only detected after the function is
- // lowered. We're about to reserve registers, so don't bother using it if we
- // aren't really going to use it.
- bool NeedSP = !Info->isEntryFunction() ||
- MFI.hasVarSizedObjects() ||
- MFI.hasCalls();
+ assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
+ Info->getStackPtrOffsetReg()));
+ if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
+ MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
- if (NeedSP) {
- unsigned ReservedStackPtrOffsetReg = TRI->reservedStackPtrOffsetReg(MF);
- Info->setStackPtrOffsetReg(ReservedStackPtrOffsetReg);
+ // We need to worry about replacing the default register with itself in case
+ // of MIR testcases missing the MFI.
+ if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
+ MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
- assert(Info->getStackPtrOffsetReg() != Info->getFrameOffsetReg());
- assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
- Info->getStackPtrOffsetReg()));
- MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
- }
+ if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
+ MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
- MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
- MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
- MRI.replaceRegWith(AMDGPU::SCRATCH_WAVE_OFFSET_REG,
- Info->getScratchWaveOffsetReg());
+ if (Info->getScratchWaveOffsetReg() != AMDGPU::SCRATCH_WAVE_OFFSET_REG) {
+ MRI.replaceRegWith(AMDGPU::SCRATCH_WAVE_OFFSET_REG,
+ Info->getScratchWaveOffsetReg());
+ }
Info->limitOccupancy(MF);
+ if (ST.isWave32() && !MF.empty()) {
+    // Add a VCC_HI def: many instructions are marked as implicitly using
+    // VCC while we may only define VCC_LO. If nothing defines VCC_HI we may
+    // end up with a use of undef.
+
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ DebugLoc DL;
+
+ MachineBasicBlock &MBB = MF.front();
+ MachineBasicBlock::iterator I = MBB.getFirstNonDebugInstr();
+ BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), AMDGPU::VCC_HI);
+
+ for (auto &MBB : MF) {
+ for (auto &MI : MBB) {
+ TII->fixImplicitOperands(MI);
+ }
+ }
+ }
+
TargetLoweringBase::finalizeLowering(MF);
}
@@ -9515,14 +10542,81 @@ void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op,
TargetLowering::computeKnownBitsForFrameIndex(Op, Known, DemandedElts,
DAG, Depth);
- if (getSubtarget()->enableHugePrivateBuffer())
- return;
-
- // Technically it may be possible to have a dispatch with a single workitem
- // that uses the full private memory size, but that's not really useful. We
- // can't use vaddr in MUBUF instructions if we don't know the address
+ // Set the high bits to zero based on the maximum allowed scratch size per
+ // wave. We can't use vaddr in MUBUF instructions if we don't know the address
// calculation won't overflow, so assume the sign bit is never set.
- Known.Zero.setHighBits(AssumeFrameIndexHighZeroBits);
+ Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
+}
+
+unsigned SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
+ const unsigned PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
+ const unsigned CacheLineAlign = 6; // log2(64)
+
+  // Pre-GFX10 targets did not benefit from loop alignment.
+ if (!ML || DisableLoopAlignment ||
+ (getSubtarget()->getGeneration() < AMDGPUSubtarget::GFX10) ||
+ getSubtarget()->hasInstFwdPrefetchBug())
+ return PrefAlign;
+
+  // On GFX10 the I$ consists of 4 x 64-byte cache lines.
+  // By default the prefetcher keeps one cache line behind and reads two
+  // ahead. With S_INST_PREFETCH we can switch it to two lines behind and one
+  // ahead for larger loops.
+  // Therefore we can benefit from aligning loop headers if the loop fits in
+  // 192 bytes:
+  // - if the loop fits in 64 bytes it never spans more than two cache lines
+  //   and needs no extra alignment;
+  // - if the loop is at most 128 bytes we do not need to modify the prefetch
+  //   settings;
+  // - if the loop is at most 192 bytes we need two lines behind.
+
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+ const MachineBasicBlock *Header = ML->getHeader();
+ if (Header->getAlignment() != PrefAlign)
+ return Header->getAlignment(); // Already processed.
+
+ unsigned LoopSize = 0;
+ for (const MachineBasicBlock *MBB : ML->blocks()) {
+    // If an inner loop block is aligned, assume on average half of the
+    // alignment size is added as nops.
+ if (MBB != Header)
+ LoopSize += (1 << MBB->getAlignment()) / 2;
+
+ for (const MachineInstr &MI : *MBB) {
+ LoopSize += TII->getInstSizeInBytes(MI);
+ if (LoopSize > 192)
+ return PrefAlign;
+ }
+ }
+
+ if (LoopSize <= 64)
+ return PrefAlign;
+
+ if (LoopSize <= 128)
+ return CacheLineAlign;
+
+  // If any of the parent loops is already surrounded by prefetch
+  // instructions, do not insert new ones for the inner loop; that would
+  // reset the parent's settings.
+ for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
+ if (MachineBasicBlock *Exit = P->getExitBlock()) {
+ auto I = Exit->getFirstNonDebugInstr();
+ if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
+ return CacheLineAlign;
+ }
+ }
+
+ MachineBasicBlock *Pre = ML->getLoopPreheader();
+ MachineBasicBlock *Exit = ML->getExitBlock();
+
+ if (Pre && Exit) {
+ BuildMI(*Pre, Pre->getFirstTerminator(), DebugLoc(),
+ TII->get(AMDGPU::S_INST_PREFETCH))
+ .addImm(1); // prefetch 2 lines behind PC
+
+ BuildMI(*Exit, Exit->getFirstNonDebugInstr(), DebugLoc(),
+ TII->get(AMDGPU::S_INST_PREFETCH))
+ .addImm(2); // prefetch 1 line behind PC
+ }
+
+ return CacheLineAlign;
}
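The size policy implemented above can be summarized by the following hypothetical helper (alignments are log2 values, matching the surrounding code); loops over 128 bytes additionally get the S_INST_PREFETCH bracketing shown above:

static unsigned pickGfx10LoopAlignment(unsigned LoopSizeInBytes,
                                       unsigned DefaultAlign) {
  const unsigned CacheLineAlign = 6; // log2(64)
  // Small loops already fit the default prefetch window; very large loops do
  // not benefit from a 64-byte aligned header.
  if (LoopSizeInBytes <= 64 || LoopSizeInBytes > 192)
    return DefaultAlign;
  return CacheLineAlign;
}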
LLVM_ATTRIBUTE_UNUSED
@@ -9531,7 +10625,8 @@ static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
do {
// Follow the chain until we find an INLINEASM node.
N = N->getOperand(0).getNode();
- if (N->getOpcode() == ISD::INLINEASM)
+ if (N->getOpcode() == ISD::INLINEASM ||
+ N->getOpcode() == ISD::INLINEASM_BR)
return true;
} while (N->getOpcode() == ISD::CopyFromReg);
return false;
@@ -9616,7 +10711,10 @@ bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
bool SNaN,
unsigned Depth) const {
if (Op.getOpcode() == AMDGPUISD::CLAMP) {
- if (Subtarget->enableDX10Clamp())
+ const MachineFunction &MF = DAG.getMachineFunction();
+ const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+
+ if (Info->getMode().DX10Clamp)
return true; // Clamped to 0.
return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
}
@@ -9624,3 +10722,29 @@ bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DAG,
SNaN, Depth);
}
+
+TargetLowering::AtomicExpansionKind
+SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
+ switch (RMW->getOperation()) {
+ case AtomicRMWInst::FAdd: {
+ Type *Ty = RMW->getType();
+
+ // We don't have a way to support 16-bit atomics now, so just leave them
+ // as-is.
+ if (Ty->isHalfTy())
+ return AtomicExpansionKind::None;
+
+ if (!Ty->isFloatTy())
+ return AtomicExpansionKind::CmpXChg;
+
+    // TODO: We do have these for flat; older targets also had them for buffers.
+ unsigned AS = RMW->getPointerAddressSpace();
+ return (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomics()) ?
+ AtomicExpansionKind::None : AtomicExpansionKind::CmpXChg;
+ }
+ default:
+ break;
+ }
+
+ return AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(RMW);
+}
diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h
index bcef519ee663..21a215e16ce7 100644
--- a/lib/Target/AMDGPU/SIISelLowering.h
+++ b/lib/Target/AMDGPU/SIISelLowering.h
@@ -1,9 +1,8 @@
//===-- SIISelLowering.h - SI DAG Lowering Interface ------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -61,7 +60,7 @@ private:
SDValue lowerImage(SDValue Op, const AMDGPU::ImageDimIntrinsicInfo *Intr,
SelectionDAG &DAG) const;
SDValue lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, SDValue Offset,
- SDValue GLC, SelectionDAG &DAG) const;
+ SDValue GLC, SDValue DLC, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
@@ -90,11 +89,17 @@ private:
SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
-
+ SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
SDValue adjustLoadValueType(unsigned Opcode, MemSDNode *M,
SelectionDAG &DAG, ArrayRef<SDValue> Ops,
bool IsIntrinsic = false) const;
+ // Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
+ // dwordx4 if on SI.
+ SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
+ ArrayRef<SDValue> Ops, EVT MemVT,
+ MachineMemOperand *MMO, SelectionDAG &DAG) const;
+
SDValue handleD16VData(SDValue VData, SelectionDAG &DAG) const;
/// Converts \p Op, which must be of floating point type, to the
@@ -116,8 +121,10 @@ private:
SelectionDAG &DAG) const;
SDValue lowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerTRAP(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const;
@@ -141,6 +148,7 @@ private:
SDValue performOrCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performXorCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performZeroExtendCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performSignExtendInRegCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performClassCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue getCanonicalConstantFP(SelectionDAG &DAG, const SDLoc &SL, EVT VT,
const APFloat &C) const;
@@ -156,6 +164,7 @@ private:
SDValue performExtractVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performInsertVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue reassociateScalarOps(SDNode *N, SelectionDAG &DAG) const;
unsigned getFusedOpcode(const SelectionDAG &DAG,
const SDNode *N0, const SDNode *N1) const;
SDValue performAddCombine(SDNode *N, DAGCombinerInfo &DCI) const;
@@ -174,8 +183,6 @@ private:
unsigned isCFIntrinsic(const SDNode *Intr) const;
- void createDebuggerPrologueStackObjects(MachineFunction &MF) const;
-
/// \returns True if fixup needs to be emitted for given global value \p GV,
/// false otherwise.
bool shouldEmitFixup(const GlobalValue *GV) const;
@@ -194,6 +201,15 @@ private:
void setBufferOffsets(SDValue CombinedOffset, SelectionDAG &DAG,
SDValue *Offsets, unsigned Align = 4) const;
+ // Handle 8 bit and 16 bit buffer loads
+ SDValue handleByteShortBufferLoads(SelectionDAG &DAG, EVT LoadVT, SDLoc DL,
+ ArrayRef<SDValue> Ops, MemSDNode *M) const;
+
+ // Handle 8 bit and 16 bit buffer stores
+ SDValue handleByteShortBufferStores(SelectionDAG &DAG, EVT VDataType,
+ SDLoc DL, SDValue Ops[],
+ MemSDNode *M) const;
+
public:
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI);
@@ -219,20 +235,21 @@ public:
bool canMergeStoresTo(unsigned AS, EVT MemVT,
const SelectionDAG &DAG) const override;
- bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS,
- unsigned Align,
- bool *IsFast) const override;
+ bool allowsMisalignedMemoryAccesses(
+ EVT VT, unsigned AS, unsigned Align,
+ MachineMemOperand::Flags Flags = MachineMemOperand::MONone,
+ bool *IsFast = nullptr) const override;
EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
unsigned SrcAlign, bool IsMemset,
bool ZeroMemset,
bool MemcpyStrSrc,
- MachineFunction &MF) const override;
+ const AttributeList &FuncAttributes) const override;
bool isMemOpUniform(const SDNode *N) const;
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const;
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
- bool isCheapAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
+ bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
TargetLoweringBase::LegalizeTypeAction
getPreferredVectorAction(MVT VT) const override;
@@ -298,6 +315,9 @@ public:
MachineBasicBlock *splitKillBlock(MachineInstr &MI,
MachineBasicBlock *BB) const;
+ MachineBasicBlock *emitGWSMemViolTestLoop(MachineInstr &MI,
+ MachineBasicBlock *BB) const;
+
MachineBasicBlock *
EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const override;
@@ -352,6 +372,9 @@ public:
const SelectionDAG &DAG,
bool SNaN = false,
unsigned Depth = 0) const override;
+ AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override;
+
+ unsigned getPrefLoopAlignment(MachineLoop *ML) const override;
};
} // End namespace llvm
diff --git a/lib/Target/AMDGPU/SIInsertSkips.cpp b/lib/Target/AMDGPU/SIInsertSkips.cpp
index ba21a5ce1293..87e63fcc4a04 100644
--- a/lib/Target/AMDGPU/SIInsertSkips.cpp
+++ b/lib/Target/AMDGPU/SIInsertSkips.cpp
@@ -1,9 +1,8 @@
//===-- SIInsertSkips.cpp - Use predicates for control flow ---------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -93,15 +92,13 @@ INITIALIZE_PASS(SIInsertSkips, DEBUG_TYPE,
char &llvm::SIInsertSkipsPassID = SIInsertSkips::ID;
-static bool opcodeEmitsNoInsts(unsigned Opc) {
- switch (Opc) {
- case TargetOpcode::IMPLICIT_DEF:
- case TargetOpcode::KILL:
- case TargetOpcode::BUNDLE:
- case TargetOpcode::CFI_INSTRUCTION:
- case TargetOpcode::EH_LABEL:
- case TargetOpcode::GC_LABEL:
- case TargetOpcode::DBG_VALUE:
+static bool opcodeEmitsNoInsts(const MachineInstr &MI) {
+ if (MI.isMetaInstruction())
+ return true;
+
+ // Handle target specific opcodes.
+ switch (MI.getOpcode()) {
+ case AMDGPU::SI_MASK_BRANCH:
return true;
default:
return false;
@@ -110,9 +107,6 @@ static bool opcodeEmitsNoInsts(unsigned Opc) {
bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From,
const MachineBasicBlock &To) const {
- if (From.succ_empty())
- return false;
-
unsigned NumInstr = 0;
const MachineFunction *MF = From.getParent();
@@ -122,7 +116,7 @@ bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From,
for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
NumInstr < SkipThreshold && I != E; ++I) {
- if (opcodeEmitsNoInsts(I->getOpcode()))
+ if (opcodeEmitsNoInsts(*I))
continue;
// FIXME: Since this is required for correctness, this should be inserted
@@ -138,6 +132,11 @@ bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From,
if (TII->hasUnwantedEffectsWhenEXECEmpty(*I))
return true;
+ // These instructions are potentially expensive even if EXEC = 0.
+ if (TII->isSMRD(*I) || TII->isVMEM(*I) || TII->isFLAT(*I) ||
+ I->getOpcode() == AMDGPU::S_WAITCNT)
+ return true;
+
++NumInstr;
if (NumInstr >= SkipThreshold)
return true;
@@ -177,7 +176,7 @@ bool SIInsertSkips::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) {
.addImm(0); // en
// ... and terminate wavefront.
- BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));
+ BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM)).addImm(0);
return true;
}
@@ -245,6 +244,10 @@ void SIInsertSkips::kill(MachineInstr &MI) {
llvm_unreachable("invalid ISD:SET cond code");
}
+ const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
+ if (ST.hasNoSdstCMPX())
+ Opcode = AMDGPU::getVCMPXNoSDstOp(Opcode);
+
assert(MI.getOperand(0).isReg());
if (TRI->isVGPR(MBB.getParent()->getRegInfo(),
@@ -254,17 +257,23 @@ void SIInsertSkips::kill(MachineInstr &MI) {
.add(MI.getOperand(1))
.add(MI.getOperand(0));
} else {
- BuildMI(MBB, &MI, DL, TII->get(Opcode))
- .addReg(AMDGPU::VCC, RegState::Define)
- .addImm(0) // src0 modifiers
- .add(MI.getOperand(1))
- .addImm(0) // src1 modifiers
- .add(MI.getOperand(0))
- .addImm(0); // omod
+ auto I = BuildMI(MBB, &MI, DL, TII->get(Opcode));
+ if (!ST.hasNoSdstCMPX())
+ I.addReg(AMDGPU::VCC, RegState::Define);
+
+ I.addImm(0) // src0 modifiers
+ .add(MI.getOperand(1))
+ .addImm(0) // src1 modifiers
+ .add(MI.getOperand(0));
+
+ I.addImm(0); // omod
}
break;
}
case AMDGPU::SI_KILL_I1_TERMINATOR: {
+ const MachineFunction *MF = MI.getParent()->getParent();
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+ unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
const MachineOperand &Op = MI.getOperand(0);
int64_t KillVal = MI.getOperand(1).getImm();
assert(KillVal == 0 || KillVal == -1);
@@ -275,14 +284,17 @@ void SIInsertSkips::kill(MachineInstr &MI) {
assert(Imm == 0 || Imm == -1);
if (Imm == KillVal)
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
+ BuildMI(MBB, &MI, DL, TII->get(ST.isWave32() ? AMDGPU::S_MOV_B32
+ : AMDGPU::S_MOV_B64), Exec)
.addImm(0);
break;
}
unsigned Opcode = KillVal ? AMDGPU::S_ANDN2_B64 : AMDGPU::S_AND_B64;
- BuildMI(MBB, &MI, DL, TII->get(Opcode), AMDGPU::EXEC)
- .addReg(AMDGPU::EXEC)
+ if (ST.isWave32())
+ Opcode = KillVal ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_AND_B32;
+ BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec)
+ .addReg(Exec)
.add(Op);
break;
}
@@ -331,9 +343,11 @@ bool SIInsertSkips::optimizeVccBranch(MachineInstr &MI) const {
// S_CBRANCH_EXEC[N]Z
bool Changed = false;
MachineBasicBlock &MBB = *MI.getParent();
- const unsigned CondReg = AMDGPU::VCC;
- const unsigned ExecReg = AMDGPU::EXEC;
- const unsigned And = AMDGPU::S_AND_B64;
+ const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
+ const bool IsWave32 = ST.isWave32();
+ const unsigned CondReg = TRI->getVCC();
+ const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+ const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
E = MBB.rend();
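
Several places in this file now choose between the 32-bit and 64-bit scalar forms depending on the wavefront size: on wave32 targets the condition/exec pair is VCC_LO/EXEC_LO and the S_*_B32 opcodes are used, otherwise VCC/EXEC and S_*_B64. A hedged sketch of that selection, using plain strings instead of the AMDGPU opcode enums:

#include <string>

struct WaveCfg {
  std::string CondReg, ExecReg, And, Mov, AndN2;
};

// Pick register/opcode names by wavefront size; mirrors the pattern used in
// kill() and optimizeVccBranch() above (illustrative only).
WaveCfg selectWaveCfg(bool IsWave32) {
  if (IsWave32)
    return {"vcc_lo", "exec_lo", "s_and_b32", "s_mov_b32", "s_andn2_b32"};
  return {"vcc", "exec", "s_and_b64", "s_mov_b64", "s_andn2_b64"};
}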
diff --git a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index afc0b4467610..c89d5b71ec5c 100644
--- a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1,9 +1,8 @@
//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -69,10 +68,10 @@ DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE"-forcelgkm",
DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE"-forcevm",
"Force emit s_waitcnt vmcnt(0) instrs");
-static cl::opt<unsigned> ForceEmitZeroFlag(
+static cl::opt<bool> ForceEmitZeroFlag(
"amdgpu-waitcnt-forcezero",
cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
- cl::init(0), cl::Hidden);
+ cl::init(false), cl::Hidden);
namespace {
@@ -101,7 +100,7 @@ public:
#define CNT_MASK(t) (1u << (t))
-enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, NUM_INST_CNTS };
+enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, VS_CNT, NUM_INST_CNTS };
iterator_range<enum_iterator<InstCounterType>> inst_counter_types() {
return make_range(enum_iterator<InstCounterType>(VM_CNT),
@@ -114,6 +113,7 @@ struct {
uint32_t VmcntMax;
uint32_t ExpcntMax;
uint32_t LgkmcntMax;
+ uint32_t VscntMax;
int32_t NumVGPRsMax;
int32_t NumSGPRsMax;
} HardwareLimits;
@@ -127,6 +127,8 @@ struct {
enum WaitEventType {
VMEM_ACCESS, // vector-memory read & write
+ VMEM_READ_ACCESS, // vector-memory read
+ VMEM_WRITE_ACCESS,// vector-memory write
LDS_ACCESS, // lds read & write
GDS_ACCESS, // gds read & write
SQ_MESSAGE, // send message
@@ -140,11 +142,12 @@ enum WaitEventType {
};
static const uint32_t WaitEventMaskForInst[NUM_INST_CNTS] = {
- (1 << VMEM_ACCESS),
+ (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS),
(1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) |
(1 << SQ_MESSAGE),
(1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
(1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS),
+ (1 << VMEM_WRITE_ACCESS)
};
// The mapping is:
@@ -172,6 +175,9 @@ void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
case LGKM_CNT:
Wait.LgkmCnt = std::min(Wait.LgkmCnt, Count);
break;
+ case VS_CNT:
+ Wait.VsCnt = std::min(Wait.VsCnt, Count);
+ break;
default:
llvm_unreachable("bad InstCounterType");
}
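
This hunk adds a fourth counter, VS_CNT, which tracks outstanding VMEM stores on targets with a separate vscnt, alongside VM_CNT, LGKM_CNT and EXP_CNT; addWait() tightens a requested wait by taking the minimum of the current and new counts. A small self-contained model of that bookkeeping is sketched below. The field names mirror AMDGPU::Waitcnt, but this is a sketch, not the real helper, and allZero here takes a plain bool where the real code passes an IsaVersion.

#include <algorithm>

struct WaitcntSketch {
  // ~0u means "no wait required" on that counter.
  unsigned VmCnt = ~0u, ExpCnt = ~0u, LgkmCnt = ~0u, VsCnt = ~0u;

  static WaitcntSketch allZero(bool HasVscnt) {
    return {0, 0, 0, HasVscnt ? 0u : ~0u};
  }

  // Combine two requirements: the stricter (smaller) count wins per field.
  WaitcntSketch combined(const WaitcntSketch &O) const {
    return {std::min(VmCnt, O.VmCnt), std::min(ExpCnt, O.ExpCnt),
            std::min(LgkmCnt, O.LgkmCnt), std::min(VsCnt, O.VsCnt)};
  }

  bool hasWait() const {
    return VmCnt != ~0u || ExpCnt != ~0u || LgkmCnt != ~0u || VsCnt != ~0u;
  }
};

enum CounterSketch { VM, LGKM, EXP, VS };

void addWaitSketch(WaitcntSketch &W, CounterSketch T, unsigned Count) {
  switch (T) {
  case VM:   W.VmCnt   = std::min(W.VmCnt, Count);   break;
  case LGKM: W.LgkmCnt = std::min(W.LgkmCnt, Count); break;
  case EXP:  W.ExpCnt  = std::min(W.ExpCnt, Count);  break;
  case VS:   W.VsCnt   = std::min(W.VsCnt, Count);   break;
  }
}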
@@ -200,6 +206,8 @@ public:
return HardwareLimits.LgkmcntMax;
case EXP_CNT:
return HardwareLimits.ExpcntMax;
+ case VS_CNT:
+ return HardwareLimits.VscntMax;
default:
break;
}
@@ -222,10 +230,12 @@ public:
// Mapping from event to counter.
InstCounterType eventCounter(WaitEventType E) {
- if (E == VMEM_ACCESS)
+ if (WaitEventMaskForInst[VM_CNT] & (1 << E))
return VM_CNT;
if (WaitEventMaskForInst[LGKM_CNT] & (1 << E))
return LGKM_CNT;
+ if (WaitEventMaskForInst[VS_CNT] & (1 << E))
+ return VS_CNT;
assert(WaitEventMaskForInst[EXP_CNT] & (1 << E));
return EXP_CNT;
}
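
With this change eventCounter() classifies an event purely by membership in WaitEventMaskForInst, so the new VMEM_READ_ACCESS/VMEM_WRITE_ACCESS events fall out of the table rather than being special-cased. A sketch of the same table lookup with an illustrative subset of events:

#include <cassert>
#include <cstdint>

enum CntSketch { VM_CNT, LGKM_CNT, EXP_CNT, VS_CNT, NUM_CNTS };
enum EvtSketch { VMEM_READ, VMEM_WRITE, LDS, SMEM, EXP_POS, NUM_EVTS };

// Per-counter bitmask of the events it tracks (illustrative subset).
static const uint32_t MaskForCnt[NUM_CNTS] = {
  (1u << VMEM_READ),          // VM_CNT
  (1u << LDS) | (1u << SMEM), // LGKM_CNT
  (1u << EXP_POS),            // EXP_CNT
  (1u << VMEM_WRITE),         // VS_CNT
};

CntSketch eventCounterSketch(EvtSketch E) {
  if (MaskForCnt[VM_CNT] & (1u << E))
    return VM_CNT;
  if (MaskForCnt[LGKM_CNT] & (1u << E))
    return LGKM_CNT;
  if (MaskForCnt[VS_CNT] & (1u << E))
    return VS_CNT;
  assert(MaskForCnt[EXP_CNT] & (1u << E) && "unclassified event");
  return EXP_CNT;
}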
@@ -453,7 +463,7 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
unsigned OpNo, bool Def) const {
const MachineOperand &Op = MI->getOperand(OpNo);
if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()) ||
- (Def && !Op.isDef()))
+ (Def && !Op.isDef()) || TRI->isAGPR(*MRI, Op.getReg()))
return {-1, -1};
// A use via a PW operand does not need a waitcnt.
@@ -526,20 +536,22 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
// Put score on the source vgprs. If this is a store, just use those
// specific register(s).
if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
+ int AddrOpIdx =
+ AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr);
// All GDS operations must protect their address register (same as
// export.)
- if (Inst.getOpcode() != AMDGPU::DS_APPEND &&
- Inst.getOpcode() != AMDGPU::DS_CONSUME) {
- setExpScore(
- &Inst, TII, TRI, MRI,
- AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr),
- CurrScore);
+ if (AddrOpIdx != -1) {
+ setExpScore(&Inst, TII, TRI, MRI, AddrOpIdx, CurrScore);
}
+
if (Inst.mayStore()) {
- setExpScore(
- &Inst, TII, TRI, MRI,
- AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
- CurrScore);
+ if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
+ AMDGPU::OpName::data0) != -1) {
+ setExpScore(
+ &Inst, TII, TRI, MRI,
+ AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
+ CurrScore);
+ }
if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
AMDGPU::OpName::data1) != -1) {
setExpScore(&Inst, TII, TRI, MRI,
@@ -663,6 +675,9 @@ void WaitcntBrackets::print(raw_ostream &OS) {
case EXP_CNT:
OS << " EXP_CNT(" << UB - LB << "): ";
break;
+ case VS_CNT:
+ OS << " VS_CNT(" << UB - LB << "): ";
+ break;
default:
OS << " UNKNOWN(" << UB - LB << "): ";
break;
@@ -702,7 +717,8 @@ void WaitcntBrackets::print(raw_ostream &OS) {
bool WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
return simplifyWaitcnt(VM_CNT, Wait.VmCnt) |
simplifyWaitcnt(EXP_CNT, Wait.ExpCnt) |
- simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
+ simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt) |
+ simplifyWaitcnt(VS_CNT, Wait.VsCnt);
}
bool WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
@@ -745,6 +761,7 @@ void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
applyWaitcnt(VM_CNT, Wait.VmCnt);
applyWaitcnt(EXP_CNT, Wait.ExpCnt);
applyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
+ applyWaitcnt(VS_CNT, Wait.VsCnt);
}
void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
@@ -790,6 +807,21 @@ static bool readsVCCZ(const MachineInstr &MI) {
!MI.getOperand(1).isUndef();
}
+/// \returns true if the callee inserts an s_waitcnt 0 on function entry.
+static bool callWaitsOnFunctionEntry(const MachineInstr &MI) {
+ // Currently all conventions wait, but this may not always be the case.
+ //
+ // TODO: If IPRA is enabled, and the callee is isSafeForNoCSROpt, it may make
+ // senses to omit the wait and do it in the caller.
+ return true;
+}
+
+/// \returns true if the callee is expected to wait on any outstanding counters
+/// before returning.
+static bool callWaitsOnFunctionReturn(const MachineInstr &MI) {
+ return true;
+}
+
/// Generate s_waitcnt instruction to be placed before cur_Inst.
/// Instructions of a given type are returned in order,
/// but instructions of different types can complete out of order.
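
The two helpers added above encode the calling-convention assumption: every callee begins with a full s_waitcnt, so before a call only the registers holding the call target (an LGKM-counted load result, e.g. a GOT load) need to be ready; after a call the caller may assume everything has drained. A minimal sketch of that decision, with made-up names standing in for the score-bracket queries:

// Hypothetical stand-ins for the score-bracket queries; not LLVM API.
struct CallWaitInput {
  bool CalleeWaitsOnEntry;        // result of callWaitsOnFunctionEntry()
  unsigned LgkmNeededForCallAddr; // wait needed so the call address is ready
};

// Returns the lgkmcnt value to wait on before the call. If the callee did
// not wait in its prolog we would instead have to force every counter to
// zero here (the tail-call path handled in the hunk below).
unsigned lgkmWaitBeforeCall(const CallWaitInput &In) {
  return In.CalleeWaitsOnEntry ? In.LgkmNeededForCallAddr : 0u;
}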
@@ -815,7 +847,9 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
// TODO: Handle other cases of NeedsWaitcntVmBefore()
if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
- MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) {
+ MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL ||
+ MI.getOpcode() == AMDGPU::BUFFER_GL0_INV ||
+ MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) {
Wait.VmCnt = 0;
}
@@ -823,8 +857,9 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
// NOTE: this could be improved with knowledge of all call sites or
// with knowledge of the called routines.
if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
- MI.getOpcode() == AMDGPU::S_SETPC_B64_return) {
- Wait = AMDGPU::Waitcnt::allZero();
+ MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
+ (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
+ Wait = Wait.combined(AMDGPU::Waitcnt::allZero(IV));
}
// Resolve vm waits before gs-done.
else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
@@ -903,91 +938,91 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
}
}
-#if 0 // TODO: the following code to handle CALL.
- // The argument passing for CALLs should suffice for VM_CNT and LGKM_CNT.
- // However, there is a problem with EXP_CNT, because the call cannot
- // easily tell if a register is used in the function, and if it did, then
- // the referring instruction would have to have an S_WAITCNT, which is
- // dependent on all call sites. So Instead, force S_WAITCNT for EXP_CNTs
- // before the call.
- if (MI.getOpcode() == SC_CALL) {
- if (ScoreBrackets->getScoreUB(EXP_CNT) >
- ScoreBrackets->getScoreLB(EXP_CNT)) {
- ScoreBrackets->setScoreLB(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
- EmitWaitcnt |= CNT_MASK(EXP_CNT);
- }
- }
-#endif
-
- // FIXME: Should not be relying on memoperands.
- // Look at the source operands of every instruction to see if
- // any of them results from a previous memory operation that affects
- // its current usage. If so, an s_waitcnt instruction needs to be
- // emitted.
- // If the source operand was defined by a load, add the s_waitcnt
- // instruction.
- for (const MachineMemOperand *Memop : MI.memoperands()) {
- unsigned AS = Memop->getAddrSpace();
- if (AS != AMDGPUAS::LOCAL_ADDRESS)
- continue;
- unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
- // VM_CNT is only relevant to vgpr or LDS.
- ScoreBrackets.determineWait(
- VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
- }
+ if (MI.isCall() && callWaitsOnFunctionEntry(MI)) {
+ // Don't bother waiting on anything except the call address. The function
+ // is going to insert a wait on everything in its prolog. This still needs
+ // to be careful if the call target is a load (e.g. a GOT load).
+ Wait = AMDGPU::Waitcnt();
- for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
- const MachineOperand &Op = MI.getOperand(I);
- const MachineRegisterInfo &MRIA = *MRI;
- RegInterval Interval =
- ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, false);
+ int CallAddrOpIdx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
+ RegInterval Interval = ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI,
+ CallAddrOpIdx, false);
for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
- if (TRI->isVGPR(MRIA, Op.getReg())) {
- // VM_CNT is only relevant to vgpr or LDS.
- ScoreBrackets.determineWait(
- VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
- }
ScoreBrackets.determineWait(
LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
}
- }
- // End of for loop that looks at all source operands to decide vm_wait_cnt
- // and lgk_wait_cnt.
-
- // Two cases are handled for destination operands:
- // 1) If the destination operand was defined by a load, add the s_waitcnt
- // instruction to guarantee the right WAW order.
- // 2) If a destination operand that was used by a recent export/store ins,
- // add s_waitcnt on exp_cnt to guarantee the WAR order.
- if (MI.mayStore()) {
+ } else {
// FIXME: Should not be relying on memoperands.
+ // Look at the source operands of every instruction to see if
+ // any of them results from a previous memory operation that affects
+ // its current usage. If so, an s_waitcnt instruction needs to be
+ // emitted.
+ // If the source operand was defined by a load, add the s_waitcnt
+ // instruction.
for (const MachineMemOperand *Memop : MI.memoperands()) {
unsigned AS = Memop->getAddrSpace();
if (AS != AMDGPUAS::LOCAL_ADDRESS)
continue;
unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
+ // VM_CNT is only relevant to vgpr or LDS.
ScoreBrackets.determineWait(
VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
- ScoreBrackets.determineWait(
- EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
}
- }
- for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
- MachineOperand &Def = MI.getOperand(I);
- const MachineRegisterInfo &MRIA = *MRI;
- RegInterval Interval =
- ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, true);
- for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
- if (TRI->isVGPR(MRIA, Def.getReg())) {
+
+ for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
+ const MachineOperand &Op = MI.getOperand(I);
+ const MachineRegisterInfo &MRIA = *MRI;
+ RegInterval Interval =
+ ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, false);
+ for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+ if (TRI->isVGPR(MRIA, Op.getReg())) {
+ // VM_CNT is only relevant to vgpr or LDS.
+ ScoreBrackets.determineWait(
+ VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
+ }
+ ScoreBrackets.determineWait(
+ LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
+ }
+ }
+ // End of for loop that looks at all source operands to decide vm_wait_cnt
+ // and lgk_wait_cnt.
+
+ // Two cases are handled for destination operands:
+ // 1) If the destination operand was defined by a load, add the s_waitcnt
+ // instruction to guarantee the right WAW order.
+ // 2) If a destination operand that was used by a recent export/store ins,
+ // add s_waitcnt on exp_cnt to guarantee the WAR order.
+ if (MI.mayStore()) {
+ // FIXME: Should not be relying on memoperands.
+ for (const MachineMemOperand *Memop : MI.memoperands()) {
+ unsigned AS = Memop->getAddrSpace();
+ if (AS != AMDGPUAS::LOCAL_ADDRESS)
+ continue;
+ unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
ScoreBrackets.determineWait(
VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
ScoreBrackets.determineWait(
EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
}
- ScoreBrackets.determineWait(
- LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
}
- } // End of for loop that looks at all dest operands.
+ for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
+ MachineOperand &Def = MI.getOperand(I);
+ const MachineRegisterInfo &MRIA = *MRI;
+ RegInterval Interval =
+ ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, true);
+ for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+ if (TRI->isVGPR(MRIA, Def.getReg())) {
+ ScoreBrackets.determineWait(
+ VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
+ ScoreBrackets.determineWait(
+ EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
+ }
+ ScoreBrackets.determineWait(
+ LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
+ }
+ } // End of for loop that looks at all dest operands.
+ }
}
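
For non-call instructions the restructured block keeps the original per-operand scan: source and destination registers are checked against the score brackets, and a wait is needed whenever a register's recorded score still lies inside the open (lower bound, upper bound] interval of outstanding events; destinations additionally get EXP_CNT waits to preserve WAR order against exports and stores. A toy model of that interval test (the BracketSketch fields are illustrative; the real pass also forces a zero wait for out-of-order counters and pending FLAT operations):

#include <algorithm>
#include <cstdint>

// Toy score bracket for one counter: UB is the score of the newest event,
// LB the score everything already waited for. A register with score S still
// has an outstanding event iff LB < S <= UB.
struct BracketSketch {
  uint32_t LB = 0, UB = 0;
  uint32_t WaitCountMax = 63; // hardware field-width limit for this counter
};

// Returns the counter value to wait on for a register whose last relevant
// event had score ScoreToWait, or ~0u if no wait is needed.
uint32_t determineWaitSketch(const BracketSketch &B, uint32_t ScoreToWait) {
  if (ScoreToWait <= B.LB || ScoreToWait > B.UB)
    return ~0u;                       // already covered by an earlier wait
  uint32_t Needed = B.UB - ScoreToWait;
  return std::min(Needed, B.WaitCountMax - 1);
}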
// Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
@@ -996,13 +1031,13 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
// requiring a WAITCNT beforehand.
if (MI.getOpcode() == AMDGPU::S_BARRIER &&
!ST->hasAutoWaitcntBeforeBarrier()) {
- Wait = AMDGPU::Waitcnt::allZero();
+ Wait = Wait.combined(AMDGPU::Waitcnt::allZero(IV));
}
// TODO: Remove this work-around, enable the assert for Bug 457939
// after fixing the scheduler. Also, the Shader Compiler code is
// independent of target.
- if (readsVCCZ(MI) && ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS) {
+ if (readsVCCZ(MI) && ST->hasReadVCCZBug()) {
if (ScoreBrackets.getScoreLB(LGKM_CNT) <
ScoreBrackets.getScoreUB(LGKM_CNT) &&
ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
@@ -1014,21 +1049,31 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
if (!ScoreBrackets.simplifyWaitcnt(Wait) && !IsForceEmitWaitcnt) {
bool Modified = false;
if (OldWaitcntInstr) {
- if (TrackedWaitcntSet.count(OldWaitcntInstr)) {
- TrackedWaitcntSet.erase(OldWaitcntInstr);
- OldWaitcntInstr->eraseFromParent();
- Modified = true;
- } else {
- int64_t Imm = OldWaitcntInstr->getOperand(0).getImm();
- ScoreBrackets.applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm));
+ for (auto II = OldWaitcntInstr->getIterator(), NextI = std::next(II);
+ &*II != &MI; II = NextI, ++NextI) {
+ if (II->isDebugInstr())
+ continue;
+
+ if (TrackedWaitcntSet.count(&*II)) {
+ TrackedWaitcntSet.erase(&*II);
+ II->eraseFromParent();
+ Modified = true;
+ } else if (II->getOpcode() == AMDGPU::S_WAITCNT) {
+ int64_t Imm = II->getOperand(0).getImm();
+ ScoreBrackets.applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm));
+ } else {
+ assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
+ assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
+ ScoreBrackets.applyWaitcnt(
+ AMDGPU::Waitcnt(0, 0, 0, II->getOperand(1).getImm()));
+ }
}
- Modified = true;
}
return Modified;
}
if (ForceEmitZeroWaitcnts)
- Wait = AMDGPU::Waitcnt::allZero();
+ Wait = AMDGPU::Waitcnt::allZero(IV);
if (ForceEmitWaitcnt[VM_CNT])
Wait.VmCnt = 0;
@@ -1036,39 +1081,88 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
Wait.ExpCnt = 0;
if (ForceEmitWaitcnt[LGKM_CNT])
Wait.LgkmCnt = 0;
+ if (ForceEmitWaitcnt[VS_CNT])
+ Wait.VsCnt = 0;
ScoreBrackets.applyWaitcnt(Wait);
AMDGPU::Waitcnt OldWait;
+ bool Modified = false;
+
if (OldWaitcntInstr) {
- OldWait =
- AMDGPU::decodeWaitcnt(IV, OldWaitcntInstr->getOperand(0).getImm());
- }
- if (OldWait.dominates(Wait))
- return false;
+ for (auto II = OldWaitcntInstr->getIterator(), NextI = std::next(II);
+ &*II != &MI; II = NextI, NextI++) {
+ if (II->isDebugInstr())
+ continue;
- if (OldWaitcntInstr && !TrackedWaitcntSet.count(OldWaitcntInstr))
- Wait = Wait.combined(OldWait);
+ if (II->getOpcode() == AMDGPU::S_WAITCNT) {
+ unsigned IEnc = II->getOperand(0).getImm();
+ AMDGPU::Waitcnt IWait = AMDGPU::decodeWaitcnt(IV, IEnc);
+ OldWait = OldWait.combined(IWait);
+ if (!TrackedWaitcntSet.count(&*II))
+ Wait = Wait.combined(IWait);
+ unsigned NewEnc = AMDGPU::encodeWaitcnt(IV, Wait);
+ if (IEnc != NewEnc) {
+ II->getOperand(0).setImm(NewEnc);
+ Modified = true;
+ }
+ Wait.VmCnt = ~0u;
+ Wait.LgkmCnt = ~0u;
+ Wait.ExpCnt = ~0u;
+ } else {
+ assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
+ assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
+
+ unsigned ICnt = II->getOperand(1).getImm();
+ OldWait.VsCnt = std::min(OldWait.VsCnt, ICnt);
+ if (!TrackedWaitcntSet.count(&*II))
+ Wait.VsCnt = std::min(Wait.VsCnt, ICnt);
+ if (Wait.VsCnt != ICnt) {
+ II->getOperand(1).setImm(Wait.VsCnt);
+ Modified = true;
+ }
+ Wait.VsCnt = ~0u;
+ }
- unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
- if (OldWaitcntInstr) {
- OldWaitcntInstr->getOperand(0).setImm(Enc);
+ LLVM_DEBUG(dbgs() << "updateWaitcntInBlock\n"
+ << "Old Instr: " << MI << '\n'
+ << "New Instr: " << *II << '\n');
- LLVM_DEBUG(dbgs() << "updateWaitcntInBlock\n"
- << "Old Instr: " << MI << '\n'
- << "New Instr: " << *OldWaitcntInstr << '\n');
- } else {
+ if (!Wait.hasWait())
+ return Modified;
+ }
+ }
+
+ if (Wait.VmCnt != ~0u || Wait.LgkmCnt != ~0u || Wait.ExpCnt != ~0u) {
+ unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
.addImm(Enc);
TrackedWaitcntSet.insert(SWaitInst);
+ Modified = true;
LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n"
<< "Old Instr: " << MI << '\n'
<< "New Instr: " << *SWaitInst << '\n');
}
- return true;
+ if (Wait.VsCnt != ~0u) {
+ assert(ST->hasVscnt());
+
+ auto SWaitInst =
+ BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
+ TII->get(AMDGPU::S_WAITCNT_VSCNT))
+ .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
+ .addImm(Wait.VsCnt);
+ TrackedWaitcntSet.insert(SWaitInst);
+ Modified = true;
+
+ LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n"
+ << "Old Instr: " << MI << '\n'
+ << "New Instr: " << *SWaitInst << '\n');
+ }
+
+ return Modified;
}
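
Rather than deleting and re-emitting, the updated code walks any pre-existing s_waitcnt / s_waitcnt_vscnt instructions in front of MI, folds the required counts into their immediates, and only emits new instructions for whatever remains. Below is a simplified sketch of tightening an already-encoded s_waitcnt immediate; the 4/3/4-bit field layout is illustrative and not the real per-generation encoding from AMDGPUBaseInfo.

#include <algorithm>

// Illustrative fixed field layout: vmcnt[3:0], expcnt[6:4], lgkmcnt[10:7].
struct DecodedWait { unsigned Vm, Exp, Lgkm; };

DecodedWait decodeSketch(unsigned Enc) {
  return {Enc & 0xf, (Enc >> 4) & 0x7, (Enc >> 7) & 0xf};
}

unsigned encodeSketch(const DecodedWait &W) {
  return (W.Vm & 0xf) | ((W.Exp & 0x7) << 4) | ((W.Lgkm & 0xf) << 7);
}

// Fold a required wait into an existing immediate; returns the new encoding
// and reports whether the instruction actually needs to be rewritten.
unsigned tightenExistingWaitcnt(unsigned OldEnc, const DecodedWait &Required,
                                bool &Modified) {
  DecodedWait Old = decodeSketch(OldEnc);
  DecodedWait New = {std::min(Old.Vm, Required.Vm),
                     std::min(Old.Exp, Required.Exp),
                     std::min(Old.Lgkm, Required.Lgkm)};
  unsigned NewEnc = encodeSketch(New);
  Modified = NewEnc != OldEnc;
  return NewEnc;
}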
// This is a flat memory operation. Check to see if it has memory
@@ -1093,7 +1187,8 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
// bracket and the destination operand scores.
// TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere.
if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
- if (TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
+ if (TII->isAlwaysGDS(Inst.getOpcode()) ||
+ TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
} else {
@@ -1102,8 +1197,15 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
} else if (TII->isFLAT(Inst)) {
assert(Inst.mayLoad() || Inst.mayStore());
- if (TII->usesVM_CNT(Inst))
- ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
+ if (TII->usesVM_CNT(Inst)) {
+ if (!ST->hasVscnt())
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
+ else if (Inst.mayLoad() &&
+ AMDGPU::getAtomicRetOp(Inst.getOpcode()) == -1)
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_READ_ACCESS, Inst);
+ else
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_WRITE_ACCESS, Inst);
+ }
if (TII->usesLGKM_CNT(Inst)) {
ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
@@ -1118,14 +1220,33 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
// TODO: get a better carve out.
Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1 &&
Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_SC &&
- Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL) {
- ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
+ Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL &&
+ Inst.getOpcode() != AMDGPU::BUFFER_GL0_INV &&
+ Inst.getOpcode() != AMDGPU::BUFFER_GL1_INV) {
+ if (!ST->hasVscnt())
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
+ else if ((Inst.mayLoad() &&
+ AMDGPU::getAtomicRetOp(Inst.getOpcode()) == -1) ||
+ /* IMAGE_GET_RESINFO / IMAGE_GET_LOD */
+ (TII->isMIMG(Inst) && !Inst.mayLoad() && !Inst.mayStore()))
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_READ_ACCESS, Inst);
+ else if (Inst.mayStore())
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_WRITE_ACCESS, Inst);
+
if (ST->vmemWriteNeedsExpWaitcnt() &&
(Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1)) {
ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
}
} else if (TII->isSMRD(Inst)) {
ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
+ } else if (Inst.isCall()) {
+ if (callWaitsOnFunctionReturn(Inst)) {
+ // Act as a wait on everything
+ ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt::allZero(IV));
+ } else {
+ // May need to wait for anything.
+ ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
+ }
} else {
switch (Inst.getOpcode()) {
case AMDGPU::S_SENDMSG:
@@ -1236,31 +1357,18 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
// Walk over the instructions.
MachineInstr *OldWaitcntInstr = nullptr;
- for (MachineBasicBlock::iterator Iter = Block.begin(), E = Block.end();
+ for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
+ E = Block.instr_end();
Iter != E;) {
MachineInstr &Inst = *Iter;
- // Remove any previously existing waitcnts.
- if (Inst.getOpcode() == AMDGPU::S_WAITCNT) {
- if (OldWaitcntInstr) {
- if (TrackedWaitcntSet.count(OldWaitcntInstr)) {
- TrackedWaitcntSet.erase(OldWaitcntInstr);
- OldWaitcntInstr->eraseFromParent();
- OldWaitcntInstr = nullptr;
- } else if (!TrackedWaitcntSet.count(&Inst)) {
- // Two successive s_waitcnt's, both of which are pre-existing and
- // are therefore preserved.
- int64_t Imm = OldWaitcntInstr->getOperand(0).getImm();
- ScoreBrackets.applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm));
- } else {
- ++Iter;
- Inst.eraseFromParent();
- Modified = true;
- continue;
- }
- }
-
- OldWaitcntInstr = &Inst;
+ // Track pre-existing waitcnts from earlier iterations.
+ if (Inst.getOpcode() == AMDGPU::S_WAITCNT ||
+ (Inst.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
+ Inst.getOperand(0).isReg() &&
+ Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL)) {
+ if (!OldWaitcntInstr)
+ OldWaitcntInstr = &Inst;
++Iter;
continue;
}
@@ -1299,27 +1407,16 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
ScoreBrackets.dump();
});
- // Check to see if this is a GWS instruction. If so, and if this is CI or
- // VI, then the generated code sequence will include an S_WAITCNT 0.
- // TODO: Are these the only GWS instructions?
- if (Inst.getOpcode() == AMDGPU::DS_GWS_INIT ||
- Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_V ||
- Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
- Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_P ||
- Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER) {
- // TODO: && context->target_info->GwsRequiresMemViolTest() ) {
- ScoreBrackets.applyWaitcnt(AMDGPU::Waitcnt::allZero());
- }
-
// TODO: Remove this work-around after fixing the scheduler and enable the
// assert above.
if (VCCZBugWorkAround) {
// Restore the vccz bit. Any time a value is written to vcc, the vcc
// bit is updated, so we can restore the bit by reading the value of
// vcc and then writing it back to the register.
- BuildMI(Block, Inst, Inst.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
- AMDGPU::VCC)
- .addReg(AMDGPU::VCC);
+ BuildMI(Block, Inst, Inst.getDebugLoc(),
+ TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
+ TRI->getVCC())
+ .addReg(TRI->getVCC());
VCCZBugHandledSet.insert(&Inst);
Modified = true;
}
@@ -1345,6 +1442,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
+ HardwareLimits.VscntMax = ST->hasVscnt() ? 63 : 0;
HardwareLimits.NumVGPRsMax = ST->getAddressableNumVGPRs();
HardwareLimits.NumSGPRsMax = ST->getAddressableNumSGPRs();
@@ -1480,6 +1578,11 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
// TODO: Could insert earlier and schedule more liberally with operations
// that only use caller preserved registers.
MachineBasicBlock &EntryBB = MF.front();
+ if (ST->hasVscnt())
+ BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(),
+ TII->get(AMDGPU::S_WAITCNT_VSCNT))
+ .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
+ .addImm(0);
BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
.addImm(0);
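
On entry to a function the pass conservatively waits for anything the caller may have left outstanding; on targets with a separate store counter that now takes two instructions. The sketch below is only an illustration of the resulting entry sequence as text; the real code builds the equivalent MachineInstrs with BuildMI as shown above, and the helper name is hypothetical.

#include <string>
#include <vector>

// Hypothetical helper returning the textual form of the entry waits.
std::vector<std::string> entryWaits(bool HasVscnt) {
  std::vector<std::string> Seq;
  if (HasVscnt)
    Seq.push_back("s_waitcnt_vscnt null, 0x0");              // drain VMEM stores
  Seq.push_back("s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)");  // drain the rest
  return Seq;
}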
diff --git a/lib/Target/AMDGPU/SIInstrFormats.td b/lib/Target/AMDGPU/SIInstrFormats.td
index 65ffc27b8b60..561a16c3e351 100644
--- a/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/lib/Target/AMDGPU/SIInstrFormats.td
@@ -1,9 +1,8 @@
//===-- SIInstrFormats.td - SI Instruction Encodings ----------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -11,19 +10,9 @@
//
//===----------------------------------------------------------------------===//
-def isGCN : Predicate<"Subtarget->getGeneration() "
- ">= AMDGPUSubtarget::SOUTHERN_ISLANDS">,
- AssemblerPredicate<"FeatureGCN">;
-def isSI : Predicate<"Subtarget->getGeneration() "
- "== AMDGPUSubtarget::SOUTHERN_ISLANDS">,
- AssemblerPredicate<"FeatureSouthernIslands">;
-
-
class InstSI <dag outs, dag ins, string asm = "",
list<dag> pattern = []> :
AMDGPUInst<outs, ins, asm, pattern>, GCNPredicateControl {
- let SubtargetPredicate = isGCN;
-
// Low bits - basic encoding information.
field bit SALU = 0;
field bit VALU = 0;
@@ -121,10 +110,20 @@ class InstSI <dag outs, dag ins, string asm = "",
// This bit indicates that this is a D16 buffer instruction.
field bit D16Buf = 0;
+ // This field indicates that a FLAT instruction accesses the FLAT_GLBL or
+ // FLAT_SCRATCH segment. It must be 0 for non-FLAT instructions.
+ field bit IsNonFlatSeg = 0;
+
// This bit indicates that this uses the floating point double precision
// rounding mode flags
field bit FPDPRounding = 0;
+ // Instruction is FP atomic.
+ field bit FPAtomic = 0;
+
+ // This bit indicates that this is one of MFMA instructions.
+ field bit IsMAI = 0;
+
// These need to be kept in sync with the enum in SIInstrFlags.
let TSFlags{0} = SALU;
let TSFlags{1} = VALU;
@@ -182,7 +181,13 @@ class InstSI <dag outs, dag ins, string asm = "",
let TSFlags{50} = D16Buf;
- let TSFlags{51} = FPDPRounding;
+ let TSFlags{51} = IsNonFlatSeg;
+
+ let TSFlags{52} = FPDPRounding;
+
+ let TSFlags{53} = FPAtomic;
+
+ let TSFlags{54} = IsMAI;
let SchedRW = [Write32Bit];
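
The TableGen change above appends new TSFlags bits after D16Buf: IsNonFlatSeg takes bit 51, FPDPRounding moves to 52, and FPAtomic and IsMAI occupy 53 and 54. The C++ side reads these back with masks; a hedged sketch of the corresponding queries is below. The mask names are illustrative — the real constants live in the SIInstrFlags enum, which must stay in sync with this file.

#include <cstdint>

// Illustrative masks matching the bit positions assigned above.
namespace SketchFlags {
constexpr uint64_t IsNonFlatSeg = 1ull << 51;
constexpr uint64_t FPDPRounding = 1ull << 52;
constexpr uint64_t FPAtomic     = 1ull << 53;
constexpr uint64_t IsMAI        = 1ull << 54;
} // namespace SketchFlags

bool isMAISketch(uint64_t TSFlags)      { return TSFlags & SketchFlags::IsMAI; }
bool isFPAtomicSketch(uint64_t TSFlags) { return TSFlags & SketchFlags::FPAtomic; }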
@@ -251,38 +256,59 @@ class VINTRPe <bits<2> op> : Enc32 {
let Inst{31-26} = 0x32; // encoding
}
-class MIMGe <bits<7> op> : Enc64 {
+class MIMGe : Enc64 {
bits<8> vdata;
bits<4> dmask;
bits<1> unorm;
bits<1> glc;
- bits<1> da;
bits<1> r128;
bits<1> tfe;
bits<1> lwe;
bits<1> slc;
bit d16;
- bits<8> vaddr;
bits<7> srsrc;
bits<7> ssamp;
let Inst{11-8} = dmask;
let Inst{12} = unorm;
let Inst{13} = glc;
- let Inst{14} = da;
let Inst{15} = r128;
let Inst{16} = tfe;
let Inst{17} = lwe;
- let Inst{24-18} = op;
let Inst{25} = slc;
let Inst{31-26} = 0x3c;
- let Inst{39-32} = vaddr;
let Inst{47-40} = vdata;
let Inst{52-48} = srsrc{6-2};
let Inst{57-53} = ssamp{6-2};
let Inst{63} = d16;
}
+class MIMGe_gfx6789 <bits<8> op> : MIMGe {
+ bits<8> vaddr;
+ bits<1> da;
+
+ let Inst{0} = op{7};
+ let Inst{14} = da;
+ let Inst{24-18} = op{6-0};
+ let Inst{39-32} = vaddr;
+}
+
+class MIMGe_gfx10 <bits<8> op> : MIMGe {
+ bits<8> vaddr0;
+ bits<3> dim;
+ bits<2> nsa;
+ bits<1> dlc;
+ bits<1> a16 = 0; // TODO: this should be an operand
+
+ let Inst{0} = op{7};
+ let Inst{2-1} = nsa;
+ let Inst{5-3} = dim;
+ let Inst{7} = dlc;
+ let Inst{24-18} = op{6-0};
+ let Inst{39-32} = vaddr0;
+ let Inst{62} = a16;
+}
+
class EXPe : Enc64 {
bits<4> en;
bits<6> tgt;
diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp
index 2370d5fa7b27..ba8ed6993a56 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1,9 +1,8 @@
//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -14,7 +13,6 @@
#include "SIInstrInfo.h"
#include "AMDGPU.h"
-#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
#include "GCNHazardRecognizer.h"
#include "SIDefines.h"
@@ -100,12 +98,6 @@ static unsigned getNumOperandsNoGlue(SDNode *Node) {
return N;
}
-static SDValue findChainOperand(SDNode *Load) {
- SDValue LastOp = Load->getOperand(getNumOperandsNoGlue(Load) - 1);
- assert(LastOp.getValueType() == MVT::Other && "Chain missing from load node");
- return LastOp;
-}
-
/// Returns true if both nodes have the same value for the given
/// operand \p Op, or if both nodes do not have this operand.
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
@@ -142,7 +134,8 @@ bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
case AMDGPU::V_MOV_B32_e32:
case AMDGPU::V_MOV_B32_e64:
case AMDGPU::V_MOV_B64_PSEUDO:
- return true;
+ // No implicit operands.
+ return MI.getNumOperands() == MI.getDesc().getNumOperands();
default:
return false;
}
@@ -168,22 +161,25 @@ bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
return false;
// Check base reg.
- if (Load0->getOperand(1) != Load1->getOperand(1))
- return false;
-
- // Check chain.
- if (findChainOperand(Load0) != findChainOperand(Load1))
+ if (Load0->getOperand(0) != Load1->getOperand(0))
return false;
// Skip read2 / write2 variants for simplicity.
// TODO: We should report true if the used offsets are adjacent (excluded
// st64 versions).
- if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::data1) != -1 ||
- AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::data1) != -1)
+ int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
+ int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
+ if (Offset0Idx == -1 || Offset1Idx == -1)
return false;
- Offset0 = cast<ConstantSDNode>(Load0->getOperand(2))->getZExtValue();
- Offset1 = cast<ConstantSDNode>(Load1->getOperand(2))->getZExtValue();
+ // XXX - be careful of dataless loads
+ // getNamedOperandIdx returns the index for MachineInstrs. Since they
+ // include the output in the operand list, but SDNodes don't, we need to
+ // subtract the index by one.
+ Offset0Idx -= get(Opc0).NumDefs;
+ Offset1Idx -= get(Opc1).NumDefs;
+ Offset0 = cast<ConstantSDNode>(Load0->getOperand(Offset0Idx))->getZExtValue();
+ Offset1 = cast<ConstantSDNode>(Load1->getOperand(Offset1Idx))->getZExtValue();
return true;
}
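
getNamedOperandIdx() returns indices into a MachineInstr operand list, which begins with the definitions; the SelectionDAG node for the same load has no result operands, so the index must be shifted down by the number of defs before it can be used on the SDNode, as the hunk above now does for the offset operands. A tiny sketch of that adjustment (names are illustrative):

#include <cassert>

// Convert a MachineInstr operand index (defs first, then uses) into the
// corresponding SelectionDAG operand index, which has no def operands.
int miOperandIdxToSDNodeIdx(int MIOpIdx, unsigned NumDefs) {
  assert(MIOpIdx >= static_cast<int>(NumDefs) && "index points at a def");
  return MIOpIdx - static_cast<int>(NumDefs);
}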
@@ -207,10 +203,6 @@ bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
if (!Load0Offset || !Load1Offset)
return false;
- // Check chain.
- if (findChainOperand(Load0) != findChainOperand(Load1))
- return false;
-
Offset0 = Load0Offset->getZExtValue();
Offset1 = Load1Offset->getZExtValue();
return true;
@@ -221,7 +213,6 @@ bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
// MUBUF and MTBUF have vaddr at different indices.
if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
- findChainOperand(Load0) != findChainOperand(Load1) ||
!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
return false;
@@ -233,10 +224,10 @@ bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
return false;
// getNamedOperandIdx returns the index for MachineInstrs. Since they
- // inlcude the output in the operand list, but SDNodes don't, we need to
+ // include the output in the operand list, but SDNodes don't, we need to
// subtract the index by one.
- --OffIdx0;
- --OffIdx1;
+ OffIdx0 -= get(Opc0).NumDefs;
+ OffIdx1 -= get(Opc1).NumDefs;
SDValue Off0 = Load0->getOperand(OffIdx0);
SDValue Off1 = Load1->getOperand(OffIdx1);
@@ -265,8 +256,8 @@ static bool isStride64(unsigned Opc) {
}
}
-bool SIInstrInfo::getMemOperandWithOffset(MachineInstr &LdSt,
- MachineOperand *&BaseOp,
+bool SIInstrInfo::getMemOperandWithOffset(const MachineInstr &LdSt,
+ const MachineOperand *&BaseOp,
int64_t &Offset,
const TargetRegisterInfo *TRI) const {
unsigned Opc = LdSt.getOpcode();
@@ -277,6 +268,11 @@ bool SIInstrInfo::getMemOperandWithOffset(MachineInstr &LdSt,
if (OffsetImm) {
// Normal, single offset LDS instruction.
BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
+ // TODO: ds_consume/ds_append use M0 for the base address. Is it safe to
+ // report that here?
+ if (!BaseOp)
+ return false;
+
Offset = OffsetImm->getImm();
assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base "
"operands of type register.");
@@ -325,7 +321,7 @@ bool SIInstrInfo::getMemOperandWithOffset(MachineInstr &LdSt,
if (SOffset && SOffset->isReg())
return false;
- MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
+ const MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
if (!AddrReg)
return false;
@@ -348,7 +344,7 @@ bool SIInstrInfo::getMemOperandWithOffset(MachineInstr &LdSt,
if (!OffsetImm)
return false;
- MachineOperand *SBaseReg = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
+ const MachineOperand *SBaseReg = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
BaseOp = SBaseReg;
Offset = OffsetImm->getImm();
assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base "
@@ -357,7 +353,7 @@ bool SIInstrInfo::getMemOperandWithOffset(MachineInstr &LdSt,
}
if (isFLAT(LdSt)) {
- MachineOperand *VAddr = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
+ const MachineOperand *VAddr = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
if (VAddr) {
// Can't analyze 2 offsets.
if (getNamedOperand(LdSt, AMDGPU::OpName::saddr))
@@ -413,11 +409,11 @@ static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
return Base1 == Base2;
}
-bool SIInstrInfo::shouldClusterMemOps(MachineOperand &BaseOp1,
- MachineOperand &BaseOp2,
+bool SIInstrInfo::shouldClusterMemOps(const MachineOperand &BaseOp1,
+ const MachineOperand &BaseOp2,
unsigned NumLoads) const {
- MachineInstr &FirstLdSt = *BaseOp1.getParent();
- MachineInstr &SecondLdSt = *BaseOp2.getParent();
+ const MachineInstr &FirstLdSt = *BaseOp1.getParent();
+ const MachineInstr &SecondLdSt = *BaseOp2.getParent();
if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOp1, SecondLdSt, BaseOp2))
return false;
@@ -461,7 +457,12 @@ bool SIInstrInfo::shouldClusterMemOps(MachineOperand &BaseOp1,
const MachineRegisterInfo &MRI =
FirstLdSt.getParent()->getParent()->getRegInfo();
- const TargetRegisterClass *DstRC = MRI.getRegClass(FirstDst->getReg());
+
+ const unsigned Reg = FirstDst->getReg();
+
+ const TargetRegisterClass *DstRC = TargetRegisterInfo::isVirtualRegister(Reg)
+ ? MRI.getRegClass(Reg)
+ : RI.getPhysRegClass(Reg);
return (NumLoads * (RI.getRegSizeInBits(*DstRC) / 8)) <= LoadClusterThreshold;
}
@@ -511,8 +512,11 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
if (RC == &AMDGPU::VGPR_32RegClass) {
assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
- AMDGPU::SReg_32RegClass.contains(SrcReg));
- BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
+ AMDGPU::SReg_32RegClass.contains(SrcReg) ||
+ AMDGPU::AGPR_32RegClass.contains(SrcReg));
+ unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
+ AMDGPU::V_ACCVGPR_READ_B32 : AMDGPU::V_MOV_B32_e32;
+ BuildMI(MBB, MI, DL, get(Opc), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
return;
}
@@ -526,6 +530,21 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
return;
}
+ if (DestReg == AMDGPU::VCC_LO) {
+ if (AMDGPU::SReg_32RegClass.contains(SrcReg)) {
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::VCC_LO)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ } else {
+ // FIXME: Hack until VReg_1 removed.
+ assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
+ .addImm(0)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ }
+
+ return;
+ }
+
if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
return;
@@ -570,10 +589,83 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
return;
}
+ if (RC == &AMDGPU::AGPR_32RegClass) {
+ assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
+ AMDGPU::SReg_32RegClass.contains(SrcReg) ||
+ AMDGPU::AGPR_32RegClass.contains(SrcReg));
+ if (!AMDGPU::VGPR_32RegClass.contains(SrcReg)) {
+ // First try to find defining accvgpr_write to avoid temporary registers.
+ for (auto Def = MI, E = MBB.begin(); Def != E; ) {
+ --Def;
+ if (!Def->definesRegister(SrcReg, &RI))
+ continue;
+ if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32)
+ break;
+
+ MachineOperand &DefOp = Def->getOperand(1);
+ assert(DefOp.isReg() || DefOp.isImm());
+
+ if (DefOp.isReg()) {
+ // Check that the register source operand is not clobbered before MI.
+ // Immediate operands are always safe to propagate.
+ bool SafeToPropagate = true;
+ for (auto I = Def; I != MI && SafeToPropagate; ++I)
+ if (I->modifiesRegister(DefOp.getReg(), &RI))
+ SafeToPropagate = false;
+
+ if (!SafeToPropagate)
+ break;
+
+ DefOp.setIsKill(false);
+ }
+
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32), DestReg)
+ .add(DefOp);
+ return;
+ }
+
+ RegScavenger RS;
+ RS.enterBasicBlock(MBB);
+ RS.forward(MI);
+
+ // Ideally we want to have three registers for a long reg_sequence copy
+ // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
+ unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
+ *MBB.getParent());
+
+ // Registers in the sequence are allocated contiguously so we can just
+ // use register number to pick one of three round-robin temps.
+ unsigned RegNo = DestReg % 3;
+ unsigned Tmp = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0);
+ if (!Tmp)
+ report_fatal_error("Cannot scavenge VGPR to copy to AGPR");
+ RS.setRegUsed(Tmp);
+ // Only loop through if there are any free registers left; otherwise the
+ // scavenger may report a fatal error when there is no emergency spill
+ // slot, or it may spill using the slot.
+ while (RegNo-- && RS.FindUnusedReg(&AMDGPU::VGPR_32RegClass)) {
+ unsigned Tmp2 = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0);
+ if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
+ break;
+ Tmp = Tmp2;
+ RS.setRegUsed(Tmp);
+ }
+ copyPhysReg(MBB, MI, DL, Tmp, SrcReg, KillSrc);
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32), DestReg)
+ .addReg(Tmp, RegState::Kill);
+ return;
+ }
+
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ return;
+ }
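
SGPR and AGPR sources cannot be written to an AGPR directly, so the copy goes through a scavenged VGPR temporary; because v_accvgpr_write has a two-waitstate hazard after v_mov_b32, the code tries to rotate over up to three temporaries, picking the starting temp round-robin from the destination register number. A standalone sketch of that selection policy follows; the scavenger is modelled as a simple free list, and the real code additionally stops once a candidate exceeds the VGPR pressure limit.

#include <vector>

// Pick a VGPR temp for copying into AGPR number DestRegNo. FreeTemps models
// whatever the register scavenger can currently hand out; we advance through
// it up to (DestRegNo % 3) extra times so long sequences of contiguous AGPR
// copies rotate over ~3 temps and hide the v_mov_b32 -> v_accvgpr_write
// waitstates.
int pickRoundRobinTemp(const std::vector<int> &FreeTemps, unsigned DestRegNo) {
  if (FreeTemps.empty())
    return -1;                       // caller must report a fatal error
  unsigned Idx = 0;
  for (unsigned Hops = DestRegNo % 3; Hops && Idx + 1 < FreeTemps.size(); --Hops)
    ++Idx;
  return FreeTemps[Idx];
}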
+
unsigned EltSize = 4;
unsigned Opcode = AMDGPU::V_MOV_B32_e32;
if (RI.isSGPRClass(RC)) {
- if (RI.getRegSizeInBits(*RC) > 32) {
+ // TODO: Copy vec3/vec5 with s_mov_b64s then final s_mov_b32.
+ if (!(RI.getRegSizeInBits(*RC) % 64)) {
Opcode = AMDGPU::S_MOV_B64;
EltSize = 8;
} else {
@@ -585,6 +677,11 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
return;
}
+ } else if (RI.hasAGPRs(RC)) {
+ Opcode = RI.hasVGPRs(RI.getPhysRegClass(SrcReg)) ?
+ AMDGPU::V_ACCVGPR_WRITE_B32 : AMDGPU::COPY;
+ } else if (RI.hasVGPRs(RC) && RI.hasAGPRs(RI.getPhysRegClass(SrcReg))) {
+ Opcode = AMDGPU::V_ACCVGPR_READ_B32;
}
ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
@@ -597,6 +694,12 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
else
SubIdx = SubIndices[SubIndices.size() - Idx - 1];
+ if (Opcode == TargetOpcode::COPY) {
+ copyPhysReg(MBB, MI, DL, RI.getSubReg(DestReg, SubIdx),
+ RI.getSubReg(SrcReg, SubIdx), KillSrc);
+ continue;
+ }
+
MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
get(Opcode), RI.getSubReg(DestReg, SubIdx));
@@ -696,38 +799,50 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
unsigned TrueReg,
unsigned FalseReg) const {
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ MachineFunction *MF = MBB.getParent();
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+ const TargetRegisterClass *BoolXExecRC =
+ RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
"Not a VGPR32 reg");
if (Cond.size() == 1) {
- unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+ unsigned SReg = MRI.createVirtualRegister(BoolXExecRC);
BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
.add(Cond[0]);
BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+ .addImm(0)
.addReg(FalseReg)
+ .addImm(0)
.addReg(TrueReg)
.addReg(SReg);
} else if (Cond.size() == 2) {
assert(Cond[0].isImm() && "Cond[0] is not an immediate");
switch (Cond[0].getImm()) {
case SIInstrInfo::SCC_TRUE: {
- unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
- BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
+ unsigned SReg = MRI.createVirtualRegister(BoolXExecRC);
+ BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
+ : AMDGPU::S_CSELECT_B64), SReg)
.addImm(-1)
.addImm(0);
BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+ .addImm(0)
.addReg(FalseReg)
+ .addImm(0)
.addReg(TrueReg)
.addReg(SReg);
break;
}
case SIInstrInfo::SCC_FALSE: {
- unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
- BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
+ unsigned SReg = MRI.createVirtualRegister(BoolXExecRC);
+ BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
+ : AMDGPU::S_CSELECT_B64), SReg)
.addImm(0)
.addImm(-1);
BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+ .addImm(0)
.addReg(FalseReg)
+ .addImm(0)
.addReg(TrueReg)
.addReg(SReg);
break;
@@ -735,11 +850,13 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
case SIInstrInfo::VCCNZ: {
MachineOperand RegOp = Cond[1];
RegOp.setImplicit(false);
- unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+ unsigned SReg = MRI.createVirtualRegister(BoolXExecRC);
BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
.add(RegOp);
BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+ .addImm(0)
.addReg(FalseReg)
+ .addImm(0)
.addReg(TrueReg)
.addReg(SReg);
break;
@@ -747,39 +864,49 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
case SIInstrInfo::VCCZ: {
MachineOperand RegOp = Cond[1];
RegOp.setImplicit(false);
- unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+ unsigned SReg = MRI.createVirtualRegister(BoolXExecRC);
BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
.add(RegOp);
BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+ .addImm(0)
.addReg(TrueReg)
+ .addImm(0)
.addReg(FalseReg)
.addReg(SReg);
break;
}
case SIInstrInfo::EXECNZ: {
- unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
- unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
- BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
+ unsigned SReg = MRI.createVirtualRegister(BoolXExecRC);
+ unsigned SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
+ BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
+ : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
.addImm(0);
- BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
+ BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
+ : AMDGPU::S_CSELECT_B64), SReg)
.addImm(-1)
.addImm(0);
BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+ .addImm(0)
.addReg(FalseReg)
+ .addImm(0)
.addReg(TrueReg)
.addReg(SReg);
break;
}
case SIInstrInfo::EXECZ: {
- unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
- unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
- BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
+ unsigned SReg = MRI.createVirtualRegister(BoolXExecRC);
+ unsigned SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
+ BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
+ : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
.addImm(0);
- BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
+ BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
+ : AMDGPU::S_CSELECT_B64), SReg)
.addImm(0)
.addImm(-1);
BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+ .addImm(0)
.addReg(FalseReg)
+ .addImm(0)
.addReg(TrueReg)
.addReg(SReg);
llvm_unreachable("Unhandled branch predicate EXECZ");
@@ -798,7 +925,7 @@ unsigned SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
const DebugLoc &DL,
unsigned SrcReg, int Value) const {
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
- unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned Reg = MRI.createVirtualRegister(RI.getBoolRC());
BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
.addImm(Value)
.addReg(SrcReg);
@@ -811,7 +938,7 @@ unsigned SIInstrInfo::insertNE(MachineBasicBlock *MBB,
const DebugLoc &DL,
unsigned SrcReg, int Value) const {
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
- unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned Reg = MRI.createVirtualRegister(RI.getBoolRC());
BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
.addImm(Value)
.addReg(SrcReg);
@@ -821,6 +948,8 @@ unsigned SIInstrInfo::insertNE(MachineBasicBlock *MBB,
unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
+ if (RI.hasAGPRs(DstRC))
+ return AMDGPU::COPY;
if (RI.getRegSizeInBits(*DstRC) == 32) {
return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
} else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) {
@@ -837,12 +966,18 @@ static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
return AMDGPU::SI_SPILL_S32_SAVE;
case 8:
return AMDGPU::SI_SPILL_S64_SAVE;
+ case 12:
+ return AMDGPU::SI_SPILL_S96_SAVE;
case 16:
return AMDGPU::SI_SPILL_S128_SAVE;
+ case 20:
+ return AMDGPU::SI_SPILL_S160_SAVE;
case 32:
return AMDGPU::SI_SPILL_S256_SAVE;
case 64:
return AMDGPU::SI_SPILL_S512_SAVE;
+ case 128:
+ return AMDGPU::SI_SPILL_S1024_SAVE;
default:
llvm_unreachable("unknown register size");
}
@@ -858,10 +993,31 @@ static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
return AMDGPU::SI_SPILL_V96_SAVE;
case 16:
return AMDGPU::SI_SPILL_V128_SAVE;
+ case 20:
+ return AMDGPU::SI_SPILL_V160_SAVE;
case 32:
return AMDGPU::SI_SPILL_V256_SAVE;
case 64:
return AMDGPU::SI_SPILL_V512_SAVE;
+ case 128:
+ return AMDGPU::SI_SPILL_V1024_SAVE;
+ default:
+ llvm_unreachable("unknown register size");
+ }
+}
+
+static unsigned getAGPRSpillSaveOpcode(unsigned Size) {
+ switch (Size) {
+ case 4:
+ return AMDGPU::SI_SPILL_A32_SAVE;
+ case 8:
+ return AMDGPU::SI_SPILL_A64_SAVE;
+ case 16:
+ return AMDGPU::SI_SPILL_A128_SAVE;
+ case 64:
+ return AMDGPU::SI_SPILL_A512_SAVE;
+ case 128:
+ return AMDGPU::SI_SPILL_A1024_SAVE;
default:
llvm_unreachable("unknown register size");
}
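
AGPR spills get their own pseudo opcodes, selected by register size in bytes just like the SGPR and VGPR variants, with unknown sizes unreachable. A sketch of the same dispatch with the opcode represented as a string; the set of supported AGPR sizes in the hunk above is 4, 8, 16, 64 and 128 bytes.

#include <cstdlib>
#include <string>

// Illustrative mapping from spill size in bytes to a save pseudo name.
std::string agprSpillSaveOpcodeSketch(unsigned SizeInBytes) {
  switch (SizeInBytes) {
  case 4:   return "SI_SPILL_A32_SAVE";
  case 8:   return "SI_SPILL_A64_SAVE";
  case 16:  return "SI_SPILL_A128_SAVE";
  case 64:  return "SI_SPILL_A512_SAVE";
  case 128: return "SI_SPILL_A1024_SAVE";
  default:  std::abort();            // mirrors llvm_unreachable in the source
  }
}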
@@ -906,12 +1062,12 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
.addFrameIndex(FrameIndex) // addr
.addMemOperand(MMO)
.addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
- .addReg(MFI->getFrameOffsetReg(), RegState::Implicit);
+ .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
// Add the scratch resource registers as implicit uses because we may end up
// needing them, and need to ensure that the reserved registers are
// correctly handled.
-
- FrameInfo.setStackID(FrameIndex, SIStackID::SGPR_SPILL);
+ if (RI.spillSGPRToVGPR())
+ FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
if (ST.hasScalarStores()) {
// m0 is used for offset to scalar stores if used to spill.
Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead);
@@ -920,17 +1076,22 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
return;
}
- assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
-
- unsigned Opcode = getVGPRSpillSaveOpcode(SpillSize);
+ unsigned Opcode = RI.hasAGPRs(RC) ? getAGPRSpillSaveOpcode(SpillSize)
+ : getVGPRSpillSaveOpcode(SpillSize);
MFI->setHasSpilledVGPRs();
- BuildMI(MBB, MI, DL, get(Opcode))
- .addReg(SrcReg, getKillRegState(isKill)) // data
- .addFrameIndex(FrameIndex) // addr
- .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
- .addReg(MFI->getFrameOffsetReg()) // scratch_offset
- .addImm(0) // offset
- .addMemOperand(MMO);
+
+ auto MIB = BuildMI(MBB, MI, DL, get(Opcode));
+ if (RI.hasAGPRs(RC)) {
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ MIB.addReg(Tmp, RegState::Define);
+ }
+ MIB.addReg(SrcReg, getKillRegState(isKill)) // data
+ .addFrameIndex(FrameIndex) // addr
+ .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
+ .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
+ .addImm(0) // offset
+ .addMemOperand(MMO);
}
static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
@@ -939,12 +1100,18 @@ static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
return AMDGPU::SI_SPILL_S32_RESTORE;
case 8:
return AMDGPU::SI_SPILL_S64_RESTORE;
+ case 12:
+ return AMDGPU::SI_SPILL_S96_RESTORE;
case 16:
return AMDGPU::SI_SPILL_S128_RESTORE;
+ case 20:
+ return AMDGPU::SI_SPILL_S160_RESTORE;
case 32:
return AMDGPU::SI_SPILL_S256_RESTORE;
case 64:
return AMDGPU::SI_SPILL_S512_RESTORE;
+ case 128:
+ return AMDGPU::SI_SPILL_S1024_RESTORE;
default:
llvm_unreachable("unknown register size");
}
@@ -960,10 +1127,31 @@ static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
return AMDGPU::SI_SPILL_V96_RESTORE;
case 16:
return AMDGPU::SI_SPILL_V128_RESTORE;
+ case 20:
+ return AMDGPU::SI_SPILL_V160_RESTORE;
case 32:
return AMDGPU::SI_SPILL_V256_RESTORE;
case 64:
return AMDGPU::SI_SPILL_V512_RESTORE;
+ case 128:
+ return AMDGPU::SI_SPILL_V1024_RESTORE;
+ default:
+ llvm_unreachable("unknown register size");
+ }
+}
+
+static unsigned getAGPRSpillRestoreOpcode(unsigned Size) {
+ switch (Size) {
+ case 4:
+ return AMDGPU::SI_SPILL_A32_RESTORE;
+ case 8:
+ return AMDGPU::SI_SPILL_A64_RESTORE;
+ case 16:
+ return AMDGPU::SI_SPILL_A128_RESTORE;
+ case 64:
+ return AMDGPU::SI_SPILL_A512_RESTORE;
+ case 128:
+ return AMDGPU::SI_SPILL_A1024_RESTORE;
default:
llvm_unreachable("unknown register size");
}
@@ -999,12 +1187,13 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass);
}
- FrameInfo.setStackID(FrameIndex, SIStackID::SGPR_SPILL);
+ if (RI.spillSGPRToVGPR())
+ FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc, DestReg)
.addFrameIndex(FrameIndex) // addr
.addMemOperand(MMO)
.addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
- .addReg(MFI->getFrameOffsetReg(), RegState::Implicit);
+ .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
if (ST.hasScalarStores()) {
// m0 is used for offset to scalar stores if used to spill.
@@ -1014,15 +1203,19 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
return;
}
- assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
-
- unsigned Opcode = getVGPRSpillRestoreOpcode(SpillSize);
- BuildMI(MBB, MI, DL, get(Opcode), DestReg)
- .addFrameIndex(FrameIndex) // vaddr
- .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
- .addReg(MFI->getFrameOffsetReg()) // scratch_offset
- .addImm(0) // offset
- .addMemOperand(MMO);
+ unsigned Opcode = RI.hasAGPRs(RC) ? getAGPRSpillRestoreOpcode(SpillSize)
+ : getVGPRSpillRestoreOpcode(SpillSize);
+ auto MIB = BuildMI(MBB, MI, DL, get(Opcode), DestReg);
+ if (RI.hasAGPRs(RC)) {
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ MIB.addReg(Tmp, RegState::Define);
+ }
+ MIB.addFrameIndex(FrameIndex) // vaddr
+ .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
+ .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
+ .addImm(0) // offset
+ .addMemOperand(MMO);
}
/// \param @Offset Offset in bytes of the FrameIndex being spilled
@@ -1089,7 +1282,8 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(
// (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROPUS.Y * TIDIG.X)) + TIDIG.Z
getAddNoCarry(Entry, Insert, DL, TIDReg)
.addReg(TIDReg)
- .addReg(TIDIGZReg);
+ .addReg(TIDIGZReg)
+ .addImm(0); // clamp bit
} else {
// Get the wave id
BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
@@ -1114,7 +1308,8 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(
unsigned LDSOffset = MFI->getLDSSize() + (FrameOffset * WorkGroupSize);
getAddNoCarry(MBB, MI, DL, TmpReg)
.addImm(LDSOffset)
- .addReg(TIDReg);
+ .addReg(TIDReg)
+ .addImm(0); // clamp bit
return TmpReg;
}
@@ -1148,13 +1343,17 @@ void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
if (MBB.succ_empty()) {
bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
- if (HasNoTerminator)
- BuildMI(MBB, MBB.end(), DebugLoc(),
- get(Info->returnsVoid() ? AMDGPU::S_ENDPGM : AMDGPU::SI_RETURN_TO_EPILOG));
+ if (HasNoTerminator) {
+ if (Info->returnsVoid()) {
+ BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);
+ } else {
+ BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));
+ }
+ }
}
}
-unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) const {
+unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
switch (MI.getOpcode()) {
default: return 1; // FIXME: Do wait states equal cycles?
@@ -1174,18 +1373,42 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.setDesc(get(AMDGPU::S_MOV_B64));
break;
+ case AMDGPU::S_MOV_B32_term:
+ // This is only a terminator to get the correct spill code placement during
+ // register allocation.
+ MI.setDesc(get(AMDGPU::S_MOV_B32));
+ break;
+
case AMDGPU::S_XOR_B64_term:
// This is only a terminator to get the correct spill code placement during
// register allocation.
MI.setDesc(get(AMDGPU::S_XOR_B64));
break;
+ case AMDGPU::S_XOR_B32_term:
+ // This is only a terminator to get the correct spill code placement during
+ // register allocation.
+ MI.setDesc(get(AMDGPU::S_XOR_B32));
+ break;
+
+ case AMDGPU::S_OR_B32_term:
+ // This is only a terminator to get the correct spill code placement during
+ // register allocation.
+ MI.setDesc(get(AMDGPU::S_OR_B32));
+ break;
+
case AMDGPU::S_ANDN2_B64_term:
// This is only a terminator to get the correct spill code placement during
// register allocation.
MI.setDesc(get(AMDGPU::S_ANDN2_B64));
break;
+ case AMDGPU::S_ANDN2_B32_term:
+ // This is only a terminator to get the correct spill code placement during
+ // register allocation.
+ MI.setDesc(get(AMDGPU::S_ANDN2_B32));
+ break;
+
case AMDGPU::V_MOV_B64_PSEUDO: {
unsigned Dst = MI.getOperand(0).getReg();
unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
@@ -1215,24 +1438,28 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
break;
}
case AMDGPU::V_SET_INACTIVE_B32: {
- BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
- .addReg(AMDGPU::EXEC);
+ unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
+ unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+ BuildMI(MBB, MI, DL, get(NotOpc), Exec)
+ .addReg(Exec);
BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
.add(MI.getOperand(2));
- BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
- .addReg(AMDGPU::EXEC);
+ BuildMI(MBB, MI, DL, get(NotOpc), Exec)
+ .addReg(Exec);
MI.eraseFromParent();
break;
}
case AMDGPU::V_SET_INACTIVE_B64: {
- BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
- .addReg(AMDGPU::EXEC);
+ unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
+ unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+ BuildMI(MBB, MI, DL, get(NotOpc), Exec)
+ .addReg(Exec);
MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
MI.getOperand(0).getReg())
.add(MI.getOperand(2));
expandPostRAPseudo(*Copy);
- BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
- .addReg(AMDGPU::EXEC);
+ BuildMI(MBB, MI, DL, get(NotOpc), Exec)
+ .addReg(Exec);
MI.eraseFromParent();
break;
}
@@ -1282,10 +1509,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
.addReg(RegHi);
- if (MI.getOperand(2).getTargetFlags() == SIInstrInfo::MO_NONE)
- MIB.addImm(0);
- else
- MIB.add(MI.getOperand(2));
+ MIB.add(MI.getOperand(2));
Bundler.append(MIB);
finalizeBundle(MBB, Bundler.begin());
@@ -1293,10 +1517,17 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.eraseFromParent();
break;
}
+ case AMDGPU::ENTER_WWM: {
+ // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
+ // WWM is entered.
+ MI.setDesc(get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
+ : AMDGPU::S_OR_SAVEEXEC_B64));
+ break;
+ }
case AMDGPU::EXIT_WWM: {
- // This only gets its own opcode so that SIFixWWMLiveness can tell when WWM
- // is exited.
- MI.setDesc(get(AMDGPU::S_MOV_B64));
+ // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
+ // WWM is exited.
+ MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
break;
}
case TargetOpcode::BUNDLE: {
@@ -1492,7 +1723,7 @@ unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
.addReg(PCReg, RegState::Define, AMDGPU::sub0)
.addReg(PCReg, 0, AMDGPU::sub0)
- .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_FORWARD);
+ .addMBB(&DestBB, MO_LONG_BRANCH_FORWARD);
BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
.addReg(PCReg, RegState::Define, AMDGPU::sub1)
.addReg(PCReg, 0, AMDGPU::sub1)
@@ -1502,7 +1733,7 @@ unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
BuildMI(MBB, I, DL, get(AMDGPU::S_SUB_U32))
.addReg(PCReg, RegState::Define, AMDGPU::sub0)
.addReg(PCReg, 0, AMDGPU::sub0)
- .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_BACKWARD);
+ .addMBB(&DestBB, MO_LONG_BRANCH_BACKWARD);
BuildMI(MBB, I, DL, get(AMDGPU::S_SUBB_U32))
.addReg(PCReg, RegState::Define, AMDGPU::sub1)
.addReg(PCReg, 0, AMDGPU::sub1)
@@ -1659,6 +1890,10 @@ bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
case AMDGPU::S_MOV_B64_term:
case AMDGPU::S_XOR_B64_term:
case AMDGPU::S_ANDN2_B64_term:
+ case AMDGPU::S_MOV_B32_term:
+ case AMDGPU::S_XOR_B32_term:
+ case AMDGPU::S_OR_B32_term:
+ case AMDGPU::S_ANDN2_B32_term:
break;
case AMDGPU::SI_IF:
case AMDGPU::SI_ELSE:
@@ -1826,7 +2061,7 @@ bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
// Limit to equal cost for branch vs. N v_cndmask_b32s.
- return !RI.isSGPRClass(RC) && NumInsts <= 6;
+ return RI.hasVGPRs(RC) && NumInsts <= 6;
}
case SCC_TRUE:
case SCC_FALSE: {
@@ -1907,14 +2142,18 @@ void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
const int16_t *SubIndices = Sub0_15;
int NElts = DstSize / 32;
- // 64-bit select is only avaialble for SALU.
+ // 64-bit select is only available for SALU.
+ // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
if (Pred == SCC_TRUE) {
- SelOp = AMDGPU::S_CSELECT_B64;
- EltRC = &AMDGPU::SGPR_64RegClass;
- SubIndices = Sub0_15_64;
-
- assert(NElts % 2 == 0);
- NElts /= 2;
+ if (NElts % 2) {
+ SelOp = AMDGPU::S_CSELECT_B32;
+ EltRC = &AMDGPU::SGPR_32RegClass;
+ } else {
+ SelOp = AMDGPU::S_CSELECT_B64;
+ EltRC = &AMDGPU::SGPR_64RegClass;
+ SubIndices = Sub0_15_64;
+ NElts /= 2;
+ }
}
MachineInstrBuilder MIB = BuildMI(
@@ -1934,6 +2173,7 @@ void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
.addReg(FalseReg, 0, SubIdx)
.addReg(TrueReg, 0, SubIdx);
preserveCondRegFlags(Select->getOperand(3), Cond[1]);
+ fixImplicitOperands(*Select);
MIB.addReg(DstElt)
.addImm(SubIdx);
@@ -1955,6 +2195,8 @@ bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) const {
case AMDGPU::S_MOV_B32:
case AMDGPU::S_MOV_B64:
case AMDGPU::COPY:
+ case AMDGPU::V_ACCVGPR_WRITE_B32:
+ case AMDGPU::V_ACCVGPR_READ_B32:
return true;
default:
return false;
@@ -2007,6 +2249,7 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
case AMDGPU::V_MOV_B32_e32:
case AMDGPU::S_MOV_B32:
+ case AMDGPU::V_ACCVGPR_WRITE_B32:
break;
}
@@ -2020,6 +2263,11 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
if (Opc == AMDGPU::COPY) {
bool isVGPRCopy = RI.isVGPR(*MRI, UseMI.getOperand(0).getReg());
unsigned NewOpc = isVGPRCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
+ if (RI.isAGPR(*MRI, UseMI.getOperand(0).getReg())) {
+ if (!isInlineConstant(*ImmOp, AMDGPU::OPERAND_REG_INLINE_AC_INT32))
+ return false;
+ NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32;
+ }
UseMI.setDesc(get(NewOpc));
UseMI.getOperand(1).ChangeToImmediate(ImmOp->getImm());
UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent());
@@ -2027,7 +2275,9 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
}
if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 ||
- Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64) {
+ Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64 ||
+ Opc == AMDGPU::V_FMA_F32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
+ Opc == AMDGPU::V_FMA_F16 || Opc == AMDGPU::V_FMAC_F16_e64) {
// Don't fold if we are using source or output modifiers. The new VOP2
// instructions don't have them.
if (hasAnyModifiersSet(UseMI))
@@ -2042,7 +2292,10 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
if (isInlineConstant(UseMI, *Src0, *ImmOp))
return false;
- bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64;
+ bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 ||
+ Opc == AMDGPU::V_FMA_F32 || Opc == AMDGPU::V_FMAC_F32_e64;
+ bool IsFMA = Opc == AMDGPU::V_FMA_F32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
+ Opc == AMDGPU::V_FMA_F16 || Opc == AMDGPU::V_FMAC_F16_e64;
MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
@@ -2055,6 +2308,12 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
return false;
+ unsigned NewOpc =
+ IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32 : AMDGPU::V_FMAMK_F16)
+ : (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16);
+ if (pseudoToMCOpcode(NewOpc) == -1)
+ return false;
+
// We need to swap operands 0 and 1 since madmk constant is at operand 1.
const int64_t Imm = ImmOp->getImm();
@@ -2075,14 +2334,16 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
Src0->setIsKill(Src1->isKill());
if (Opc == AMDGPU::V_MAC_F32_e64 ||
- Opc == AMDGPU::V_MAC_F16_e64)
+ Opc == AMDGPU::V_MAC_F16_e64 ||
+ Opc == AMDGPU::V_FMAC_F32_e64 ||
+ Opc == AMDGPU::V_FMAC_F16_e64)
UseMI.untieRegOperand(
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
Src1->ChangeToImmediate(Imm);
removeModOperands(UseMI);
- UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16));
+ UseMI.setDesc(get(NewOpc));
bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
if (DeleteDef)
@@ -2107,9 +2368,11 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
Src0->ChangeToImmediate(Def->getOperand(1).getImm());
Src0Inlined = true;
} else if ((RI.isPhysicalRegister(Src0->getReg()) &&
- RI.isSGPRClass(RI.getPhysRegClass(Src0->getReg()))) ||
+ (ST.getConstantBusLimit(Opc) <= 1 &&
+ RI.isSGPRClass(RI.getPhysRegClass(Src0->getReg())))) ||
(RI.isVirtualRegister(Src0->getReg()) &&
- RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))
+ (ST.getConstantBusLimit(Opc) <= 1 &&
+ RI.isSGPRClass(MRI->getRegClass(Src0->getReg())))))
return false;
// VGPR is okay as Src0 - fallthrough
}
@@ -2130,6 +2393,12 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
// VGPR is okay as Src1 - fallthrough
}
+ unsigned NewOpc =
+ IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32 : AMDGPU::V_FMAAK_F16)
+ : (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16);
+ if (pseudoToMCOpcode(NewOpc) == -1)
+ return false;
+
const int64_t Imm = ImmOp->getImm();
// FIXME: This would be a lot easier if we could return a new instruction
@@ -2142,7 +2411,9 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
if (Opc == AMDGPU::V_MAC_F32_e64 ||
- Opc == AMDGPU::V_MAC_F16_e64)
+ Opc == AMDGPU::V_MAC_F16_e64 ||
+ Opc == AMDGPU::V_FMAC_F32_e64 ||
+ Opc == AMDGPU::V_FMAC_F16_e64)
UseMI.untieRegOperand(
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
@@ -2151,7 +2422,11 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
// These come before src2.
removeModOperands(UseMI);
- UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16));
+ UseMI.setDesc(get(NewOpc));
+ // It might happen that UseMI was commuted and we now have an SGPR as
+ // SRC1. If so, the inline constant and the SGPR together would be
+ // illegal.
+ legalizeOperands(UseMI);
bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
if (DeleteDef)
@@ -2172,9 +2447,9 @@ static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
return LowOffset + LowWidth <= HighOffset;
}
-bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr &MIa,
- MachineInstr &MIb) const {
- MachineOperand *BaseOp0, *BaseOp1;
+bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
+ const MachineInstr &MIb) const {
+ const MachineOperand *BaseOp0, *BaseOp1;
int64_t Offset0, Offset1;
if (getMemOperandWithOffset(MIa, BaseOp0, Offset0, &RI) &&
@@ -2196,8 +2471,8 @@ bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr &MIa,
return false;
}
-bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr &MIa,
- MachineInstr &MIb,
+bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
+ const MachineInstr &MIb,
AliasAnalysis *AA) const {
assert((MIa.mayLoad() || MIa.mayStore()) &&
"MIa must load from or modify a memory location");
@@ -2211,17 +2486,6 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr &MIa,
if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
return false;
- if (AA && MIa.hasOneMemOperand() && MIb.hasOneMemOperand()) {
- const MachineMemOperand *MMOa = *MIa.memoperands_begin();
- const MachineMemOperand *MMOb = *MIb.memoperands_begin();
- if (MMOa->getValue() && MMOb->getValue()) {
- MemoryLocation LocA(MMOa->getValue(), MMOa->getSize(), MMOa->getAAInfo());
- MemoryLocation LocB(MMOb->getValue(), MMOb->getSize(), MMOb->getAAInfo());
- if (!AA->alias(LocA, LocB))
- return true;
- }
- }
-
// TODO: Should we check the address space from the MachineMemOperand? That
// would allow us to distinguish objects we know don't alias based on the
// underlying address space, even if it was lowered to a different one,
@@ -2275,18 +2539,21 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
LiveVariables *LV) const {
unsigned Opc = MI.getOpcode();
bool IsF16 = false;
- bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64;
+ bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
+ Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64;
switch (Opc) {
default:
return nullptr;
case AMDGPU::V_MAC_F16_e64:
+ case AMDGPU::V_FMAC_F16_e64:
IsF16 = true;
LLVM_FALLTHROUGH;
case AMDGPU::V_MAC_F32_e64:
case AMDGPU::V_FMAC_F32_e64:
break;
case AMDGPU::V_MAC_F16_e32:
+ case AMDGPU::V_FMAC_F16_e32:
IsF16 = true;
LLVM_FALLTHROUGH;
case AMDGPU::V_MAC_F32_e32:
@@ -2315,30 +2582,38 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
- if (!IsFMA && !Src0Mods && !Src1Mods && !Clamp && !Omod &&
+ if (!Src0Mods && !Src1Mods && !Clamp && !Omod &&
// If we have an SGPR input, we will violate the constant bus restriction.
- (!Src0->isReg() || !RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg()))) {
+ (ST.getConstantBusLimit(Opc) > 1 ||
+ !Src0->isReg() ||
+ !RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg()))) {
if (auto Imm = getFoldableImm(Src2)) {
- return BuildMI(*MBB, MI, MI.getDebugLoc(),
- get(IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32))
- .add(*Dst)
- .add(*Src0)
- .add(*Src1)
- .addImm(Imm);
+ unsigned NewOpc =
+ IsFMA ? (IsF16 ? AMDGPU::V_FMAAK_F16 : AMDGPU::V_FMAAK_F32)
+ : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32);
+ if (pseudoToMCOpcode(NewOpc) != -1)
+ return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
+ .add(*Dst)
+ .add(*Src0)
+ .add(*Src1)
+ .addImm(Imm);
}
+ unsigned NewOpc =
+ IsFMA ? (IsF16 ? AMDGPU::V_FMAMK_F16 : AMDGPU::V_FMAMK_F32)
+ : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
if (auto Imm = getFoldableImm(Src1)) {
- return BuildMI(*MBB, MI, MI.getDebugLoc(),
- get(IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32))
- .add(*Dst)
- .add(*Src0)
- .addImm(Imm)
- .add(*Src2);
+ if (pseudoToMCOpcode(NewOpc) != -1)
+ return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
+ .add(*Dst)
+ .add(*Src0)
+ .addImm(Imm)
+ .add(*Src2);
}
if (auto Imm = getFoldableImm(Src0)) {
- if (isOperandLegal(MI, AMDGPU::getNamedOperandIdx(AMDGPU::V_MADMK_F32,
+ if (pseudoToMCOpcode(NewOpc) != -1 &&
+ isOperandLegal(MI, AMDGPU::getNamedOperandIdx(NewOpc,
AMDGPU::OpName::src0), Src1))
- return BuildMI(*MBB, MI, MI.getDebugLoc(),
- get(IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32))
+ return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
.add(*Dst)
.add(*Src1)
.addImm(Imm)
@@ -2346,9 +2621,11 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
}
}
- assert((!IsFMA || !IsF16) && "fmac only expected with f32");
- unsigned NewOpc = IsFMA ? AMDGPU::V_FMA_F32 :
- (IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32);
+ unsigned NewOpc = IsFMA ? (IsF16 ? AMDGPU::V_FMA_F16 : AMDGPU::V_FMA_F32)
+ : (IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32);
+ if (pseudoToMCOpcode(NewOpc) == -1)
+ return nullptr;
+
return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
.add(*Dst)
.addImm(Src0Mods ? Src0Mods->getImm() : 0)
@@ -2390,12 +2667,26 @@ bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
changesVGPRIndexingMode(MI);
}
+bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
+ return Opcode == AMDGPU::DS_ORDERED_COUNT ||
+ Opcode == AMDGPU::DS_GWS_INIT ||
+ Opcode == AMDGPU::DS_GWS_SEMA_V ||
+ Opcode == AMDGPU::DS_GWS_SEMA_BR ||
+ Opcode == AMDGPU::DS_GWS_SEMA_P ||
+ Opcode == AMDGPU::DS_GWS_SEMA_RELEASE_ALL ||
+ Opcode == AMDGPU::DS_GWS_BARRIER;
+}
+
bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
unsigned Opcode = MI.getOpcode();
if (MI.mayStore() && isSMRD(MI))
return true; // scalar store or atomic
+ // This will terminate the function when other lanes may need to continue.
+ if (MI.isReturn())
+ return true;
+
// These instructions cause shader I/O that may cause hardware lockups
// when executed with an empty EXEC mask.
//
@@ -2403,10 +2694,12 @@ bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
// EXEC = 0, but checking for that case here seems not worth it
// given the typical code patterns.
if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
- Opcode == AMDGPU::EXP || Opcode == AMDGPU::EXP_DONE)
+ Opcode == AMDGPU::EXP || Opcode == AMDGPU::EXP_DONE ||
+ Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::S_TRAP ||
+ Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_BARRIER)
return true;
- if (MI.isInlineAsm())
+ if (MI.isCall() || MI.isInlineAsm())
return true; // conservative assumption
// These are like SALU instructions in terms of effects, so it's questionable
@@ -2420,8 +2713,36 @@ bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
return false;
}
+bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI,
+ const MachineInstr &MI) const {
+ if (MI.isMetaInstruction())
+ return false;
+
+ // This won't read exec if this is an SGPR->SGPR copy.
+ if (MI.isCopyLike()) {
+ if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg()))
+ return true;
+
+ // Make sure this isn't copying exec as a normal operand
+ return MI.readsRegister(AMDGPU::EXEC, &RI);
+ }
+
+ // Make a conservative assumption about the callee.
+ if (MI.isCall())
+ return true;
+
+ // Be conservative with any unhandled generic opcodes.
+ if (!isTargetSpecificOpcode(MI.getOpcode()))
+ return true;
+
+ return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI);
+}
+
bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
switch (Imm.getBitWidth()) {
+ case 1: // This likely will be a condition code mask.
+ return true;
+
case 32:
return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
ST.hasInv2PiInlineImm());
@@ -2454,7 +2775,9 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
case AMDGPU::OPERAND_REG_IMM_INT32:
case AMDGPU::OPERAND_REG_IMM_FP32:
case AMDGPU::OPERAND_REG_INLINE_C_INT32:
- case AMDGPU::OPERAND_REG_INLINE_C_FP32: {
+ case AMDGPU::OPERAND_REG_INLINE_C_FP32:
+ case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP32: {
int32_t Trunc = static_cast<int32_t>(Imm);
return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
}
@@ -2467,7 +2790,9 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
case AMDGPU::OPERAND_REG_IMM_INT16:
case AMDGPU::OPERAND_REG_IMM_FP16:
case AMDGPU::OPERAND_REG_INLINE_C_INT16:
- case AMDGPU::OPERAND_REG_INLINE_C_FP16: {
+ case AMDGPU::OPERAND_REG_INLINE_C_FP16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP16: {
if (isInt<16>(Imm) || isUInt<16>(Imm)) {
// A few special case instructions have 16-bit operands on subtargets
// where 16-bit instructions are not legal.
@@ -2480,19 +2805,14 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
return false;
}
+ case AMDGPU::OPERAND_REG_IMM_V2INT16:
+ case AMDGPU::OPERAND_REG_IMM_V2FP16:
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
- case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: {
- if (isUInt<16>(Imm)) {
- int16_t Trunc = static_cast<int16_t>(Imm);
- return ST.has16BitInsts() &&
- AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm());
- }
- if (!(Imm & 0xffff)) {
- return ST.has16BitInsts() &&
- AMDGPU::isInlinableLiteral16(Imm >> 16, ST.hasInv2PiInlineImm());
- }
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: {
uint32_t Trunc = static_cast<uint32_t>(Imm);
- return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm());
+ return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm());
}
default:
llvm_unreachable("invalid bitwidth");
@@ -2534,9 +2854,10 @@ static bool compareMachineOp(const MachineOperand &Op0,
bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
const MachineOperand &MO) const {
- const MCOperandInfo &OpInfo = get(MI.getOpcode()).OpInfo[OpNo];
+ const MCInstrDesc &InstDesc = MI.getDesc();
+ const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpNo];
- assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
+ assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
return true;
@@ -2547,7 +2868,15 @@ bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
if (MO.isImm() && isInlineConstant(MO, OpInfo))
return RI.opCanUseInlineConstant(OpInfo.OperandType);
- return RI.opCanUseLiteralConstant(OpInfo.OperandType);
+ if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
+ return false;
+
+ if (!isVOP3(MI) || !AMDGPU::isSISrcOperand(InstDesc, OpNo))
+ return true;
+
+ const MachineFunction *MF = MI.getParent()->getParent();
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+ return ST.hasVOP3Literal();
}
bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
@@ -2586,7 +2915,8 @@ bool SIInstrInfo::canShrink(const MachineInstr &MI,
// Can't shrink instruction with three operands.
// FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add
// a special case for it. It can only be shrunk if the third operand
- // is vcc. We should handle this the same way we handle vopc, by addding
+ // is vcc, and src0_modifiers and src1_modifiers are not set.
+ // We should handle this the same way we handle vopc, by adding
// a register allocation hint pre-regalloc and then do the shrinking
// post-regalloc.
if (Src2) {
@@ -2606,6 +2936,7 @@ bool SIInstrInfo::canShrink(const MachineInstr &MI,
case AMDGPU::V_MAC_F32_e64:
case AMDGPU::V_MAC_F16_e64:
case AMDGPU::V_FMAC_F32_e64:
+ case AMDGPU::V_FMAC_F16_e64:
if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
return false;
@@ -2662,7 +2993,8 @@ MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
// dst
Inst32.add(MI.getOperand(0));
} else {
- assert(MI.getOperand(0).getReg() == AMDGPU::VCC &&
+ assert(((MI.getOperand(0).getReg() == AMDGPU::VCC) ||
+ (MI.getOperand(0).getReg() == AMDGPU::VCC_LO)) &&
"Unexpected case");
}
@@ -2707,19 +3039,19 @@ bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
if (TargetRegisterInfo::isVirtualRegister(MO.getReg()))
return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));
- // FLAT_SCR is just an SGPR pair.
- if (!MO.isImplicit() && (MO.getReg() == AMDGPU::FLAT_SCR))
- return true;
-
- // EXEC register uses the constant bus.
- if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC)
- return true;
+ // Null is free
+ if (MO.getReg() == AMDGPU::SGPR_NULL)
+ return false;
// SGPRs use the constant bus
- return (MO.getReg() == AMDGPU::VCC || MO.getReg() == AMDGPU::M0 ||
- (!MO.isImplicit() &&
- (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) ||
- AMDGPU::SGPR_64RegClass.contains(MO.getReg()))));
+ if (MO.isImplicit()) {
+ return MO.getReg() == AMDGPU::M0 ||
+ MO.getReg() == AMDGPU::VCC ||
+ MO.getReg() == AMDGPU::VCC_LO;
+ } else {
+ return AMDGPU::SReg_32RegClass.contains(MO.getReg()) ||
+ AMDGPU::SReg_64RegClass.contains(MO.getReg());
+ }
}
static unsigned findImplicitSGPRRead(const MachineInstr &MI) {
@@ -2730,6 +3062,8 @@ static unsigned findImplicitSGPRRead(const MachineInstr &MI) {
switch (MO.getReg()) {
case AMDGPU::VCC:
+ case AMDGPU::VCC_LO:
+ case AMDGPU::VCC_HI:
case AMDGPU::M0:
case AMDGPU::FLAT_SCR:
return MO.getReg();
@@ -2746,10 +3080,12 @@ static bool shouldReadExec(const MachineInstr &MI) {
if (SIInstrInfo::isVALU(MI)) {
switch (MI.getOpcode()) {
case AMDGPU::V_READLANE_B32:
- case AMDGPU::V_READLANE_B32_si:
+ case AMDGPU::V_READLANE_B32_gfx6_gfx7:
+ case AMDGPU::V_READLANE_B32_gfx10:
case AMDGPU::V_READLANE_B32_vi:
case AMDGPU::V_WRITELANE_B32:
- case AMDGPU::V_WRITELANE_B32_si:
+ case AMDGPU::V_WRITELANE_B32_gfx6_gfx7:
+ case AMDGPU::V_WRITELANE_B32_gfx10:
case AMDGPU::V_WRITELANE_B32_vi:
return false;
}
@@ -2830,7 +3166,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
switch (Desc.OpInfo[i].OperandType) {
case MCOI::OPERAND_REGISTER:
- if (MI.getOperand(i).isImm()) {
+ if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
ErrInfo = "Illegal immediate value for operand.";
return false;
}
@@ -2843,7 +3179,11 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
case AMDGPU::OPERAND_REG_INLINE_C_INT64:
case AMDGPU::OPERAND_REG_INLINE_C_FP64:
case AMDGPU::OPERAND_REG_INLINE_C_INT16:
- case AMDGPU::OPERAND_REG_INLINE_C_FP16: {
+ case AMDGPU::OPERAND_REG_INLINE_C_FP16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
+ case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP16: {
const MachineOperand &MO = MI.getOperand(i);
if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
ErrInfo = "Illegal immediate value for operand.";
@@ -3022,9 +3362,12 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1)
++ConstantBusCount;
+ SmallVector<unsigned, 2> SGPRsUsed;
unsigned SGPRUsed = findImplicitSGPRRead(MI);
- if (SGPRUsed != AMDGPU::NoRegister)
+ if (SGPRUsed != AMDGPU::NoRegister) {
++ConstantBusCount;
+ SGPRsUsed.push_back(SGPRUsed);
+ }
for (int OpIdx : OpIndices) {
if (OpIdx == -1)
@@ -3032,23 +3375,37 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
const MachineOperand &MO = MI.getOperand(OpIdx);
if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) {
if (MO.isReg()) {
- if (MO.getReg() != SGPRUsed)
- ++ConstantBusCount;
SGPRUsed = MO.getReg();
+ if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) {
+ return !RI.regsOverlap(SGPRUsed, SGPR);
+ })) {
+ ++ConstantBusCount;
+ SGPRsUsed.push_back(SGPRUsed);
+ }
} else {
++ConstantBusCount;
++LiteralCount;
}
}
}
- if (ConstantBusCount > 1) {
- ErrInfo = "VOP* instruction uses the constant bus more than once";
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+ // v_writelane_b32 is an exception to the constant bus restriction:
+ // vsrc0 can be an sgpr, const or m0, and the lane select can be an sgpr,
+ // m0 or an inline-const.
+ if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
+ Opcode != AMDGPU::V_WRITELANE_B32) {
+ ErrInfo = "VOP* instruction violates constant bus restriction";
return false;
}
if (isVOP3(MI) && LiteralCount) {
- ErrInfo = "VOP3 instruction uses literal";
- return false;
+ if (LiteralCount && !ST.hasVOP3Literal()) {
+ ErrInfo = "VOP3 instruction uses literal";
+ return false;
+ }
+ if (LiteralCount > 1) {
+ ErrInfo = "VOP3 instruction uses more than one literal";
+ return false;
+ }
}
}
@@ -3067,17 +3424,43 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
}
+ if (isSOP2(MI) || isSOPC(MI)) {
+ const MachineOperand &Src0 = MI.getOperand(Src0Idx);
+ const MachineOperand &Src1 = MI.getOperand(Src1Idx);
+ unsigned Immediates = 0;
+
+ if (!Src0.isReg() &&
+ !isInlineConstant(Src0, Desc.OpInfo[Src0Idx].OperandType))
+ Immediates++;
+ if (!Src1.isReg() &&
+ !isInlineConstant(Src1, Desc.OpInfo[Src1Idx].OperandType))
+ Immediates++;
+
+ if (Immediates > 1) {
+ ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";
+ return false;
+ }
+ }
+
if (isSOPK(MI)) {
- int64_t Imm = getNamedOperand(MI, AMDGPU::OpName::simm16)->getImm();
- if (sopkIsZext(MI)) {
- if (!isUInt<16>(Imm)) {
- ErrInfo = "invalid immediate for SOPK instruction";
+ auto Op = getNamedOperand(MI, AMDGPU::OpName::simm16);
+ if (Desc.isBranch()) {
+ if (!Op->isMBB()) {
+ ErrInfo = "invalid branch target for SOPK instruction";
return false;
}
} else {
- if (!isInt<16>(Imm)) {
- ErrInfo = "invalid immediate for SOPK instruction";
- return false;
+ uint64_t Imm = Op->getImm();
+ if (sopkIsZext(MI)) {
+ if (!isUInt<16>(Imm)) {
+ ErrInfo = "invalid immediate for SOPK instruction";
+ return false;
+ }
+ } else {
+ if (!isInt<16>(Imm)) {
+ ErrInfo = "invalid immediate for SOPK instruction";
+ return false;
+ }
}
}
}
@@ -3155,6 +3538,53 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
}
+ if (isMIMG(MI)) {
+ const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim);
+ if (DimOp) {
+ int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,
+ AMDGPU::OpName::vaddr0);
+ int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
+ const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode);
+ const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
+ AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
+ const AMDGPU::MIMGDimInfo *Dim =
+ AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm());
+
+ if (!Dim) {
+ ErrInfo = "dim is out of range";
+ return false;
+ }
+
+ bool IsNSA = SRsrcIdx - VAddr0Idx > 1;
+ unsigned AddrWords = BaseOpcode->NumExtraArgs +
+ (BaseOpcode->Gradients ? Dim->NumGradients : 0) +
+ (BaseOpcode->Coordinates ? Dim->NumCoords : 0) +
+ (BaseOpcode->LodOrClampOrMip ? 1 : 0);
+
+ unsigned VAddrWords;
+ if (IsNSA) {
+ VAddrWords = SRsrcIdx - VAddr0Idx;
+ } else {
+ const TargetRegisterClass *RC = getOpRegClass(MI, VAddr0Idx);
+ VAddrWords = MRI.getTargetRegisterInfo()->getRegSizeInBits(*RC) / 32;
+ if (AddrWords > 8)
+ AddrWords = 16;
+ else if (AddrWords > 4)
+ AddrWords = 8;
+ else if (AddrWords == 3 && VAddrWords == 4) {
+ // CodeGen uses the V4 variant of instructions for three addresses,
+ // because the selection DAG does not support non-power-of-two types.
+ AddrWords = 4;
+ }
+ }
+
+ if (VAddrWords != AddrWords) {
+ ErrInfo = "bad vaddr size";
+ return false;
+ }
+ }
+ }
+
const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
if (DppCt) {
using namespace AMDGPU::DPP;
@@ -3165,10 +3595,29 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
(DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
(DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
(DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
- (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST)) {
+ (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) ||
+ (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) {
ErrInfo = "Invalid dpp_ctrl value";
return false;
}
+ if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&
+ ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
+ ErrInfo = "Invalid dpp_ctrl value: "
+ "wavefront shifts are not supported on GFX10+";
+ return false;
+ }
+ if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
+ ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
+ ErrInfo = "Invalid dpp_ctrl value: "
+ "broadcats are not supported on GFX10+";
+ return false;
+ }
+ if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
+ ST.getGeneration() < AMDGPUSubtarget::GFX10) {
+ ErrInfo = "Invalid dpp_ctrl value: "
+ "row_share and row_xmask are not supported before GFX10";
+ return false;
+ }
}
return true;
@@ -3183,9 +3632,12 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
case AMDGPU::WQM: return AMDGPU::WQM;
case AMDGPU::WWM: return AMDGPU::WWM;
- case AMDGPU::S_MOV_B32:
- return MI.getOperand(1).isReg() ?
+ case AMDGPU::S_MOV_B32: {
+ const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+ return MI.getOperand(1).isReg() ||
+ RI.isAGPR(MRI, MI.getOperand(0).getReg()) ?
AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
+ }
case AMDGPU::S_ADD_I32:
return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_I32_e32;
case AMDGPU::S_ADDC_U32:
@@ -3199,7 +3651,9 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
case AMDGPU::S_SUB_U32:
return AMDGPU::V_SUB_I32_e32;
case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
- case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32;
+ case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32;
+ case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32;
+ case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32;
case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
@@ -3244,6 +3698,8 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
}
+ llvm_unreachable(
+ "Unexpected scalar opcode without corresponding vector one!");
}
const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
@@ -3263,30 +3719,21 @@ const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
return RI.getRegClass(RCID);
}
-bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const {
- switch (MI.getOpcode()) {
- case AMDGPU::COPY:
- case AMDGPU::REG_SEQUENCE:
- case AMDGPU::PHI:
- case AMDGPU::INSERT_SUBREG:
- return RI.hasVGPRs(getOpRegClass(MI, 0));
- default:
- return RI.hasVGPRs(getOpRegClass(MI, OpNo));
- }
-}
-
void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
MachineBasicBlock::iterator I = MI;
MachineBasicBlock *MBB = MI.getParent();
MachineOperand &MO = MI.getOperand(OpIdx);
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+ const SIRegisterInfo *TRI =
+ static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
unsigned RCID = get(MI.getOpcode()).OpInfo[OpIdx].RegClass;
const TargetRegisterClass *RC = RI.getRegClass(RCID);
- unsigned Opcode = AMDGPU::V_MOV_B32_e32;
+ unsigned Size = TRI->getRegSizeInBits(*RC);
+ unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO : AMDGPU::V_MOV_B32_e32;
if (MO.isReg())
Opcode = AMDGPU::COPY;
else if (RI.isSGPRClass(RC))
- Opcode = AMDGPU::S_MOV_B32;
+ Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC))
@@ -3396,37 +3843,53 @@ bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
return isLegalRegOperand(MRI, OpInfo, MO);
// Handle non-register types that are treated like immediates.
- assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
+ assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
return true;
}
bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
const MachineOperand *MO) const {
- const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+ const MachineFunction &MF = *MI.getParent()->getParent();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
const MCInstrDesc &InstDesc = MI.getDesc();
const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx];
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const TargetRegisterClass *DefinedRC =
OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
if (!MO)
MO = &MI.getOperand(OpIdx);
+ int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode());
+ int VOP3LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) {
+ if (isVOP3(MI) && isLiteralConstantLike(*MO, OpInfo) && !VOP3LiteralLimit--)
+ return false;
- RegSubRegPair SGPRUsed;
+ SmallDenseSet<RegSubRegPair> SGPRsUsed;
if (MO->isReg())
- SGPRUsed = RegSubRegPair(MO->getReg(), MO->getSubReg());
+ SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg()));
for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
if (i == OpIdx)
continue;
const MachineOperand &Op = MI.getOperand(i);
if (Op.isReg()) {
- if ((Op.getReg() != SGPRUsed.Reg || Op.getSubReg() != SGPRUsed.SubReg) &&
+ RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
+ if (!SGPRsUsed.count(SGPR) &&
usesConstantBus(MRI, Op, InstDesc.OpInfo[i])) {
- return false;
+ if (--ConstantBusLimit <= 0)
+ return false;
+ SGPRsUsed.insert(SGPR);
}
} else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) {
- return false;
+ if (--ConstantBusLimit <= 0)
+ return false;
+ } else if (isVOP3(MI) && AMDGPU::isSISrcOperand(InstDesc, i) &&
+ isLiteralConstantLike(Op, InstDesc.OpInfo[i])) {
+ if (!VOP3LiteralLimit--)
+ return false;
+ if (--ConstantBusLimit <= 0)
+ return false;
}
}
}
@@ -3437,7 +3900,7 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
}
// Handle non-register types that are treated like immediates.
- assert(MO->isImm() || MO->isTargetIndex() || MO->isFI());
+ assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
if (!DefinedRC) {
// This operand expects an immediate.
@@ -3452,30 +3915,24 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
unsigned Opc = MI.getOpcode();
const MCInstrDesc &InstrDesc = get(Opc);
+ int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
+ MachineOperand &Src0 = MI.getOperand(Src0Idx);
+
int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
MachineOperand &Src1 = MI.getOperand(Src1Idx);
// If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
- // we need to only have one constant bus use.
- //
- // Note we do not need to worry about literal constants here. They are
- // disabled for the operand type for instructions because they will always
- // violate the one constant bus use rule.
+ // we need to only have one constant bus use before GFX10.
bool HasImplicitSGPR = findImplicitSGPRRead(MI) != AMDGPU::NoRegister;
- if (HasImplicitSGPR) {
- int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
- MachineOperand &Src0 = MI.getOperand(Src0Idx);
-
- if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg()))
- legalizeOpWithMove(MI, Src0Idx);
- }
+ if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 &&
+ Src0.isReg() && (RI.isSGPRReg(MRI, Src0.getReg()) ||
+ isLiteralConstantLike(Src0, InstrDesc.OpInfo[Src0Idx])))
+ legalizeOpWithMove(MI, Src0Idx);
// Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
// both the value to write (src0) and lane select (src1). Fix up non-SGPR
// src0/src1 with V_READFIRSTLANE.
if (Opc == AMDGPU::V_WRITELANE_B32) {
- int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
- MachineOperand &Src0 = MI.getOperand(Src0Idx);
const DebugLoc &DL = MI.getDebugLoc();
if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
@@ -3493,6 +3950,13 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
return;
}
+ // No VOP2 instructions support AGPRs.
+ if (Src0.isReg() && RI.isAGPR(MRI, Src0.getReg()))
+ legalizeOpWithMove(MI, Src0Idx);
+
+ if (Src1.isReg() && RI.isAGPR(MRI, Src1.getReg()))
+ legalizeOpWithMove(MI, Src1Idx);
+
// VOP2 src0 instructions support all operand types, so we don't need to check
// their legality. If src1 is already legal, we don't need to do anything.
if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1))
@@ -3520,9 +3984,6 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
return;
}
- int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
- MachineOperand &Src0 = MI.getOperand(Src0Idx);
-
// If src0 can be used as src1, commuting will make the operands legal.
// Otherwise we have to give up and insert a move.
//
@@ -3556,12 +4017,11 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
Src1.setSubReg(Src0SubReg);
+ fixImplicitOperands(MI);
}
-// Legalize VOP3 operands. Because all operand types are supported for any
-// operand, and since literal constants are not allowed and should never be
-// seen, we only need to worry about inserting copies if we use multiple SGPR
-// operands.
+// Legalize VOP3 operands. All operand types are supported for any operand,
+// but only one literal constant is allowed, and only starting from GFX10.
void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
MachineInstr &MI) const {
unsigned Opc = MI.getOpcode();
@@ -3572,8 +4032,35 @@ void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
};
+ if (Opc == AMDGPU::V_PERMLANE16_B32 ||
+ Opc == AMDGPU::V_PERMLANEX16_B32) {
+ // src1 and src2 must be scalar
+ MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);
+ MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
+ const DebugLoc &DL = MI.getDebugLoc();
+ if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
+ unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
+ .add(Src1);
+ Src1.ChangeToRegister(Reg, false);
+ }
+ if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
+ unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
+ .add(Src2);
+ Src2.ChangeToRegister(Reg, false);
+ }
+ }
+
// Find the one SGPR operand we are allowed to use.
+ int ConstantBusLimit = ST.getConstantBusLimit(Opc);
+ int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
+ SmallDenseSet<unsigned> SGPRsUsed;
unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx);
+ if (SGPRReg != AMDGPU::NoRegister) {
+ SGPRsUsed.insert(SGPRReg);
+ --ConstantBusLimit;
+ }
for (unsigned i = 0; i < 3; ++i) {
int Idx = VOP3Idx[i];
@@ -3581,16 +4068,38 @@ void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
break;
MachineOperand &MO = MI.getOperand(Idx);
- // We should never see a VOP3 instruction with an illegal immediate operand.
- if (!MO.isReg())
+ if (!MO.isReg()) {
+ if (!isLiteralConstantLike(MO, get(Opc).OpInfo[Idx]))
+ continue;
+
+ if (LiteralLimit > 0 && ConstantBusLimit > 0) {
+ --LiteralLimit;
+ --ConstantBusLimit;
+ continue;
+ }
+
+ --LiteralLimit;
+ --ConstantBusLimit;
+ legalizeOpWithMove(MI, Idx);
continue;
+ }
+
+ if (RI.hasAGPRs(MRI.getRegClass(MO.getReg())) &&
+ !isOperandLegal(MI, Idx, &MO)) {
+ legalizeOpWithMove(MI, Idx);
+ continue;
+ }
if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg())))
continue; // VGPRs are legal
- if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) {
- SGPRReg = MO.getReg();
- // We can use one SGPR in each VOP3 instruction.
+ // We can use one SGPR in each VOP3 instruction prior to GFX10
+ // and two starting from GFX10.
+ if (SGPRsUsed.count(MO.getReg()))
+ continue;
+ if (ConstantBusLimit > 0) {
+ SGPRsUsed.insert(MO.getReg());
+ --ConstantBusLimit;
continue;
}
@@ -3607,6 +4116,15 @@ unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI,
unsigned DstReg = MRI.createVirtualRegister(SRC);
unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
+ if (RI.hasAGPRs(VRC)) {
+ VRC = RI.getEquivalentVGPRClass(VRC);
+ unsigned NewSrcReg = MRI.createVirtualRegister(VRC);
+ BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
+ get(TargetOpcode::COPY), NewSrcReg)
+ .addReg(SrcReg);
+ SrcReg = NewSrcReg;
+ }
+
if (SubRegs == 1) {
BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
@@ -3691,15 +4209,27 @@ static void
emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI,
MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
const DebugLoc &DL, MachineOperand &Rsrc) {
+ MachineFunction &MF = *OrigBB.getParent();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+ unsigned SaveExecOpc =
+ ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
+ unsigned XorTermOpc =
+ ST.isWave32() ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
+ unsigned AndOpc =
+ ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
+ const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
+
MachineBasicBlock::iterator I = LoopBB.begin();
unsigned VRsrc = Rsrc.getReg();
unsigned VRsrcUndef = getUndefRegState(Rsrc.isUndef());
- unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
- unsigned CondReg0 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
- unsigned CondReg1 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
- unsigned AndCond = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned SaveExec = MRI.createVirtualRegister(BoolXExecRC);
+ unsigned CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
+ unsigned CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
+ unsigned AndCond = MRI.createVirtualRegister(BoolXExecRC);
unsigned SRsrcSub0 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
unsigned SRsrcSub1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
unsigned SRsrcSub2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
@@ -3737,22 +4267,22 @@ emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI,
BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), CondReg1)
.addReg(SRsrc, 0, AMDGPU::sub2_sub3)
.addReg(VRsrc, 0, AMDGPU::sub2_sub3);
- BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_AND_B64), AndCond)
+ BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndCond)
.addReg(CondReg0)
.addReg(CondReg1);
MRI.setSimpleHint(SaveExec, AndCond);
// Update EXEC to matching lanes, saving original to SaveExec.
- BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_AND_SAVEEXEC_B64), SaveExec)
+ BuildMI(LoopBB, I, DL, TII.get(SaveExecOpc), SaveExec)
.addReg(AndCond, RegState::Kill);
// The original instruction is here; we insert the terminators after it.
I = LoopBB.end();
// Update EXEC, switch all done bits to 0 and all todo bits to 1.
- BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_XOR_B64_term), AMDGPU::EXEC)
- .addReg(AMDGPU::EXEC)
+ BuildMI(LoopBB, I, DL, TII.get(XorTermOpc), Exec)
+ .addReg(Exec)
.addReg(SaveExec);
BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(&LoopBB);
}
@@ -3763,15 +4293,19 @@ static void loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
MachineOperand &Rsrc, MachineDominatorTree *MDT) {
MachineBasicBlock &MBB = *MI.getParent();
MachineFunction &MF = *MBB.getParent();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
MachineBasicBlock::iterator I(&MI);
const DebugLoc &DL = MI.getDebugLoc();
+ unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+ unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+ const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
- unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+ unsigned SaveExec = MRI.createVirtualRegister(BoolXExecRC);
// Save the EXEC mask
- BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B64), SaveExec)
- .addReg(AMDGPU::EXEC);
+ BuildMI(MBB, I, DL, TII.get(MovExecOpc), SaveExec).addReg(Exec);
// Killed uses in the instruction we are waterfalling around will be
// incorrect due to the added control-flow.
@@ -3820,8 +4354,7 @@ static void loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
// Restore the EXEC mask
MachineBasicBlock::iterator First = RemainderBB->begin();
- BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
- .addReg(SaveExec);
+ BuildMI(*RemainderBB, First, DL, TII.get(MovExecOpc), Exec).addReg(SaveExec);
}
// Extract pointer from Rsrc and return a zero-value Rsrc replacement.
@@ -3901,7 +4434,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI,
continue;
const TargetRegisterClass *OpRC =
MRI.getRegClass(MI.getOperand(i).getReg());
- if (RI.hasVGPRs(OpRC)) {
+ if (RI.hasVectorRegisters(OpRC)) {
VRC = OpRC;
} else {
SRC = OpRC;
@@ -3914,7 +4447,8 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI,
if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
if (!VRC) {
assert(SRC);
- VRC = RI.getEquivalentVGPRClass(SRC);
+ VRC = RI.hasAGPRs(getOpRegClass(MI, 0)) ? RI.getEquivalentAGPRClass(SRC)
+ : RI.getEquivalentVGPRClass(SRC);
}
RC = VRC;
} else {
@@ -3983,7 +4517,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI,
// Legalize SI_INIT_M0
if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
MachineOperand &Src = MI.getOperand(0);
- if (Src.isReg() && RI.hasVGPRs(MRI.getRegClass(Src.getReg())))
+ if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
return;
}
@@ -4047,19 +4581,28 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI,
unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+ const auto *BoolXExecRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
+ unsigned CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
+ unsigned CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
+
unsigned RsrcPtr, NewSRsrc;
std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
// NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
- DebugLoc DL = MI.getDebugLoc();
- BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo)
- .addReg(RsrcPtr, 0, AMDGPU::sub0)
- .addReg(VAddr->getReg(), 0, AMDGPU::sub0);
+ const DebugLoc &DL = MI.getDebugLoc();
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e64), NewVAddrLo)
+ .addDef(CondReg0)
+ .addReg(RsrcPtr, 0, AMDGPU::sub0)
+ .addReg(VAddr->getReg(), 0, AMDGPU::sub0)
+ .addImm(0);
// NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
- BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e32), NewVAddrHi)
- .addReg(RsrcPtr, 0, AMDGPU::sub1)
- .addReg(VAddr->getReg(), 0, AMDGPU::sub1);
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi)
+ .addDef(CondReg1, RegState::Dead)
+ .addReg(RsrcPtr, 0, AMDGPU::sub1)
+ .addReg(VAddr->getReg(), 0, AMDGPU::sub1)
+ .addReg(CondReg0, RegState::Kill)
+ .addImm(0);
// NewVaddr = {NewVaddrHi, NewVaddrLo}
BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
@@ -4106,6 +4649,10 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI,
getNamedOperand(MI, AMDGPU::OpName::glc)) {
MIB.addImm(GLC->getImm());
}
+ if (const MachineOperand *DLC =
+ getNamedOperand(MI, AMDGPU::OpName::dlc)) {
+ MIB.addImm(DLC->getImm());
+ }
MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc));
@@ -4235,37 +4782,37 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst,
continue;
case AMDGPU::S_LSHL_B32:
- if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ if (ST.hasOnlyRevVALUShifts()) {
NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
swapOperands(Inst);
}
break;
case AMDGPU::S_ASHR_I32:
- if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ if (ST.hasOnlyRevVALUShifts()) {
NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
swapOperands(Inst);
}
break;
case AMDGPU::S_LSHR_B32:
- if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ if (ST.hasOnlyRevVALUShifts()) {
NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
swapOperands(Inst);
}
break;
case AMDGPU::S_LSHL_B64:
- if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ if (ST.hasOnlyRevVALUShifts()) {
NewOpcode = AMDGPU::V_LSHLREV_B64;
swapOperands(Inst);
}
break;
case AMDGPU::S_ASHR_I64:
- if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ if (ST.hasOnlyRevVALUShifts()) {
NewOpcode = AMDGPU::V_ASHRREV_I64;
swapOperands(Inst);
}
break;
case AMDGPU::S_LSHR_B64:
- if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ if (ST.hasOnlyRevVALUShifts()) {
NewOpcode = AMDGPU::V_LSHRREV_B64;
swapOperands(Inst);
}
@@ -4279,10 +4826,16 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst,
case AMDGPU::S_CBRANCH_SCC0:
case AMDGPU::S_CBRANCH_SCC1:
// Clear unused bits of vcc
- BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B64),
- AMDGPU::VCC)
- .addReg(AMDGPU::EXEC)
- .addReg(AMDGPU::VCC);
+ if (ST.isWave32())
+ BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B32),
+ AMDGPU::VCC_LO)
+ .addReg(AMDGPU::EXEC_LO)
+ .addReg(AMDGPU::VCC_LO);
+ else
+ BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B64),
+ AMDGPU::VCC)
+ .addReg(AMDGPU::EXEC)
+ .addReg(AMDGPU::VCC);
break;
case AMDGPU::S_BFE_U64:
@@ -4339,8 +4892,10 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst,
for (unsigned i = Inst.getNumOperands() - 1; i > 0; --i) {
MachineOperand &Op = Inst.getOperand(i);
if (Op.isReg() && Op.getReg() == AMDGPU::SCC) {
+ // Only propagate through live-def of SCC.
+ if (Op.isDef() && !Op.isDead())
+ addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
Inst.RemoveOperand(i);
- addSCCDefUsersToVALUWorklist(Inst, Worklist);
}
}
@@ -4358,6 +4913,7 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst,
}
Inst.addImplicitDefUseOperands(*Inst.getParent()->getParent());
+ fixImplicitOperands(Inst);
if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
@@ -4445,6 +5001,7 @@ bool SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
Inst.RemoveOperand(3);
Inst.setDesc(get(NewOpc));
+ Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
Inst.addImplicitDefUseOperands(*MBB.getParent());
MRI.replaceRegWith(OldDstReg, ResultReg);
legalizeOperands(Inst, MDT);
@@ -4514,8 +5071,7 @@ void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist,
RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
bool Src1IsSGPR = Src1.isReg() &&
RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
- MachineInstr *Not = nullptr;
- MachineInstr *Xor = nullptr;
+ MachineInstr *Xor;
unsigned Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
@@ -4523,14 +5079,12 @@ void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist,
// The next iteration over the work list will lower these to the vector
// unit as necessary.
if (Src0IsSGPR) {
- Not = BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp)
- .add(Src0);
+ BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0);
Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
.addReg(Temp)
.add(Src1);
} else if (Src1IsSGPR) {
- Not = BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp)
- .add(Src1);
+ BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1);
Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
.add(Src0)
.addReg(Temp);
@@ -4538,8 +5092,8 @@ void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist,
Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
.add(Src0)
.add(Src1);
- Not = BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
- .addReg(Temp);
+ MachineInstr *Not =
+ BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp);
Worklist.insert(Not);
}
@@ -4670,13 +5224,14 @@ void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist,
MachineBasicBlock &MBB = *Inst.getParent();
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
unsigned FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- unsigned CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
- unsigned DeadCarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+ unsigned CarryReg = MRI.createVirtualRegister(CarryRC);
+ unsigned DeadCarryReg = MRI.createVirtualRegister(CarryRC);
MachineOperand &Dest = Inst.getOperand(0);
MachineOperand &Src0 = Inst.getOperand(1);
@@ -4705,7 +5260,8 @@ void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist,
BuildMI(MBB, MII, DL, get(LoOpc), DestSub0)
.addReg(CarryReg, RegState::Define)
.add(SrcReg0Sub0)
- .add(SrcReg1Sub0);
+ .add(SrcReg1Sub0)
+ .addImm(0); // clamp bit
unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
MachineInstr *HiHalf =
@@ -4713,7 +5269,8 @@ void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist,
.addReg(DeadCarryReg, RegState::Define | RegState::Dead)
.add(SrcReg0Sub1)
.add(SrcReg1Sub1)
- .addReg(CarryReg, RegState::Kill);
+ .addReg(CarryReg, RegState::Kill)
+ .addImm(0); // clamp bit
BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
.addReg(DestSub0)
@@ -4943,7 +5500,23 @@ void SIInstrInfo::addUsersToMoveToVALUWorklist(
for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
E = MRI.use_end(); I != E;) {
MachineInstr &UseMI = *I->getParent();
- if (!canReadVGPR(UseMI, I.getOperandNo())) {
+
+ unsigned OpNo = 0;
+
+ switch (UseMI.getOpcode()) {
+ case AMDGPU::COPY:
+ case AMDGPU::WQM:
+ case AMDGPU::WWM:
+ case AMDGPU::REG_SEQUENCE:
+ case AMDGPU::PHI:
+ case AMDGPU::INSERT_SUBREG:
+ break;
+ default:
+ OpNo = I.getOperandNo();
+ break;
+ }
+
+ if (!RI.hasVectorRegisters(getOpRegClass(UseMI, OpNo))) {
Worklist.insert(&UseMI);
do {
@@ -5017,19 +5590,23 @@ void SIInstrInfo::movePackToVALU(SetVectorType &Worklist,
addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}
-void SIInstrInfo::addSCCDefUsersToVALUWorklist(
- MachineInstr &SCCDefInst, SetVectorType &Worklist) const {
+void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
+ MachineInstr &SCCDefInst,
+ SetVectorType &Worklist) const {
+ // Ensure that def inst defines SCC, which is still live.
+ assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
+ !Op.isDead() && Op.getParent() == &SCCDefInst);
// This assumes that all the users of SCC are in the same block
// as the SCC def.
- for (MachineInstr &MI :
- make_range(MachineBasicBlock::iterator(SCCDefInst),
- SCCDefInst.getParent()->end())) {
+ for (MachineInstr &MI : // Skip the def inst itself.
+ make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
+ SCCDefInst.getParent()->end())) {
+ // Check if SCC is used first.
+ if (MI.findRegisterUseOperandIdx(AMDGPU::SCC, false, &RI) != -1)
+ Worklist.insert(&MI);
// Exit if we find another SCC def.
if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) != -1)
return;
-
- if (MI.findRegisterUseOperandIdx(AMDGPU::SCC, false, &RI) != -1)
- Worklist.insert(&MI);
}
}
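The reordered scan above records SCC readers before it checks for the next SCC definition, so an instruction that both reads and redefines SCC still lands on the worklist. A self-contained toy model of that ordering (not LLVM code; all names invented for illustration):

#include <cassert>
#include <vector>

// Toy instruction: does it read SCC, does it (re)define SCC?
struct Inst { bool ReadsSCC; bool DefsSCC; };

// Walk forward from the instruction after the SCC def, collect every reader,
// and stop once another definition of SCC is seen (after checking its read).
static std::vector<int> collectSCCUsers(const std::vector<Inst> &Block,
                                        unsigned DefIdx) {
  std::vector<int> Users;
  for (unsigned I = DefIdx + 1; I < Block.size(); ++I) {
    if (Block[I].ReadsSCC)
      Users.push_back(I);
    if (Block[I].DefsSCC)
      break;
  }
  return Users;
}

int main() {
  // def SCC; reader; reader that also redefines SCC; reader past the redef.
  std::vector<Inst> B = {{false, true}, {true, false}, {true, true}, {true, false}};
  std::vector<int> U = collectSCCUsers(B, 0);
  assert(U.size() == 2 && U[0] == 1 && U[1] == 2); // the last reader is not collected
  return 0;
}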
@@ -5046,14 +5623,26 @@ const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
case AMDGPU::REG_SEQUENCE:
case AMDGPU::INSERT_SUBREG:
case AMDGPU::WQM:
- case AMDGPU::WWM:
- if (RI.hasVGPRs(NewDstRC))
- return nullptr;
+ case AMDGPU::WWM: {
+ const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
+ if (RI.hasAGPRs(SrcRC)) {
+ if (RI.hasAGPRs(NewDstRC))
+ return nullptr;
+
+ NewDstRC = RI.getEquivalentAGPRClass(NewDstRC);
+ if (!NewDstRC)
+ return nullptr;
+ } else {
+ if (RI.hasVGPRs(NewDstRC))
+ return nullptr;
+
+ NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
+ if (!NewDstRC)
+ return nullptr;
+ }
- NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
- if (!NewDstRC)
- return nullptr;
return NewDstRC;
+ }
default:
return NewDstRC;
}
@@ -5139,6 +5728,12 @@ MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
}
uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
+ if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
+ return (22ULL << 44) | // IMG_FORMAT_32_FLOAT
+ (1ULL << 56) | // RESOURCE_LEVEL = 1
+ (3ULL << 60); // OOB_SELECT = 3
+ }
+
uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
if (ST.isAmdHsaOS()) {
// Set ATC = 1. GFX9 doesn't have this bit.
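As an aside, the three GFX10 fields added above fold into a single descriptor constant. A minimal standalone sketch, independent of LLVM, just to verify the arithmetic:

#include <cstdint>
#include <cstdio>

int main() {
  // Same three fields as the GFX10 branch of getDefaultRsrcDataFormat above.
  uint64_t Rsrc = (22ULL << 44) |  // IMG_FORMAT_32_FLOAT
                  (1ULL << 56) |   // RESOURCE_LEVEL = 1
                  (3ULL << 60);    // OOB_SELECT = 3
  std::printf("0x%016llx\n", static_cast<unsigned long long>(Rsrc));
  // Prints 0x3101600000000000.
  return 0;
}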
@@ -5165,12 +5760,14 @@ uint64_t SIInstrInfo::getScratchRsrcWords23() const {
Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
}
- // IndexStride = 64.
- Rsrc23 |= UINT64_C(3) << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
+  // IndexStride = 64 for wave64, 32 for wave32.
+ uint64_t IndexStride = ST.getWavefrontSize() == 64 ? 3 : 2;
+ Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
// If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
// Clear them unless we want a huge stride.
- if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+ if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
+ ST.getGeneration() <= AMDGPUSubtarget::GFX9)
Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
return Rsrc23;
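Note that INDEX_STRIDE is an encoded field value rather than a byte count; assuming the usual buffer-descriptor encoding of stride = 8 << field, the two values chosen above select strides of 64 and 32 to match the wavefront size. A throwaway standalone check (the helper name indexStride is invented):

#include <cassert>

// Assumed INDEX_STRIDE encoding: stride = 8 << field.
constexpr unsigned indexStride(unsigned Field) { return 8u << Field; }

int main() {
  assert(indexStride(3) == 64); // wave64
  assert(indexStride(2) == 32); // wave32
  return 0;
}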
@@ -5267,25 +5864,35 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
return DescSize; // No operands.
if (isLiteralConstantLike(MI.getOperand(Src0Idx), Desc.OpInfo[Src0Idx]))
- return DescSize + 4;
+ return isVOP3(MI) ? 12 : (DescSize + 4);
int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
if (Src1Idx == -1)
return DescSize;
if (isLiteralConstantLike(MI.getOperand(Src1Idx), Desc.OpInfo[Src1Idx]))
- return DescSize + 4;
+ return isVOP3(MI) ? 12 : (DescSize + 4);
int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
if (Src2Idx == -1)
return DescSize;
if (isLiteralConstantLike(MI.getOperand(Src2Idx), Desc.OpInfo[Src2Idx]))
- return DescSize + 4;
+ return isVOP3(MI) ? 12 : (DescSize + 4);
return DescSize;
}
+ // Check whether we have extra NSA words.
+ if (isMIMG(MI)) {
+ int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
+ if (VAddr0Idx < 0)
+ return 8;
+
+ int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
+ return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
+ }
+
switch (Opc) {
case TargetOpcode::IMPLICIT_DEF:
case TargetOpcode::KILL:
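For the MIMG sizing above, assuming RSrcIdx - VAddr0Idx equals the number of vaddr operands (vaddr0 lives in the base 8-byte encoding and each extra NSA dword packs up to four more addresses), the formula rounds the spill-over addresses up to whole dwords. An illustrative standalone sketch, not LLVM code:

#include <cassert>

// 8 + 4 * ((N + 2) / 4), with N = number of vaddr operands (assumption above).
constexpr unsigned mimgSizeInBytes(unsigned N) { return 8 + 4 * ((N + 2) / 4); }

int main() {
  assert(mimgSizeInBytes(1) == 8);  // single address, no NSA dword
  assert(mimgSizeInBytes(2) == 12); // one NSA dword
  assert(mimgSizeInBytes(5) == 12); // still one: vaddr0 plus four NSA addresses
  assert(mimgSizeInBytes(6) == 16); // second NSA dword needed
  return 0;
}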
@@ -5294,10 +5901,12 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
return 0;
case TargetOpcode::BUNDLE:
return getInstBundleSize(MI);
- case TargetOpcode::INLINEASM: {
+ case TargetOpcode::INLINEASM:
+ case TargetOpcode::INLINEASM_BR: {
const MachineFunction *MF = MI.getParent()->getParent();
const char *AsmStr = MI.getOperand(0).getSymbolName();
- return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo());
+ return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(),
+ &MF->getSubtarget());
}
default:
return DescSize;
@@ -5332,7 +5941,7 @@ void SIInstrInfo::convertNonUniformIfRegion(MachineBasicBlock *IfEntry,
MachineRegisterInfo &MRI = IfEntry->getParent()->getRegInfo();
if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
- unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned DstReg = MRI.createVirtualRegister(RI.getBoolRC());
MachineInstr *SIIF =
BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_IF), DstReg)
.add(Branch->getOperand(0))
@@ -5359,8 +5968,8 @@ void SIInstrInfo::convertNonUniformLoopRegion(
if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
- unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
- unsigned BackEdgeReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned DstReg = MRI.createVirtualRegister(RI.getBoolRC());
+ unsigned BackEdgeReg = MRI.createVirtualRegister(RI.getBoolRC());
MachineInstrBuilder HeaderPHIBuilder =
BuildMI(*(MF), Branch->getDebugLoc(), get(TargetOpcode::PHI), DstReg);
for (MachineBasicBlock::pred_iterator PI = LoopEntry->pred_begin(),
@@ -5370,7 +5979,7 @@ void SIInstrInfo::convertNonUniformLoopRegion(
HeaderPHIBuilder.addReg(BackEdgeReg);
} else {
MachineBasicBlock *PMBB = *PI;
- unsigned ZeroReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned ZeroReg = MRI.createVirtualRegister(RI.getBoolRC());
materializeImmediate(*PMBB, PMBB->getFirstTerminator(), DebugLoc(),
ZeroReg, 0);
HeaderPHIBuilder.addReg(ZeroReg);
@@ -5432,7 +6041,9 @@ SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
{ MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo" },
{ MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi" },
{ MO_REL32_LO, "amdgpu-rel32-lo" },
- { MO_REL32_HI, "amdgpu-rel32-hi" }
+ { MO_REL32_HI, "amdgpu-rel32-hi" },
+ { MO_ABS32_LO, "amdgpu-abs32-lo" },
+ { MO_ABS32_HI, "amdgpu-abs32-hi" },
};
return makeArrayRef(TargetFlags);
@@ -5452,8 +6063,8 @@ SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
- unsigned UnusedCarry = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
- MRI.setRegAllocationHint(UnusedCarry, 0, AMDGPU::VCC);
+ unsigned UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC());
+ MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC());
return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_I32_e64), DestReg)
.addReg(UnusedCarry, RegState::Define | RegState::Dead);
@@ -5480,6 +6091,20 @@ const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) con
}
}
+void SIInstrInfo::fixImplicitOperands(MachineInstr &MI) const {
+ MachineBasicBlock *MBB = MI.getParent();
+ MachineFunction *MF = MBB->getParent();
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+
+ if (!ST.isWave32())
+ return;
+
+ for (auto &Op : MI.implicit_operands()) {
+ if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
+ Op.setReg(AMDGPU::VCC_LO);
+ }
+}
+
bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const {
if (!isSMRD(MI))
return false;
@@ -5493,6 +6118,25 @@ bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const {
return RCID == AMDGPU::SReg_128RegClassID;
}
+bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
+ bool Signed) const {
+ // TODO: Should 0 be special cased?
+ if (!ST.hasFlatInstOffsets())
+ return false;
+
+ if (ST.hasFlatSegmentOffsetBug() && AddrSpace == AMDGPUAS::FLAT_ADDRESS)
+ return false;
+
+ if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
+ return (Signed && isInt<12>(Offset)) ||
+ (!Signed && isUInt<11>(Offset));
+ }
+
+ return (Signed && isInt<13>(Offset)) ||
+ (!Signed && isUInt<12>(Offset));
+}
+
+
// This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td
enum SIEncodingFamily {
SI = 0,
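The ranges admitted by isLegalFLATOffset above work out to a 12-bit signed or 11-bit unsigned offset on GFX10, and a 13-bit signed or 12-bit unsigned offset on earlier targets that support FLAT offsets. A small standalone sketch mirroring the isInt/isUInt widths, purely for illustration (the local isIntN/isUIntN helpers are stand-ins for the LLVM templates):

#include <cassert>
#include <cstdint>

template <unsigned N> bool isIntN(int64_t X) {
  return X >= -(int64_t(1) << (N - 1)) && X < (int64_t(1) << (N - 1));
}
template <unsigned N> bool isUIntN(int64_t X) {
  return X >= 0 && X < (int64_t(1) << N);
}

int main() {
  assert(isIntN<12>(2047) && !isIntN<12>(2048));   // GFX10, signed
  assert(isUIntN<11>(2047) && !isUIntN<11>(2048)); // GFX10, unsigned
  assert(isIntN<13>(-4096) && !isIntN<13>(4096));  // pre-GFX10, signed
  assert(isUIntN<12>(4095) && !isUIntN<12>(4096)); // pre-GFX10, unsigned
  return 0;
}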
@@ -5500,7 +6144,9 @@ enum SIEncodingFamily {
SDWA = 2,
SDWA9 = 3,
GFX80 = 4,
- GFX9 = 5
+ GFX9 = 5,
+ GFX10 = 6,
+ SDWA10 = 7
};
static SIEncodingFamily subtargetEncodingFamily(const GCNSubtarget &ST) {
@@ -5513,6 +6159,8 @@ static SIEncodingFamily subtargetEncodingFamily(const GCNSubtarget &ST) {
case AMDGPUSubtarget::VOLCANIC_ISLANDS:
case AMDGPUSubtarget::GFX9:
return SIEncodingFamily::VI;
+ case AMDGPUSubtarget::GFX10:
+ return SIEncodingFamily::GFX10;
}
llvm_unreachable("Unknown subtarget generation!");
}
@@ -5521,18 +6169,29 @@ int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
SIEncodingFamily Gen = subtargetEncodingFamily(ST);
if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 &&
- ST.getGeneration() >= AMDGPUSubtarget::GFX9)
+ ST.getGeneration() == AMDGPUSubtarget::GFX9)
Gen = SIEncodingFamily::GFX9;
- if (get(Opcode).TSFlags & SIInstrFlags::SDWA)
- Gen = ST.getGeneration() == AMDGPUSubtarget::GFX9 ? SIEncodingFamily::SDWA9
- : SIEncodingFamily::SDWA;
// Adjust the encoding family to GFX80 for D16 buffer instructions when the
// subtarget has UnpackedD16VMem feature.
// TODO: remove this when we discard GFX80 encoding.
if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
Gen = SIEncodingFamily::GFX80;
+ if (get(Opcode).TSFlags & SIInstrFlags::SDWA) {
+ switch (ST.getGeneration()) {
+ default:
+ Gen = SIEncodingFamily::SDWA;
+ break;
+ case AMDGPUSubtarget::GFX9:
+ Gen = SIEncodingFamily::SDWA9;
+ break;
+ case AMDGPUSubtarget::GFX10:
+ Gen = SIEncodingFamily::SDWA10;
+ break;
+ }
+ }
+
int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
// -1 means that Opcode is already a native instruction.
@@ -5627,3 +6286,77 @@ MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
}
return nullptr;
}
+
+bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI,
+ Register VReg,
+ const MachineInstr &DefMI,
+ const MachineInstr &UseMI) {
+ assert(MRI.isSSA() && "Must be run on SSA");
+
+ auto *TRI = MRI.getTargetRegisterInfo();
+ auto *DefBB = DefMI.getParent();
+
+ // Don't bother searching between blocks, although it is possible this block
+ // doesn't modify exec.
+ if (UseMI.getParent() != DefBB)
+ return true;
+
+ const int MaxInstScan = 20;
+ int NumInst = 0;
+
+ // Stop scan at the use.
+ auto E = UseMI.getIterator();
+ for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
+ if (I->isDebugInstr())
+ continue;
+
+ if (++NumInst > MaxInstScan)
+ return true;
+
+ if (I->modifiesRegister(AMDGPU::EXEC, TRI))
+ return true;
+ }
+
+ return false;
+}
+
+bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI,
+ Register VReg,
+ const MachineInstr &DefMI) {
+ assert(MRI.isSSA() && "Must be run on SSA");
+
+ auto *TRI = MRI.getTargetRegisterInfo();
+ auto *DefBB = DefMI.getParent();
+
+ const int MaxUseInstScan = 10;
+ int NumUseInst = 0;
+
+ for (auto &UseInst : MRI.use_nodbg_instructions(VReg)) {
+ // Don't bother searching between blocks, although it is possible this block
+ // doesn't modify exec.
+ if (UseInst.getParent() != DefBB)
+ return true;
+
+ if (++NumUseInst > MaxUseInstScan)
+ return true;
+ }
+
+ const int MaxInstScan = 20;
+ int NumInst = 0;
+
+ // Stop scan when we have seen all the uses.
+ for (auto I = std::next(DefMI.getIterator()); ; ++I) {
+ if (I->isDebugInstr())
+ continue;
+
+ if (++NumInst > MaxInstScan)
+ return true;
+
+ if (I->readsRegister(VReg))
+ if (--NumUseInst == 0)
+ return false;
+
+ if (I->modifiesRegister(AMDGPU::EXEC, TRI))
+ return true;
+ }
+}
diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h
index 5b1a05f3785e..3ff35da0b963 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1,9 +1,8 @@
//===- SIInstrInfo.h - SI Instruction Info Interface ------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -121,14 +120,15 @@ private:
void addUsersToMoveToVALUWorklist(unsigned Reg, MachineRegisterInfo &MRI,
SetVectorType &Worklist) const;
- void
- addSCCDefUsersToVALUWorklist(MachineInstr &SCCDefInst,
- SetVectorType &Worklist) const;
+ void addSCCDefUsersToVALUWorklist(MachineOperand &Op,
+ MachineInstr &SCCDefInst,
+ SetVectorType &Worklist) const;
const TargetRegisterClass *
getDestEquivalentVGPRClass(const MachineInstr &Inst) const;
- bool checkInstOffsetsDoNotOverlap(MachineInstr &MIa, MachineInstr &MIb) const;
+ bool checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
+ const MachineInstr &MIb) const;
unsigned findUsedSGPR(const MachineInstr &MI, int OpIndices[3]) const;
@@ -143,7 +143,7 @@ protected:
public:
enum TargetOperandFlags {
- MO_MASK = 0x7,
+ MO_MASK = 0xf,
MO_NONE = 0,
// MO_GOTPCREL -> symbol@GOTPCREL -> R_AMDGPU_GOTPCREL.
@@ -157,7 +157,13 @@ public:
MO_REL32 = 4,
MO_REL32_LO = 4,
// MO_REL32_HI -> symbol@rel32@hi -> R_AMDGPU_REL32_HI.
- MO_REL32_HI = 5
+ MO_REL32_HI = 5,
+
+ MO_LONG_BRANCH_FORWARD = 6,
+ MO_LONG_BRANCH_BACKWARD = 7,
+
+ MO_ABS32_LO = 8,
+ MO_ABS32_HI = 9,
};
explicit SIInstrInfo(const GCNSubtarget &ST);
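MO_MASK widens from 0x7 to 0xf because the flag values above now reach 9 (MO_ABS32_HI), which no longer fits in three bits. A trivial standalone check:

static_assert((9 & 0x7) != 9, "three mask bits cannot represent MO_ABS32_HI");
static_assert((9 & 0xf) == 9, "four mask bits can");
int main() { return 0; }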
@@ -173,11 +179,13 @@ public:
int64_t &Offset1,
int64_t &Offset2) const override;
- bool getMemOperandWithOffset(MachineInstr &LdSt, MachineOperand *&BaseOp,
+ bool getMemOperandWithOffset(const MachineInstr &LdSt,
+ const MachineOperand *&BaseOp,
int64_t &Offset,
const TargetRegisterInfo *TRI) const final;
- bool shouldClusterMemOps(MachineOperand &BaseOp1, MachineOperand &BaseOp2,
+ bool shouldClusterMemOps(const MachineOperand &BaseOp1,
+ const MachineOperand &BaseOp2,
unsigned NumLoads) const override;
bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0,
@@ -294,7 +302,8 @@ public:
unsigned Kind) const override;
bool
- areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb,
+ areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
+ const MachineInstr &MIb,
AliasAnalysis *AA = nullptr) const override;
bool isFoldableCopy(const MachineInstr &MI) const;
@@ -376,6 +385,14 @@ public:
return get(Opcode).TSFlags & SIInstrFlags::SOPP;
}
+ static bool isPacked(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::IsPacked;
+ }
+
+ bool isPacked(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::IsPacked;
+ }
+
static bool isVOP1(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::VOP1;
}
@@ -450,6 +467,8 @@ public:
return get(Opcode).TSFlags & SIInstrFlags::DS;
}
+ bool isAlwaysGDS(uint16_t Opcode) const;
+
static bool isMIMG(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::MIMG;
}
@@ -477,6 +496,11 @@ public:
return (Flags & SIInstrFlags::FLAT) && !(Flags & SIInstrFlags::LGKM_CNT);
}
+ // FIXME: Make this more precise
+ static bool isFLATScratch(const MachineInstr &MI) {
+ return isSegmentSpecificFLAT(MI);
+ }
+
// Any FLAT encoded instruction, including global_* and scratch_*.
bool isFLAT(uint16_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::FLAT;
@@ -546,6 +570,14 @@ public:
return get(Opcode).TSFlags & SIInstrFlags::VINTRP;
}
+ static bool isMAI(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::IsMAI;
+ }
+
+ bool isMAI(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::IsMAI;
+ }
+
static bool isScalarUnit(const MachineInstr &MI) {
return MI.getDesc().TSFlags & (SIInstrFlags::SALU | SIInstrFlags::SMRD);
}
@@ -612,6 +644,14 @@ public:
return get(Opcode).TSFlags & SIInstrFlags::FPDPRounding;
}
+ static bool isFPAtomic(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::FPAtomic;
+ }
+
+ bool isFPAtomic(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::FPAtomic;
+ }
+
bool isVGPRCopy(const MachineInstr &MI) const {
assert(MI.isCopy());
unsigned Dest = MI.getOperand(0).getReg();
@@ -620,9 +660,21 @@ public:
return !RI.isSGPRReg(MRI, Dest);
}
+ bool hasVGPRUses(const MachineInstr &MI) const {
+ const MachineFunction &MF = *MI.getParent()->getParent();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ return llvm::any_of(MI.explicit_uses(),
+ [&MRI, this](const MachineOperand &MO) {
+ return MO.isReg() && RI.isVGPR(MRI, MO.getReg());});
+ }
+
/// Whether we must prevent this instruction from executing with EXEC = 0.
bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const;
+ /// Returns true if the instruction could potentially depend on the value of
+ /// exec. If false, exec dependencies may safely be ignored.
+ bool mayReadEXEC(const MachineRegisterInfo &MRI, const MachineInstr &MI) const;
+
bool isInlineConstant(const APInt &Imm) const;
bool isInlineConstant(const MachineOperand &MO, uint8_t OperandType) const;
@@ -761,10 +813,6 @@ public:
return RI.getRegSizeInBits(*getOpRegClass(MI, OpNo)) / 8;
}
- /// \returns true if it is legal for the operand at index \p OpNo
- /// to read a VGPR.
- bool canReadVGPR(const MachineInstr &MI, unsigned OpNo) const;
-
/// Legalize the \p OpIndex operand of this instruction by inserting
/// a MOV. For example:
/// ADD_I32_e32 VGPR0, 15
@@ -836,7 +884,7 @@ public:
void insertReturn(MachineBasicBlock &MBB) const;
/// Return the number of wait states that result from executing this
/// instruction.
- unsigned getNumWaitStates(const MachineInstr &MI) const;
+ static unsigned getNumWaitStates(const MachineInstr &MI);
/// Returns the operand named \p Op. If \p MI does not have an
/// operand named \c Op, this function returns nullptr.
@@ -922,10 +970,27 @@ public:
return isUInt<12>(Imm);
}
+  /// Returns true if \p Offset is legal for the subtarget as the offset to a FLAT
+ /// encoded instruction. If \p Signed, this is for an instruction that
+ /// interprets the offset as signed.
+ bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
+ bool Signed) const;
+
/// \brief Return a target-specific opcode if Opcode is a pseudo instruction.
/// Return -1 if the target-specific opcode for the pseudo instruction does
/// not exist. If Opcode is not a pseudo instruction, this is identity.
int pseudoToMCOpcode(int Opcode) const;
+
+ const TargetRegisterClass *getRegClass(const MCInstrDesc &TID, unsigned OpNum,
+ const TargetRegisterInfo *TRI,
+ const MachineFunction &MF)
+ const override {
+ if (OpNum >= TID.getNumOperands())
+ return nullptr;
+ return RI.getRegClass(TID.OpInfo[OpNum].RegClass);
+ }
+
+ void fixImplicitOperands(MachineInstr &MI) const;
};
/// \brief Returns true if a reg:subreg pair P has a TRC class
@@ -956,6 +1021,21 @@ TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI,
MachineInstr *getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
MachineRegisterInfo &MRI);
+/// \brief Return false if EXEC is not changed between the def of \p VReg at \p
+/// DefMI and the use at \p UseMI. Should be run on SSA. Currently does not
+/// attempt to track between blocks.
+bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI,
+ Register VReg,
+ const MachineInstr &DefMI,
+ const MachineInstr &UseMI);
+
+/// \brief Return false if EXEC is not changed between the def of \p VReg at \p
+/// DefMI and all its uses. Should be run on SSA. Currently does not attempt to
+/// track between blocks.
+bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI,
+ Register VReg,
+ const MachineInstr &DefMI);
+
namespace AMDGPU {
LLVM_READONLY
@@ -1003,17 +1083,14 @@ namespace AMDGPU {
LLVM_READONLY
int getGlobalSaddrOp(uint16_t Opcode);
+ LLVM_READONLY
+ int getVCMPXNoSDstOp(uint16_t Opcode);
+
const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;
const uint64_t RSRC_ELEMENT_SIZE_SHIFT = (32 + 19);
const uint64_t RSRC_INDEX_STRIDE_SHIFT = (32 + 21);
const uint64_t RSRC_TID_ENABLE = UINT64_C(1) << (32 + 23);
- // For MachineOperands.
- enum TargetFlags {
- TF_LONG_BRANCH_FORWARD = 1 << 0,
- TF_LONG_BRANCH_BACKWARD = 1 << 1
- };
-
} // end namespace AMDGPU
namespace SI {
diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td
index 13afa4d4974b..c382c816e0b4 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1,25 +1,21 @@
//===-- SIInstrInfo.td - SI Instruction Infos -------------*- tablegen -*--===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-def isCI : Predicate<"Subtarget->getGeneration() "
- ">= AMDGPUSubtarget::SEA_ISLANDS">;
-def isCIOnly : Predicate<"Subtarget->getGeneration() =="
- "AMDGPUSubtarget::SEA_ISLANDS">,
- AssemblerPredicate <"FeatureSeaIslands">;
-def isVIOnly : Predicate<"Subtarget->getGeneration() =="
- "AMDGPUSubtarget::VOLCANIC_ISLANDS">,
- AssemblerPredicate <"FeatureVolcanicIslands">;
+
+def isWave32 : Predicate<"Subtarget->getWavefrontSize() == 32">,
+ AssemblerPredicate <"FeatureWavefrontSize32">;
+def isWave64 : Predicate<"Subtarget->getWavefrontSize() == 64">,
+ AssemblerPredicate <"FeatureWavefrontSize64">;
def DisableInst : Predicate <"false">, AssemblerPredicate<"FeatureDisable">;
class GCNPredicateControl : PredicateControl {
- Predicate SIAssemblerPredicate = isSICI;
- Predicate VIAssemblerPredicate = isVI;
+ Predicate SIAssemblerPredicate = isGFX6GFX7;
+ Predicate VIAssemblerPredicate = isGFX8GFX9;
}
// Except for the NONE field, this must be kept in sync with the
@@ -32,6 +28,8 @@ def SIEncodingFamily {
int SDWA9 = 3;
int GFX80 = 4;
int GFX9 = 5;
+ int GFX10 = 6;
+ int SDWA10 = 7;
}
//===----------------------------------------------------------------------===//
@@ -41,10 +39,16 @@ def SIEncodingFamily {
def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPUnaryOp>;
def SIsbuffer_load : SDNode<"AMDGPUISD::SBUFFER_LOAD",
- SDTypeProfile<1, 3, [SDTCisVT<1, v4i32>, SDTCisVT<2, i32>, SDTCisVT<3, i1>]>,
+ SDTypeProfile<1, 4, [SDTCisVT<1, v4i32>, SDTCisVT<2, i32>, SDTCisVT<3, i1>,
+ SDTCisVT<4, i1>]>,
[SDNPMayLoad, SDNPMemOperand]
>;
+def SIds_ordered_count : SDNode<"AMDGPUISD::DS_ORDERED_COUNT",
+ SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, i16>]>,
+ [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain, SDNPInGlue]
+>;
+
def SIatomic_inc : SDNode<"AMDGPUISD::ATOMIC_INC", SDTAtomic2,
[SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
>;
@@ -57,10 +61,6 @@ def SDTAtomic2_f32 : SDTypeProfile<1, 2, [
SDTCisSameAs<0,2>, SDTCisFP<0>, SDTCisPtrTy<1>
]>;
-def SIatomic_fadd : SDNode<"AMDGPUISD::ATOMIC_LOAD_FADD", SDTAtomic2_f32,
- [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
->;
-
def SIatomic_fmin : SDNode<"AMDGPUISD::ATOMIC_LOAD_FMIN", SDTAtomic2_f32,
[SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
>;
@@ -69,6 +69,13 @@ def SIatomic_fmax : SDNode<"AMDGPUISD::ATOMIC_LOAD_FMAX", SDTAtomic2_f32,
[SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
>;
+// load_d16_{lo|hi} ptr, tied_input
+def SIload_d16 : SDTypeProfile<1, 2, [
+ SDTCisPtrTy<1>,
+ SDTCisSameAs<0, 2>
+]>;
+
+
def SDTtbuffer_load : SDTypeProfile<1, 8,
[ // vdata
SDTCisVT<1, v4i32>, // rsrc
@@ -101,9 +108,6 @@ def SDTtbuffer_store : SDTypeProfile<0, 9,
def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT", SDTtbuffer_store,
[SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
-def SItbuffer_store_x3 : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT_X3",
- SDTtbuffer_store,
- [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
def SItbuffer_store_d16 : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT_D16",
SDTtbuffer_store,
[SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
@@ -120,6 +124,14 @@ def SDTBufferLoad : SDTypeProfile<1, 7,
def SIbuffer_load : SDNode <"AMDGPUISD::BUFFER_LOAD", SDTBufferLoad,
[SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
+def SIbuffer_load_ubyte : SDNode <"AMDGPUISD::BUFFER_LOAD_UBYTE", SDTBufferLoad,
+ [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
+def SIbuffer_load_ushort : SDNode <"AMDGPUISD::BUFFER_LOAD_USHORT", SDTBufferLoad,
+ [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
+def SIbuffer_load_byte : SDNode <"AMDGPUISD::BUFFER_LOAD_BYTE", SDTBufferLoad,
+ [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
+def SIbuffer_load_short: SDNode <"AMDGPUISD::BUFFER_LOAD_SHORT", SDTBufferLoad,
+ [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
def SIbuffer_load_format : SDNode <"AMDGPUISD::BUFFER_LOAD_FORMAT", SDTBufferLoad,
[SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
def SIbuffer_load_format_d16 : SDNode <"AMDGPUISD::BUFFER_LOAD_FORMAT_D16",
@@ -138,6 +150,12 @@ def SDTBufferStore : SDTypeProfile<0, 8,
def SIbuffer_store : SDNode <"AMDGPUISD::BUFFER_STORE", SDTBufferStore,
[SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
+def SIbuffer_store_byte: SDNode <"AMDGPUISD::BUFFER_STORE_BYTE",
+ SDTBufferStore,
+ [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
+def SIbuffer_store_short : SDNode <"AMDGPUISD::BUFFER_STORE_SHORT",
+ SDTBufferStore,
+ [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
def SIbuffer_store_format : SDNode <"AMDGPUISD::BUFFER_STORE_FORMAT",
SDTBufferStore,
[SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
@@ -147,9 +165,7 @@ def SIbuffer_store_format_d16 : SDNode <"AMDGPUISD::BUFFER_STORE_FORMAT_D16",
class SDBufferAtomic<string opcode> : SDNode <opcode,
SDTypeProfile<1, 8,
- [SDTCisVT<0, i32>, // dst
- SDTCisVT<1, i32>, // vdata
- SDTCisVT<2, v4i32>, // rsrc
+ [SDTCisVT<2, v4i32>, // rsrc
SDTCisVT<3, i32>, // vindex(VGPR)
SDTCisVT<4, i32>, // voffset(VGPR)
SDTCisVT<5, i32>, // soffset(SGPR)
@@ -159,6 +175,19 @@ class SDBufferAtomic<string opcode> : SDNode <opcode,
[SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore]
>;
+class SDBufferAtomicNoRtn<string opcode, ValueType ty> : SDNode <opcode,
+ SDTypeProfile<0, 8,
+ [SDTCisVT<0, ty>, // vdata
+ SDTCisVT<1, v4i32>, // rsrc
+ SDTCisVT<2, i32>, // vindex(VGPR)
+ SDTCisVT<3, i32>, // voffset(VGPR)
+ SDTCisVT<4, i32>, // soffset(SGPR)
+ SDTCisVT<5, i32>, // offset(imm)
+ SDTCisVT<6, i32>, // cachepolicy(imm)
+ SDTCisVT<7, i1>]>, // idxen(imm)
+ [SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore]
+>;
+
def SIbuffer_atomic_swap : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_SWAP">;
def SIbuffer_atomic_add : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_ADD">;
def SIbuffer_atomic_sub : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_SUB">;
@@ -169,6 +198,8 @@ def SIbuffer_atomic_umax : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_UMAX">;
def SIbuffer_atomic_and : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_AND">;
def SIbuffer_atomic_or : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_OR">;
def SIbuffer_atomic_xor : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_XOR">;
+def SIbuffer_atomic_fadd : SDBufferAtomicNoRtn <"AMDGPUISD::BUFFER_ATOMIC_FADD", f32>;
+def SIbuffer_atomic_pk_fadd : SDBufferAtomicNoRtn <"AMDGPUISD::BUFFER_ATOMIC_PK_FADD", v2f16>;
def SIbuffer_atomic_cmpswap : SDNode <"AMDGPUISD::BUFFER_ATOMIC_CMPSWAP",
SDTypeProfile<1, 9,
@@ -185,10 +216,54 @@ def SIbuffer_atomic_cmpswap : SDNode <"AMDGPUISD::BUFFER_ATOMIC_CMPSWAP",
[SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore]
>;
+class SDGlobalAtomicNoRtn<string opcode, ValueType ty> : SDNode <opcode,
+ SDTypeProfile<0, 2,
+ [SDTCisPtrTy<0>, // vaddr
+ SDTCisVT<1, ty>]>, // vdata
+ [SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore]
+>;
+
+def SIglobal_atomic_fadd : SDGlobalAtomicNoRtn <"AMDGPUISD::ATOMIC_FADD", f32>;
+def SIglobal_atomic_pk_fadd : SDGlobalAtomicNoRtn <"AMDGPUISD::ATOMIC_PK_FADD", v2f16>;
+
def SIpc_add_rel_offset : SDNode<"AMDGPUISD::PC_ADD_REL_OFFSET",
SDTypeProfile<1, 2, [SDTCisVT<0, iPTR>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>
>;
+def SIlds : SDNode<"AMDGPUISD::LDS",
+ SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisSameAs<0,1>]>
+>;
+
+def SIload_d16_lo : SDNode<"AMDGPUISD::LOAD_D16_LO",
+ SIload_d16,
+ [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]
+>;
+
+def SIload_d16_lo_u8 : SDNode<"AMDGPUISD::LOAD_D16_LO_U8",
+ SIload_d16,
+ [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]
+>;
+
+def SIload_d16_lo_i8 : SDNode<"AMDGPUISD::LOAD_D16_LO_I8",
+ SIload_d16,
+ [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]
+>;
+
+def SIload_d16_hi : SDNode<"AMDGPUISD::LOAD_D16_HI",
+ SIload_d16,
+ [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]
+>;
+
+def SIload_d16_hi_u8 : SDNode<"AMDGPUISD::LOAD_D16_HI_U8",
+ SIload_d16,
+ [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]
+>;
+
+def SIload_d16_hi_i8 : SDNode<"AMDGPUISD::LOAD_D16_HI_I8",
+ SIload_d16,
+ [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]
+>;
+
//===----------------------------------------------------------------------===//
// ValueType helpers
//===----------------------------------------------------------------------===//
@@ -201,7 +276,8 @@ class isFloatType<ValueType SrcVT> {
!if(!eq(SrcVT.Value, f32.Value), 1,
!if(!eq(SrcVT.Value, f64.Value), 1,
!if(!eq(SrcVT.Value, v2f16.Value), 1,
- 0))));
+ !if(!eq(SrcVT.Value, v4f16.Value), 1,
+ 0)))));
}
class isIntType<ValueType SrcVT> {
@@ -215,8 +291,9 @@ class isIntType<ValueType SrcVT> {
class isPackedType<ValueType SrcVT> {
bit ret =
!if(!eq(SrcVT.Value, v2i16.Value), 1,
- !if(!eq(SrcVT.Value, v2f16.Value), 1, 0)
- );
+ !if(!eq(SrcVT.Value, v2f16.Value), 1,
+ !if(!eq(SrcVT.Value, v4f16.Value), 1, 0)
+ ));
}
//===----------------------------------------------------------------------===//
@@ -228,7 +305,7 @@ defm atomic_dec_global : global_binary_atomic_op<SIatomic_dec>;
def atomic_inc_local : local_binary_atomic_op<SIatomic_inc>;
def atomic_dec_local : local_binary_atomic_op<SIatomic_dec>;
-def atomic_load_fadd_local : local_binary_atomic_op<SIatomic_fadd>;
+def atomic_load_fadd_local : local_binary_atomic_op<atomic_load_fadd>;
def atomic_load_fmin_local : local_binary_atomic_op<SIatomic_fmin>;
def atomic_load_fmax_local : local_binary_atomic_op<SIatomic_fmax>;
@@ -250,13 +327,13 @@ def AMDGPUatomic_ld_glue : SDNode <"ISD::ATOMIC_LOAD", SDTAtomicLoad,
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand, SDNPInGlue]
>;
-def unindexedload_glue : PatFrag <(ops node:$ptr), (AMDGPUld_glue node:$ptr), [{
- return cast<LoadSDNode>(N)->getAddressingMode() == ISD::UNINDEXED;
-}]>;
+def unindexedload_glue : PatFrag <(ops node:$ptr), (AMDGPUld_glue node:$ptr)> {
+ let IsUnindexed = 1;
+}
-def load_glue : PatFrag <(ops node:$ptr), (unindexedload_glue node:$ptr), [{
- return cast<LoadSDNode>(N)->getExtensionType() == ISD::NON_EXTLOAD;
-}]>;
+def load_glue : PatFrag <(ops node:$ptr), (unindexedload_glue node:$ptr)> {
+ let IsNonExtLoad = 1;
+}
def atomic_load_32_glue : PatFrag<(ops node:$ptr),
(AMDGPUatomic_ld_glue node:$ptr)> {
@@ -270,35 +347,49 @@ def atomic_load_64_glue : PatFrag<(ops node:$ptr),
let MemoryVT = i64;
}
-def extload_glue : PatFrag<(ops node:$ptr), (load_glue node:$ptr), [{
- return cast<LoadSDNode>(N)->getExtensionType() == ISD::EXTLOAD;
-}]>;
+def extload_glue : PatFrag<(ops node:$ptr), (load_glue node:$ptr)> {
+ let IsLoad = 1;
+ let IsAnyExtLoad = 1;
+}
def sextload_glue : PatFrag<(ops node:$ptr), (unindexedload_glue node:$ptr), [{
return cast<LoadSDNode>(N)->getExtensionType() == ISD::SEXTLOAD;
}]>;
-def zextload_glue : PatFrag<(ops node:$ptr), (unindexedload_glue node:$ptr), [{
- return cast<LoadSDNode>(N)->getExtensionType() == ISD::ZEXTLOAD;
-}]>;
+def zextload_glue : PatFrag<(ops node:$ptr), (unindexedload_glue node:$ptr)> {
+ let IsLoad = 1;
+ let IsZeroExtLoad = 1;
+}
-def az_extload_glue : AZExtLoadBase <unindexedload_glue>;
+def extloadi8_glue : PatFrag<(ops node:$ptr), (extload_glue node:$ptr)> {
+ let IsLoad = 1;
+ let MemoryVT = i8;
+}
-def az_extloadi8_glue : PatFrag<(ops node:$ptr), (az_extload_glue node:$ptr), [{
- return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8;
-}]>;
+def zextloadi8_glue : PatFrag<(ops node:$ptr), (zextload_glue node:$ptr)> {
+ let IsLoad = 1;
+ let MemoryVT = i8;
+}
-def az_extloadi16_glue : PatFrag<(ops node:$ptr), (az_extload_glue node:$ptr), [{
- return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16;
-}]>;
+def extloadi16_glue : PatFrag<(ops node:$ptr), (extload_glue node:$ptr)> {
+ let IsLoad = 1;
+ let MemoryVT = i16;
+}
-def sextloadi8_glue : PatFrag<(ops node:$ptr), (sextload_glue node:$ptr), [{
- return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8;
-}]>;
+def zextloadi16_glue : PatFrag<(ops node:$ptr), (zextload_glue node:$ptr)> {
+ let IsLoad = 1;
+ let MemoryVT = i16;
+}
-def sextloadi16_glue : PatFrag<(ops node:$ptr), (sextload_glue node:$ptr), [{
- return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16;
-}]>;
+def sextloadi8_glue : PatFrag<(ops node:$ptr), (sextload_glue node:$ptr)> {
+ let IsLoad = 1;
+ let MemoryVT = i8;
+}
+
+def sextloadi16_glue : PatFrag<(ops node:$ptr), (sextload_glue node:$ptr)> {
+ let IsLoad = 1;
+ let MemoryVT = i16;
+}
def load_glue_align8 : Aligned8Bytes <
(ops node:$ptr), (load_glue node:$ptr)
@@ -311,8 +402,10 @@ def load_glue_align16 : Aligned16Bytes <
def load_local_m0 : LoadFrag<load_glue>, LocalAddress;
def sextloadi8_local_m0 : LoadFrag<sextloadi8_glue>, LocalAddress;
def sextloadi16_local_m0 : LoadFrag<sextloadi16_glue>, LocalAddress;
-def az_extloadi8_local_m0 : LoadFrag<az_extloadi8_glue>, LocalAddress;
-def az_extloadi16_local_m0 : LoadFrag<az_extloadi16_glue>, LocalAddress;
+def extloadi8_local_m0 : LoadFrag<extloadi8_glue>, LocalAddress;
+def zextloadi8_local_m0 : LoadFrag<zextloadi8_glue>, LocalAddress;
+def extloadi16_local_m0 : LoadFrag<extloadi16_glue>, LocalAddress;
+def zextloadi16_local_m0 : LoadFrag<zextloadi16_glue>, LocalAddress;
def load_align8_local_m0 : LoadFrag <load_glue_align8>, LocalAddress;
def load_align16_local_m0 : LoadFrag <load_glue_align16>, LocalAddress;
def atomic_load_32_local_m0 : LoadFrag<atomic_load_32_glue>, LocalAddress;
@@ -386,6 +479,51 @@ def si_setcc_uniform : PatFrag <
return true;
}]>;
+//===----------------------------------------------------------------------===//
+// SDNodes PatFrags for d16 loads
+//===----------------------------------------------------------------------===//
+
+class LoadD16Frag <SDPatternOperator op> : PatFrag<(ops node:$ptr, node:$tied_in), (op node:$ptr, node:$tied_in)>;
+class LocalLoadD16 <SDPatternOperator op> : LoadD16Frag <op>, LocalAddress;
+class GlobalLoadD16 <SDPatternOperator op> : LoadD16Frag <op>, GlobalLoadAddress;
+class PrivateLoadD16 <SDPatternOperator op> : LoadD16Frag <op>, PrivateAddress;
+class FlatLoadD16 <SDPatternOperator op> : LoadD16Frag <op>, FlatLoadAddress;
+
+def load_d16_hi_local : LocalLoadD16 <SIload_d16_hi>;
+def az_extloadi8_d16_hi_local : LocalLoadD16 <SIload_d16_hi_u8>;
+def sextloadi8_d16_hi_local : LocalLoadD16 <SIload_d16_hi_i8>;
+
+def load_d16_hi_global : GlobalLoadD16 <SIload_d16_hi>;
+def az_extloadi8_d16_hi_global : GlobalLoadD16 <SIload_d16_hi_u8>;
+def sextloadi8_d16_hi_global : GlobalLoadD16 <SIload_d16_hi_i8>;
+
+def load_d16_hi_private : PrivateLoadD16 <SIload_d16_hi>;
+def az_extloadi8_d16_hi_private : PrivateLoadD16 <SIload_d16_hi_u8>;
+def sextloadi8_d16_hi_private : PrivateLoadD16 <SIload_d16_hi_i8>;
+
+def load_d16_hi_flat : FlatLoadD16 <SIload_d16_hi>;
+def az_extloadi8_d16_hi_flat : FlatLoadD16 <SIload_d16_hi_u8>;
+def sextloadi8_d16_hi_flat : FlatLoadD16 <SIload_d16_hi_i8>;
+
+
+def load_d16_lo_local : LocalLoadD16 <SIload_d16_lo>;
+def az_extloadi8_d16_lo_local : LocalLoadD16 <SIload_d16_lo_u8>;
+def sextloadi8_d16_lo_local : LocalLoadD16 <SIload_d16_lo_i8>;
+
+def load_d16_lo_global : GlobalLoadD16 <SIload_d16_lo>;
+def az_extloadi8_d16_lo_global : GlobalLoadD16 <SIload_d16_lo_u8>;
+def sextloadi8_d16_lo_global : GlobalLoadD16 <SIload_d16_lo_i8>;
+
+def load_d16_lo_private : PrivateLoadD16 <SIload_d16_lo>;
+def az_extloadi8_d16_lo_private : PrivateLoadD16 <SIload_d16_lo_u8>;
+def sextloadi8_d16_lo_private : PrivateLoadD16 <SIload_d16_lo_i8>;
+
+def load_d16_lo_flat : FlatLoadD16 <SIload_d16_lo>;
+def az_extloadi8_d16_lo_flat : FlatLoadD16 <SIload_d16_lo_u8>;
+def sextloadi8_d16_lo_flat : FlatLoadD16 <SIload_d16_lo_i8>;
+
+
+
def lshr_rev : PatFrag <
(ops node:$src1, node:$src0),
(srl $src0, $src1)
@@ -410,6 +548,7 @@ multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0,
>;
def _local_m0 : local_binary_atomic_op <!cast<SDNode>(NAME#"_glue")>;
+ def _region_m0 : region_binary_atomic_op <!cast<SDNode>(NAME#"_glue")>;
}
defm atomic_load_add : SIAtomicM0Glue2 <"LOAD_ADD">;
@@ -424,7 +563,7 @@ defm atomic_load_xor : SIAtomicM0Glue2 <"LOAD_XOR">;
defm atomic_load_umin : SIAtomicM0Glue2 <"LOAD_UMIN">;
defm atomic_load_umax : SIAtomicM0Glue2 <"LOAD_UMAX">;
defm atomic_swap : SIAtomicM0Glue2 <"SWAP">;
-defm atomic_load_fadd : SIAtomicM0Glue2 <"LOAD_FADD", 1, SDTAtomic2_f32>;
+defm atomic_load_fadd : SIAtomicM0Glue2 <"LOAD_FADD", 0, SDTAtomic2_f32>;
defm atomic_load_fmin : SIAtomicM0Glue2 <"LOAD_FMIN", 1, SDTAtomic2_f32>;
defm atomic_load_fmax : SIAtomicM0Glue2 <"LOAD_FMAX", 1, SDTAtomic2_f32>;
@@ -433,6 +572,7 @@ def atomic_cmp_swap_glue : SDNode <"ISD::ATOMIC_CMP_SWAP", SDTAtomic3,
>;
def atomic_cmp_swap_local_m0 : AtomicCmpSwapLocal<atomic_cmp_swap_glue>;
+def atomic_cmp_swap_region_m0 : AtomicCmpSwapRegion<atomic_cmp_swap_glue>;
def as_i1imm : SDNodeXForm<imm, [{
@@ -482,8 +622,12 @@ class bitextract_imm<int bitnum> : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(Bit, SDLoc(N), MVT::i1);
}]>;
-def SIMM16bit : PatLeaf <(imm),
- [{return isInt<16>(N->getSExtValue());}]
+def SIMM16bit : ImmLeaf <i32,
+ [{return isInt<16>(Imm);}]
+>;
+
+def UIMM16bit : ImmLeaf <i32,
+ [{return isUInt<16>(Imm); }]
>;
class InlineImm <ValueType vt> : PatLeaf <(vt imm), [{
@@ -515,6 +659,22 @@ def ShiftAmt32Imm : PatLeaf <(imm), [{
return N->getZExtValue() < 32;
}]>;
+def getNegV2I16Imm : SDNodeXForm<build_vector, [{
+ return SDValue(packNegConstantV2I16(N, *CurDAG), 0);
+}]>;
+
+def NegSubInlineConstV216 : PatLeaf<(build_vector), [{
+ assert(N->getNumOperands() == 2);
+ assert(N->getOperand(0).getValueType().getSizeInBits() == 16);
+ SDValue Src0 = N->getOperand(0);
+ SDValue Src1 = N->getOperand(1);
+ if (Src0 == Src1)
+ return isNegInlineImmediate(Src0.getNode());
+
+ return (isNullConstantOrUndef(Src0) && isNegInlineImmediate(Src1.getNode())) ||
+ (isNullConstantOrUndef(Src1) && isNegInlineImmediate(Src0.getNode()));
+}], getNegV2I16Imm>;
+
//===----------------------------------------------------------------------===//
// Custom Operands
//===----------------------------------------------------------------------===//
@@ -588,6 +748,14 @@ def SwizzleMatchClass : AsmOperandClass {
let IsOptional = 1;
}
+def EndpgmMatchClass : AsmOperandClass {
+ let Name = "EndpgmImm";
+ let PredicateMethod = "isEndpgm";
+ let ParserMethod = "parseEndpgmOp";
+ let RenderMethod = "addImmOperands";
+ let IsOptional = 1;
+}
+
def ExpTgtMatchClass : AsmOperandClass {
let Name = "ExpTgt";
let PredicateMethod = "isExpTgt";
@@ -605,6 +773,11 @@ def SwizzleImm : Operand<i16> {
let ParserMatchClass = SwizzleMatchClass;
}
+def EndpgmImm : Operand<i16> {
+ let PrintMethod = "printEndpgm";
+ let ParserMatchClass = EndpgmMatchClass;
+}
+
def SWaitMatchClass : AsmOperandClass {
let Name = "SWaitCnt";
let RenderMethod = "addImmOperands";
@@ -619,11 +792,41 @@ def VReg32OrOffClass : AsmOperandClass {
def WAIT_FLAG : Operand <i32> {
let ParserMatchClass = SWaitMatchClass;
let PrintMethod = "printWaitFlag";
+ let OperandType = "OPERAND_IMMEDIATE";
}
include "SIInstrFormats.td"
include "VIInstrFormats.td"
+def BoolReg : AsmOperandClass {
+ let Name = "BoolReg";
+ let ParserMethod = "parseBoolReg";
+ let RenderMethod = "addRegOperands";
+}
+
+class BoolRC : RegisterOperand<SReg_1> {
+ let ParserMatchClass = BoolReg;
+ let DecoderMethod = "decodeBoolReg";
+}
+
+def SSrc_i1 : RegisterOperand<SReg_1_XEXEC> {
+ let ParserMatchClass = BoolReg;
+ let DecoderMethod = "decodeBoolReg";
+}
+
+def VOPDstS64orS32 : BoolRC {
+ let PrintMethod = "printVOPDst";
+}
+
+// SCSrc_i1 is the operand for pseudo instructions only.
+// Boolean immediates shall not be exposed to codegen instructions.
+def SCSrc_i1 : RegisterOperand<SReg_1_XEXEC> {
+ let OperandNamespace = "AMDGPU";
+ let OperandType = "OPERAND_REG_IMM_INT32";
+ let ParserMatchClass = BoolReg;
+ let DecoderMethod = "decodeBoolReg";
+}
+
// ===----------------------------------------------------------------------===//
// ExpSrc* Special cases for exp src operands which are printed as
// "off" depending on en operand.
@@ -662,11 +865,12 @@ def SDWASrc_i16 : SDWASrc<i16>;
def SDWASrc_f32 : SDWASrc<f32>;
def SDWASrc_f16 : SDWASrc<f16>;
-def SDWAVopcDst : VOPDstOperand<SReg_64> {
+def SDWAVopcDst : BoolRC {
let OperandNamespace = "AMDGPU";
let OperandType = "OPERAND_SDWA_VOPC_DST";
let EncoderMethod = "getSDWAVopcDstEncoding";
let DecoderMethod = "decodeSDWAVopcDst";
+ let PrintMethod = "printVOPDst";
}
class NamedMatchClass<string CName, bit Optional = 1> : AsmOperandClass {
@@ -688,21 +892,11 @@ class NamedOperandU8<string Name, AsmOperandClass MatchClass> : Operand<i8> {
let ParserMatchClass = MatchClass;
}
-class NamedOperandU12<string Name, AsmOperandClass MatchClass> : Operand<i16> {
- let PrintMethod = "print"#Name;
- let ParserMatchClass = MatchClass;
-}
-
class NamedOperandU16<string Name, AsmOperandClass MatchClass> : Operand<i16> {
let PrintMethod = "print"#Name;
let ParserMatchClass = MatchClass;
}
-class NamedOperandS13<string Name, AsmOperandClass MatchClass> : Operand<i16> {
- let PrintMethod = "print"#Name;
- let ParserMatchClass = MatchClass;
-}
-
class NamedOperandU32<string Name, AsmOperandClass MatchClass> : Operand<i32> {
let PrintMethod = "print"#Name;
let ParserMatchClass = MatchClass;
@@ -720,8 +914,7 @@ def offen : NamedOperandBit<"Offen", NamedMatchClass<"Offen">>;
def idxen : NamedOperandBit<"Idxen", NamedMatchClass<"Idxen">>;
def addr64 : NamedOperandBit<"Addr64", NamedMatchClass<"Addr64">>;
-def offset_u12 : NamedOperandU12<"Offset", NamedMatchClass<"OffsetU12">>;
-def offset_s13 : NamedOperandS13<"OffsetS13", NamedMatchClass<"OffsetS13">>;
+def flat_offset : NamedOperandU16<"FlatOffset", NamedMatchClass<"FlatOffset">>;
def offset : NamedOperandU16<"Offset", NamedMatchClass<"Offset">>;
def offset0 : NamedOperandU8<"Offset0", NamedMatchClass<"Offset0">>;
def offset1 : NamedOperandU8<"Offset1", NamedMatchClass<"Offset1">>;
@@ -732,6 +925,7 @@ def omod : NamedOperandU32<"OModSI", NamedMatchClass<"OModSI">>;
def clampmod : NamedOperandBit<"ClampSI", NamedMatchClass<"ClampSI">>;
def highmod : NamedOperandBit<"High", NamedMatchClass<"High">>;
+def DLC : NamedOperandBit<"DLC", NamedMatchClass<"DLC">>;
def GLC : NamedOperandBit<"GLC", NamedMatchClass<"GLC">>;
def SLC : NamedOperandBit<"SLC", NamedMatchClass<"SLC">>;
def TFE : NamedOperandBit<"TFE", NamedMatchClass<"TFE">>;
@@ -746,11 +940,15 @@ def exp_vm : NamedOperandBit<"ExpVM", NamedMatchClass<"ExpVM">>;
def FORMAT : NamedOperandU8<"FORMAT", NamedMatchClass<"FORMAT">>;
def DMask : NamedOperandU16<"DMask", NamedMatchClass<"DMask">>;
+def Dim : NamedOperandU8<"Dim", NamedMatchClass<"Dim", 0>>;
+
+def dpp8 : NamedOperandU32<"DPP8", NamedMatchClass<"DPP8", 0>>;
def dpp_ctrl : NamedOperandU32<"DPPCtrl", NamedMatchClass<"DPPCtrl", 0>>;
def row_mask : NamedOperandU32<"RowMask", NamedMatchClass<"RowMask">>;
def bank_mask : NamedOperandU32<"BankMask", NamedMatchClass<"BankMask">>;
def bound_ctrl : NamedOperandBit<"BoundCtrl", NamedMatchClass<"BoundCtrl">>;
+def FI : NamedOperandU32<"FI", NamedMatchClass<"FI">>;
def dst_sel : NamedOperandU32<"SDWADstSel", NamedMatchClass<"SDWADstSel">>;
def src0_sel : NamedOperandU32<"SDWASrc0Sel", NamedMatchClass<"SDWASrc0Sel">>;
@@ -762,6 +960,10 @@ def op_sel_hi : NamedOperandU32Default0<"OpSelHi", NamedMatchClass<"OpSelHi">>;
def neg_lo : NamedOperandU32Default0<"NegLo", NamedMatchClass<"NegLo">>;
def neg_hi : NamedOperandU32Default0<"NegHi", NamedMatchClass<"NegHi">>;
+def blgp : NamedOperandU32<"BLGP", NamedMatchClass<"BLGP">>;
+def cbsz : NamedOperandU32<"CBSZ", NamedMatchClass<"CBSZ">>;
+def abid : NamedOperandU32<"ABID", NamedMatchClass<"ABID">>;
+
def hwreg : NamedOperandU16<"Hwreg", NamedMatchClass<"Hwreg", 0>>;
def exp_tgt : NamedOperandU8<"ExpTgt", NamedMatchClass<"ExpTgt", 0>> {
@@ -793,9 +995,6 @@ def f32kimm : kimmOperand<i32>;
def KImmFP16MatchClass : KImmMatchClass<16>;
def f16kimm : kimmOperand<i16>;
-
-def VOPDstS64 : VOPDstOperand <SReg_64>;
-
class FPInputModsMatchClass <int opSize> : AsmOperandClass {
let Name = "RegOrImmWithFP"#opSize#"InputMods";
let ParserMethod = "parseRegOrImmWithFPInputMods";
@@ -863,7 +1062,7 @@ def FP32SDWAInputMods : FPSDWAInputMods<FP32SDWAInputModsMatchClass>;
def FPVRegInputModsMatchClass : AsmOperandClass {
let Name = "VRegWithFPInputMods";
let ParserMethod = "parseRegWithFPInputMods";
- let PredicateMethod = "isVReg";
+ let PredicateMethod = "isVReg32";
}
def FPVRegInputMods : InputMods <FPVRegInputModsMatchClass> {
@@ -890,7 +1089,7 @@ def Int32SDWAInputMods : IntSDWAInputMods<Int32SDWAInputModsMatchClass>;
def IntVRegInputModsMatchClass : AsmOperandClass {
let Name = "VRegWithIntInputMods";
let ParserMethod = "parseRegWithIntInputMods";
- let PredicateMethod = "isVReg";
+ let PredicateMethod = "isVReg32";
}
def IntVRegInputMods : InputMods <IntVRegInputModsMatchClass> {
@@ -941,6 +1140,8 @@ def VOP3Mods : ComplexPattern<untyped, 2, "SelectVOP3Mods">;
def VOP3NoMods : ComplexPattern<untyped, 1, "SelectVOP3NoMods">;
// VOP3Mods, but the input source is known to never be NaN.
def VOP3Mods_nnan : ComplexPattern<fAny, 2, "SelectVOP3Mods_NNaN">;
+// VOP3Mods, but only allowed for f32 operands.
+def VOP3Mods_f32 : ComplexPattern<fAny, 2, "SelectVOP3Mods_f32">;
def VOP3OMods : ComplexPattern<untyped, 3, "SelectVOP3OMods">;
@@ -995,6 +1196,31 @@ def TRAPID{
int LLVM_DEBUG_TRAP = 3;
}
+def HWREG {
+ int MODE = 1;
+ int STATUS = 2;
+ int TRAPSTS = 3;
+ int HW_ID = 4;
+ int GPR_ALLOC = 5;
+ int LDS_ALLOC = 6;
+ int IB_STS = 7;
+ int MEM_BASES = 15;
+ int TBA_LO = 16;
+ int TBA_HI = 17;
+ int TMA_LO = 18;
+ int TMA_HI = 19;
+ int FLAT_SCR_LO = 20;
+ int FLAT_SCR_HI = 21;
+ int XNACK_MASK = 22;
+ int POPS_PACKER = 25;
+}
+
+class getHwRegImm<int Reg, int Offset = 0, int Size = 32> {
+ int ret = !or(Reg,
+ !or(!shl(Offset, 6),
+ !shl(!add(Size, -1), 11)));
+}
+
//===----------------------------------------------------------------------===//
//
// SI Instruction multiclass helpers.
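getHwRegImm above packs a hardware register id into bits [5:0], the starting bit offset into [10:6], and the field size minus one into [15:11], exactly as the !or/!shl expression reads. A standalone C++ sketch of the same packing, with example values taken from the HWREG list above (the helper name hwRegImm is invented for illustration):

#include <cassert>
#include <cstdint>

// id | (offset << 6) | ((size - 1) << 11), mirroring getHwRegImm.
constexpr uint32_t hwRegImm(uint32_t Reg, uint32_t Offset = 0,
                            uint32_t Size = 32) {
  return Reg | (Offset << 6) | ((Size - 1) << 11);
}

int main() {
  // FLAT_SCR_LO (20), whole 32-bit register: 20 | (31 << 11) == 0xF814.
  static_assert(hwRegImm(20) == 0xF814, "default offset/size");
  // TRAPSTS (3), 8 bits starting at bit 4.
  assert(hwRegImm(3, 4, 8) == (3u | (4u << 6) | (7u << 11)));
  return 0;
}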
@@ -1045,18 +1271,26 @@ multiclass EXP_m<bit done, SDPatternOperator node> {
def _si : EXP_Helper<done>,
SIMCInstr <"exp"#!if(done, "_done", ""), SIEncodingFamily.SI>,
EXPe {
- let AssemblerPredicates = [isSICI];
- let DecoderNamespace = "SICI";
+ let AssemblerPredicates = [isGFX6GFX7];
+ let DecoderNamespace = "GFX6GFX7";
let DisableDecoder = DisableSIDecoder;
}
def _vi : EXP_Helper<done>,
SIMCInstr <"exp"#!if(done, "_done", ""), SIEncodingFamily.VI>,
EXPe_vi {
- let AssemblerPredicates = [isVI];
- let DecoderNamespace = "VI";
+ let AssemblerPredicates = [isGFX8GFX9];
+ let DecoderNamespace = "GFX8";
let DisableDecoder = DisableVIDecoder;
}
+
+ def _gfx10 : EXP_Helper<done>,
+ SIMCInstr <"exp"#!if(done, "_done", ""), SIEncodingFamily.GFX10>,
+ EXPe {
+ let AssemblerPredicates = [isGFX10Plus];
+ let DecoderNamespace = "GFX10";
+ let DisableDecoder = DisableSIDecoder;
+ }
}
}
}
@@ -1080,7 +1314,19 @@ class getVALUDstForVT<ValueType VT> {
!if(!eq(VT.Size, 128), VOPDstOperand<VReg_128>,
!if(!eq(VT.Size, 64), VOPDstOperand<VReg_64>,
!if(!eq(VT.Size, 16), VOPDstOperand<VGPR_32>,
- VOPDstOperand<SReg_64>)))); // else VT == i1
+ VOPDstS64orS32)))); // else VT == i1
+}
+
+// Returns true if VT is floating point.
+class getIsFP<ValueType VT> {
+ bit ret = !if(!eq(VT.Value, f16.Value), 1,
+ !if(!eq(VT.Value, v2f16.Value), 1,
+ !if(!eq(VT.Value, v4f16.Value), 1,
+ !if(!eq(VT.Value, f32.Value), 1,
+ !if(!eq(VT.Value, v2f32.Value), 1,
+ !if(!eq(VT.Value, f64.Value), 1,
+ !if(!eq(VT.Value, v2f64.Value), 1,
+ 0)))))));
}
// Returns the register class to use for the destination of VOP[12C]
@@ -1094,11 +1340,7 @@ class getSDWADstForVT<ValueType VT> {
// Returns the register class to use for source 0 of VOP[12C]
// instructions for the given VT.
class getVOPSrc0ForVT<ValueType VT> {
- bit isFP = !if(!eq(VT.Value, f16.Value), 1,
- !if(!eq(VT.Value, v2f16.Value), 1,
- !if(!eq(VT.Value, f32.Value), 1,
- !if(!eq(VT.Value, f64.Value), 1,
- 0))));
+ bit isFP = getIsFP<VT>.ret;
RegisterOperand ret =
!if(isFP,
@@ -1107,8 +1349,11 @@ class getVOPSrc0ForVT<ValueType VT> {
!if(!eq(VT.Value, f16.Value),
VSrc_f16,
!if(!eq(VT.Value, v2f16.Value),
- VCSrc_v2f16,
- VSrc_f32
+ VSrc_v2f16,
+ !if(!eq(VT.Value, v4f16.Value),
+ AVSrc_64,
+ VSrc_f32
+ )
)
)
),
@@ -1117,7 +1362,7 @@ class getVOPSrc0ForVT<ValueType VT> {
!if(!eq(VT.Value, i16.Value),
VSrc_b16,
!if(!eq(VT.Value, v2i16.Value),
- VCSrc_v2b16,
+ VSrc_v2b16,
VSrc_b32
)
)
@@ -1132,9 +1377,7 @@ class getVregSrcForVT<ValueType VT> {
}
class getSDWASrcForVT <ValueType VT> {
- bit isFP = !if(!eq(VT.Value, f16.Value), 1,
- !if(!eq(VT.Value, f32.Value), 1,
- 0));
+ bit isFP = getIsFP<VT>.ret;
RegisterOperand retFlt = !if(!eq(VT.Size, 16), SDWASrc_f16, SDWASrc_f32);
RegisterOperand retInt = !if(!eq(VT.Size, 16), SDWASrc_i16, SDWASrc_i32);
RegisterOperand ret = !if(isFP, retFlt, retInt);
@@ -1143,33 +1386,32 @@ class getSDWASrcForVT <ValueType VT> {
// Returns the register class to use for sources of VOP3 instructions for the
// given VT.
class getVOP3SrcForVT<ValueType VT> {
- bit isFP = !if(!eq(VT.Value, f16.Value), 1,
- !if(!eq(VT.Value, v2f16.Value), 1,
- !if(!eq(VT.Value, f32.Value), 1,
- !if(!eq(VT.Value, f64.Value), 1,
- 0))));
+ bit isFP = getIsFP<VT>.ret;
RegisterOperand ret =
!if(!eq(VT.Size, 128),
VSrc_128,
!if(!eq(VT.Size, 64),
!if(isFP,
- VCSrc_f64,
- VCSrc_b64),
+ VSrc_f64,
+ VSrc_b64),
!if(!eq(VT.Value, i1.Value),
- SCSrc_i1,
+ SSrc_i1,
!if(isFP,
!if(!eq(VT.Value, f16.Value),
- VCSrc_f16,
+ VSrc_f16,
!if(!eq(VT.Value, v2f16.Value),
- VCSrc_v2f16,
- VCSrc_f32
+ VSrc_v2f16,
+ !if(!eq(VT.Value, v4f16.Value),
+ AVSrc_64,
+ VSrc_f32
+ )
)
),
!if(!eq(VT.Value, i16.Value),
- VCSrc_b16,
+ VSrc_b16,
!if(!eq(VT.Value, v2i16.Value),
- VCSrc_v2b16,
- VCSrc_b32
+ VSrc_v2b16,
+ VSrc_b32
)
)
)
@@ -1190,11 +1432,8 @@ class isModifierType<ValueType SrcVT> {
}
// Return type of input modifiers operand for specified input operand
-class getSrcMod <ValueType VT> {
- bit isFP = !if(!eq(VT.Value, f16.Value), 1,
- !if(!eq(VT.Value, f32.Value), 1,
- !if(!eq(VT.Value, f64.Value), 1,
- 0)));
+class getSrcMod <ValueType VT, bit EnableF32SrcMods> {
+ bit isFP = getIsFP<VT>.ret;
bit isPacked = isPackedType<VT>.ret;
Operand ret = !if(!eq(VT.Size, 64),
!if(isFP, FP64InputMods, Int64InputMods),
@@ -1203,7 +1442,7 @@ class getSrcMod <ValueType VT> {
FP16InputMods,
FP32InputMods
),
- Int32InputMods)
+ !if(EnableF32SrcMods, FP32InputMods, Int32InputMods))
);
}
@@ -1213,10 +1452,7 @@ class getOpSelMod <ValueType VT> {
// Return type of input modifiers operand specified input operand for DPP
class getSrcModExt <ValueType VT> {
- bit isFP = !if(!eq(VT.Value, f16.Value), 1,
- !if(!eq(VT.Value, f32.Value), 1,
- !if(!eq(VT.Value, f64.Value), 1,
- 0)));
+ bit isFP = getIsFP<VT>.ret;
Operand ret = !if(isFP, FPVRegInputMods, IntVRegInputMods);
}
@@ -1238,7 +1474,7 @@ class getIns32 <RegisterOperand Src0RC, RegisterClass Src1RC, int NumSrcArgs> {
// Returns the input arguments for VOP3 instructions for the given SrcVT.
class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC,
RegisterOperand Src2RC, int NumSrcArgs,
- bit HasIntClamp, bit HasModifiers, bit HasOMod,
+ bit HasIntClamp, bit HasModifiers, bit HasSrc2Mods, bit HasOMod,
Operand Src0Mod, Operand Src1Mod, Operand Src2Mod> {
dag ret =
@@ -1276,16 +1512,33 @@ class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC,
/* endif */ )
/* NumSrcArgs == 3 */,
!if (!eq(HasModifiers, 1),
- // VOP3 with modifiers
- !if (!eq(HasOMod, 1),
- (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
- Src1Mod:$src1_modifiers, Src1RC:$src1,
- Src2Mod:$src2_modifiers, Src2RC:$src2,
- clampmod:$clamp, omod:$omod),
- (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
- Src1Mod:$src1_modifiers, Src1RC:$src1,
- Src2Mod:$src2_modifiers, Src2RC:$src2,
- clampmod:$clamp))
+ !if (!eq(HasSrc2Mods, 1),
+ // VOP3 with modifiers
+ !if (!eq(HasOMod, 1),
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ Src1Mod:$src1_modifiers, Src1RC:$src1,
+ Src2Mod:$src2_modifiers, Src2RC:$src2,
+ clampmod:$clamp, omod:$omod),
+ !if (!eq(HasIntClamp, 1),
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ Src1Mod:$src1_modifiers, Src1RC:$src1,
+ Src2Mod:$src2_modifiers, Src2RC:$src2,
+ clampmod:$clamp),
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ Src1Mod:$src1_modifiers, Src1RC:$src1,
+ Src2Mod:$src2_modifiers, Src2RC:$src2))),
+ // VOP3 with modifiers except src2
+ !if (!eq(HasOMod, 1),
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ Src1Mod:$src1_modifiers, Src1RC:$src1,
+ Src2RC:$src2, clampmod:$clamp, omod:$omod),
+ !if (!eq(HasIntClamp, 1),
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ Src1Mod:$src1_modifiers, Src1RC:$src1,
+ Src2RC:$src2, clampmod:$clamp),
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ Src1Mod:$src1_modifiers, Src1RC:$src1,
+ Src2RC:$src2))))
/* else */,
// VOP3 without modifiers
!if (!eq(HasIntClamp, 1),
@@ -1398,6 +1651,42 @@ class getInsDPP <RegisterOperand DstRC, RegisterClass Src0RC, RegisterClass Src1
/* endif */)));
}
+class getInsDPP16 <RegisterOperand DstRC, RegisterClass Src0RC, RegisterClass Src1RC,
+ int NumSrcArgs, bit HasModifiers,
+ Operand Src0Mod, Operand Src1Mod> {
+ dag ret = !con(getInsDPP<DstRC, Src0RC, Src1RC, NumSrcArgs,
+ HasModifiers, Src0Mod, Src1Mod>.ret,
+ (ins FI:$fi));
+}
+
+class getInsDPP8 <RegisterOperand DstRC, RegisterClass Src0RC, RegisterClass Src1RC,
+ int NumSrcArgs, bit HasModifiers,
+ Operand Src0Mod, Operand Src1Mod> {
+ dag ret = !if (!eq(NumSrcArgs, 0),
+ // VOP1 without input operands (V_NOP)
+ (ins dpp8:$dpp8, FI:$fi),
+ !if (!eq(NumSrcArgs, 1),
+ !if (!eq(HasModifiers, 1),
+ // VOP1_DPP with modifiers
+ (ins DstRC:$old, Src0Mod:$src0_modifiers,
+ Src0RC:$src0, dpp8:$dpp8, FI:$fi)
+ /* else */,
+ // VOP1_DPP without modifiers
+ (ins DstRC:$old, Src0RC:$src0, dpp8:$dpp8, FI:$fi)
+ /* endif */)
+ /* NumSrcArgs == 2 */,
+ !if (!eq(HasModifiers, 1),
+ // VOP2_DPP with modifiers
+ (ins DstRC:$old,
+ Src0Mod:$src0_modifiers, Src0RC:$src0,
+ Src1Mod:$src1_modifiers, Src1RC:$src1,
+ dpp8:$dpp8, FI:$fi)
+ /* else */,
+ // VOP2_DPP without modifiers
+ (ins DstRC:$old,
+ Src0RC:$src0, Src1RC:$src1, dpp8:$dpp8, FI:$fi)
+ /* endif */)));
+}
// Ins for SDWA
@@ -1556,6 +1845,26 @@ class getAsmDPP <bit HasDst, int NumSrcArgs, bit HasModifiers, ValueType DstVT =
string ret = dst#args#" $dpp_ctrl$row_mask$bank_mask$bound_ctrl";
}
+class getAsmDPP16 <bit HasDst, int NumSrcArgs, bit HasModifiers, ValueType DstVT = i32> {
+ string ret = getAsmDPP<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret#"$fi";
+}
+
+class getAsmDPP8 <bit HasDst, int NumSrcArgs, bit HasModifiers, ValueType DstVT = i32> {
+ string dst = !if(HasDst,
+ !if(!eq(DstVT.Size, 1),
+ "$sdst",
+ "$vdst"),
+ ""); // use $sdst for VOPC
+ string src0 = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,");
+ string src1 = !if(!eq(NumSrcArgs, 1), "",
+ !if(!eq(NumSrcArgs, 2), " $src1_modifiers",
+ " $src1_modifiers,"));
+ string args = !if(!eq(HasModifiers, 0),
+ getAsm32<0, NumSrcArgs, DstVT>.ret,
+ ", "#src0#src1);
+ string ret = dst#args#"$dpp8$fi";
+}
+
class getAsmSDWA <bit HasDst, int NumSrcArgs, ValueType DstVT = i32> {
string dst = !if(HasDst,
!if(!eq(DstVT.Size, 1),
@@ -1650,9 +1959,12 @@ def PatGenMode {
int Pattern = 1;
}
-class VOPProfile <list<ValueType> _ArgVT> {
+class VOPProfile <list<ValueType> _ArgVT, bit _EnableF32SrcMods = 0,
+ bit _EnableClamp = 0> {
field list<ValueType> ArgVT = _ArgVT;
+ field bit EnableF32SrcMods = _EnableF32SrcMods;
+ field bit EnableClamp = _EnableClamp;
field ValueType DstVT = ArgVT[0];
field ValueType Src0VT = ArgVT[1];
@@ -1670,9 +1982,9 @@ class VOPProfile <list<ValueType> _ArgVT> {
field RegisterClass Src1DPP = getVregSrcForVT<Src1VT>.ret;
field RegisterOperand Src0SDWA = getSDWASrcForVT<Src0VT>.ret;
field RegisterOperand Src1SDWA = getSDWASrcForVT<Src0VT>.ret;
- field Operand Src0Mod = getSrcMod<Src0VT>.ret;
- field Operand Src1Mod = getSrcMod<Src1VT>.ret;
- field Operand Src2Mod = getSrcMod<Src2VT>.ret;
+ field Operand Src0Mod = getSrcMod<Src0VT, EnableF32SrcMods>.ret;
+ field Operand Src1Mod = getSrcMod<Src1VT, EnableF32SrcMods>.ret;
+ field Operand Src2Mod = getSrcMod<Src2VT, EnableF32SrcMods>.ret;
field Operand Src0ModDPP = getSrcModExt<Src0VT>.ret;
field Operand Src1ModDPP = getSrcModExt<Src1VT>.ret;
field Operand Src0ModSDWA = getSrcModSDWA<Src0VT>.ret;
@@ -1688,12 +2000,16 @@ class VOPProfile <list<ValueType> _ArgVT> {
field bit HasSrc2 = !if(!eq(Src2VT.Value, untyped.Value), 0, 1);
// TODO: Modifiers logic is somewhat ad hoc here, to be refined later
- field bit HasModifiers = isModifierType<Src0VT>.ret;
+ // HasModifiers affects the normal and DPP encodings. We take note of EnableF32SrcMods, which
+ // enables modifiers for i32 type.
+ field bit HasModifiers = BitOr<isModifierType<Src0VT>.ret, EnableF32SrcMods>.ret;
+ // HasSrc*FloatMods affects the SDWA encoding. We ignore EnableF32SrcMods.
field bit HasSrc0FloatMods = isFloatType<Src0VT>.ret;
field bit HasSrc1FloatMods = isFloatType<Src1VT>.ret;
field bit HasSrc2FloatMods = isFloatType<Src2VT>.ret;
+ // HasSrc*IntMods affects the SDWA encoding. We ignore EnableF32SrcMods.
field bit HasSrc0IntMods = isIntType<Src0VT>.ret;
field bit HasSrc1IntMods = isIntType<Src1VT>.ret;
field bit HasSrc2IntMods = isIntType<Src2VT>.ret;
@@ -1702,7 +2018,7 @@ class VOPProfile <list<ValueType> _ArgVT> {
field bit HasSrc1Mods = !if(HasModifiers, BitOr<HasSrc1FloatMods, HasSrc1IntMods>.ret, 0);
field bit HasSrc2Mods = !if(HasModifiers, BitOr<HasSrc2FloatMods, HasSrc2IntMods>.ret, 0);
- field bit HasClamp = HasModifiers;
+ field bit HasClamp = BitOr<isModifierType<Src0VT>.ret, EnableClamp>.ret;
field bit HasSDWAClamp = EmitDst;
field bit HasFPClamp = BitAnd<isFloatType<DstVT>.ret, HasClamp>.ret;
field bit HasIntClamp = !if(isFloatType<DstVT>.ret, 0, HasClamp);
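
The two new profile bits fold into the existing modifier logic as simple ORs: EnableF32SrcMods forces source modifiers on for integer-typed sources, and EnableClamp forces the clamp bit on even when the sources carry no modifiers; the VOP_I32_I32_I32_ARITH profile added further down is the first user of EnableClamp. A minimal standalone C++ sketch of the same derivation (the struct and helper are illustrative only, not the generated TableGen record):

#include <cstdio>

// Restates the TableGen above: HasModifiers, HasClamp and HasIntClamp as
// functions of the source/destination types and the two new profile bits.
struct ProfileBits {
  bool SrcIsModifierType; // isModifierType<Src0VT>.ret
  bool DstIsFloat;        // isFloatType<DstVT>.ret
  bool EnableF32SrcMods;  // new VOPProfile parameter
  bool EnableClamp;       // new VOPProfile parameter
};

static void derive(const ProfileBits &P) {
  bool HasModifiers = P.SrcIsModifierType || P.EnableF32SrcMods;
  bool HasClamp     = P.SrcIsModifierType || P.EnableClamp;
  bool HasIntClamp  = !P.DstIsFloat && HasClamp;
  std::printf("mods=%d clamp=%d intclamp=%d\n", HasModifiers, HasClamp, HasIntClamp);
}

int main() {
  // VOP_I32_I32_I32_ARITH: all-i32 profile that opts into EnableClamp,
  // so it gets the integer clamp bit but no source/output modifiers.
  derive({false, false, false, true}); // prints: mods=0 clamp=1 intclamp=1
  return 0;
}
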
@@ -1721,6 +2037,8 @@ class VOPProfile <list<ValueType> _ArgVT> {
field bit HasExtSDWA9 = HasExt;
field int NeedPatGen = PatGenMode.NoPattern;
+ field bit IsMAI = 0;
+
field Operand Src0PackedMod = !if(HasSrc0FloatMods, PackedF16InputMods, PackedI16InputMods);
field Operand Src1PackedMod = !if(HasSrc1FloatMods, PackedF16InputMods, PackedI16InputMods);
field Operand Src2PackedMod = !if(HasSrc2FloatMods, PackedF16InputMods, PackedI16InputMods);
@@ -1732,12 +2050,13 @@ class VOPProfile <list<ValueType> _ArgVT> {
field dag Outs32 = Outs;
field dag Outs64 = Outs;
field dag OutsDPP = getOutsExt<HasDst, DstVT, DstRCDPP>.ret;
+ field dag OutsDPP8 = getOutsExt<HasDst, DstVT, DstRCDPP>.ret;
field dag OutsSDWA = getOutsSDWA<HasDst, DstVT, DstRCSDWA>.ret;
field dag Ins32 = getIns32<Src0RC32, Src1RC32, NumSrcArgs>.ret;
field dag Ins64 = getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs,
- HasIntClamp, HasModifiers, HasOMod, Src0Mod, Src1Mod,
- Src2Mod>.ret;
+ HasIntClamp, HasModifiers, HasSrc2Mods,
+ HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret;
field dag InsVOP3P = getInsVOP3P<Src0RC64, Src1RC64, Src2RC64,
NumSrcArgs, HasClamp,
Src0PackedMod, Src1PackedMod, Src2PackedMod>.ret;
@@ -1751,6 +2070,10 @@ class VOPProfile <list<ValueType> _ArgVT> {
getInsDPP<DstRCDPP, Src0DPP, Src1DPP, NumSrcArgs,
HasModifiers, Src0ModDPP, Src1ModDPP>.ret,
(ins));
+ field dag InsDPP16 = getInsDPP16<DstRCDPP, Src0DPP, Src1DPP, NumSrcArgs,
+ HasModifiers, Src0ModDPP, Src1ModDPP>.ret;
+ field dag InsDPP8 = getInsDPP8<DstRCDPP, Src0DPP, Src1DPP, NumSrcArgs, 0,
+ Src0ModDPP, Src1ModDPP>.ret;
field dag InsSDWA = getInsSDWA<Src0SDWA, Src1SDWA, NumSrcArgs,
HasSDWAOMod, Src0ModSDWA, Src1ModSDWA,
DstVT>.ret;
@@ -1766,8 +2089,12 @@ class VOPProfile <list<ValueType> _ArgVT> {
HasSrc2FloatMods>.ret;
field string AsmDPP = !if(HasExtDPP,
getAsmDPP<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret, "");
+ field string AsmDPP16 = getAsmDPP16<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret;
+ field string AsmDPP8 = getAsmDPP8<HasDst, NumSrcArgs, 0, DstVT>.ret;
field string AsmSDWA = getAsmSDWA<HasDst, NumSrcArgs, DstVT>.ret;
field string AsmSDWA9 = getAsmSDWA9<HasDst, HasSDWAOMod, NumSrcArgs, DstVT>.ret;
+
+ field string TieRegDPP = "$old";
}
class VOP_NO_EXT <VOPProfile p> : VOPProfile <p.ArgVT> {
@@ -1828,6 +2155,7 @@ def VOP_F64_F64_I32 : VOPProfile <[f64, f64, i32, untyped]>;
def VOP_I32_F32_F32 : VOPProfile <[i32, f32, f32, untyped]>;
def VOP_I32_F32_I32 : VOPProfile <[i32, f32, i32, untyped]>;
def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, untyped]>;
+def VOP_I32_I32_I32_ARITH : VOPProfile <[i32, i32, i32, untyped], 0, /*EnableClamp=*/1>;
def VOP_V2F16_F32_F32 : VOPProfile <[v2f16, f32, f32, untyped]>;
def VOP_F32_F16_F16_F16 : VOPProfile <[f32, f16, f16, f16]>;
@@ -1848,6 +2176,19 @@ def VOP_V4I32_I64_I32_V4I32 : VOPProfile <[v4i32, i64, i32, v4i32]>;
def VOP_F32_V2F16_V2F16_F32 : VOPProfile <[f32, v2f16, v2f16, f32]>;
def VOP_I32_V2I16_V2I16_I32 : VOPProfile <[i32, v2i16, v2i16, i32]>;
+def VOP_V4F32_F32_F32_V4F32 : VOPProfile <[v4f32, f32, f32, v4f32]>;
+def VOP_V16F32_F32_F32_V16F32 : VOPProfile <[v16f32, f32, f32, v16f32]>;
+def VOP_V32F32_F32_F32_V32F32 : VOPProfile <[v32f32, f32, f32, v32f32]>;
+def VOP_V4F32_V4F16_V4F16_V4F32 : VOPProfile <[v4f32, v4f16, v4f16, v4f32]>;
+def VOP_V16F32_V4F16_V4F16_V16F32 : VOPProfile <[v16f32, v4f16, v4f16, v16f32]>;
+def VOP_V32F32_V4F16_V4F16_V32F32 : VOPProfile <[v32f32, v4f16, v4f16, v32f32]>;
+def VOP_V4F32_V2I16_V2I16_V4F32 : VOPProfile <[v4f32, v2i16, v2i16, v4f32]>;
+def VOP_V16F32_V2I16_V2I16_V16F32 : VOPProfile <[v16f32, v2i16, v2i16, v16f32]>;
+def VOP_V32F32_V2I16_V2I16_V32F32 : VOPProfile <[v32f32, v2i16, v2i16, v32f32]>;
+def VOP_V4I32_I32_I32_V4I32 : VOPProfile <[v4i32, i32, i32, v4i32]>;
+def VOP_V16I32_I32_I32_V16I32 : VOPProfile <[v16i32, i32, i32, v16i32]>;
+def VOP_V32I32_I32_I32_V32I32 : VOPProfile <[v32i32, i32, i32, v32i32]>;
+
class Commutable_REV <string revOp, bit isOrig> {
string RevOp = revOp;
bit IsOrig = isOrig;
@@ -1871,13 +2212,12 @@ class VINTRP_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> :
let isCodeGenOnly = 1;
}
+// FIXME-GFX10: WIP.
class VINTRP_Real_si <bits <2> op, string opName, dag outs, dag ins,
- string asm> :
+ string asm, int encodingFamily> :
VINTRPCommon <outs, ins, asm, []>,
VINTRPe <op>,
- SIMCInstr<opName, SIEncodingFamily.SI> {
- let AssemblerPredicate = SIAssemblerPredicate;
- let DecoderNamespace = "SICI";
+ SIMCInstr<opName, encodingFamily> {
let DisableDecoder = DisableSIDecoder;
}
@@ -1887,19 +2227,25 @@ class VINTRP_Real_vi <bits <2> op, string opName, dag outs, dag ins,
VINTRPe_vi <op>,
SIMCInstr<opName, SIEncodingFamily.VI> {
let AssemblerPredicate = VIAssemblerPredicate;
- let DecoderNamespace = "VI";
+ let DecoderNamespace = "GFX8";
let DisableDecoder = DisableVIDecoder;
}
+// FIXME-GFX10: WIP.
multiclass VINTRP_m <bits <2> op, dag outs, dag ins, string asm,
list<dag> pattern = []> {
def "" : VINTRP_Pseudo <NAME, outs, ins, pattern>;
- def _si : VINTRP_Real_si <op, NAME, outs, ins, asm>;
+ let AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" in {
+ def _si : VINTRP_Real_si <op, NAME, outs, ins, asm, SIEncodingFamily.SI>;
+ } // End AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7"
def _vi : VINTRP_Real_vi <op, NAME, outs, ins, asm>;
-}
+ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
+ def _gfx10 : VINTRP_Real_si<op, NAME, outs, ins, asm, SIEncodingFamily.GFX10>;
+ } // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10"
+}
//===----------------------------------------------------------------------===//
// Vector instruction mappings
//===----------------------------------------------------------------------===//
@@ -1981,7 +2327,9 @@ def getMCOpcodeGen : InstrMapping {
// does not actually change the encoding, and thus may be
// removed later.
[!cast<string>(SIEncodingFamily.GFX80)],
- [!cast<string>(SIEncodingFamily.GFX9)]];
+ [!cast<string>(SIEncodingFamily.GFX9)],
+ [!cast<string>(SIEncodingFamily.GFX10)],
+ [!cast<string>(SIEncodingFamily.SDWA10)]];
}
// Get equivalent SOPK instruction.
@@ -2044,6 +2392,24 @@ def getGlobalSaddrOp : InstrMapping {
let ValueCols = [["1"]];
}
+// Maps a v_cmpx opcode with sdst to opcode without sdst.
+def getVCMPXNoSDstOp : InstrMapping {
+ let FilterClass = "VCMPXNoSDstTable";
+ let RowFields = ["NoSDstOp"];
+ let ColFields = ["HasSDst"];
+ let KeyCol = ["1"];
+ let ValueCols = [["0"]];
+}
+
+// Maps a SOPP to a SOPP with S_NOP
+def getSOPPWithRelaxation : InstrMapping {
+ let FilterClass = "Base_SOPP";
+ let RowFields = ["AsmString"];
+ let ColFields = ["Size"];
+ let KeyCol = ["4"];
+ let ValueCols = [["8"]];
+}
+
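
Each InstrMapping above is turned by TableGen into a small lookup table plus an accessor in the generated AMDGPU namespace. A hedged usage sketch, assuming the usual generated signature int AMDGPU::get&lt;MappingName&gt;(uint16_t Opcode) returning -1 when no mapping exists (the exact emitted code lives in AMDGPUGenInstrInfo.inc):

// Fragment only; MI and TII come from the surrounding pass.
// Rewrite a v_cmpx with an explicit sdst into the no-sdst form.
int NewOpc = AMDGPU::getVCMPXNoSDstOp(MI.getOpcode());
if (NewOpc != -1)
  MI.setDesc(TII->get(NewOpc));
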
include "SIInstructions.td"
include "DSInstructions.td"
diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td
index b6b00c2e4257..70f20bb69370 100644
--- a/lib/Target/AMDGPU/SIInstructions.td
+++ b/lib/Target/AMDGPU/SIInstructions.td
@@ -1,9 +1,8 @@
//===-- SIInstructions.td - SI Instruction Definitions --------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// This file was originally auto-generated from a GPU register header file and
@@ -12,7 +11,7 @@
//===----------------------------------------------------------------------===//
class GCNPat<dag pattern, dag result> : Pat<pattern, result>, GCNPredicateControl {
- let SubtargetPredicate = isGCN;
+
}
include "SOPInstructions.td"
@@ -122,7 +121,14 @@ def WWM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
} // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC]
-def EXIT_WWM : SPseudoInstSI <(outs SReg_64:$sdst), (ins SReg_64:$src0)> {
+def ENTER_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins i64imm:$src0)> {
+ let Defs = [EXEC];
+ let hasSideEffects = 0;
+ let mayLoad = 0;
+ let mayStore = 0;
+}
+
+def EXIT_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins SReg_1:$src0)> {
let hasSideEffects = 0;
let mayLoad = 0;
let mayStore = 0;
@@ -155,13 +161,12 @@ def S_SUB_U64_PSEUDO : SPseudoInstSI <
>;
def S_ADD_U64_CO_PSEUDO : SPseudoInstSI <
- (outs SReg_64:$vdst, VOPDstS64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1)
+ (outs SReg_64:$vdst, VOPDstS64orS32:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1)
>;
def S_SUB_U64_CO_PSEUDO : SPseudoInstSI <
- (outs SReg_64:$vdst, VOPDstS64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1)
+ (outs SReg_64:$vdst, VOPDstS64orS32:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1)
>;
-
} // End usesCustomInserter = 1, Defs = [SCC]
let usesCustomInserter = 1 in {
@@ -169,23 +174,30 @@ def GET_GROUPSTATICSIZE : SPseudoInstSI <(outs SReg_32:$sdst), (ins),
[(set SReg_32:$sdst, (int_amdgcn_groupstaticsize))]>;
} // End let usesCustomInserter = 1, SALU = 1
-def S_MOV_B64_term : SPseudoInstSI<(outs SReg_64:$dst),
- (ins SSrc_b64:$src0)> {
- let isAsCheapAsAMove = 1;
+// Wrap an instruction by duplicating it, except for setting isTerminator.
+class WrapTerminatorInst<SOP_Pseudo base_inst> : SPseudoInstSI<
+ base_inst.OutOperandList,
+ base_inst.InOperandList> {
+ let Uses = base_inst.Uses;
+ let Defs = base_inst.Defs;
let isTerminator = 1;
+ let isAsCheapAsAMove = base_inst.isAsCheapAsAMove;
+ let hasSideEffects = base_inst.hasSideEffects;
+ let UseNamedOperandTable = base_inst.UseNamedOperandTable;
+ let CodeSize = base_inst.CodeSize;
}
-def S_XOR_B64_term : SPseudoInstSI<(outs SReg_64:$dst),
- (ins SSrc_b64:$src0, SSrc_b64:$src1)> {
- let isAsCheapAsAMove = 1;
- let isTerminator = 1;
- let Defs = [SCC];
+let WaveSizePredicate = isWave64 in {
+def S_MOV_B64_term : WrapTerminatorInst<S_MOV_B64>;
+def S_XOR_B64_term : WrapTerminatorInst<S_XOR_B64>;
+def S_ANDN2_B64_term : WrapTerminatorInst<S_ANDN2_B64>;
}
-def S_ANDN2_B64_term : SPseudoInstSI<(outs SReg_64:$dst),
- (ins SSrc_b64:$src0, SSrc_b64:$src1)> {
- let isAsCheapAsAMove = 1;
- let isTerminator = 1;
+let WaveSizePredicate = isWave32 in {
+def S_MOV_B32_term : WrapTerminatorInst<S_MOV_B32>;
+def S_XOR_B32_term : WrapTerminatorInst<S_XOR_B32>;
+def S_OR_B32_term : WrapTerminatorInst<S_OR_B32>;
+def S_ANDN2_B32_term : WrapTerminatorInst<S_ANDN2_B32>;
}
def WAVE_BARRIER : SPseudoInstSI<(outs), (ins),
@@ -195,7 +207,6 @@ def WAVE_BARRIER : SPseudoInstSI<(outs), (ins),
let hasSideEffects = 1;
let mayLoad = 1;
let mayStore = 1;
- let isBarrier = 1;
let isConvergent = 1;
let FixedSize = 1;
let Size = 0;
@@ -222,30 +233,30 @@ let isTerminator = 1 in {
let OtherPredicates = [EnableLateCFGStructurize] in {
def SI_NON_UNIFORM_BRCOND_PSEUDO : CFPseudoInstSI <
(outs),
- (ins SReg_64:$vcc, brtarget:$target),
+ (ins SReg_1:$vcc, brtarget:$target),
[(brcond i1:$vcc, bb:$target)]> {
let Size = 12;
}
}
def SI_IF: CFPseudoInstSI <
- (outs SReg_64:$dst), (ins SReg_64:$vcc, brtarget:$target),
- [(set i64:$dst, (AMDGPUif i1:$vcc, bb:$target))], 1, 1> {
+ (outs SReg_1:$dst), (ins SReg_1:$vcc, brtarget:$target),
+ [(set i1:$dst, (AMDGPUif i1:$vcc, bb:$target))], 1, 1> {
let Constraints = "";
let Size = 12;
let hasSideEffects = 1;
}
def SI_ELSE : CFPseudoInstSI <
- (outs SReg_64:$dst),
- (ins SReg_64:$src, brtarget:$target, i1imm:$execfix), [], 1, 1> {
+ (outs SReg_1:$dst),
+ (ins SReg_1:$src, brtarget:$target, i1imm:$execfix), [], 1, 1> {
let Size = 12;
let hasSideEffects = 1;
}
def SI_LOOP : CFPseudoInstSI <
- (outs), (ins SReg_64:$saved, brtarget:$target),
- [(AMDGPUloop i64:$saved, bb:$target)], 1, 1> {
+ (outs), (ins SReg_1:$saved, brtarget:$target),
+ [(AMDGPUloop i1:$saved, bb:$target)], 1, 1> {
let Size = 8;
let isBranch = 1;
let hasSideEffects = 1;
@@ -254,8 +265,7 @@ def SI_LOOP : CFPseudoInstSI <
} // End isTerminator = 1
def SI_END_CF : CFPseudoInstSI <
- (outs), (ins SReg_64:$saved),
- [(int_amdgcn_end_cf i64:$saved)], 1, 1> {
+ (outs), (ins SReg_1:$saved), [], 1, 1> {
let Size = 4;
let isAsCheapAsAMove = 1;
let isReMaterializable = 1;
@@ -265,8 +275,7 @@ def SI_END_CF : CFPseudoInstSI <
}
def SI_IF_BREAK : CFPseudoInstSI <
- (outs SReg_64:$dst), (ins SReg_64:$vcc, SReg_64:$src),
- [(set i64:$dst, (int_amdgcn_if_break i1:$vcc, i64:$src))]> {
+ (outs SReg_1:$dst), (ins SReg_1:$vcc, SReg_1:$src), []> {
let Size = 4;
let isAsCheapAsAMove = 1;
let isReMaterializable = 1;
@@ -292,7 +301,7 @@ multiclass PseudoInstKill <dag ins> {
}
}
-defm SI_KILL_I1 : PseudoInstKill <(ins SSrc_b64:$src, i1imm:$killvalue)>;
+defm SI_KILL_I1 : PseudoInstKill <(ins SCSrc_i1:$src, i1imm:$killvalue)>;
defm SI_KILL_F32_COND_IMM : PseudoInstKill <(ins VSrc_b32:$src0, i32imm:$src1, i32imm:$cond)>;
let Defs = [EXEC,VCC] in
@@ -311,7 +320,7 @@ def SI_BR_UNDEF : SPseudoInstSI <(outs), (ins sopp_brtarget:$simm16)> {
}
def SI_PS_LIVE : PseudoInstSI <
- (outs SReg_64:$dst), (ins),
+ (outs SReg_1:$dst), (ins),
[(set i1:$dst, (int_amdgcn_ps_live))]> {
let SALU = 1;
}
@@ -340,6 +349,15 @@ def SI_INIT_EXEC : SPseudoInstSI <
let Defs = [EXEC];
let usesCustomInserter = 1;
let isAsCheapAsAMove = 1;
+ let WaveSizePredicate = isWave64;
+}
+
+def SI_INIT_EXEC_LO : SPseudoInstSI <
+ (outs), (ins i32imm:$src), []> {
+ let Defs = [EXEC_LO];
+ let usesCustomInserter = 1;
+ let isAsCheapAsAMove = 1;
+ let WaveSizePredicate = isWave32;
}
def SI_INIT_EXEC_FROM_INPUT : SPseudoInstSI <
@@ -374,11 +392,14 @@ def SI_RETURN : SPseudoInstSI <
// This version is only needed so we can fill in the output register in
// the custom inserter.
def SI_CALL_ISEL : SPseudoInstSI <
- (outs), (ins SSrc_b64:$src0), [(AMDGPUcall i64:$src0)]> {
+ (outs), (ins SSrc_b64:$src0, unknown:$callee),
+ [(AMDGPUcall i64:$src0, tglobaladdr:$callee)]> {
let Size = 4;
let isCall = 1;
let SchedRW = [WriteBranch];
let usesCustomInserter = 1;
+ // TODO: Should really base this on the call target
+ let isConvergent = 1;
}
// Wrapper around s_swappc_b64 with extra $callee parameter to track
@@ -389,23 +410,14 @@ def SI_CALL : SPseudoInstSI <
let isCall = 1;
let UseNamedOperandTable = 1;
let SchedRW = [WriteBranch];
+ // TODO: Should really base this on the call target
+ let isConvergent = 1;
}
// Tail call handling pseudo
-def SI_TCRETURN_ISEL : SPseudoInstSI<(outs),
- (ins SSrc_b64:$src0, i32imm:$fpdiff),
- [(AMDGPUtc_return i64:$src0, i32:$fpdiff)]> {
- let isCall = 1;
- let isTerminator = 1;
- let isReturn = 1;
- let isBarrier = 1;
- let SchedRW = [WriteBranch];
- let usesCustomInserter = 1;
-}
-
-def SI_TCRETURN : SPseudoInstSI <
- (outs),
- (ins SSrc_b64:$src0, unknown:$callee, i32imm:$fpdiff)> {
+def SI_TCRETURN : SPseudoInstSI <(outs),
+ (ins SSrc_b64:$src0, unknown:$callee, i32imm:$fpdiff),
+ [(AMDGPUtc_return i64:$src0, tglobaladdr:$callee, i32:$fpdiff)]> {
let Size = 4;
let isCall = 1;
let isTerminator = 1;
@@ -413,6 +425,8 @@ def SI_TCRETURN : SPseudoInstSI <
let isBarrier = 1;
let UseNamedOperandTable = 1;
let SchedRW = [WriteBranch];
+ // TODO: Should really base this on the call target
+ let isConvergent = 1;
}
@@ -424,6 +438,8 @@ def ADJCALLSTACKUP : SPseudoInstSI<
let FixedSize = 1;
let hasSideEffects = 1;
let usesCustomInserter = 1;
+ let SchedRW = [WriteSALU];
+ let Defs = [SCC];
}
def ADJCALLSTACKDOWN : SPseudoInstSI<
@@ -433,6 +449,8 @@ def ADJCALLSTACKDOWN : SPseudoInstSI<
let Size = 8; // Worst case. (s_add_u32 + constant)
let hasSideEffects = 1;
let usesCustomInserter = 1;
+ let SchedRW = [WriteSALU];
+ let Defs = [SCC];
}
let Defs = [M0, EXEC, SCC],
@@ -490,9 +508,12 @@ multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> {
// SI_SPILL_32_* instructions.
defm SI_SPILL_S32 : SI_SPILL_SGPR <SReg_32>;
defm SI_SPILL_S64 : SI_SPILL_SGPR <SReg_64>;
+defm SI_SPILL_S96 : SI_SPILL_SGPR <SReg_96>;
defm SI_SPILL_S128 : SI_SPILL_SGPR <SReg_128>;
+defm SI_SPILL_S160 : SI_SPILL_SGPR <SReg_160>;
defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>;
defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>;
+defm SI_SPILL_S1024 : SI_SPILL_SGPR <SReg_1024>;
multiclass SI_SPILL_VGPR <RegisterClass vgpr_class> {
let UseNamedOperandTable = 1, VGPRSpill = 1,
@@ -504,7 +525,9 @@ multiclass SI_SPILL_VGPR <RegisterClass vgpr_class> {
let mayStore = 1;
let mayLoad = 0;
// (2 * 4) + (8 * num_subregs) bytes maximum
- let Size = !add(!shl(!srl(vgpr_class.Size, 5), 3), 8);
+ int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), 3), 8);
+ // Size field is unsigned char and cannot fit more.
+ let Size = !if(!le(MaxSize, 256), MaxSize, 252);
}
def _RESTORE : VPseudoInstSI <
@@ -515,7 +538,9 @@ multiclass SI_SPILL_VGPR <RegisterClass vgpr_class> {
let mayLoad = 1;
// (2 * 4) + (8 * num_subregs) bytes maximum
- let Size = !add(!shl(!srl(vgpr_class.Size, 5), 3), 8);
+ int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), 3), 8);
+ // Size field is unsigned char and cannot fit more.
+ let Size = !if(!le(MaxSize, 256), MaxSize, 252);
}
} // End UseNamedOperandTable = 1, VGPRSpill = 1, SchedRW = [WriteVMEM]
}
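
The Size estimate for a spill pseudo is eight bytes of setup plus eight bytes per 32-bit subregister (sixteen per subregister for the AGPR variants added below), and the new !if clamps it because the instruction Size field is an unsigned char. A standalone check of the arithmetic, using the register-class sizes in bits from the defs that follow:

#include <cassert>

// Mirrors: MaxSize = (RegSizeInBits / 32) * BytesPerSubReg + 8,
// then clamped to 252 once it no longer fits the 8-bit Size field.
static unsigned spillPseudoSize(unsigned RegSizeInBits, unsigned BytesPerSubReg) {
  unsigned NumSubRegs = RegSizeInBits >> 5;
  unsigned MaxSize = NumSubRegs * BytesPerSubReg + 8; // (2 * 4) + per-subreg cost
  return MaxSize <= 256 ? MaxSize : 252;
}

int main() {
  assert(spillPseudoSize(128, 8) == 40);    // SI_SPILL_V128: 4 subregs
  assert(spillPseudoSize(1024, 8) == 252);  // SI_SPILL_V1024: 264, clamped
  assert(spillPseudoSize(1024, 16) == 252); // SI_SPILL_A1024: 520, clamped
  return 0;
}
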
@@ -524,21 +549,74 @@ defm SI_SPILL_V32 : SI_SPILL_VGPR <VGPR_32>;
defm SI_SPILL_V64 : SI_SPILL_VGPR <VReg_64>;
defm SI_SPILL_V96 : SI_SPILL_VGPR <VReg_96>;
defm SI_SPILL_V128 : SI_SPILL_VGPR <VReg_128>;
+defm SI_SPILL_V160 : SI_SPILL_VGPR <VReg_160>;
defm SI_SPILL_V256 : SI_SPILL_VGPR <VReg_256>;
defm SI_SPILL_V512 : SI_SPILL_VGPR <VReg_512>;
+defm SI_SPILL_V1024 : SI_SPILL_VGPR <VReg_1024>;
+
+multiclass SI_SPILL_AGPR <RegisterClass vgpr_class> {
+ let UseNamedOperandTable = 1, VGPRSpill = 1,
+ Constraints = "@earlyclobber $tmp",
+ SchedRW = [WriteVMEM] in {
+ def _SAVE : VPseudoInstSI <
+ (outs VGPR_32:$tmp),
+ (ins vgpr_class:$vdata, i32imm:$vaddr, SReg_128:$srsrc,
+ SReg_32:$soffset, i32imm:$offset)> {
+ let mayStore = 1;
+ let mayLoad = 0;
+ // (2 * 4) + (16 * num_subregs) bytes maximum
+ int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), 4), 8);
+ // Size field is unsigned char and cannot fit more.
+ let Size = !if(!le(MaxSize, 256), MaxSize, 252);
+ }
+
+ def _RESTORE : VPseudoInstSI <
+ (outs vgpr_class:$vdata, VGPR_32:$tmp),
+ (ins i32imm:$vaddr, SReg_128:$srsrc, SReg_32:$soffset,
+ i32imm:$offset)> {
+ let mayStore = 0;
+ let mayLoad = 1;
+
+ // (2 * 4) + (16 * num_subregs) bytes maximum
+ int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), 4), 8);
+ // Size field is unsigned char and cannot fit more.
+ let Size = !if(!le(MaxSize, 256), MaxSize, 252);
+ }
+ } // End UseNamedOperandTable = 1, VGPRSpill = 1, SchedRW = [WriteVMEM]
+}
+
+defm SI_SPILL_A32 : SI_SPILL_AGPR <AGPR_32>;
+defm SI_SPILL_A64 : SI_SPILL_AGPR <AReg_64>;
+defm SI_SPILL_A128 : SI_SPILL_AGPR <AReg_128>;
+defm SI_SPILL_A512 : SI_SPILL_AGPR <AReg_512>;
+defm SI_SPILL_A1024 : SI_SPILL_AGPR <AReg_1024>;
def SI_PC_ADD_REL_OFFSET : SPseudoInstSI <
(outs SReg_64:$dst),
(ins si_ga:$ptr_lo, si_ga:$ptr_hi),
[(set SReg_64:$dst,
- (i64 (SIpc_add_rel_offset (tglobaladdr:$ptr_lo), (tglobaladdr:$ptr_hi))))]> {
+ (i64 (SIpc_add_rel_offset tglobaladdr:$ptr_lo, tglobaladdr:$ptr_hi)))]> {
let Defs = [SCC];
}
def : GCNPat <
+ (SIpc_add_rel_offset tglobaladdr:$ptr_lo, 0),
+ (SI_PC_ADD_REL_OFFSET $ptr_lo, (i32 0))
+>;
+
+def : GCNPat <
(AMDGPUinit_exec i64:$src),
(SI_INIT_EXEC (as_i64imm $src))
->;
+> {
+ let WaveSizePredicate = isWave64;
+}
+
+def : GCNPat <
+ (AMDGPUinit_exec i64:$src),
+ (SI_INIT_EXEC_LO (as_i32imm $src))
+> {
+ let WaveSizePredicate = isWave32;
+}
def : GCNPat <
(AMDGPUinit_exec_from_input i32:$input, i32:$shift),
@@ -551,7 +629,7 @@ def : GCNPat<
>;
def : GCNPat<
- (AMDGPUelse i64:$src, bb:$target),
+ (AMDGPUelse i1:$src, bb:$target),
(SI_ELSE $src, $target, 0)
>;
@@ -584,7 +662,12 @@ def : Pat <
// TODO: we could add more variants for other types of conditionals
def : Pat <
- (int_amdgcn_icmp i1:$src, (i1 0), (i32 33)),
+ (i64 (int_amdgcn_icmp i1:$src, (i1 0), (i32 33))),
+ (COPY $src) // Return the SGPRs representing i1 src
+>;
+
+def : Pat <
+ (i32 (int_amdgcn_icmp i1:$src, (i1 0), (i32 33))),
(COPY $src) // Return the SGPRs representing i1 src
>;
@@ -592,7 +675,7 @@ def : Pat <
// VOP1 Patterns
//===----------------------------------------------------------------------===//
-let SubtargetPredicate = isGCN, OtherPredicates = [UnsafeFPMath] in {
+let OtherPredicates = [UnsafeFPMath] in {
//def : RcpPat<V_RCP_F64_e32, f64>;
//defm : RsqPat<V_RSQ_F64_e32, f64>;
@@ -615,7 +698,7 @@ def : GCNPat <
(V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE)
>;
-} // End SubtargetPredicate = isGCN, OtherPredicates = [UnsafeFPMath]
+} // End OtherPredicates = [UnsafeFPMath]
// f16_to_fp patterns
@@ -706,17 +789,18 @@ def : FMADModsPat<V_MAD_F16, AMDGPUfmad_ftz, f16> {
let SubtargetPredicate = Has16BitInsts;
}
-multiclass SelectPat <ValueType vt, Instruction inst> {
+multiclass SelectPat <ValueType vt> {
def : GCNPat <
- (vt (select i1:$src0, vt:$src1, vt:$src2)),
- (inst $src2, $src1, $src0)
+ (vt (select i1:$src0, (VOP3Mods_f32 vt:$src1, i32:$src1_mods),
+ (VOP3Mods_f32 vt:$src2, i32:$src2_mods))),
+ (V_CNDMASK_B32_e64 $src2_mods, $src2, $src1_mods, $src1, $src0)
>;
}
-defm : SelectPat <i16, V_CNDMASK_B32_e64>;
-defm : SelectPat <i32, V_CNDMASK_B32_e64>;
-defm : SelectPat <f16, V_CNDMASK_B32_e64>;
-defm : SelectPat <f32, V_CNDMASK_B32_e64>;
+defm : SelectPat <i16>;
+defm : SelectPat <i32>;
+defm : SelectPat <f16>;
+defm : SelectPat <f32>;
let AddedComplexity = 1 in {
def : GCNPat <
@@ -749,6 +833,22 @@ foreach Index = 0-2 in {
>;
}
+foreach Index = 0-2 in {
+ def Extract_Element_v3i32_#Index : Extract_Element <
+ i32, v3i32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+ def Insert_Element_v3i32_#Index : Insert_Element <
+ i32, v3i32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+
+ def Extract_Element_v3f32_#Index : Extract_Element <
+ f32, v3f32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+ def Insert_Element_v3f32_#Index : Insert_Element <
+ f32, v3f32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+}
+
foreach Index = 0-3 in {
def Extract_Element_v4i32_#Index : Extract_Element <
i32, v4i32, Index, !cast<SubRegIndex>(sub#Index)
@@ -765,6 +865,22 @@ foreach Index = 0-3 in {
>;
}
+foreach Index = 0-4 in {
+ def Extract_Element_v5i32_#Index : Extract_Element <
+ i32, v5i32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+ def Insert_Element_v5i32_#Index : Insert_Element <
+ i32, v5i32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+
+ def Extract_Element_v5f32_#Index : Extract_Element <
+ f32, v5f32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+ def Insert_Element_v5f32_#Index : Insert_Element <
+ f32, v5f32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+}
+
foreach Index = 0-7 in {
def Extract_Element_v8i32_#Index : Extract_Element <
i32, v8i32, Index, !cast<SubRegIndex>(sub#Index)
@@ -818,7 +934,23 @@ def : Pat <
(v2f16 (EXTRACT_SUBREG v4f16:$vec, sub1))
>;
-let SubtargetPredicate = isGCN in {
+foreach Index = 0-31 in {
+ def Extract_Element_v32i32_#Index : Extract_Element <
+ i32, v32i32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+
+ def Insert_Element_v32i32_#Index : Insert_Element <
+ i32, v32i32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+
+ def Extract_Element_v32f32_#Index : Extract_Element <
+ f32, v32f32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+
+ def Insert_Element_v32f32_#Index : Insert_Element <
+ f32, v32f32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+}
// FIXME: Why do only some of these type combinations for SReg and
// VReg?
@@ -882,6 +1014,10 @@ def : BitConvert <i64, v4f16, VReg_64>;
def : BitConvert <v4i32, v4f32, VReg_128>;
def : BitConvert <v4f32, v4i32, VReg_128>;
+// 96-bit bitcast
+def : BitConvert <v3i32, v3f32, SGPR_96>;
+def : BitConvert <v3f32, v3i32, SGPR_96>;
+
// 128-bit bitcast
def : BitConvert <v2i64, v4i32, SReg_128>;
def : BitConvert <v4i32, v2i64, SReg_128>;
@@ -892,6 +1028,10 @@ def : BitConvert <v4i32, v2f64, VReg_128>;
def : BitConvert <v2i64, v2f64, VReg_128>;
def : BitConvert <v2f64, v2i64, VReg_128>;
+// 160-bit bitcast
+def : BitConvert <v5i32, v5f32, SGPR_160>;
+def : BitConvert <v5f32, v5i32, SGPR_160>;
+
// 256-bit bitcast
def : BitConvert <v8i32, v8f32, SReg_256>;
def : BitConvert <v8f32, v8i32, SReg_256>;
@@ -902,7 +1042,9 @@ def : BitConvert <v8f32, v8i32, VReg_256>;
def : BitConvert <v16i32, v16f32, VReg_512>;
def : BitConvert <v16f32, v16i32, VReg_512>;
-} // End SubtargetPredicate = isGCN
+// 1024-bit bitcast
+def : BitConvert <v32i32, v32f32, VReg_1024>;
+def : BitConvert <v32f32, v32i32, VReg_1024>;
/********** =================== **********/
/********** Src & Dst modifiers **********/
@@ -1070,6 +1212,16 @@ def : GCNPat <
(S_MOV_B32 imm:$imm)
>;
+def : GCNPat <
+ (VGPRImm<(SIlds tglobaladdr:$ga)>),
+ (V_MOV_B32_e32 $ga)
+>;
+
+def : GCNPat <
+ (SIlds tglobaladdr:$ga),
+ (S_MOV_B32 $ga)
+>;
+
// FIXME: Workaround for ordering issue with peephole optimizer where
// a register class copy interferes with immediate folding. Should
// use s_mov_b32, which can be shrunk to s_movk_i32
@@ -1104,7 +1256,16 @@ def : GCNPat <
def : GCNPat <
(i1 imm:$imm),
(S_MOV_B64 (i64 (as_i64imm $imm)))
->;
+> {
+ let WaveSizePredicate = isWave64;
+}
+
+def : GCNPat <
+ (i1 imm:$imm),
+ (S_MOV_B32 (i32 (as_i32imm $imm)))
+> {
+ let WaveSizePredicate = isWave32;
+}
def : GCNPat <
(f64 InlineFPImm<f64>:$imm),
@@ -1115,18 +1276,18 @@ def : GCNPat <
/********** Intrinsic Patterns **********/
/********** ================== **********/
-let SubtargetPredicate = isGCN in {
def : POW_Common <V_LOG_F32_e32, V_EXP_F32_e32, V_MUL_LEGACY_F32_e32>;
-}
def : GCNPat <
(i32 (sext i1:$src0)),
- (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src0)
+ (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
+ /*src1mod*/(i32 0), /*src1*/(i32 -1), $src0)
>;
class Ext32Pat <SDNode ext> : GCNPat <
(i32 (ext i1:$src0)),
- (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src0)
+ (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
+ /*src1mod*/(i32 0), /*src1*/(i32 1), $src0)
>;
def : Ext32Pat <zext>;
@@ -1144,8 +1305,6 @@ def : GCNPat <
// VOP3 Patterns
//===----------------------------------------------------------------------===//
-let SubtargetPredicate = isGCN in {
-
def : IMad24Pat<V_MAD_I32_I24, 1>;
def : UMad24Pat<V_MAD_U32_U24, 1>;
@@ -1153,8 +1312,6 @@ def : UMad24Pat<V_MAD_U32_U24, 1>;
defm : BFIPatterns <V_BFI_B32, S_MOV_B32, SReg_64>;
def : ROTRPattern <V_ALIGNBIT_B32>;
-}
-
def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))),
(V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
(i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>;
@@ -1261,8 +1418,9 @@ def : GCNPat <
class ZExt_i64_i1_Pat <SDNode ext> : GCNPat <
(i64 (ext i1:$src)),
(REG_SEQUENCE VReg_64,
- (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src), sub0,
- (S_MOV_B32 (i32 0)), sub1)
+ (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
+ /*src1mod*/(i32 0), /*src1*/(i32 1), $src),
+ sub0, (S_MOV_B32 (i32 0)), sub1)
>;
@@ -1280,8 +1438,10 @@ def : GCNPat <
def : GCNPat <
(i64 (sext i1:$src)),
(REG_SEQUENCE VReg_64,
- (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src), sub0,
- (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src), sub1)
+ (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
+ /*src1mod*/(i32 0), /*src1*/(i32 -1), $src), sub0,
+ (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
+ /*src1mod*/(i32 0), /*src1*/(i32 -1), $src), sub1)
>;
class FPToI1Pat<Instruction Inst, int KOne, ValueType kone_type, ValueType vt, SDPatternOperator fp_to_int> : GCNPat <
@@ -1296,10 +1456,12 @@ def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_NEG_ONE, i64, f64, fp_to_sint>;
// If we need to perform a logical operation on i1 values, we need to
// use vector comparisons since there is only one SCC register. Vector
-// comparisons still write to a pair of SGPRs, so treat these as
-// 64-bit comparisons. When legalizing SGPR copies, instructions
-// resulting in the copies from SCC to these instructions will be
-// moved to the VALU.
+// comparisons may write to a pair of SGPRs or a single SGPR, so treat
+// these as 32 or 64-bit comparisons. When legalizing SGPR copies,
+// instructions resulting in the copies from SCC to these instructions
+// will be moved to the VALU.
+
+let WaveSizePredicate = isWave64 in {
def : GCNPat <
(i1 (and i1:$src0, i1:$src1)),
(S_AND_B64 $src0, $src1)
@@ -1336,35 +1498,89 @@ def : GCNPat <
(S_NOT_B64 $src0)
>;
}
+} // end isWave64
+
+let WaveSizePredicate = isWave32 in {
+def : GCNPat <
+ (i1 (and i1:$src0, i1:$src1)),
+ (S_AND_B32 $src0, $src1)
+>;
+
+def : GCNPat <
+ (i1 (or i1:$src0, i1:$src1)),
+ (S_OR_B32 $src0, $src1)
+>;
+
+def : GCNPat <
+ (i1 (xor i1:$src0, i1:$src1)),
+ (S_XOR_B32 $src0, $src1)
+>;
+
+def : GCNPat <
+ (i1 (add i1:$src0, i1:$src1)),
+ (S_XOR_B32 $src0, $src1)
+>;
+
+def : GCNPat <
+ (i1 (sub i1:$src0, i1:$src1)),
+ (S_XOR_B32 $src0, $src1)
+>;
+
+let AddedComplexity = 1 in {
+def : GCNPat <
+ (i1 (add i1:$src0, (i1 -1))),
+ (S_NOT_B32 $src0)
+>;
+
+def : GCNPat <
+ (i1 (sub i1:$src0, (i1 -1))),
+ (S_NOT_B32 $src0)
+>;
+}
+} // end isWave32
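
The wave32 block mirrors the wave64 patterns above, with i1 values held in a single 32-bit lane mask rather than an SGPR pair. The mapping itself relies on 1-bit arithmetic identities: add and sub both reduce to XOR, and adding the all-ones constant flips the bit, which is why add/sub of -1 select to S_NOT. A small standalone check of those identities:

#include <cassert>

int main() {
  for (unsigned a = 0; a < 2; ++a) {
    for (unsigned b = 0; b < 2; ++b) {
      // i1 add and i1 sub are both XOR (the S_XOR_B32/B64 patterns).
      assert(((a + b) & 1) == (a ^ b));
      assert(((a - b) & 1) == (a ^ b));
    }
    // i1 add of -1 (all ones) is logical NOT (the S_NOT_B32/B64 patterns).
    assert(((a + 1) & 1) == (a ^ 1));
  }
  return 0;
}
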
def : GCNPat <
(f16 (sint_to_fp i1:$src)),
- (V_CVT_F16_F32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_NEG_ONE), $src))
+ (V_CVT_F16_F32_e32 (
+ V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
+ /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_NEG_ONE),
+ $src))
>;
def : GCNPat <
(f16 (uint_to_fp i1:$src)),
- (V_CVT_F16_F32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_ONE), $src))
+ (V_CVT_F16_F32_e32 (
+ V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
+ /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE),
+ $src))
>;
def : GCNPat <
(f32 (sint_to_fp i1:$src)),
- (V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_NEG_ONE), $src)
+ (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
+ /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_NEG_ONE),
+ $src)
>;
def : GCNPat <
(f32 (uint_to_fp i1:$src)),
- (V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_ONE), $src)
+ (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
+ /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE),
+ $src)
>;
def : GCNPat <
(f64 (sint_to_fp i1:$src)),
- (V_CVT_F64_I32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src))
+ (V_CVT_F64_I32_e32 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
+ /*src1mod*/(i32 0), /*src1*/(i32 -1),
+ $src))
>;
def : GCNPat <
(f64 (uint_to_fp i1:$src)),
- (V_CVT_F64_U32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src))
+ (V_CVT_F64_U32_e32 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
+ /*src1mod*/(i32 0), /*src1*/(i32 1),
+ $src))
>;
//===----------------------------------------------------------------------===//
@@ -1417,7 +1633,7 @@ def : GCNPat<
def : GCNPat<
(fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))),
- (V_PK_MUL_F16 0, (i32 CONST.V2FP16_ONE), $src_mods, $src, DSTCLAMP.NONE)
+ (V_PK_MUL_F16 0, (i32 CONST.FP16_ONE), $src_mods, $src, DSTCLAMP.NONE)
>;
}
@@ -1478,6 +1694,14 @@ def : GCNPat <
>;
} // End OtherPredicates = [HasDLInsts]
+let SubtargetPredicate = isGFX10Plus in
+def : GCNPat <
+ (fma (f16 (VOP3Mods0 f32:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)),
+ (f16 (VOP3Mods f32:$src1, i32:$src1_modifiers)),
+ (f16 (VOP3NoMods f32:$src2))),
+ (V_FMAC_F16_e64 $src0_modifiers, $src0, $src1_modifiers, $src1,
+ SRCMODS.NONE, $src2, $clamp, $omod)
+>;
// Allow integer inputs
class ExpPattern<SDPatternOperator node, ValueType vt, Instruction Inst> : GCNPat<
@@ -1568,7 +1792,7 @@ def : GCNPat <
// Fract Patterns
//===----------------------------------------------------------------------===//
-let SubtargetPredicate = isSI in {
+let SubtargetPredicate = isGFX6 in {
// V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) is
// used instead. However, SI doesn't have V_FLOOR_F64, so the most efficient
@@ -1595,7 +1819,7 @@ def : GCNPat <
DSTCLAMP.NONE, DSTOMOD.NONE)
>;
-} // End SubtargetPredicates = isSI
+} // End SubtargetPredicates = isGFX6
//============================================================================//
// Miscellaneous Optimization Patterns
@@ -1609,6 +1833,13 @@ def : GCNPat<
(S_SUB_I32 $src0, NegSubInlineConst32:$src1)
>;
+// Avoid pointlessly materializing a constant in VGPR.
+// FIXME: Should also do this for readlane, but tablegen crashes on
+// the ignored src1.
+def : GCNPat<
+ (int_amdgcn_readfirstlane (i32 imm:$src)),
+ (S_MOV_B32 $src)
+>;
multiclass BFMPatterns <ValueType vt, InstSI BFM, InstSI MOV> {
def : GCNPat <
@@ -1622,8 +1853,6 @@ multiclass BFMPatterns <ValueType vt, InstSI BFM, InstSI MOV> {
>;
}
-let SubtargetPredicate = isGCN in {
-
defm : BFMPatterns <i32, S_BFM_B32, S_MOV_B32>;
// FIXME: defm : BFMPatterns <i64, S_BFM_B64, S_MOV_B64>;
@@ -1633,8 +1862,6 @@ defm : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e64, SReg_64>;
defm : IntMed3Pat<V_MED3_I32, smin, smax, smin_oneuse, smax_oneuse>;
defm : IntMed3Pat<V_MED3_U32, umin, umax, umin_oneuse, umax_oneuse>;
-}
-
// This matches 16 permutations of
// max(min(x, y), min(max(x, y), z))
class FPMed3Pat<ValueType vt,
@@ -1683,8 +1910,8 @@ multiclass Int16Med3Pat<Instruction med3Inst,
def : FPMed3Pat<f32, V_MED3_F32>;
-let OtherPredicates = [isGFX9] in {
+let OtherPredicates = [isGFX9Plus] in {
def : FP16Med3Pat<f16, V_MED3_F16>;
defm : Int16Med3Pat<V_MED3_I16, smin, smax, smax_oneuse, smin_oneuse>;
defm : Int16Med3Pat<V_MED3_U16, umin, umax, umax_oneuse, umin_oneuse>;
-} // End Predicates = [isGFX9]
+} // End Predicates = [isGFX9Plus]
diff --git a/lib/Target/AMDGPU/SIIntrinsics.td b/lib/Target/AMDGPU/SIIntrinsics.td
deleted file mode 100644
index e51ff4b4bc50..000000000000
--- a/lib/Target/AMDGPU/SIIntrinsics.td
+++ /dev/null
@@ -1,19 +0,0 @@
-//===-- SIIntrinsics.td - SI Intrinsic defs ----------------*- tablegen -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// Backend internal SI Intrinsic Definitions. User code should not
-// directly use these.
-//
-//===----------------------------------------------------------------------===//
-
-
-let TargetPrefix = "SI", isTarget = 1 in {
- def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>;
-
-} // End TargetPrefix = "SI", isTarget = 1
diff --git a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index be291b127301..ae8b967893a2 100644
--- a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -1,9 +1,8 @@
//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -132,6 +131,8 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
bool GLC1;
bool SLC0;
bool SLC1;
+ bool DLC0;
+ bool DLC1;
bool UseST64;
SmallVector<MachineInstr *, 8> InstsToMove;
};
@@ -257,13 +258,11 @@ static void addDefsUsesToList(const MachineInstr &MI,
static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
MachineBasicBlock::iterator B,
- const SIInstrInfo *TII,
AliasAnalysis *AA) {
// RAW or WAR - cannot reorder
// WAW - cannot reorder
// RAR - safe to reorder
- return !(A->mayStore() || B->mayStore()) ||
- TII->areMemAccessesTriviallyDisjoint(*A, *B, AA);
+ return !(A->mayStore() || B->mayStore()) || !A->mayAlias(AA, *B, true);
}
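
With the switch to MachineInstr::mayAlias, the reordering rule from the comment stands on its own: two accesses can be swapped when neither may store, or when the alias query proves them disjoint. A standalone sketch of that predicate over a simplified model (the types are illustrative, not the LLVM classes):

#include <cassert>

struct MemAccess {
  bool MayStore;
  int Addr; // stand-in address for the alias query
  int Size;
};

// Overlap test standing in for "A->mayAlias(AA, *B, /*UseTBAA=*/true)".
static bool mayAlias(const MemAccess &A, const MemAccess &B) {
  return A.Addr < B.Addr + B.Size && B.Addr < A.Addr + A.Size;
}

// RAW, WAR and WAW cannot be reordered; RAR always can.
static bool canReorder(const MemAccess &A, const MemAccess &B) {
  return !(A.MayStore || B.MayStore) || !mayAlias(A, B);
}

int main() {
  MemAccess Load0{false, 0, 4}, Load1{false, 0, 4};
  MemAccess Store0{true, 0, 4}, Store8{true, 8, 4};
  assert(canReorder(Load0, Load1));   // RAR: safe even when overlapping
  assert(!canReorder(Load0, Store0)); // RAW/WAR on overlapping memory
  assert(canReorder(Load0, Store8));  // a store, but provably disjoint
  return 0;
}
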
// Add MI and its defs to the lists if MI reads one of the defs that are
@@ -282,6 +281,7 @@ static bool addToListsIfDependent(MachineInstr &MI, DenseSet<unsigned> &RegDefs,
// registers are in SSA form.
if (Use.isReg() &&
((Use.readsReg() && RegDefs.count(Use.getReg())) ||
+ (Use.isDef() && RegDefs.count(Use.getReg())) ||
(Use.isDef() && TargetRegisterInfo::isPhysicalRegister(Use.getReg()) &&
PhysRegUses.count(Use.getReg())))) {
Insts.push_back(&MI);
@@ -295,13 +295,13 @@ static bool addToListsIfDependent(MachineInstr &MI, DenseSet<unsigned> &RegDefs,
static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
ArrayRef<MachineInstr *> InstsToMove,
- const SIInstrInfo *TII, AliasAnalysis *AA) {
+ AliasAnalysis *AA) {
assert(MemOp.mayLoadOrStore());
for (MachineInstr *InstToMove : InstsToMove) {
if (!InstToMove->mayLoadOrStore())
continue;
- if (!memAccessesCanBeReordered(MemOp, *InstToMove, TII, AA))
+ if (!memAccessesCanBeReordered(MemOp, *InstToMove, AA))
return false;
}
return true;
@@ -326,7 +326,7 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
return (EltOffset0 + CI.Width0 == EltOffset1 ||
EltOffset1 + CI.Width1 == EltOffset0) &&
- CI.GLC0 == CI.GLC1 &&
+ CI.GLC0 == CI.GLC1 && CI.DLC0 == CI.DLC1 &&
(CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1);
}
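
For the buffer and scalar-buffer classes the check above requires the two accesses to be exactly adjacent (one access width apart in element units) and to agree on the cache-policy bits, which on GFX10 now includes dlc next to glc and slc. A standalone sketch of the check (field names follow CombineInfo, but this is not the pass's code):

#include <cassert>

struct CombineInfo {
  unsigned EltOffset0, EltOffset1; // offsets in element units
  unsigned Width0, Width1;         // access widths in element units
  bool GLC0, GLC1, SLC0, SLC1, DLC0, DLC1;
  bool IsSBufferLoadImm;           // slc check is skipped for S_BUFFER_LOAD_IMM
};

static bool offsetsCanBeCombined(const CombineInfo &CI) {
  bool Adjacent = CI.EltOffset0 + CI.Width0 == CI.EltOffset1 ||
                  CI.EltOffset1 + CI.Width1 == CI.EltOffset0;
  return Adjacent && CI.GLC0 == CI.GLC1 && CI.DLC0 == CI.DLC1 &&
         (CI.IsSBufferLoadImm || CI.SLC0 == CI.SLC1);
}

int main() {
  CombineInfo CI{};
  CI.EltOffset0 = 0; CI.Width0 = 2; CI.EltOffset1 = 2; CI.Width1 = 2;
  assert(offsetsCanBeCombined(CI));  // adjacent, cache bits all match
  CI.DLC1 = true;
  assert(!offsetsCanBeCombined(CI)); // a dlc mismatch now blocks the merge
  return 0;
}
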
@@ -567,8 +567,8 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
}
if (MBBI->mayLoadOrStore() &&
- (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
- !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))) {
+ (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
+ !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))) {
// We fail condition #1, but we may still be able to satisfy condition
// #2. Add this instruction to the move list and then we will check
// if condition #2 holds once we have selected the matching instruction.
@@ -640,6 +640,8 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
CI.SLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::slc)->getImm();
CI.SLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::slc)->getImm();
}
+ CI.DLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::dlc)->getImm();
+ CI.DLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::dlc)->getImm();
}
// Check both offsets fit in the reduced range.
@@ -647,7 +649,7 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
// move and make sure they are all safe to move down past the merged
// instruction.
if (widthsFit(*STM, CI) && offsetsCanBeCombined(CI))
- if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
+ if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))
return true;
}
@@ -656,8 +658,8 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
// it was safe to move I and also all the instruction in InstsToMove
// down past this instruction.
// check if we can move I across MBBI and if we can move all I's users
- if (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
- !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
+ if (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
+ !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))
break;
}
return false;
@@ -726,7 +728,8 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI) {
TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
.addReg(ImmReg)
- .addReg(AddrReg->getReg(), 0, BaseSubReg);
+ .addReg(AddrReg->getReg(), 0, BaseSubReg)
+ .addImm(0); // clamp bit
BaseSubReg = 0;
}
@@ -819,7 +822,8 @@ SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI) {
TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
.addReg(ImmReg)
- .addReg(AddrReg->getReg(), 0, BaseSubReg);
+ .addReg(AddrReg->getReg(), 0, BaseSubReg)
+ .addImm(0); // clamp bit
BaseSubReg = 0;
}
@@ -858,6 +862,7 @@ SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) {
.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
.addImm(MergedOffset) // offset
.addImm(CI.GLC0) // glc
+ .addImm(CI.DLC0) // dlc
.cloneMergedMemRefs({&*CI.I, &*CI.Paired});
std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
@@ -910,6 +915,7 @@ SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) {
.addImm(CI.GLC0) // glc
.addImm(CI.SLC0) // slc
.addImm(0) // tfe
+ .addImm(CI.DLC0) // dlc
.cloneMergedMemRefs({&*CI.I, &*CI.Paired});
std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
@@ -1089,9 +1095,10 @@ SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) {
MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
.addImm(std::min(CI.Offset0, CI.Offset1)) // offset
- .addImm(CI.GLC0) // glc
- .addImm(CI.SLC0) // slc
- .addImm(0) // tfe
+ .addImm(CI.GLC0) // glc
+ .addImm(CI.SLC0) // slc
+ .addImm(0) // tfe
+ .addImm(CI.DLC0) // dlc
.cloneMergedMemRefs({&*CI.I, &*CI.Paired});
moveInstsAfter(MIB, CI.InstsToMove);
@@ -1137,9 +1144,10 @@ unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI,
MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
MachineOperand OffsetHi =
createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
- unsigned CarryReg = MRI->createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
- unsigned DeadCarryReg =
- MRI->createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+
+ const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
+ unsigned CarryReg = MRI->createVirtualRegister(CarryRC);
+ unsigned DeadCarryReg = MRI->createVirtualRegister(CarryRC);
unsigned DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
unsigned DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
@@ -1147,7 +1155,8 @@ unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI,
BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_I32_e64), DestSub0)
.addReg(CarryReg, RegState::Define)
.addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
- .add(OffsetLo);
+ .add(OffsetLo)
+ .addImm(0); // clamp bit
(void)LoHalf;
LLVM_DEBUG(dbgs() << " "; LoHalf->dump(););
@@ -1156,7 +1165,8 @@ unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI,
.addReg(DeadCarryReg, RegState::Define | RegState::Dead)
.addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
.add(OffsetHi)
- .addReg(CarryReg, RegState::Kill);
+ .addReg(CarryReg, RegState::Kill)
+ .addImm(0); // clamp bit
(void)HiHalf;
LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););
diff --git a/lib/Target/AMDGPU/SILowerControlFlow.cpp b/lib/Target/AMDGPU/SILowerControlFlow.cpp
index 1aa1feebbdae..78f409cd9555 100644
--- a/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -1,9 +1,8 @@
//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -83,6 +82,16 @@ private:
LiveIntervals *LIS = nullptr;
MachineRegisterInfo *MRI = nullptr;
+ const TargetRegisterClass *BoolRC = nullptr;
+ unsigned AndOpc;
+ unsigned OrOpc;
+ unsigned XorOpc;
+ unsigned MovTermOpc;
+ unsigned Andn2TermOpc;
+ unsigned XorTermrOpc;
+ unsigned OrSaveExecOpc;
+ unsigned Exec;
+
void emitIf(MachineInstr &MI);
void emitElse(MachineInstr &MI);
void emitIfBreak(MachineInstr &MI);
@@ -176,7 +185,7 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
assert(SaveExec.getSubReg() == AMDGPU::NoSubRegister &&
Cond.getSubReg() == AMDGPU::NoSubRegister);
- unsigned SaveExecReg = SaveExec.getReg();
+ Register SaveExecReg = SaveExec.getReg();
MachineOperand &ImpDefSCC = MI.getOperand(4);
assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef());
@@ -188,26 +197,26 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
// Add an implicit def of exec to discourage scheduling VALU after this which
// will interfere with trying to form s_and_saveexec_b64 later.
- unsigned CopyReg = SimpleIf ? SaveExecReg
- : MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ Register CopyReg = SimpleIf ? SaveExecReg
+ : MRI->createVirtualRegister(BoolRC);
MachineInstr *CopyExec =
BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), CopyReg)
- .addReg(AMDGPU::EXEC)
- .addReg(AMDGPU::EXEC, RegState::ImplicitDefine);
+ .addReg(Exec)
+ .addReg(Exec, RegState::ImplicitDefine);
- unsigned Tmp = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned Tmp = MRI->createVirtualRegister(BoolRC);
MachineInstr *And =
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_AND_B64), Tmp)
+ BuildMI(MBB, I, DL, TII->get(AndOpc), Tmp)
.addReg(CopyReg)
- //.addReg(AMDGPU::EXEC)
- .addReg(Cond.getReg());
+ .add(Cond);
+
setImpSCCDefDead(*And, true);
MachineInstr *Xor = nullptr;
if (!SimpleIf) {
Xor =
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_XOR_B64), SaveExecReg)
+ BuildMI(MBB, I, DL, TII->get(XorOpc), SaveExecReg)
.addReg(Tmp)
.addReg(CopyReg);
setImpSCCDefDead(*Xor, ImpDefSCC.isDead());
@@ -216,7 +225,7 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
// Use a copy that is a terminator to get correct spill code placement it with
// fast regalloc.
MachineInstr *SetExec =
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64_term), AMDGPU::EXEC)
+ BuildMI(MBB, I, DL, TII->get(MovTermOpc), Exec)
.addReg(Tmp, RegState::Kill);
// Insert a pseudo terminator to help keep the verifier happy. This will also
@@ -240,7 +249,7 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
LIS->InsertMachineInstrInMaps(*SetExec);
LIS->InsertMachineInstrInMaps(*NewBr);
- LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC, TRI));
+ LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
MI.eraseFromParent();
// FIXME: Is there a better way of adjusting the liveness? It shouldn't be
@@ -257,7 +266,7 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
const DebugLoc &DL = MI.getDebugLoc();
- unsigned DstReg = MI.getOperand(0).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
assert(MI.getOperand(0).getSubReg() == AMDGPU::NoSubRegister);
bool ExecModified = MI.getOperand(3).getImm() != 0;
@@ -266,17 +275,17 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) {
// We are running before TwoAddressInstructions, and si_else's operands are
// tied. In order to correctly tie the registers, split this into a copy of
// the src like it does.
- unsigned CopyReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ Register CopyReg = MRI->createVirtualRegister(BoolRC);
MachineInstr *CopyExec =
BuildMI(MBB, Start, DL, TII->get(AMDGPU::COPY), CopyReg)
.add(MI.getOperand(1)); // Saved EXEC
// This must be inserted before phis and any spill code inserted before the
// else.
- unsigned SaveReg = ExecModified ?
- MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass) : DstReg;
+ Register SaveReg = ExecModified ?
+ MRI->createVirtualRegister(BoolRC) : DstReg;
MachineInstr *OrSaveExec =
- BuildMI(MBB, Start, DL, TII->get(AMDGPU::S_OR_SAVEEXEC_B64), SaveReg)
+ BuildMI(MBB, Start, DL, TII->get(OrSaveExecOpc), SaveReg)
.addReg(CopyReg);
MachineBasicBlock *DestBB = MI.getOperand(2).getMBB();
@@ -285,8 +294,8 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) {
if (ExecModified) {
MachineInstr *And =
- BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_AND_B64), DstReg)
- .addReg(AMDGPU::EXEC)
+ BuildMI(MBB, ElsePt, DL, TII->get(AndOpc), DstReg)
+ .addReg(Exec)
.addReg(SaveReg);
if (LIS)
@@ -294,8 +303,8 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) {
}
MachineInstr *Xor =
- BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_XOR_B64_term), AMDGPU::EXEC)
- .addReg(AMDGPU::EXEC)
+ BuildMI(MBB, ElsePt, DL, TII->get(XorTermrOpc), Exec)
+ .addReg(Exec)
.addReg(DstReg);
MachineInstr *Branch =
@@ -324,7 +333,7 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) {
LIS->createAndComputeVirtRegInterval(SaveReg);
// Let this be recomputed.
- LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC, TRI));
+ LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
}
void SILowerControlFlow::emitIfBreak(MachineInstr &MI) {
@@ -348,14 +357,14 @@ void SILowerControlFlow::emitIfBreak(MachineInstr &MI) {
// exit" mask.
MachineInstr *And = nullptr, *Or = nullptr;
if (!SkipAnding) {
- And = BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64), Dst)
- .addReg(AMDGPU::EXEC)
+ And = BuildMI(MBB, &MI, DL, TII->get(AndOpc), Dst)
+ .addReg(Exec)
.add(MI.getOperand(1));
- Or = BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
+ Or = BuildMI(MBB, &MI, DL, TII->get(OrOpc), Dst)
.addReg(Dst)
.add(MI.getOperand(2));
} else
- Or = BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
+ Or = BuildMI(MBB, &MI, DL, TII->get(OrOpc), Dst)
.add(MI.getOperand(1))
.add(MI.getOperand(2));
@@ -373,8 +382,8 @@ void SILowerControlFlow::emitLoop(MachineInstr &MI) {
const DebugLoc &DL = MI.getDebugLoc();
MachineInstr *AndN2 =
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64_term), AMDGPU::EXEC)
- .addReg(AMDGPU::EXEC)
+ BuildMI(MBB, &MI, DL, TII->get(Andn2TermOpc), Exec)
+ .addReg(Exec)
.add(MI.getOperand(0));
MachineInstr *Branch =
@@ -395,8 +404,8 @@ void SILowerControlFlow::emitEndCf(MachineInstr &MI) {
MachineBasicBlock::iterator InsPt = MBB.begin();
MachineInstr *NewMI =
- BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
- .addReg(AMDGPU::EXEC)
+ BuildMI(MBB, InsPt, DL, TII->get(OrOpc), Exec)
+ .addReg(Exec)
.add(MI.getOperand(0));
if (LIS)
@@ -428,13 +437,13 @@ void SILowerControlFlow::findMaskOperands(MachineInstr &MI, unsigned OpNo,
// does not really modify exec.
for (auto I = Def->getIterator(); I != MI.getIterator(); ++I)
if (I->modifiesRegister(AMDGPU::EXEC, TRI) &&
- !(I->isCopy() && I->getOperand(0).getReg() != AMDGPU::EXEC))
+ !(I->isCopy() && I->getOperand(0).getReg() != Exec))
return;
for (const auto &SrcOp : Def->explicit_operands())
if (SrcOp.isReg() && SrcOp.isUse() &&
(TargetRegisterInfo::isVirtualRegister(SrcOp.getReg()) ||
- SrcOp.getReg() == AMDGPU::EXEC))
+ SrcOp.getReg() == Exec))
Src.push_back(SrcOp);
}
@@ -472,6 +481,27 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
// This doesn't actually need LiveIntervals, but we can preserve them.
LIS = getAnalysisIfAvailable<LiveIntervals>();
MRI = &MF.getRegInfo();
+ BoolRC = TRI->getBoolRC();
+
+ if (ST.isWave32()) {
+ AndOpc = AMDGPU::S_AND_B32;
+ OrOpc = AMDGPU::S_OR_B32;
+ XorOpc = AMDGPU::S_XOR_B32;
+ MovTermOpc = AMDGPU::S_MOV_B32_term;
+ Andn2TermOpc = AMDGPU::S_ANDN2_B32_term;
+ XorTermrOpc = AMDGPU::S_XOR_B32_term;
+ OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B32;
+ Exec = AMDGPU::EXEC_LO;
+ } else {
+ AndOpc = AMDGPU::S_AND_B64;
+ OrOpc = AMDGPU::S_OR_B64;
+ XorOpc = AMDGPU::S_XOR_B64;
+ MovTermOpc = AMDGPU::S_MOV_B64_term;
+ Andn2TermOpc = AMDGPU::S_ANDN2_B64_term;
+ XorTermrOpc = AMDGPU::S_XOR_B64_term;
+ OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B64;
+ Exec = AMDGPU::EXEC;
+ }
MachineFunction::iterator NextBB;
for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
@@ -508,6 +538,8 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
case AMDGPU::S_AND_B64:
case AMDGPU::S_OR_B64:
+ case AMDGPU::S_AND_B32:
+ case AMDGPU::S_OR_B32:
// Cleanup bit manipulations on exec mask
combineMasks(MI);
Last = I;
diff --git a/lib/Target/AMDGPU/SILowerI1Copies.cpp b/lib/Target/AMDGPU/SILowerI1Copies.cpp
index eb038bb5d5fc..1c0f836f07e6 100644
--- a/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -1,15 +1,14 @@
//===-- SILowerI1Copies.cpp - Lower I1 Copies -----------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass lowers all occurrences of i1 values (with a vreg_1 register class)
-// to lane masks (64-bit scalar registers). The pass assumes machine SSA form
-// and a wave-level control flow graph.
+// to lane masks (32 / 64-bit scalar registers). The pass assumes machine SSA
+// form and a wave-level control flow graph.
//
// Before this pass, values that are semantically i1 and are defined and used
// within the same basic block are already represented as lane masks in scalar
@@ -51,6 +50,7 @@ public:
static char ID;
private:
+ bool IsWave32 = false;
MachineFunction *MF = nullptr;
MachineDominatorTree *DT = nullptr;
MachinePostDominatorTree *PDT = nullptr;
@@ -58,6 +58,14 @@ private:
const GCNSubtarget *ST = nullptr;
const SIInstrInfo *TII = nullptr;
+ unsigned ExecReg;
+ unsigned MovOp;
+ unsigned AndOp;
+ unsigned OrOp;
+ unsigned XorOp;
+ unsigned AndN2Op;
+ unsigned OrN2Op;
+
DenseSet<unsigned> ConstrainRegs;
public:
@@ -87,6 +95,11 @@ private:
MachineBasicBlock::iterator
getSaluInsertionAtEnd(MachineBasicBlock &MBB) const;
+ bool isVreg1(unsigned Reg) const {
+ return TargetRegisterInfo::isVirtualRegister(Reg) &&
+ MRI->getRegClass(Reg) == &AMDGPU::VReg_1RegClass;
+ }
+
bool isLaneMaskReg(unsigned Reg) const {
return TII->getRegisterInfo().isSGPRReg(*MRI, Reg) &&
TII->getRegisterInfo().getRegSizeInBits(Reg, *MRI) ==
@@ -412,8 +425,10 @@ FunctionPass *llvm::createSILowerI1CopiesPass() {
}
static unsigned createLaneMaskReg(MachineFunction &MF) {
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
MachineRegisterInfo &MRI = MF.getRegInfo();
- return MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ return MRI.createVirtualRegister(ST.isWave32() ? &AMDGPU::SReg_32RegClass
+ : &AMDGPU::SReg_64RegClass);
}
static unsigned insertUndefLaneMask(MachineBasicBlock &MBB) {
@@ -443,13 +458,32 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &TheMF) {
ST = &MF->getSubtarget<GCNSubtarget>();
TII = ST->getInstrInfo();
+ IsWave32 = ST->isWave32();
+
+ if (IsWave32) {
+ ExecReg = AMDGPU::EXEC_LO;
+ MovOp = AMDGPU::S_MOV_B32;
+ AndOp = AMDGPU::S_AND_B32;
+ OrOp = AMDGPU::S_OR_B32;
+ XorOp = AMDGPU::S_XOR_B32;
+ AndN2Op = AMDGPU::S_ANDN2_B32;
+ OrN2Op = AMDGPU::S_ORN2_B32;
+ } else {
+ ExecReg = AMDGPU::EXEC;
+ MovOp = AMDGPU::S_MOV_B64;
+ AndOp = AMDGPU::S_AND_B64;
+ OrOp = AMDGPU::S_OR_B64;
+ XorOp = AMDGPU::S_XOR_B64;
+ AndN2Op = AMDGPU::S_ANDN2_B64;
+ OrN2Op = AMDGPU::S_ORN2_B64;
+ }
lowerCopiesFromI1();
lowerPhis();
lowerCopiesToI1();
for (unsigned Reg : ConstrainRegs)
- MRI->constrainRegClass(Reg, &AMDGPU::SReg_64_XEXECRegClass);
+ MRI->constrainRegClass(Reg, &AMDGPU::SReg_1_XEXECRegClass);
ConstrainRegs.clear();
return true;
@@ -465,13 +499,10 @@ void SILowerI1Copies::lowerCopiesFromI1() {
unsigned DstReg = MI.getOperand(0).getReg();
unsigned SrcReg = MI.getOperand(1).getReg();
- if (!TargetRegisterInfo::isVirtualRegister(SrcReg) ||
- MRI->getRegClass(SrcReg) != &AMDGPU::VReg_1RegClass)
+ if (!isVreg1(SrcReg))
continue;
- if (isLaneMaskReg(DstReg) ||
- (TargetRegisterInfo::isVirtualRegister(DstReg) &&
- MRI->getRegClass(DstReg) == &AMDGPU::VReg_1RegClass))
+ if (isLaneMaskReg(DstReg) || isVreg1(DstReg))
continue;
// Copy into a 32-bit vector register.
@@ -484,6 +515,8 @@ void SILowerI1Copies::lowerCopiesFromI1() {
ConstrainRegs.insert(SrcReg);
BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
.addImm(0)
+ .addImm(0)
+ .addImm(0)
.addImm(-1)
.addReg(SrcReg);
DeadCopies.push_back(&MI);
@@ -503,18 +536,22 @@ void SILowerI1Copies::lowerPhis() {
SmallVector<MachineBasicBlock *, 4> IncomingBlocks;
SmallVector<unsigned, 4> IncomingRegs;
SmallVector<unsigned, 4> IncomingUpdated;
+#ifndef NDEBUG
+ DenseSet<unsigned> PhiRegisters;
+#endif
for (MachineBasicBlock &MBB : *MF) {
LF.initialize(MBB);
for (MachineInstr &MI : MBB.phis()) {
unsigned DstReg = MI.getOperand(0).getReg();
- if (MRI->getRegClass(DstReg) != &AMDGPU::VReg_1RegClass)
+ if (!isVreg1(DstReg))
continue;
LLVM_DEBUG(dbgs() << "Lower PHI: " << MI);
- MRI->setRegClass(DstReg, &AMDGPU::SReg_64RegClass);
+ MRI->setRegClass(DstReg, IsWave32 ? &AMDGPU::SReg_32RegClass
+ : &AMDGPU::SReg_64RegClass);
// Collect incoming values.
for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
@@ -525,18 +562,22 @@ void SILowerI1Copies::lowerPhis() {
if (IncomingDef->getOpcode() == AMDGPU::COPY) {
IncomingReg = IncomingDef->getOperand(1).getReg();
- assert(isLaneMaskReg(IncomingReg));
+ assert(isLaneMaskReg(IncomingReg) || isVreg1(IncomingReg));
assert(!IncomingDef->getOperand(1).getSubReg());
} else if (IncomingDef->getOpcode() == AMDGPU::IMPLICIT_DEF) {
continue;
} else {
- assert(IncomingDef->isPHI());
+ assert(IncomingDef->isPHI() || PhiRegisters.count(IncomingReg));
}
IncomingBlocks.push_back(IncomingMBB);
IncomingRegs.push_back(IncomingReg);
}
+#ifndef NDEBUG
+ PhiRegisters.insert(DstReg);
+#endif
+
// Phis in a loop that are observed outside the loop receive a simple but
// conservatively correct treatment.
MachineBasicBlock *PostDomBound = &MBB;
@@ -629,8 +670,7 @@ void SILowerI1Copies::lowerCopiesToI1() {
continue;
unsigned DstReg = MI.getOperand(0).getReg();
- if (!TargetRegisterInfo::isVirtualRegister(DstReg) ||
- MRI->getRegClass(DstReg) != &AMDGPU::VReg_1RegClass)
+ if (!isVreg1(DstReg))
continue;
if (MRI->use_empty(DstReg)) {
@@ -640,7 +680,8 @@ void SILowerI1Copies::lowerCopiesToI1() {
LLVM_DEBUG(dbgs() << "Lower Other: " << MI);
- MRI->setRegClass(DstReg, &AMDGPU::SReg_64RegClass);
+ MRI->setRegClass(DstReg, IsWave32 ? &AMDGPU::SReg_32RegClass
+ : &AMDGPU::SReg_64RegClass);
if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF)
continue;
@@ -649,7 +690,7 @@ void SILowerI1Copies::lowerCopiesToI1() {
assert(!MI.getOperand(1).getSubReg());
if (!TargetRegisterInfo::isVirtualRegister(SrcReg) ||
- !isLaneMaskReg(SrcReg)) {
+ (!isLaneMaskReg(SrcReg) && !isVreg1(SrcReg))) {
assert(TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 32);
unsigned TmpReg = createLaneMaskReg(*MF);
BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64), TmpReg)
@@ -699,7 +740,7 @@ bool SILowerI1Copies::isConstantLaneMask(unsigned Reg, bool &Val) const {
return false;
}
- if (MI->getOpcode() != AMDGPU::S_MOV_B64)
+ if (MI->getOpcode() != MovOp)
return false;
if (!MI->getOperand(1).isImm())
@@ -774,10 +815,10 @@ void SILowerI1Copies::buildMergeLaneMasks(MachineBasicBlock &MBB,
if (PrevVal == CurVal) {
BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(CurReg);
} else if (CurVal) {
- BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(AMDGPU::EXEC);
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(ExecReg);
} else {
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_XOR_B64), DstReg)
- .addReg(AMDGPU::EXEC)
+ BuildMI(MBB, I, DL, TII->get(XorOp), DstReg)
+ .addReg(ExecReg)
.addImm(-1);
}
return;
@@ -790,9 +831,9 @@ void SILowerI1Copies::buildMergeLaneMasks(MachineBasicBlock &MBB,
PrevMaskedReg = PrevReg;
} else {
PrevMaskedReg = createLaneMaskReg(*MF);
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ANDN2_B64), PrevMaskedReg)
+ BuildMI(MBB, I, DL, TII->get(AndN2Op), PrevMaskedReg)
.addReg(PrevReg)
- .addReg(AMDGPU::EXEC);
+ .addReg(ExecReg);
}
}
if (!CurConstant) {
@@ -801,9 +842,9 @@ void SILowerI1Copies::buildMergeLaneMasks(MachineBasicBlock &MBB,
CurMaskedReg = CurReg;
} else {
CurMaskedReg = createLaneMaskReg(*MF);
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_AND_B64), CurMaskedReg)
+ BuildMI(MBB, I, DL, TII->get(AndOp), CurMaskedReg)
.addReg(CurReg)
- .addReg(AMDGPU::EXEC);
+ .addReg(ExecReg);
}
}
@@ -814,12 +855,12 @@ void SILowerI1Copies::buildMergeLaneMasks(MachineBasicBlock &MBB,
BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
.addReg(PrevMaskedReg);
} else if (PrevConstant && PrevVal) {
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ORN2_B64), DstReg)
+ BuildMI(MBB, I, DL, TII->get(OrN2Op), DstReg)
.addReg(CurMaskedReg)
- .addReg(AMDGPU::EXEC);
+ .addReg(ExecReg);
} else {
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_OR_B64), DstReg)
+ BuildMI(MBB, I, DL, TII->get(OrOp), DstReg)
.addReg(PrevMaskedReg)
- .addReg(CurMaskedReg ? CurMaskedReg : (unsigned)AMDGPU::EXEC);
+ .addReg(CurMaskedReg ? CurMaskedReg : ExecReg);
}
}
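
The buildMergeLaneMasks changes above reduce to simple bit arithmetic: lanes active in EXEC take the current value, inactive lanes keep the previous one. A hedged host-side sketch of that arithmetic on 64-bit integers (wave64 shown; wave32 is the same on uint32_t), with the corresponding scalar opcodes noted in comments:

#include <cassert>
#include <cstdint>

// Lane-mask merge as plain bit arithmetic. This mirrors the S_ANDN2/S_AND/S_OR
// sequence emitted by buildMergeLaneMasks, but on host integers rather than
// SGPRs.
static uint64_t mergeLaneMasks(uint64_t Prev, uint64_t Cur, uint64_t Exec) {
  uint64_t PrevMasked = Prev & ~Exec; // S_ANDN2_B64: keep inactive lanes
  uint64_t CurMasked = Cur & Exec;    // S_AND_B64: take active lanes
  return PrevMasked | CurMasked;      // S_OR_B64: combine
}

int main() {
  uint64_t Exec = 0x00000000FFFFFFFFull; // lower 32 lanes active
  uint64_t Prev = 0xAAAAAAAAAAAAAAAAull;
  uint64_t Cur  = 0x5555555555555555ull;
  assert(mergeLaneMasks(Prev, Cur, Exec) == 0xAAAAAAAA55555555ull);
  return 0;
}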
diff --git a/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
new file mode 100644
index 000000000000..a82047473370
--- /dev/null
+++ b/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -0,0 +1,323 @@
+//===-- SILowerSGPRSpills.cpp ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Handle SGPR spills. This pass takes the place of PrologEpilogInserter for all
+// SGPR spills, so it must insert CSR SGPR spills as well as expand them.
+//
+// This pass must never create new SGPR virtual registers.
+//
+// FIXME: Must stop RegScavenger spills in later passes.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-lower-sgpr-spills"
+
+using MBBVector = SmallVector<MachineBasicBlock *, 4>;
+
+namespace {
+
+static cl::opt<bool> EnableSpillVGPRToAGPR(
+ "amdgpu-spill-vgpr-to-agpr",
+ cl::desc("Enable spilling VGPRs to AGPRs"),
+ cl::ReallyHidden,
+ cl::init(true));
+
+class SILowerSGPRSpills : public MachineFunctionPass {
+private:
+ const SIRegisterInfo *TRI = nullptr;
+ const SIInstrInfo *TII = nullptr;
+ VirtRegMap *VRM = nullptr;
+ LiveIntervals *LIS = nullptr;
+
+ // Save and Restore blocks of the current function. Typically there is a
+ // single save block, unless Windows EH funclets are involved.
+ MBBVector SaveBlocks;
+ MBBVector RestoreBlocks;
+
+public:
+ static char ID;
+
+ SILowerSGPRSpills() : MachineFunctionPass(ID) {}
+
+ void calculateSaveRestoreBlocks(MachineFunction &MF);
+ bool spillCalleeSavedRegs(MachineFunction &MF);
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+} // end anonymous namespace
+
+char SILowerSGPRSpills::ID = 0;
+
+INITIALIZE_PASS_BEGIN(SILowerSGPRSpills, DEBUG_TYPE,
+ "SI lower SGPR spill instructions", false, false)
+INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
+INITIALIZE_PASS_END(SILowerSGPRSpills, DEBUG_TYPE,
+ "SI lower SGPR spill instructions", false, false)
+
+char &llvm::SILowerSGPRSpillsID = SILowerSGPRSpills::ID;
+
+/// Insert spill code for the callee-saved registers used in the function.
+static void insertCSRSaves(MachineBasicBlock &SaveBlock,
+ ArrayRef<CalleeSavedInfo> CSI,
+ LiveIntervals *LIS) {
+ MachineFunction &MF = *SaveBlock.getParent();
+ const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+ const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+
+ MachineBasicBlock::iterator I = SaveBlock.begin();
+ if (!TFI->spillCalleeSavedRegisters(SaveBlock, I, CSI, TRI)) {
+ for (const CalleeSavedInfo &CS : CSI) {
+ // Insert the spill to the stack frame.
+ unsigned Reg = CS.getReg();
+
+ MachineInstrSpan MIS(I, &SaveBlock);
+ const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+
+ TII.storeRegToStackSlot(SaveBlock, I, Reg, true, CS.getFrameIdx(), RC,
+ TRI);
+
+ if (LIS) {
+ assert(std::distance(MIS.begin(), I) == 1);
+ MachineInstr &Inst = *std::prev(I);
+
+ LIS->InsertMachineInstrInMaps(Inst);
+ LIS->removeAllRegUnitsForPhysReg(Reg);
+ }
+ }
+ }
+}
+
+/// Insert restore code for the callee-saved registers used in the function.
+static void insertCSRRestores(MachineBasicBlock &RestoreBlock,
+ std::vector<CalleeSavedInfo> &CSI,
+ LiveIntervals *LIS) {
+ MachineFunction &MF = *RestoreBlock.getParent();
+ const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+ const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+
+ // Restore all registers immediately before the return and any
+ // terminators that precede it.
+ MachineBasicBlock::iterator I = RestoreBlock.getFirstTerminator();
+
+ // FIXME: Just emit the readlane/writelane directly
+ if (!TFI->restoreCalleeSavedRegisters(RestoreBlock, I, CSI, TRI)) {
+ for (const CalleeSavedInfo &CI : reverse(CSI)) {
+ unsigned Reg = CI.getReg();
+ const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+
+ TII.loadRegFromStackSlot(RestoreBlock, I, Reg, CI.getFrameIdx(), RC, TRI);
+ assert(I != RestoreBlock.begin() &&
+ "loadRegFromStackSlot didn't insert any code!");
+ // Insert in reverse order. loadRegFromStackSlot can insert
+ // multiple instructions.
+
+ if (LIS) {
+ MachineInstr &Inst = *std::prev(I);
+ LIS->InsertMachineInstrInMaps(Inst);
+ LIS->removeAllRegUnitsForPhysReg(Reg);
+ }
+ }
+ }
+}
+
+/// Compute the sets of entry and return blocks for saving and restoring
+/// callee-saved registers, and placing prolog and epilog code.
+void SILowerSGPRSpills::calculateSaveRestoreBlocks(MachineFunction &MF) {
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+
+ // Even when we do not change any CSR, we still want to insert the
+ // prologue and epilogue of the function.
+ // So set the save points for those.
+
+ // Use the points found by shrink-wrapping, if any.
+ if (MFI.getSavePoint()) {
+ SaveBlocks.push_back(MFI.getSavePoint());
+ assert(MFI.getRestorePoint() && "Both restore and save must be set");
+ MachineBasicBlock *RestoreBlock = MFI.getRestorePoint();
+ // If RestoreBlock does not have any successor and is not a return block
+ // then the end point is unreachable and we do not need to insert any
+ // epilogue.
+ if (!RestoreBlock->succ_empty() || RestoreBlock->isReturnBlock())
+ RestoreBlocks.push_back(RestoreBlock);
+ return;
+ }
+
+ // Save refs to entry and return blocks.
+ SaveBlocks.push_back(&MF.front());
+ for (MachineBasicBlock &MBB : MF) {
+ if (MBB.isEHFuncletEntry())
+ SaveBlocks.push_back(&MBB);
+ if (MBB.isReturnBlock())
+ RestoreBlocks.push_back(&MBB);
+ }
+}
+
+bool SILowerSGPRSpills::spillCalleeSavedRegs(MachineFunction &MF) {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const Function &F = MF.getFunction();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const SIFrameLowering *TFI = ST.getFrameLowering();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ RegScavenger *RS = nullptr;
+
+ // Determine which of the registers in the callee save list should be saved.
+ BitVector SavedRegs;
+ TFI->determineCalleeSavesSGPR(MF, SavedRegs, RS);
+
+ // Add the code to save and restore the callee saved registers.
+ if (!F.hasFnAttribute(Attribute::Naked)) {
+ // FIXME: This is a lie. The CalleeSavedInfo is incomplete, but this is
+ // necessary for verifier liveness checks.
+ MFI.setCalleeSavedInfoValid(true);
+
+ std::vector<CalleeSavedInfo> CSI;
+ const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
+
+ for (unsigned I = 0; CSRegs[I]; ++I) {
+ unsigned Reg = CSRegs[I];
+ if (SavedRegs.test(Reg)) {
+ const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+ int JunkFI = MFI.CreateStackObject(TRI->getSpillSize(*RC),
+ TRI->getSpillAlignment(*RC),
+ true);
+
+ CSI.push_back(CalleeSavedInfo(Reg, JunkFI));
+ }
+ }
+
+ if (!CSI.empty()) {
+ for (MachineBasicBlock *SaveBlock : SaveBlocks)
+ insertCSRSaves(*SaveBlock, CSI, LIS);
+
+ for (MachineBasicBlock *RestoreBlock : RestoreBlocks)
+ insertCSRRestores(*RestoreBlock, CSI, LIS);
+ return true;
+ }
+ }
+
+ return false;
+}
+
+bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ TII = ST.getInstrInfo();
+ TRI = &TII->getRegisterInfo();
+
+ VRM = getAnalysisIfAvailable<VirtRegMap>();
+
+ assert(SaveBlocks.empty() && RestoreBlocks.empty());
+
+ // First, expose any CSR SGPR spills. This is mostly the same as what PEI
+ // does, but somewhat simpler.
+ calculateSaveRestoreBlocks(MF);
+ bool HasCSRs = spillCalleeSavedRegs(MF);
+
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ if (!MFI.hasStackObjects() && !HasCSRs) {
+ SaveBlocks.clear();
+ RestoreBlocks.clear();
+ return false;
+ }
+
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+ const bool SpillVGPRToAGPR = ST.hasMAIInsts() && FuncInfo->hasSpilledVGPRs()
+ && EnableSpillVGPRToAGPR;
+
+ bool MadeChange = false;
+
+ const bool SpillToAGPR = EnableSpillVGPRToAGPR && ST.hasMAIInsts();
+
+ // TODO: CSR VGPRs will never be spilled to AGPRs. These can probably be
+ // handled as SpilledToReg in regular PrologEpilogInserter.
+ if ((TRI->spillSGPRToVGPR() && (HasCSRs || FuncInfo->hasSpilledSGPRs())) ||
+ SpillVGPRToAGPR) {
+ // Process all SGPR spills before frame offsets are finalized. Ideally SGPRs
+ // are spilled to VGPRs, in which case we can eliminate the stack usage.
+ //
+ // This operates under the assumption that only other SGPR spills are users
+ // of the frame index.
+ for (MachineBasicBlock &MBB : MF) {
+ MachineBasicBlock::iterator Next;
+ for (auto I = MBB.begin(), E = MBB.end(); I != E; I = Next) {
+ MachineInstr &MI = *I;
+ Next = std::next(I);
+
+ if (SpillToAGPR && TII->isVGPRSpill(MI)) {
+ // Try to eliminate stack used by VGPR spills before frame
+ // finalization.
+ unsigned FIOp = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+ AMDGPU::OpName::vaddr);
+ int FI = MI.getOperand(FIOp).getIndex();
+ unsigned VReg = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)
+ ->getReg();
+ if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI,
+ TRI->isAGPR(MRI, VReg))) {
+ TRI->eliminateFrameIndex(MI, 0, FIOp, nullptr);
+ continue;
+ }
+ }
+
+ if (!TII->isSGPRSpill(MI))
+ continue;
+
+ int FI = TII->getNamedOperand(MI, AMDGPU::OpName::addr)->getIndex();
+ assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
+ if (FuncInfo->allocateSGPRSpillToVGPR(MF, FI)) {
+ bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex(MI, FI, nullptr);
+ (void)Spilled;
+ assert(Spilled && "failed to spill SGPR to VGPR when allocated");
+ }
+ }
+ }
+
+ for (MachineBasicBlock &MBB : MF) {
+ for (auto SSpill : FuncInfo->getSGPRSpillVGPRs())
+ MBB.addLiveIn(SSpill.VGPR);
+
+ for (MCPhysReg Reg : FuncInfo->getVGPRSpillAGPRs())
+ MBB.addLiveIn(Reg);
+
+ for (MCPhysReg Reg : FuncInfo->getAGPRSpillVGPRs())
+ MBB.addLiveIn(Reg);
+
+ MBB.sortUniqueLiveIns();
+ }
+
+ MadeChange = true;
+ }
+
+ SaveBlocks.clear();
+ RestoreBlocks.clear();
+
+ return MadeChange;
+}
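
The calculateSaveRestoreBlocks function in the new pass follows the same shape as PrologEpilogInserter: the entry block (plus any EH funclet entries) receives the saves and every return block receives the restores, unless shrink-wrapping already chose the points. A simplified standalone sketch, with basic blocks reduced to a few flags (the Block type and field names here are hypothetical, not LLVM API):

#include <vector>

// Toy stand-in for MachineBasicBlock: just the properties the selection needs.
struct Block {
  bool IsEntry = false;
  bool IsEHFuncletEntry = false;
  bool IsReturn = false;
};

struct SaveRestoreBlocks {
  std::vector<const Block *> Saves;
  std::vector<const Block *> Restores;
};

// Without shrink-wrapping info, CSR saves go in the entry (and any EH funclet
// entries) and restores go in every return block.
static SaveRestoreBlocks computeSaveRestoreBlocks(const std::vector<Block> &F) {
  SaveRestoreBlocks Result;
  for (const Block &B : F) {
    if (B.IsEntry || B.IsEHFuncletEntry)
      Result.Saves.push_back(&B);
    if (B.IsReturn)
      Result.Restores.push_back(&B);
  }
  return Result;
}

int main() {
  std::vector<Block> F(3);
  F[0].IsEntry = true;
  F[2].IsReturn = true;
  SaveRestoreBlocks SR = computeSaveRestoreBlocks(F);
  return (SR.Saves.size() == 1 && SR.Restores.size() == 1) ? 0 : 1;
}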
diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 181cc41bd5ff..46da974a2f45 100644
--- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -1,9 +1,8 @@
//===- SIMachineFunctionInfo.cpp - SI Machine Function Info ---------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -29,6 +28,7 @@ using namespace llvm;
SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
: AMDGPUMachineFunction(MF),
+ Mode(MF.getFunction()),
PrivateSegmentBuffer(false),
DispatchPtr(false),
QueuePtr(false),
@@ -46,7 +46,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
ImplicitBufferPtr(false),
ImplicitArgPtr(false),
GITPtrHigh(0xffffffff),
- HighBitsOf32BitAddress(0) {
+ HighBitsOf32BitAddress(0),
+ GDSSize(0) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const Function &F = MF.getFunction();
FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
@@ -69,8 +70,10 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
// Non-entry functions have no special inputs for now, other registers
// required for scratch access.
ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;
- ScratchWaveOffsetReg = AMDGPU::SGPR4;
- FrameOffsetReg = AMDGPU::SGPR5;
+ ScratchWaveOffsetReg = AMDGPU::SGPR33;
+
+ // TODO: Pick a high register, and shift down, similar to a kernel.
+ FrameOffsetReg = AMDGPU::SGPR34;
StackPtrOffsetReg = AMDGPU::SGPR32;
ArgInfo.PrivateSegmentBuffer =
@@ -88,33 +91,23 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
}
}
- if (ST.debuggerEmitPrologue()) {
- // Enable everything.
+ if (F.hasFnAttribute("amdgpu-work-group-id-x"))
WorkGroupIDX = true;
- WorkGroupIDY = true;
- WorkGroupIDZ = true;
- WorkItemIDX = true;
- WorkItemIDY = true;
- WorkItemIDZ = true;
- } else {
- if (F.hasFnAttribute("amdgpu-work-group-id-x"))
- WorkGroupIDX = true;
- if (F.hasFnAttribute("amdgpu-work-group-id-y"))
- WorkGroupIDY = true;
+ if (F.hasFnAttribute("amdgpu-work-group-id-y"))
+ WorkGroupIDY = true;
- if (F.hasFnAttribute("amdgpu-work-group-id-z"))
- WorkGroupIDZ = true;
+ if (F.hasFnAttribute("amdgpu-work-group-id-z"))
+ WorkGroupIDZ = true;
- if (F.hasFnAttribute("amdgpu-work-item-id-x"))
- WorkItemIDX = true;
+ if (F.hasFnAttribute("amdgpu-work-item-id-x"))
+ WorkItemIDX = true;
- if (F.hasFnAttribute("amdgpu-work-item-id-y"))
- WorkItemIDY = true;
+ if (F.hasFnAttribute("amdgpu-work-item-id-y"))
+ WorkItemIDY = true;
- if (F.hasFnAttribute("amdgpu-work-item-id-z"))
- WorkItemIDZ = true;
- }
+ if (F.hasFnAttribute("amdgpu-work-item-id-z"))
+ WorkItemIDZ = true;
const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
bool HasStackObjects = FrameInfo.hasStackObjects();
@@ -154,9 +147,20 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
KernargSegmentPtr = true;
if (ST.hasFlatAddressSpace() && isEntryFunction() && isAmdHsaOrMesa) {
+ auto hasNonSpillStackObjects = [&]() {
+ // Avoid expensive checking if there are no stack objects.
+ if (!HasStackObjects)
+ return false;
+ for (auto OI = FrameInfo.getObjectIndexBegin(),
+ OE = FrameInfo.getObjectIndexEnd(); OI != OE; ++OI)
+ if (!FrameInfo.isSpillSlotObjectIndex(OI))
+ return true;
+ // All stack objects are spill slots.
+ return false;
+ };
// TODO: This could be refined a lot. The attribute is a poor way of
// detecting calls that may require it before argument lowering.
- if (HasStackObjects || F.hasFnAttribute("amdgpu-flat-scratch"))
+ if (hasNonSpillStackObjects() || F.hasFnAttribute("amdgpu-flat-scratch"))
FlatScratchInit = true;
}
@@ -169,6 +173,10 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
S = A.getValueAsString();
if (!S.empty())
S.consumeInteger(0, HighBitsOf32BitAddress);
+
+ S = F.getFnAttribute("amdgpu-gds-size").getValueAsString();
+ if (!S.empty())
+ S.consumeInteger(0, GDSSize);
}
void SIMachineFunctionInfo::limitOccupancy(const MachineFunction &MF) {
@@ -239,6 +247,17 @@ static bool isCalleeSavedReg(const MCPhysReg *CSRegs, MCPhysReg Reg) {
return false;
}
+/// \returns true if \p NumNeed extra lanes are available in VGPRs already used
+/// for SGPR spilling.
+//
+// FIXME: This only works after processFunctionBeforeFrameFinalized
+bool SIMachineFunctionInfo::haveFreeLanesForSGPRSpill(const MachineFunction &MF,
+ unsigned NumNeed) const {
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ unsigned WaveSize = ST.getWavefrontSize();
+ return NumVGPRSpillLanes + NumNeed <= WaveSize * SpillVGPRs.size();
+}
+
/// Reserve a slice of a VGPR to support spilling for FrameIndex \p FI.
bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
int FI) {
@@ -260,7 +279,7 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
int NumLanes = Size / 4;
- const MCPhysReg *CSRegs = TRI->getCalleeSavedRegs(&MF);
+ const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
// Make sure to handle the case where a wide SGPR spill may span between two
// VGPRs.
@@ -300,26 +319,92 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
return true;
}
-void SIMachineFunctionInfo::removeSGPRToVGPRFrameIndices(MachineFrameInfo &MFI) {
- for (auto &R : SGPRToVGPRSpills)
- MFI.RemoveStackObject(R.first);
+/// Reserve AGPRs or VGPRs to support spilling for FrameIndex \p FI.
+/// Either AGPRs are spilled to VGPRs or vice versa.
+/// Returns true if \p FI can be eliminated completely.
+bool SIMachineFunctionInfo::allocateVGPRSpillToAGPR(MachineFunction &MF,
+ int FI,
+ bool isAGPRtoVGPR) {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+
+ assert(ST.hasMAIInsts() && FrameInfo.isSpillSlotObjectIndex(FI));
+
+ auto &Spill = VGPRToAGPRSpills[FI];
+
+ // This has already been allocated.
+ if (!Spill.Lanes.empty())
+ return Spill.FullyAllocated;
+
+ unsigned Size = FrameInfo.getObjectSize(FI);
+ unsigned NumLanes = Size / 4;
+ Spill.Lanes.resize(NumLanes, AMDGPU::NoRegister);
+
+ const TargetRegisterClass &RC =
+ isAGPRtoVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::AGPR_32RegClass;
+ auto Regs = RC.getRegisters();
+
+ auto &SpillRegs = isAGPRtoVGPR ? SpillAGPR : SpillVGPR;
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ Spill.FullyAllocated = true;
+
+ // FIXME: Move allocation logic out of MachineFunctionInfo and initialize
+ // once.
+ BitVector OtherUsedRegs;
+ OtherUsedRegs.resize(TRI->getNumRegs());
+
+ const uint32_t *CSRMask =
+ TRI->getCallPreservedMask(MF, MF.getFunction().getCallingConv());
+ if (CSRMask)
+ OtherUsedRegs.setBitsInMask(CSRMask);
+
+ // TODO: Should include register tuples, but doesn't matter with current
+ // usage.
+ for (MCPhysReg Reg : SpillAGPR)
+ OtherUsedRegs.set(Reg);
+ for (MCPhysReg Reg : SpillVGPR)
+ OtherUsedRegs.set(Reg);
+
+ SmallVectorImpl<MCPhysReg>::const_iterator NextSpillReg = Regs.begin();
+ for (unsigned I = 0; I < NumLanes; ++I) {
+ NextSpillReg = std::find_if(
+ NextSpillReg, Regs.end(), [&MRI, &OtherUsedRegs](MCPhysReg Reg) {
+ return MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg) &&
+ !OtherUsedRegs[Reg];
+ });
+
+ if (NextSpillReg == Regs.end()) { // Registers exhausted
+ Spill.FullyAllocated = false;
+ break;
+ }
+
+ OtherUsedRegs.set(*NextSpillReg);
+ SpillRegs.push_back(*NextSpillReg);
+ Spill.Lanes[I] = *NextSpillReg++;
+ }
+
+ return Spill.FullyAllocated;
}
+void SIMachineFunctionInfo::removeDeadFrameIndices(MachineFrameInfo &MFI) {
+ // The FP spill hasn't been inserted yet, so keep it around.
+ for (auto &R : SGPRToVGPRSpills) {
+ if (R.first != FramePointerSaveIndex)
+ MFI.RemoveStackObject(R.first);
+ }
-/// \returns VGPR used for \p Dim' work item ID.
-unsigned SIMachineFunctionInfo::getWorkItemIDVGPR(unsigned Dim) const {
- switch (Dim) {
- case 0:
- assert(hasWorkItemIDX());
- return AMDGPU::VGPR0;
- case 1:
- assert(hasWorkItemIDY());
- return AMDGPU::VGPR1;
- case 2:
- assert(hasWorkItemIDZ());
- return AMDGPU::VGPR2;
+ // All other SGPRs must be allocated on the default stack, so reset the stack
+ // ID.
+ for (int i = MFI.getObjectIndexBegin(), e = MFI.getObjectIndexEnd(); i != e;
+ ++i)
+ if (i != FramePointerSaveIndex)
+ MFI.setStackID(i, TargetStackID::Default);
+
+ for (auto &R : VGPRToAGPRSpills) {
+ if (R.second.FullyAllocated)
+ MFI.RemoveStackObject(R.first);
}
- llvm_unreachable("unexpected dimension");
}
MCPhysReg SIMachineFunctionInfo::getNextUserSGPR() const {
@@ -330,3 +415,97 @@ MCPhysReg SIMachineFunctionInfo::getNextUserSGPR() const {
MCPhysReg SIMachineFunctionInfo::getNextSystemSGPR() const {
return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs;
}
+
+static yaml::StringValue regToString(unsigned Reg,
+ const TargetRegisterInfo &TRI) {
+ yaml::StringValue Dest;
+ {
+ raw_string_ostream OS(Dest.Value);
+ OS << printReg(Reg, &TRI);
+ }
+ return Dest;
+}
+
+static Optional<yaml::SIArgumentInfo>
+convertArgumentInfo(const AMDGPUFunctionArgInfo &ArgInfo,
+ const TargetRegisterInfo &TRI) {
+ yaml::SIArgumentInfo AI;
+
+ auto convertArg = [&](Optional<yaml::SIArgument> &A,
+ const ArgDescriptor &Arg) {
+ if (!Arg)
+ return false;
+
+ // Create a register or stack argument.
+ yaml::SIArgument SA = yaml::SIArgument::createArgument(Arg.isRegister());
+ if (Arg.isRegister()) {
+ raw_string_ostream OS(SA.RegisterName.Value);
+ OS << printReg(Arg.getRegister(), &TRI);
+ } else
+ SA.StackOffset = Arg.getStackOffset();
+ // Check and update the optional mask.
+ if (Arg.isMasked())
+ SA.Mask = Arg.getMask();
+
+ A = SA;
+ return true;
+ };
+
+ bool Any = false;
+ Any |= convertArg(AI.PrivateSegmentBuffer, ArgInfo.PrivateSegmentBuffer);
+ Any |= convertArg(AI.DispatchPtr, ArgInfo.DispatchPtr);
+ Any |= convertArg(AI.QueuePtr, ArgInfo.QueuePtr);
+ Any |= convertArg(AI.KernargSegmentPtr, ArgInfo.KernargSegmentPtr);
+ Any |= convertArg(AI.DispatchID, ArgInfo.DispatchID);
+ Any |= convertArg(AI.FlatScratchInit, ArgInfo.FlatScratchInit);
+ Any |= convertArg(AI.PrivateSegmentSize, ArgInfo.PrivateSegmentSize);
+ Any |= convertArg(AI.WorkGroupIDX, ArgInfo.WorkGroupIDX);
+ Any |= convertArg(AI.WorkGroupIDY, ArgInfo.WorkGroupIDY);
+ Any |= convertArg(AI.WorkGroupIDZ, ArgInfo.WorkGroupIDZ);
+ Any |= convertArg(AI.WorkGroupInfo, ArgInfo.WorkGroupInfo);
+ Any |= convertArg(AI.PrivateSegmentWaveByteOffset,
+ ArgInfo.PrivateSegmentWaveByteOffset);
+ Any |= convertArg(AI.ImplicitArgPtr, ArgInfo.ImplicitArgPtr);
+ Any |= convertArg(AI.ImplicitBufferPtr, ArgInfo.ImplicitBufferPtr);
+ Any |= convertArg(AI.WorkItemIDX, ArgInfo.WorkItemIDX);
+ Any |= convertArg(AI.WorkItemIDY, ArgInfo.WorkItemIDY);
+ Any |= convertArg(AI.WorkItemIDZ, ArgInfo.WorkItemIDZ);
+
+ if (Any)
+ return AI;
+
+ return None;
+}
+
+yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
+ const llvm::SIMachineFunctionInfo& MFI,
+ const TargetRegisterInfo &TRI)
+ : ExplicitKernArgSize(MFI.getExplicitKernArgSize()),
+ MaxKernArgAlign(MFI.getMaxKernArgAlign()),
+ LDSSize(MFI.getLDSSize()),
+ IsEntryFunction(MFI.isEntryFunction()),
+ NoSignedZerosFPMath(MFI.hasNoSignedZerosFPMath()),
+ MemoryBound(MFI.isMemoryBound()),
+ WaveLimiter(MFI.needsWaveLimiter()),
+ ScratchRSrcReg(regToString(MFI.getScratchRSrcReg(), TRI)),
+ ScratchWaveOffsetReg(regToString(MFI.getScratchWaveOffsetReg(), TRI)),
+ FrameOffsetReg(regToString(MFI.getFrameOffsetReg(), TRI)),
+ StackPtrOffsetReg(regToString(MFI.getStackPtrOffsetReg(), TRI)),
+ ArgInfo(convertArgumentInfo(MFI.getArgInfo(), TRI)),
+ Mode(MFI.getMode()) {}
+
+void yaml::SIMachineFunctionInfo::mappingImpl(yaml::IO &YamlIO) {
+ MappingTraits<SIMachineFunctionInfo>::mapping(YamlIO, *this);
+}
+
+bool SIMachineFunctionInfo::initializeBaseYamlFields(
+ const yaml::SIMachineFunctionInfo &YamlMFI) {
+ ExplicitKernArgSize = YamlMFI.ExplicitKernArgSize;
+ MaxKernArgAlign = YamlMFI.MaxKernArgAlign;
+ LDSSize = YamlMFI.LDSSize;
+ IsEntryFunction = YamlMFI.IsEntryFunction;
+ NoSignedZerosFPMath = YamlMFI.NoSignedZerosFPMath;
+ MemoryBound = YamlMFI.MemoryBound;
+ WaveLimiter = YamlMFI.WaveLimiter;
+ return false;
+}
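
The allocateVGPRSpillToAGPR function above is essentially a first-fit scan over a candidate register pool that skips registers already reserved or in use (callee-saved registers, previously picked spill registers, and so on). A reduced sketch of that scan, with registers modeled as small integers and "used" reduced to a bool vector (the names here are illustrative, not LLVM API):

#include <algorithm>
#include <vector>

// First-fit allocation of NumLanes spill slots out of Pool, skipping registers
// marked as used. Returns whether every lane could be placed in a register.
static bool allocateSpillLanes(unsigned NumLanes,
                               const std::vector<unsigned> &Pool,
                               std::vector<bool> &Used,
                               std::vector<unsigned> &Lanes) {
  auto Next = Pool.begin();
  for (unsigned I = 0; I != NumLanes; ++I) {
    Next = std::find_if(Next, Pool.end(),
                        [&Used](unsigned Reg) { return !Used[Reg]; });
    if (Next == Pool.end())
      return false; // registers exhausted; the spill stays on the stack
    Used[*Next] = true;
    Lanes.push_back(*Next++);
  }
  return true; // fully allocated; the frame index can be removed
}

int main() {
  std::vector<unsigned> Pool = {0, 1, 2, 3};
  std::vector<bool> Used(4, false);
  Used[1] = true; // pretend register 1 is a CSR already in use
  std::vector<unsigned> Lanes;
  bool Full = allocateSpillLanes(3, Pool, Used, Lanes);
  return (Full && Lanes.size() == 3) ? 0 : 1;
}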
diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index ef91d1e43075..f19b20ceb5da 100644
--- a/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -1,9 +1,8 @@
//==- SIMachineFunctionInfo.h - SIMachineFunctionInfo interface --*- C++ -*-==//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -16,13 +15,16 @@
#include "AMDGPUArgumentUsageInfo.h"
#include "AMDGPUMachineFunction.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/SparseBitVector.h"
+#include "llvm/CodeGen/MIRYamlMapping.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
@@ -38,12 +40,19 @@ class MachineFrameInfo;
class MachineFunction;
class TargetRegisterClass;
-class AMDGPUImagePseudoSourceValue : public PseudoSourceValue {
+class AMDGPUPseudoSourceValue : public PseudoSourceValue {
public:
- // TODO: Is the img rsrc useful?
- explicit AMDGPUImagePseudoSourceValue(const TargetInstrInfo &TII) :
- PseudoSourceValue(PseudoSourceValue::TargetCustom, TII) {}
+ enum AMDGPUPSVKind : unsigned {
+ PSVBuffer = PseudoSourceValue::TargetCustom,
+ PSVImage,
+ GWSResource
+ };
+
+protected:
+ AMDGPUPseudoSourceValue(unsigned Kind, const TargetInstrInfo &TII)
+ : PseudoSourceValue(Kind, TII) {}
+public:
bool isConstant(const MachineFrameInfo *) const override {
// This should probably be true for most images, but we will start by being
// conservative.
@@ -59,29 +68,250 @@ public:
}
};
-class AMDGPUBufferPseudoSourceValue : public PseudoSourceValue {
+class AMDGPUBufferPseudoSourceValue final : public AMDGPUPseudoSourceValue {
public:
- explicit AMDGPUBufferPseudoSourceValue(const TargetInstrInfo &TII) :
- PseudoSourceValue(PseudoSourceValue::TargetCustom, TII) { }
+ explicit AMDGPUBufferPseudoSourceValue(const TargetInstrInfo &TII)
+ : AMDGPUPseudoSourceValue(PSVBuffer, TII) {}
- bool isConstant(const MachineFrameInfo *) const override {
- // This should probably be true for most images, but we will start by being
- // conservative.
- return false;
+ static bool classof(const PseudoSourceValue *V) {
+ return V->kind() == PSVBuffer;
}
+};
+class AMDGPUImagePseudoSourceValue final : public AMDGPUPseudoSourceValue {
+public:
+ // TODO: Is the img rsrc useful?
+ explicit AMDGPUImagePseudoSourceValue(const TargetInstrInfo &TII)
+ : AMDGPUPseudoSourceValue(PSVImage, TII) {}
+
+ static bool classof(const PseudoSourceValue *V) {
+ return V->kind() == PSVImage;
+ }
+};
+
+class AMDGPUGWSResourcePseudoSourceValue final : public AMDGPUPseudoSourceValue {
+public:
+ explicit AMDGPUGWSResourcePseudoSourceValue(const TargetInstrInfo &TII)
+ : AMDGPUPseudoSourceValue(GWSResource, TII) {}
+
+ static bool classof(const PseudoSourceValue *V) {
+ return V->kind() == GWSResource;
+ }
+
+ // This resource is not accessible as memory from the IR.
bool isAliased(const MachineFrameInfo *) const override {
- return true;
+ return false;
}
+ // This resource is not accessible as memory from the IR.
bool mayAlias(const MachineFrameInfo *) const override {
- return true;
+ return false;
+ }
+
+ void printCustom(raw_ostream &OS) const override {
+ OS << "GWSResource";
+ }
+};
+
+namespace yaml {
+
+struct SIArgument {
+ bool IsRegister;
+ union {
+ StringValue RegisterName;
+ unsigned StackOffset;
+ };
+ Optional<unsigned> Mask;
+
+ // Default constructor, which creates a stack argument.
+ SIArgument() : IsRegister(false), StackOffset(0) {}
+ SIArgument(const SIArgument &Other) {
+ IsRegister = Other.IsRegister;
+ if (IsRegister) {
+ ::new ((void *)std::addressof(RegisterName))
+ StringValue(Other.RegisterName);
+ } else
+ StackOffset = Other.StackOffset;
+ Mask = Other.Mask;
+ }
+ SIArgument &operator=(const SIArgument &Other) {
+ IsRegister = Other.IsRegister;
+ if (IsRegister) {
+ ::new ((void *)std::addressof(RegisterName))
+ StringValue(Other.RegisterName);
+ } else
+ StackOffset = Other.StackOffset;
+ Mask = Other.Mask;
+ return *this;
+ }
+ ~SIArgument() {
+ if (IsRegister)
+ RegisterName.~StringValue();
+ }
+
+ // Helper to create a register or stack argument.
+ static inline SIArgument createArgument(bool IsReg) {
+ if (IsReg)
+ return SIArgument(IsReg);
+ return SIArgument();
+ }
+
+private:
+ // Construct a register argument.
+ SIArgument(bool) : IsRegister(true), RegisterName() {}
+};
+
+template <> struct MappingTraits<SIArgument> {
+ static void mapping(IO &YamlIO, SIArgument &A) {
+ if (YamlIO.outputting()) {
+ if (A.IsRegister)
+ YamlIO.mapRequired("reg", A.RegisterName);
+ else
+ YamlIO.mapRequired("offset", A.StackOffset);
+ } else {
+ auto Keys = YamlIO.keys();
+ if (is_contained(Keys, "reg")) {
+ A = SIArgument::createArgument(true);
+ YamlIO.mapRequired("reg", A.RegisterName);
+ } else if (is_contained(Keys, "offset"))
+ YamlIO.mapRequired("offset", A.StackOffset);
+ else
+ YamlIO.setError("missing required key 'reg' or 'offset'");
+ }
+ YamlIO.mapOptional("mask", A.Mask);
+ }
+ static const bool flow = true;
+};
+
+struct SIArgumentInfo {
+ Optional<SIArgument> PrivateSegmentBuffer;
+ Optional<SIArgument> DispatchPtr;
+ Optional<SIArgument> QueuePtr;
+ Optional<SIArgument> KernargSegmentPtr;
+ Optional<SIArgument> DispatchID;
+ Optional<SIArgument> FlatScratchInit;
+ Optional<SIArgument> PrivateSegmentSize;
+
+ Optional<SIArgument> WorkGroupIDX;
+ Optional<SIArgument> WorkGroupIDY;
+ Optional<SIArgument> WorkGroupIDZ;
+ Optional<SIArgument> WorkGroupInfo;
+ Optional<SIArgument> PrivateSegmentWaveByteOffset;
+
+ Optional<SIArgument> ImplicitArgPtr;
+ Optional<SIArgument> ImplicitBufferPtr;
+
+ Optional<SIArgument> WorkItemIDX;
+ Optional<SIArgument> WorkItemIDY;
+ Optional<SIArgument> WorkItemIDZ;
+};
+
+template <> struct MappingTraits<SIArgumentInfo> {
+ static void mapping(IO &YamlIO, SIArgumentInfo &AI) {
+ YamlIO.mapOptional("privateSegmentBuffer", AI.PrivateSegmentBuffer);
+ YamlIO.mapOptional("dispatchPtr", AI.DispatchPtr);
+ YamlIO.mapOptional("queuePtr", AI.QueuePtr);
+ YamlIO.mapOptional("kernargSegmentPtr", AI.KernargSegmentPtr);
+ YamlIO.mapOptional("dispatchID", AI.DispatchID);
+ YamlIO.mapOptional("flatScratchInit", AI.FlatScratchInit);
+ YamlIO.mapOptional("privateSegmentSize", AI.PrivateSegmentSize);
+
+ YamlIO.mapOptional("workGroupIDX", AI.WorkGroupIDX);
+ YamlIO.mapOptional("workGroupIDY", AI.WorkGroupIDY);
+ YamlIO.mapOptional("workGroupIDZ", AI.WorkGroupIDZ);
+ YamlIO.mapOptional("workGroupInfo", AI.WorkGroupInfo);
+ YamlIO.mapOptional("privateSegmentWaveByteOffset",
+ AI.PrivateSegmentWaveByteOffset);
+
+ YamlIO.mapOptional("implicitArgPtr", AI.ImplicitArgPtr);
+ YamlIO.mapOptional("implicitBufferPtr", AI.ImplicitBufferPtr);
+
+ YamlIO.mapOptional("workItemIDX", AI.WorkItemIDX);
+ YamlIO.mapOptional("workItemIDY", AI.WorkItemIDY);
+ YamlIO.mapOptional("workItemIDZ", AI.WorkItemIDZ);
+ }
+};
+
+// Defaults correspond to the default FP mode of the default calling convention.
+struct SIMode {
+ bool IEEE = true;
+ bool DX10Clamp = true;
+
+ SIMode() = default;
+
+ SIMode(const AMDGPU::SIModeRegisterDefaults &Mode) {
+ IEEE = Mode.IEEE;
+ DX10Clamp = Mode.DX10Clamp;
}
+
+ bool operator ==(const SIMode Other) const {
+ return IEEE == Other.IEEE && DX10Clamp == Other.DX10Clamp;
+ }
+};
+
+template <> struct MappingTraits<SIMode> {
+ static void mapping(IO &YamlIO, SIMode &Mode) {
+ YamlIO.mapOptional("ieee", Mode.IEEE, true);
+ YamlIO.mapOptional("dx10-clamp", Mode.DX10Clamp, true);
+ }
+};
+
+struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
+ uint64_t ExplicitKernArgSize = 0;
+ unsigned MaxKernArgAlign = 0;
+ unsigned LDSSize = 0;
+ bool IsEntryFunction = false;
+ bool NoSignedZerosFPMath = false;
+ bool MemoryBound = false;
+ bool WaveLimiter = false;
+
+ StringValue ScratchRSrcReg = "$private_rsrc_reg";
+ StringValue ScratchWaveOffsetReg = "$scratch_wave_offset_reg";
+ StringValue FrameOffsetReg = "$fp_reg";
+ StringValue StackPtrOffsetReg = "$sp_reg";
+
+ Optional<SIArgumentInfo> ArgInfo;
+ SIMode Mode;
+
+ SIMachineFunctionInfo() = default;
+ SIMachineFunctionInfo(const llvm::SIMachineFunctionInfo &,
+ const TargetRegisterInfo &TRI);
+
+ void mappingImpl(yaml::IO &YamlIO) override;
+ ~SIMachineFunctionInfo() = default;
};
+template <> struct MappingTraits<SIMachineFunctionInfo> {
+ static void mapping(IO &YamlIO, SIMachineFunctionInfo &MFI) {
+ YamlIO.mapOptional("explicitKernArgSize", MFI.ExplicitKernArgSize,
+ UINT64_C(0));
+ YamlIO.mapOptional("maxKernArgAlign", MFI.MaxKernArgAlign, 0u);
+ YamlIO.mapOptional("ldsSize", MFI.LDSSize, 0u);
+ YamlIO.mapOptional("isEntryFunction", MFI.IsEntryFunction, false);
+ YamlIO.mapOptional("noSignedZerosFPMath", MFI.NoSignedZerosFPMath, false);
+ YamlIO.mapOptional("memoryBound", MFI.MemoryBound, false);
+ YamlIO.mapOptional("waveLimiter", MFI.WaveLimiter, false);
+ YamlIO.mapOptional("scratchRSrcReg", MFI.ScratchRSrcReg,
+ StringValue("$private_rsrc_reg"));
+ YamlIO.mapOptional("scratchWaveOffsetReg", MFI.ScratchWaveOffsetReg,
+ StringValue("$scratch_wave_offset_reg"));
+ YamlIO.mapOptional("frameOffsetReg", MFI.FrameOffsetReg,
+ StringValue("$fp_reg"));
+ YamlIO.mapOptional("stackPtrOffsetReg", MFI.StackPtrOffsetReg,
+ StringValue("$sp_reg"));
+ YamlIO.mapOptional("argumentInfo", MFI.ArgInfo);
+ YamlIO.mapOptional("mode", MFI.Mode, SIMode());
+ }
+};
+
+} // end namespace yaml
+
/// This class keeps track of the SPI_SP_INPUT_ADDR config register, which
/// tells the hardware which interpolation parameters to load.
class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
+ friend class GCNTargetMachine;
+
unsigned TIDReg = AMDGPU::NoRegister;
// Registers that may be reserved for spilling purposes. These may be the same
@@ -99,6 +329,9 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
AMDGPUFunctionArgInfo ArgInfo;
+ // State of MODE register, assumed FP mode.
+ AMDGPU::SIModeRegisterDefaults Mode;
+
// Graphics info.
unsigned PSInputAddr = 0;
unsigned PSInputEnable = 0;
@@ -124,16 +357,11 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
// unit. Minimum - first, maximum - second.
std::pair<unsigned, unsigned> WavesPerEU = {0, 0};
- // Stack object indices for work group IDs.
- std::array<int, 3> DebuggerWorkGroupIDStackObjectIndices = {{0, 0, 0}};
-
- // Stack object indices for work item IDs.
- std::array<int, 3> DebuggerWorkItemIDStackObjectIndices = {{0, 0, 0}};
-
DenseMap<const Value *,
std::unique_ptr<const AMDGPUBufferPseudoSourceValue>> BufferPSVs;
DenseMap<const Value *,
std::unique_ptr<const AMDGPUImagePseudoSourceValue>> ImagePSVs;
+ std::unique_ptr<const AMDGPUGWSResourcePseudoSourceValue> GWSResourcePSV;
private:
unsigned LDSWaveSpillSize = 0;
@@ -182,6 +410,7 @@ private:
unsigned GITPtrHigh;
unsigned HighBitsOf32BitAddress;
+ unsigned GDSSize;
// Current recorded maximum possible occupancy.
unsigned Occupancy;
@@ -213,6 +442,15 @@ public:
SGPRSpillVGPRCSR(unsigned V, Optional<int> F) : VGPR(V), FI(F) {}
};
+ struct VGPRSpillToAGPR {
+ SmallVector<MCPhysReg, 32> Lanes;
+ bool FullyAllocated = false;
+ };
+
+ SparseBitVector<> WWMReservedRegs;
+
+ void ReserveWWMRegister(unsigned reg) { WWMReservedRegs.set(reg); }
+
private:
// SGPR->VGPR spilling support.
using SpillRegMask = std::pair<unsigned, unsigned>;
@@ -223,9 +461,25 @@ private:
unsigned NumVGPRSpillLanes = 0;
SmallVector<SGPRSpillVGPRCSR, 2> SpillVGPRs;
+ DenseMap<int, VGPRSpillToAGPR> VGPRToAGPRSpills;
+
+ // AGPRs used for VGPR spills.
+ SmallVector<MCPhysReg, 32> SpillAGPR;
+
+ // VGPRs used for AGPR spills.
+ SmallVector<MCPhysReg, 32> SpillVGPR;
+
+public: // FIXME
+ /// If set, the SGPR used to save and restore the register used for the
+ /// frame pointer.
+ unsigned SGPRForFPSaveRestoreCopy = 0;
+ Optional<int> FramePointerSaveIndex;
+
public:
SIMachineFunctionInfo(const MachineFunction &MF);
+ bool initializeBaseYamlFields(const yaml::SIMachineFunctionInfo &YamlMFI);
+
ArrayRef<SpilledReg> getSGPRToVGPRSpills(int FrameIndex) const {
auto I = SGPRToVGPRSpills.find(FrameIndex);
return (I == SGPRToVGPRSpills.end()) ?
@@ -236,8 +490,29 @@ public:
return SpillVGPRs;
}
+ ArrayRef<MCPhysReg> getAGPRSpillVGPRs() const {
+ return SpillAGPR;
+ }
+
+ ArrayRef<MCPhysReg> getVGPRSpillAGPRs() const {
+ return SpillVGPR;
+ }
+
+ MCPhysReg getVGPRToAGPRSpill(int FrameIndex, unsigned Lane) const {
+ auto I = VGPRToAGPRSpills.find(FrameIndex);
+ return (I == VGPRToAGPRSpills.end()) ? (MCPhysReg)AMDGPU::NoRegister
+ : I->second.Lanes[Lane];
+ }
+
+ AMDGPU::SIModeRegisterDefaults getMode() const {
+ return Mode;
+ }
+
+ bool haveFreeLanesForSGPRSpill(const MachineFunction &MF,
+ unsigned NumLane) const;
bool allocateSGPRSpillToVGPR(MachineFunction &MF, int FI);
- void removeSGPRToVGPRFrameIndices(MachineFrameInfo &MFI);
+ bool allocateVGPRSpillToAGPR(MachineFunction &MF, int FI, bool isAGPRtoVGPR);
+ void removeDeadFrameIndices(MachineFrameInfo &MFI);
bool hasCalculatedTID() const { return TIDReg != 0; };
unsigned getTIDReg() const { return TIDReg; };
@@ -386,8 +661,9 @@ public:
return ArgInfo.getPreloadedValue(Value);
}
- unsigned getPreloadedReg(AMDGPUFunctionArgInfo::PreloadedValue Value) const {
- return ArgInfo.getPreloadedValue(Value).first->getRegister();
+ Register getPreloadedReg(AMDGPUFunctionArgInfo::PreloadedValue Value) const {
+ auto Arg = ArgInfo.getPreloadedValue(Value).first;
+ return Arg ? Arg->getRegister() : Register();
}
unsigned getGITPtrHigh() const {
@@ -398,6 +674,10 @@ public:
return HighBitsOf32BitAddress;
}
+ unsigned getGDSSize() const {
+ return GDSSize;
+ }
+
unsigned getNumUserSGPRs() const {
return NumUserSGPRs;
}
@@ -429,6 +709,11 @@ public:
return FrameOffsetReg;
}
+ void setFrameOffsetReg(unsigned Reg) {
+ assert(Reg != 0 && "Should never be unset");
+ FrameOffsetReg = Reg;
+ }
+
void setStackPtrOffsetReg(unsigned Reg) {
assert(Reg != 0 && "Should never be unset");
StackPtrOffsetReg = Reg;
@@ -445,8 +730,6 @@ public:
void setScratchWaveOffsetReg(unsigned Reg) {
assert(Reg != 0 && "Should never be unset");
ScratchWaveOffsetReg = Reg;
- if (isEntryFunction())
- FrameOffsetReg = ScratchWaveOffsetReg;
}
unsigned getQueuePtrUserSGPR() const {
@@ -565,30 +848,6 @@ public:
return WavesPerEU.second;
}
- /// \returns Stack object index for \p Dim's work group ID.
- int getDebuggerWorkGroupIDStackObjectIndex(unsigned Dim) const {
- assert(Dim < 3);
- return DebuggerWorkGroupIDStackObjectIndices[Dim];
- }
-
- /// Sets stack object index for \p Dim's work group ID to \p ObjectIdx.
- void setDebuggerWorkGroupIDStackObjectIndex(unsigned Dim, int ObjectIdx) {
- assert(Dim < 3);
- DebuggerWorkGroupIDStackObjectIndices[Dim] = ObjectIdx;
- }
-
- /// \returns Stack object index for \p Dim's work item ID.
- int getDebuggerWorkItemIDStackObjectIndex(unsigned Dim) const {
- assert(Dim < 3);
- return DebuggerWorkItemIDStackObjectIndices[Dim];
- }
-
- /// Sets stack object index for \p Dim's work item ID to \p ObjectIdx.
- void setDebuggerWorkItemIDStackObjectIndex(unsigned Dim, int ObjectIdx) {
- assert(Dim < 3);
- DebuggerWorkItemIDStackObjectIndices[Dim] = ObjectIdx;
- }
-
/// \returns SGPR used for \p Dim's work group ID.
unsigned getWorkGroupIDSGPR(unsigned Dim) const {
switch (Dim) {
@@ -605,9 +864,6 @@ public:
llvm_unreachable("unexpected dimension");
}
- /// \returns VGPR used for \p Dim' work item ID.
- unsigned getWorkItemIDVGPR(unsigned Dim) const;
-
unsigned getLDSWaveSpillSize() const {
return LDSWaveSpillSize;
}
@@ -630,6 +886,15 @@ public:
return PSV.first->second.get();
}
+ const AMDGPUGWSResourcePseudoSourceValue *getGWSPSV(const SIInstrInfo &TII) {
+ if (!GWSResourcePSV) {
+ GWSResourcePSV =
+ llvm::make_unique<AMDGPUGWSResourcePseudoSourceValue>(TII);
+ }
+
+ return GWSResourcePSV.get();
+ }
+
unsigned getOccupancy() const {
return Occupancy;
}
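
The new yaml::SIArgument above keeps a non-trivial StringValue inside a union, so its copy constructor, copy assignment, and destructor must manage the active member by hand with placement new and an explicit destructor call. A self-contained sketch of that pattern, using std::string in place of StringValue; this is simplified and not exception-safe, so it is an illustration of the idiom, not a drop-in replacement for the struct in the diff:

#include <memory>
#include <new>
#include <string>
#include <utility>

// A tagged union holding either a non-trivial member (the register name) or a
// trivial one (a stack offset), with the active member tracked by a flag.
struct Argument {
  using String = std::string;

  bool IsRegister;
  union {
    String RegisterName; // non-trivial: needs placement new and an explicit dtor
    unsigned StackOffset;
  };

  Argument() : IsRegister(false), StackOffset(0) {} // stack argument
  explicit Argument(String Name) : IsRegister(true) {
    ::new (static_cast<void *>(std::addressof(RegisterName)))
        String(std::move(Name));
  }
  Argument(const Argument &Other) : IsRegister(Other.IsRegister) {
    if (IsRegister)
      ::new (static_cast<void *>(std::addressof(RegisterName)))
          String(Other.RegisterName);
    else
      StackOffset = Other.StackOffset;
  }
  // Destroy-then-rebuild assignment; simple but not exception-safe, which is
  // acceptable for a sketch.
  Argument &operator=(const Argument &Other) {
    if (this != &Other) {
      this->~Argument();
      ::new (static_cast<void *>(this)) Argument(Other);
    }
    return *this;
  }
  ~Argument() {
    if (IsRegister)
      RegisterName.~String();
  }
};

int main() {
  Argument Reg(std::string("$sgpr30_sgpr31"));
  Argument Stack;   // defaults to a stack argument at offset 0
  Stack = Reg;      // assignment switches the active union member safely
  return Stack.IsRegister ? 0 : 1;
}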
diff --git a/lib/Target/AMDGPU/SIMachineScheduler.cpp b/lib/Target/AMDGPU/SIMachineScheduler.cpp
index fb7e670068fe..ebbdf80f9567 100644
--- a/lib/Target/AMDGPU/SIMachineScheduler.cpp
+++ b/lib/Target/AMDGPU/SIMachineScheduler.cpp
@@ -1,9 +1,8 @@
//===-- SIMachineScheduler.cpp - SI Scheduler Interface -------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -1875,6 +1874,8 @@ void SIScheduleDAGMI::moveLowLatencies() {
bool CopyForLowLat = false;
for (SDep& SuccDep : SU->Succs) {
SUnit *Succ = SuccDep.getSUnit();
+ if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
+ continue;
if (SITII->isLowLatencyInstruction(*Succ->getInstr())) {
CopyForLowLat = true;
}
@@ -1955,7 +1956,7 @@ void SIScheduleDAGMI::schedule()
for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
SUnit *SU = &SUnits[i];
- MachineOperand *BaseLatOp;
+ const MachineOperand *BaseLatOp;
int64_t OffLatReg;
if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
IsLowLatencySU[i] = 1;
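
The moveLowLatencies fix above guards the successor walk so that weak edges and out-of-range node numbers (boundary or artificial nodes) are skipped before the successor's instruction is inspected. A toy sketch of the same filtering, with dependence edges reduced to plain structs (Edge and its fields are hypothetical names, not the real SDep interface):

#include <vector>

struct Edge {
  unsigned SuccNodeNum;
  bool IsWeak;
  bool SuccIsLowLatency;
};

// Returns true if any strong, in-range successor is a low-latency instruction,
// applying the same guard as the SIScheduleDAGMI fix.
static bool hasStrongLowLatencySucc(const std::vector<Edge> &Succs,
                                    unsigned DAGSize) {
  for (const Edge &E : Succs) {
    if (E.IsWeak || E.SuccNodeNum >= DAGSize)
      continue; // skip weak edges and boundary/artificial nodes
    if (E.SuccIsLowLatency)
      return true;
  }
  return false;
}

int main() {
  std::vector<Edge> Succs = {
      {5, /*IsWeak=*/true, /*SuccIsLowLatency=*/true},  // ignored: weak edge
      {9, /*IsWeak=*/false, /*SuccIsLowLatency=*/true}, // ignored: out of range
      {2, /*IsWeak=*/false, /*SuccIsLowLatency=*/false},
  };
  return hasStrongLowLatencySucc(Succs, /*DAGSize=*/8) ? 1 : 0;
}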
diff --git a/lib/Target/AMDGPU/SIMachineScheduler.h b/lib/Target/AMDGPU/SIMachineScheduler.h
index 0ce68ac6a897..c28a7be4d03a 100644
--- a/lib/Target/AMDGPU/SIMachineScheduler.h
+++ b/lib/Target/AMDGPU/SIMachineScheduler.h
@@ -1,9 +1,8 @@
//===-- SIMachineScheduler.h - SI Scheduler Interface -----------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index b4a4e9e33133..4320e6c957a0 100644
--- a/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -1,9 +1,8 @@
//===- SIMemoryLegalizer.cpp ----------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -146,7 +145,7 @@ private:
// only contains a single address space.
if ((OrderingAddrSpace == InstrAddrSpace) &&
isPowerOf2_32(uint32_t(InstrAddrSpace)))
- IsCrossAddressSpaceOrdering = false;
+ this->IsCrossAddressSpaceOrdering = false;
}
public:
@@ -353,6 +352,40 @@ public:
};
+class SIGfx10CacheControl : public SIGfx7CacheControl {
+protected:
+ bool CuMode = false;
+
+ /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
+ /// is modified, false otherwise.
+ bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
+ return enableNamedBit<AMDGPU::OpName::dlc>(MI);
+ }
+
+public:
+
+ SIGfx10CacheControl(const GCNSubtarget &ST, bool CuMode) :
+ SIGfx7CacheControl(ST), CuMode(CuMode) {};
+
+ bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const override;
+
+ bool enableNonTemporal(const MachineBasicBlock::iterator &MI) const override;
+
+ bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ Position Pos) const override;
+
+ bool insertWait(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ SIMemOp Op,
+ bool IsCrossAddrSpaceOrdering,
+ Position Pos) const override;
+};
+
class SIMemoryLegalizer final : public MachineFunctionPass {
private:
@@ -418,35 +451,46 @@ void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
SIAtomicAddrSpace InstrScope) const {
- /// TODO: For now assume OpenCL memory model which treats each
- /// address space as having a separate happens-before relation, and
- /// so an instruction only has ordering with respect to the address
- /// space it accesses, and if it accesses multiple address spaces it
- /// does not require ordering of operations in different address
- /// spaces.
- if (SSID == SyncScope::System)
+ if (SSID == SyncScope::System)
+ return std::make_tuple(SIAtomicScope::SYSTEM,
+ SIAtomicAddrSpace::ATOMIC,
+ true);
+ if (SSID == MMI->getAgentSSID())
+ return std::make_tuple(SIAtomicScope::AGENT,
+ SIAtomicAddrSpace::ATOMIC,
+ true);
+ if (SSID == MMI->getWorkgroupSSID())
+ return std::make_tuple(SIAtomicScope::WORKGROUP,
+ SIAtomicAddrSpace::ATOMIC,
+ true);
+ if (SSID == MMI->getWavefrontSSID())
+ return std::make_tuple(SIAtomicScope::WAVEFRONT,
+ SIAtomicAddrSpace::ATOMIC,
+ true);
+ if (SSID == SyncScope::SingleThread)
+ return std::make_tuple(SIAtomicScope::SINGLETHREAD,
+ SIAtomicAddrSpace::ATOMIC,
+ true);
+ if (SSID == MMI->getSystemOneAddressSpaceSSID())
return std::make_tuple(SIAtomicScope::SYSTEM,
SIAtomicAddrSpace::ATOMIC & InstrScope,
false);
- if (SSID == MMI->getAgentSSID())
+ if (SSID == MMI->getAgentOneAddressSpaceSSID())
return std::make_tuple(SIAtomicScope::AGENT,
SIAtomicAddrSpace::ATOMIC & InstrScope,
false);
- if (SSID == MMI->getWorkgroupSSID())
+ if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
return std::make_tuple(SIAtomicScope::WORKGROUP,
SIAtomicAddrSpace::ATOMIC & InstrScope,
false);
- if (SSID == MMI->getWavefrontSSID())
+ if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
return std::make_tuple(SIAtomicScope::WAVEFRONT,
SIAtomicAddrSpace::ATOMIC & InstrScope,
false);
- if (SSID == SyncScope::SingleThread)
+ if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
return std::make_tuple(SIAtomicScope::SINGLETHREAD,
SIAtomicAddrSpace::ATOMIC & InstrScope,
false);
- /// TODO: To support HSA Memory Model need to add additional memory
- /// scopes that specify that do require cross address space
- /// ordering.
return None;
}
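
The rewritten toSIAtomicScope distinguishes the ordinary sync scopes, which now order all atomic address spaces with cross-address-space ordering, from the new "one address space" variants, which only order the address spaces the instruction itself touches. A reduced model of that mapping; the enum and bitmask values are illustrative, not the real SyncScope IDs or address-space encodings:

#include <cstdint>
#include <optional>

enum class Scope { SingleThread, Wavefront, Workgroup, Agent, System };

struct SIAtomicInfo {
  Scope S;
  uint32_t OrderingAddrSpace;       // bitmask of address spaces to order
  bool IsCrossAddressSpaceOrdering; // whether fences span address spaces
};

static constexpr uint32_t AtomicAddrSpaces = 0b111; // global | LDS | scratch

// Map a (scope, one-address-space) pair to its ordering requirements. In the
// real code an unrecognized sync scope ID maps to None / std::nullopt.
static std::optional<SIAtomicInfo>
toAtomicScope(Scope S, bool OneAddressSpace, uint32_t InstrAddrSpace) {
  if (OneAddressSpace)
    return SIAtomicInfo{S, AtomicAddrSpaces & InstrAddrSpace, false};
  return SIAtomicInfo{S, AtomicAddrSpaces, true};
}

int main() {
  auto Info = toAtomicScope(Scope::Agent, /*OneAddressSpace=*/true,
                            /*InstrAddrSpace=*/0b001);
  return (Info && !Info->IsCrossAddressSpaceOrdering) ? 0 : 1;
}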
@@ -613,7 +657,9 @@ std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
GCNSubtarget::Generation Generation = ST.getGeneration();
if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
return make_unique<SIGfx6CacheControl>(ST);
- return make_unique<SIGfx7CacheControl>(ST);
+ if (Generation < AMDGPUSubtarget::GFX10)
+ return make_unique<SIGfx7CacheControl>(ST);
+ return make_unique<SIGfx10CacheControl>(ST, ST.isCuModeEnabled());
}
bool SIGfx6CacheControl::enableLoadCacheBypass(
@@ -722,13 +768,12 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
bool VMCnt = false;
bool LGKMCnt = false;
- bool EXPCnt = false;
if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
switch (Scope) {
case SIAtomicScope::SYSTEM:
case SIAtomicScope::AGENT:
- VMCnt = true;
+ VMCnt |= true;
break;
case SIAtomicScope::WORKGROUP:
case SIAtomicScope::WAVEFRONT:
@@ -752,7 +797,7 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
// also synchronizing with global/GDS memory as LDS operations
// could be reordered with respect to later global/GDS memory
// operations of the same wave.
- LGKMCnt = IsCrossAddrSpaceOrdering;
+ LGKMCnt |= IsCrossAddrSpaceOrdering;
break;
case SIAtomicScope::WAVEFRONT:
case SIAtomicScope::SINGLETHREAD:
@@ -774,7 +819,7 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
// also synchronizing with global/LDS memory as GDS operations
// could be reordered with respect to later global/LDS memory
// operations of the same wave.
- EXPCnt = IsCrossAddrSpaceOrdering;
+ LGKMCnt |= IsCrossAddrSpaceOrdering;
break;
case SIAtomicScope::WORKGROUP:
case SIAtomicScope::WAVEFRONT:
@@ -787,11 +832,11 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
}
}
- if (VMCnt || LGKMCnt || EXPCnt) {
+ if (VMCnt || LGKMCnt) {
unsigned WaitCntImmediate =
AMDGPU::encodeWaitcnt(IV,
VMCnt ? 0 : getVmcntBitMask(IV),
- EXPCnt ? 0 : getExpcntBitMask(IV),
+ getExpcntBitMask(IV),
LGKMCnt ? 0 : getLgkmcntBitMask(IV));
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
Changed = true;
@@ -851,6 +896,231 @@ bool SIGfx7CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
return Changed;
}
+bool SIGfx10CacheControl::enableLoadCacheBypass(
+ const MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const {
+ assert(MI->mayLoad() && !MI->mayStore());
+ bool Changed = false;
+
+ if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ /// TODO: Do not set glc for rmw atomic operations as they
+ /// implicitly bypass the L0/L1 caches.
+
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ case SIAtomicScope::AGENT:
+ Changed |= enableGLCBit(MI);
+ Changed |= enableDLCBit(MI);
+ break;
+ case SIAtomicScope::WORKGROUP:
+ // In WGP mode the waves of a work-group can be executing on either CU of
+ // the WGP, so the per-CU L0 must be bypassed. In CU mode all waves of a
+ // work-group are on the same CU, so the L0 does not need to be bypassed.
+ if (!CuMode) Changed |= enableGLCBit(MI);
+ break;
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // No cache to bypass.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ /// The scratch address space does not need the global memory caches
+ /// to be bypassed as all memory operations by the same thread are
+ /// sequentially consistent, and no other thread can access scratch
+ /// memory.
+
+ /// Other address spaces do not have a cache.
+
+ return Changed;
+}
+
+bool SIGfx10CacheControl::enableNonTemporal(
+ const MachineBasicBlock::iterator &MI) const {
+ assert(MI->mayLoad() ^ MI->mayStore());
+ bool Changed = false;
+
+ Changed |= enableSLCBit(MI);
+ /// TODO for store (non-rmw atomic) instructions also enableGLCBit(MI)
+
+ return Changed;
+}
+
+bool SIGfx10CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ Position Pos) const {
+ bool Changed = false;
+
+ MachineBasicBlock &MBB = *MI->getParent();
+ DebugLoc DL = MI->getDebugLoc();
+
+ if (Pos == Position::AFTER)
+ ++MI;
+
+ if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ case SIAtomicScope::AGENT:
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
+ Changed = true;
+ break;
+ case SIAtomicScope::WORKGROUP:
+ // In WGP mode the waves of a work-group can be executing on either CU of
+ // the WGP, so the L0, which is per CU, needs to be invalidated. In CU mode
+ // all waves of a work-group are on the same CU and share the same L0, so
+ // it does not need to be invalidated.
+ if (!CuMode) {
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
+ Changed = true;
+ }
+ break;
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // No cache to invalidate.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ /// The scratch address space does not need the global memory cache
+ /// to be flushed as all memory operations by the same thread are
+ /// sequentially consistent, and no other thread can access scratch
+ /// memory.
+
+ /// Other address spaces do not have a cache.
+
+ if (Pos == Position::AFTER)
+ --MI;
+
+ return Changed;
+}
+
+bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ SIMemOp Op,
+ bool IsCrossAddrSpaceOrdering,
+ Position Pos) const {
+ bool Changed = false;
+
+ MachineBasicBlock &MBB = *MI->getParent();
+ DebugLoc DL = MI->getDebugLoc();
+
+ if (Pos == Position::AFTER)
+ ++MI;
+
+ bool VMCnt = false;
+ bool VSCnt = false;
+ bool LGKMCnt = false;
+
+ if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ case SIAtomicScope::AGENT:
+ if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
+ VMCnt |= true;
+ if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
+ VSCnt |= true;
+ break;
+ case SIAtomicScope::WORKGROUP:
+ // In WGP mode the waves of a work-group can be executing on either CU of
+ // the WGP, so a wait is needed for operations to complete and become
+ // visible to waves on the other CU, as the L0 is per CU. In CU mode all
+ // waves of a work-group are on the same CU and share the same L0, so no
+ // wait is needed.
+ if (!CuMode) {
+ if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
+ VMCnt |= true;
+ if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
+ VSCnt |= true;
+ }
+ break;
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // The L0 cache keeps all memory operations in order for
+ // work-items in the same wavefront.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ case SIAtomicScope::AGENT:
+ case SIAtomicScope::WORKGROUP:
+ // If no cross address space ordering then an LDS waitcnt is not
+ // needed as LDS operations for all waves are executed in a
+ // total global ordering as observed by all waves. Required if
+ // also synchronizing with global/GDS memory as LDS operations
+ // could be reordered with respect to later global/GDS memory
+ // operations of the same wave.
+ LGKMCnt |= IsCrossAddrSpaceOrdering;
+ break;
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // The LDS keeps all memory operations in order for
+ // the same wavefront.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ case SIAtomicScope::AGENT:
+ // If no cross address space ordering then a GDS waitcnt is not
+ // needed as GDS operations for all waves are executed in a
+ // total global ordering as observed by all waves. Required if
+ // also synchronizing with global/LDS memory as GDS operations
+ // could be reordered with respect to later global/LDS memory
+ // operations of the same wave.
+ LGKMCnt |= IsCrossAddrSpaceOrdering;
+ break;
+ case SIAtomicScope::WORKGROUP:
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // The GDS keeps all memory operations in order for
+ // the same work-group.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ if (VMCnt || LGKMCnt) {
+ unsigned WaitCntImmediate =
+ AMDGPU::encodeWaitcnt(IV,
+ VMCnt ? 0 : getVmcntBitMask(IV),
+ getExpcntBitMask(IV),
+ LGKMCnt ? 0 : getLgkmcntBitMask(IV));
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
+ Changed = true;
+ }
+
+ if (VSCnt) {
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
+ .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
+ .addImm(0);
+ Changed = true;
+ }
+
+ if (Pos == Position::AFTER)
+ --MI;
+
+ return Changed;
+}
+
bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
if (AtomicPseudoMIs.empty())
return false;
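
For reference, the counter-selection logic in the SIGfx10CacheControl::insertWait hunk above can be exercised on its own. The following is a minimal standalone C++ sketch, not the pass itself: Scope, AddrSpace, MemOp and selectWaits are simplified stand-ins for the pass's own types, and the real code goes on to encode the chosen counters into an S_WAITCNT immediate plus a separate S_WAITCNT_VSCNT for the store counter.

#include <iostream>

// Simplified stand-ins for the pass's scope/address-space/operation enums.
enum class Scope { SingleThread, Wavefront, Workgroup, Agent, System };
enum AddrSpace : unsigned { GLOBAL = 1, LDS = 2, GDS = 4 };
enum MemOp : unsigned { LOAD = 1, STORE = 2 };

struct Waits { bool VMCnt = false, VSCnt = false, LGKMCnt = false; };

// Mirrors the decision logic above: on GFX10 loads and stores retire through
// separate counters (vmcnt vs. vscnt), the L0 only needs a wait at work-group
// scope when the WGP is not in CU mode, and LDS/GDS ordering only needs an
// lgkmcnt wait when cross-address-space ordering is requested.
Waits selectWaits(Scope S, unsigned AS, unsigned Op, bool CrossAS, bool CuMode) {
  Waits W;
  if (AS & GLOBAL) {
    bool NeedsWait = S == Scope::System || S == Scope::Agent ||
                     (S == Scope::Workgroup && !CuMode);
    if (NeedsWait) {
      W.VMCnt |= (Op & LOAD) != 0;
      W.VSCnt |= (Op & STORE) != 0;
    }
  }
  if ((AS & LDS) &&
      (S == Scope::System || S == Scope::Agent || S == Scope::Workgroup))
    W.LGKMCnt |= CrossAS;
  if ((AS & GDS) && (S == Scope::System || S == Scope::Agent))
    W.LGKMCnt |= CrossAS;
  return W;
}

int main() {
  Waits W = selectWaits(Scope::Agent, GLOBAL | LDS, LOAD | STORE,
                        /*CrossAS=*/true, /*CuMode=*/false);
  std::cout << W.VMCnt << W.VSCnt << W.LGKMCnt << "\n"; // prints 111
}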
diff --git a/lib/Target/AMDGPU/SIModeRegister.cpp b/lib/Target/AMDGPU/SIModeRegister.cpp
index 883fd308f2f4..a5edd7b3554a 100644
--- a/lib/Target/AMDGPU/SIModeRegister.cpp
+++ b/lib/Target/AMDGPU/SIModeRegister.cpp
@@ -1,9 +1,8 @@
//===-- SIModeRegister.cpp - Mode Register --------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
@@ -45,7 +44,7 @@ struct Status {
Status() : Mask(0), Mode(0){};
- Status(unsigned Mask, unsigned Mode) : Mask(Mask), Mode(Mode) {
+ Status(unsigned NewMask, unsigned NewMode) : Mask(NewMask), Mode(NewMode) {
Mode &= Mask;
};
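
The constructor change above fixes a shadowing subtlety rather than being a pure rename: with parameters named Mask and Mode, the Mode &= Mask in the body binds to the parameters, so only the local copies are masked and the stored member keeps its unmasked value; with the parameters renamed, the body operates on the members. A small standalone illustration (not part of the patch):

#include <cassert>

// Before the rename: the body's Mode &= Mask binds to the parameters, so the
// member is stored unmasked.
struct Before {
  unsigned Mask, Mode;
  Before(unsigned Mask, unsigned Mode) : Mask(Mask), Mode(Mode) { Mode &= Mask; }
};

// After the rename: the body refers to the members, so the stored mode is
// actually masked.
struct After {
  unsigned Mask, Mode;
  After(unsigned NewMask, unsigned NewMode) : Mask(NewMask), Mode(NewMode) {
    Mode &= Mask;
  }
};

int main() {
  assert(Before(0x1, 0x3).Mode == 0x3); // mask was silently ignored
  assert(After(0x1, 0x3).Mode == 0x1);  // mask applied as intended
}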
diff --git a/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
index ebcad30a1866..3227bff20513 100644
--- a/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
+++ b/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
@@ -1,9 +1,8 @@
//===-- SIOptimizeExecMasking.cpp -----------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -57,13 +56,16 @@ char SIOptimizeExecMasking::ID = 0;
char &llvm::SIOptimizeExecMaskingID = SIOptimizeExecMasking::ID;
/// If \p MI is a copy from exec, return the register copied to.
-static unsigned isCopyFromExec(const MachineInstr &MI) {
+static unsigned isCopyFromExec(const MachineInstr &MI, const GCNSubtarget &ST) {
switch (MI.getOpcode()) {
case AMDGPU::COPY:
case AMDGPU::S_MOV_B64:
- case AMDGPU::S_MOV_B64_term: {
+ case AMDGPU::S_MOV_B64_term:
+ case AMDGPU::S_MOV_B32:
+ case AMDGPU::S_MOV_B32_term: {
const MachineOperand &Src = MI.getOperand(1);
- if (Src.isReg() && Src.getReg() == AMDGPU::EXEC)
+ if (Src.isReg() &&
+ Src.getReg() == (ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC))
return MI.getOperand(0).getReg();
}
}
@@ -72,16 +74,20 @@ static unsigned isCopyFromExec(const MachineInstr &MI) {
}
/// If \p MI is a copy to exec, return the register copied from.
-static unsigned isCopyToExec(const MachineInstr &MI) {
+static unsigned isCopyToExec(const MachineInstr &MI, const GCNSubtarget &ST) {
switch (MI.getOpcode()) {
case AMDGPU::COPY:
- case AMDGPU::S_MOV_B64: {
+ case AMDGPU::S_MOV_B64:
+ case AMDGPU::S_MOV_B32: {
const MachineOperand &Dst = MI.getOperand(0);
- if (Dst.isReg() && Dst.getReg() == AMDGPU::EXEC && MI.getOperand(1).isReg())
+ if (Dst.isReg() &&
+ Dst.getReg() == (ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC) &&
+ MI.getOperand(1).isReg())
return MI.getOperand(1).getReg();
break;
}
case AMDGPU::S_MOV_B64_term:
+ case AMDGPU::S_MOV_B32_term:
llvm_unreachable("should have been replaced");
}
@@ -106,6 +112,23 @@ static unsigned isLogicalOpOnExec(const MachineInstr &MI) {
const MachineOperand &Src2 = MI.getOperand(2);
if (Src2.isReg() && Src2.getReg() == AMDGPU::EXEC)
return MI.getOperand(0).getReg();
+ break;
+ }
+ case AMDGPU::S_AND_B32:
+ case AMDGPU::S_OR_B32:
+ case AMDGPU::S_XOR_B32:
+ case AMDGPU::S_ANDN2_B32:
+ case AMDGPU::S_ORN2_B32:
+ case AMDGPU::S_NAND_B32:
+ case AMDGPU::S_NOR_B32:
+ case AMDGPU::S_XNOR_B32: {
+ const MachineOperand &Src1 = MI.getOperand(1);
+ if (Src1.isReg() && Src1.getReg() == AMDGPU::EXEC_LO)
+ return MI.getOperand(0).getReg();
+ const MachineOperand &Src2 = MI.getOperand(2);
+ if (Src2.isReg() && Src2.getReg() == AMDGPU::EXEC_LO)
+ return MI.getOperand(0).getReg();
+ break;
}
}
@@ -130,6 +153,22 @@ static unsigned getSaveExecOp(unsigned Opc) {
return AMDGPU::S_NOR_SAVEEXEC_B64;
case AMDGPU::S_XNOR_B64:
return AMDGPU::S_XNOR_SAVEEXEC_B64;
+ case AMDGPU::S_AND_B32:
+ return AMDGPU::S_AND_SAVEEXEC_B32;
+ case AMDGPU::S_OR_B32:
+ return AMDGPU::S_OR_SAVEEXEC_B32;
+ case AMDGPU::S_XOR_B32:
+ return AMDGPU::S_XOR_SAVEEXEC_B32;
+ case AMDGPU::S_ANDN2_B32:
+ return AMDGPU::S_ANDN2_SAVEEXEC_B32;
+ case AMDGPU::S_ORN2_B32:
+ return AMDGPU::S_ORN2_SAVEEXEC_B32;
+ case AMDGPU::S_NAND_B32:
+ return AMDGPU::S_NAND_SAVEEXEC_B32;
+ case AMDGPU::S_NOR_B32:
+ return AMDGPU::S_NOR_SAVEEXEC_B32;
+ case AMDGPU::S_XNOR_B32:
+ return AMDGPU::S_XNOR_SAVEEXEC_B32;
default:
return AMDGPU::INSTRUCTION_LIST_END;
}
@@ -140,7 +179,8 @@ static unsigned getSaveExecOp(unsigned Opc) {
// these is expected per block.
static bool removeTerminatorBit(const SIInstrInfo &TII, MachineInstr &MI) {
switch (MI.getOpcode()) {
- case AMDGPU::S_MOV_B64_term: {
+ case AMDGPU::S_MOV_B64_term:
+ case AMDGPU::S_MOV_B32_term: {
MI.setDesc(TII.get(AMDGPU::COPY));
return true;
}
@@ -150,12 +190,30 @@ static bool removeTerminatorBit(const SIInstrInfo &TII, MachineInstr &MI) {
MI.setDesc(TII.get(AMDGPU::S_XOR_B64));
return true;
}
+ case AMDGPU::S_XOR_B32_term: {
+ // This is only a terminator to get the correct spill code placement during
+ // register allocation.
+ MI.setDesc(TII.get(AMDGPU::S_XOR_B32));
+ return true;
+ }
+ case AMDGPU::S_OR_B32_term: {
+ // This is only a terminator to get the correct spill code placement during
+ // register allocation.
+ MI.setDesc(TII.get(AMDGPU::S_OR_B32));
+ return true;
+ }
case AMDGPU::S_ANDN2_B64_term: {
// This is only a terminator to get the correct spill code placement during
// register allocation.
MI.setDesc(TII.get(AMDGPU::S_ANDN2_B64));
return true;
}
+ case AMDGPU::S_ANDN2_B32_term: {
+ // This is only a terminator to get the correct spill code placement during
+ // register allocation.
+ MI.setDesc(TII.get(AMDGPU::S_ANDN2_B32));
+ return true;
+ }
default:
return false;
}
@@ -178,6 +236,7 @@ static MachineBasicBlock::reverse_iterator fixTerminators(
static MachineBasicBlock::reverse_iterator findExecCopy(
const SIInstrInfo &TII,
+ const GCNSubtarget &ST,
MachineBasicBlock &MBB,
MachineBasicBlock::reverse_iterator I,
unsigned CopyToExec) {
@@ -185,7 +244,7 @@ static MachineBasicBlock::reverse_iterator findExecCopy(
auto E = MBB.rend();
for (unsigned N = 0; N <= InstLimit && I != E; ++I, ++N) {
- unsigned CopyFromExec = isCopyFromExec(*I);
+ unsigned CopyFromExec = isCopyFromExec(*I, ST);
if (CopyFromExec != AMDGPU::NoRegister)
return I;
}
@@ -194,8 +253,8 @@ static MachineBasicBlock::reverse_iterator findExecCopy(
}
// XXX - Seems LivePhysRegs doesn't work correctly since it will incorrectly
-// repor tthe register as unavailable because a super-register with a lane mask
-// as unavailable.
+// report the register as unavailable because a super-register with a lane mask
+// is unavailable.
static bool isLiveOut(const MachineBasicBlock &MBB, unsigned Reg) {
for (MachineBasicBlock *Succ : MBB.successors()) {
if (Succ->isLiveIn(Reg))
@@ -212,6 +271,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
const SIInstrInfo *TII = ST.getInstrInfo();
+ unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
// Optimize sequences emitted for control flow lowering. They are originally
// emitted as the separate operations because spill code may need to be
@@ -230,13 +290,13 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
if (I == E)
continue;
- unsigned CopyToExec = isCopyToExec(*I);
+ unsigned CopyToExec = isCopyToExec(*I, ST);
if (CopyToExec == AMDGPU::NoRegister)
continue;
// Scan backwards to find the def.
auto CopyToExecInst = &*I;
- auto CopyFromExecInst = findExecCopy(*TII, MBB, I, CopyToExec);
+ auto CopyFromExecInst = findExecCopy(*TII, ST, MBB, I, CopyToExec);
if (CopyFromExecInst == E) {
auto PrepareExecInst = std::next(I);
if (PrepareExecInst == E)
@@ -246,7 +306,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
isLogicalOpOnExec(*PrepareExecInst) == CopyToExec) {
LLVM_DEBUG(dbgs() << "Fold exec copy: " << *PrepareExecInst);
- PrepareExecInst->getOperand(0).setReg(AMDGPU::EXEC);
+ PrepareExecInst->getOperand(0).setReg(Exec);
LLVM_DEBUG(dbgs() << "into: " << *PrepareExecInst << '\n');
@@ -269,7 +329,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
for (MachineBasicBlock::iterator J
= std::next(CopyFromExecInst->getIterator()), JE = I->getIterator();
J != JE; ++J) {
- if (SaveExecInst && J->readsRegister(AMDGPU::EXEC, TRI)) {
+ if (SaveExecInst && J->readsRegister(Exec, TRI)) {
LLVM_DEBUG(dbgs() << "exec read prevents saveexec: " << *J << '\n');
// Make sure this is inserted after any VALU ops that may have been
// scheduled in between.
@@ -353,7 +413,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
CopyToExecInst->eraseFromParent();
for (MachineInstr *OtherInst : OtherUseInsts) {
- OtherInst->substituteRegister(CopyToExec, AMDGPU::EXEC,
+ OtherInst->substituteRegister(CopyToExec, Exec,
AMDGPU::NoSubRegister, *TRI);
}
}
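
The hunks above thread the subtarget through so that on wave32 targets the pass checks EXEC_LO and the 32-bit scalar opcodes instead of EXEC and the 64-bit forms. A minimal standalone sketch of that selection pattern, with simplified stand-ins for GCNSubtarget and the generated register/opcode enums:

#include <iostream>

// Simplified stand-ins for the generated AMDGPU enums and GCNSubtarget.
enum Reg { EXEC, EXEC_LO, SGPR4 };
enum Opc { COPY, S_MOV_B64, S_MOV_B32, OTHER };
struct Subtarget { bool Wave32; bool isWave32() const { return Wave32; } };
struct Inst { Opc Opcode; Reg Dst; };

// Simplified mirror of the updated isCopyToExec: mov-like opcodes for both
// wave sizes are accepted, and the destination is compared against the
// wave-size-dependent exec register.
bool isCopyToExec(const Inst &MI, const Subtarget &ST) {
  Reg Exec = ST.isWave32() ? EXEC_LO : EXEC;
  bool MovLike =
      MI.Opcode == COPY || MI.Opcode == S_MOV_B64 || MI.Opcode == S_MOV_B32;
  return MovLike && MI.Dst == Exec;
}

int main() {
  Inst MI{S_MOV_B32, EXEC_LO};
  std::cout << isCopyToExec(MI, Subtarget{true})   // 1: wave32, EXEC_LO matches
            << isCopyToExec(MI, Subtarget{false})  // 0: wave64 expects EXEC
            << "\n";
}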
diff --git a/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp b/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
index c671fed34bdf..7e10316eab92 100644
--- a/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
+++ b/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
@@ -1,9 +1,8 @@
//===-- SIOptimizeExecMaskingPreRA.cpp ------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -34,10 +33,22 @@ using namespace llvm;
namespace {
class SIOptimizeExecMaskingPreRA : public MachineFunctionPass {
+private:
+ const SIRegisterInfo *TRI;
+ const SIInstrInfo *TII;
+ MachineRegisterInfo *MRI;
+
public:
- static char ID;
+ MachineBasicBlock::iterator skipIgnoreExecInsts(
+ MachineBasicBlock::iterator I, MachineBasicBlock::iterator E) const;
+
+ MachineBasicBlock::iterator skipIgnoreExecInstsTrivialSucc(
+ MachineBasicBlock *&MBB,
+ MachineBasicBlock::iterator It) const;
public:
+ static char ID;
+
SIOptimizeExecMaskingPreRA() : MachineFunctionPass(ID) {
initializeSIOptimizeExecMaskingPreRAPass(*PassRegistry::getPassRegistry());
}
@@ -71,38 +82,93 @@ FunctionPass *llvm::createSIOptimizeExecMaskingPreRAPass() {
return new SIOptimizeExecMaskingPreRA();
}
-static bool isEndCF(const MachineInstr& MI, const SIRegisterInfo* TRI) {
+static bool isEndCF(const MachineInstr &MI, const SIRegisterInfo *TRI,
+ const GCNSubtarget &ST) {
+ if (ST.isWave32()) {
+ return MI.getOpcode() == AMDGPU::S_OR_B32 &&
+ MI.modifiesRegister(AMDGPU::EXEC_LO, TRI);
+ }
+
return MI.getOpcode() == AMDGPU::S_OR_B64 &&
MI.modifiesRegister(AMDGPU::EXEC, TRI);
}
-static bool isFullExecCopy(const MachineInstr& MI) {
- return MI.isFullCopy() && MI.getOperand(1).getReg() == AMDGPU::EXEC;
+static bool isFullExecCopy(const MachineInstr& MI, const GCNSubtarget& ST) {
+ unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+
+ if (MI.isCopy() && MI.getOperand(1).getReg() == Exec) {
+ assert(MI.isFullCopy());
+ return true;
+ }
+
+ return false;
}
static unsigned getOrNonExecReg(const MachineInstr &MI,
- const SIInstrInfo &TII) {
+ const SIInstrInfo &TII,
+ const GCNSubtarget& ST) {
+ unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
auto Op = TII.getNamedOperand(MI, AMDGPU::OpName::src1);
- if (Op->isReg() && Op->getReg() != AMDGPU::EXEC)
+ if (Op->isReg() && Op->getReg() != Exec)
return Op->getReg();
Op = TII.getNamedOperand(MI, AMDGPU::OpName::src0);
- if (Op->isReg() && Op->getReg() != AMDGPU::EXEC)
+ if (Op->isReg() && Op->getReg() != Exec)
return Op->getReg();
return AMDGPU::NoRegister;
}
static MachineInstr* getOrExecSource(const MachineInstr &MI,
const SIInstrInfo &TII,
- const MachineRegisterInfo &MRI) {
- auto SavedExec = getOrNonExecReg(MI, TII);
+ const MachineRegisterInfo &MRI,
+ const GCNSubtarget& ST) {
+ auto SavedExec = getOrNonExecReg(MI, TII, ST);
if (SavedExec == AMDGPU::NoRegister)
return nullptr;
auto SaveExecInst = MRI.getUniqueVRegDef(SavedExec);
- if (!SaveExecInst || !isFullExecCopy(*SaveExecInst))
+ if (!SaveExecInst || !isFullExecCopy(*SaveExecInst, ST))
return nullptr;
return SaveExecInst;
}
+/// Skip over instructions that don't care about the exec mask.
+MachineBasicBlock::iterator SIOptimizeExecMaskingPreRA::skipIgnoreExecInsts(
+ MachineBasicBlock::iterator I, MachineBasicBlock::iterator E) const {
+ for ( ; I != E; ++I) {
+ if (TII->mayReadEXEC(*MRI, *I))
+ break;
+ }
+
+ return I;
+}
+
+// Skip to the next instruction, ignoring debug instructions, and trivial block
+// boundaries (blocks that have one (typically fallthrough) successor, and the
+// successor has one predecessor).
+MachineBasicBlock::iterator
+SIOptimizeExecMaskingPreRA::skipIgnoreExecInstsTrivialSucc(
+ MachineBasicBlock *&MBB,
+ MachineBasicBlock::iterator It) const {
+
+ do {
+ It = skipIgnoreExecInsts(It, MBB->end());
+ if (It != MBB->end() || MBB->succ_size() != 1)
+ break;
+
+ // If there is one trivial successor, advance to the next block.
+ MachineBasicBlock *Succ = *MBB->succ_begin();
+
+ // TODO: Is this really necessary?
+ if (!MBB->isLayoutSuccessor(Succ))
+ break;
+
+ It = Succ->begin();
+ MBB = Succ;
+ } while (true);
+
+ return It;
+}
+
+
// Optimize sequence
// %sel = V_CNDMASK_B32_e64 0, 1, %cc
// %cmp = V_CMP_NE_U32 1, %1
@@ -125,10 +191,11 @@ static unsigned optimizeVcndVcmpPair(MachineBasicBlock &MBB,
LiveIntervals *LIS) {
const SIRegisterInfo *TRI = ST.getRegisterInfo();
const SIInstrInfo *TII = ST.getInstrInfo();
- const unsigned AndOpc = AMDGPU::S_AND_B64;
- const unsigned Andn2Opc = AMDGPU::S_ANDN2_B64;
- const unsigned CondReg = AMDGPU::VCC;
- const unsigned ExecReg = AMDGPU::EXEC;
+ bool Wave32 = ST.isWave32();
+ const unsigned AndOpc = Wave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
+ const unsigned Andn2Opc = Wave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64;
+ const unsigned CondReg = Wave32 ? AMDGPU::VCC_LO : AMDGPU::VCC;
+ const unsigned ExecReg = Wave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
auto I = llvm::find_if(MBB.terminators(), [](const MachineInstr &MI) {
unsigned Opc = MI.getOpcode();
@@ -172,6 +239,10 @@ static unsigned optimizeVcndVcmpPair(MachineBasicBlock &MBB,
if (!Sel || Sel->getOpcode() != AMDGPU::V_CNDMASK_B32_e64)
return AMDGPU::NoRegister;
+ if (TII->hasModifiersSet(*Sel, AMDGPU::OpName::src0_modifiers) ||
+ TII->hasModifiersSet(*Sel, AMDGPU::OpName::src1_modifiers))
+ return AMDGPU::NoRegister;
+
Op1 = TII->getNamedOperand(*Sel, AMDGPU::OpName::src0);
Op2 = TII->getNamedOperand(*Sel, AMDGPU::OpName::src1);
MachineOperand *CC = TII->getNamedOperand(*Sel, AMDGPU::OpName::src2);
@@ -187,7 +258,7 @@ static unsigned optimizeVcndVcmpPair(MachineBasicBlock &MBB,
MachineInstr *Andn2 = BuildMI(MBB, *And, And->getDebugLoc(),
TII->get(Andn2Opc), And->getOperand(0).getReg())
.addReg(ExecReg)
- .addReg(CCReg, CC->getSubReg());
+ .addReg(CCReg, 0, CC->getSubReg());
And->eraseFromParent();
LIS->InsertMachineInstrInMaps(*Andn2);
@@ -224,11 +295,14 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
return false;
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- const SIRegisterInfo *TRI = ST.getRegisterInfo();
- const SIInstrInfo *TII = ST.getInstrInfo();
+ TRI = ST.getRegisterInfo();
+ TII = ST.getInstrInfo();
+ MRI = &MF.getRegInfo();
+
MachineRegisterInfo &MRI = MF.getRegInfo();
LiveIntervals *LIS = &getAnalysis<LiveIntervals>();
DenseSet<unsigned> RecalcRegs({AMDGPU::EXEC_LO, AMDGPU::EXEC_HI});
+ unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
bool Changed = false;
for (MachineBasicBlock &MBB : MF) {
@@ -248,9 +322,10 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
// Skip this if the endpgm has any implicit uses, otherwise we would need
// to be careful to update / remove them.
+ // S_ENDPGM always has a single imm operand that is not used other than to
+ // end up in the encoding.
MachineInstr &Term = MBB.back();
- if (Term.getOpcode() != AMDGPU::S_ENDPGM ||
- Term.getNumOperands() != 0)
+ if (Term.getOpcode() != AMDGPU::S_ENDPGM || Term.getNumOperands() != 1)
continue;
SmallVector<MachineBasicBlock*, 4> Blocks({&MBB});
@@ -304,32 +379,21 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
}
// Try to collapse adjacent endifs.
- auto Lead = MBB.begin(), E = MBB.end();
- if (MBB.succ_size() != 1 || Lead == E || !isEndCF(*Lead, TRI))
- continue;
-
- const MachineBasicBlock* Succ = *MBB.succ_begin();
- if (!MBB.isLayoutSuccessor(Succ))
- continue;
-
- auto I = std::next(Lead);
-
- for ( ; I != E; ++I)
- if (!TII->isSALU(*I) || I->readsRegister(AMDGPU::EXEC, TRI))
- break;
-
- if (I != E)
+ auto E = MBB.end();
+ auto Lead = skipDebugInstructionsForward(MBB.begin(), E);
+ if (MBB.succ_size() != 1 || Lead == E || !isEndCF(*Lead, TRI, ST))
continue;
- const auto NextLead = Succ->begin();
- if (NextLead == Succ->end() || !isEndCF(*NextLead, TRI) ||
- !getOrExecSource(*NextLead, *TII, MRI))
+ MachineBasicBlock *TmpMBB = &MBB;
+ auto NextLead = skipIgnoreExecInstsTrivialSucc(TmpMBB, std::next(Lead));
+ if (NextLead == TmpMBB->end() || !isEndCF(*NextLead, TRI, ST) ||
+ !getOrExecSource(*NextLead, *TII, MRI, ST))
continue;
LLVM_DEBUG(dbgs() << "Redundant EXEC = S_OR_B64 found: " << *Lead << '\n');
- auto SaveExec = getOrExecSource(*Lead, *TII, MRI);
- unsigned SaveExecReg = getOrNonExecReg(*Lead, *TII);
+ auto SaveExec = getOrExecSource(*Lead, *TII, MRI, ST);
+ unsigned SaveExecReg = getOrNonExecReg(*Lead, *TII, ST);
for (auto &Op : Lead->operands()) {
if (Op.isReg())
RecalcRegs.insert(Op.getReg());
@@ -363,7 +427,7 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
if (SafeToReplace) {
LIS->RemoveMachineInstrFromMaps(*SaveExec);
SaveExec->eraseFromParent();
- MRI.replaceRegWith(SavedExec, AMDGPU::EXEC);
+ MRI.replaceRegWith(SavedExec, Exec);
LIS->removeInterval(SavedExec);
}
}
@@ -375,8 +439,7 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
if (!MRI.reg_empty(Reg))
LIS->createAndComputeVirtRegInterval(Reg);
} else {
- for (MCRegUnitIterator U(Reg, TRI); U.isValid(); ++U)
- LIS->removeRegUnit(*U);
+ LIS->removeAllRegUnitsForPhysReg(Reg);
}
}
}
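
The new skipIgnoreExecInstsTrivialSucc walk above is what allows adjacent end-cf instructions to be collapsed even when they sit in different blocks separated only by a trivial fallthrough edge. A toy standalone sketch of that traversal shape (Block and the bool flags stand in for MachineBasicBlock and TII->mayReadEXEC; the layout-successor check is omitted):

#include <cstddef>
#include <iostream>
#include <utility>
#include <vector>

// Toy CFG node: each "instruction" is just a flag saying whether it reads
// exec, and FallthroughSucc models a single trivial successor.
struct Block {
  std::vector<bool> ReadsExec;
  Block *FallthroughSucc = nullptr;
};

// Advance past instructions that do not read exec, following a single
// fallthrough successor until an exec reader (a candidate second end-cf) or a
// real block boundary is reached.
std::pair<Block *, std::size_t> skipToExecReader(Block *MBB, std::size_t I) {
  while (true) {
    while (I != MBB->ReadsExec.size() && !MBB->ReadsExec[I])
      ++I;
    if (I != MBB->ReadsExec.size() || !MBB->FallthroughSucc)
      return {MBB, I};
    MBB = MBB->FallthroughSucc;
    I = 0;
  }
}

int main() {
  Block A, B;
  A.ReadsExec = {false, false}; // only exec-agnostic instructions
  A.FallthroughSucc = &B;
  B.ReadsExec = {true};         // e.g. the next end-cf S_OR
  auto Found = skipToExecReader(&A, 0);
  std::cout << (Found.first == &B && Found.second == 0) << "\n"; // prints 1
}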
diff --git a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index 2d43d5d05ef6..2d71abc0612a 100644
--- a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -1,9 +1,8 @@
//===- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions ---===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -348,8 +347,8 @@ uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII,
if (Abs || Neg) {
assert(!Sext &&
"Float and integer src modifiers can't be set simulteniously");
- Mods |= Abs ? SISrcMods::ABS : 0;
- Mods ^= Neg ? SISrcMods::NEG : 0;
+ Mods |= Abs ? SISrcMods::ABS : 0u;
+ Mods ^= Neg ? SISrcMods::NEG : 0u;
} else if (Sext) {
Mods |= SISrcMods::SEXT;
}
@@ -419,7 +418,9 @@ bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
}
assert(Src && Src->isReg());
- if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
+ if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
+ MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
+ MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
!isSameReg(*Src, *getReplacedOperand())) {
// In case of v_mac_f16/32_sdwa this pass can try to apply src operand to
@@ -461,7 +462,9 @@ MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII) {
bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
// Replace vdst operand in MI with target operand. Set dst_sel and dst_unused
- if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
+ if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
+ MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
+ MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
getDstSel() != AMDGPU::SDWA::DWORD) {
// v_mac_f16/32_sdwa allow dst_sel to be equal only to DWORD
@@ -951,7 +954,8 @@ bool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI,
if (TII->isVOPC(Opc)) {
if (!ST.hasSDWASdst()) {
const MachineOperand *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
- if (SDst && SDst->getReg() != AMDGPU::VCC)
+ if (SDst && (SDst->getReg() != AMDGPU::VCC &&
+ SDst->getReg() != AMDGPU::VCC_LO))
return false;
}
@@ -965,10 +969,16 @@ bool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI,
return false;
}
- if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_MAC_F16_e32 ||
+ if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_FMAC_F16_e32 ||
+ Opc == AMDGPU::V_FMAC_F32_e32 ||
+ Opc == AMDGPU::V_MAC_F16_e32 ||
Opc == AMDGPU::V_MAC_F32_e32))
return false;
+ // Check if target supports this SDWA opcode
+ if (TII->pseudoToMCOpcode(Opc) == -1)
+ return false;
+
// FIXME: has SDWA but require handling of implicit VCC use
if (Opc == AMDGPU::V_CNDMASK_B32_e32)
return false;
@@ -1010,7 +1020,7 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
SDWAInst.add(*Dst);
} else {
assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1);
- SDWAInst.addReg(AMDGPU::VCC, RegState::Define);
+ SDWAInst.addReg(TRI->getVCC(), RegState::Define);
}
// Copy src0, initialize src0_modifiers. All sdwa instructions has src0 and
@@ -1039,7 +1049,9 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
SDWAInst.add(*Src1);
}
- if (SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||
+ if (SDWAOpcode == AMDGPU::V_FMAC_F16_sdwa ||
+ SDWAOpcode == AMDGPU::V_FMAC_F32_sdwa ||
+ SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||
SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) {
// v_mac_f16/32 has additional src2 operand tied to vdst
MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
diff --git a/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp b/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
new file mode 100644
index 000000000000..f9bfe96f65cb
--- /dev/null
+++ b/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
@@ -0,0 +1,221 @@
+//===- SIPreAllocateWWMRegs.cpp - WWM Register Pre-allocation -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Pass to pre-allocate WWM registers
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIRegisterInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/LiveRegMatrix.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-pre-allocate-wwm-regs"
+
+namespace {
+
+class SIPreAllocateWWMRegs : public MachineFunctionPass {
+private:
+ const SIInstrInfo *TII;
+ const SIRegisterInfo *TRI;
+ MachineRegisterInfo *MRI;
+ LiveIntervals *LIS;
+ LiveRegMatrix *Matrix;
+ VirtRegMap *VRM;
+ RegisterClassInfo RegClassInfo;
+
+ std::vector<unsigned> RegsToRewrite;
+
+public:
+ static char ID;
+
+ SIPreAllocateWWMRegs() : MachineFunctionPass(ID) {
+ initializeSIPreAllocateWWMRegsPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<LiveIntervals>();
+ AU.addPreserved<LiveIntervals>();
+ AU.addRequired<VirtRegMap>();
+ AU.addRequired<LiveRegMatrix>();
+ AU.addPreserved<SlotIndexes>();
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+private:
+ bool processDef(MachineOperand &MO);
+ void rewriteRegs(MachineFunction &MF);
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS_BEGIN(SIPreAllocateWWMRegs, DEBUG_TYPE,
+ "SI Pre-allocate WWM Registers", false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
+INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix)
+INITIALIZE_PASS_END(SIPreAllocateWWMRegs, DEBUG_TYPE,
+ "SI Pre-allocate WWM Registers", false, false)
+
+char SIPreAllocateWWMRegs::ID = 0;
+
+char &llvm::SIPreAllocateWWMRegsID = SIPreAllocateWWMRegs::ID;
+
+FunctionPass *llvm::createSIPreAllocateWWMRegsPass() {
+ return new SIPreAllocateWWMRegs();
+}
+
+bool SIPreAllocateWWMRegs::processDef(MachineOperand &MO) {
+ if (!MO.isReg())
+ return false;
+
+ unsigned Reg = MO.getReg();
+
+ if (!TRI->isVGPR(*MRI, Reg))
+ return false;
+
+ if (TRI->isPhysicalRegister(Reg))
+ return false;
+
+ if (VRM->hasPhys(Reg))
+ return false;
+
+ LiveInterval &LI = LIS->getInterval(Reg);
+
+ for (unsigned PhysReg : RegClassInfo.getOrder(MRI->getRegClass(Reg))) {
+ if (!MRI->isPhysRegUsed(PhysReg) &&
+ Matrix->checkInterference(LI, PhysReg) == LiveRegMatrix::IK_Free) {
+ Matrix->assign(LI, PhysReg);
+ assert(PhysReg != 0);
+ RegsToRewrite.push_back(Reg);
+ return true;
+ }
+ }
+
+ llvm_unreachable("physreg not found for WWM expression");
+ return false;
+}
+
+void SIPreAllocateWWMRegs::rewriteRegs(MachineFunction &MF) {
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ for (MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg())
+ continue;
+
+ const unsigned VirtReg = MO.getReg();
+ if (TRI->isPhysicalRegister(VirtReg))
+ continue;
+
+ if (!VRM->hasPhys(VirtReg))
+ continue;
+
+ unsigned PhysReg = VRM->getPhys(VirtReg);
+ const unsigned SubReg = MO.getSubReg();
+ if (SubReg != 0) {
+ PhysReg = TRI->getSubReg(PhysReg, SubReg);
+ MO.setSubReg(0);
+ }
+
+ MO.setReg(PhysReg);
+ MO.setIsRenamable(false);
+ }
+ }
+ }
+
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+ for (unsigned Reg : RegsToRewrite) {
+ LIS->removeInterval(Reg);
+
+ const unsigned PhysReg = VRM->getPhys(Reg);
+ assert(PhysReg != 0);
+ MFI->ReserveWWMRegister(PhysReg);
+ }
+
+ RegsToRewrite.clear();
+
+ // Update the set of reserved registers to include WWM ones.
+ MRI->freezeReservedRegs(MF);
+}
+
+bool SIPreAllocateWWMRegs::runOnMachineFunction(MachineFunction &MF) {
+ LLVM_DEBUG(dbgs() << "SIPreAllocateWWMRegs: function " << MF.getName() << "\n");
+
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+
+ TII = ST.getInstrInfo();
+ TRI = &TII->getRegisterInfo();
+ MRI = &MF.getRegInfo();
+
+ LIS = &getAnalysis<LiveIntervals>();
+ Matrix = &getAnalysis<LiveRegMatrix>();
+ VRM = &getAnalysis<VirtRegMap>();
+
+ RegClassInfo.runOnMachineFunction(MF);
+
+ bool RegsAssigned = false;
+
+ // We use a reverse post-order traversal of the control-flow graph to
+ // guarantee that we visit definitions in dominance order. Since WWM
+ // expressions are guaranteed to never involve phi nodes, and we can only
+ // escape WWM through the special WWM instruction, this means that this is a
+ // perfect elimination order, so we can never do any better.
+ ReversePostOrderTraversal<MachineFunction*> RPOT(&MF);
+
+ for (MachineBasicBlock *MBB : RPOT) {
+ bool InWWM = false;
+ for (MachineInstr &MI : *MBB) {
+ if (MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
+ MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64)
+ RegsAssigned |= processDef(MI.getOperand(0));
+
+ if (MI.getOpcode() == AMDGPU::ENTER_WWM) {
+ LLVM_DEBUG(dbgs() << "entering WWM region: " << MI << "\n");
+ InWWM = true;
+ continue;
+ }
+
+ if (MI.getOpcode() == AMDGPU::EXIT_WWM) {
+ LLVM_DEBUG(dbgs() << "exiting WWM region: " << MI << "\n");
+ InWWM = false;
+ }
+
+ if (!InWWM)
+ continue;
+
+ LLVM_DEBUG(dbgs() << "processing " << MI << "\n");
+
+ for (MachineOperand &DefOpnd : MI.defs()) {
+ RegsAssigned |= processDef(DefOpnd);
+ }
+ }
+ }
+
+ if (!RegsAssigned)
+ return false;
+
+ rewriteRegs(MF);
+ return true;
+}
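
The comment block above explains why a reverse post-order walk suffices; the scan itself is then a small state machine over the instruction stream. A toy standalone sketch of that scan (Opc and Inst stand in for the real opcodes; the real pass calls processDef, which assigns a free physical register through LiveRegMatrix, whereas this sketch only records the candidate definitions):

#include <cstddef>
#include <iostream>
#include <vector>

// Stand-ins for the relevant AMDGPU opcodes.
enum Opc { ENTER_WWM, EXIT_WWM, V_SET_INACTIVE, DEF, OTHER };
struct Inst { Opc Opcode; };

// Defs of V_SET_INACTIVE are always candidates for pre-allocation; ordinary
// defs are candidates only while between ENTER_WWM and EXIT_WWM markers.
std::vector<std::size_t> collectWWMDefs(const std::vector<Inst> &MIs) {
  std::vector<std::size_t> Defs;
  bool InWWM = false;
  for (std::size_t I = 0; I < MIs.size(); ++I) {
    switch (MIs[I].Opcode) {
    case V_SET_INACTIVE: Defs.push_back(I); break;
    case ENTER_WWM:      InWWM = true;      break;
    case EXIT_WWM:       InWWM = false;     break;
    case DEF:            if (InWWM) Defs.push_back(I); break;
    case OTHER:          break;
    }
  }
  return Defs;
}

int main() {
  std::vector<Inst> MIs = {{V_SET_INACTIVE}, {ENTER_WWM}, {DEF}, {EXIT_WWM}, {DEF}};
  for (std::size_t I : collectWWMDefs(MIs))
    std::cout << I << ' ';                 // prints 0 2
  std::cout << "\n";
}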
diff --git a/lib/Target/AMDGPU/SIProgramInfo.h b/lib/Target/AMDGPU/SIProgramInfo.h
index 383f6b575808..168f05f8fdd6 100644
--- a/lib/Target/AMDGPU/SIProgramInfo.h
+++ b/lib/Target/AMDGPU/SIProgramInfo.h
@@ -1,9 +1,8 @@
//===--- SIProgramInfo.h ----------------------------------------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -29,6 +28,8 @@ struct SIProgramInfo {
uint32_t DX10Clamp = 0;
uint32_t DebugMode = 0;
uint32_t IEEEMode = 0;
+ uint32_t WgpMode = 0; // GFX10+
+ uint32_t MemOrdered = 0; // GFX10+
uint64_t ScratchSize = 0;
uint64_t ComputePGMRSrc1 = 0;
@@ -50,18 +51,6 @@ struct SIProgramInfo {
// Number of VGPRs that meets number of waves per execution unit request.
uint32_t NumVGPRsForWavesPerEU = 0;
- // Fixed SGPR number used to hold wave scratch offset for entire kernel
- // execution, or std::numeric_limits<uint16_t>::max() if the register is not
- // used or not known.
- uint16_t DebuggerWavefrontPrivateSegmentOffsetSGPR =
- std::numeric_limits<uint16_t>::max();
-
- // Fixed SGPR number of the first 4 SGPRs used to hold scratch V# for entire
- // kernel execution, or std::numeric_limits<uint16_t>::max() if the register
- // is not used or not known.
- uint16_t DebuggerPrivateSegmentBufferSGPR =
- std::numeric_limits<uint16_t>::max();
-
// Whether there is recursion, dynamic allocas, indirect calls or some other
// reason there may be statically unknown stack usage.
bool DynamicCallStack = false;
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 97cfde2b2354..f152deb28004 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -1,9 +1,8 @@
//===-- SIRegisterInfo.cpp - SI Register Information ---------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -17,6 +16,7 @@
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
+#include "MCTargetDesc/AMDGPUInstPrinter.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineDominators.h"
@@ -63,8 +63,10 @@ SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) :
AMDGPURegisterInfo(),
SGPRPressureSets(getNumRegPressureSets()),
VGPRPressureSets(getNumRegPressureSets()),
+ AGPRPressureSets(getNumRegPressureSets()),
SpillSGPRToVGPR(false),
- SpillSGPRToSMEM(false) {
+ SpillSGPRToSMEM(false),
+ isWave32(ST.isWave32()) {
if (EnableSpillSGPRToSMEM && ST.hasScalarStores())
SpillSGPRToSMEM = true;
else if (EnableSpillSGPRToVGPR)
@@ -74,10 +76,12 @@ SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) :
SGPRSetID = NumRegPressureSets;
VGPRSetID = NumRegPressureSets;
+ AGPRSetID = NumRegPressureSets;
for (unsigned i = 0; i < NumRegPressureSets; ++i) {
classifyPressureSet(i, AMDGPU::SGPR0, SGPRPressureSets);
classifyPressureSet(i, AMDGPU::VGPR0, VGPRPressureSets);
+ classifyPressureSet(i, AMDGPU::AGPR0, AGPRPressureSets);
}
// Determine the number of reg units for each pressure set.
@@ -89,7 +93,7 @@ SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) :
}
}
- unsigned VGPRMax = 0, SGPRMax = 0;
+ unsigned VGPRMax = 0, SGPRMax = 0, AGPRMax = 0;
for (unsigned i = 0; i < NumRegPressureSets; ++i) {
if (isVGPRPressureSet(i) && PressureSetRegUnits[i] > VGPRMax) {
VGPRSetID = i;
@@ -100,10 +104,16 @@ SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) :
SGPRSetID = i;
SGPRMax = PressureSetRegUnits[i];
}
+ if (isAGPRPressureSet(i) && PressureSetRegUnits[i] > AGPRMax) {
+ AGPRSetID = i;
+ AGPRMax = PressureSetRegUnits[i];
+ continue;
+ }
}
assert(SGPRSetID < NumRegPressureSets &&
- VGPRSetID < NumRegPressureSets);
+ VGPRSetID < NumRegPressureSets &&
+ AGPRSetID < NumRegPressureSets);
}
unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg(
@@ -139,11 +149,6 @@ unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
return AMDGPU::SGPR_32RegClass.getRegister(Reg);
}
-unsigned SIRegisterInfo::reservedStackPtrOffsetReg(
- const MachineFunction &MF) const {
- return AMDGPU::SGPR32;
-}
-
BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
BitVector Reserved(getNumRegs());
@@ -155,15 +160,26 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
// M0 has to be reserved so that llvm accepts it as a live-in into a block.
reserveRegisterTuples(Reserved, AMDGPU::M0);
+ // Reserve src_vccz, src_execz, src_scc.
+ reserveRegisterTuples(Reserved, AMDGPU::SRC_VCCZ);
+ reserveRegisterTuples(Reserved, AMDGPU::SRC_EXECZ);
+ reserveRegisterTuples(Reserved, AMDGPU::SRC_SCC);
+
// Reserve the memory aperture registers.
reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE);
reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT);
reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE);
reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT);
+ // Reserve src_pops_exiting_wave_id - support is not implemented in Codegen.
+ reserveRegisterTuples(Reserved, AMDGPU::SRC_POPS_EXITING_WAVE_ID);
+
// Reserve xnack_mask registers - support is not implemented in Codegen.
reserveRegisterTuples(Reserved, AMDGPU::XNACK_MASK);
+ // Reserve lds_direct register - support is not implemented in Codegen.
+ reserveRegisterTuples(Reserved, AMDGPU::LDS_DIRECT);
+
// Reserve Trap Handler registers - support is not implemented in Codegen.
reserveRegisterTuples(Reserved, AMDGPU::TBA);
reserveRegisterTuples(Reserved, AMDGPU::TMA);
@@ -176,6 +192,16 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
reserveRegisterTuples(Reserved, AMDGPU::TTMP12_TTMP13);
reserveRegisterTuples(Reserved, AMDGPU::TTMP14_TTMP15);
+ // Reserve null register - it shall never be allocated
+ reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL);
+
+ // Disallow vcc_hi allocation in wave32. It may be allocated but most likely
+ // will result in bugs.
+ if (isWave32) {
+ Reserved.set(AMDGPU::VCC);
+ Reserved.set(AMDGPU::VCC_HI);
+ }
+
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
@@ -190,6 +216,8 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) {
unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i);
reserveRegisterTuples(Reserved, Reg);
+ Reg = AMDGPU::AGPR_32RegClass.getRegister(i);
+ reserveRegisterTuples(Reserved, Reg);
}
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
@@ -225,9 +253,33 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
assert(!isSubRegister(ScratchRSrcReg, FrameReg));
}
+ for (unsigned Reg : MFI->WWMReservedRegs) {
+ reserveRegisterTuples(Reserved, Reg);
+ }
+
+ // FIXME: Stop using reserved registers for this.
+ for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs())
+ reserveRegisterTuples(Reserved, Reg);
+
+ for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs())
+ reserveRegisterTuples(Reserved, Reg);
+
return Reserved;
}
+bool SIRegisterInfo::canRealignStack(const MachineFunction &MF) const {
+ const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ // On entry, the base address is 0, so it can't possibly need any more
+ // alignment.
+
+ // FIXME: Should be able to specify the entry frame alignment per calling
+ // convention instead.
+ if (Info->isEntryFunction())
+ return false;
+
+ return TargetRegisterInfo::canRealignStack(MF);
+}
+
bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
const SIMachineFunctionInfo *Info = Fn.getInfo<SIMachineFunctionInfo>();
if (Info->isEntryFunction()) {
@@ -252,11 +304,20 @@ bool SIRegisterInfo::requiresFrameIndexScavenging(
bool SIRegisterInfo::requiresFrameIndexReplacementScavenging(
const MachineFunction &MF) const {
- // m0 is needed for the scalar store offset. m0 is unallocatable, so we can't
- // create a virtual register for it during frame index elimination, so the
- // scavenger is directly needed.
- return MF.getFrameInfo().hasStackObjects() &&
- MF.getSubtarget<GCNSubtarget>().hasScalarStores() &&
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ if (!MFI.hasStackObjects())
+ return false;
+
+ // The scavenger is used for large frames which may require finding a free
+ // register for large offsets.
+ if (!isUInt<12>(MFI.getStackSize()))
+ return true;
+
+ // If using scalar stores, for spills, m0 is needed for the scalar store
+ // offset (pre-GFX9). m0 is unallocatable, so we can't create a virtual
+ // register for it during frame index elimination, so the scavenger is
+ // directly needed.
+ return MF.getSubtarget<GCNSubtarget>().hasScalarStores() &&
MF.getInfo<SIMachineFunctionInfo>()->hasSpilledSGPRs();
}
@@ -332,7 +393,8 @@ void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
TII->getAddNoCarry(*MBB, Ins, DL, BaseReg)
.addReg(OffsetReg, RegState::Kill)
- .addReg(FIReg);
+ .addReg(FIReg)
+ .addImm(0); // clamp bit
}
void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
@@ -394,21 +456,39 @@ const TargetRegisterClass *SIRegisterInfo::getPointerRegClass(
static unsigned getNumSubRegsForSpillOp(unsigned Op) {
switch (Op) {
+ case AMDGPU::SI_SPILL_S1024_SAVE:
+ case AMDGPU::SI_SPILL_S1024_RESTORE:
+ case AMDGPU::SI_SPILL_V1024_SAVE:
+ case AMDGPU::SI_SPILL_V1024_RESTORE:
+ case AMDGPU::SI_SPILL_A1024_SAVE:
+ case AMDGPU::SI_SPILL_A1024_RESTORE:
+ return 32;
case AMDGPU::SI_SPILL_S512_SAVE:
case AMDGPU::SI_SPILL_S512_RESTORE:
case AMDGPU::SI_SPILL_V512_SAVE:
case AMDGPU::SI_SPILL_V512_RESTORE:
+ case AMDGPU::SI_SPILL_A512_SAVE:
+ case AMDGPU::SI_SPILL_A512_RESTORE:
return 16;
case AMDGPU::SI_SPILL_S256_SAVE:
case AMDGPU::SI_SPILL_S256_RESTORE:
case AMDGPU::SI_SPILL_V256_SAVE:
case AMDGPU::SI_SPILL_V256_RESTORE:
return 8;
+ case AMDGPU::SI_SPILL_S160_SAVE:
+ case AMDGPU::SI_SPILL_S160_RESTORE:
+ case AMDGPU::SI_SPILL_V160_SAVE:
+ case AMDGPU::SI_SPILL_V160_RESTORE:
+ return 5;
case AMDGPU::SI_SPILL_S128_SAVE:
case AMDGPU::SI_SPILL_S128_RESTORE:
case AMDGPU::SI_SPILL_V128_SAVE:
case AMDGPU::SI_SPILL_V128_RESTORE:
+ case AMDGPU::SI_SPILL_A128_SAVE:
+ case AMDGPU::SI_SPILL_A128_RESTORE:
return 4;
+ case AMDGPU::SI_SPILL_S96_SAVE:
+ case AMDGPU::SI_SPILL_S96_RESTORE:
case AMDGPU::SI_SPILL_V96_SAVE:
case AMDGPU::SI_SPILL_V96_RESTORE:
return 3;
@@ -416,11 +496,15 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) {
case AMDGPU::SI_SPILL_S64_RESTORE:
case AMDGPU::SI_SPILL_V64_SAVE:
case AMDGPU::SI_SPILL_V64_RESTORE:
+ case AMDGPU::SI_SPILL_A64_SAVE:
+ case AMDGPU::SI_SPILL_A64_RESTORE:
return 2;
case AMDGPU::SI_SPILL_S32_SAVE:
case AMDGPU::SI_SPILL_S32_RESTORE:
case AMDGPU::SI_SPILL_V32_SAVE:
case AMDGPU::SI_SPILL_V32_RESTORE:
+ case AMDGPU::SI_SPILL_A32_SAVE:
+ case AMDGPU::SI_SPILL_A32_RESTORE:
return 1;
default: llvm_unreachable("Invalid spill opcode");
}
@@ -480,6 +564,35 @@ static int getOffsetMUBUFLoad(unsigned Opc) {
}
}
+static MachineInstrBuilder spillVGPRtoAGPR(MachineBasicBlock::iterator MI,
+ int Index,
+ unsigned Lane,
+ unsigned ValueReg,
+ bool IsKill) {
+ MachineBasicBlock *MBB = MI->getParent();
+ MachineFunction *MF = MI->getParent()->getParent();
+ SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+
+ MCPhysReg Reg = MFI->getVGPRToAGPRSpill(Index, Lane);
+
+ if (Reg == AMDGPU::NoRegister)
+ return MachineInstrBuilder();
+
+ bool IsStore = MI->mayStore();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ auto *TRI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
+
+ unsigned Dst = IsStore ? Reg : ValueReg;
+ unsigned Src = IsStore ? ValueReg : Reg;
+ unsigned Opc = (IsStore ^ TRI->isVGPR(MRI, Reg)) ? AMDGPU::V_ACCVGPR_WRITE_B32
+ : AMDGPU::V_ACCVGPR_READ_B32;
+
+ return BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(Opc), Dst)
+ .addReg(Src, getKillRegState(IsKill));
+}
+
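
The spillVGPRtoAGPR helper above redirects a spill slot lane to an accumulator register when one has been set aside, replacing the scratch memory access with a v_accvgpr_write_b32/v_accvgpr_read_b32 copy. A toy standalone sketch of that lookup-and-redirect shape (the map and the returned strings are illustrative stand-ins, not the pass's data structures):

#include <iostream>
#include <map>
#include <optional>
#include <string>
#include <utility>

// Toy model of the (frame index, lane) -> AGPR mapping kept by the machine
// function info.
using SpillMap = std::map<std::pair<int, unsigned>, std::string>;

// If an accumulator register was reserved for this slot/lane, describe the
// register-to-register accvgpr copy; otherwise return nothing so the caller
// falls back to the MUBUF load/store.
std::optional<std::string> spillToAGPR(const SpillMap &M, int Index,
                                       unsigned Lane, const std::string &VGPR,
                                       bool IsStore) {
  auto It = M.find({Index, Lane});
  if (It == M.end())
    return std::nullopt;
  return IsStore ? "v_accvgpr_write_b32 " + It->second + ", " + VGPR
                 : "v_accvgpr_read_b32 " + VGPR + ", " + It->second;
}

int main() {
  SpillMap M = {{{0, 0u}, "a0"}};
  std::cout << spillToAGPR(M, 0, 0, "v5", true).value_or("<mubuf store>") << "\n";
  std::cout << spillToAGPR(M, 1, 0, "v5", false).value_or("<mubuf load>") << "\n";
}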
// This differs from buildSpillLoadStore by only scavenging a VGPR. It does not
// need to handle the case where an SGPR may need to be spilled while spilling.
static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII,
@@ -498,6 +611,9 @@ static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII,
return false;
const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata);
+ if (spillVGPRtoAGPR(MI, Index, 0, Reg->getReg(), false).getInstr())
+ return true;
+
MachineInstrBuilder NewMI =
BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
.add(*Reg)
@@ -507,6 +623,7 @@ static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII,
.addImm(0) // glc
.addImm(0) // slc
.addImm(0) // tfe
+ .addImm(0) // dlc
.cloneMemRefs(*MI);
const MachineOperand *VDataIn = TII->getNamedOperand(*MI,
@@ -549,6 +666,10 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
unsigned Align = MFI.getObjectAlignment(Index);
const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();
+ Register TmpReg =
+ hasAGPRs(RC) ? TII->getNamedOperand(*MI, AMDGPU::OpName::tmp)->getReg()
+ : Register();
+
assert((Offset % EltSize) == 0 && "unexpected VGPR spill offset");
if (!isUInt<12>(Offset + Size - EltSize)) {
@@ -562,7 +683,7 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
// We don't have access to the register scavenger if this function is called
// during PEI::scavengeFrameVirtualRegs().
if (RS)
- SOffset = RS->FindUnusedReg(&AMDGPU::SGPR_32RegClass);
+ SOffset = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0, false);
if (SOffset == AMDGPU::NoRegister) {
// There are no free SGPRs, and since we are in the process of spilling
@@ -597,20 +718,38 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
SrcDstRegState |= getKillRegState(IsKill);
}
- MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(EltSize * i);
- MachineMemOperand *NewMMO
- = MF->getMachineMemOperand(PInfo, MMO->getFlags(),
- EltSize, MinAlign(Align, EltSize * i));
-
- auto MIB = BuildMI(*MBB, MI, DL, Desc)
- .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill))
- .addReg(ScratchRsrcReg)
- .addReg(SOffset, SOffsetRegState)
- .addImm(Offset)
- .addImm(0) // glc
- .addImm(0) // slc
- .addImm(0) // tfe
- .addMemOperand(NewMMO);
+ auto MIB = spillVGPRtoAGPR(MI, Index, i, SubReg, IsKill);
+
+ if (!MIB.getInstr()) {
+ unsigned FinalReg = SubReg;
+ if (TmpReg != AMDGPU::NoRegister) {
+ if (IsStore)
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_READ_B32), TmpReg)
+ .addReg(SubReg, getKillRegState(IsKill));
+ SubReg = TmpReg;
+ }
+
+ MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(EltSize * i);
+ MachineMemOperand *NewMMO
+ = MF->getMachineMemOperand(PInfo, MMO->getFlags(),
+ EltSize, MinAlign(Align, EltSize * i));
+
+ MIB = BuildMI(*MBB, MI, DL, Desc)
+ .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill))
+ .addReg(ScratchRsrcReg)
+ .addReg(SOffset, SOffsetRegState)
+ .addImm(Offset)
+ .addImm(0) // glc
+ .addImm(0) // slc
+ .addImm(0) // tfe
+ .addImm(0) // dlc
+ .addMemOperand(NewMMO);
+
+ if (!IsStore && TmpReg != AMDGPU::NoRegister)
+ MIB = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32),
+ FinalReg)
+ .addReg(TmpReg, RegState::Kill);
+ }
if (NumSubRegs > 1)
MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState);
@@ -669,6 +808,8 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
if (SpillToSMEM && OnlyToVGPR)
return false;
+ Register FrameReg = getFrameRegister(*MF);
+
assert(SpillToVGPR || (SuperReg != MFI->getStackPtrOffsetReg() &&
SuperReg != MFI->getFrameOffsetReg() &&
SuperReg != MFI->getScratchWaveOffsetReg()));
@@ -728,11 +869,11 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i);
if (Offset != 0) {
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg)
- .addReg(MFI->getFrameOffsetReg())
+ .addReg(FrameReg)
.addImm(Offset);
} else {
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
- .addReg(MFI->getFrameOffsetReg());
+ .addReg(FrameReg);
}
BuildMI(*MBB, MI, DL, TII->get(ScalarStoreOp))
@@ -740,6 +881,7 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
.addReg(MFI->getScratchRSrcReg()) // sbase
.addReg(OffsetReg, RegState::Kill) // soff
.addImm(0) // glc
+ .addImm(0) // dlc
.addMemOperand(MMO);
continue;
@@ -799,11 +941,11 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
= MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
EltSize, MinAlign(Align, EltSize * i));
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_SAVE))
- .addReg(TmpReg, RegState::Kill) // src
- .addFrameIndex(Index) // vaddr
- .addReg(MFI->getScratchRSrcReg()) // srrsrc
- .addReg(MFI->getFrameOffsetReg()) // soffset
- .addImm(i * 4) // offset
+ .addReg(TmpReg, RegState::Kill) // src
+ .addFrameIndex(Index) // vaddr
+ .addReg(MFI->getScratchRSrcReg()) // srrsrc
+ .addReg(MFI->getStackPtrOffsetReg()) // soffset
+ .addImm(i * 4) // offset
.addMemOperand(MMO);
}
}
@@ -859,6 +1001,8 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
unsigned EltSize = 4;
unsigned ScalarLoadOp;
+ Register FrameReg = getFrameRegister(*MF);
+
const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
if (SpillToSMEM && isSGPRClass(RC)) {
// XXX - if private_element_size is larger than 4 it might be useful to be
@@ -890,18 +1034,19 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i);
if (Offset != 0) {
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg)
- .addReg(MFI->getFrameOffsetReg())
+ .addReg(FrameReg)
.addImm(Offset);
} else {
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
- .addReg(MFI->getFrameOffsetReg());
+ .addReg(FrameReg);
}
auto MIB =
BuildMI(*MBB, MI, DL, TII->get(ScalarLoadOp), SubReg)
- .addReg(MFI->getScratchRSrcReg()) // sbase
- .addReg(OffsetReg, RegState::Kill) // soff
- .addImm(0) // glc
+ .addReg(MFI->getScratchRSrcReg()) // sbase
+ .addReg(OffsetReg, RegState::Kill) // soff
+ .addImm(0) // glc
+ .addImm(0) // dlc
.addMemOperand(MMO);
if (NumSubRegs > 1 && i == 0)
@@ -937,10 +1082,10 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
MinAlign(Align, EltSize * i));
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpReg)
- .addFrameIndex(Index) // vaddr
- .addReg(MFI->getScratchRSrcReg()) // srsrc
- .addReg(MFI->getFrameOffsetReg()) // soffset
- .addImm(i * 4) // offset
+ .addFrameIndex(Index) // vaddr
+ .addReg(MFI->getScratchRSrcReg()) // srsrc
+ .addReg(MFI->getStackPtrOffsetReg()) // soffset
+ .addImm(i * 4) // offset
.addMemOperand(MMO);
auto MIB =
@@ -969,15 +1114,21 @@ bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex(
int FI,
RegScavenger *RS) const {
switch (MI->getOpcode()) {
+ case AMDGPU::SI_SPILL_S1024_SAVE:
case AMDGPU::SI_SPILL_S512_SAVE:
case AMDGPU::SI_SPILL_S256_SAVE:
+ case AMDGPU::SI_SPILL_S160_SAVE:
case AMDGPU::SI_SPILL_S128_SAVE:
+ case AMDGPU::SI_SPILL_S96_SAVE:
case AMDGPU::SI_SPILL_S64_SAVE:
case AMDGPU::SI_SPILL_S32_SAVE:
return spillSGPR(MI, FI, RS, true);
+ case AMDGPU::SI_SPILL_S1024_RESTORE:
case AMDGPU::SI_SPILL_S512_RESTORE:
case AMDGPU::SI_SPILL_S256_RESTORE:
+ case AMDGPU::SI_SPILL_S160_RESTORE:
case AMDGPU::SI_SPILL_S128_RESTORE:
+ case AMDGPU::SI_SPILL_S96_RESTORE:
case AMDGPU::SI_SPILL_S64_RESTORE:
case AMDGPU::SI_SPILL_S32_RESTORE:
return restoreSGPR(MI, FI, RS, true);
@@ -998,14 +1149,21 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
const SIInstrInfo *TII = ST.getInstrInfo();
DebugLoc DL = MI->getDebugLoc();
+ assert(SPAdj == 0 && "unhandled SP adjustment in call sequence?");
+
MachineOperand &FIOp = MI->getOperand(FIOperandNum);
int Index = MI->getOperand(FIOperandNum).getIndex();
+ Register FrameReg = getFrameRegister(*MF);
+
switch (MI->getOpcode()) {
// SGPR register spill
+ case AMDGPU::SI_SPILL_S1024_SAVE:
case AMDGPU::SI_SPILL_S512_SAVE:
case AMDGPU::SI_SPILL_S256_SAVE:
+ case AMDGPU::SI_SPILL_S160_SAVE:
case AMDGPU::SI_SPILL_S128_SAVE:
+ case AMDGPU::SI_SPILL_S96_SAVE:
case AMDGPU::SI_SPILL_S64_SAVE:
case AMDGPU::SI_SPILL_S32_SAVE: {
spillSGPR(MI, Index, RS);
@@ -1013,9 +1171,12 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
}
// SGPR register restore
+ case AMDGPU::SI_SPILL_S1024_RESTORE:
case AMDGPU::SI_SPILL_S512_RESTORE:
case AMDGPU::SI_SPILL_S256_RESTORE:
+ case AMDGPU::SI_SPILL_S160_RESTORE:
case AMDGPU::SI_SPILL_S128_RESTORE:
+ case AMDGPU::SI_SPILL_S96_RESTORE:
case AMDGPU::SI_SPILL_S64_RESTORE:
case AMDGPU::SI_SPILL_S32_RESTORE: {
restoreSGPR(MI, Index, RS);
@@ -1023,19 +1184,29 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
}
// VGPR register spill
+ case AMDGPU::SI_SPILL_V1024_SAVE:
case AMDGPU::SI_SPILL_V512_SAVE:
case AMDGPU::SI_SPILL_V256_SAVE:
+ case AMDGPU::SI_SPILL_V160_SAVE:
case AMDGPU::SI_SPILL_V128_SAVE:
case AMDGPU::SI_SPILL_V96_SAVE:
case AMDGPU::SI_SPILL_V64_SAVE:
- case AMDGPU::SI_SPILL_V32_SAVE: {
+ case AMDGPU::SI_SPILL_V32_SAVE:
+ case AMDGPU::SI_SPILL_A1024_SAVE:
+ case AMDGPU::SI_SPILL_A512_SAVE:
+ case AMDGPU::SI_SPILL_A128_SAVE:
+ case AMDGPU::SI_SPILL_A64_SAVE:
+ case AMDGPU::SI_SPILL_A32_SAVE: {
const MachineOperand *VData = TII->getNamedOperand(*MI,
AMDGPU::OpName::vdata);
+ assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
+ MFI->getStackPtrOffsetReg());
+
buildSpillLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET,
Index,
VData->getReg(), VData->isKill(),
TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(),
- TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg(),
+ FrameReg,
TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
*MI->memoperands_begin(),
RS);
@@ -1047,16 +1218,25 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
case AMDGPU::SI_SPILL_V64_RESTORE:
case AMDGPU::SI_SPILL_V96_RESTORE:
case AMDGPU::SI_SPILL_V128_RESTORE:
+ case AMDGPU::SI_SPILL_V160_RESTORE:
case AMDGPU::SI_SPILL_V256_RESTORE:
- case AMDGPU::SI_SPILL_V512_RESTORE: {
+ case AMDGPU::SI_SPILL_V512_RESTORE:
+ case AMDGPU::SI_SPILL_V1024_RESTORE:
+ case AMDGPU::SI_SPILL_A32_RESTORE:
+ case AMDGPU::SI_SPILL_A64_RESTORE:
+ case AMDGPU::SI_SPILL_A128_RESTORE:
+ case AMDGPU::SI_SPILL_A512_RESTORE:
+ case AMDGPU::SI_SPILL_A1024_RESTORE: {
const MachineOperand *VData = TII->getNamedOperand(*MI,
AMDGPU::OpName::vdata);
+ assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
+ MFI->getStackPtrOffsetReg());
buildSpillLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
Index,
VData->getReg(), VData->isKill(),
TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(),
- TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg(),
+ FrameReg,
TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
*MI->memoperands_begin(),
RS);
@@ -1068,24 +1248,23 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
const DebugLoc &DL = MI->getDebugLoc();
bool IsMUBUF = TII->isMUBUF(*MI);
- if (!IsMUBUF &&
- MFI->getFrameOffsetReg() != MFI->getScratchWaveOffsetReg()) {
+ if (!IsMUBUF && !MFI->isEntryFunction()) {
// Convert to an absolute stack address by finding the offset from the
// scratch wave base and scaling by the wave size.
//
- // In an entry function/kernel the stack address is already the
- // absolute address relative to the scratch wave offset.
+ // In an entry function/kernel the offset is already the absolute
+ // address relative to the frame register.
unsigned DiffReg
= MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32;
- unsigned ResultReg = IsCopy ?
+ Register ResultReg = IsCopy ?
MI->getOperand(0).getReg() :
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), DiffReg)
- .addReg(MFI->getFrameOffsetReg())
+ .addReg(FrameReg)
.addReg(MFI->getScratchWaveOffsetReg());
int64_t Offset = FrameInfo.getObjectOffset(Index);
@@ -1106,7 +1285,8 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
if (AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
TII->getAddNoCarry(*MBB, MI, DL, ResultReg)
.addImm(Offset)
- .addReg(ScaledReg, RegState::Kill);
+ .addReg(ScaledReg, RegState::Kill)
+ .addImm(0); // clamp bit
} else {
unsigned ConstOffsetReg
= MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
@@ -1115,7 +1295,8 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
.addImm(Offset);
TII->getAddNoCarry(*MBB, MI, DL, ResultReg)
.addReg(ConstOffsetReg, RegState::Kill)
- .addReg(ScaledReg, RegState::Kill);
+ .addReg(ScaledReg, RegState::Kill)
+ .addImm(0); // clamp bit
}
}
@@ -1133,8 +1314,10 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
AMDGPU::getNamedOperandIdx(MI->getOpcode(),
AMDGPU::OpName::vaddr));
- assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg()
- == MFI->getFrameOffsetReg());
+ assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
+ MFI->getStackPtrOffsetReg());
+
+ TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->setReg(FrameReg);
int64_t Offset = FrameInfo.getObjectOffset(Index);
int64_t OldImm
@@ -1164,63 +1347,21 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
}
StringRef SIRegisterInfo::getRegAsmName(unsigned Reg) const {
- #define AMDGPU_REG_ASM_NAMES
- #include "AMDGPURegAsmNames.inc.cpp"
-
- #define REG_RANGE(BeginReg, EndReg, RegTable) \
- if (Reg >= BeginReg && Reg <= EndReg) { \
- unsigned Index = Reg - BeginReg; \
- assert(Index < array_lengthof(RegTable)); \
- return RegTable[Index]; \
- }
+ const TargetRegisterClass *RC = getMinimalPhysRegClass(Reg);
+ unsigned Size = getRegSizeInBits(*RC);
+ unsigned AltName = AMDGPU::NoRegAltName;
- REG_RANGE(AMDGPU::VGPR0, AMDGPU::VGPR255, VGPR32RegNames);
- REG_RANGE(AMDGPU::SGPR0, AMDGPU::SGPR103, SGPR32RegNames);
- REG_RANGE(AMDGPU::VGPR0_VGPR1, AMDGPU::VGPR254_VGPR255, VGPR64RegNames);
- REG_RANGE(AMDGPU::SGPR0_SGPR1, AMDGPU::SGPR102_SGPR103, SGPR64RegNames);
- REG_RANGE(AMDGPU::VGPR0_VGPR1_VGPR2, AMDGPU::VGPR253_VGPR254_VGPR255,
- VGPR96RegNames);
-
- REG_RANGE(AMDGPU::VGPR0_VGPR1_VGPR2_VGPR3,
- AMDGPU::VGPR252_VGPR253_VGPR254_VGPR255,
- VGPR128RegNames);
- REG_RANGE(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
- AMDGPU::SGPR100_SGPR101_SGPR102_SGPR103,
- SGPR128RegNames);
-
- REG_RANGE(AMDGPU::VGPR0_VGPR1_VGPR2_VGPR3_VGPR4_VGPR5_VGPR6_VGPR7,
- AMDGPU::VGPR248_VGPR249_VGPR250_VGPR251_VGPR252_VGPR253_VGPR254_VGPR255,
- VGPR256RegNames);
-
- REG_RANGE(
- AMDGPU::VGPR0_VGPR1_VGPR2_VGPR3_VGPR4_VGPR5_VGPR6_VGPR7_VGPR8_VGPR9_VGPR10_VGPR11_VGPR12_VGPR13_VGPR14_VGPR15,
- AMDGPU::VGPR240_VGPR241_VGPR242_VGPR243_VGPR244_VGPR245_VGPR246_VGPR247_VGPR248_VGPR249_VGPR250_VGPR251_VGPR252_VGPR253_VGPR254_VGPR255,
- VGPR512RegNames);
-
- REG_RANGE(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7,
- AMDGPU::SGPR96_SGPR97_SGPR98_SGPR99_SGPR100_SGPR101_SGPR102_SGPR103,
- SGPR256RegNames);
-
- REG_RANGE(
- AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7_SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15,
- AMDGPU::SGPR88_SGPR89_SGPR90_SGPR91_SGPR92_SGPR93_SGPR94_SGPR95_SGPR96_SGPR97_SGPR98_SGPR99_SGPR100_SGPR101_SGPR102_SGPR103,
- SGPR512RegNames
- );
-
-#undef REG_RANGE
-
- // FIXME: Rename flat_scr so we don't need to special case this.
- switch (Reg) {
- case AMDGPU::FLAT_SCR:
- return "flat_scratch";
- case AMDGPU::FLAT_SCR_LO:
- return "flat_scratch_lo";
- case AMDGPU::FLAT_SCR_HI:
- return "flat_scratch_hi";
- default:
- // For the special named registers the default is fine.
- return TargetRegisterInfo::getRegAsmName(Reg);
+ switch (Size) {
+ case 32: AltName = AMDGPU::Reg32; break;
+ case 64: AltName = AMDGPU::Reg64; break;
+ case 96: AltName = AMDGPU::Reg96; break;
+ case 128: AltName = AMDGPU::Reg128; break;
+ case 160: AltName = AMDGPU::Reg160; break;
+ case 256: AltName = AMDGPU::Reg256; break;
+ case 512: AltName = AMDGPU::Reg512; break;
+ case 1024: AltName = AMDGPU::Reg1024; break;
}
+ return AMDGPUInstPrinter::getRegisterName(Reg, AltName);
}
// FIXME: This is very slow. It might be worth creating a map from physreg to
@@ -1231,15 +1372,25 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
static const TargetRegisterClass *const BaseClasses[] = {
&AMDGPU::VGPR_32RegClass,
&AMDGPU::SReg_32RegClass,
+ &AMDGPU::AGPR_32RegClass,
&AMDGPU::VReg_64RegClass,
&AMDGPU::SReg_64RegClass,
+ &AMDGPU::AReg_64RegClass,
&AMDGPU::VReg_96RegClass,
+ &AMDGPU::SReg_96RegClass,
&AMDGPU::VReg_128RegClass,
&AMDGPU::SReg_128RegClass,
+ &AMDGPU::AReg_128RegClass,
+ &AMDGPU::VReg_160RegClass,
+ &AMDGPU::SReg_160RegClass,
&AMDGPU::VReg_256RegClass,
&AMDGPU::SReg_256RegClass,
&AMDGPU::VReg_512RegClass,
&AMDGPU::SReg_512RegClass,
+ &AMDGPU::AReg_512RegClass,
+ &AMDGPU::SReg_1024RegClass,
+ &AMDGPU::VReg_1024RegClass,
+ &AMDGPU::AReg_1024RegClass,
&AMDGPU::SCC_CLASSRegClass,
&AMDGPU::Pseudo_SReg_32RegClass,
&AMDGPU::Pseudo_SReg_128RegClass,
@@ -1268,10 +1419,39 @@ bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const {
return getCommonSubClass(&AMDGPU::VReg_96RegClass, RC) != nullptr;
case 128:
return getCommonSubClass(&AMDGPU::VReg_128RegClass, RC) != nullptr;
+ case 160:
+ return getCommonSubClass(&AMDGPU::VReg_160RegClass, RC) != nullptr;
case 256:
return getCommonSubClass(&AMDGPU::VReg_256RegClass, RC) != nullptr;
case 512:
return getCommonSubClass(&AMDGPU::VReg_512RegClass, RC) != nullptr;
+ case 1024:
+ return getCommonSubClass(&AMDGPU::VReg_1024RegClass, RC) != nullptr;
+ default:
+ llvm_unreachable("Invalid register class size");
+ }
+}
+
+bool SIRegisterInfo::hasAGPRs(const TargetRegisterClass *RC) const {
+ unsigned Size = getRegSizeInBits(*RC);
+ if (Size < 32)
+ return false;
+ switch (Size) {
+ case 32:
+ return getCommonSubClass(&AMDGPU::AGPR_32RegClass, RC) != nullptr;
+ case 64:
+ return getCommonSubClass(&AMDGPU::AReg_64RegClass, RC) != nullptr;
+ case 96:
+ return false;
+ case 128:
+ return getCommonSubClass(&AMDGPU::AReg_128RegClass, RC) != nullptr;
+ case 160:
+ case 256:
+ return false;
+ case 512:
+ return getCommonSubClass(&AMDGPU::AReg_512RegClass, RC) != nullptr;
+ case 1024:
+ return getCommonSubClass(&AMDGPU::AReg_1024RegClass, RC) != nullptr;
default:
llvm_unreachable("Invalid register class size");
}
@@ -1288,10 +1468,32 @@ const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass(
return &AMDGPU::VReg_96RegClass;
case 128:
return &AMDGPU::VReg_128RegClass;
+ case 160:
+ return &AMDGPU::VReg_160RegClass;
case 256:
return &AMDGPU::VReg_256RegClass;
case 512:
return &AMDGPU::VReg_512RegClass;
+ case 1024:
+ return &AMDGPU::VReg_1024RegClass;
+ default:
+ llvm_unreachable("Invalid register class size");
+ }
+}
+
+const TargetRegisterClass *SIRegisterInfo::getEquivalentAGPRClass(
+ const TargetRegisterClass *SRC) const {
+ switch (getRegSizeInBits(*SRC)) {
+ case 32:
+ return &AMDGPU::AGPR_32RegClass;
+ case 64:
+ return &AMDGPU::AReg_64RegClass;
+ case 128:
+ return &AMDGPU::AReg_128RegClass;
+ case 512:
+ return &AMDGPU::AReg_512RegClass;
+ case 1024:
+ return &AMDGPU::AReg_1024RegClass;
default:
llvm_unreachable("Invalid register class size");
}
@@ -1304,12 +1506,18 @@ const TargetRegisterClass *SIRegisterInfo::getEquivalentSGPRClass(
return &AMDGPU::SGPR_32RegClass;
case 64:
return &AMDGPU::SReg_64RegClass;
+ case 96:
+ return &AMDGPU::SReg_96RegClass;
case 128:
return &AMDGPU::SReg_128RegClass;
+ case 160:
+ return &AMDGPU::SReg_160RegClass;
case 256:
return &AMDGPU::SReg_256RegClass;
case 512:
return &AMDGPU::SReg_512RegClass;
+ case 1024:
+ return &AMDGPU::SReg_1024RegClass;
default:
llvm_unreachable("Invalid register class size");
}
@@ -1328,11 +1536,31 @@ const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
return &AMDGPU::SGPR_32RegClass;
case 2:
return &AMDGPU::SReg_64RegClass;
+ case 3:
+ return &AMDGPU::SReg_96RegClass;
case 4:
return &AMDGPU::SReg_128RegClass;
+ case 5:
+ return &AMDGPU::SReg_160RegClass;
case 8:
return &AMDGPU::SReg_256RegClass;
- case 16: /* fall-through */
+ case 16:
+ return &AMDGPU::SReg_512RegClass;
+ case 32: /* fall-through */
+ default:
+ llvm_unreachable("Invalid sub-register class size");
+ }
+ } else if (hasAGPRs(RC)) {
+ switch (Count) {
+ case 1:
+ return &AMDGPU::AGPR_32RegClass;
+ case 2:
+ return &AMDGPU::AReg_64RegClass;
+ case 4:
+ return &AMDGPU::AReg_128RegClass;
+ case 16:
+ return &AMDGPU::AReg_512RegClass;
+ case 32: /* fall-through */
default:
llvm_unreachable("Invalid sub-register class size");
}
@@ -1346,9 +1574,13 @@ const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
return &AMDGPU::VReg_96RegClass;
case 4:
return &AMDGPU::VReg_128RegClass;
+ case 5:
+ return &AMDGPU::VReg_160RegClass;
case 8:
return &AMDGPU::VReg_256RegClass;
- case 16: /* fall-through */
+ case 16:
+ return &AMDGPU::VReg_512RegClass;
+ case 32: /* fall-through */
default:
llvm_unreachable("Invalid sub-register class size");
}
@@ -1396,6 +1628,17 @@ SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI,
ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC,
unsigned EltSize) const {
if (EltSize == 4) {
+ static const int16_t Sub0_31[] = {
+ AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
+ AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
+ AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
+ AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
+ AMDGPU::sub16, AMDGPU::sub17, AMDGPU::sub18, AMDGPU::sub19,
+ AMDGPU::sub20, AMDGPU::sub21, AMDGPU::sub22, AMDGPU::sub23,
+ AMDGPU::sub24, AMDGPU::sub25, AMDGPU::sub26, AMDGPU::sub27,
+ AMDGPU::sub28, AMDGPU::sub29, AMDGPU::sub30, AMDGPU::sub31,
+ };
+
static const int16_t Sub0_15[] = {
AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
@@ -1408,6 +1651,10 @@ ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC
AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
};
+ static const int16_t Sub0_4[] = {
+ AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4,
+ };
+
static const int16_t Sub0_3[] = {
AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
};
@@ -1429,16 +1676,31 @@ ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC
return makeArrayRef(Sub0_2);
case 128:
return makeArrayRef(Sub0_3);
+ case 160:
+ return makeArrayRef(Sub0_4);
case 256:
return makeArrayRef(Sub0_7);
case 512:
return makeArrayRef(Sub0_15);
+ case 1024:
+ return makeArrayRef(Sub0_31);
default:
llvm_unreachable("unhandled register size");
}
}
if (EltSize == 8) {
+ static const int16_t Sub0_31_64[] = {
+ AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
+ AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
+ AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
+ AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
+ AMDGPU::sub16_sub17, AMDGPU::sub18_sub19,
+ AMDGPU::sub20_sub21, AMDGPU::sub22_sub23,
+ AMDGPU::sub24_sub25, AMDGPU::sub26_sub27,
+ AMDGPU::sub28_sub29, AMDGPU::sub30_sub31
+ };
+
static const int16_t Sub0_15_64[] = {
AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
@@ -1465,32 +1727,73 @@ ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC
return makeArrayRef(Sub0_7_64);
case 512:
return makeArrayRef(Sub0_15_64);
+ case 1024:
+ return makeArrayRef(Sub0_31_64);
default:
llvm_unreachable("unhandled register size");
}
}
- assert(EltSize == 16 && "unhandled register spill split size");
+ if (EltSize == 16) {
+
+ static const int16_t Sub0_31_128[] = {
+ AMDGPU::sub0_sub1_sub2_sub3,
+ AMDGPU::sub4_sub5_sub6_sub7,
+ AMDGPU::sub8_sub9_sub10_sub11,
+ AMDGPU::sub12_sub13_sub14_sub15,
+ AMDGPU::sub16_sub17_sub18_sub19,
+ AMDGPU::sub20_sub21_sub22_sub23,
+ AMDGPU::sub24_sub25_sub26_sub27,
+ AMDGPU::sub28_sub29_sub30_sub31
+ };
+
+ static const int16_t Sub0_15_128[] = {
+ AMDGPU::sub0_sub1_sub2_sub3,
+ AMDGPU::sub4_sub5_sub6_sub7,
+ AMDGPU::sub8_sub9_sub10_sub11,
+ AMDGPU::sub12_sub13_sub14_sub15
+ };
+
+ static const int16_t Sub0_7_128[] = {
+ AMDGPU::sub0_sub1_sub2_sub3,
+ AMDGPU::sub4_sub5_sub6_sub7
+ };
- static const int16_t Sub0_15_128[] = {
- AMDGPU::sub0_sub1_sub2_sub3,
- AMDGPU::sub4_sub5_sub6_sub7,
- AMDGPU::sub8_sub9_sub10_sub11,
- AMDGPU::sub12_sub13_sub14_sub15
+ switch (AMDGPU::getRegBitWidth(*RC->MC)) {
+ case 128:
+ return {};
+ case 256:
+ return makeArrayRef(Sub0_7_128);
+ case 512:
+ return makeArrayRef(Sub0_15_128);
+ case 1024:
+ return makeArrayRef(Sub0_31_128);
+ default:
+ llvm_unreachable("unhandled register size");
+ }
+ }
+
+ assert(EltSize == 32 && "unhandled elt size");
+
+ static const int16_t Sub0_31_256[] = {
+ AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7,
+ AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15,
+ AMDGPU::sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23,
+ AMDGPU::sub24_sub25_sub26_sub27_sub28_sub29_sub30_sub31
};
- static const int16_t Sub0_7_128[] = {
- AMDGPU::sub0_sub1_sub2_sub3,
- AMDGPU::sub4_sub5_sub6_sub7
+ static const int16_t Sub0_15_256[] = {
+ AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7,
+ AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15
};
switch (AMDGPU::getRegBitWidth(*RC->MC)) {
- case 128:
- return {};
case 256:
- return makeArrayRef(Sub0_7_128);
+ return {};
case 512:
- return makeArrayRef(Sub0_15_128);
+ return makeArrayRef(Sub0_15_256);
+ case 1024:
+ return makeArrayRef(Sub0_31_256);
default:
llvm_unreachable("unhandled register size");
}
@@ -1512,6 +1815,13 @@ bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI,
return hasVGPRs(RC);
}
+bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI,
+ unsigned Reg) const {
+ const TargetRegisterClass * RC = getRegClassForReg(MRI, Reg);
+ assert(RC && "Register class for the reg not found");
+ return hasAGPRs(RC);
+}
+
bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
const TargetRegisterClass *SrcRC,
unsigned SubReg,
@@ -1553,7 +1863,7 @@ unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
unsigned Idx) const {
- if (Idx == getVGPRPressureSet())
+ if (Idx == getVGPRPressureSet() || Idx == getAGPRPressureSet())
return getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
const_cast<MachineFunction &>(MF));
@@ -1578,28 +1888,80 @@ unsigned SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const {
}
const TargetRegisterClass *
-SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO,
+SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size,
+ const RegisterBank &RB,
const MachineRegisterInfo &MRI) const {
- unsigned Size = getRegSizeInBits(MO.getReg(), MRI);
- const RegisterBank *RB = MRI.getRegBankOrNull(MO.getReg());
- if (!RB)
- return nullptr;
-
switch (Size) {
+ case 1: {
+ switch (RB.getID()) {
+ case AMDGPU::VGPRRegBankID:
+ return &AMDGPU::VGPR_32RegClass;
+ case AMDGPU::VCCRegBankID:
+ return isWave32 ?
+ &AMDGPU::SReg_32_XM0_XEXECRegClass : &AMDGPU::SReg_64_XEXECRegClass;
+ case AMDGPU::SGPRRegBankID:
+ return &AMDGPU::SReg_32_XM0RegClass;
+ case AMDGPU::SCCRegBankID:
+ // This needs to return an allocatable class, so don't bother returning
+ // the dummy SCC class.
+ return &AMDGPU::SReg_32_XM0RegClass;
+ default:
+ llvm_unreachable("unknown register bank");
+ }
+ }
case 32:
- return RB->getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VGPR_32RegClass :
- &AMDGPU::SReg_32_XM0RegClass;
+ return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VGPR_32RegClass :
+ &AMDGPU::SReg_32_XM0RegClass;
case 64:
- return RB->getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_64RegClass :
- &AMDGPU::SReg_64_XEXECRegClass;
+ return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_64RegClass :
+ &AMDGPU::SReg_64_XEXECRegClass;
case 96:
- return RB->getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_96RegClass :
- nullptr;
+ return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_96RegClass :
+ &AMDGPU::SReg_96RegClass;
case 128:
- return RB->getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_128RegClass :
- &AMDGPU::SReg_128RegClass;
+ return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_128RegClass :
+ &AMDGPU::SReg_128RegClass;
+ case 160:
+ return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_160RegClass :
+ &AMDGPU::SReg_160RegClass;
+ case 256:
+ return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_256RegClass :
+ &AMDGPU::SReg_256RegClass;
+ case 512:
+ return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_512RegClass :
+ &AMDGPU::SReg_512RegClass;
+ default:
+ if (Size < 32)
+ return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VGPR_32RegClass :
+ &AMDGPU::SReg_32_XM0RegClass;
+ return nullptr;
+ }
+}
+
+const TargetRegisterClass *
+SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO,
+ const MachineRegisterInfo &MRI) const {
+ if (const RegisterBank *RB = MRI.getRegBankOrNull(MO.getReg()))
+ return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB, MRI);
+ return nullptr;
+}
+
+unsigned SIRegisterInfo::getVCC() const {
+ return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC;
+}
+
+const TargetRegisterClass *
+SIRegisterInfo::getRegClass(unsigned RCID) const {
+ switch ((int)RCID) {
+ case AMDGPU::SReg_1RegClassID:
+ return getBoolRC();
+ case AMDGPU::SReg_1_XEXECRegClassID:
+ return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass
+ : &AMDGPU::SReg_64_XEXECRegClass;
+ case -1:
+ return nullptr;
default:
- llvm_unreachable("not implemented");
+ return AMDGPURegisterInfo::getRegClass(RCID);
}
}
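
The frame-index and spill changes above rely on getRegSplitParts now covering the new 160-bit and 1024-bit tuples. A minimal caller sketch, assuming a physical SuperReg and the usual TRI context from SIRegisterInfo.cpp; this is illustrative only and not part of the diff:

// Split a 1024-bit VGPR tuple into 32-bit lanes, one store per lane.
ArrayRef<int16_t> Parts =
    TRI->getRegSplitParts(&AMDGPU::VReg_1024RegClass, /*EltSize=*/4);
assert(Parts.size() == 32 && "expected sub0..sub31");
for (int16_t SubIdx : Parts) {
  Register SubReg = TRI->getSubReg(SuperReg, SubIdx);
  // ...emit one BUFFER_STORE_DWORD_OFFSET per 32-bit lane here...
}
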
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.h b/lib/Target/AMDGPU/SIRegisterInfo.h
index b82fefde47e1..34487c96e72e 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -1,9 +1,8 @@
//===-- SIRegisterInfo.h - SI Register Info Interface ----------*- C++ -*--===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -30,10 +29,13 @@ class SIRegisterInfo final : public AMDGPURegisterInfo {
private:
unsigned SGPRSetID;
unsigned VGPRSetID;
+ unsigned AGPRSetID;
BitVector SGPRPressureSets;
BitVector VGPRPressureSets;
+ BitVector AGPRPressureSets;
bool SpillSGPRToVGPR;
bool SpillSGPRToSMEM;
+ bool isWave32;
void classifyPressureSet(unsigned PSetID, unsigned Reg,
BitVector &PressureSets) const;
@@ -57,8 +59,6 @@ public:
unsigned reservedPrivateSegmentWaveByteOffsetReg(
const MachineFunction &MF) const;
- unsigned reservedStackPtrOffsetReg(const MachineFunction &MF) const;
-
BitVector getReservedRegs(const MachineFunction &MF) const override;
const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
@@ -72,8 +72,9 @@ public:
return 100;
}
- unsigned getFrameRegister(const MachineFunction &MF) const override;
+ Register getFrameRegister(const MachineFunction &MF) const override;
+ bool canRealignStack(const MachineFunction &MF) const override;
bool requiresRegisterScavenging(const MachineFunction &Fn) const override;
bool requiresFrameIndexScavenging(const MachineFunction &MF) const override;
@@ -130,7 +131,7 @@ public:
/// \returns true if this class contains only SGPR registers
bool isSGPRClass(const TargetRegisterClass *RC) const {
- return !hasVGPRs(RC);
+ return !hasVGPRs(RC) && !hasAGPRs(RC);
}
/// \returns true if this class ID contains only SGPR registers
@@ -150,10 +151,22 @@ public:
/// \returns true if this class contains VGPR registers.
bool hasVGPRs(const TargetRegisterClass *RC) const;
+ /// \returns true if this class contains AGPR registers.
+ bool hasAGPRs(const TargetRegisterClass *RC) const;
+
+ /// \returns true if this class contains any vector registers.
+ bool hasVectorRegisters(const TargetRegisterClass *RC) const {
+ return hasVGPRs(RC) || hasAGPRs(RC);
+ }
+
/// \returns A VGPR reg class with the same width as \p SRC
const TargetRegisterClass *getEquivalentVGPRClass(
const TargetRegisterClass *SRC) const;
+ /// \returns An AGPR reg class with the same width as \p SRC
+ const TargetRegisterClass *getEquivalentAGPRClass(
+ const TargetRegisterClass *SRC) const;
+
/// \returns A SGPR reg class with the same width as \p SRC
const TargetRegisterClass *getEquivalentSGPRClass(
const TargetRegisterClass *VRC) const;
@@ -191,16 +204,32 @@ public:
unsigned getSGPRPressureSet() const { return SGPRSetID; };
unsigned getVGPRPressureSet() const { return VGPRSetID; };
+ unsigned getAGPRPressureSet() const { return AGPRSetID; };
const TargetRegisterClass *getRegClassForReg(const MachineRegisterInfo &MRI,
unsigned Reg) const;
bool isVGPR(const MachineRegisterInfo &MRI, unsigned Reg) const;
+ bool isAGPR(const MachineRegisterInfo &MRI, unsigned Reg) const;
+ bool isVectorRegister(const MachineRegisterInfo &MRI, unsigned Reg) const {
+ return isVGPR(MRI, Reg) || isAGPR(MRI, Reg);
+ }
+
+ virtual bool
+ isDivergentRegClass(const TargetRegisterClass *RC) const override {
+ return !isSGPRClass(RC);
+ }
bool isSGPRPressureSet(unsigned SetID) const {
- return SGPRPressureSets.test(SetID) && !VGPRPressureSets.test(SetID);
+ return SGPRPressureSets.test(SetID) && !VGPRPressureSets.test(SetID) &&
+ !AGPRPressureSets.test(SetID);
}
bool isVGPRPressureSet(unsigned SetID) const {
- return VGPRPressureSets.test(SetID) && !SGPRPressureSets.test(SetID);
+ return VGPRPressureSets.test(SetID) && !SGPRPressureSets.test(SetID) &&
+ !AGPRPressureSets.test(SetID);
+ }
+ bool isAGPRPressureSet(unsigned SetID) const {
+ return AGPRPressureSets.test(SetID) && !SGPRPressureSets.test(SetID) &&
+ !VGPRPressureSets.test(SetID);
}
ArrayRef<int16_t> getRegSplitParts(const TargetRegisterClass *RC,
@@ -225,15 +254,44 @@ public:
unsigned getReturnAddressReg(const MachineFunction &MF) const;
const TargetRegisterClass *
+ getRegClassForSizeOnBank(unsigned Size,
+ const RegisterBank &Bank,
+ const MachineRegisterInfo &MRI) const;
+
+ const TargetRegisterClass *
+ getRegClassForTypeOnBank(LLT Ty,
+ const RegisterBank &Bank,
+ const MachineRegisterInfo &MRI) const {
+ return getRegClassForSizeOnBank(Ty.getSizeInBits(), Bank, MRI);
+ }
+
+ const TargetRegisterClass *
getConstrainedRegClassForOperand(const MachineOperand &MO,
const MachineRegisterInfo &MRI) const override;
+ const TargetRegisterClass *getBoolRC() const {
+ return isWave32 ? &AMDGPU::SReg_32_XM0RegClass
+ : &AMDGPU::SReg_64RegClass;
+ }
+
+ const TargetRegisterClass *getWaveMaskRegClass() const {
+ return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass
+ : &AMDGPU::SReg_64_XEXECRegClass;
+ }
+
+ unsigned getVCC() const;
+
+ const TargetRegisterClass *getRegClass(unsigned RCID) const;
+
// Find reaching register definition
MachineInstr *findReachingDef(unsigned Reg, unsigned SubReg,
MachineInstr &Use,
MachineRegisterInfo &MRI,
LiveIntervals *LIS) const;
+ const uint32_t *getAllVGPRRegMask() const;
+ const uint32_t *getAllAllocatableSRegMask() const;
+
private:
void buildSpillLoadStore(MachineBasicBlock::iterator MI,
unsigned LoadStoreOp,
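
The header now records the wave size and the AGPR pressure set, so lane-mask and accumulator queries go through the helpers declared above. A hedged usage sketch, assuming TRI, MRI and SomeReg are in scope as in the rest of the backend; illustrative only, not part of the diff:

const TargetRegisterClass *MaskRC = TRI->getWaveMaskRegClass();
Register LaneMask = MRI.createVirtualRegister(MaskRC); // 32-bit mask in wave32, 64-bit otherwise
Register VCCReg = TRI->getVCC();                       // VCC_LO in wave32, VCC otherwise
bool IsVec = TRI->isVectorRegister(MRI, SomeReg);      // true for VGPRs and the new AGPRs
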
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.td b/lib/Target/AMDGPU/SIRegisterInfo.td
index c625ecc9b750..d5948a7862cc 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -1,9 +1,8 @@
//===-- SIRegisterInfo.td - SI Register defs ---------------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -15,43 +14,86 @@ class getSubRegs<int size> {
list<SubRegIndex> ret2 = [sub0, sub1];
list<SubRegIndex> ret3 = [sub0, sub1, sub2];
list<SubRegIndex> ret4 = [sub0, sub1, sub2, sub3];
+ list<SubRegIndex> ret5 = [sub0, sub1, sub2, sub3, sub4];
list<SubRegIndex> ret8 = [sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7];
list<SubRegIndex> ret16 = [sub0, sub1, sub2, sub3,
sub4, sub5, sub6, sub7,
sub8, sub9, sub10, sub11,
sub12, sub13, sub14, sub15];
+ list<SubRegIndex> ret32 = [sub0, sub1, sub2, sub3,
+ sub4, sub5, sub6, sub7,
+ sub8, sub9, sub10, sub11,
+ sub12, sub13, sub14, sub15,
+ sub16, sub17, sub18, sub19,
+ sub20, sub21, sub22, sub23,
+ sub24, sub25, sub26, sub27,
+ sub28, sub29, sub30, sub31];
list<SubRegIndex> ret = !if(!eq(size, 2), ret2,
!if(!eq(size, 3), ret3,
!if(!eq(size, 4), ret4,
- !if(!eq(size, 8), ret8, ret16))));
+ !if(!eq(size, 5), ret5,
+ !if(!eq(size, 8), ret8,
+ !if(!eq(size, 16), ret16, ret32))))));
+}
+
+let Namespace = "AMDGPU" in {
+defset list<RegAltNameIndex> AllRegAltNameIndices = {
+ def Reg32 : RegAltNameIndex;
+ def Reg64 : RegAltNameIndex;
+ def Reg96 : RegAltNameIndex;
+ def Reg128 : RegAltNameIndex;
+ def Reg160 : RegAltNameIndex;
+ def Reg256 : RegAltNameIndex;
+ def Reg512 : RegAltNameIndex;
+ def Reg1024 : RegAltNameIndex;
+}
}
//===----------------------------------------------------------------------===//
// Declarations that describe the SI registers
//===----------------------------------------------------------------------===//
-class SIReg <string n, bits<16> regIdx = 0> : Register<n>,
+class SIReg <string n, bits<16> regIdx = 0, string prefix = "",
+ int regNo = !cast<int>(regIdx)> :
+ Register<n, !if(!eq(prefix, ""),
+ [ n, n, n, n, n, n, n, n ],
+ [ prefix # regNo,
+ prefix # "[" # regNo # ":" # !and(!add(regNo, 1), 255) # "]",
+ prefix # "[" # regNo # ":" # !and(!add(regNo, 2), 255) # "]",
+ prefix # "[" # regNo # ":" # !and(!add(regNo, 3), 255) # "]",
+ prefix # "[" # regNo # ":" # !and(!add(regNo, 4), 255) # "]",
+ prefix # "[" # regNo # ":" # !and(!add(regNo, 7), 255) # "]",
+ prefix # "[" # regNo # ":" # !and(!add(regNo, 15), 255) # "]",
+ prefix # "[" # regNo # ":" # !and(!add(regNo, 31), 255) # "]",
+ ])>,
DwarfRegNum<[!cast<int>(HWEncoding)]> {
let Namespace = "AMDGPU";
+ let RegAltNameIndices = AllRegAltNameIndices;
// This is not yet the complete register encoding. An additional
// bit is set for VGPRs.
let HWEncoding = regIdx;
}
+class SIRegisterWithSubRegs<string n, list<Register> subregs> :
+ RegisterWithSubRegs<n, subregs> {
+ let RegAltNameIndices = AllRegAltNameIndices;
+ let AltNames = [ n, n, n, n, n, n, n, n ];
+}
+
// Special Registers
def VCC_LO : SIReg<"vcc_lo", 106>;
def VCC_HI : SIReg<"vcc_hi", 107>;
// Pseudo-registers: Used as placeholders during isel and immediately
// replaced, never seeing the verifier.
-def PRIVATE_RSRC_REG : SIReg<"", 0>;
-def FP_REG : SIReg<"", 0>;
-def SP_REG : SIReg<"", 0>;
-def SCRATCH_WAVE_OFFSET_REG : SIReg<"", 0>;
+def PRIVATE_RSRC_REG : SIReg<"private_rsrc", 0>;
+def FP_REG : SIReg<"fp", 0>;
+def SP_REG : SIReg<"sp", 0>;
+def SCRATCH_WAVE_OFFSET_REG : SIReg<"scratch_wave_offset", 0>;
// VCC for 64-bit instructions
-def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]>,
+def VCC : SIRegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]>,
DwarfRegAlias<VCC_LO> {
let Namespace = "AMDGPU";
let SubRegIndices = [sub0, sub1];
@@ -61,25 +103,38 @@ def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]>,
def EXEC_LO : SIReg<"exec_lo", 126>;
def EXEC_HI : SIReg<"exec_hi", 127>;
-def EXEC : RegisterWithSubRegs<"EXEC", [EXEC_LO, EXEC_HI]>,
+def EXEC : SIRegisterWithSubRegs<"exec", [EXEC_LO, EXEC_HI]>,
DwarfRegAlias<EXEC_LO> {
let Namespace = "AMDGPU";
let SubRegIndices = [sub0, sub1];
let HWEncoding = 126;
}
-def SCC : SIReg<"scc", 253>;
+// 32-bit real registers, for MC only.
+// May be used with both 32-bit and 64-bit operands.
+def SRC_VCCZ : SIReg<"src_vccz", 251>;
+def SRC_EXECZ : SIReg<"src_execz", 252>;
+def SRC_SCC : SIReg<"src_scc", 253>;
+
+// 1-bit pseudo register, for codegen only.
+// Should never be emitted.
+def SCC : SIReg<"scc">;
+
def M0 : SIReg <"m0", 124>;
+def SGPR_NULL : SIReg<"null", 125>;
def SRC_SHARED_BASE : SIReg<"src_shared_base", 235>;
def SRC_SHARED_LIMIT : SIReg<"src_shared_limit", 236>;
def SRC_PRIVATE_BASE : SIReg<"src_private_base", 237>;
def SRC_PRIVATE_LIMIT : SIReg<"src_private_limit", 238>;
+def SRC_POPS_EXITING_WAVE_ID : SIReg<"src_pops_exiting_wave_id", 239>;
+
+def LDS_DIRECT : SIReg <"src_lds_direct", 254>;
def XNACK_MASK_LO : SIReg<"xnack_mask_lo", 104>;
def XNACK_MASK_HI : SIReg<"xnack_mask_hi", 105>;
-def XNACK_MASK : RegisterWithSubRegs<"xnack_mask", [XNACK_MASK_LO, XNACK_MASK_HI]>,
+def XNACK_MASK : SIRegisterWithSubRegs<"xnack_mask", [XNACK_MASK_LO, XNACK_MASK_HI]>,
DwarfRegAlias<XNACK_MASK_LO> {
let Namespace = "AMDGPU";
let SubRegIndices = [sub0, sub1];
@@ -90,7 +145,7 @@ def XNACK_MASK : RegisterWithSubRegs<"xnack_mask", [XNACK_MASK_LO, XNACK_MASK_HI
def TBA_LO : SIReg<"tba_lo", 108>;
def TBA_HI : SIReg<"tba_hi", 109>;
-def TBA : RegisterWithSubRegs<"tba", [TBA_LO, TBA_HI]>,
+def TBA : SIRegisterWithSubRegs<"tba", [TBA_LO, TBA_HI]>,
DwarfRegAlias<TBA_LO> {
let Namespace = "AMDGPU";
let SubRegIndices = [sub0, sub1];
@@ -100,7 +155,7 @@ def TBA : RegisterWithSubRegs<"tba", [TBA_LO, TBA_HI]>,
def TMA_LO : SIReg<"tma_lo", 110>;
def TMA_HI : SIReg<"tma_hi", 111>;
-def TMA : RegisterWithSubRegs<"tma", [TMA_LO, TMA_HI]>,
+def TMA : SIRegisterWithSubRegs<"tma", [TMA_LO, TMA_HI]>,
DwarfRegAlias<TMA_LO> {
let Namespace = "AMDGPU";
let SubRegIndices = [sub0, sub1];
@@ -108,19 +163,19 @@ def TMA : RegisterWithSubRegs<"tma", [TMA_LO, TMA_HI]>,
}
foreach Index = 0-15 in {
- def TTMP#Index#_vi : SIReg<"ttmp"#Index, !add(112, Index)>;
- def TTMP#Index#_gfx9 : SIReg<"ttmp"#Index, !add(108, Index)>;
- def TTMP#Index : SIReg<"", 0>;
+ def TTMP#Index#_vi : SIReg<"ttmp"#Index, !add(112, Index)>;
+ def TTMP#Index#_gfx9_gfx10 : SIReg<"ttmp"#Index, !add(108, Index)>;
+ def TTMP#Index : SIReg<"ttmp"#Index, 0>;
}
multiclass FLAT_SCR_LOHI_m <string n, bits<16> ci_e, bits<16> vi_e> {
def _ci : SIReg<n, ci_e>;
def _vi : SIReg<n, vi_e>;
- def "" : SIReg<"", 0>;
+ def "" : SIReg<n, 0>;
}
class FlatReg <Register lo, Register hi, bits<16> encoding> :
- RegisterWithSubRegs<"flat_scratch", [lo, hi]>,
+ SIRegisterWithSubRegs<"flat_scratch", [lo, hi]>,
DwarfRegAlias<lo> {
let Namespace = "AMDGPU";
let SubRegIndices = [sub0, sub1];
@@ -135,13 +190,20 @@ def FLAT_SCR_vi : FlatReg<FLAT_SCR_LO_vi, FLAT_SCR_HI_vi, 102>;
def FLAT_SCR : FlatReg<FLAT_SCR_LO, FLAT_SCR_HI, 0>;
// SGPR registers
-foreach Index = 0-103 in {
- def SGPR#Index : SIReg <"SGPR"#Index, Index>;
+foreach Index = 0-105 in {
+ def SGPR#Index : SIReg <"SGPR"#Index, Index, "s">;
}
// VGPR registers
foreach Index = 0-255 in {
- def VGPR#Index : SIReg <"VGPR"#Index, Index> {
+ def VGPR#Index : SIReg <"VGPR"#Index, Index, "v"> {
+ let HWEncoding{8} = 1;
+ }
+}
+
+// AccVGPR registers
+foreach Index = 0-255 in {
+ def AGPR#Index : SIReg <"AGPR"#Index, Index, "a"> {
let HWEncoding{8} = 1;
}
}
@@ -164,10 +226,10 @@ def M0_CLASS : RegisterClass<"AMDGPU", [i32], 32, (add M0)> {
// SGPR 32-bit registers
def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
- (add (sequence "SGPR%u", 0, 103))> {
+ (add (sequence "SGPR%u", 0, 105)), Reg32> {
// Give all SGPR classes higher priority than VGPR classes, because
// we want to spill SGPRs to VGPRs.
- let AllocationPriority = 7;
+ let AllocationPriority = 9;
}
// SGPR 64-bit registers
@@ -175,6 +237,12 @@ def SGPR_64Regs : RegisterTuples<getSubRegs<2>.ret,
[(add (decimate SGPR_32, 2)),
(add (decimate (shl SGPR_32, 1), 2))]>;
+// SGPR 96-bit registers. No operations use these; defined for symmetry with 96-bit VGPRs.
+def SGPR_96Regs : RegisterTuples<getSubRegs<3>.ret,
+ [(add (decimate SGPR_32, 3)),
+ (add (decimate (shl SGPR_32, 1), 3)),
+ (add (decimate (shl SGPR_32, 2), 3))]>;
+
// SGPR 128-bit registers
def SGPR_128Regs : RegisterTuples<getSubRegs<4>.ret,
[(add (decimate SGPR_32, 4)),
@@ -182,6 +250,14 @@ def SGPR_128Regs : RegisterTuples<getSubRegs<4>.ret,
(add (decimate (shl SGPR_32, 2), 4)),
(add (decimate (shl SGPR_32, 3), 4))]>;
+// SGPR 160-bit registers. No operations use these; defined for symmetry with 160-bit VGPRs.
+def SGPR_160Regs : RegisterTuples<getSubRegs<5>.ret,
+ [(add (decimate SGPR_32, 4)),
+ (add (decimate (shl SGPR_32, 1), 4)),
+ (add (decimate (shl SGPR_32, 2), 4)),
+ (add (decimate (shl SGPR_32, 3), 4)),
+ (add (decimate (shl SGPR_32, 4), 4))]>;
+
// SGPR 256-bit registers
def SGPR_256Regs : RegisterTuples<getSubRegs<8>.ret,
[(add (decimate SGPR_32, 4)),
@@ -212,6 +288,41 @@ def SGPR_512Regs : RegisterTuples<getSubRegs<16>.ret,
(add (decimate (shl SGPR_32, 14), 4)),
(add (decimate (shl SGPR_32, 15), 4))]>;
+// SGPR 1024-bit registers
+def SGPR_1024Regs : RegisterTuples<getSubRegs<32>.ret,
+ [(add (decimate SGPR_32, 4)),
+ (add (decimate (shl SGPR_32, 1), 4)),
+ (add (decimate (shl SGPR_32, 2), 4)),
+ (add (decimate (shl SGPR_32, 3), 4)),
+ (add (decimate (shl SGPR_32, 4), 4)),
+ (add (decimate (shl SGPR_32, 5), 4)),
+ (add (decimate (shl SGPR_32, 6), 4)),
+ (add (decimate (shl SGPR_32, 7), 4)),
+ (add (decimate (shl SGPR_32, 8), 4)),
+ (add (decimate (shl SGPR_32, 9), 4)),
+ (add (decimate (shl SGPR_32, 10), 4)),
+ (add (decimate (shl SGPR_32, 11), 4)),
+ (add (decimate (shl SGPR_32, 12), 4)),
+ (add (decimate (shl SGPR_32, 13), 4)),
+ (add (decimate (shl SGPR_32, 14), 4)),
+ (add (decimate (shl SGPR_32, 15), 4)),
+ (add (decimate (shl SGPR_32, 16), 4)),
+ (add (decimate (shl SGPR_32, 17), 4)),
+ (add (decimate (shl SGPR_32, 18), 4)),
+ (add (decimate (shl SGPR_32, 19), 4)),
+ (add (decimate (shl SGPR_32, 20), 4)),
+ (add (decimate (shl SGPR_32, 21), 4)),
+ (add (decimate (shl SGPR_32, 22), 4)),
+ (add (decimate (shl SGPR_32, 23), 4)),
+ (add (decimate (shl SGPR_32, 24), 4)),
+ (add (decimate (shl SGPR_32, 25), 4)),
+ (add (decimate (shl SGPR_32, 26), 4)),
+ (add (decimate (shl SGPR_32, 27), 4)),
+ (add (decimate (shl SGPR_32, 28), 4)),
+ (add (decimate (shl SGPR_32, 29), 4)),
+ (add (decimate (shl SGPR_32, 30), 4)),
+ (add (decimate (shl SGPR_32, 31), 4))]>;
+
// Trap handler TMP 32-bit registers
def TTMP_32 : RegisterClass<"AMDGPU", [i32, f32, v2i16, v2f16], 32,
(add (sequence "TTMP%u", 0, 15))> {
@@ -263,7 +374,7 @@ class TmpRegTuplesBase<int index, int size,
list<SubRegIndex> indices = getSubRegs<size>.ret,
int index1 = !add(index, !add(size, -1)),
string name = "ttmp["#index#":"#index1#"]"> :
- RegisterWithSubRegs<name, subRegs> {
+ SIRegisterWithSubRegs<name, subRegs> {
let HWEncoding = subRegs[0].HWEncoding;
let SubRegIndices = indices;
}
@@ -293,8 +404,8 @@ class TmpRegTuples<string tgt,
getSubRegs<size>.ret>;
foreach Index = {0, 2, 4, 6, 8, 10, 12, 14} in {
- def TTMP#Index#_TTMP#!add(Index,1)#_vi : TmpRegTuples<"_vi", 2, Index>;
- def TTMP#Index#_TTMP#!add(Index,1)#_gfx9 : TmpRegTuples<"_gfx9", 2, Index>;
+ def TTMP#Index#_TTMP#!add(Index,1)#_vi : TmpRegTuples<"_vi", 2, Index>;
+ def TTMP#Index#_TTMP#!add(Index,1)#_gfx9_gfx10 : TmpRegTuples<"_gfx9_gfx10", 2, Index>;
}
foreach Index = {0, 4, 8, 12} in {
@@ -303,7 +414,7 @@ foreach Index = {0, 4, 8, 12} in {
_TTMP#!add(Index,3)#_vi : TmpRegTuples<"_vi", 4, Index>;
def TTMP#Index#_TTMP#!add(Index,1)#
_TTMP#!add(Index,2)#
- _TTMP#!add(Index,3)#_gfx9 : TmpRegTuples<"_gfx9", 4, Index>;
+ _TTMP#!add(Index,3)#_gfx9_gfx10 : TmpRegTuples<"_gfx9_gfx10", 4, Index>;
}
foreach Index = {0, 4, 8} in {
@@ -320,7 +431,7 @@ foreach Index = {0, 4, 8} in {
_TTMP#!add(Index,4)#
_TTMP#!add(Index,5)#
_TTMP#!add(Index,6)#
- _TTMP#!add(Index,7)#_gfx9 : TmpRegTuples<"_gfx9", 8, Index>;
+ _TTMP#!add(Index,7)#_gfx9_gfx10 : TmpRegTuples<"_gfx9_gfx10", 8, Index>;
}
def TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15_vi :
@@ -330,18 +441,17 @@ def TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TT
TTMP8_vi, TTMP9_vi, TTMP10_vi, TTMP11_vi,
TTMP12_vi, TTMP13_vi, TTMP14_vi, TTMP15_vi]>;
-def TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15_gfx9 :
+def TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15_gfx9_gfx10 :
TmpRegTuplesBase<0, 16,
- [TTMP0_gfx9, TTMP1_gfx9, TTMP2_gfx9, TTMP3_gfx9,
- TTMP4_gfx9, TTMP5_gfx9, TTMP6_gfx9, TTMP7_gfx9,
- TTMP8_gfx9, TTMP9_gfx9, TTMP10_gfx9, TTMP11_gfx9,
- TTMP12_gfx9, TTMP13_gfx9, TTMP14_gfx9, TTMP15_gfx9]>;
-
+ [TTMP0_gfx9_gfx10, TTMP1_gfx9_gfx10, TTMP2_gfx9_gfx10, TTMP3_gfx9_gfx10,
+ TTMP4_gfx9_gfx10, TTMP5_gfx9_gfx10, TTMP6_gfx9_gfx10, TTMP7_gfx9_gfx10,
+ TTMP8_gfx9_gfx10, TTMP9_gfx9_gfx10, TTMP10_gfx9_gfx10, TTMP11_gfx9_gfx10,
+ TTMP12_gfx9_gfx10, TTMP13_gfx9_gfx10, TTMP14_gfx9_gfx10, TTMP15_gfx9_gfx10]>;
// VGPR 32-bit registers
// i16/f16 only on VI+
def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
- (add (sequence "VGPR%u", 0, 255))> {
+ (add (sequence "VGPR%u", 0, 255)), Reg32> {
let AllocationPriority = 1;
let Size = 32;
}
@@ -364,6 +474,14 @@ def VGPR_128 : RegisterTuples<getSubRegs<4>.ret,
(add (shl VGPR_32, 2)),
(add (shl VGPR_32, 3))]>;
+// VGPR 160-bit registers
+def VGPR_160 : RegisterTuples<getSubRegs<5>.ret,
+ [(add (trunc VGPR_32, 252)),
+ (add (shl VGPR_32, 1)),
+ (add (shl VGPR_32, 2)),
+ (add (shl VGPR_32, 3)),
+ (add (shl VGPR_32, 4))]>;
+
// VGPR 256-bit registers
def VGPR_256 : RegisterTuples<getSubRegs<8>.ret,
[(add (trunc VGPR_32, 249)),
@@ -394,88 +512,257 @@ def VGPR_512 : RegisterTuples<getSubRegs<16>.ret,
(add (shl VGPR_32, 14)),
(add (shl VGPR_32, 15))]>;
+// VGPR 1024-bit registers
+def VGPR_1024 : RegisterTuples<getSubRegs<32>.ret,
+ [(add (trunc VGPR_32, 225)),
+ (add (shl VGPR_32, 1)),
+ (add (shl VGPR_32, 2)),
+ (add (shl VGPR_32, 3)),
+ (add (shl VGPR_32, 4)),
+ (add (shl VGPR_32, 5)),
+ (add (shl VGPR_32, 6)),
+ (add (shl VGPR_32, 7)),
+ (add (shl VGPR_32, 8)),
+ (add (shl VGPR_32, 9)),
+ (add (shl VGPR_32, 10)),
+ (add (shl VGPR_32, 11)),
+ (add (shl VGPR_32, 12)),
+ (add (shl VGPR_32, 13)),
+ (add (shl VGPR_32, 14)),
+ (add (shl VGPR_32, 15)),
+ (add (shl VGPR_32, 16)),
+ (add (shl VGPR_32, 17)),
+ (add (shl VGPR_32, 18)),
+ (add (shl VGPR_32, 19)),
+ (add (shl VGPR_32, 20)),
+ (add (shl VGPR_32, 21)),
+ (add (shl VGPR_32, 22)),
+ (add (shl VGPR_32, 23)),
+ (add (shl VGPR_32, 24)),
+ (add (shl VGPR_32, 25)),
+ (add (shl VGPR_32, 26)),
+ (add (shl VGPR_32, 27)),
+ (add (shl VGPR_32, 28)),
+ (add (shl VGPR_32, 29)),
+ (add (shl VGPR_32, 30)),
+ (add (shl VGPR_32, 31))]>;
+
+// AccVGPR 32-bit registers
+def AGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
+ (add (sequence "AGPR%u", 0, 255)), Reg32> {
+ let AllocationPriority = 1;
+ let Size = 32;
+}
+
+// AGPR 64-bit registers
+def AGPR_64 : RegisterTuples<getSubRegs<2>.ret,
+ [(add (trunc AGPR_32, 255)),
+ (add (shl AGPR_32, 1))]>;
+
+// AGPR 128-bit registers
+def AGPR_128 : RegisterTuples<getSubRegs<4>.ret,
+ [(add (trunc AGPR_32, 253)),
+ (add (shl AGPR_32, 1)),
+ (add (shl AGPR_32, 2)),
+ (add (shl AGPR_32, 3))]>;
+
+// AGPR 512-bit registers
+def AGPR_512 : RegisterTuples<getSubRegs<16>.ret,
+ [(add (trunc AGPR_32, 241)),
+ (add (shl AGPR_32, 1)),
+ (add (shl AGPR_32, 2)),
+ (add (shl AGPR_32, 3)),
+ (add (shl AGPR_32, 4)),
+ (add (shl AGPR_32, 5)),
+ (add (shl AGPR_32, 6)),
+ (add (shl AGPR_32, 7)),
+ (add (shl AGPR_32, 8)),
+ (add (shl AGPR_32, 9)),
+ (add (shl AGPR_32, 10)),
+ (add (shl AGPR_32, 11)),
+ (add (shl AGPR_32, 12)),
+ (add (shl AGPR_32, 13)),
+ (add (shl AGPR_32, 14)),
+ (add (shl AGPR_32, 15))]>;
+
+// AGPR 1024-bit registers
+def AGPR_1024 : RegisterTuples<getSubRegs<32>.ret,
+ [(add (trunc AGPR_32, 225)),
+ (add (shl AGPR_32, 1)),
+ (add (shl AGPR_32, 2)),
+ (add (shl AGPR_32, 3)),
+ (add (shl AGPR_32, 4)),
+ (add (shl AGPR_32, 5)),
+ (add (shl AGPR_32, 6)),
+ (add (shl AGPR_32, 7)),
+ (add (shl AGPR_32, 8)),
+ (add (shl AGPR_32, 9)),
+ (add (shl AGPR_32, 10)),
+ (add (shl AGPR_32, 11)),
+ (add (shl AGPR_32, 12)),
+ (add (shl AGPR_32, 13)),
+ (add (shl AGPR_32, 14)),
+ (add (shl AGPR_32, 15)),
+ (add (shl AGPR_32, 16)),
+ (add (shl AGPR_32, 17)),
+ (add (shl AGPR_32, 18)),
+ (add (shl AGPR_32, 19)),
+ (add (shl AGPR_32, 20)),
+ (add (shl AGPR_32, 21)),
+ (add (shl AGPR_32, 22)),
+ (add (shl AGPR_32, 23)),
+ (add (shl AGPR_32, 24)),
+ (add (shl AGPR_32, 25)),
+ (add (shl AGPR_32, 26)),
+ (add (shl AGPR_32, 27)),
+ (add (shl AGPR_32, 28)),
+ (add (shl AGPR_32, 29)),
+ (add (shl AGPR_32, 30)),
+ (add (shl AGPR_32, 31))]>;
+
//===----------------------------------------------------------------------===//
// Register classes used as source and destination
//===----------------------------------------------------------------------===//
def Pseudo_SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
- (add FP_REG, SP_REG, SCRATCH_WAVE_OFFSET_REG)> {
+ (add FP_REG, SP_REG, SCRATCH_WAVE_OFFSET_REG), Reg32> {
let isAllocatable = 0;
let CopyCost = -1;
}
def Pseudo_SReg_128 : RegisterClass<"AMDGPU", [v4i32, v2i64, v2f64], 32,
- (add PRIVATE_RSRC_REG)> {
+ (add PRIVATE_RSRC_REG), Reg128> {
+ let isAllocatable = 0;
+ let CopyCost = -1;
+}
+
+def LDS_DIRECT_CLASS : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
+ (add LDS_DIRECT), Reg32> {
let isAllocatable = 0;
let CopyCost = -1;
}
// Subset of SReg_32 without M0 for SMRD instructions and alike.
// See comments in SIInstructions.td for more info.
-def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
+def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
(add SGPR_32, VCC_LO, VCC_HI, FLAT_SCR_LO, FLAT_SCR_HI, XNACK_MASK_LO, XNACK_MASK_HI,
- TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE, SRC_SHARED_LIMIT,
- SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT)> {
- let AllocationPriority = 7;
+ SGPR_NULL, TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE, SRC_SHARED_LIMIT,
+ SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT, SRC_POPS_EXITING_WAVE_ID,
+ SRC_VCCZ, SRC_EXECZ, SRC_SCC), Reg32> {
+ let AllocationPriority = 10;
}
-def SReg_32_XEXEC_HI : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
- (add SReg_32_XM0_XEXEC, EXEC_LO, M0_CLASS)> {
- let AllocationPriority = 7;
+def SReg_32_XEXEC_HI : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
+ (add SReg_32_XM0_XEXEC, EXEC_LO, M0_CLASS), Reg32> {
+ let AllocationPriority = 10;
}
-def SReg_32_XM0 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
- (add SReg_32_XM0_XEXEC, EXEC_LO, EXEC_HI)> {
- let AllocationPriority = 7;
+def SReg_32_XM0 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
+ (add SReg_32_XM0_XEXEC, EXEC_LO, EXEC_HI), Reg32> {
+ let AllocationPriority = 10;
}
// Register class for all scalar registers (SGPRs + Special Registers)
-def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
- (add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI, SReg_32_XEXEC_HI)> {
- let AllocationPriority = 7;
+def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
+ (add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI, SReg_32_XEXEC_HI), Reg32> {
+ let AllocationPriority = 10;
+}
+
+def SRegOrLds_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
+ (add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI, SReg_32_XEXEC_HI, LDS_DIRECT_CLASS),
+ Reg32> {
+ let isAllocatable = 0;
}
-def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, v4i16, v4f16], 32, (add SGPR_64Regs)> {
+def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, v4i16, v4f16], 32,
+ (add SGPR_64Regs), Reg64> {
let CopyCost = 1;
- let AllocationPriority = 8;
+ let AllocationPriority = 11;
+}
+
+// CCR (call clobbered registers) SGPR 64-bit registers
+def CCR_SGPR_64 : RegisterClass<"AMDGPU", SGPR_64.RegTypes, 32,
+ (add (trunc SGPR_64, 16)), Reg64> {
+ let CopyCost = SGPR_64.CopyCost;
+ let AllocationPriority = SGPR_64.AllocationPriority;
}
-def TTMP_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32, (add TTMP_64Regs)> {
+def TTMP_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32,
+ (add TTMP_64Regs)> {
let isAllocatable = 0;
}
def SReg_64_XEXEC : RegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32,
- (add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, TTMP_64, TBA, TMA)> {
+ (add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, TTMP_64, TBA, TMA), Reg64> {
let CopyCost = 1;
- let AllocationPriority = 8;
+ let AllocationPriority = 13;
}
def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32,
- (add SReg_64_XEXEC, EXEC)> {
+ (add SReg_64_XEXEC, EXEC), Reg64> {
let CopyCost = 1;
- let AllocationPriority = 8;
+ let AllocationPriority = 13;
+}
+
+def SReg_1_XEXEC : RegisterClass<"AMDGPU", [i1], 32,
+ (add SReg_64_XEXEC, SReg_32_XM0_XEXEC)> {
+ let CopyCost = 1;
+ let isAllocatable = 0;
+}
+
+def SReg_1 : RegisterClass<"AMDGPU", [i1], 32,
+ (add SReg_1_XEXEC, EXEC, EXEC_LO)> {
+ let CopyCost = 1;
+ let isAllocatable = 0;
}
// Requires 2 s_mov_b64 to copy
let CopyCost = 2 in {
-def SGPR_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64], 32, (add SGPR_128Regs)> {
- let AllocationPriority = 10;
+// There are no 3-component scalar instructions, but this is needed
+// for symmetry with VGPRs.
+def SGPR_96 : RegisterClass<"AMDGPU", [v3i32, v3f32], 32,
+ (add SGPR_96Regs), Reg96> {
+ let AllocationPriority = 14;
}
-def TTMP_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64], 32, (add TTMP_128Regs)> {
+def SReg_96 : RegisterClass<"AMDGPU", [v3i32, v3f32], 32,
+ (add SGPR_96), Reg96> {
+ let AllocationPriority = 14;
+}
+
+def SGPR_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64], 32,
+ (add SGPR_128Regs), Reg128> {
+ let AllocationPriority = 15;
+}
+
+def TTMP_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64], 32,
+ (add TTMP_128Regs)> {
let isAllocatable = 0;
}
def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32,
- (add SGPR_128, TTMP_128)> {
- let AllocationPriority = 10;
+ (add SGPR_128, TTMP_128), Reg128> {
+ let AllocationPriority = 15;
}
} // End CopyCost = 2
-def SGPR_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add SGPR_256Regs)> {
- let AllocationPriority = 11;
+// There are no 5-component scalar instructions, but this is needed
+// for symmetry with VGPRs.
+def SGPR_160 : RegisterClass<"AMDGPU", [v5i32, v5f32], 32,
+ (add SGPR_160Regs), Reg160> {
+ let AllocationPriority = 16;
+}
+
+def SReg_160 : RegisterClass<"AMDGPU", [v5i32, v5f32], 32,
+ (add SGPR_160), Reg160> {
+ let AllocationPriority = 16;
+}
+
+def SGPR_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add SGPR_256Regs),
+ Reg256> {
+ let AllocationPriority = 17;
}
def TTMP_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add TTMP_256Regs)> {
@@ -483,29 +770,48 @@ def TTMP_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add TTMP_256Regs)> {
}
def SReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32,
- (add SGPR_256, TTMP_256)> {
+ (add SGPR_256, TTMP_256), Reg256> {
// Requires 4 s_mov_b64 to copy
let CopyCost = 4;
- let AllocationPriority = 11;
+ let AllocationPriority = 17;
}
-def SGPR_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, (add SGPR_512Regs)> {
- let AllocationPriority = 12;
+def SGPR_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32,
+ (add SGPR_512Regs), Reg512> {
+ let AllocationPriority = 18;
}
-def TTMP_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, (add TTMP_512Regs)> {
+def TTMP_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32,
+ (add TTMP_512Regs)> {
let isAllocatable = 0;
}
def SReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32,
- (add SGPR_512, TTMP_512)> {
+ (add SGPR_512, TTMP_512), Reg512> {
// Requires 8 s_mov_b64 to copy
let CopyCost = 8;
- let AllocationPriority = 12;
+ let AllocationPriority = 18;
+}
+
+def VRegOrLds_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
+ (add VGPR_32, LDS_DIRECT_CLASS), Reg32> {
+ let isAllocatable = 0;
+}
+
+def SGPR_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32,
+ (add SGPR_1024Regs), Reg1024> {
+ let AllocationPriority = 19;
+}
+
+def SReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32,
+ (add SGPR_1024), Reg1024> {
+ let CopyCost = 16;
+ let AllocationPriority = 19;
}
// Register class for all vector registers (VGPRs + Interpolation Registers)
-def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32, v4f16, v4i16], 32, (add VGPR_64)> {
+def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32, v4f16, v4i16], 32,
+ (add VGPR_64), Reg64> {
let Size = 64;
// Requires 2 v_mov_b32 to copy
@@ -513,7 +819,7 @@ def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32, v4f16, v4i16], 32
let AllocationPriority = 2;
}
-def VReg_96 : RegisterClass<"AMDGPU", [untyped], 32, (add VGPR_96)> {
+def VReg_96 : RegisterClass<"AMDGPU", [v3i32, v3f32], 32, (add VGPR_96), Reg96> {
let Size = 96;
// Requires 3 v_mov_b32 to copy
@@ -521,7 +827,8 @@ def VReg_96 : RegisterClass<"AMDGPU", [untyped], 32, (add VGPR_96)> {
let AllocationPriority = 3;
}
-def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32, (add VGPR_128)> {
+def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32,
+ (add VGPR_128), Reg128> {
let Size = 128;
// Requires 4 v_mov_b32 to copy
@@ -529,28 +836,88 @@ def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32, (add VG
let AllocationPriority = 4;
}
-def VReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add VGPR_256)> {
+def VReg_160 : RegisterClass<"AMDGPU", [v5i32, v5f32], 32,
+ (add VGPR_160), Reg160> {
+ let Size = 160;
+
+ // Requires 5 v_mov_b32 to copy
+ let CopyCost = 5;
+ let AllocationPriority = 5;
+}
+
+def VReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32,
+ (add VGPR_256), Reg256> {
let Size = 256;
let CopyCost = 8;
- let AllocationPriority = 5;
+ let AllocationPriority = 6;
}
-def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, (add VGPR_512)> {
+def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32,
+ (add VGPR_512), Reg512> {
let Size = 512;
let CopyCost = 16;
- let AllocationPriority = 6;
+ let AllocationPriority = 7;
+}
+
+def VReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32,
+ (add VGPR_1024), Reg1024> {
+ let Size = 1024;
+ let CopyCost = 32;
+ let AllocationPriority = 8;
}
-def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)> {
+def AReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32, v4f16, v4i16], 32,
+ (add AGPR_64), Reg64> {
+ let Size = 64;
+
+ let CopyCost = 5;
+ let AllocationPriority = 2;
+}
+
+def AReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32,
+ (add AGPR_128), Reg128> {
+ let Size = 128;
+
+ // Requires 4 v_accvgpr_write and 4 v_accvgpr_read to copy + burn 1 vgpr
+ let CopyCost = 9;
+ let AllocationPriority = 4;
+}
+
+def AReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32,
+ (add AGPR_512), Reg512> {
+ let Size = 512;
+ let CopyCost = 33;
+ let AllocationPriority = 7;
+}
+
+def AReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32,
+ (add AGPR_1024), Reg1024> {
+ let Size = 1024;
+ let CopyCost = 65;
+ let AllocationPriority = 8;
+}
+
+def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32), Reg32> {
let Size = 32;
}
def VS_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
- (add VGPR_32, SReg_32)> {
+ (add VGPR_32, SReg_32, LDS_DIRECT_CLASS), Reg32> {
+ let isAllocatable = 0;
+}
+
+def VS_64 : RegisterClass<"AMDGPU", [i64, f64], 32, (add VReg_64, SReg_64),
+ Reg64> {
let isAllocatable = 0;
}
-def VS_64 : RegisterClass<"AMDGPU", [i64, f64], 32, (add VReg_64, SReg_64)> {
+def AV_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
+ (add AGPR_32, VGPR_32), Reg32> {
+ let isAllocatable = 0;
+}
+
+def AV_64 : RegisterClass<"AMDGPU", [i64, f64, v4f16], 32,
+ (add AReg_64, VReg_64), Reg64> {
let isAllocatable = 0;
}
@@ -563,47 +930,40 @@ class RegImmMatcher<string name> : AsmOperandClass {
let RenderMethod = "addRegOrImmOperands";
}
-multiclass SIRegOperand <string rc, string MatchName, string opType> {
+multiclass SIRegOperand32 <string rc, string MatchName, string opType,
+ string rc_suffix = "_32"> {
let OperandNamespace = "AMDGPU" in {
- def _b16 : RegisterOperand<!cast<RegisterClass>(rc#"_32")> {
+ def _b16 : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix)> {
let OperandType = opType#"_INT16";
let ParserMatchClass = RegImmMatcher<MatchName#"B16">;
let DecoderMethod = "decodeOperand_VSrc16";
}
- def _f16 : RegisterOperand<!cast<RegisterClass>(rc#"_32")> {
+ def _f16 : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix)> {
let OperandType = opType#"_FP16";
let ParserMatchClass = RegImmMatcher<MatchName#"F16">;
- let DecoderMethod = "decodeOperand_VSrc16";
+ let DecoderMethod = "decodeOperand_" # rc # "_16";
}
- def _b32 : RegisterOperand<!cast<RegisterClass>(rc#"_32")> {
+ def _b32 : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix)> {
let OperandType = opType#"_INT32";
let ParserMatchClass = RegImmMatcher<MatchName#"B32">;
+ let DecoderMethod = "decodeOperand_" # rc # rc_suffix;
}
- def _f32 : RegisterOperand<!cast<RegisterClass>(rc#"_32")> {
+ def _f32 : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix)> {
let OperandType = opType#"_FP32";
let ParserMatchClass = RegImmMatcher<MatchName#"F32">;
+ let DecoderMethod = "decodeOperand_" # rc # rc_suffix;
}
- def _b64 : RegisterOperand<!cast<RegisterClass>(rc#"_64")> {
- let OperandType = opType#"_INT64";
- let ParserMatchClass = RegImmMatcher<MatchName#"B64">;
- }
-
- def _f64 : RegisterOperand<!cast<RegisterClass>(rc#"_64")> {
- let OperandType = opType#"_FP64";
- let ParserMatchClass = RegImmMatcher<MatchName#"F64">;
- }
-
- def _v2b16 : RegisterOperand<!cast<RegisterClass>(rc#"_32")> {
+ def _v2b16 : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix)> {
let OperandType = opType#"_V2INT16";
let ParserMatchClass = RegImmMatcher<MatchName#"V2B16">;
let DecoderMethod = "decodeOperand_VSrcV216";
}
- def _v2f16 : RegisterOperand<!cast<RegisterClass>(rc#"_32")> {
+ def _v2f16 : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix)> {
let OperandType = opType#"_V2FP16";
let ParserMatchClass = RegImmMatcher<MatchName#"V2F16">;
let DecoderMethod = "decodeOperand_VSrcV216";
@@ -611,6 +971,21 @@ multiclass SIRegOperand <string rc, string MatchName, string opType> {
}
}
+multiclass SIRegOperand <string rc, string MatchName, string opType> :
+ SIRegOperand32<rc, MatchName, opType> {
+ let OperandNamespace = "AMDGPU" in {
+ def _b64 : RegisterOperand<!cast<RegisterClass>(rc#"_64")> {
+ let OperandType = opType#"_INT64";
+ let ParserMatchClass = RegImmMatcher<MatchName#"B64">;
+ }
+
+ def _f64 : RegisterOperand<!cast<RegisterClass>(rc#"_64")> {
+ let OperandType = opType#"_FP64";
+ let ParserMatchClass = RegImmMatcher<MatchName#"F64">;
+ }
+ }
+}
+
// FIXME: 64-bit sources can sometimes use 32-bit constants.
multiclass RegImmOperand <string rc, string MatchName>
: SIRegOperand<rc, MatchName, "OPERAND_REG_IMM">;
@@ -618,20 +993,32 @@ multiclass RegImmOperand <string rc, string MatchName>
multiclass RegInlineOperand <string rc, string MatchName>
: SIRegOperand<rc, MatchName, "OPERAND_REG_INLINE_C">;
+multiclass RegInlineOperand32 <string rc, string MatchName,
+ string rc_suffix = "_32">
+ : SIRegOperand32<rc, MatchName, "OPERAND_REG_INLINE_C", rc_suffix>;
+
+multiclass RegInlineOperandAC <string rc, string MatchName,
+ string rc_suffix = "_32">
+ : SIRegOperand32<rc, MatchName, "OPERAND_REG_INLINE_AC", rc_suffix>;
+
//===----------------------------------------------------------------------===//
// SSrc_* Operands with an SGPR or a 32-bit immediate
//===----------------------------------------------------------------------===//
defm SSrc : RegImmOperand<"SReg", "SSrc">;
+def SSrcOrLds_b32 : RegisterOperand<SRegOrLds_32> {
+ let OperandNamespace = "AMDGPU";
+ let OperandType = "OPERAND_REG_IMM_INT32";
+ let ParserMatchClass = RegImmMatcher<"SSrcOrLdsB32">;
+}
+
//===----------------------------------------------------------------------===//
// SCSrc_* Operands with an SGPR or a inline constant
//===----------------------------------------------------------------------===//
defm SCSrc : RegInlineOperand<"SReg", "SCSrc"> ;
-def SCSrc_i1 : RegisterOperand<SReg_64_XEXEC>;
-
//===----------------------------------------------------------------------===//
// VSrc_* Operands with an SGPR, VGPR or a 32-bit immediate
//===----------------------------------------------------------------------===//
@@ -654,7 +1041,45 @@ def VRegSrc_32 : RegisterOperand<VGPR_32> {
}
//===----------------------------------------------------------------------===//
+// ASrc_* Operands with an AccVGPR
+//===----------------------------------------------------------------------===//
+
+def ARegSrc_32 : RegisterOperand<AGPR_32> {
+ let DecoderMethod = "DecodeAGPR_32RegisterClass";
+ let EncoderMethod = "getAVOperandEncoding";
+}
+
+//===----------------------------------------------------------------------===//
// VCSrc_* Operands with an SGPR, VGPR or an inline constant
//===----------------------------------------------------------------------===//
defm VCSrc : RegInlineOperand<"VS", "VCSrc">;
+
+//===----------------------------------------------------------------------===//
+// VISrc_* Operands with a VGPR or an inline constant
+//===----------------------------------------------------------------------===//
+
+defm VISrc : RegInlineOperand32<"VGPR", "VISrc">;
+
+//===----------------------------------------------------------------------===//
+// AVSrc_* Operands with an AGPR or VGPR
+//===----------------------------------------------------------------------===//
+
+def AVSrc_32 : RegisterOperand<AV_32> {
+ let DecoderMethod = "DecodeAV_32RegisterClass";
+ let EncoderMethod = "getAVOperandEncoding";
+}
+
+def AVSrc_64 : RegisterOperand<AV_64> {
+ let DecoderMethod = "DecodeAV_64RegisterClass";
+ let EncoderMethod = "getAVOperandEncoding";
+}
+
+//===----------------------------------------------------------------------===//
+// ACSrc_* Operands with an AGPR or an inline constant
+//===----------------------------------------------------------------------===//
+
+defm AISrc : RegInlineOperandAC<"AGPR", "AISrc">;
+defm AISrc_128 : RegInlineOperandAC<"AReg", "AISrc_128", "_128">;
+defm AISrc_512 : RegInlineOperandAC<"AReg", "AISrc_512", "_512">;
+defm AISrc_1024 : RegInlineOperandAC<"AReg", "AISrc_1024", "_1024">;
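+
+// A rough reading of the multiclasses above (sketch, not an exhaustive
+// list): each RegInlineOperandAC instantiation expands into
+// _b16/_f16/_b32/_f32/_v2b16/_v2f16 operand definitions over the named
+// register class, so e.g. AISrc_128 yields AISrc_128_b32 and friends over
+// AReg_128 with OPERAND_REG_INLINE_AC_* operand types.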
diff --git a/lib/Target/AMDGPU/SISchedule.td b/lib/Target/AMDGPU/SISchedule.td
index 7af69cb6a46d..824d1aeb0df9 100644
--- a/lib/Target/AMDGPU/SISchedule.td
+++ b/lib/Target/AMDGPU/SISchedule.td
@@ -1,9 +1,8 @@
//===-- SISchedule.td - SI Scheduling definitions ------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -25,6 +24,9 @@ def WriteSMEM : SchedWrite;
def WriteVMEM : SchedWrite;
def WriteBarrier : SchedWrite;
+def MIVGPRRead : SchedRead;
+def MIMFMARead : SchedRead;
+
// Vector ALU instructions
def Write32Bit : SchedWrite;
def WriteQuarterRate32 : SchedWrite;
@@ -38,9 +40,17 @@ def WriteDouble : SchedWrite;
// half rate f64 instruction (same as v_add_f64)
def WriteDoubleAdd : SchedWrite;
+// Conversion to or from f64 instruction
+def WriteDoubleCvt : SchedWrite;
+
// Half rate 64-bit instructions.
def Write64Bit : SchedWrite;
+// mAI multipass instructions.
+def Write2PassMAI : SchedWrite;
+def Write8PassMAI : SchedWrite;
+def Write16PassMAI : SchedWrite;
+
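+// The pass counts above correspond to the MFMA shapes they are mapped to
+// in SICommonWriteRes below: the 4x4 variants take 2 passes, 16x16 take 8,
+// and 32x32 take 16.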
// FIXME: Should there be a class for instructions which are VALU
// instructions and have VALU rates, but write to the SALU (i.e. VOPC
// instructions)?
@@ -62,6 +72,7 @@ class SISchedMachineModel : SchedMachineModel {
def SIFullSpeedModel : SISchedMachineModel;
def SIQuarterSpeedModel : SISchedMachineModel;
+def GFX10SpeedModel : SISchedMachineModel;
// XXX: Are the resource counts correct?
def HWBranch : ProcResource<1> {
@@ -82,6 +93,9 @@ def HWVMEM : ProcResource<1> {
def HWVALU : ProcResource<1> {
let BufferSize = 1;
}
+def HWRC : ProcResource<1> { // Register destination cache
+ let BufferSize = 1;
+}
class HWWriteRes<SchedWrite write, list<ProcResourceKind> resources,
int latency> : WriteRes<write, resources> {
@@ -91,6 +105,11 @@ class HWWriteRes<SchedWrite write, list<ProcResourceKind> resources,
class HWVALUWriteRes<SchedWrite write, int latency> :
HWWriteRes<write, [HWVALU], latency>;
+def PredMIReadVGPR : SchedPredicate<[{TII->hasVGPRUses(*MI)}]>;
+
+def MIReadVGPR : SchedReadVariant<[
+ SchedVar<PredMIReadVGPR, [MIVGPRRead]>,
+ SchedVar<NoSchedPred, [ReadDefault]>]>;
// The latency numbers are taken from AMD Accelerated Parallel Processing
// guide. They may not be accurate.
@@ -109,6 +128,24 @@ multiclass SICommonWriteRes {
def : HWVALUWriteRes<Write32Bit, 1>;
def : HWVALUWriteRes<Write64Bit, 2>;
def : HWVALUWriteRes<WriteQuarterRate32, 4>;
+ def : HWVALUWriteRes<Write2PassMAI, 2>;
+ def : HWVALUWriteRes<Write8PassMAI, 8>;
+ def : HWVALUWriteRes<Write16PassMAI, 16>;
+
+ def : ReadAdvance<MIVGPRRead, -2>;
+ def : InstRW<[Write64Bit, MIReadVGPR], (instregex "^V_ACCVGPR_WRITE_B32$")>;
+
+ // Technically, MFMA reads can be anywhere from 0 to 4 cycles, but that
+ // does not make sense to model because the register setup is huge. In
+ // particular, if we properly modeled the read advance as -2 for a VGPR
+ // read, it would result in bad scheduling of the acc writes before that
+ // MFMA. To avoid that we would need 2 or 4 more VGPRs to be initialized
+ // before the acc write sequence. Just assume the worst case here.
+ def : ReadAdvance<MIMFMARead, -4>;
+
+ def : InstRW<[Write2PassMAI, MIMFMARead], (instregex "^V_MFMA_..._4X4X")>;
+ def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_..._16X16X")>;
+ def : InstRW<[Write16PassMAI, MIMFMARead], (instregex "^V_MFMA_..._32X32X")>;
}
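
// A worked example of the modeling above (this is one reading of how the
// scheduler consumes these entries, not a definitive statement): the
// machine scheduler subtracts a ReadAdvance from the producer's write
// latency, so a negative advance lengthens the observed latency. With
// Write2PassMAI's latency of 2 and the -4 MIMFMARead advance, an MFMA that
// reads the result of another MFMA sees roughly 2 - (-4) = 6 cycles.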
def PredIsVGPR32Copy : SchedPredicate<[{TII->isVGPRCopy(*MI) && TII->getOpSize(*MI, 0) <= 32}]>;
@@ -125,6 +162,7 @@ defm : SICommonWriteRes;
def : HWVALUWriteRes<WriteFloatFMA, 1>;
def : HWVALUWriteRes<WriteDouble, 4>;
def : HWVALUWriteRes<WriteDoubleAdd, 2>;
+def : HWVALUWriteRes<WriteDoubleCvt, 4>;
def : InstRW<[WriteCopy], (instrs COPY)>;
@@ -137,7 +175,32 @@ defm : SICommonWriteRes;
def : HWVALUWriteRes<WriteFloatFMA, 16>;
def : HWVALUWriteRes<WriteDouble, 16>;
def : HWVALUWriteRes<WriteDoubleAdd, 8>;
+def : HWVALUWriteRes<WriteDoubleCvt, 4>;
def : InstRW<[WriteCopy], (instrs COPY)>;
} // End SchedModel = SIQuarterSpeedModel
+
+let SchedModel = GFX10SpeedModel in {
+
+// The latency values are 1 / (operations / cycle).
+// Add 1 stall cycle for VGPR read.
+def : HWWriteRes<Write32Bit, [HWVALU, HWRC], 5>;
+def : HWWriteRes<Write64Bit, [HWVALU, HWRC], 9>;
+def : HWWriteRes<WriteQuarterRate32, [HWVALU, HWRC], 17>;
+def : HWWriteRes<WriteFloatFMA, [HWVALU, HWRC], 5>;
+def : HWWriteRes<WriteDouble, [HWVALU, HWRC], 17>;
+def : HWWriteRes<WriteDoubleAdd, [HWVALU, HWRC], 17>;
+def : HWWriteRes<WriteDoubleCvt, [HWVALU, HWRC], 17>;
+
+def : HWWriteRes<WriteBranch, [HWBranch], 32>;
+def : HWWriteRes<WriteExport, [HWExport, HWRC], 16>;
+def : HWWriteRes<WriteLDS, [HWLGKM, HWRC], 20>;
+def : HWWriteRes<WriteSALU, [HWSALU, HWRC], 5>;
+def : HWWriteRes<WriteSMEM, [HWLGKM, HWRC], 20>;
+def : HWWriteRes<WriteVMEM, [HWVMEM, HWRC], 320>;
+def : HWWriteRes<WriteBarrier, [HWBranch], 2000>;
+
+def : InstRW<[WriteCopy], (instrs COPY)>;
+
+} // End SchedModel = GFX10SpeedModel
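+
+// A worked example of the formula in the comment above (assuming the usual
+// reading that "quarter rate" means a quarter of the full VALU rate): the
+// full-rate Write32Bit entry corresponds to 4 cycles per operation plus the
+// 1-cycle VGPR-read stall, giving 5; a quarter-rate operation then takes
+// 4 * 4 = 16 cycles plus the stall, which is the 17 used above for
+// WriteQuarterRate32, WriteDouble and WriteDoubleAdd.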
diff --git a/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 6ad7dd0e3a7c..7ee178149c7a 100644
--- a/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -1,9 +1,8 @@
//===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
/// The pass tries to use the 32-bit encoding for instructions when possible.
//===----------------------------------------------------------------------===//
@@ -39,6 +38,8 @@ class SIShrinkInstructions : public MachineFunctionPass {
public:
static char ID;
+ void shrinkMIMG(MachineInstr &MI);
+
public:
SIShrinkInstructions() : MachineFunctionPass(ID) {
}
@@ -94,6 +95,10 @@ static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
Src0.setSubReg(0);
Src0.ChangeToFrameIndex(MovSrc.getIndex());
ConstantFolded = true;
+ } else if (MovSrc.isGlobal()) {
+ Src0.ChangeToGA(MovSrc.getGlobal(), MovSrc.getOffset(),
+ MovSrc.getTargetFlags());
+ ConstantFolded = true;
}
if (ConstantFolded) {
@@ -212,6 +217,96 @@ static void shrinkScalarCompare(const SIInstrInfo *TII, MachineInstr &MI) {
}
}
+// Shrink NSA-encoded instructions whose VGPRs are contiguous to the non-NSA encoding.
+void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) {
+ const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
+ if (Info->MIMGEncoding != AMDGPU::MIMGEncGfx10NSA)
+ return;
+
+ MachineFunction *MF = MI.getParent()->getParent();
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ const SIRegisterInfo &TRI = TII->getRegisterInfo();
+ int VAddr0Idx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
+ unsigned NewAddrDwords = Info->VAddrDwords;
+ const TargetRegisterClass *RC;
+
+ if (Info->VAddrDwords == 2) {
+ RC = &AMDGPU::VReg_64RegClass;
+ } else if (Info->VAddrDwords == 3) {
+ RC = &AMDGPU::VReg_96RegClass;
+ } else if (Info->VAddrDwords == 4) {
+ RC = &AMDGPU::VReg_128RegClass;
+ } else if (Info->VAddrDwords <= 8) {
+ RC = &AMDGPU::VReg_256RegClass;
+ NewAddrDwords = 8;
+ } else {
+ RC = &AMDGPU::VReg_512RegClass;
+ NewAddrDwords = 16;
+ }
+
+ unsigned VgprBase = 0;
+ bool IsUndef = true;
+ bool IsKill = NewAddrDwords == Info->VAddrDwords;
+ for (unsigned i = 0; i < Info->VAddrDwords; ++i) {
+ const MachineOperand &Op = MI.getOperand(VAddr0Idx + i);
+ unsigned Vgpr = TRI.getHWRegIndex(Op.getReg());
+
+ if (i == 0) {
+ VgprBase = Vgpr;
+ } else if (VgprBase + i != Vgpr)
+ return;
+
+ if (!Op.isUndef())
+ IsUndef = false;
+ if (!Op.isKill())
+ IsKill = false;
+ }
+
+ if (VgprBase + NewAddrDwords > 256)
+ return;
+
+ // Further check for an implicit tied operand - one may be present if TFE
+ // or LWE is enabled.
+ int TFEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::tfe);
+ int LWEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::lwe);
+ unsigned TFEVal = MI.getOperand(TFEIdx).getImm();
+ unsigned LWEVal = MI.getOperand(LWEIdx).getImm();
+ int ToUntie = -1;
+ if (TFEVal || LWEVal) {
+ // TFE/LWE is enabled so we need to deal with an implicit tied operand
+ for (unsigned i = LWEIdx + 1, e = MI.getNumOperands(); i != e; ++i) {
+ if (MI.getOperand(i).isReg() && MI.getOperand(i).isTied() &&
+ MI.getOperand(i).isImplicit()) {
+ // This is the tied operand
+ assert(
+ ToUntie == -1 &&
+ "found more than one tied implicit operand when expecting only 1");
+ ToUntie = i;
+ MI.untieRegOperand(ToUntie);
+ }
+ }
+ }
+
+ unsigned NewOpcode =
+ AMDGPU::getMIMGOpcode(Info->BaseOpcode, AMDGPU::MIMGEncGfx10Default,
+ Info->VDataDwords, NewAddrDwords);
+ MI.setDesc(TII->get(NewOpcode));
+ MI.getOperand(VAddr0Idx).setReg(RC->getRegister(VgprBase));
+ MI.getOperand(VAddr0Idx).setIsUndef(IsUndef);
+ MI.getOperand(VAddr0Idx).setIsKill(IsKill);
+
+ for (unsigned i = 1; i < Info->VAddrDwords; ++i)
+ MI.RemoveOperand(VAddr0Idx + 1);
+
+ if (ToUntie >= 0) {
+ MI.tieOperands(
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata),
+ ToUntie - (Info->VAddrDwords - 1));
+ }
+}
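+
+// A sketch of the rewrite performed by shrinkMIMG on a made-up case (the
+// register numbers are hypothetical, not taken from a real test): an NSA
+// IMAGE_SAMPLE whose address operands happen to be v4, v5, v6, v7 can be
+// re-encoded in the default (contiguous) form with v[4:7] as vaddr0,
+// whereas a non-contiguous set such as v4, v6, v7, v9 hits the early
+// return above and is left in NSA form.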
+
/// Attempt to shrink AND/OR/XOR operations requiring non-inlineable literals.
/// For AND or OR, try using S_BITSET{0,1} to clear or set bits.
/// If the inverse of the immediate is legal, use ANDN2, ORN2 or
@@ -277,7 +372,9 @@ static bool shrinkScalarLogicOp(const GCNSubtarget &ST,
if (Opc == AMDGPU::S_BITSET0_B32 ||
Opc == AMDGPU::S_BITSET1_B32) {
Src0->ChangeToImmediate(NewImm);
- MI.RemoveOperand(2);
+ // Remove the immediate and add the tied input.
+ MI.getOperand(2).ChangeToRegister(Dest->getReg(), false);
+ MI.tieOperands(0, 2);
} else {
SrcImm->setImm(NewImm);
}
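
  // A concrete (hypothetical) instance of the tied-operand form above:
  // "s_and_b32 s0, s0, 0xffffdfff" clears bit 13 with a non-inlineable
  // literal, so it is shrunk to "s_bitset0_b32 s0, 13"; because S_BITSET*
  // reads and writes its destination, the old literal slot is rewritten as
  // a register use of s0 and tied to the def.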
@@ -458,6 +555,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
MachineRegisterInfo &MRI = MF.getRegInfo();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
+ unsigned VCCReg = ST.isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;
std::vector<unsigned> I1Defs;
@@ -596,6 +694,14 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
continue;
}
+ if (TII->isMIMG(MI.getOpcode()) &&
+ ST.getGeneration() >= AMDGPUSubtarget::GFX10 &&
+ MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::NoVRegs)) {
+ shrinkMIMG(MI);
+ continue;
+ }
+
if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
continue;
@@ -625,10 +731,10 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
// So, instead of forcing the instruction to write to VCC, we provide
// a hint to the register allocator to use VCC and then we will run
// this pass again after RA and shrink it if it outputs to VCC.
- MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, AMDGPU::VCC);
+ MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, VCCReg);
continue;
}
- if (DstReg != AMDGPU::VCC)
+ if (DstReg != VCCReg)
continue;
}
@@ -641,10 +747,10 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
continue;
unsigned SReg = Src2->getReg();
if (TargetRegisterInfo::isVirtualRegister(SReg)) {
- MRI.setRegAllocationHint(SReg, 0, AMDGPU::VCC);
+ MRI.setRegAllocationHint(SReg, 0, VCCReg);
continue;
}
- if (SReg != AMDGPU::VCC)
+ if (SReg != VCCReg)
continue;
}
@@ -657,20 +763,24 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
AMDGPU::OpName::src2);
if (SDst) {
- if (SDst->getReg() != AMDGPU::VCC) {
+ bool Next = false;
+
+ if (SDst->getReg() != VCCReg) {
if (TargetRegisterInfo::isVirtualRegister(SDst->getReg()))
- MRI.setRegAllocationHint(SDst->getReg(), 0, AMDGPU::VCC);
- continue;
+ MRI.setRegAllocationHint(SDst->getReg(), 0, VCCReg);
+ Next = true;
}
// All of the instructions with carry outs also have an SGPR input in
// src2.
- if (Src2 && Src2->getReg() != AMDGPU::VCC) {
+ if (Src2 && Src2->getReg() != VCCReg) {
if (TargetRegisterInfo::isVirtualRegister(Src2->getReg()))
- MRI.setRegAllocationHint(Src2->getReg(), 0, AMDGPU::VCC);
+ MRI.setRegAllocationHint(Src2->getReg(), 0, VCCReg);
+ Next = true;
+ }
+ if (Next)
continue;
- }
}
// We can shrink this instruction
diff --git a/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 879726b1528c..4e07efff55d8 100644
--- a/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -1,9 +1,8 @@
//===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -149,6 +148,7 @@ private:
CallingConv::ID CallingConv;
const SIInstrInfo *TII;
const SIRegisterInfo *TRI;
+ const GCNSubtarget *ST;
MachineRegisterInfo *MRI;
LiveIntervals *LIS;
@@ -201,6 +201,8 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<LiveIntervals>();
+ AU.addPreserved<SlotIndexes>();
+ AU.addPreserved<LiveIntervals>();
AU.setPreservesCFG();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -277,7 +279,7 @@ void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
// for VCC, which can appear as the (implicit) input of a uniform branch,
// e.g. when a loop counter is stored in a VGPR.
if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
- if (Reg == AMDGPU::EXEC)
+ if (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO)
continue;
for (MCRegUnitIterator RegUnit(Reg, TRI); RegUnit.isValid(); ++RegUnit) {
@@ -386,7 +388,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
unsigned Reg = MO.getReg();
if (!TRI->isVirtualRegister(Reg) &&
- TRI->hasVGPRs(TRI->getPhysRegClass(Reg))) {
+ TRI->hasVectorRegisters(TRI->getPhysRegClass(Reg))) {
Flags = StateWQM;
break;
}
@@ -619,13 +621,16 @@ void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
MachineInstr *MI;
if (SaveWQM) {
- MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_SAVEEXEC_B64),
+ MI = BuildMI(MBB, Before, DebugLoc(), TII->get(ST->isWave32() ?
+ AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64),
SaveWQM)
.addReg(LiveMaskReg);
} else {
- MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_B64),
- AMDGPU::EXEC)
- .addReg(AMDGPU::EXEC)
+ unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+ MI = BuildMI(MBB, Before, DebugLoc(), TII->get(ST->isWave32() ?
+ AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64),
+ Exec)
+ .addReg(Exec)
.addReg(LiveMaskReg);
}
@@ -637,13 +642,15 @@ void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
unsigned SavedWQM) {
MachineInstr *MI;
+ unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
if (SavedWQM) {
- MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::EXEC)
+ MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), Exec)
.addReg(SavedWQM);
} else {
- MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
- AMDGPU::EXEC)
- .addReg(AMDGPU::EXEC);
+ MI = BuildMI(MBB, Before, DebugLoc(), TII->get(ST->isWave32() ?
+ AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64),
+ Exec)
+ .addReg(Exec);
}
LIS->InsertMachineInstrInMaps(*MI);
@@ -655,8 +662,7 @@ void SIWholeQuadMode::toWWM(MachineBasicBlock &MBB,
MachineInstr *MI;
assert(SaveOrig);
- MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_OR_SAVEEXEC_B64),
- SaveOrig)
+ MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_WWM), SaveOrig)
.addImm(-1);
LIS->InsertMachineInstrInMaps(*MI);
}
@@ -667,7 +673,8 @@ void SIWholeQuadMode::fromWWM(MachineBasicBlock &MBB,
MachineInstr *MI;
assert(SavedOrig);
- MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_WWM), AMDGPU::EXEC)
+ MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_WWM),
+ ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC)
.addReg(SavedOrig);
LIS->InsertMachineInstrInMaps(*MI);
}
@@ -693,6 +700,7 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
bool WQMFromExec = isEntry;
char State = (isEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
char NonWWMState = 0;
+ const TargetRegisterClass *BoolRC = TRI->getBoolRC();
auto II = MBB.getFirstNonPHI(), IE = MBB.end();
if (isEntry)
@@ -780,13 +788,13 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
if (Needs == StateWWM) {
NonWWMState = State;
- SavedNonWWMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ SavedNonWWMReg = MRI->createVirtualRegister(BoolRC);
toWWM(MBB, Before, SavedNonWWMReg);
State = StateWWM;
} else {
if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) {
if (!WQMFromExec && (OutNeeds & StateWQM))
- SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ SavedWQMReg = MRI->createVirtualRegister(BoolRC);
toExact(MBB, Before, SavedWQMReg, LiveMaskReg);
State = StateExact;
@@ -838,7 +846,23 @@ void SIWholeQuadMode::lowerCopyInstrs() {
for (MachineInstr *MI : LowerToCopyInstrs) {
for (unsigned i = MI->getNumExplicitOperands() - 1; i > 1; i--)
MI->RemoveOperand(i);
- MI->setDesc(TII->get(AMDGPU::COPY));
+
+ const unsigned Reg = MI->getOperand(0).getReg();
+
+ if (TRI->isVGPR(*MRI, Reg)) {
+ const TargetRegisterClass *regClass =
+ TargetRegisterInfo::isVirtualRegister(Reg)
+ ? MRI->getRegClass(Reg)
+ : TRI->getPhysRegClass(Reg);
+
+ const unsigned MovOp = TII->getMovOpcode(regClass);
+ MI->setDesc(TII->get(MovOp));
+
+ // And make it implicitly depend on exec (like all VALU movs should do).
+ MI->addOperand(MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
+ } else {
+ MI->setDesc(TII->get(AMDGPU::COPY));
+ }
}
}
@@ -849,17 +873,18 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
LowerToCopyInstrs.clear();
CallingConv = MF.getFunction().getCallingConv();
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ ST = &MF.getSubtarget<GCNSubtarget>();
- TII = ST.getInstrInfo();
+ TII = ST->getInstrInfo();
TRI = &TII->getRegisterInfo();
MRI = &MF.getRegInfo();
LIS = &getAnalysis<LiveIntervals>();
char GlobalFlags = analyzeFunction(MF);
unsigned LiveMaskReg = 0;
+ unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
if (!(GlobalFlags & StateWQM)) {
- lowerLiveMaskQueries(AMDGPU::EXEC);
+ lowerLiveMaskQueries(Exec);
if (!(GlobalFlags & StateWWM))
return !LiveMaskQueries.empty();
} else {
@@ -868,10 +893,10 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();
if (GlobalFlags & StateExact || !LiveMaskQueries.empty()) {
- LiveMaskReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());
MachineInstr *MI = BuildMI(Entry, EntryMI, DebugLoc(),
TII->get(AMDGPU::COPY), LiveMaskReg)
- .addReg(AMDGPU::EXEC);
+ .addReg(Exec);
LIS->InsertMachineInstrInMaps(*MI);
}
@@ -879,9 +904,10 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
if (GlobalFlags == StateWQM) {
// For a shader that needs only WQM, we can just set it once.
- BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
- AMDGPU::EXEC)
- .addReg(AMDGPU::EXEC);
+ BuildMI(Entry, EntryMI, DebugLoc(), TII->get(ST->isWave32() ?
+ AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64),
+ Exec)
+ .addReg(Exec);
lowerCopyInstrs();
// EntryMI may become invalid here
diff --git a/lib/Target/AMDGPU/SMInstructions.td b/lib/Target/AMDGPU/SMInstructions.td
index 8a063e1a4867..1b410b6b5912 100644
--- a/lib/Target/AMDGPU/SMInstructions.td
+++ b/lib/Target/AMDGPU/SMInstructions.td
@@ -1,9 +1,8 @@
//===---- SMInstructions.td - Scalar Memory Instruction Definitions -------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -34,7 +33,6 @@ class SM_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> patt
let hasSideEffects = 0;
let UseNamedOperandTable = 1;
let SchedRW = [WriteSMEM];
- let SubtargetPredicate = isGCN;
string Mnemonic = opName;
string AsmOperands = asmOps;
@@ -42,6 +40,7 @@ class SM_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> patt
bits<1> has_sbase = 1;
bits<1> has_sdst = 1;
bit has_glc = 0;
+ bit has_dlc = 0;
bits<1> has_offset = 1;
bits<1> offset_is_imm = 0;
}
@@ -81,6 +80,7 @@ class SM_Load_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag>
let mayLoad = 1;
let mayStore = 0;
let has_glc = 1;
+ let has_dlc = 1;
}
class SM_Store_Pseudo <string opName, dag ins, string asmOps, list<dag> pattern = []>
@@ -90,6 +90,7 @@ class SM_Store_Pseudo <string opName, dag ins, string asmOps, list<dag> pattern
let mayLoad = 0;
let mayStore = 1;
let has_glc = 1;
+ let has_dlc = 1;
let ScalarStore = 1;
}
@@ -110,21 +111,23 @@ multiclass SM_Pseudo_Loads<string opName,
RegisterClass dstClass> {
def _IMM : SM_Load_Pseudo <opName,
(outs dstClass:$sdst),
- (ins baseClass:$sbase, i32imm:$offset, i1imm:$glc),
- " $sdst, $sbase, $offset$glc", []> {
+ (ins baseClass:$sbase, i32imm:$offset, i1imm:$glc, i1imm:$dlc),
+ " $sdst, $sbase, $offset$glc$dlc", []> {
let offset_is_imm = 1;
let BaseClass = baseClass;
let PseudoInstr = opName # "_IMM";
let has_glc = 1;
+ let has_dlc = 1;
}
def _SGPR : SM_Load_Pseudo <opName,
(outs dstClass:$sdst),
- (ins baseClass:$sbase, SReg_32:$soff, i1imm:$glc),
- " $sdst, $sbase, $offset$glc", []> {
+ (ins baseClass:$sbase, SReg_32:$soff, i1imm:$glc, i1imm:$dlc),
+ " $sdst, $sbase, $offset$glc$dlc", []> {
let BaseClass = baseClass;
let PseudoInstr = opName # "_SGPR";
let has_glc = 1;
+ let has_dlc = 1;
}
}
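
// With both cache bits set, the printed form of e.g. the _IMM variant would
// look something like "s_load_dword s4, s[0:1], 0x10 glc dlc" (operands
// here are made up for illustration); the $glc/$dlc operands print as
// trailing " glc"/" dlc" tokens only when set.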
@@ -132,8 +135,8 @@ multiclass SM_Pseudo_Stores<string opName,
RegisterClass baseClass,
RegisterClass srcClass> {
def _IMM : SM_Store_Pseudo <opName,
- (ins srcClass:$sdata, baseClass:$sbase, i32imm:$offset, i1imm:$glc),
- " $sdata, $sbase, $offset$glc", []> {
+ (ins srcClass:$sdata, baseClass:$sbase, i32imm:$offset, i1imm:$glc, i1imm:$dlc),
+ " $sdata, $sbase, $offset$glc$dlc", []> {
let offset_is_imm = 1;
let BaseClass = baseClass;
let SrcClass = srcClass;
@@ -141,8 +144,8 @@ multiclass SM_Pseudo_Stores<string opName,
}
def _SGPR : SM_Store_Pseudo <opName,
- (ins srcClass:$sdata, baseClass:$sbase, SReg_32:$soff, i1imm:$glc),
- " $sdata, $sbase, $offset$glc", []> {
+ (ins srcClass:$sdata, baseClass:$sbase, SReg_32:$soff, i1imm:$glc, i1imm:$dlc),
+ " $sdata, $sbase, $offset$glc$dlc", []> {
let BaseClass = baseClass;
let SrcClass = srcClass;
let PseudoInstr = opName # "_SGPR";
@@ -154,17 +157,25 @@ multiclass SM_Pseudo_Discards<string opName> {
def _SGPR : SM_Discard_Pseudo <opName, (ins SReg_64:$sbase, SReg_32:$offset), 0>;
}
-class SM_Time_Pseudo<string opName, SDPatternOperator node> : SM_Pseudo<
+class SM_Time_Pseudo<string opName, SDPatternOperator node = null_frag> : SM_Pseudo<
opName, (outs SReg_64_XEXEC:$sdst), (ins),
" $sdst", [(set i64:$sdst, (node))]> {
let hasSideEffects = 1;
- let mayStore = 0;
+
+ // FIXME: This should definitively be mayStore = 0. TableGen incorrectly
+ // tries to infer these flags from the intrinsic properties that
+ // correspond to the IR attributes. The target intrinsics are treated as
+ // writing to memory for IR dependency purposes, but that can be modeled
+ // with hasSideEffects here. The flags also end up being inferred
+ // differently for llvm.readcyclecounter and the amdgcn intrinsics.
+ let mayStore = ?;
let mayLoad = 1;
let has_sbase = 0;
let has_offset = 0;
}
-class SM_Inval_Pseudo <string opName, SDPatternOperator node> : SM_Pseudo<
+class SM_Inval_Pseudo <string opName, SDPatternOperator node = null_frag> : SM_Pseudo<
opName, (outs), (ins), "", [(node)]> {
let hasSideEffects = 1;
let mayStore = 1;
@@ -178,6 +189,16 @@ multiclass SM_Pseudo_Probe<string opName, RegisterClass baseClass> {
def _SGPR : SM_Probe_Pseudo <opName, (ins i8imm:$sdata, baseClass:$sbase, SReg_32:$offset), 0>;
}
+class SM_WaveId_Pseudo<string opName, SDPatternOperator node> : SM_Pseudo<
+ opName, (outs SReg_32_XM0_XEXEC:$sdst), (ins),
+ " $sdst", [(set i32:$sdst, (node))]> {
+ let hasSideEffects = 1;
+ let mayStore = 0;
+ let mayLoad = 1;
+ let has_sbase = 0;
+ let has_offset = 0;
+}
+
//===----------------------------------------------------------------------===//
// Scalar Atomic Memory Classes
//===----------------------------------------------------------------------===//
@@ -191,6 +212,7 @@ class SM_Atomic_Pseudo <string opName,
let mayLoad = 1;
let mayStore = 1;
let has_glc = 1;
+ let has_dlc = 1;
// Should these be set?
let ScalarStore = 1;
@@ -206,9 +228,9 @@ class SM_Pseudo_Atomic<string opName,
SM_Atomic_Pseudo<opName,
!if(isRet, (outs dataClass:$sdst), (outs)),
!if(isImm,
- (ins dataClass:$sdata, baseClass:$sbase, smrd_offset_20:$offset),
- (ins dataClass:$sdata, baseClass:$sbase, SReg_32:$offset)),
- !if(isRet, " $sdst", " $sdata") # ", $sbase, $offset" # !if(isRet, " glc", ""),
+ (ins dataClass:$sdata, baseClass:$sbase, smrd_offset_20:$offset, DLC:$dlc),
+ (ins dataClass:$sdata, baseClass:$sbase, SReg_32:$offset, DLC:$dlc)),
+ !if(isRet, " $sdst", " $sdata") # ", $sbase, $offset" # !if(isRet, " glc", "") # "$dlc",
isRet> {
let offset_is_imm = isImm;
let PseudoInstr = opName # !if(isImm,
@@ -266,6 +288,7 @@ defm S_BUFFER_LOAD_DWORDX16 : SM_Pseudo_Loads <
"s_buffer_load_dwordx16", SReg_128, SReg_512
>;
+let SubtargetPredicate = HasScalarStores in {
defm S_STORE_DWORD : SM_Pseudo_Stores <"s_store_dword", SReg_64, SReg_32_XM0_XEXEC>;
defm S_STORE_DWORDX2 : SM_Pseudo_Stores <"s_store_dwordx2", SReg_64, SReg_64_XEXEC>;
defm S_STORE_DWORDX4 : SM_Pseudo_Stores <"s_store_dwordx4", SReg_64, SReg_128>;
@@ -281,25 +304,32 @@ defm S_BUFFER_STORE_DWORDX2 : SM_Pseudo_Stores <
defm S_BUFFER_STORE_DWORDX4 : SM_Pseudo_Stores <
"s_buffer_store_dwordx4", SReg_128, SReg_128
>;
-
+} // End SubtargetPredicate = HasScalarStores
def S_MEMTIME : SM_Time_Pseudo <"s_memtime", int_amdgcn_s_memtime>;
def S_DCACHE_INV : SM_Inval_Pseudo <"s_dcache_inv", int_amdgcn_s_dcache_inv>;
-let SubtargetPredicate = isCIVI in {
+let SubtargetPredicate = isGFX7GFX8GFX9 in {
def S_DCACHE_INV_VOL : SM_Inval_Pseudo <"s_dcache_inv_vol", int_amdgcn_s_dcache_inv_vol>;
-} // let SubtargetPredicate = isCIVI
+} // let SubtargetPredicate = isGFX7GFX8GFX9
-let SubtargetPredicate = isVI in {
+let SubtargetPredicate = isGFX8Plus in {
+let OtherPredicates = [HasScalarStores] in {
def S_DCACHE_WB : SM_Inval_Pseudo <"s_dcache_wb", int_amdgcn_s_dcache_wb>;
def S_DCACHE_WB_VOL : SM_Inval_Pseudo <"s_dcache_wb_vol", int_amdgcn_s_dcache_wb_vol>;
+} // End OtherPredicates = [HasScalarStores]
def S_MEMREALTIME : SM_Time_Pseudo <"s_memrealtime", int_amdgcn_s_memrealtime>;
defm S_ATC_PROBE : SM_Pseudo_Probe <"s_atc_probe", SReg_64>;
defm S_ATC_PROBE_BUFFER : SM_Pseudo_Probe <"s_atc_probe_buffer", SReg_128>;
-} // SubtargetPredicate = isVI
+} // SubtargetPredicate = isGFX8Plus
+
+let SubtargetPredicate = isGFX10Plus in {
+def S_GL1_INV : SM_Inval_Pseudo<"s_gl1_inv">;
+def S_GET_WAVEID_IN_WORKGROUP : SM_WaveId_Pseudo <"s_get_waveid_in_workgroup", int_amdgcn_s_get_waveid_in_workgroup>;
+} // End SubtargetPredicate = isGFX10Plus
-let SubtargetPredicate = HasFlatScratchInsts, Uses = [FLAT_SCR] in {
+let SubtargetPredicate = HasScalarFlatScratchInsts, Uses = [FLAT_SCR] in {
defm S_SCRATCH_LOAD_DWORD : SM_Pseudo_Loads <"s_scratch_load_dword", SReg_64, SReg_32_XM0_XEXEC>;
defm S_SCRATCH_LOAD_DWORDX2 : SM_Pseudo_Loads <"s_scratch_load_dwordx2", SReg_64, SReg_64_XEXEC>;
defm S_SCRATCH_LOAD_DWORDX4 : SM_Pseudo_Loads <"s_scratch_load_dwordx4", SReg_64, SReg_128>;
@@ -307,7 +337,7 @@ defm S_SCRATCH_LOAD_DWORDX4 : SM_Pseudo_Loads <"s_scratch_load_dwordx4", SReg_6
defm S_SCRATCH_STORE_DWORD : SM_Pseudo_Stores <"s_scratch_store_dword", SReg_64, SReg_32_XM0_XEXEC>;
defm S_SCRATCH_STORE_DWORDX2 : SM_Pseudo_Stores <"s_scratch_store_dwordx2", SReg_64, SReg_64_XEXEC>;
defm S_SCRATCH_STORE_DWORDX4 : SM_Pseudo_Stores <"s_scratch_store_dwordx4", SReg_64, SReg_128>;
-} // SubtargetPredicate = HasFlatScratchInsts
+} // SubtargetPredicate = HasScalarFlatScratchInsts
let SubtargetPredicate = HasScalarAtomics in {
@@ -369,7 +399,7 @@ defm S_ATOMIC_DEC_X2 : SM_Pseudo_Atomics <"s_atomic_dec_x2", SReg_6
} // let SubtargetPredicate = HasScalarAtomics
-let SubtargetPredicate = isGFX9 in {
+let SubtargetPredicate = HasScalarAtomics in {
defm S_DCACHE_DISCARD : SM_Pseudo_Discards <"s_dcache_discard">;
defm S_DCACHE_DISCARD_X2 : SM_Pseudo_Discards <"s_dcache_discard_x2">;
}
@@ -387,8 +417,8 @@ class SMRD_Real_si <bits<5> op, SM_Pseudo ps>
, SIMCInstr<ps.PseudoInstr, SIEncodingFamily.SI>
, Enc32 {
- let AssemblerPredicates = [isSICI];
- let DecoderNamespace = "SICI";
+ let AssemblerPredicates = [isGFX6GFX7];
+ let DecoderNamespace = "GFX6GFX7";
let Inst{7-0} = !if(ps.has_offset, offset{7-0}, ?);
let Inst{8} = imm;
@@ -405,13 +435,13 @@ multiclass SM_Real_Loads_si<bits<5> op, string ps,
SM_Load_Pseudo sgprPs = !cast<SM_Load_Pseudo>(ps#_SGPR)> {
def _IMM_si : SMRD_Real_si <op, immPs> {
- let InOperandList = (ins immPs.BaseClass:$sbase, smrd_offset_8:$offset, GLC:$glc);
+ let InOperandList = (ins immPs.BaseClass:$sbase, smrd_offset_8:$offset, GLC:$glc, DLC:$dlc);
}
// FIXME: The operand name $offset is inconsistent with $soff used
// in the pseudo
def _SGPR_si : SMRD_Real_si <op, sgprPs> {
- let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc);
+ let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc, DLC:$dlc);
}
}
@@ -441,8 +471,8 @@ class SMEM_Real_vi <bits<8> op, SM_Pseudo ps>
, Enc64 {
bit glc;
- let AssemblerPredicates = [isVI];
- let DecoderNamespace = "VI";
+ let AssemblerPredicates = [isGFX8GFX9];
+ let DecoderNamespace = "GFX8";
let Inst{5-0} = !if(ps.has_sbase, sbase{6-1}, ?);
let Inst{12-6} = !if(ps.has_sdst, sdst{6-0}, ?);
@@ -458,10 +488,10 @@ multiclass SM_Real_Loads_vi<bits<8> op, string ps,
SM_Load_Pseudo immPs = !cast<SM_Load_Pseudo>(ps#_IMM),
SM_Load_Pseudo sgprPs = !cast<SM_Load_Pseudo>(ps#_SGPR)> {
def _IMM_vi : SMEM_Real_vi <op, immPs> {
- let InOperandList = (ins immPs.BaseClass:$sbase, smrd_offset_20:$offset, GLC:$glc);
+ let InOperandList = (ins immPs.BaseClass:$sbase, smrd_offset_20:$offset, GLC:$glc, DLC:$dlc);
}
def _SGPR_vi : SMEM_Real_vi <op, sgprPs> {
- let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc);
+ let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc, DLC:$dlc);
}
}
@@ -479,11 +509,11 @@ multiclass SM_Real_Stores_vi<bits<8> op, string ps,
// FIXME: The operand name $offset is inconsistent with $soff used
// in the pseudo
def _IMM_vi : SMEM_Real_Store_vi <op, immPs> {
- let InOperandList = (ins immPs.SrcClass:$sdata, immPs.BaseClass:$sbase, smrd_offset_20:$offset, GLC:$glc);
+ let InOperandList = (ins immPs.SrcClass:$sdata, immPs.BaseClass:$sbase, smrd_offset_20:$offset, GLC:$glc, DLC:$dlc);
}
def _SGPR_vi : SMEM_Real_Store_vi <op, sgprPs> {
- let InOperandList = (ins sgprPs.SrcClass:$sdata, sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc);
+ let InOperandList = (ins sgprPs.SrcClass:$sdata, sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc, DLC:$dlc);
}
}
@@ -630,9 +660,9 @@ class SMRD_Real_Load_IMM_ci <bits<5> op, SM_Load_Pseudo ps> :
SM_Real<ps>,
Enc64 {
- let AssemblerPredicates = [isCIOnly];
- let DecoderNamespace = "CI";
- let InOperandList = (ins ps.BaseClass:$sbase, smrd_literal_offset:$offset, GLC:$glc);
+ let AssemblerPredicates = [isGFX7Only];
+ let DecoderNamespace = "GFX7";
+ let InOperandList = (ins ps.BaseClass:$sbase, smrd_literal_offset:$offset, GLC:$glc, DLC:$dlc);
let LGKM_CNT = ps.LGKM_CNT;
let SMRD = ps.SMRD;
@@ -667,8 +697,8 @@ class SMRD_Real_ci <bits<5> op, SM_Pseudo ps>
, SIMCInstr<ps.PseudoInstr, SIEncodingFamily.SI>
, Enc32 {
- let AssemblerPredicates = [isCIOnly];
- let DecoderNamespace = "CI";
+ let AssemblerPredicates = [isGFX7Only];
+ let DecoderNamespace = "GFX7";
let Inst{7-0} = !if(ps.has_offset, offset{7-0}, ?);
let Inst{8} = imm;
@@ -684,7 +714,22 @@ def S_DCACHE_INV_VOL_ci : SMRD_Real_ci <0x1d, S_DCACHE_INV_VOL>;
// Scalar Memory Patterns
//===----------------------------------------------------------------------===//
-def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{ return isUniformLoad(N);}]>;
+def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{ return isUniformLoad(N);}]> {
+ let GISelPredicateCode = [{
+ if (!MI.hasOneMemOperand())
+ return false;
+ if (!isInstrUniform(MI))
+ return false;
+
+ // FIXME: We should probably be caching this.
+ SmallVector<GEPInfo, 4> AddrInfo;
+ getAddrModeInfo(MI, MRI, AddrInfo);
+
+ if (hasVgprParts(AddrInfo))
+ return false;
+ return true;
+ }];
+}
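+
+// The GISelPredicateCode above is roughly the GlobalISel counterpart of the
+// SelectionDAG-side isUniformLoad(N) check: it runs against the G_LOAD when
+// this PatFrag is matched, and it additionally rejects addresses with VGPR
+// parts since the scalar memory instructions require SGPR addresses.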
def SMRDImm : ComplexPattern<i64, 2, "SelectSMRDImm">;
def SMRDImm32 : ComplexPattern<i64, 2, "SelectSMRDImm32">;
@@ -697,41 +742,49 @@ multiclass SMRD_Pattern <string Instr, ValueType vt> {
// 1. IMM offset
def : GCNPat <
(smrd_load (SMRDImm i64:$sbase, i32:$offset)),
- (vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset, 0))
+ (vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset, 0, 0))
>;
// 2. 32-bit IMM offset on CI
def : GCNPat <
(smrd_load (SMRDImm32 i64:$sbase, i32:$offset)),
- (vt (!cast<InstSI>(Instr#"_IMM_ci") $sbase, $offset, 0))> {
- let OtherPredicates = [isCIOnly];
+ (vt (!cast<InstSI>(Instr#"_IMM_ci") $sbase, $offset, 0, 0))> {
+ let OtherPredicates = [isGFX7Only];
}
// 3. SGPR offset
def : GCNPat <
(smrd_load (SMRDSgpr i64:$sbase, i32:$offset)),
- (vt (!cast<SM_Pseudo>(Instr#"_SGPR") $sbase, $offset, 0))
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR") $sbase, $offset, 0, 0))
+ >;
+
+ // 4. No offset
+ def : GCNPat <
+ (vt (smrd_load (i64 SReg_64:$sbase))),
+ (vt (!cast<SM_Pseudo>(Instr#"_IMM") i64:$sbase, 0, 0, 0))
>;
}
multiclass SMLoad_Pattern <string Instr, ValueType vt> {
// 1. Offset as an immediate
def : GCNPat <
- (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm i32:$offset), i1:$glc),
- (vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset, (as_i1imm $glc)))
+ (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm i32:$offset), i1:$glc, i1:$dlc),
+ (vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset, (as_i1imm $glc),
+ (as_i1imm $dlc)))
>;
// 2. 32-bit IMM offset on CI
def : GCNPat <
- (vt (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm32 i32:$offset), i1:$glc)),
- (!cast<InstSI>(Instr#"_IMM_ci") $sbase, $offset, (as_i1imm $glc))> {
- let OtherPredicates = [isCIOnly];
+ (vt (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm32 i32:$offset), i1:$glc, i1:$dlc)),
+ (!cast<InstSI>(Instr#"_IMM_ci") $sbase, $offset, (as_i1imm $glc), (as_i1imm $dlc))> {
+ let OtherPredicates = [isGFX7Only];
}
// 3. Offset loaded in an 32bit SGPR
def : GCNPat <
- (SIsbuffer_load v4i32:$sbase, i32:$offset, i1:$glc),
- (vt (!cast<SM_Pseudo>(Instr#"_SGPR") $sbase, $offset, (as_i1imm $glc)))
+ (SIsbuffer_load v4i32:$sbase, i32:$offset, i1:$glc, i1:$dlc),
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR") $sbase, $offset, (as_i1imm $glc),
+ (as_i1imm $dlc)))
>;
}
@@ -759,18 +812,202 @@ defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX8", v8f32>;
defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX16", v16f32>;
} // End let AddedComplexity = 100
-let OtherPredicates = [isSICI] in {
def : GCNPat <
(i64 (readcyclecounter)),
(S_MEMTIME)
>;
+
+//===----------------------------------------------------------------------===//
+// GFX10.
+//===----------------------------------------------------------------------===//
+
+class SMEM_Real_gfx10<bits<8> op, SM_Pseudo ps> :
+ SM_Real<ps>, SIMCInstr<ps.PseudoInstr, SIEncodingFamily.GFX10>, Enc64 {
+ bit glc;
+ bit dlc;
+
+ let AssemblerPredicates = [isGFX10Plus];
+ let DecoderNamespace = "GFX10";
+
+ let Inst{5-0} = !if(ps.has_sbase, sbase{6-1}, ?);
+ let Inst{12-6} = !if(ps.has_sdst, sdst{6-0}, ?);
+ let Inst{14} = !if(ps.has_dlc, dlc, ?);
+ let Inst{16} = !if(ps.has_glc, glc, ?);
+ let Inst{25-18} = op;
+ let Inst{31-26} = 0x3d;
+ let Inst{51-32} = !if(ps.offset_is_imm, !if(ps.has_offset, offset{19-0}, ?), ?);
+ let Inst{63-57} = !if(ps.offset_is_imm, !cast<int>(SGPR_NULL.HWEncoding),
+ !if(ps.has_offset, offset{6-0}, ?));
}
-let OtherPredicates = [isVI] in {
+multiclass SM_Real_Loads_gfx10<bits<8> op, string ps,
+ SM_Load_Pseudo immPs = !cast<SM_Load_Pseudo>(ps#_IMM),
+ SM_Load_Pseudo sgprPs = !cast<SM_Load_Pseudo>(ps#_SGPR)> {
+ def _IMM_gfx10 : SMEM_Real_gfx10<op, immPs> {
+ let InOperandList = (ins immPs.BaseClass:$sbase, smrd_offset_20:$offset, GLC:$glc, DLC:$dlc);
+ }
+ def _SGPR_gfx10 : SMEM_Real_gfx10<op, sgprPs> {
+ let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc, DLC:$dlc);
+ }
+}
-def : GCNPat <
- (i64 (readcyclecounter)),
- (S_MEMREALTIME)
->;
+class SMEM_Real_Store_gfx10<bits<8> op, SM_Pseudo ps> : SMEM_Real_gfx10<op, ps> {
+ bits<7> sdata;
+
+ let sdst = ?;
+ let Inst{12-6} = !if(ps.has_sdst, sdata{6-0}, ?);
+}
+
+multiclass SM_Real_Stores_gfx10<bits<8> op, string ps,
+ SM_Store_Pseudo immPs = !cast<SM_Store_Pseudo>(ps#_IMM),
+ SM_Store_Pseudo sgprPs = !cast<SM_Store_Pseudo>(ps#_SGPR)> {
+ // FIXME: The operand name $offset is inconsistent with $soff used
+ // in the pseudo
+ def _IMM_gfx10 : SMEM_Real_Store_gfx10 <op, immPs> {
+ let InOperandList = (ins immPs.SrcClass:$sdata, immPs.BaseClass:$sbase, smrd_offset_20:$offset, GLC:$glc, DLC:$dlc);
+ }
+
+ def _SGPR_gfx10 : SMEM_Real_Store_gfx10 <op, sgprPs> {
+ let InOperandList = (ins sgprPs.SrcClass:$sdata, sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc, DLC:$dlc);
+ }
+}
+
+defm S_LOAD_DWORD : SM_Real_Loads_gfx10<0x000, "S_LOAD_DWORD">;
+defm S_LOAD_DWORDX2 : SM_Real_Loads_gfx10<0x001, "S_LOAD_DWORDX2">;
+defm S_LOAD_DWORDX4 : SM_Real_Loads_gfx10<0x002, "S_LOAD_DWORDX4">;
+defm S_LOAD_DWORDX8 : SM_Real_Loads_gfx10<0x003, "S_LOAD_DWORDX8">;
+defm S_LOAD_DWORDX16 : SM_Real_Loads_gfx10<0x004, "S_LOAD_DWORDX16">;
+
+let SubtargetPredicate = HasScalarFlatScratchInsts in {
+defm S_SCRATCH_LOAD_DWORD : SM_Real_Loads_gfx10<0x005, "S_SCRATCH_LOAD_DWORD">;
+defm S_SCRATCH_LOAD_DWORDX2 : SM_Real_Loads_gfx10<0x006, "S_SCRATCH_LOAD_DWORDX2">;
+defm S_SCRATCH_LOAD_DWORDX4 : SM_Real_Loads_gfx10<0x007, "S_SCRATCH_LOAD_DWORDX4">;
+} // End SubtargetPredicate = HasScalarFlatScratchInsts
+
+defm S_BUFFER_LOAD_DWORD : SM_Real_Loads_gfx10<0x008, "S_BUFFER_LOAD_DWORD">;
+defm S_BUFFER_LOAD_DWORDX2 : SM_Real_Loads_gfx10<0x009, "S_BUFFER_LOAD_DWORDX2">;
+defm S_BUFFER_LOAD_DWORDX4 : SM_Real_Loads_gfx10<0x00a, "S_BUFFER_LOAD_DWORDX4">;
+defm S_BUFFER_LOAD_DWORDX8 : SM_Real_Loads_gfx10<0x00b, "S_BUFFER_LOAD_DWORDX8">;
+defm S_BUFFER_LOAD_DWORDX16 : SM_Real_Loads_gfx10<0x00c, "S_BUFFER_LOAD_DWORDX16">;
+
+let SubtargetPredicate = HasScalarStores in {
+defm S_STORE_DWORD : SM_Real_Stores_gfx10<0x010, "S_STORE_DWORD">;
+defm S_STORE_DWORDX2 : SM_Real_Stores_gfx10<0x011, "S_STORE_DWORDX2">;
+defm S_STORE_DWORDX4 : SM_Real_Stores_gfx10<0x012, "S_STORE_DWORDX4">;
+let OtherPredicates = [HasScalarFlatScratchInsts] in {
+defm S_SCRATCH_STORE_DWORD : SM_Real_Stores_gfx10<0x015, "S_SCRATCH_STORE_DWORD">;
+defm S_SCRATCH_STORE_DWORDX2 : SM_Real_Stores_gfx10<0x016, "S_SCRATCH_STORE_DWORDX2">;
+defm S_SCRATCH_STORE_DWORDX4 : SM_Real_Stores_gfx10<0x017, "S_SCRATCH_STORE_DWORDX4">;
+} // End OtherPredicates = [HasScalarFlatScratchInsts]
+defm S_BUFFER_STORE_DWORD : SM_Real_Stores_gfx10<0x018, "S_BUFFER_STORE_DWORD">;
+defm S_BUFFER_STORE_DWORDX2 : SM_Real_Stores_gfx10<0x019, "S_BUFFER_STORE_DWORDX2">;
+defm S_BUFFER_STORE_DWORDX4 : SM_Real_Stores_gfx10<0x01a, "S_BUFFER_STORE_DWORDX4">;
+} // End SubtargetPredicate = HasScalarStores
+
+def S_MEMREALTIME_gfx10 : SMEM_Real_gfx10<0x025, S_MEMREALTIME>;
+def S_MEMTIME_gfx10 : SMEM_Real_gfx10<0x024, S_MEMTIME>;
+def S_GL1_INV_gfx10 : SMEM_Real_gfx10<0x01f, S_GL1_INV>;
+def S_GET_WAVEID_IN_WORKGROUP_gfx10 : SMEM_Real_gfx10<0x02a, S_GET_WAVEID_IN_WORKGROUP>;
+def S_DCACHE_INV_gfx10 : SMEM_Real_gfx10<0x020, S_DCACHE_INV>;
+
+let SubtargetPredicate = HasScalarStores in {
+def S_DCACHE_WB_gfx10 : SMEM_Real_gfx10<0x021, S_DCACHE_WB>;
+} // End SubtargetPredicate = HasScalarStores
+
+multiclass SM_Real_Probe_gfx10<bits<8> op, string ps> {
+ def _IMM_gfx10 : SMEM_Real_Store_gfx10 <op, !cast<SM_Pseudo>(ps#_IMM)>;
+ def _SGPR_gfx10 : SMEM_Real_Store_gfx10 <op, !cast<SM_Pseudo>(ps#_SGPR)>;
+}
+
+defm S_ATC_PROBE : SM_Real_Probe_gfx10 <0x26, "S_ATC_PROBE">;
+defm S_ATC_PROBE_BUFFER : SM_Real_Probe_gfx10 <0x27, "S_ATC_PROBE_BUFFER">;
+
+class SMEM_Atomic_Real_gfx10 <bits<8> op, SM_Atomic_Pseudo ps>
+ : SMEM_Real_gfx10 <op, ps> {
+
+ bits<7> sdata;
+ bit dlc;
+
+ let Constraints = ps.Constraints;
+ let DisableEncoding = ps.DisableEncoding;
+
+ let glc = ps.glc;
+
+ let Inst{14} = !if(ps.has_dlc, dlc, 0);
+ let Inst{12-6} = !if(glc, sdst{6-0}, sdata{6-0});
+}
+
+multiclass SM_Real_Atomics_gfx10<bits<8> op, string ps> {
+ def _IMM_gfx10 : SMEM_Atomic_Real_gfx10 <op, !cast<SM_Atomic_Pseudo>(ps#_IMM)>;
+ def _SGPR_gfx10 : SMEM_Atomic_Real_gfx10 <op, !cast<SM_Atomic_Pseudo>(ps#_SGPR)>;
+ def _IMM_RTN_gfx10 : SMEM_Atomic_Real_gfx10 <op, !cast<SM_Atomic_Pseudo>(ps#_IMM_RTN)>;
+ def _SGPR_RTN_gfx10 : SMEM_Atomic_Real_gfx10 <op, !cast<SM_Atomic_Pseudo>(ps#_SGPR_RTN)>;
+}
+
+let SubtargetPredicate = HasScalarAtomics in {
-} // let OtherPredicates = [isVI]
+defm S_BUFFER_ATOMIC_SWAP : SM_Real_Atomics_gfx10 <0x40, "S_BUFFER_ATOMIC_SWAP">;
+defm S_BUFFER_ATOMIC_CMPSWAP : SM_Real_Atomics_gfx10 <0x41, "S_BUFFER_ATOMIC_CMPSWAP">;
+defm S_BUFFER_ATOMIC_ADD : SM_Real_Atomics_gfx10 <0x42, "S_BUFFER_ATOMIC_ADD">;
+defm S_BUFFER_ATOMIC_SUB : SM_Real_Atomics_gfx10 <0x43, "S_BUFFER_ATOMIC_SUB">;
+defm S_BUFFER_ATOMIC_SMIN : SM_Real_Atomics_gfx10 <0x44, "S_BUFFER_ATOMIC_SMIN">;
+defm S_BUFFER_ATOMIC_UMIN : SM_Real_Atomics_gfx10 <0x45, "S_BUFFER_ATOMIC_UMIN">;
+defm S_BUFFER_ATOMIC_SMAX : SM_Real_Atomics_gfx10 <0x46, "S_BUFFER_ATOMIC_SMAX">;
+defm S_BUFFER_ATOMIC_UMAX : SM_Real_Atomics_gfx10 <0x47, "S_BUFFER_ATOMIC_UMAX">;
+defm S_BUFFER_ATOMIC_AND : SM_Real_Atomics_gfx10 <0x48, "S_BUFFER_ATOMIC_AND">;
+defm S_BUFFER_ATOMIC_OR : SM_Real_Atomics_gfx10 <0x49, "S_BUFFER_ATOMIC_OR">;
+defm S_BUFFER_ATOMIC_XOR : SM_Real_Atomics_gfx10 <0x4a, "S_BUFFER_ATOMIC_XOR">;
+defm S_BUFFER_ATOMIC_INC : SM_Real_Atomics_gfx10 <0x4b, "S_BUFFER_ATOMIC_INC">;
+defm S_BUFFER_ATOMIC_DEC : SM_Real_Atomics_gfx10 <0x4c, "S_BUFFER_ATOMIC_DEC">;
+
+defm S_BUFFER_ATOMIC_SWAP_X2 : SM_Real_Atomics_gfx10 <0x60, "S_BUFFER_ATOMIC_SWAP_X2">;
+defm S_BUFFER_ATOMIC_CMPSWAP_X2 : SM_Real_Atomics_gfx10 <0x61, "S_BUFFER_ATOMIC_CMPSWAP_X2">;
+defm S_BUFFER_ATOMIC_ADD_X2 : SM_Real_Atomics_gfx10 <0x62, "S_BUFFER_ATOMIC_ADD_X2">;
+defm S_BUFFER_ATOMIC_SUB_X2 : SM_Real_Atomics_gfx10 <0x63, "S_BUFFER_ATOMIC_SUB_X2">;
+defm S_BUFFER_ATOMIC_SMIN_X2 : SM_Real_Atomics_gfx10 <0x64, "S_BUFFER_ATOMIC_SMIN_X2">;
+defm S_BUFFER_ATOMIC_UMIN_X2 : SM_Real_Atomics_gfx10 <0x65, "S_BUFFER_ATOMIC_UMIN_X2">;
+defm S_BUFFER_ATOMIC_SMAX_X2 : SM_Real_Atomics_gfx10 <0x66, "S_BUFFER_ATOMIC_SMAX_X2">;
+defm S_BUFFER_ATOMIC_UMAX_X2 : SM_Real_Atomics_gfx10 <0x67, "S_BUFFER_ATOMIC_UMAX_X2">;
+defm S_BUFFER_ATOMIC_AND_X2 : SM_Real_Atomics_gfx10 <0x68, "S_BUFFER_ATOMIC_AND_X2">;
+defm S_BUFFER_ATOMIC_OR_X2 : SM_Real_Atomics_gfx10 <0x69, "S_BUFFER_ATOMIC_OR_X2">;
+defm S_BUFFER_ATOMIC_XOR_X2 : SM_Real_Atomics_gfx10 <0x6a, "S_BUFFER_ATOMIC_XOR_X2">;
+defm S_BUFFER_ATOMIC_INC_X2 : SM_Real_Atomics_gfx10 <0x6b, "S_BUFFER_ATOMIC_INC_X2">;
+defm S_BUFFER_ATOMIC_DEC_X2 : SM_Real_Atomics_gfx10 <0x6c, "S_BUFFER_ATOMIC_DEC_X2">;
+
+defm S_ATOMIC_SWAP : SM_Real_Atomics_gfx10 <0x80, "S_ATOMIC_SWAP">;
+defm S_ATOMIC_CMPSWAP : SM_Real_Atomics_gfx10 <0x81, "S_ATOMIC_CMPSWAP">;
+defm S_ATOMIC_ADD : SM_Real_Atomics_gfx10 <0x82, "S_ATOMIC_ADD">;
+defm S_ATOMIC_SUB : SM_Real_Atomics_gfx10 <0x83, "S_ATOMIC_SUB">;
+defm S_ATOMIC_SMIN : SM_Real_Atomics_gfx10 <0x84, "S_ATOMIC_SMIN">;
+defm S_ATOMIC_UMIN : SM_Real_Atomics_gfx10 <0x85, "S_ATOMIC_UMIN">;
+defm S_ATOMIC_SMAX : SM_Real_Atomics_gfx10 <0x86, "S_ATOMIC_SMAX">;
+defm S_ATOMIC_UMAX : SM_Real_Atomics_gfx10 <0x87, "S_ATOMIC_UMAX">;
+defm S_ATOMIC_AND : SM_Real_Atomics_gfx10 <0x88, "S_ATOMIC_AND">;
+defm S_ATOMIC_OR : SM_Real_Atomics_gfx10 <0x89, "S_ATOMIC_OR">;
+defm S_ATOMIC_XOR : SM_Real_Atomics_gfx10 <0x8a, "S_ATOMIC_XOR">;
+defm S_ATOMIC_INC : SM_Real_Atomics_gfx10 <0x8b, "S_ATOMIC_INC">;
+defm S_ATOMIC_DEC : SM_Real_Atomics_gfx10 <0x8c, "S_ATOMIC_DEC">;
+
+defm S_ATOMIC_SWAP_X2 : SM_Real_Atomics_gfx10 <0xa0, "S_ATOMIC_SWAP_X2">;
+defm S_ATOMIC_CMPSWAP_X2 : SM_Real_Atomics_gfx10 <0xa1, "S_ATOMIC_CMPSWAP_X2">;
+defm S_ATOMIC_ADD_X2 : SM_Real_Atomics_gfx10 <0xa2, "S_ATOMIC_ADD_X2">;
+defm S_ATOMIC_SUB_X2 : SM_Real_Atomics_gfx10 <0xa3, "S_ATOMIC_SUB_X2">;
+defm S_ATOMIC_SMIN_X2 : SM_Real_Atomics_gfx10 <0xa4, "S_ATOMIC_SMIN_X2">;
+defm S_ATOMIC_UMIN_X2 : SM_Real_Atomics_gfx10 <0xa5, "S_ATOMIC_UMIN_X2">;
+defm S_ATOMIC_SMAX_X2 : SM_Real_Atomics_gfx10 <0xa6, "S_ATOMIC_SMAX_X2">;
+defm S_ATOMIC_UMAX_X2 : SM_Real_Atomics_gfx10 <0xa7, "S_ATOMIC_UMAX_X2">;
+defm S_ATOMIC_AND_X2 : SM_Real_Atomics_gfx10 <0xa8, "S_ATOMIC_AND_X2">;
+defm S_ATOMIC_OR_X2 : SM_Real_Atomics_gfx10 <0xa9, "S_ATOMIC_OR_X2">;
+defm S_ATOMIC_XOR_X2 : SM_Real_Atomics_gfx10 <0xaa, "S_ATOMIC_XOR_X2">;
+defm S_ATOMIC_INC_X2 : SM_Real_Atomics_gfx10 <0xab, "S_ATOMIC_INC_X2">;
+defm S_ATOMIC_DEC_X2 : SM_Real_Atomics_gfx10 <0xac, "S_ATOMIC_DEC_X2">;
+
+multiclass SM_Real_Discard_gfx10<bits<8> op, string ps> {
+ def _IMM_gfx10 : SMEM_Real_gfx10 <op, !cast<SM_Pseudo>(ps#_IMM)>;
+ def _SGPR_gfx10 : SMEM_Real_gfx10 <op, !cast<SM_Pseudo>(ps#_SGPR)>;
+}
+
+defm S_DCACHE_DISCARD : SM_Real_Discard_gfx10 <0x28, "S_DCACHE_DISCARD">;
+defm S_DCACHE_DISCARD_X2 : SM_Real_Discard_gfx10 <0x29, "S_DCACHE_DISCARD_X2">;
+
+} // End SubtargetPredicate = HasScalarAtomics
diff --git a/lib/Target/AMDGPU/SOPInstructions.td b/lib/Target/AMDGPU/SOPInstructions.td
index ca5e981ac5c2..dfafdccc05a3 100644
--- a/lib/Target/AMDGPU/SOPInstructions.td
+++ b/lib/Target/AMDGPU/SOPInstructions.td
@@ -1,15 +1,15 @@
//===-- SOPInstructions.td - SOP Instruction Definitions ------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
def GPRIdxModeMatchClass : AsmOperandClass {
let Name = "GPRIdxMode";
let PredicateMethod = "isGPRIdxMode";
+ let ParserMethod = "parseGPRIdxMode";
let RenderMethod = "addImmOperands";
}
@@ -26,7 +26,6 @@ class SOP_Pseudo<string opName, dag outs, dag ins, string asmOps,
let isPseudo = 1;
let isCodeGenOnly = 1;
- let SubtargetPredicate = isGCN;
string Mnemonic = opName;
string AsmOperands = asmOps;
@@ -78,10 +77,13 @@ class SOP1_Real<bits<8> op, SOP1_Pseudo ps> :
let Inst{31-23} = 0x17d; //encoding;
}
-class SOP1_32 <string opName, list<dag> pattern=[]> : SOP1_Pseudo <
- opName, (outs SReg_32:$sdst), (ins SSrc_b32:$src0),
- "$sdst, $src0", pattern
->;
+class SOP1_32 <string opName, list<dag> pattern=[], bit tied_in = 0> : SOP1_Pseudo <
+ opName, (outs SReg_32:$sdst),
+ !if(tied_in, (ins SSrc_b32:$src0, SReg_32:$sdst_in),
+ (ins SSrc_b32:$src0)),
+ "$sdst, $src0", pattern> {
+ let Constraints = !if(tied_in, "$sdst = $sdst_in", "");
+}
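+
+// Passing tied_in = 1 gives the instruction an extra $sdst_in source that
+// is constrained to the destination, i.e. a read-modify-write form; the
+// S_BITSET0/S_BITSET1 definitions below use it.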
// 32-bit input, no output.
class SOP1_0_32 <string opName, list<dag> pattern = []> : SOP1_Pseudo <
@@ -108,10 +110,13 @@ class SOP1_32_64 <string opName, list<dag> pattern=[]> : SOP1_Pseudo <
>;
// 32-bit input, 64-bit output.
-class SOP1_64_32 <string opName, list<dag> pattern=[]> : SOP1_Pseudo <
- opName, (outs SReg_64:$sdst), (ins SSrc_b32:$src0),
- "$sdst, $src0", pattern
->;
+class SOP1_64_32 <string opName, list<dag> pattern=[], bit tied_in = 0> : SOP1_Pseudo <
+ opName, (outs SReg_64:$sdst),
+ !if(tied_in, (ins SSrc_b32:$src0, SReg_64:$sdst_in),
+ (ins SSrc_b32:$src0)),
+ "$sdst, $src0", pattern> {
+ let Constraints = !if(tied_in, "$sdst = $sdst_in", "");
+}
// no input, 64-bit output.
class SOP1_64_0 <string opName, list<dag> pattern=[]> : SOP1_Pseudo <
@@ -120,8 +125,8 @@ class SOP1_64_0 <string opName, list<dag> pattern=[]> : SOP1_Pseudo <
}
// 64-bit input, no output
-class SOP1_1 <string opName, list<dag> pattern=[]> : SOP1_Pseudo <
- opName, (outs), (ins SReg_64:$src0), "$src0", pattern> {
+class SOP1_1 <string opName, RegisterClass rc = SReg_64, list<dag> pattern=[]> : SOP1_Pseudo <
+ opName, (outs), (ins rc:$src0), "$src0", pattern> {
let has_sdst = 0;
}
@@ -147,12 +152,24 @@ let Defs = [SCC] in {
[(set i64:$sdst, (not i64:$src0))]
>;
def S_WQM_B32 : SOP1_32 <"s_wqm_b32">;
- def S_WQM_B64 : SOP1_64 <"s_wqm_b64",
- [(set i1:$sdst, (int_amdgcn_wqm_vote i1:$src0))]
- >;
+ def S_WQM_B64 : SOP1_64 <"s_wqm_b64">;
} // End Defs = [SCC]
+let WaveSizePredicate = isWave32 in {
+def : GCNPat <
+ (int_amdgcn_wqm_vote i1:$src0),
+ (S_WQM_B32 $src0)
+>;
+}
+
+let WaveSizePredicate = isWave64 in {
+def : GCNPat <
+ (int_amdgcn_wqm_vote i1:$src0),
+ (S_WQM_B64 $src0)
+>;
+}
+
def S_BREV_B32 : SOP1_32 <"s_brev_b32",
[(set i32:$sdst, (bitreverse i32:$src0))]
>;
@@ -191,10 +208,10 @@ def S_SEXT_I32_I16 : SOP1_32 <"s_sext_i32_i16",
[(set i32:$sdst, (sext_inreg i32:$src0, i16))]
>;
-def S_BITSET0_B32 : SOP1_32 <"s_bitset0_b32">;
-def S_BITSET0_B64 : SOP1_64_32 <"s_bitset0_b64">;
-def S_BITSET1_B32 : SOP1_32 <"s_bitset1_b32">;
-def S_BITSET1_B64 : SOP1_64_32 <"s_bitset1_b64">;
+def S_BITSET0_B32 : SOP1_32 <"s_bitset0_b32", [], 1>;
+def S_BITSET0_B64 : SOP1_64_32 <"s_bitset0_b64", [], 1>;
+def S_BITSET1_B32 : SOP1_32 <"s_bitset1_b32", [], 1>;
+def S_BITSET1_B64 : SOP1_64_32 <"s_bitset1_b64", [], 1>;
def S_GETPC_B64 : SOP1_64_0 <"s_getpc_b64",
[(set i64:$sdst, (int_amdgcn_s_getpc))]
>;
@@ -207,7 +224,7 @@ def S_SETPC_B64 : SOP1_1 <"s_setpc_b64">;
let isReturn = 1 in {
// Define variant marked as return rather than branch.
-def S_SETPC_B64_return : SOP1_1<"", [(AMDGPUret_flag i64:$src0)]>;
+def S_SETPC_B64_return : SOP1_1<"", CCR_SGPR_64, [(AMDGPUret_flag i64:$src0)]>;
}
} // End isTerminator = 1, isBarrier = 1
@@ -241,8 +258,11 @@ def S_MOVRELD_B32 : SOP1_32 <"s_movreld_b32">;
def S_MOVRELD_B64 : SOP1_64 <"s_movreld_b64">;
} // End Uses = [M0]
+let SubtargetPredicate = isGFX6GFX7GFX8GFX9 in {
def S_CBRANCH_JOIN : SOP1_0_32R <"s_cbranch_join">;
def S_MOV_REGRD_B32 : SOP1_32 <"s_mov_regrd_b32">;
+} // End SubtargetPredicate = isGFX6GFX7GFX8GFX9
+
let Defs = [SCC] in {
def S_ABS_I32 : SOP1_32 <"s_abs_i32">;
} // End Defs = [SCC]
@@ -255,7 +275,7 @@ def S_SET_GPR_IDX_IDX : SOP1_0_32<"s_set_gpr_idx_idx"> {
}
}
-let SubtargetPredicate = isGFX9 in {
+let SubtargetPredicate = isGFX9Plus in {
let hasSideEffects = 1, Defs = [EXEC, SCC], Uses = [EXEC] in {
def S_ANDN1_SAVEEXEC_B64 : SOP1_64<"s_andn1_saveexec_b64">;
def S_ORN1_SAVEEXEC_B64 : SOP1_64<"s_orn1_saveexec_b64">;
@@ -264,7 +284,28 @@ let SubtargetPredicate = isGFX9 in {
} // End hasSideEffects = 1, Defs = [EXEC, SCC], Uses = [EXEC]
def S_BITREPLICATE_B64_B32 : SOP1_64_32<"s_bitreplicate_b64_b32">;
-} // End SubtargetPredicate = isGFX9
+} // End SubtargetPredicate = isGFX9Plus
+
+let SubtargetPredicate = isGFX10Plus in {
+ let hasSideEffects = 1, Defs = [EXEC, SCC], Uses = [EXEC] in {
+ def S_AND_SAVEEXEC_B32 : SOP1_32<"s_and_saveexec_b32">;
+ def S_OR_SAVEEXEC_B32 : SOP1_32<"s_or_saveexec_b32">;
+ def S_XOR_SAVEEXEC_B32 : SOP1_32<"s_xor_saveexec_b32">;
+ def S_ANDN2_SAVEEXEC_B32 : SOP1_32<"s_andn2_saveexec_b32">;
+ def S_ORN2_SAVEEXEC_B32 : SOP1_32<"s_orn2_saveexec_b32">;
+ def S_NAND_SAVEEXEC_B32 : SOP1_32<"s_nand_saveexec_b32">;
+ def S_NOR_SAVEEXEC_B32 : SOP1_32<"s_nor_saveexec_b32">;
+ def S_XNOR_SAVEEXEC_B32 : SOP1_32<"s_xnor_saveexec_b32">;
+ def S_ANDN1_SAVEEXEC_B32 : SOP1_32<"s_andn1_saveexec_b32">;
+ def S_ORN1_SAVEEXEC_B32 : SOP1_32<"s_orn1_saveexec_b32">;
+ def S_ANDN1_WREXEC_B32 : SOP1_32<"s_andn1_wrexec_b32">;
+ def S_ANDN2_WREXEC_B32 : SOP1_32<"s_andn2_wrexec_b32">;
+ } // End hasSideEffects = 1, Defs = [EXEC, SCC], Uses = [EXEC]
+
+ let Uses = [M0] in {
+ def S_MOVRELSD_2_B32 : SOP1_32<"s_movrelsd_2_b32">;
+ } // End Uses = [M0]
+} // End SubtargetPredicate = isGFX10Plus
//===----------------------------------------------------------------------===//
// SOP2 Instructions
@@ -302,6 +343,8 @@ class SOP2_Real<bits<7> op, SOP_Pseudo ps> :
// copy relevant pseudo op flags
let SubtargetPredicate = ps.SubtargetPredicate;
let AsmMatchConverter = ps.AsmMatchConverter;
+ let UseNamedOperandTable = ps.UseNamedOperandTable;
+ let TSFlags = ps.TSFlags;
// encoding
bits<7> sdst;
@@ -468,22 +511,22 @@ let AddedComplexity = 1 in {
let Defs = [SCC] in {
// TODO: b64 versions require VOP3 change since v_lshlrev_b64 is VOP3
def S_LSHL_B32 : SOP2_32 <"s_lshl_b32",
- [(set i32:$sdst, (UniformBinFrag<shl> i32:$src0, i32:$src1))]
+ [(set SReg_32:$sdst, (shl (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]
>;
def S_LSHL_B64 : SOP2_64_32 <"s_lshl_b64",
- [(set i64:$sdst, (UniformBinFrag<shl> i64:$src0, i32:$src1))]
+ [(set SReg_64:$sdst, (shl (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))]
>;
def S_LSHR_B32 : SOP2_32 <"s_lshr_b32",
- [(set i32:$sdst, (UniformBinFrag<srl> i32:$src0, i32:$src1))]
+ [(set SReg_32:$sdst, (srl (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]
>;
def S_LSHR_B64 : SOP2_64_32 <"s_lshr_b64",
- [(set i64:$sdst, (UniformBinFrag<srl> i64:$src0, i32:$src1))]
+ [(set SReg_64:$sdst, (srl (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))]
>;
def S_ASHR_I32 : SOP2_32 <"s_ashr_i32",
- [(set i32:$sdst, (UniformBinFrag<sra> i32:$src0, i32:$src1))]
+ [(set SReg_32:$sdst, (sra (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]
>;
def S_ASHR_I64 : SOP2_64_32 <"s_ashr_i64",
- [(set i64:$sdst, (UniformBinFrag<sra> i64:$src0, i32:$src1))]
+ [(set SReg_64:$sdst, (sra (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))]
>;
} // End Defs = [SCC]
@@ -512,13 +555,14 @@ def S_CBRANCH_G_FORK : SOP2_Pseudo <
"$src0, $src1"
> {
let has_sdst = 0;
+ let SubtargetPredicate = isGFX6GFX7GFX8GFX9;
}
let Defs = [SCC] in {
def S_ABSDIFF_I32 : SOP2_32 <"s_absdiff_i32">;
} // End Defs = [SCC]
-let SubtargetPredicate = isVI in {
+let SubtargetPredicate = isGFX8GFX9 in {
def S_RFE_RESTORE_B64 : SOP2_Pseudo <
"s_rfe_restore_b64", (outs),
(ins SSrc_b64:$src0, SSrc_b32:$src1),
@@ -529,7 +573,7 @@ let SubtargetPredicate = isVI in {
}
}
-let SubtargetPredicate = isGFX9 in {
+let SubtargetPredicate = isGFX9Plus in {
def S_PACK_LL_B32_B16 : SOP2_32<"s_pack_ll_b32_b16">;
def S_PACK_LH_B32_B16 : SOP2_32<"s_pack_lh_b32_b16">;
def S_PACK_HH_B32_B16 : SOP2_32<"s_pack_hh_b32_b16">;
@@ -543,7 +587,7 @@ let SubtargetPredicate = isGFX9 in {
def S_MUL_HI_U32 : SOP2_32<"s_mul_hi_u32">;
def S_MUL_HI_I32 : SOP2_32<"s_mul_hi_i32">;
-}
+} // End SubtargetPredicate = isGFX9Plus
//===----------------------------------------------------------------------===//
// SOPK Instructions
@@ -555,7 +599,6 @@ class SOPK_Pseudo <string opName, dag outs, dag ins,
SIMCInstr<opName, SIEncodingFamily.NONE> {
let isPseudo = 1;
let isCodeGenOnly = 1;
- let SubtargetPredicate = isGCN;
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
@@ -618,6 +661,19 @@ class SOPK_32 <string opName, list<dag> pattern=[]> : SOPK_Pseudo <
"$sdst, $simm16",
pattern>;
+class SOPK_32_BR <string opName, list<dag> pattern=[]> : SOPK_Pseudo <
+ opName,
+ (outs),
+ (ins sopp_brtarget:$simm16, SReg_32:$sdst),
+ "$sdst, $simm16",
+ pattern> {
+ let Defs = [EXEC];
+ let Uses = [EXEC];
+ let isBranch = 1;
+ let isTerminator = 1;
+ let SchedRW = [WriteBranch];
+}
+
class SOPK_SCC <string opName, string base_op, bit isSignExt> : SOPK_Pseudo <
opName,
(outs),
@@ -684,9 +740,10 @@ let Defs = [SCC], isCommutable = 1, DisableEncoding = "$src0",
def S_MULK_I32 : SOPK_32TIE <"s_mulk_i32">;
}
+let SubtargetPredicate = isGFX6GFX7GFX8GFX9 in
def S_CBRANCH_I_FORK : SOPK_Pseudo <
"s_cbranch_i_fork",
- (outs), (ins SReg_64:$sdst, s16imm:$simm16),
+ (outs), (ins SReg_64:$sdst, sopp_brtarget:$simm16),
"$sdst, $simm16"
>;
@@ -720,15 +777,46 @@ def S_SETREG_IMM32_B32 : SOPK_Pseudo <
} // End hasSideEffects = 1
-let SubtargetPredicate = isGFX9 in {
+class SOPK_WAITCNT<string opName, list<dag> pat=[]> :
+ SOPK_Pseudo<
+ opName,
+ (outs),
+ (ins SReg_32:$sdst, s16imm:$simm16),
+ "$sdst, $simm16",
+ pat> {
+ let hasSideEffects = 1;
+ let mayLoad = 1;
+ let mayStore = 1;
+  let has_sdst = 1; // First source takes the place of sdst in the encoding
+}
+
+let SubtargetPredicate = isGFX9Plus in {
def S_CALL_B64 : SOPK_Pseudo<
"s_call_b64",
(outs SReg_64:$sdst),
- (ins s16imm:$simm16),
+ (ins sopp_brtarget:$simm16),
"$sdst, $simm16"> {
let isCall = 1;
}
-}
+} // End SubtargetPredicate = isGFX9Plus
+
+let SubtargetPredicate = isGFX10Plus in {
+ def S_VERSION : SOPK_Pseudo<
+ "s_version",
+ (outs),
+ (ins s16imm:$simm16),
+ "$simm16"> {
+ let has_sdst = 0;
+ }
+
+ def S_SUBVECTOR_LOOP_BEGIN : SOPK_32_BR<"s_subvector_loop_begin">;
+ def S_SUBVECTOR_LOOP_END : SOPK_32_BR<"s_subvector_loop_end">;
+
+ def S_WAITCNT_VSCNT : SOPK_WAITCNT<"s_waitcnt_vscnt">;
+ def S_WAITCNT_VMCNT : SOPK_WAITCNT<"s_waitcnt_vmcnt">;
+ def S_WAITCNT_EXPCNT : SOPK_WAITCNT<"s_waitcnt_expcnt">;
+ def S_WAITCNT_LGKMCNT : SOPK_WAITCNT<"s_waitcnt_lgkmcnt">;
+} // End SubtargetPredicate = isGFX10Plus
//===----------------------------------------------------------------------===//
// SOPC Instructions
@@ -756,7 +844,6 @@ class SOPC <bits<7> op, dag outs, dag ins, string asm,
let Defs = [SCC];
let SchedRW = [WriteSALU];
let UseNamedOperandTable = 1;
- let SubtargetPredicate = isGCN;
}
class SOPC_Base <bits<7> op, RegisterOperand rc0, RegisterOperand rc1,
@@ -811,12 +898,13 @@ def S_BITCMP0_B32 : SOPC_32 <0x0c, "s_bitcmp0_b32">;
def S_BITCMP1_B32 : SOPC_32 <0x0d, "s_bitcmp1_b32">;
def S_BITCMP0_B64 : SOPC_64_32 <0x0e, "s_bitcmp0_b64">;
def S_BITCMP1_B64 : SOPC_64_32 <0x0f, "s_bitcmp1_b64">;
+let SubtargetPredicate = isGFX6GFX7GFX8GFX9 in
def S_SETVSKIP : SOPC_32 <0x10, "s_setvskip">;
-let SubtargetPredicate = isVI in {
+let SubtargetPredicate = isGFX8Plus in {
def S_CMP_EQ_U64 : SOPC_CMP_64 <0x12, "s_cmp_eq_u64", COND_EQ>;
def S_CMP_LG_U64 : SOPC_CMP_64 <0x13, "s_cmp_lg_u64", COND_NE>;
-}
+} // End SubtargetPredicate = isGFX8Plus
let SubtargetPredicate = HasVGPRIndexMode in {
def S_SET_GPR_IDX_ON : SOPC <0x11,
@@ -834,6 +922,10 @@ def S_SET_GPR_IDX_ON : SOPC <0x11,
// SOPP Instructions
//===----------------------------------------------------------------------===//
+class Base_SOPP <string asm> {
+ string AsmString = asm;
+}
+
class SOPPe <bits<7> op> : Enc32 {
bits <16> simm16;
@@ -843,7 +935,7 @@ class SOPPe <bits<7> op> : Enc32 {
}
class SOPP <bits<7> op, dag ins, string asm, list<dag> pattern = []> :
- InstSI <(outs), ins, asm, pattern >, SOPPe <op> {
+ InstSI <(outs), ins, asm, pattern >, SOPPe <op>, Base_SOPP <asm> {
let mayLoad = 0;
let mayStore = 0;
@@ -854,92 +946,124 @@ class SOPP <bits<7> op, dag ins, string asm, list<dag> pattern = []> :
let SchedRW = [WriteSALU];
let UseNamedOperandTable = 1;
- let SubtargetPredicate = isGCN;
}
-
def S_NOP : SOPP <0x00000000, (ins i16imm:$simm16), "s_nop $simm16">;
+class SOPP_w_nop_e <bits<7> op> : Enc64 {
+ bits <16> simm16;
+
+ let Inst{15-0} = simm16;
+ let Inst{22-16} = op;
+ let Inst{31-23} = 0x17f; // encoding
+ let Inst{47-32} = 0x0;
+ let Inst{54-48} = S_NOP.Inst{22-16}; // opcode
+ let Inst{63-55} = S_NOP.Inst{31-23}; // encoding
+}
+
+class SOPP_w_nop <bits<7> op, dag ins, string asm, list<dag> pattern = []> :
+ InstSI <(outs), ins, asm, pattern >, SOPP_w_nop_e <op>, Base_SOPP <asm> {
+
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let SALU = 1;
+ let SOPP = 1;
+ let Size = 8;
+ let SchedRW = [WriteSALU];
+
+ let UseNamedOperandTable = 1;
+}
+
+multiclass SOPP_With_Relaxation <bits<7> op, dag ins, string asm, list<dag> pattern = []> {
+ def "" : SOPP <op, ins, asm, pattern>;
+ def _pad_s_nop : SOPP_w_nop <op, ins, asm, pattern>;
+}
+
let isTerminator = 1 in {
-def S_ENDPGM : SOPP <0x00000001, (ins), "s_endpgm",
- [(AMDGPUendpgm)]> {
- let simm16 = 0;
+def S_ENDPGM : SOPP <0x00000001, (ins EndpgmImm:$simm16), "s_endpgm$simm16"> {
let isBarrier = 1;
let isReturn = 1;
}
-let SubtargetPredicate = isVI in {
def S_ENDPGM_SAVED : SOPP <0x0000001B, (ins), "s_endpgm_saved"> {
+ let SubtargetPredicate = isGFX8Plus;
let simm16 = 0;
let isBarrier = 1;
let isReturn = 1;
}
-}
-let SubtargetPredicate = isGFX9 in {
+let SubtargetPredicate = isGFX9Plus in {
let isBarrier = 1, isReturn = 1, simm16 = 0 in {
def S_ENDPGM_ORDERED_PS_DONE :
SOPP<0x01e, (ins), "s_endpgm_ordered_ps_done">;
} // End isBarrier = 1, isReturn = 1, simm16 = 0
-} // End SubtargetPredicate = isGFX9
+} // End SubtargetPredicate = isGFX9Plus
+
+let SubtargetPredicate = isGFX10Plus in {
+ let isBarrier = 1, isReturn = 1, simm16 = 0 in {
+ def S_CODE_END :
+ SOPP<0x01f, (ins), "s_code_end">;
+ } // End isBarrier = 1, isReturn = 1, simm16 = 0
+} // End SubtargetPredicate = isGFX10Plus
let isBranch = 1, SchedRW = [WriteBranch] in {
-def S_BRANCH : SOPP <
+let isBarrier = 1 in {
+defm S_BRANCH : SOPP_With_Relaxation <
0x00000002, (ins sopp_brtarget:$simm16), "s_branch $simm16",
- [(br bb:$simm16)]> {
- let isBarrier = 1;
+ [(br bb:$simm16)]>;
}
let Uses = [SCC] in {
-def S_CBRANCH_SCC0 : SOPP <
+defm S_CBRANCH_SCC0 : SOPP_With_Relaxation <
0x00000004, (ins sopp_brtarget:$simm16),
"s_cbranch_scc0 $simm16"
>;
-def S_CBRANCH_SCC1 : SOPP <
+defm S_CBRANCH_SCC1 : SOPP_With_Relaxation <
0x00000005, (ins sopp_brtarget:$simm16),
"s_cbranch_scc1 $simm16"
>;
} // End Uses = [SCC]
let Uses = [VCC] in {
-def S_CBRANCH_VCCZ : SOPP <
+defm S_CBRANCH_VCCZ : SOPP_With_Relaxation <
0x00000006, (ins sopp_brtarget:$simm16),
"s_cbranch_vccz $simm16"
>;
-def S_CBRANCH_VCCNZ : SOPP <
+defm S_CBRANCH_VCCNZ : SOPP_With_Relaxation <
0x00000007, (ins sopp_brtarget:$simm16),
"s_cbranch_vccnz $simm16"
>;
} // End Uses = [VCC]
let Uses = [EXEC] in {
-def S_CBRANCH_EXECZ : SOPP <
+defm S_CBRANCH_EXECZ : SOPP_With_Relaxation <
0x00000008, (ins sopp_brtarget:$simm16),
"s_cbranch_execz $simm16"
>;
-def S_CBRANCH_EXECNZ : SOPP <
+defm S_CBRANCH_EXECNZ : SOPP_With_Relaxation <
0x00000009, (ins sopp_brtarget:$simm16),
"s_cbranch_execnz $simm16"
>;
} // End Uses = [EXEC]
-def S_CBRANCH_CDBGSYS : SOPP <
+defm S_CBRANCH_CDBGSYS : SOPP_With_Relaxation <
0x00000017, (ins sopp_brtarget:$simm16),
"s_cbranch_cdbgsys $simm16"
>;
-def S_CBRANCH_CDBGSYS_AND_USER : SOPP <
+defm S_CBRANCH_CDBGSYS_AND_USER : SOPP_With_Relaxation <
0x0000001A, (ins sopp_brtarget:$simm16),
"s_cbranch_cdbgsys_and_user $simm16"
>;
-def S_CBRANCH_CDBGSYS_OR_USER : SOPP <
+defm S_CBRANCH_CDBGSYS_OR_USER : SOPP_With_Relaxation <
0x00000019, (ins sopp_brtarget:$simm16),
"s_cbranch_cdbgsys_or_user $simm16"
>;
-def S_CBRANCH_CDBGUSER : SOPP <
+defm S_CBRANCH_CDBGUSER : SOPP_With_Relaxation <
0x00000018, (ins sopp_brtarget:$simm16),
"s_cbranch_cdbguser $simm16"
>;
@@ -957,16 +1081,16 @@ def S_BARRIER : SOPP <0x0000000a, (ins), "s_barrier",
let isConvergent = 1;
}
-let SubtargetPredicate = isVI in {
def S_WAKEUP : SOPP <0x00000003, (ins), "s_wakeup"> {
+ let SubtargetPredicate = isGFX8Plus;
let simm16 = 0;
let mayLoad = 1;
let mayStore = 1;
}
-}
let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in
-def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "s_waitcnt $simm16">;
+def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "s_waitcnt $simm16",
+ [(int_amdgcn_s_waitcnt UIMM16bit:$simm16)]>;
def S_SETHALT : SOPP <0x0000000d, (ins i16imm:$simm16), "s_sethalt $simm16">;
def S_SETKILL : SOPP <0x0000000b, (ins i16imm:$simm16), "s_setkill $simm16">;
@@ -994,7 +1118,10 @@ def S_SENDMSGHALT : SOPP <0x00000011, (ins SendMsgImm:$simm16), "s_sendmsghalt $
>;
} // End Uses = [EXEC, M0]
-def S_TRAP : SOPP <0x00000012, (ins i16imm:$simm16), "s_trap $simm16">;
+def S_TRAP : SOPP <0x00000012, (ins i16imm:$simm16), "s_trap $simm16"> {
+ let isTrap = 1;
+}
+
def S_ICACHE_INV : SOPP <0x00000013, (ins), "s_icache_inv"> {
let simm16 = 0;
}
@@ -1028,6 +1155,25 @@ def S_SET_GPR_IDX_MODE : SOPP<0x1d, (ins GPRIdxMode:$simm16),
}
}
+let SubtargetPredicate = isGFX10Plus in {
+ def S_INST_PREFETCH :
+ SOPP<0x020, (ins s16imm:$simm16), "s_inst_prefetch $simm16">;
+ def S_CLAUSE :
+ SOPP<0x021, (ins s16imm:$simm16), "s_clause $simm16">;
+ def S_WAITCNT_IDLE :
+ SOPP <0x022, (ins), "s_wait_idle"> {
+ let simm16 = 0;
+ }
+ def S_WAITCNT_DEPCTR :
+ SOPP <0x023, (ins s16imm:$simm16), "s_waitcnt_depctr $simm16">;
+ def S_ROUND_MODE :
+ SOPP<0x024, (ins s16imm:$simm16), "s_round_mode $simm16">;
+ def S_DENORM_MODE :
+ SOPP<0x025, (ins s16imm:$simm16), "s_denorm_mode $simm16">;
+ def S_TTRACEDATA_IMM :
+ SOPP<0x028, (ins s16imm:$simm16), "s_ttracedata_imm $simm16">;
+} // End SubtargetPredicate = isGFX10Plus
+
//===----------------------------------------------------------------------===//
// S_GETREG_B32 Intrinsic Pattern.
//===----------------------------------------------------------------------===//
@@ -1041,6 +1187,11 @@ def : GCNPat <
//===----------------------------------------------------------------------===//
def : GCNPat <
+ (AMDGPUendpgm),
+ (S_ENDPGM (i16 0))
+>;
+
+def : GCNPat <
(i64 (ctpop i64:$src)),
(i64 (REG_SEQUENCE SReg_64,
(i32 (COPY_TO_REGCLASS (S_BCNT1_I32_B64 $src), SReg_32)), sub0,
@@ -1097,162 +1248,261 @@ def : GCNPat<
>;
+//===----------------------------------------------------------------------===//
+// Target-specific instruction encodings.
+//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
-// SOPP Patterns
+// SOP1 - GFX10.
//===----------------------------------------------------------------------===//
-def : GCNPat <
- (int_amdgcn_s_waitcnt i32:$simm16),
- (S_WAITCNT (as_i16imm $simm16))
->;
+class Select_gfx10<string opName> : SIMCInstr<opName, SIEncodingFamily.GFX10> {
+ Predicate AssemblerPredicate = isGFX10Plus;
+ string DecoderNamespace = "GFX10";
+}
+
+multiclass SOP1_Real_gfx10<bits<8> op> {
+ def _gfx10 : SOP1_Real<op, !cast<SOP1_Pseudo>(NAME)>,
+ Select_gfx10<!cast<SOP1_Pseudo>(NAME).Mnemonic>;
+}
+defm S_ANDN1_SAVEEXEC_B64 : SOP1_Real_gfx10<0x037>;
+defm S_ORN1_SAVEEXEC_B64 : SOP1_Real_gfx10<0x038>;
+defm S_ANDN1_WREXEC_B64 : SOP1_Real_gfx10<0x039>;
+defm S_ANDN2_WREXEC_B64 : SOP1_Real_gfx10<0x03a>;
+defm S_BITREPLICATE_B64_B32 : SOP1_Real_gfx10<0x03b>;
+defm S_AND_SAVEEXEC_B32 : SOP1_Real_gfx10<0x03c>;
+defm S_OR_SAVEEXEC_B32 : SOP1_Real_gfx10<0x03d>;
+defm S_XOR_SAVEEXEC_B32 : SOP1_Real_gfx10<0x03e>;
+defm S_ANDN2_SAVEEXEC_B32 : SOP1_Real_gfx10<0x03f>;
+defm S_ORN2_SAVEEXEC_B32 : SOP1_Real_gfx10<0x040>;
+defm S_NAND_SAVEEXEC_B32 : SOP1_Real_gfx10<0x041>;
+defm S_NOR_SAVEEXEC_B32 : SOP1_Real_gfx10<0x042>;
+defm S_XNOR_SAVEEXEC_B32 : SOP1_Real_gfx10<0x043>;
+defm S_ANDN1_SAVEEXEC_B32 : SOP1_Real_gfx10<0x044>;
+defm S_ORN1_SAVEEXEC_B32 : SOP1_Real_gfx10<0x045>;
+defm S_ANDN1_WREXEC_B32 : SOP1_Real_gfx10<0x046>;
+defm S_ANDN2_WREXEC_B32 : SOP1_Real_gfx10<0x047>;
+defm S_MOVRELSD_2_B32 : SOP1_Real_gfx10<0x049>;
//===----------------------------------------------------------------------===//
-// Real target instructions, move this to the appropriate subtarget TD file
+// SOP1 - GFX6, GFX7.
//===----------------------------------------------------------------------===//
-class Select_si<string opName> :
- SIMCInstr<opName, SIEncodingFamily.SI> {
- list<Predicate> AssemblerPredicates = [isSICI];
- string DecoderNamespace = "SICI";
+class Select_gfx6_gfx7<string opName> : SIMCInstr<opName, SIEncodingFamily.SI> {
+ Predicate AssemblerPredicate = isGFX6GFX7;
+ string DecoderNamespace = "GFX6GFX7";
}
-class SOP1_Real_si<bits<8> op, SOP1_Pseudo ps> :
- SOP1_Real<op, ps>,
- Select_si<ps.Mnemonic>;
+multiclass SOP1_Real_gfx6_gfx7<bits<8> op> {
+ def _gfx6_gfx7 : SOP1_Real<op, !cast<SOP1_Pseudo>(NAME)>,
+ Select_gfx6_gfx7<!cast<SOP1_Pseudo>(NAME).Mnemonic>;
+}
-class SOP2_Real_si<bits<7> op, SOP2_Pseudo ps> :
- SOP2_Real<op, ps>,
- Select_si<ps.Mnemonic>;
+multiclass SOP1_Real_gfx6_gfx7_gfx10<bits<8> op> :
+ SOP1_Real_gfx6_gfx7<op>, SOP1_Real_gfx10<op>;
+
+defm S_CBRANCH_JOIN : SOP1_Real_gfx6_gfx7<0x032>;
+defm S_MOV_REGRD_B32 : SOP1_Real_gfx6_gfx7<0x033>;
+
+defm S_MOV_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x003>;
+defm S_MOV_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x004>;
+defm S_CMOV_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x005>;
+defm S_CMOV_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x006>;
+defm S_NOT_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x007>;
+defm S_NOT_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x008>;
+defm S_WQM_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x009>;
+defm S_WQM_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x00a>;
+defm S_BREV_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x00b>;
+defm S_BREV_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x00c>;
+defm S_BCNT0_I32_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x00d>;
+defm S_BCNT0_I32_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x00e>;
+defm S_BCNT1_I32_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x00f>;
+defm S_BCNT1_I32_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x010>;
+defm S_FF0_I32_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x011>;
+defm S_FF0_I32_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x012>;
+defm S_FF1_I32_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x013>;
+defm S_FF1_I32_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x014>;
+defm S_FLBIT_I32_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x015>;
+defm S_FLBIT_I32_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x016>;
+defm S_FLBIT_I32 : SOP1_Real_gfx6_gfx7_gfx10<0x017>;
+defm S_FLBIT_I32_I64 : SOP1_Real_gfx6_gfx7_gfx10<0x018>;
+defm S_SEXT_I32_I8 : SOP1_Real_gfx6_gfx7_gfx10<0x019>;
+defm S_SEXT_I32_I16 : SOP1_Real_gfx6_gfx7_gfx10<0x01a>;
+defm S_BITSET0_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x01b>;
+defm S_BITSET0_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x01c>;
+defm S_BITSET1_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x01d>;
+defm S_BITSET1_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x01e>;
+defm S_GETPC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x01f>;
+defm S_SETPC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x020>;
+defm S_SWAPPC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x021>;
+defm S_RFE_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x022>;
+defm S_AND_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x024>;
+defm S_OR_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x025>;
+defm S_XOR_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x026>;
+defm S_ANDN2_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x027>;
+defm S_ORN2_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x028>;
+defm S_NAND_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x029>;
+defm S_NOR_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x02a>;
+defm S_XNOR_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x02b>;
+defm S_QUADMASK_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x02c>;
+defm S_QUADMASK_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x02d>;
+defm S_MOVRELS_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x02e>;
+defm S_MOVRELS_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x02f>;
+defm S_MOVRELD_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x030>;
+defm S_MOVRELD_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x031>;
+defm S_ABS_I32 : SOP1_Real_gfx6_gfx7_gfx10<0x034>;
+defm S_MOV_FED_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x035>;
-class SOPK_Real_si<bits<5> op, SOPK_Pseudo ps> :
- SOPK_Real32<op, ps>,
- Select_si<ps.Mnemonic>;
-
-def S_MOV_B32_si : SOP1_Real_si <0x03, S_MOV_B32>;
-def S_MOV_B64_si : SOP1_Real_si <0x04, S_MOV_B64>;
-def S_CMOV_B32_si : SOP1_Real_si <0x05, S_CMOV_B32>;
-def S_CMOV_B64_si : SOP1_Real_si <0x06, S_CMOV_B64>;
-def S_NOT_B32_si : SOP1_Real_si <0x07, S_NOT_B32>;
-def S_NOT_B64_si : SOP1_Real_si <0x08, S_NOT_B64>;
-def S_WQM_B32_si : SOP1_Real_si <0x09, S_WQM_B32>;
-def S_WQM_B64_si : SOP1_Real_si <0x0a, S_WQM_B64>;
-def S_BREV_B32_si : SOP1_Real_si <0x0b, S_BREV_B32>;
-def S_BREV_B64_si : SOP1_Real_si <0x0c, S_BREV_B64>;
-def S_BCNT0_I32_B32_si : SOP1_Real_si <0x0d, S_BCNT0_I32_B32>;
-def S_BCNT0_I32_B64_si : SOP1_Real_si <0x0e, S_BCNT0_I32_B64>;
-def S_BCNT1_I32_B32_si : SOP1_Real_si <0x0f, S_BCNT1_I32_B32>;
-def S_BCNT1_I32_B64_si : SOP1_Real_si <0x10, S_BCNT1_I32_B64>;
-def S_FF0_I32_B32_si : SOP1_Real_si <0x11, S_FF0_I32_B32>;
-def S_FF0_I32_B64_si : SOP1_Real_si <0x12, S_FF0_I32_B64>;
-def S_FF1_I32_B32_si : SOP1_Real_si <0x13, S_FF1_I32_B32>;
-def S_FF1_I32_B64_si : SOP1_Real_si <0x14, S_FF1_I32_B64>;
-def S_FLBIT_I32_B32_si : SOP1_Real_si <0x15, S_FLBIT_I32_B32>;
-def S_FLBIT_I32_B64_si : SOP1_Real_si <0x16, S_FLBIT_I32_B64>;
-def S_FLBIT_I32_si : SOP1_Real_si <0x17, S_FLBIT_I32>;
-def S_FLBIT_I32_I64_si : SOP1_Real_si <0x18, S_FLBIT_I32_I64>;
-def S_SEXT_I32_I8_si : SOP1_Real_si <0x19, S_SEXT_I32_I8>;
-def S_SEXT_I32_I16_si : SOP1_Real_si <0x1a, S_SEXT_I32_I16>;
-def S_BITSET0_B32_si : SOP1_Real_si <0x1b, S_BITSET0_B32>;
-def S_BITSET0_B64_si : SOP1_Real_si <0x1c, S_BITSET0_B64>;
-def S_BITSET1_B32_si : SOP1_Real_si <0x1d, S_BITSET1_B32>;
-def S_BITSET1_B64_si : SOP1_Real_si <0x1e, S_BITSET1_B64>;
-def S_GETPC_B64_si : SOP1_Real_si <0x1f, S_GETPC_B64>;
-def S_SETPC_B64_si : SOP1_Real_si <0x20, S_SETPC_B64>;
-def S_SWAPPC_B64_si : SOP1_Real_si <0x21, S_SWAPPC_B64>;
-def S_RFE_B64_si : SOP1_Real_si <0x22, S_RFE_B64>;
-def S_AND_SAVEEXEC_B64_si : SOP1_Real_si <0x24, S_AND_SAVEEXEC_B64>;
-def S_OR_SAVEEXEC_B64_si : SOP1_Real_si <0x25, S_OR_SAVEEXEC_B64>;
-def S_XOR_SAVEEXEC_B64_si : SOP1_Real_si <0x26, S_XOR_SAVEEXEC_B64>;
-def S_ANDN2_SAVEEXEC_B64_si: SOP1_Real_si <0x27, S_ANDN2_SAVEEXEC_B64>;
-def S_ORN2_SAVEEXEC_B64_si : SOP1_Real_si <0x28, S_ORN2_SAVEEXEC_B64>;
-def S_NAND_SAVEEXEC_B64_si : SOP1_Real_si <0x29, S_NAND_SAVEEXEC_B64>;
-def S_NOR_SAVEEXEC_B64_si : SOP1_Real_si <0x2a, S_NOR_SAVEEXEC_B64>;
-def S_XNOR_SAVEEXEC_B64_si : SOP1_Real_si <0x2b, S_XNOR_SAVEEXEC_B64>;
-def S_QUADMASK_B32_si : SOP1_Real_si <0x2c, S_QUADMASK_B32>;
-def S_QUADMASK_B64_si : SOP1_Real_si <0x2d, S_QUADMASK_B64>;
-def S_MOVRELS_B32_si : SOP1_Real_si <0x2e, S_MOVRELS_B32>;
-def S_MOVRELS_B64_si : SOP1_Real_si <0x2f, S_MOVRELS_B64>;
-def S_MOVRELD_B32_si : SOP1_Real_si <0x30, S_MOVRELD_B32>;
-def S_MOVRELD_B64_si : SOP1_Real_si <0x31, S_MOVRELD_B64>;
-def S_CBRANCH_JOIN_si : SOP1_Real_si <0x32, S_CBRANCH_JOIN>;
-def S_MOV_REGRD_B32_si : SOP1_Real_si <0x33, S_MOV_REGRD_B32>;
-def S_ABS_I32_si : SOP1_Real_si <0x34, S_ABS_I32>;
-def S_MOV_FED_B32_si : SOP1_Real_si <0x35, S_MOV_FED_B32>;
-
-def S_ADD_U32_si : SOP2_Real_si <0x00, S_ADD_U32>;
-def S_ADD_I32_si : SOP2_Real_si <0x02, S_ADD_I32>;
-def S_SUB_U32_si : SOP2_Real_si <0x01, S_SUB_U32>;
-def S_SUB_I32_si : SOP2_Real_si <0x03, S_SUB_I32>;
-def S_ADDC_U32_si : SOP2_Real_si <0x04, S_ADDC_U32>;
-def S_SUBB_U32_si : SOP2_Real_si <0x05, S_SUBB_U32>;
-def S_MIN_I32_si : SOP2_Real_si <0x06, S_MIN_I32>;
-def S_MIN_U32_si : SOP2_Real_si <0x07, S_MIN_U32>;
-def S_MAX_I32_si : SOP2_Real_si <0x08, S_MAX_I32>;
-def S_MAX_U32_si : SOP2_Real_si <0x09, S_MAX_U32>;
-def S_CSELECT_B32_si : SOP2_Real_si <0x0a, S_CSELECT_B32>;
-def S_CSELECT_B64_si : SOP2_Real_si <0x0b, S_CSELECT_B64>;
-def S_AND_B32_si : SOP2_Real_si <0x0e, S_AND_B32>;
-def S_AND_B64_si : SOP2_Real_si <0x0f, S_AND_B64>;
-def S_OR_B32_si : SOP2_Real_si <0x10, S_OR_B32>;
-def S_OR_B64_si : SOP2_Real_si <0x11, S_OR_B64>;
-def S_XOR_B32_si : SOP2_Real_si <0x12, S_XOR_B32>;
-def S_XOR_B64_si : SOP2_Real_si <0x13, S_XOR_B64>;
-def S_ANDN2_B32_si : SOP2_Real_si <0x14, S_ANDN2_B32>;
-def S_ANDN2_B64_si : SOP2_Real_si <0x15, S_ANDN2_B64>;
-def S_ORN2_B32_si : SOP2_Real_si <0x16, S_ORN2_B32>;
-def S_ORN2_B64_si : SOP2_Real_si <0x17, S_ORN2_B64>;
-def S_NAND_B32_si : SOP2_Real_si <0x18, S_NAND_B32>;
-def S_NAND_B64_si : SOP2_Real_si <0x19, S_NAND_B64>;
-def S_NOR_B32_si : SOP2_Real_si <0x1a, S_NOR_B32>;
-def S_NOR_B64_si : SOP2_Real_si <0x1b, S_NOR_B64>;
-def S_XNOR_B32_si : SOP2_Real_si <0x1c, S_XNOR_B32>;
-def S_XNOR_B64_si : SOP2_Real_si <0x1d, S_XNOR_B64>;
-def S_LSHL_B32_si : SOP2_Real_si <0x1e, S_LSHL_B32>;
-def S_LSHL_B64_si : SOP2_Real_si <0x1f, S_LSHL_B64>;
-def S_LSHR_B32_si : SOP2_Real_si <0x20, S_LSHR_B32>;
-def S_LSHR_B64_si : SOP2_Real_si <0x21, S_LSHR_B64>;
-def S_ASHR_I32_si : SOP2_Real_si <0x22, S_ASHR_I32>;
-def S_ASHR_I64_si : SOP2_Real_si <0x23, S_ASHR_I64>;
-def S_BFM_B32_si : SOP2_Real_si <0x24, S_BFM_B32>;
-def S_BFM_B64_si : SOP2_Real_si <0x25, S_BFM_B64>;
-def S_MUL_I32_si : SOP2_Real_si <0x26, S_MUL_I32>;
-def S_BFE_U32_si : SOP2_Real_si <0x27, S_BFE_U32>;
-def S_BFE_I32_si : SOP2_Real_si <0x28, S_BFE_I32>;
-def S_BFE_U64_si : SOP2_Real_si <0x29, S_BFE_U64>;
-def S_BFE_I64_si : SOP2_Real_si <0x2a, S_BFE_I64>;
-def S_CBRANCH_G_FORK_si : SOP2_Real_si <0x2b, S_CBRANCH_G_FORK>;
-def S_ABSDIFF_I32_si : SOP2_Real_si <0x2c, S_ABSDIFF_I32>;
-
-def S_MOVK_I32_si : SOPK_Real_si <0x00, S_MOVK_I32>;
-def S_CMOVK_I32_si : SOPK_Real_si <0x02, S_CMOVK_I32>;
-def S_CMPK_EQ_I32_si : SOPK_Real_si <0x03, S_CMPK_EQ_I32>;
-def S_CMPK_LG_I32_si : SOPK_Real_si <0x04, S_CMPK_LG_I32>;
-def S_CMPK_GT_I32_si : SOPK_Real_si <0x05, S_CMPK_GT_I32>;
-def S_CMPK_GE_I32_si : SOPK_Real_si <0x06, S_CMPK_GE_I32>;
-def S_CMPK_LT_I32_si : SOPK_Real_si <0x07, S_CMPK_LT_I32>;
-def S_CMPK_LE_I32_si : SOPK_Real_si <0x08, S_CMPK_LE_I32>;
-def S_CMPK_EQ_U32_si : SOPK_Real_si <0x09, S_CMPK_EQ_U32>;
-def S_CMPK_LG_U32_si : SOPK_Real_si <0x0a, S_CMPK_LG_U32>;
-def S_CMPK_GT_U32_si : SOPK_Real_si <0x0b, S_CMPK_GT_U32>;
-def S_CMPK_GE_U32_si : SOPK_Real_si <0x0c, S_CMPK_GE_U32>;
-def S_CMPK_LT_U32_si : SOPK_Real_si <0x0d, S_CMPK_LT_U32>;
-def S_CMPK_LE_U32_si : SOPK_Real_si <0x0e, S_CMPK_LE_U32>;
-def S_ADDK_I32_si : SOPK_Real_si <0x0f, S_ADDK_I32>;
-def S_MULK_I32_si : SOPK_Real_si <0x10, S_MULK_I32>;
-def S_CBRANCH_I_FORK_si : SOPK_Real_si <0x11, S_CBRANCH_I_FORK>;
-def S_GETREG_B32_si : SOPK_Real_si <0x12, S_GETREG_B32>;
-def S_SETREG_B32_si : SOPK_Real_si <0x13, S_SETREG_B32>;
-//def S_GETREG_REGRD_B32_si : SOPK_Real_si <0x14, S_GETREG_REGRD_B32>; // see pseudo for comments
-def S_SETREG_IMM32_B32_si : SOPK_Real64<0x15, S_SETREG_IMM32_B32>,
- Select_si<S_SETREG_IMM32_B32.Mnemonic>;
+//===----------------------------------------------------------------------===//
+// SOP2 - GFX10.
+//===----------------------------------------------------------------------===//
+
+multiclass SOP2_Real_gfx10<bits<7> op> {
+ def _gfx10 : SOP2_Real<op, !cast<SOP2_Pseudo>(NAME)>,
+ Select_gfx10<!cast<SOP2_Pseudo>(NAME).Mnemonic>;
+}
+
+defm S_LSHL1_ADD_U32 : SOP2_Real_gfx10<0x02e>;
+defm S_LSHL2_ADD_U32 : SOP2_Real_gfx10<0x02f>;
+defm S_LSHL3_ADD_U32 : SOP2_Real_gfx10<0x030>;
+defm S_LSHL4_ADD_U32 : SOP2_Real_gfx10<0x031>;
+defm S_PACK_LL_B32_B16 : SOP2_Real_gfx10<0x032>;
+defm S_PACK_LH_B32_B16 : SOP2_Real_gfx10<0x033>;
+defm S_PACK_HH_B32_B16 : SOP2_Real_gfx10<0x034>;
+defm S_MUL_HI_U32 : SOP2_Real_gfx10<0x035>;
+defm S_MUL_HI_I32 : SOP2_Real_gfx10<0x036>;
+
+//===----------------------------------------------------------------------===//
+// SOP2 - GFX6, GFX7.
+//===----------------------------------------------------------------------===//
+multiclass SOP2_Real_gfx6_gfx7<bits<7> op> {
+ def _gfx6_gfx7 : SOP2_Real<op, !cast<SOP_Pseudo>(NAME)>,
+ Select_gfx6_gfx7<!cast<SOP_Pseudo>(NAME).Mnemonic>;
+}
+
+multiclass SOP2_Real_gfx6_gfx7_gfx10<bits<7> op> :
+ SOP2_Real_gfx6_gfx7<op>, SOP2_Real_gfx10<op>;
+
+defm S_CBRANCH_G_FORK : SOP2_Real_gfx6_gfx7<0x02b>;
+
+defm S_ADD_U32 : SOP2_Real_gfx6_gfx7_gfx10<0x000>;
+defm S_SUB_U32 : SOP2_Real_gfx6_gfx7_gfx10<0x001>;
+defm S_ADD_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x002>;
+defm S_SUB_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x003>;
+defm S_ADDC_U32 : SOP2_Real_gfx6_gfx7_gfx10<0x004>;
+defm S_SUBB_U32 : SOP2_Real_gfx6_gfx7_gfx10<0x005>;
+defm S_MIN_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x006>;
+defm S_MIN_U32 : SOP2_Real_gfx6_gfx7_gfx10<0x007>;
+defm S_MAX_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x008>;
+defm S_MAX_U32 : SOP2_Real_gfx6_gfx7_gfx10<0x009>;
+defm S_CSELECT_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x00a>;
+defm S_CSELECT_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x00b>;
+defm S_AND_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x00e>;
+defm S_AND_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x00f>;
+defm S_OR_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x010>;
+defm S_OR_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x011>;
+defm S_XOR_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x012>;
+defm S_XOR_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x013>;
+defm S_ANDN2_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x014>;
+defm S_ANDN2_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x015>;
+defm S_ORN2_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x016>;
+defm S_ORN2_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x017>;
+defm S_NAND_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x018>;
+defm S_NAND_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x019>;
+defm S_NOR_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x01a>;
+defm S_NOR_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x01b>;
+defm S_XNOR_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x01c>;
+defm S_XNOR_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x01d>;
+defm S_LSHL_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x01e>;
+defm S_LSHL_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x01f>;
+defm S_LSHR_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x020>;
+defm S_LSHR_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x021>;
+defm S_ASHR_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x022>;
+defm S_ASHR_I64 : SOP2_Real_gfx6_gfx7_gfx10<0x023>;
+defm S_BFM_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x024>;
+defm S_BFM_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x025>;
+defm S_MUL_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x026>;
+defm S_BFE_U32 : SOP2_Real_gfx6_gfx7_gfx10<0x027>;
+defm S_BFE_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x028>;
+defm S_BFE_U64 : SOP2_Real_gfx6_gfx7_gfx10<0x029>;
+defm S_BFE_I64 : SOP2_Real_gfx6_gfx7_gfx10<0x02a>;
+defm S_ABSDIFF_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x02c>;
+
+//===----------------------------------------------------------------------===//
+// SOPK - GFX10.
+//===----------------------------------------------------------------------===//
+
+multiclass SOPK_Real32_gfx10<bits<5> op> {
+ def _gfx10 : SOPK_Real32<op, !cast<SOPK_Pseudo>(NAME)>,
+ Select_gfx10<!cast<SOPK_Pseudo>(NAME).Mnemonic>;
+}
+
+multiclass SOPK_Real64_gfx10<bits<5> op> {
+ def _gfx10 : SOPK_Real64<op, !cast<SOPK_Pseudo>(NAME)>,
+ Select_gfx10<!cast<SOPK_Pseudo>(NAME).Mnemonic>;
+}
+
+defm S_VERSION : SOPK_Real32_gfx10<0x001>;
+defm S_CALL_B64 : SOPK_Real32_gfx10<0x016>;
+defm S_WAITCNT_VSCNT : SOPK_Real32_gfx10<0x017>;
+defm S_WAITCNT_VMCNT : SOPK_Real32_gfx10<0x018>;
+defm S_WAITCNT_EXPCNT : SOPK_Real32_gfx10<0x019>;
+defm S_WAITCNT_LGKMCNT : SOPK_Real32_gfx10<0x01a>;
+defm S_SUBVECTOR_LOOP_BEGIN : SOPK_Real32_gfx10<0x01b>;
+defm S_SUBVECTOR_LOOP_END : SOPK_Real32_gfx10<0x01c>;
+
+//===----------------------------------------------------------------------===//
+// SOPK - GFX6, GFX7.
+//===----------------------------------------------------------------------===//
+
+multiclass SOPK_Real32_gfx6_gfx7<bits<5> op> {
+ def _gfx6_gfx7 : SOPK_Real32<op, !cast<SOPK_Pseudo>(NAME)>,
+ Select_gfx6_gfx7<!cast<SOPK_Pseudo>(NAME).Mnemonic>;
+}
+
+multiclass SOPK_Real64_gfx6_gfx7<bits<5> op> {
+ def _gfx6_gfx7 : SOPK_Real64<op, !cast<SOPK_Pseudo>(NAME)>,
+ Select_gfx6_gfx7<!cast<SOPK_Pseudo>(NAME).Mnemonic>;
+}
+
+multiclass SOPK_Real32_gfx6_gfx7_gfx10<bits<5> op> :
+ SOPK_Real32_gfx6_gfx7<op>, SOPK_Real32_gfx10<op>;
+
+multiclass SOPK_Real64_gfx6_gfx7_gfx10<bits<5> op> :
+ SOPK_Real64_gfx6_gfx7<op>, SOPK_Real64_gfx10<op>;
+
+defm S_CBRANCH_I_FORK : SOPK_Real32_gfx6_gfx7<0x011>;
+
+defm S_MOVK_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x000>;
+defm S_CMOVK_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x002>;
+defm S_CMPK_EQ_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x003>;
+defm S_CMPK_LG_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x004>;
+defm S_CMPK_GT_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x005>;
+defm S_CMPK_GE_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x006>;
+defm S_CMPK_LT_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x007>;
+defm S_CMPK_LE_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x008>;
+defm S_CMPK_EQ_U32 : SOPK_Real32_gfx6_gfx7_gfx10<0x009>;
+defm S_CMPK_LG_U32 : SOPK_Real32_gfx6_gfx7_gfx10<0x00a>;
+defm S_CMPK_GT_U32 : SOPK_Real32_gfx6_gfx7_gfx10<0x00b>;
+defm S_CMPK_GE_U32 : SOPK_Real32_gfx6_gfx7_gfx10<0x00c>;
+defm S_CMPK_LT_U32 : SOPK_Real32_gfx6_gfx7_gfx10<0x00d>;
+defm S_CMPK_LE_U32 : SOPK_Real32_gfx6_gfx7_gfx10<0x00e>;
+defm S_ADDK_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x00f>;
+defm S_MULK_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x010>;
+defm S_GETREG_B32 : SOPK_Real32_gfx6_gfx7_gfx10<0x012>;
+defm S_SETREG_B32 : SOPK_Real32_gfx6_gfx7_gfx10<0x013>;
+defm S_SETREG_IMM32_B32 : SOPK_Real64_gfx6_gfx7_gfx10<0x015>;
+
+//===----------------------------------------------------------------------===//
+// GFX8, GFX9 (VI).
+//===----------------------------------------------------------------------===//
class Select_vi<string opName> :
SIMCInstr<opName, SIEncodingFamily.VI> {
- list<Predicate> AssemblerPredicates = [isVI];
- string DecoderNamespace = "VI";
+ list<Predicate> AssemblerPredicates = [isGFX8GFX9];
+ string DecoderNamespace = "GFX8";
}
class SOP1_Real_vi<bits<8> op, SOP1_Pseudo ps> :
diff --git a/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp b/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp
index e4c442db3016..30cf12337c6e 100644
--- a/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp
+++ b/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp
@@ -1,9 +1,8 @@
//===-- TargetInfo/AMDGPUTargetInfo.cpp - TargetInfo for AMDGPU -----------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -11,7 +10,7 @@
//
//===----------------------------------------------------------------------===//
-#include "AMDGPUTargetMachine.h"
+#include "TargetInfo/AMDGPUTargetInfo.h"
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
diff --git a/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.h b/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.h
new file mode 100644
index 000000000000..1e6dbd90b0c1
--- /dev/null
+++ b/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.h
@@ -0,0 +1,29 @@
+//===-- TargetInfo/AMDGPUTargetInfo.h - TargetInfo for AMDGPU ---*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_TARGETINFO_AMDGPUTARGETINFO_H
+#define LLVM_LIB_TARGET_AMDGPU_TARGETINFO_AMDGPUTARGETINFO_H
+
+namespace llvm {
+
+class Target;
+
+/// The target which supports all AMD GPUs. This will eventually
+/// be deprecated and there will be an R600 target and a GCN target.
+Target &getTheAMDGPUTarget();
+
+/// The target for GCN GPUs
+Target &getTheGCNTarget();
+
+}
+
+#endif // LLVM_LIB_TARGET_AMDGPU_TARGETINFO_AMDGPUTARGETINFO_H
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
index 9eb4c6513cce..075e08986c0c 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
+++ b/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
@@ -1,9 +1,8 @@
//===-- AMDGPUAsmUtils.cpp - AsmParser/InstPrinter common -----------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "AMDGPUAsmUtils.h"
@@ -23,8 +22,8 @@ const char* const IdSymbolic[] = {
nullptr,
nullptr,
nullptr,
- nullptr,
- nullptr,
+ "MSG_GS_ALLOC_REQ",
+ "MSG_GET_DOORBELL",
nullptr,
nullptr,
nullptr,
@@ -69,7 +68,17 @@ const char* const IdSymbolic[] = {
nullptr,
nullptr,
nullptr,
- "HW_REG_SH_MEM_BASES"
+ "HW_REG_SH_MEM_BASES",
+ "HW_REG_TBA_LO",
+ "HW_REG_TBA_HI",
+ "HW_REG_TMA_LO",
+ "HW_REG_TMA_HI",
+ "HW_REG_FLAT_SCR_LO",
+ "HW_REG_FLAT_SCR_HI",
+ "HW_REG_XNACK_MASK",
+ nullptr, // HW_ID1, no predictable values
+ nullptr, // HW_ID2, no predictable values
+ "HW_REG_POPS_PACKER"
};
} // namespace Hwreg
@@ -86,5 +95,18 @@ const char* const IdSymbolic[] = {
};
} // namespace Swizzle
+
+namespace VGPRIndexMode {
+
+// This must be kept in sync with the llvm::AMDGPU::VGPRIndexMode::Id enum members; see SIDefines.h.
+const char* const IdSymbolic[] = {
+ "SRC0",
+ "SRC1",
+ "SRC2",
+ "DST",
+};
+
+} // namespace VGPRIndexMode
+
} // namespace AMDGPU
} // namespace llvm
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h b/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h
index ebb2be22b487..cd91c5f6edd5 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h
+++ b/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h
@@ -1,9 +1,8 @@
//===-- AMDGPUAsmUtils.h - AsmParser/InstPrinter common ---------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -31,6 +30,13 @@ namespace Swizzle { // Symbolic names for the swizzle(...) syntax.
extern const char* const IdSymbolic[];
} // namespace Swizzle
+
+namespace VGPRIndexMode { // Symbolic names for the gpr_idx(...) syntax.
+
+extern const char* const IdSymbolic[];
+
+} // namespace VGPRIndexMode
+
} // namespace AMDGPU
} // namespace llvm
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 54c866bdc63c..e90f40e6abea 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1,9 +1,8 @@
//===- AMDGPUBaseInfo.cpp - AMDGPU Base encoding information --------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -11,6 +10,7 @@
#include "AMDGPUTargetTransformInfo.h"
#include "AMDGPU.h"
#include "SIDefines.h"
+#include "AMDGPUAsmUtils.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/BinaryFormat/ELF.h"
@@ -85,7 +85,9 @@ unsigned getExpcntBitWidth() { return 3; }
unsigned getLgkmcntBitShift() { return 8; }
/// \returns Lgkmcnt bit width.
-unsigned getLgkmcntBitWidth() { return 4; }
+unsigned getLgkmcntBitWidth(unsigned VersionMajor) {
+ return (VersionMajor >= 10) ? 6 : 4;
+}
/// \returns Vmcnt bit shift (higher bits).
unsigned getVmcntBitShiftHi() { return 14; }
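
A note on the width change above: it is what grows the lgkmcnt portion of the s_waitcnt mask from 0xF to 0x3F on GFX10. A rough standalone C++ sketch of the arithmetic, with the shift and widths restated from the helpers in this file (illustration only, not library API):

  #include <cassert>

  // Restates getLgkmcntBitWidth and the mask computation for illustration.
  static unsigned lgkmcntWidth(unsigned VersionMajor) {
    return (VersionMajor >= 10) ? 6 : 4;
  }

  static unsigned maskAt(unsigned Shift, unsigned Width) {
    return ((1u << Width) - 1) << Shift;
  }

  int main() {
    const unsigned LgkmcntShift = 8;                           // getLgkmcntBitShift()
    assert(maskAt(LgkmcntShift, lgkmcntWidth(9)) == 0x0F00);   // pre-GFX10: 4 bits
    assert(maskAt(LgkmcntShift, lgkmcntWidth(10)) == 0x3F00);  // GFX10: 6 bits
    return 0;
  }
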
@@ -99,18 +101,11 @@ namespace llvm {
namespace AMDGPU {
-struct MIMGInfo {
- uint16_t Opcode;
- uint16_t BaseOpcode;
- uint8_t MIMGEncoding;
- uint8_t VDataDwords;
- uint8_t VAddrDwords;
-};
-
#define GET_MIMGBaseOpcodesTable_IMPL
#define GET_MIMGDimInfoTable_IMPL
#define GET_MIMGInfoTable_IMPL
#define GET_MIMGLZMappingTable_IMPL
+#define GET_MIMGMIPMappingTable_IMPL
#include "AMDGPUGenSearchableTables.inc"
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding,
@@ -120,6 +115,11 @@ int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding,
return Info ? Info->Opcode : -1;
}
+const MIMGBaseOpcodeInfo *getMIMGBaseOpcode(unsigned Opc) {
+ const MIMGInfo *Info = getMIMGInfo(Opc);
+ return Info ? getMIMGBaseOpcodeInfo(Info->BaseOpcode) : nullptr;
+}
+
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels) {
const MIMGInfo *OrigInfo = getMIMGInfo(Opc);
const MIMGInfo *NewInfo =
@@ -230,7 +230,8 @@ unsigned getEUsPerCU(const MCSubtargetInfo *STI) {
unsigned getMaxWorkGroupsPerCU(const MCSubtargetInfo *STI,
unsigned FlatWorkGroupSize) {
- if (!STI->getFeatureBits().test(FeatureGCN))
+ assert(FlatWorkGroupSize != 0);
+ if (STI->getTargetTriple().getArch() != Triple::amdgcn)
return 8;
unsigned N = getWavesPerWorkGroup(STI, FlatWorkGroupSize);
if (N == 1)
@@ -279,6 +280,8 @@ unsigned getWavesPerWorkGroup(const MCSubtargetInfo *STI,
unsigned getSGPRAllocGranule(const MCSubtargetInfo *STI) {
IsaVersion Version = getIsaVersion(STI->getCPU());
+ if (Version.Major >= 10)
+ return getAddressableNumSGPRs(STI);
if (Version.Major >= 8)
return 16;
return 8;
@@ -300,6 +303,8 @@ unsigned getAddressableNumSGPRs(const MCSubtargetInfo *STI) {
return FIXED_NUM_SGPRS_FOR_INIT_BUG;
IsaVersion Version = getIsaVersion(STI->getCPU());
+ if (Version.Major >= 10)
+ return 106;
if (Version.Major >= 8)
return 102;
return 104;
@@ -308,6 +313,10 @@ unsigned getAddressableNumSGPRs(const MCSubtargetInfo *STI) {
unsigned getMinNumSGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
assert(WavesPerEU != 0);
+ IsaVersion Version = getIsaVersion(STI->getCPU());
+ if (Version.Major >= 10)
+ return 0;
+
if (WavesPerEU >= getMaxWavesPerEU())
return 0;
@@ -322,8 +331,10 @@ unsigned getMaxNumSGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU,
bool Addressable) {
assert(WavesPerEU != 0);
- IsaVersion Version = getIsaVersion(STI->getCPU());
unsigned AddressableNumSGPRs = getAddressableNumSGPRs(STI);
+ IsaVersion Version = getIsaVersion(STI->getCPU());
+ if (Version.Major >= 10)
+ return Addressable ? AddressableNumSGPRs : 108;
if (Version.Major >= 8 && !Addressable)
AddressableNumSGPRs = 112;
unsigned MaxNumSGPRs = getTotalNumSGPRs(STI) / WavesPerEU;
@@ -340,6 +351,9 @@ unsigned getNumExtraSGPRs(const MCSubtargetInfo *STI, bool VCCUsed,
ExtraSGPRs = 2;
IsaVersion Version = getIsaVersion(STI->getCPU());
+ if (Version.Major >= 10)
+ return ExtraSGPRs;
+
if (Version.Major < 8) {
if (FlatScrUsed)
ExtraSGPRs = 4;
@@ -366,12 +380,17 @@ unsigned getNumSGPRBlocks(const MCSubtargetInfo *STI, unsigned NumSGPRs) {
return NumSGPRs / getSGPREncodingGranule(STI) - 1;
}
-unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI) {
- return 4;
+unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI,
+ Optional<bool> EnableWavefrontSize32) {
+ bool IsWave32 = EnableWavefrontSize32 ?
+ *EnableWavefrontSize32 :
+ STI->getFeatureBits().test(FeatureWavefrontSize32);
+ return IsWave32 ? 8 : 4;
}
-unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI) {
- return getVGPRAllocGranule(STI);
+unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI,
+ Optional<bool> EnableWavefrontSize32) {
+ return getVGPRAllocGranule(STI, EnableWavefrontSize32);
}
unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI) {
@@ -402,10 +421,12 @@ unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
return std::min(MaxNumVGPRs, AddressableNumVGPRs);
}
-unsigned getNumVGPRBlocks(const MCSubtargetInfo *STI, unsigned NumVGPRs) {
- NumVGPRs = alignTo(std::max(1u, NumVGPRs), getVGPREncodingGranule(STI));
+unsigned getNumVGPRBlocks(const MCSubtargetInfo *STI, unsigned NumVGPRs,
+ Optional<bool> EnableWavefrontSize32) {
+ NumVGPRs = alignTo(std::max(1u, NumVGPRs),
+ getVGPREncodingGranule(STI, EnableWavefrontSize32));
// VGPRBlocks is actual number of VGPR blocks minus 1.
- return NumVGPRs / getVGPREncodingGranule(STI) - 1;
+ return NumVGPRs / getVGPREncodingGranule(STI, EnableWavefrontSize32) - 1;
}
} // end namespace IsaInfo
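
With FeatureWavefrontSize32 the allocation granule above doubles from 4 to 8 VGPRs, which changes the encoded block count for the same register budget. A rough sketch of the rounding performed by getNumVGPRBlocks, with the granule values restated from this hunk (illustration only):

  #include <cassert>

  // Restates the rounding in getNumVGPRBlocks; the field stores blocks - 1.
  static unsigned numVGPRBlocks(unsigned NumVGPRs, bool IsWave32) {
    unsigned Granule = IsWave32 ? 8 : 4;  // getVGPRAllocGranule
    unsigned Rounded = (NumVGPRs + Granule - 1) / Granule * Granule;
    return Rounded / Granule - 1;
  }

  int main() {
    // 37 VGPRs round up to 40 in both modes but encode differently.
    assert(numVGPRBlocks(37, /*IsWave32=*/false) == 9);  // 40 / 4 - 1
    assert(numVGPRBlocks(37, /*IsWave32=*/true) == 4);   // 40 / 8 - 1
    return 0;
  }
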
@@ -423,7 +444,6 @@ void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
Header.amd_machine_version_minor = Version.Minor;
Header.amd_machine_version_stepping = Version.Stepping;
Header.kernel_code_entry_byte_offset = sizeof(Header);
- // wavefront_size is specified as a power of 2: 2^6 = 64 threads.
Header.wavefront_size = 6;
// If the code object does not support indirect functions, then the value must
@@ -435,11 +455,25 @@ void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
Header.kernarg_segment_alignment = 4;
Header.group_segment_alignment = 4;
Header.private_segment_alignment = 4;
+
+ if (Version.Major >= 10) {
+ if (STI->getFeatureBits().test(FeatureWavefrontSize32)) {
+ Header.wavefront_size = 5;
+ Header.code_properties |= AMD_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
+ }
+ Header.compute_pgm_resource_registers |=
+ S_00B848_WGP_MODE(STI->getFeatureBits().test(FeatureCuMode) ? 0 : 1) |
+ S_00B848_MEM_ORDERED(1);
+ }
}
-amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor() {
+amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor(
+ const MCSubtargetInfo *STI) {
+ IsaVersion Version = getIsaVersion(STI->getCPU());
+
amdhsa::kernel_descriptor_t KD;
memset(&KD, 0, sizeof(KD));
+
AMDHSA_BITS_SET(KD.compute_pgm_rsrc1,
amdhsa::COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64,
amdhsa::FLOAT_DENORM_MODE_FLUSH_NONE);
@@ -449,6 +483,16 @@ amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor() {
amdhsa::COMPUTE_PGM_RSRC1_ENABLE_IEEE_MODE, 1);
AMDHSA_BITS_SET(KD.compute_pgm_rsrc2,
amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X, 1);
+ if (Version.Major >= 10) {
+ AMDHSA_BITS_SET(KD.kernel_code_properties,
+ amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32,
+ STI->getFeatureBits().test(FeatureWavefrontSize32) ? 1 : 0);
+ AMDHSA_BITS_SET(KD.compute_pgm_rsrc1,
+ amdhsa::COMPUTE_PGM_RSRC1_WGP_MODE,
+ STI->getFeatureBits().test(FeatureCuMode) ? 0 : 1);
+ AMDHSA_BITS_SET(KD.compute_pgm_rsrc1,
+ amdhsa::COMPUTE_PGM_RSRC1_MEM_ORDERED, 1);
+ }
return KD;
}
@@ -523,13 +567,14 @@ unsigned getExpcntBitMask(const IsaVersion &Version) {
}
unsigned getLgkmcntBitMask(const IsaVersion &Version) {
- return (1 << getLgkmcntBitWidth()) - 1;
+ return (1 << getLgkmcntBitWidth(Version.Major)) - 1;
}
unsigned getWaitcntBitMask(const IsaVersion &Version) {
unsigned VmcntLo = getBitMask(getVmcntBitShiftLo(), getVmcntBitWidthLo());
unsigned Expcnt = getBitMask(getExpcntBitShift(), getExpcntBitWidth());
- unsigned Lgkmcnt = getBitMask(getLgkmcntBitShift(), getLgkmcntBitWidth());
+ unsigned Lgkmcnt = getBitMask(getLgkmcntBitShift(),
+ getLgkmcntBitWidth(Version.Major));
unsigned Waitcnt = VmcntLo | Expcnt | Lgkmcnt;
if (Version.Major < 9)
return Waitcnt;
@@ -555,7 +600,8 @@ unsigned decodeExpcnt(const IsaVersion &Version, unsigned Waitcnt) {
}
unsigned decodeLgkmcnt(const IsaVersion &Version, unsigned Waitcnt) {
- return unpackBits(Waitcnt, getLgkmcntBitShift(), getLgkmcntBitWidth());
+ return unpackBits(Waitcnt, getLgkmcntBitShift(),
+ getLgkmcntBitWidth(Version.Major));
}
void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt,
@@ -591,7 +637,8 @@ unsigned encodeExpcnt(const IsaVersion &Version, unsigned Waitcnt,
unsigned encodeLgkmcnt(const IsaVersion &Version, unsigned Waitcnt,
unsigned Lgkmcnt) {
- return packBits(Lgkmcnt, Waitcnt, getLgkmcntBitShift(), getLgkmcntBitWidth());
+ return packBits(Lgkmcnt, Waitcnt, getLgkmcntBitShift(),
+ getLgkmcntBitWidth(Version.Major));
}
unsigned encodeWaitcnt(const IsaVersion &Version,
@@ -607,6 +654,181 @@ unsigned encodeWaitcnt(const IsaVersion &Version, const Waitcnt &Decoded) {
return encodeWaitcnt(Version, Decoded.VmCnt, Decoded.ExpCnt, Decoded.LgkmCnt);
}
+//===----------------------------------------------------------------------===//
+// hwreg
+//===----------------------------------------------------------------------===//
+
+namespace Hwreg {
+
+int64_t getHwregId(const StringRef Name) {
+ for (int Id = ID_SYMBOLIC_FIRST_; Id < ID_SYMBOLIC_LAST_; ++Id) {
+ if (IdSymbolic[Id] && Name == IdSymbolic[Id])
+ return Id;
+ }
+ return ID_UNKNOWN_;
+}
+
+static unsigned getLastSymbolicHwreg(const MCSubtargetInfo &STI) {
+ if (isSI(STI) || isCI(STI) || isVI(STI))
+ return ID_SYMBOLIC_FIRST_GFX9_;
+ else if (isGFX9(STI))
+ return ID_SYMBOLIC_FIRST_GFX10_;
+ else
+ return ID_SYMBOLIC_LAST_;
+}
+
+bool isValidHwreg(int64_t Id, const MCSubtargetInfo &STI) {
+ return ID_SYMBOLIC_FIRST_ <= Id && Id < getLastSymbolicHwreg(STI) &&
+ IdSymbolic[Id];
+}
+
+bool isValidHwreg(int64_t Id) {
+ return 0 <= Id && isUInt<ID_WIDTH_>(Id);
+}
+
+bool isValidHwregOffset(int64_t Offset) {
+ return 0 <= Offset && isUInt<OFFSET_WIDTH_>(Offset);
+}
+
+bool isValidHwregWidth(int64_t Width) {
+ return 0 <= (Width - 1) && isUInt<WIDTH_M1_WIDTH_>(Width - 1);
+}
+
+uint64_t encodeHwreg(uint64_t Id, uint64_t Offset, uint64_t Width) {
+ return (Id << ID_SHIFT_) |
+ (Offset << OFFSET_SHIFT_) |
+ ((Width - 1) << WIDTH_M1_SHIFT_);
+}
+
+StringRef getHwreg(unsigned Id, const MCSubtargetInfo &STI) {
+ return isValidHwreg(Id, STI) ? IdSymbolic[Id] : "";
+}
+
+void decodeHwreg(unsigned Val, unsigned &Id, unsigned &Offset, unsigned &Width) {
+ Id = (Val & ID_MASK_) >> ID_SHIFT_;
+ Offset = (Val & OFFSET_MASK_) >> OFFSET_SHIFT_;
+ Width = ((Val & WIDTH_M1_MASK_) >> WIDTH_M1_SHIFT_) + 1;
+}
+
+} // namespace Hwreg
+
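
The Hwreg helpers above pack a register id, a bit offset, and a field width into the 16-bit s_getreg_b32/s_setreg_b32 immediate. The sketch below mirrors encodeHwreg/decodeHwreg with locally defined shifts and widths; the authoritative *_SHIFT_/*_MASK_ constants live in SIDefines.h, so the exact layout used here (id in the low bits, then offset, then width-1) is an assumption for illustration.

  #include <cassert>
  #include <cstdint>

  // Illustrative field layout; the real constants are defined in SIDefines.h.
  constexpr unsigned ID_SHIFT = 0, ID_WIDTH = 6;
  constexpr unsigned OFFSET_SHIFT = 6, OFFSET_WIDTH = 5;
  constexpr unsigned WIDTH_M1_SHIFT = 11, WIDTH_M1_WIDTH = 5;

  static uint64_t encodeHwregExample(uint64_t Id, uint64_t Offset, uint64_t Width) {
    return (Id << ID_SHIFT) | (Offset << OFFSET_SHIFT) | ((Width - 1) << WIDTH_M1_SHIFT);
  }

  static void decodeHwregExample(unsigned Val, unsigned &Id, unsigned &Offset,
                                 unsigned &Width) {
    Id = (Val >> ID_SHIFT) & ((1u << ID_WIDTH) - 1);
    Offset = (Val >> OFFSET_SHIFT) & ((1u << OFFSET_WIDTH) - 1);
    Width = ((Val >> WIDTH_M1_SHIFT) & ((1u << WIDTH_M1_WIDTH) - 1)) + 1;
  }

  int main() {
    unsigned Id, Offset, Width;
    decodeHwregExample(encodeHwregExample(15, 4, 8), Id, Offset, Width); // hwreg(15, 4, 8)
    assert(Id == 15 && Offset == 4 && Width == 8);
    return 0;
  }
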
+//===----------------------------------------------------------------------===//
+// SendMsg
+//===----------------------------------------------------------------------===//
+
+namespace SendMsg {
+
+int64_t getMsgId(const StringRef Name) {
+ for (int i = ID_GAPS_FIRST_; i < ID_GAPS_LAST_; ++i) {
+ if (IdSymbolic[i] && Name == IdSymbolic[i])
+ return i;
+ }
+ return ID_UNKNOWN_;
+}
+
+static bool isValidMsgId(int64_t MsgId) {
+ return (ID_GAPS_FIRST_ <= MsgId && MsgId < ID_GAPS_LAST_) && IdSymbolic[MsgId];
+}
+
+bool isValidMsgId(int64_t MsgId, const MCSubtargetInfo &STI, bool Strict) {
+ if (Strict) {
+ if (MsgId == ID_GS_ALLOC_REQ || MsgId == ID_GET_DOORBELL)
+ return isGFX9(STI) || isGFX10(STI);
+ else
+ return isValidMsgId(MsgId);
+ } else {
+ return 0 <= MsgId && isUInt<ID_WIDTH_>(MsgId);
+ }
+}
+
+StringRef getMsgName(int64_t MsgId) {
+ return isValidMsgId(MsgId)? IdSymbolic[MsgId] : "";
+}
+
+int64_t getMsgOpId(int64_t MsgId, const StringRef Name) {
+ const char* const *S = (MsgId == ID_SYSMSG) ? OpSysSymbolic : OpGsSymbolic;
+ const int F = (MsgId == ID_SYSMSG) ? OP_SYS_FIRST_ : OP_GS_FIRST_;
+ const int L = (MsgId == ID_SYSMSG) ? OP_SYS_LAST_ : OP_GS_LAST_;
+ for (int i = F; i < L; ++i) {
+ if (Name == S[i]) {
+ return i;
+ }
+ }
+ return OP_UNKNOWN_;
+}
+
+bool isValidMsgOp(int64_t MsgId, int64_t OpId, bool Strict) {
+
+ if (!Strict)
+ return 0 <= OpId && isUInt<OP_WIDTH_>(OpId);
+
+ switch(MsgId)
+ {
+ case ID_GS:
+ return (OP_GS_FIRST_ <= OpId && OpId < OP_GS_LAST_) && OpId != OP_GS_NOP;
+ case ID_GS_DONE:
+ return OP_GS_FIRST_ <= OpId && OpId < OP_GS_LAST_;
+ case ID_SYSMSG:
+ return OP_SYS_FIRST_ <= OpId && OpId < OP_SYS_LAST_;
+ default:
+ return OpId == OP_NONE_;
+ }
+}
+
+StringRef getMsgOpName(int64_t MsgId, int64_t OpId) {
+ assert(msgRequiresOp(MsgId));
+ return (MsgId == ID_SYSMSG)? OpSysSymbolic[OpId] : OpGsSymbolic[OpId];
+}
+
+bool isValidMsgStream(int64_t MsgId, int64_t OpId, int64_t StreamId, bool Strict) {
+
+ if (!Strict)
+ return 0 <= StreamId && isUInt<STREAM_ID_WIDTH_>(StreamId);
+
+ switch(MsgId)
+ {
+ case ID_GS:
+ return STREAM_ID_FIRST_ <= StreamId && StreamId < STREAM_ID_LAST_;
+ case ID_GS_DONE:
+ return (OpId == OP_GS_NOP)?
+ (StreamId == STREAM_ID_NONE_) :
+ (STREAM_ID_FIRST_ <= StreamId && StreamId < STREAM_ID_LAST_);
+ default:
+ return StreamId == STREAM_ID_NONE_;
+ }
+}
+
+bool msgRequiresOp(int64_t MsgId) {
+ return MsgId == ID_GS || MsgId == ID_GS_DONE || MsgId == ID_SYSMSG;
+}
+
+bool msgSupportsStream(int64_t MsgId, int64_t OpId) {
+ return (MsgId == ID_GS || MsgId == ID_GS_DONE) && OpId != OP_GS_NOP;
+}
+
+void decodeMsg(unsigned Val,
+ uint16_t &MsgId,
+ uint16_t &OpId,
+ uint16_t &StreamId) {
+ MsgId = Val & ID_MASK_;
+ OpId = (Val & OP_MASK_) >> OP_SHIFT_;
+ StreamId = (Val & STREAM_ID_MASK_) >> STREAM_ID_SHIFT_;
+}
+
+uint64_t encodeMsg(uint64_t MsgId,
+ uint64_t OpId,
+ uint64_t StreamId) {
+ return (MsgId << ID_SHIFT_) |
+ (OpId << OP_SHIFT_) |
+ (StreamId << STREAM_ID_SHIFT_);
+}
+
+} // namespace SendMsg
+
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
unsigned getInitialPSInputAddr(const Function &F) {
return getIntegerAttribute(F, "InitialPSInputAddr", 0);
}
@@ -679,6 +901,10 @@ bool isGFX9(const MCSubtargetInfo &STI) {
return STI.getFeatureBits()[AMDGPU::FeatureGFX9];
}
+bool isGFX10(const MCSubtargetInfo &STI) {
+ return STI.getFeatureBits()[AMDGPU::FeatureGFX10];
+}
+
bool isGCN3Encoding(const MCSubtargetInfo &STI) {
return STI.getFeatureBits()[AMDGPU::FeatureGCN3Encoding];
}
@@ -704,46 +930,46 @@ bool isRegIntersect(unsigned Reg0, unsigned Reg1, const MCRegisterInfo* TRI) {
CASE_CI_VI(FLAT_SCR) \
CASE_CI_VI(FLAT_SCR_LO) \
CASE_CI_VI(FLAT_SCR_HI) \
- CASE_VI_GFX9(TTMP0) \
- CASE_VI_GFX9(TTMP1) \
- CASE_VI_GFX9(TTMP2) \
- CASE_VI_GFX9(TTMP3) \
- CASE_VI_GFX9(TTMP4) \
- CASE_VI_GFX9(TTMP5) \
- CASE_VI_GFX9(TTMP6) \
- CASE_VI_GFX9(TTMP7) \
- CASE_VI_GFX9(TTMP8) \
- CASE_VI_GFX9(TTMP9) \
- CASE_VI_GFX9(TTMP10) \
- CASE_VI_GFX9(TTMP11) \
- CASE_VI_GFX9(TTMP12) \
- CASE_VI_GFX9(TTMP13) \
- CASE_VI_GFX9(TTMP14) \
- CASE_VI_GFX9(TTMP15) \
- CASE_VI_GFX9(TTMP0_TTMP1) \
- CASE_VI_GFX9(TTMP2_TTMP3) \
- CASE_VI_GFX9(TTMP4_TTMP5) \
- CASE_VI_GFX9(TTMP6_TTMP7) \
- CASE_VI_GFX9(TTMP8_TTMP9) \
- CASE_VI_GFX9(TTMP10_TTMP11) \
- CASE_VI_GFX9(TTMP12_TTMP13) \
- CASE_VI_GFX9(TTMP14_TTMP15) \
- CASE_VI_GFX9(TTMP0_TTMP1_TTMP2_TTMP3) \
- CASE_VI_GFX9(TTMP4_TTMP5_TTMP6_TTMP7) \
- CASE_VI_GFX9(TTMP8_TTMP9_TTMP10_TTMP11) \
- CASE_VI_GFX9(TTMP12_TTMP13_TTMP14_TTMP15) \
- CASE_VI_GFX9(TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7) \
- CASE_VI_GFX9(TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11) \
- CASE_VI_GFX9(TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15) \
- CASE_VI_GFX9(TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15) \
+ CASE_VI_GFX9_GFX10(TTMP0) \
+ CASE_VI_GFX9_GFX10(TTMP1) \
+ CASE_VI_GFX9_GFX10(TTMP2) \
+ CASE_VI_GFX9_GFX10(TTMP3) \
+ CASE_VI_GFX9_GFX10(TTMP4) \
+ CASE_VI_GFX9_GFX10(TTMP5) \
+ CASE_VI_GFX9_GFX10(TTMP6) \
+ CASE_VI_GFX9_GFX10(TTMP7) \
+ CASE_VI_GFX9_GFX10(TTMP8) \
+ CASE_VI_GFX9_GFX10(TTMP9) \
+ CASE_VI_GFX9_GFX10(TTMP10) \
+ CASE_VI_GFX9_GFX10(TTMP11) \
+ CASE_VI_GFX9_GFX10(TTMP12) \
+ CASE_VI_GFX9_GFX10(TTMP13) \
+ CASE_VI_GFX9_GFX10(TTMP14) \
+ CASE_VI_GFX9_GFX10(TTMP15) \
+ CASE_VI_GFX9_GFX10(TTMP0_TTMP1) \
+ CASE_VI_GFX9_GFX10(TTMP2_TTMP3) \
+ CASE_VI_GFX9_GFX10(TTMP4_TTMP5) \
+ CASE_VI_GFX9_GFX10(TTMP6_TTMP7) \
+ CASE_VI_GFX9_GFX10(TTMP8_TTMP9) \
+ CASE_VI_GFX9_GFX10(TTMP10_TTMP11) \
+ CASE_VI_GFX9_GFX10(TTMP12_TTMP13) \
+ CASE_VI_GFX9_GFX10(TTMP14_TTMP15) \
+ CASE_VI_GFX9_GFX10(TTMP0_TTMP1_TTMP2_TTMP3) \
+ CASE_VI_GFX9_GFX10(TTMP4_TTMP5_TTMP6_TTMP7) \
+ CASE_VI_GFX9_GFX10(TTMP8_TTMP9_TTMP10_TTMP11) \
+ CASE_VI_GFX9_GFX10(TTMP12_TTMP13_TTMP14_TTMP15) \
+ CASE_VI_GFX9_GFX10(TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7) \
+ CASE_VI_GFX9_GFX10(TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11) \
+ CASE_VI_GFX9_GFX10(TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15) \
+ CASE_VI_GFX9_GFX10(TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15) \
}
#define CASE_CI_VI(node) \
assert(!isSI(STI)); \
case node: return isCI(STI) ? node##_ci : node##_vi;
-#define CASE_VI_GFX9(node) \
- case node: return isGFX9(STI) ? node##_gfx9 : node##_vi;
+#define CASE_VI_GFX9_GFX10(node) \
+ case node: return (isGFX9(STI) || isGFX10(STI)) ? node##_gfx9_gfx10 : node##_vi;
unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) {
if (STI.getTargetTriple().getArch() == Triple::r600)
@@ -752,17 +978,17 @@ unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) {
}
#undef CASE_CI_VI
-#undef CASE_VI_GFX9
+#undef CASE_VI_GFX9_GFX10
#define CASE_CI_VI(node) case node##_ci: case node##_vi: return node;
-#define CASE_VI_GFX9(node) case node##_vi: case node##_gfx9: return node;
+#define CASE_VI_GFX9_GFX10(node) case node##_vi: case node##_gfx9_gfx10: return node;
unsigned mc2PseudoReg(unsigned Reg) {
MAP_REG2REG
}
#undef CASE_CI_VI
-#undef CASE_VI_GFX9
+#undef CASE_VI_GFX9_GFX10
#undef MAP_REG2REG
bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo) {
@@ -779,10 +1005,17 @@ bool isSISrcFPOperand(const MCInstrDesc &Desc, unsigned OpNo) {
case AMDGPU::OPERAND_REG_IMM_FP32:
case AMDGPU::OPERAND_REG_IMM_FP64:
case AMDGPU::OPERAND_REG_IMM_FP16:
+ case AMDGPU::OPERAND_REG_IMM_V2FP16:
+ case AMDGPU::OPERAND_REG_IMM_V2INT16:
case AMDGPU::OPERAND_REG_INLINE_C_FP32:
case AMDGPU::OPERAND_REG_INLINE_C_FP64:
case AMDGPU::OPERAND_REG_INLINE_C_FP16:
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
return true;
default:
return false;
@@ -802,28 +1035,46 @@ unsigned getRegBitWidth(unsigned RCID) {
switch (RCID) {
case AMDGPU::SGPR_32RegClassID:
case AMDGPU::VGPR_32RegClassID:
+ case AMDGPU::VRegOrLds_32RegClassID:
+ case AMDGPU::AGPR_32RegClassID:
case AMDGPU::VS_32RegClassID:
+ case AMDGPU::AV_32RegClassID:
case AMDGPU::SReg_32RegClassID:
case AMDGPU::SReg_32_XM0RegClassID:
+ case AMDGPU::SRegOrLds_32RegClassID:
return 32;
case AMDGPU::SGPR_64RegClassID:
case AMDGPU::VS_64RegClassID:
+ case AMDGPU::AV_64RegClassID:
case AMDGPU::SReg_64RegClassID:
case AMDGPU::VReg_64RegClassID:
+ case AMDGPU::AReg_64RegClassID:
case AMDGPU::SReg_64_XEXECRegClassID:
return 64;
+ case AMDGPU::SGPR_96RegClassID:
+ case AMDGPU::SReg_96RegClassID:
case AMDGPU::VReg_96RegClassID:
return 96;
case AMDGPU::SGPR_128RegClassID:
case AMDGPU::SReg_128RegClassID:
case AMDGPU::VReg_128RegClassID:
+ case AMDGPU::AReg_128RegClassID:
return 128;
+ case AMDGPU::SGPR_160RegClassID:
+ case AMDGPU::SReg_160RegClassID:
+ case AMDGPU::VReg_160RegClassID:
+ return 160;
case AMDGPU::SReg_256RegClassID:
case AMDGPU::VReg_256RegClassID:
return 256;
case AMDGPU::SReg_512RegClassID:
case AMDGPU::VReg_512RegClassID:
+ case AMDGPU::AReg_512RegClassID:
return 512;
+ case AMDGPU::SReg_1024RegClassID:
+ case AMDGPU::VReg_1024RegClassID:
+ case AMDGPU::AReg_1024RegClassID:
+ return 1024;
default:
llvm_unreachable("Unexpected register class");
}
@@ -905,6 +1156,13 @@ bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi) {
bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi) {
assert(HasInv2Pi);
+ if (isInt<16>(Literal) || isUInt<16>(Literal)) {
+ int16_t Trunc = static_cast<int16_t>(Literal);
+ return AMDGPU::isInlinableLiteral16(Trunc, HasInv2Pi);
+ }
+ if (!(Literal & 0xffff))
+ return AMDGPU::isInlinableLiteral16(Literal >> 16, HasInv2Pi);
+
int16_t Lo16 = static_cast<int16_t>(Literal);
int16_t Hi16 = static_cast<int16_t>(Literal >> 16);
return Lo16 == Hi16 && isInlinableLiteral16(Lo16, HasInv2Pi);
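// Worked examples for the packed-16 check above, assuming the usual AMDGPU
// 16-bit inline-constant set (integers -16..64 plus a small set of FP values;
// illustrative, not part of this patch):
//   0x00400040 : both halves are 64           -> inlinable (Lo16 == Hi16 path)
//   0x00000040 : fits isUInt<16>, low half 64 -> inlinable (truncation path)
//   0xC4000000 : low half zero, high is -4.0h -> inlinable (high-half path)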
@@ -936,15 +1194,19 @@ bool isArgPassedInSGPR(const Argument *A) {
}
}
+static bool hasSMEMByteOffset(const MCSubtargetInfo &ST) {
+ return isGCN3Encoding(ST) || isGFX10(ST);
+}
+
int64_t getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset) {
- if (isGCN3Encoding(ST))
+ if (hasSMEMByteOffset(ST))
return ByteOffset;
return ByteOffset >> 2;
}
bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset) {
int64_t EncodedOffset = getSMRDEncodedOffset(ST, ByteOffset);
- return isGCN3Encoding(ST) ?
+ return (hasSMEMByteOffset(ST)) ?
isUInt<20>(EncodedOffset) : isUInt<8>(EncodedOffset);
}
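// Worked example for the two helpers above (offset value illustrative): a byte
// offset of 2048 encodes as 2048 >> 2 == 512 dwords on subtargets without SMEM
// byte offsets, which overflows the 8-bit immediate, but stays 2048 on
// GCN3/GFX10 and fits the 20-bit immediate. With hypothetical subtarget-info
// objects SICI_STI and GFX9_STI:
//   isLegalSMRDImmOffset(SICI_STI, 2048)  -> false (512 > 255)
//   isLegalSMRDImmOffset(GFX9_STI, 2048)  -> true  (2048 < 1 << 20)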
@@ -994,6 +1256,19 @@ bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset,
return true;
}
+SIModeRegisterDefaults::SIModeRegisterDefaults(const Function &F) {
+ *this = getDefaultForCallingConv(F.getCallingConv());
+
+ StringRef IEEEAttr = F.getFnAttribute("amdgpu-ieee").getValueAsString();
+ if (!IEEEAttr.empty())
+ IEEE = IEEEAttr == "true";
+
+ StringRef DX10ClampAttr
+ = F.getFnAttribute("amdgpu-dx10-clamp").getValueAsString();
+ if (!DX10ClampAttr.empty())
+ DX10Clamp = DX10ClampAttr == "true";
+}
+
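// Hypothetical harness (not part of this patch) showing the attribute
// overrides parsed above taking precedence over the calling-convention
// default; assumes this file's declarations plus llvm/IR/Module.h and
// llvm/IR/Function.h are available.
static AMDGPU::SIModeRegisterDefaults exampleModeFromAttributes() {
  LLVMContext Ctx;
  Module M("example", Ctx);
  FunctionType *FTy =
      FunctionType::get(Type::getVoidTy(Ctx), /*isVarArg=*/false);
  Function *F =
      Function::Create(FTy, GlobalValue::ExternalLinkage, "kernel", &M);
  F->setCallingConv(CallingConv::AMDGPU_KERNEL); // compute default: IEEE = true
  F->addFnAttr("amdgpu-ieee", "false");          // attribute overrides it
  F->addFnAttr("amdgpu-dx10-clamp", "true");
  return AMDGPU::SIModeRegisterDefaults(*F); // IEEE == false, DX10Clamp == true
}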
namespace {
struct SourceOfDivergence {
@@ -1009,5 +1284,6 @@ const SourceOfDivergence *lookupSourceOfDivergence(unsigned Intr);
bool isIntrinsicSourceOfDivergence(unsigned IntrID) {
return lookupSourceOfDivergence(IntrID);
}
+
} // namespace AMDGPU
} // namespace llvm
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 20123ed4ac81..209ef7eef749 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1,9 +1,8 @@
//===- AMDGPUBaseInfo.h - Top level definitions for AMDGPU ------*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -46,6 +45,7 @@ namespace AMDGPU {
#define GET_MIMGDim_DECL
#define GET_MIMGEncoding_DECL
#define GET_MIMGLZMapping_DECL
+#define GET_MIMGMIPMapping_DECL
#include "AMDGPUGenSearchableTables.inc"
namespace IsaInfo {
@@ -150,10 +150,18 @@ unsigned getNumExtraSGPRs(const MCSubtargetInfo *STI, bool VCCUsed,
unsigned getNumSGPRBlocks(const MCSubtargetInfo *STI, unsigned NumSGPRs);
/// \returns VGPR allocation granularity for given subtarget \p STI.
-unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI);
+///
+/// For subtargets which support it, \p EnableWavefrontSize32 should match
+/// the ENABLE_WAVEFRONT_SIZE32 kernel descriptor field.
+unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI,
+ Optional<bool> EnableWavefrontSize32 = None);
/// \returns VGPR encoding granularity for given subtarget \p STI.
-unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI);
+///
+/// For subtargets which support it, \p EnableWavefrontSize32 should match
+/// the ENABLE_WAVEFRONT_SIZE32 kernel descriptor field.
+unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI,
+ Optional<bool> EnableWavefrontSize32 = None);
/// \returns Total number of VGPRs for given subtarget \p STI.
unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI);
@@ -171,13 +179,20 @@ unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU);
/// \returns Number of VGPR blocks needed for given subtarget \p STI when
/// \p NumVGPRs are used.
-unsigned getNumVGPRBlocks(const MCSubtargetInfo *STI, unsigned NumSGPRs);
+///
+/// For subtargets which support it, \p EnableWavefrontSize32 should match the
+/// ENABLE_WAVEFRONT_SIZE32 kernel descriptor field.
+unsigned getNumVGPRBlocks(const MCSubtargetInfo *STI, unsigned NumSGPRs,
+ Optional<bool> EnableWavefrontSize32 = None);
} // end namespace IsaInfo
LLVM_READONLY
int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx);
+LLVM_READONLY
+int getSOPPWithRelaxation(uint16_t Opcode);
+
struct MIMGBaseOpcodeInfo {
MIMGBaseOpcode BaseOpcode;
bool Store;
@@ -201,26 +216,53 @@ struct MIMGDimInfo {
uint8_t NumCoords;
uint8_t NumGradients;
bool DA;
+ uint8_t Encoding;
+ const char *AsmSuffix;
};
LLVM_READONLY
-const MIMGDimInfo *getMIMGDimInfo(unsigned Dim);
+const MIMGDimInfo *getMIMGDimInfo(unsigned DimEnum);
+
+LLVM_READONLY
+const MIMGDimInfo *getMIMGDimInfoByEncoding(uint8_t DimEnc);
+
+LLVM_READONLY
+const MIMGDimInfo *getMIMGDimInfoByAsmSuffix(StringRef AsmSuffix);
struct MIMGLZMappingInfo {
MIMGBaseOpcode L;
MIMGBaseOpcode LZ;
};
+struct MIMGMIPMappingInfo {
+ MIMGBaseOpcode MIP;
+ MIMGBaseOpcode NONMIP;
+};
+
LLVM_READONLY
const MIMGLZMappingInfo *getMIMGLZMappingInfo(unsigned L);
LLVM_READONLY
+const MIMGMIPMappingInfo *getMIMGMIPMappingInfo(unsigned L);
+
+LLVM_READONLY
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding,
unsigned VDataDwords, unsigned VAddrDwords);
LLVM_READONLY
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels);
+struct MIMGInfo {
+ uint16_t Opcode;
+ uint16_t BaseOpcode;
+ uint8_t MIMGEncoding;
+ uint8_t VDataDwords;
+ uint8_t VAddrDwords;
+};
+
+LLVM_READONLY
+const MIMGInfo *getMIMGInfo(unsigned Opc);
+
LLVM_READONLY
int getMUBUFBaseOpcode(unsigned Opc);
@@ -245,7 +287,8 @@ int getMCOpcode(uint16_t Opcode, unsigned Gen);
void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
const MCSubtargetInfo *STI);
-amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor();
+amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor(
+ const MCSubtargetInfo *STI);
bool isGroupSegment(const GlobalValue *GV);
bool isGlobalSegment(const GlobalValue *GV);
@@ -285,21 +328,30 @@ struct Waitcnt {
unsigned VmCnt = ~0u;
unsigned ExpCnt = ~0u;
unsigned LgkmCnt = ~0u;
+ unsigned VsCnt = ~0u;
Waitcnt() {}
- Waitcnt(unsigned VmCnt, unsigned ExpCnt, unsigned LgkmCnt)
- : VmCnt(VmCnt), ExpCnt(ExpCnt), LgkmCnt(LgkmCnt) {}
+ Waitcnt(unsigned VmCnt, unsigned ExpCnt, unsigned LgkmCnt, unsigned VsCnt)
+ : VmCnt(VmCnt), ExpCnt(ExpCnt), LgkmCnt(LgkmCnt), VsCnt(VsCnt) {}
+
+ static Waitcnt allZero(const IsaVersion &Version) {
+ return Waitcnt(0, 0, 0, Version.Major >= 10 ? 0 : ~0u);
+ }
+ static Waitcnt allZeroExceptVsCnt() { return Waitcnt(0, 0, 0, ~0u); }
- static Waitcnt allZero() { return Waitcnt(0, 0, 0); }
+ bool hasWait() const {
+ return VmCnt != ~0u || ExpCnt != ~0u || LgkmCnt != ~0u || VsCnt != ~0u;
+ }
bool dominates(const Waitcnt &Other) const {
return VmCnt <= Other.VmCnt && ExpCnt <= Other.ExpCnt &&
- LgkmCnt <= Other.LgkmCnt;
+ LgkmCnt <= Other.LgkmCnt && VsCnt <= Other.VsCnt;
}
Waitcnt combined(const Waitcnt &Other) const {
return Waitcnt(std::min(VmCnt, Other.VmCnt), std::min(ExpCnt, Other.ExpCnt),
- std::min(LgkmCnt, Other.LgkmCnt));
+ std::min(LgkmCnt, Other.LgkmCnt),
+ std::min(VsCnt, Other.VsCnt));
}
};
@@ -332,7 +384,8 @@ unsigned decodeLgkmcnt(const IsaVersion &Version, unsigned Waitcnt);
/// \p Vmcnt = \p Waitcnt[3:0] (pre-gfx9 only)
/// \p Vmcnt = \p Waitcnt[3:0] | \p Waitcnt[15:14] (gfx9+ only)
/// \p Expcnt = \p Waitcnt[6:4]
-/// \p Lgkmcnt = \p Waitcnt[11:8]
+/// \p Lgkmcnt = \p Waitcnt[11:8] (pre-gfx10 only)
+/// \p Lgkmcnt = \p Waitcnt[13:8] (gfx10+ only)
void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt,
unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt);
@@ -357,7 +410,8 @@ unsigned encodeLgkmcnt(const IsaVersion &Version, unsigned Waitcnt,
/// Waitcnt[3:0] = \p Vmcnt (pre-gfx9 only)
/// Waitcnt[3:0] = \p Vmcnt[3:0] (gfx9+ only)
/// Waitcnt[6:4] = \p Expcnt
-/// Waitcnt[11:8] = \p Lgkmcnt
+/// Waitcnt[11:8] = \p Lgkmcnt (pre-gfx10 only)
+/// Waitcnt[13:8] = \p Lgkmcnt (gfx10+ only)
/// Waitcnt[15:14] = \p Vmcnt[5:4] (gfx9+ only)
///
/// \returns Waitcnt with encoded \p Vmcnt, \p Expcnt and \p Lgkmcnt for given
@@ -367,6 +421,75 @@ unsigned encodeWaitcnt(const IsaVersion &Version,
unsigned encodeWaitcnt(const IsaVersion &Version, const Waitcnt &Decoded);
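// A minimal hand-rolled sketch of the gfx9 bit layout documented above (field
// packing only; real users should call encodeWaitcnt()). The helper name is
// invented for illustration.
static unsigned packWaitcntGfx9(unsigned Vmcnt, unsigned Expcnt,
                                unsigned Lgkmcnt) {
  unsigned Enc = 0;
  Enc |= Vmcnt & 0xf;                // Waitcnt[3:0]   = Vmcnt[3:0]
  Enc |= ((Vmcnt >> 4) & 0x3) << 14; // Waitcnt[15:14] = Vmcnt[5:4] (gfx9+)
  Enc |= (Expcnt & 0x7) << 4;        // Waitcnt[6:4]   = Expcnt
  Enc |= (Lgkmcnt & 0xf) << 8;       // Waitcnt[11:8]  = Lgkmcnt (pre-gfx10)
  return Enc;
}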
+namespace Hwreg {
+
+LLVM_READONLY
+int64_t getHwregId(const StringRef Name);
+
+LLVM_READNONE
+bool isValidHwreg(int64_t Id, const MCSubtargetInfo &STI);
+
+LLVM_READNONE
+bool isValidHwreg(int64_t Id);
+
+LLVM_READNONE
+bool isValidHwregOffset(int64_t Offset);
+
+LLVM_READNONE
+bool isValidHwregWidth(int64_t Width);
+
+LLVM_READNONE
+uint64_t encodeHwreg(uint64_t Id, uint64_t Offset, uint64_t Width);
+
+LLVM_READNONE
+StringRef getHwreg(unsigned Id, const MCSubtargetInfo &STI);
+
+void decodeHwreg(unsigned Val, unsigned &Id, unsigned &Offset, unsigned &Width);
+
+} // namespace Hwreg
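// Assumed usage sketch (not from this patch): encodeHwreg() packs an
// (id, offset, width) triple into the immediate used by s_getreg_b32 and
// s_setreg_b32, and decodeHwreg() is expected to invert it; the helper below
// and its name are illustrative only.
static bool hwregRoundTrips(uint64_t Id, uint64_t Offset, uint64_t Width) {
  unsigned DecId, DecOffset, DecWidth;
  Hwreg::decodeHwreg(static_cast<unsigned>(Hwreg::encodeHwreg(Id, Offset, Width)),
                     DecId, DecOffset, DecWidth);
  return DecId == Id && DecOffset == Offset && DecWidth == Width;
}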
+
+namespace SendMsg {
+
+LLVM_READONLY
+int64_t getMsgId(const StringRef Name);
+
+LLVM_READONLY
+int64_t getMsgOpId(int64_t MsgId, const StringRef Name);
+
+LLVM_READNONE
+StringRef getMsgName(int64_t MsgId);
+
+LLVM_READNONE
+StringRef getMsgOpName(int64_t MsgId, int64_t OpId);
+
+LLVM_READNONE
+bool isValidMsgId(int64_t MsgId, const MCSubtargetInfo &STI, bool Strict = true);
+
+LLVM_READNONE
+bool isValidMsgOp(int64_t MsgId, int64_t OpId, bool Strict = true);
+
+LLVM_READNONE
+bool isValidMsgStream(int64_t MsgId, int64_t OpId, int64_t StreamId, bool Strict = true);
+
+LLVM_READNONE
+bool msgRequiresOp(int64_t MsgId);
+
+LLVM_READNONE
+bool msgSupportsStream(int64_t MsgId, int64_t OpId);
+
+void decodeMsg(unsigned Val,
+ uint16_t &MsgId,
+ uint16_t &OpId,
+ uint16_t &StreamId);
+
+LLVM_READNONE
+uint64_t encodeMsg(uint64_t MsgId,
+ uint64_t OpId,
+ uint64_t StreamId);
+
+} // namespace SendMsg
+
+
unsigned getInitialPSInputAddr(const Function &F);
LLVM_READNONE
@@ -399,6 +522,7 @@ bool isSI(const MCSubtargetInfo &STI);
bool isCI(const MCSubtargetInfo &STI);
bool isVI(const MCSubtargetInfo &STI);
bool isGFX9(const MCSubtargetInfo &STI);
+bool isGFX10(const MCSubtargetInfo &STI);
/// Is Reg - scalar register
bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI);
@@ -440,6 +564,8 @@ inline unsigned getOperandSize(const MCOperandInfo &OpInfo) {
case AMDGPU::OPERAND_REG_IMM_FP32:
case AMDGPU::OPERAND_REG_INLINE_C_INT32:
case AMDGPU::OPERAND_REG_INLINE_C_FP32:
+ case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
return 4;
case AMDGPU::OPERAND_REG_IMM_INT64:
@@ -454,6 +580,12 @@ inline unsigned getOperandSize(const MCOperandInfo &OpInfo) {
case AMDGPU::OPERAND_REG_INLINE_C_FP16:
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16:
+ case AMDGPU::OPERAND_REG_IMM_V2INT16:
+ case AMDGPU::OPERAND_REG_IMM_V2FP16:
return 2;
default:
@@ -496,6 +628,45 @@ bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset,
/// \returns true if the intrinsic is divergent
bool isIntrinsicSourceOfDivergence(unsigned IntrID);
+
+// Track defaults for fields in the MODE register.
+struct SIModeRegisterDefaults {
+  /// When set, floating point opcodes that support exception flag gathering
+  /// quiet and propagate signaling NaN inputs per IEEE 754-2008, and min_dx10
+  /// and max_dx10 become IEEE 754-2008 compliant through that signaling NaN
+  /// propagation and quieting.
+ bool IEEE : 1;
+
+ /// Used by the vector ALU to force DX10-style treatment of NaNs: when set,
+ /// clamp NaN to zero; otherwise, pass NaN through.
+ bool DX10Clamp : 1;
+
+ // TODO: FP mode fields
+
+ SIModeRegisterDefaults() :
+ IEEE(true),
+ DX10Clamp(true) {}
+
+ SIModeRegisterDefaults(const Function &F);
+
+ static SIModeRegisterDefaults getDefaultForCallingConv(CallingConv::ID CC) {
+ SIModeRegisterDefaults Mode;
+ Mode.DX10Clamp = true;
+ Mode.IEEE = AMDGPU::isCompute(CC);
+ return Mode;
+ }
+
+ bool operator ==(const SIModeRegisterDefaults Other) const {
+ return IEEE == Other.IEEE && DX10Clamp == Other.DX10Clamp;
+ }
+
+ // FIXME: Inlining should be OK for dx10-clamp, since the caller's mode should
+ // be able to override.
+ bool isInlineCompatible(SIModeRegisterDefaults CalleeMode) const {
+ return *this == CalleeMode;
+ }
+};
+
} // end namespace AMDGPU
} // end namespace llvm
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
new file mode 100644
index 000000000000..db20d5ccf5f9
--- /dev/null
+++ b/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
@@ -0,0 +1,723 @@
+//===-- AMDGPUPALMetadata.cpp - Accumulate and print AMDGPU PAL metadata -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+///
+/// This class has methods called by AMDGPUAsmPrinter to accumulate and print
+/// the PAL metadata.
+//
+//===----------------------------------------------------------------------===//
+//
+
+#include "AMDGPUPALMetadata.h"
+#include "AMDGPU.h"
+#include "AMDGPUAsmPrinter.h"
+#include "MCTargetDesc/AMDGPUTargetStreamer.h"
+#include "SIDefines.h"
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/Support/AMDGPUMetadata.h"
+#include "llvm/Support/EndianStream.h"
+
+using namespace llvm;
+using namespace llvm::AMDGPU;
+
+// Read the PAL metadata from IR metadata, where it was put by the frontend.
+void AMDGPUPALMetadata::readFromIR(Module &M) {
+ auto NamedMD = M.getNamedMetadata("amdgpu.pal.metadata.msgpack");
+ if (NamedMD && NamedMD->getNumOperands()) {
+ // This is the new msgpack format for metadata. It is a NamedMD containing
+ // an MDTuple containing an MDString containing the msgpack data.
+ BlobType = ELF::NT_AMDGPU_METADATA;
+ auto MDN = dyn_cast<MDTuple>(NamedMD->getOperand(0));
+ if (MDN && MDN->getNumOperands()) {
+ if (auto MDS = dyn_cast<MDString>(MDN->getOperand(0)))
+ setFromMsgPackBlob(MDS->getString());
+ }
+ return;
+ }
+ BlobType = ELF::NT_AMD_AMDGPU_PAL_METADATA;
+ NamedMD = M.getNamedMetadata("amdgpu.pal.metadata");
+ if (!NamedMD || !NamedMD->getNumOperands())
+ return;
+ // This is the old reg=value pair format for metadata. It is a NamedMD
+ // containing an MDTuple containing a number of MDNodes each of which is an
+  // integer value, and each pair of integer values forms a key=value pair that we
+ // store as Registers[key]=value in the map.
+ auto Tuple = dyn_cast<MDTuple>(NamedMD->getOperand(0));
+ if (!Tuple)
+ return;
+ for (unsigned I = 0, E = Tuple->getNumOperands() & -2; I != E; I += 2) {
+ auto Key = mdconst::dyn_extract<ConstantInt>(Tuple->getOperand(I));
+ auto Val = mdconst::dyn_extract<ConstantInt>(Tuple->getOperand(I + 1));
+ if (!Key || !Val)
+ continue;
+ setRegister(Key->getZExtValue(), Val->getZExtValue());
+ }
+}
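// Hypothetical frontend-side sketch (not from this patch) of emitting the
// legacy reg=value named metadata that readFromIR() parses above; the register
// number and value are illustrative, and llvm/IR/Module.h and
// llvm/IR/Metadata.h are assumed to be available.
static void addLegacyPalMetadata(Module &M) {
  LLVMContext &Ctx = M.getContext();
  Type *I32 = Type::getInt32Ty(Ctx);
  Metadata *Ops[] = {
      ConstantAsMetadata::get(ConstantInt::get(I32, 0x2e12)),      // COMPUTE_PGM_RSRC1
      ConstantAsMetadata::get(ConstantInt::get(I32, 0x002c0041))}; // its value
  M.getOrInsertNamedMetadata("amdgpu.pal.metadata")
      ->addOperand(MDTuple::get(Ctx, Ops));
}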
+
+// Set PAL metadata from a binary blob from the applicable .note record.
+// Returns false if bad format. Blob must remain valid for the lifetime of the
+// Metadata.
+bool AMDGPUPALMetadata::setFromBlob(unsigned Type, StringRef Blob) {
+ BlobType = Type;
+ if (Type == ELF::NT_AMD_AMDGPU_PAL_METADATA)
+ return setFromLegacyBlob(Blob);
+ return setFromMsgPackBlob(Blob);
+}
+
+// Set PAL metadata from legacy (array of key=value pairs) blob.
+bool AMDGPUPALMetadata::setFromLegacyBlob(StringRef Blob) {
+ auto Data = reinterpret_cast<const uint32_t *>(Blob.data());
+ for (unsigned I = 0; I != Blob.size() / sizeof(uint32_t) / 2; ++I)
+ setRegister(Data[I * 2], Data[I * 2 + 1]);
+ return true;
+}
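// Usage sketch of the legacy layout parsed above: consecutive 32-bit key/value
// words, read with host byte order by the reinterpret_cast. The register
// number and value are illustrative only, as is the helper name.
static void setFromExampleLegacyBlob(AMDGPUPALMetadata &MD) {
  static const uint32_t Words[] = {0x2e12 /* COMPUTE_PGM_RSRC1 */, 0x002c0041};
  MD.setFromBlob(ELF::NT_AMD_AMDGPU_PAL_METADATA,
                 StringRef(reinterpret_cast<const char *>(Words),
                           sizeof(Words)));
  // MD.getRegister(0x2e12) now returns 0x002c0041.
}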
+
+// Set PAL metadata from msgpack blob.
+bool AMDGPUPALMetadata::setFromMsgPackBlob(StringRef Blob) {
+ msgpack::Reader Reader(Blob);
+ return MsgPackDoc.readFromBlob(Blob, /*Multi=*/false);
+}
+
+// Given the calling convention, calculate the register number for rsrc1. In
+// principle the register number could change in future hardware, but we know
+// it is the same for gfx6-9 (except that LS and ES don't exist on gfx9), so
+// we can use fixed values.
+static unsigned getRsrc1Reg(CallingConv::ID CC) {
+ switch (CC) {
+ default:
+ return PALMD::R_2E12_COMPUTE_PGM_RSRC1;
+ case CallingConv::AMDGPU_LS:
+ return PALMD::R_2D4A_SPI_SHADER_PGM_RSRC1_LS;
+ case CallingConv::AMDGPU_HS:
+ return PALMD::R_2D0A_SPI_SHADER_PGM_RSRC1_HS;
+ case CallingConv::AMDGPU_ES:
+ return PALMD::R_2CCA_SPI_SHADER_PGM_RSRC1_ES;
+ case CallingConv::AMDGPU_GS:
+ return PALMD::R_2C8A_SPI_SHADER_PGM_RSRC1_GS;
+ case CallingConv::AMDGPU_VS:
+ return PALMD::R_2C4A_SPI_SHADER_PGM_RSRC1_VS;
+ case CallingConv::AMDGPU_PS:
+ return PALMD::R_2C0A_SPI_SHADER_PGM_RSRC1_PS;
+ }
+}
+
+// Calculate the PAL metadata key for *S_SCRATCH_SIZE. It can be used
+// with a constant offset to access any non-register shader-specific PAL
+// metadata key.
+static unsigned getScratchSizeKey(CallingConv::ID CC) {
+ switch (CC) {
+ case CallingConv::AMDGPU_PS:
+ return PALMD::Key::PS_SCRATCH_SIZE;
+ case CallingConv::AMDGPU_VS:
+ return PALMD::Key::VS_SCRATCH_SIZE;
+ case CallingConv::AMDGPU_GS:
+ return PALMD::Key::GS_SCRATCH_SIZE;
+ case CallingConv::AMDGPU_ES:
+ return PALMD::Key::ES_SCRATCH_SIZE;
+ case CallingConv::AMDGPU_HS:
+ return PALMD::Key::HS_SCRATCH_SIZE;
+ case CallingConv::AMDGPU_LS:
+ return PALMD::Key::LS_SCRATCH_SIZE;
+ default:
+ return PALMD::Key::CS_SCRATCH_SIZE;
+ }
+}
+
+// Set the rsrc1 register in the metadata for a particular shader stage.
+// In fact this ORs the value into any previous setting of the register.
+void AMDGPUPALMetadata::setRsrc1(CallingConv::ID CC, unsigned Val) {
+ setRegister(getRsrc1Reg(CC), Val);
+}
+
+// Set the rsrc2 register in the metadata for a particular shader stage.
+// In fact this ORs the value into any previous setting of the register.
+void AMDGPUPALMetadata::setRsrc2(CallingConv::ID CC, unsigned Val) {
+ setRegister(getRsrc1Reg(CC) + 1, Val);
+}
+
+// Set the SPI_PS_INPUT_ENA register in the metadata.
+// In fact this ORs the value into any previous setting of the register.
+void AMDGPUPALMetadata::setSpiPsInputEna(unsigned Val) {
+ setRegister(PALMD::R_A1B3_SPI_PS_INPUT_ENA, Val);
+}
+
+// Set the SPI_PS_INPUT_ADDR register in the metadata.
+// In fact this ORs the value into any previous setting of the register.
+void AMDGPUPALMetadata::setSpiPsInputAddr(unsigned Val) {
+ setRegister(PALMD::R_A1B4_SPI_PS_INPUT_ADDR, Val);
+}
+
+// Get a register from the metadata, or 0 if not currently set.
+unsigned AMDGPUPALMetadata::getRegister(unsigned Reg) {
+ auto Regs = getRegisters();
+ auto It = Regs.find(MsgPackDoc.getNode(Reg));
+ if (It == Regs.end())
+ return 0;
+ auto N = It->second;
+ if (N.getKind() != msgpack::Type::UInt)
+ return 0;
+ return N.getUInt();
+}
+
+// Set a register in the metadata.
+// In fact this ORs the value into any previous setting of the register.
+void AMDGPUPALMetadata::setRegister(unsigned Reg, unsigned Val) {
+ if (!isLegacy()) {
+    // In the new MsgPack format, ignore registers numbered >= 0x10000000.
+    // They are PAL ABI pseudo-registers in the old non-MsgPack format.
+ if (Reg >= 0x10000000)
+ return;
+ }
+ auto &N = getRegisters()[MsgPackDoc.getNode(Reg)];
+ if (N.getKind() == msgpack::Type::UInt)
+ Val |= N.getUInt();
+ N = N.getDocument()->getNode(Val);
+}
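// Brief sketch of the OR-accumulation described above (the register number and
// values are illustrative):
static void orAccumulationExample(AMDGPUPALMetadata &MD) {
  MD.setRegister(0x2c0a, 0x1); // SPI_SHADER_PGM_RSRC1_PS
  MD.setRegister(0x2c0a, 0x40);
  // MD.getRegister(0x2c0a) now yields 0x41: the new value was ORed in.
}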
+
+// Set the entry point name for one shader.
+void AMDGPUPALMetadata::setEntryPoint(unsigned CC, StringRef Name) {
+ if (isLegacy())
+ return;
+ // Msgpack format.
+ getHwStage(CC)[".entry_point"] = MsgPackDoc.getNode(Name, /*Copy=*/true);
+}
+
+// Set the number of used vgprs in the metadata. This is an optional
+// advisory record for logging etc; wave dispatch actually uses the rsrc1
+// register for the shader stage to determine the number of vgprs to
+// allocate.
+void AMDGPUPALMetadata::setNumUsedVgprs(CallingConv::ID CC, unsigned Val) {
+ if (isLegacy()) {
+ // Old non-msgpack format.
+ unsigned NumUsedVgprsKey = getScratchSizeKey(CC) +
+ PALMD::Key::VS_NUM_USED_VGPRS -
+ PALMD::Key::VS_SCRATCH_SIZE;
+ setRegister(NumUsedVgprsKey, Val);
+ return;
+ }
+ // Msgpack format.
+ getHwStage(CC)[".vgpr_count"] = MsgPackDoc.getNode(Val);
+}
+
+// Set the number of used sgprs in the metadata. This is an optional advisory
+// record for logging etc; wave dispatch actually uses the rsrc1 register for
+// the shader stage to determine the number of sgprs to allocate.
+void AMDGPUPALMetadata::setNumUsedSgprs(CallingConv::ID CC, unsigned Val) {
+ if (isLegacy()) {
+ // Old non-msgpack format.
+ unsigned NumUsedSgprsKey = getScratchSizeKey(CC) +
+ PALMD::Key::VS_NUM_USED_SGPRS -
+ PALMD::Key::VS_SCRATCH_SIZE;
+ setRegister(NumUsedSgprsKey, Val);
+ return;
+ }
+ // Msgpack format.
+ getHwStage(CC)[".sgpr_count"] = MsgPackDoc.getNode(Val);
+}
+
+// Set the scratch size in the metadata.
+void AMDGPUPALMetadata::setScratchSize(CallingConv::ID CC, unsigned Val) {
+ if (isLegacy()) {
+ // Old non-msgpack format.
+ setRegister(getScratchSizeKey(CC), Val);
+ return;
+ }
+ // Msgpack format.
+ getHwStage(CC)[".scratch_memory_size"] = MsgPackDoc.getNode(Val);
+}
+
+// Set the hardware register bit in PAL metadata to enable wave32 on the
+// shader of the given calling convention.
+void AMDGPUPALMetadata::setWave32(unsigned CC) {
+ switch (CC) {
+ case CallingConv::AMDGPU_HS:
+ setRegister(PALMD::R_A2D5_VGT_SHADER_STAGES_EN, S_028B54_HS_W32_EN(1));
+ break;
+ case CallingConv::AMDGPU_GS:
+ setRegister(PALMD::R_A2D5_VGT_SHADER_STAGES_EN, S_028B54_GS_W32_EN(1));
+ break;
+ case CallingConv::AMDGPU_VS:
+ setRegister(PALMD::R_A2D5_VGT_SHADER_STAGES_EN, S_028B54_VS_W32_EN(1));
+ break;
+ case CallingConv::AMDGPU_PS:
+ setRegister(PALMD::R_A1B6_SPI_PS_IN_CONTROL, S_0286D8_PS_W32_EN(1));
+ break;
+ case CallingConv::AMDGPU_CS:
+ setRegister(PALMD::R_2E00_COMPUTE_DISPATCH_INITIATOR,
+ S_00B800_CS_W32_EN(1));
+ break;
+ }
+}
+
+// Convert a register number to name, for display by toString().
+// Returns nullptr if none.
+static const char *getRegisterName(unsigned RegNum) {
+ // Table of registers.
+ static const struct RegInfo {
+ unsigned Num;
+ const char *Name;
+ } RegInfoTable[] = {
+ // Registers that code generation sets/modifies metadata for.
+ {PALMD::R_2C4A_SPI_SHADER_PGM_RSRC1_VS, "SPI_SHADER_PGM_RSRC1_VS"},
+ {PALMD::R_2C4A_SPI_SHADER_PGM_RSRC1_VS + 1, "SPI_SHADER_PGM_RSRC2_VS"},
+ {PALMD::R_2D4A_SPI_SHADER_PGM_RSRC1_LS, "SPI_SHADER_PGM_RSRC1_LS"},
+ {PALMD::R_2D4A_SPI_SHADER_PGM_RSRC1_LS + 1, "SPI_SHADER_PGM_RSRC2_LS"},
+ {PALMD::R_2D0A_SPI_SHADER_PGM_RSRC1_HS, "SPI_SHADER_PGM_RSRC1_HS"},
+ {PALMD::R_2D0A_SPI_SHADER_PGM_RSRC1_HS + 1, "SPI_SHADER_PGM_RSRC2_HS"},
+ {PALMD::R_2CCA_SPI_SHADER_PGM_RSRC1_ES, "SPI_SHADER_PGM_RSRC1_ES"},
+ {PALMD::R_2CCA_SPI_SHADER_PGM_RSRC1_ES + 1, "SPI_SHADER_PGM_RSRC2_ES"},
+ {PALMD::R_2C8A_SPI_SHADER_PGM_RSRC1_GS, "SPI_SHADER_PGM_RSRC1_GS"},
+ {PALMD::R_2C8A_SPI_SHADER_PGM_RSRC1_GS + 1, "SPI_SHADER_PGM_RSRC2_GS"},
+ {PALMD::R_2E00_COMPUTE_DISPATCH_INITIATOR, "COMPUTE_DISPATCH_INITIATOR"},
+ {PALMD::R_2E12_COMPUTE_PGM_RSRC1, "COMPUTE_PGM_RSRC1"},
+ {PALMD::R_2E12_COMPUTE_PGM_RSRC1 + 1, "COMPUTE_PGM_RSRC2"},
+ {PALMD::R_2C0A_SPI_SHADER_PGM_RSRC1_PS, "SPI_SHADER_PGM_RSRC1_PS"},
+ {PALMD::R_2C0A_SPI_SHADER_PGM_RSRC1_PS + 1, "SPI_SHADER_PGM_RSRC2_PS"},
+ {PALMD::R_A1B3_SPI_PS_INPUT_ENA, "SPI_PS_INPUT_ENA"},
+ {PALMD::R_A1B4_SPI_PS_INPUT_ADDR, "SPI_PS_INPUT_ADDR"},
+ {PALMD::R_A1B6_SPI_PS_IN_CONTROL, "SPI_PS_IN_CONTROL"},
+ {PALMD::R_A2D5_VGT_SHADER_STAGES_EN, "VGT_SHADER_STAGES_EN"},
+
+ // Registers not known to code generation.
+ {0x2c07, "SPI_SHADER_PGM_RSRC3_PS"},
+ {0x2c46, "SPI_SHADER_PGM_RSRC3_VS"},
+ {0x2c87, "SPI_SHADER_PGM_RSRC3_GS"},
+ {0x2cc7, "SPI_SHADER_PGM_RSRC3_ES"},
+ {0x2d07, "SPI_SHADER_PGM_RSRC3_HS"},
+ {0x2d47, "SPI_SHADER_PGM_RSRC3_LS"},
+
+ {0xa1c3, "SPI_SHADER_POS_FORMAT"},
+ {0xa1b1, "SPI_VS_OUT_CONFIG"},
+ {0xa207, "PA_CL_VS_OUT_CNTL"},
+ {0xa204, "PA_CL_CLIP_CNTL"},
+ {0xa206, "PA_CL_VTE_CNTL"},
+ {0xa2f9, "PA_SU_VTX_CNTL"},
+ {0xa293, "PA_SC_MODE_CNTL_1"},
+ {0xa2a1, "VGT_PRIMITIVEID_EN"},
+ {0x2c81, "SPI_SHADER_PGM_RSRC4_GS"},
+ {0x2e18, "COMPUTE_TMPRING_SIZE"},
+ {0xa1b5, "SPI_INTERP_CONTROL_0"},
+ {0xa1ba, "SPI_TMPRING_SIZE"},
+ {0xa1c4, "SPI_SHADER_Z_FORMAT"},
+ {0xa1c5, "SPI_SHADER_COL_FORMAT"},
+ {0xa203, "DB_SHADER_CONTROL"},
+ {0xa08f, "CB_SHADER_MASK"},
+ {0xa191, "SPI_PS_INPUT_CNTL_0"},
+ {0xa192, "SPI_PS_INPUT_CNTL_1"},
+ {0xa193, "SPI_PS_INPUT_CNTL_2"},
+ {0xa194, "SPI_PS_INPUT_CNTL_3"},
+ {0xa195, "SPI_PS_INPUT_CNTL_4"},
+ {0xa196, "SPI_PS_INPUT_CNTL_5"},
+ {0xa197, "SPI_PS_INPUT_CNTL_6"},
+ {0xa198, "SPI_PS_INPUT_CNTL_7"},
+ {0xa199, "SPI_PS_INPUT_CNTL_8"},
+ {0xa19a, "SPI_PS_INPUT_CNTL_9"},
+ {0xa19b, "SPI_PS_INPUT_CNTL_10"},
+ {0xa19c, "SPI_PS_INPUT_CNTL_11"},
+ {0xa19d, "SPI_PS_INPUT_CNTL_12"},
+ {0xa19e, "SPI_PS_INPUT_CNTL_13"},
+ {0xa19f, "SPI_PS_INPUT_CNTL_14"},
+ {0xa1a0, "SPI_PS_INPUT_CNTL_15"},
+ {0xa1a1, "SPI_PS_INPUT_CNTL_16"},
+ {0xa1a2, "SPI_PS_INPUT_CNTL_17"},
+ {0xa1a3, "SPI_PS_INPUT_CNTL_18"},
+ {0xa1a4, "SPI_PS_INPUT_CNTL_19"},
+ {0xa1a5, "SPI_PS_INPUT_CNTL_20"},
+ {0xa1a6, "SPI_PS_INPUT_CNTL_21"},
+ {0xa1a7, "SPI_PS_INPUT_CNTL_22"},
+ {0xa1a8, "SPI_PS_INPUT_CNTL_23"},
+ {0xa1a9, "SPI_PS_INPUT_CNTL_24"},
+ {0xa1aa, "SPI_PS_INPUT_CNTL_25"},
+ {0xa1ab, "SPI_PS_INPUT_CNTL_26"},
+ {0xa1ac, "SPI_PS_INPUT_CNTL_27"},
+ {0xa1ad, "SPI_PS_INPUT_CNTL_28"},
+ {0xa1ae, "SPI_PS_INPUT_CNTL_29"},
+ {0xa1af, "SPI_PS_INPUT_CNTL_30"},
+ {0xa1b0, "SPI_PS_INPUT_CNTL_31"},
+
+ {0xa2ce, "VGT_GS_MAX_VERT_OUT"},
+ {0xa2ab, "VGT_ESGS_RING_ITEMSIZE"},
+ {0xa290, "VGT_GS_MODE"},
+ {0xa291, "VGT_GS_ONCHIP_CNTL"},
+ {0xa2d7, "VGT_GS_VERT_ITEMSIZE"},
+ {0xa2d8, "VGT_GS_VERT_ITEMSIZE_1"},
+ {0xa2d9, "VGT_GS_VERT_ITEMSIZE_2"},
+ {0xa2da, "VGT_GS_VERT_ITEMSIZE_3"},
+ {0xa298, "VGT_GSVS_RING_OFFSET_1"},
+ {0xa299, "VGT_GSVS_RING_OFFSET_2"},
+ {0xa29a, "VGT_GSVS_RING_OFFSET_3"},
+
+ {0xa2e4, "VGT_GS_INSTANCE_CNT"},
+ {0xa297, "VGT_GS_PER_VS"},
+ {0xa29b, "VGT_GS_OUT_PRIM_TYPE"},
+ {0xa2ac, "VGT_GSVS_RING_ITEMSIZE"},
+
+ {0xa2ad, "VGT_REUSE_OFF"},
+ {0xa1b8, "SPI_BARYC_CNTL"},
+
+ {0x2c4c, "SPI_SHADER_USER_DATA_VS_0"},
+ {0x2c4d, "SPI_SHADER_USER_DATA_VS_1"},
+ {0x2c4e, "SPI_SHADER_USER_DATA_VS_2"},
+ {0x2c4f, "SPI_SHADER_USER_DATA_VS_3"},
+ {0x2c50, "SPI_SHADER_USER_DATA_VS_4"},
+ {0x2c51, "SPI_SHADER_USER_DATA_VS_5"},
+ {0x2c52, "SPI_SHADER_USER_DATA_VS_6"},
+ {0x2c53, "SPI_SHADER_USER_DATA_VS_7"},
+ {0x2c54, "SPI_SHADER_USER_DATA_VS_8"},
+ {0x2c55, "SPI_SHADER_USER_DATA_VS_9"},
+ {0x2c56, "SPI_SHADER_USER_DATA_VS_10"},
+ {0x2c57, "SPI_SHADER_USER_DATA_VS_11"},
+ {0x2c58, "SPI_SHADER_USER_DATA_VS_12"},
+ {0x2c59, "SPI_SHADER_USER_DATA_VS_13"},
+ {0x2c5a, "SPI_SHADER_USER_DATA_VS_14"},
+ {0x2c5b, "SPI_SHADER_USER_DATA_VS_15"},
+ {0x2c5c, "SPI_SHADER_USER_DATA_VS_16"},
+ {0x2c5d, "SPI_SHADER_USER_DATA_VS_17"},
+ {0x2c5e, "SPI_SHADER_USER_DATA_VS_18"},
+ {0x2c5f, "SPI_SHADER_USER_DATA_VS_19"},
+ {0x2c60, "SPI_SHADER_USER_DATA_VS_20"},
+ {0x2c61, "SPI_SHADER_USER_DATA_VS_21"},
+ {0x2c62, "SPI_SHADER_USER_DATA_VS_22"},
+ {0x2c63, "SPI_SHADER_USER_DATA_VS_23"},
+ {0x2c64, "SPI_SHADER_USER_DATA_VS_24"},
+ {0x2c65, "SPI_SHADER_USER_DATA_VS_25"},
+ {0x2c66, "SPI_SHADER_USER_DATA_VS_26"},
+ {0x2c67, "SPI_SHADER_USER_DATA_VS_27"},
+ {0x2c68, "SPI_SHADER_USER_DATA_VS_28"},
+ {0x2c69, "SPI_SHADER_USER_DATA_VS_29"},
+ {0x2c6a, "SPI_SHADER_USER_DATA_VS_30"},
+ {0x2c6b, "SPI_SHADER_USER_DATA_VS_31"},
+
+ {0x2ccc, "SPI_SHADER_USER_DATA_ES_0"},
+ {0x2ccd, "SPI_SHADER_USER_DATA_ES_1"},
+ {0x2cce, "SPI_SHADER_USER_DATA_ES_2"},
+ {0x2ccf, "SPI_SHADER_USER_DATA_ES_3"},
+ {0x2cd0, "SPI_SHADER_USER_DATA_ES_4"},
+ {0x2cd1, "SPI_SHADER_USER_DATA_ES_5"},
+ {0x2cd2, "SPI_SHADER_USER_DATA_ES_6"},
+ {0x2cd3, "SPI_SHADER_USER_DATA_ES_7"},
+ {0x2cd4, "SPI_SHADER_USER_DATA_ES_8"},
+ {0x2cd5, "SPI_SHADER_USER_DATA_ES_9"},
+ {0x2cd6, "SPI_SHADER_USER_DATA_ES_10"},
+ {0x2cd7, "SPI_SHADER_USER_DATA_ES_11"},
+ {0x2cd8, "SPI_SHADER_USER_DATA_ES_12"},
+ {0x2cd9, "SPI_SHADER_USER_DATA_ES_13"},
+ {0x2cda, "SPI_SHADER_USER_DATA_ES_14"},
+ {0x2cdb, "SPI_SHADER_USER_DATA_ES_15"},
+ {0x2cdc, "SPI_SHADER_USER_DATA_ES_16"},
+ {0x2cdd, "SPI_SHADER_USER_DATA_ES_17"},
+ {0x2cde, "SPI_SHADER_USER_DATA_ES_18"},
+ {0x2cdf, "SPI_SHADER_USER_DATA_ES_19"},
+ {0x2ce0, "SPI_SHADER_USER_DATA_ES_20"},
+ {0x2ce1, "SPI_SHADER_USER_DATA_ES_21"},
+ {0x2ce2, "SPI_SHADER_USER_DATA_ES_22"},
+ {0x2ce3, "SPI_SHADER_USER_DATA_ES_23"},
+ {0x2ce4, "SPI_SHADER_USER_DATA_ES_24"},
+ {0x2ce5, "SPI_SHADER_USER_DATA_ES_25"},
+ {0x2ce6, "SPI_SHADER_USER_DATA_ES_26"},
+ {0x2ce7, "SPI_SHADER_USER_DATA_ES_27"},
+ {0x2ce8, "SPI_SHADER_USER_DATA_ES_28"},
+ {0x2ce9, "SPI_SHADER_USER_DATA_ES_29"},
+ {0x2cea, "SPI_SHADER_USER_DATA_ES_30"},
+ {0x2ceb, "SPI_SHADER_USER_DATA_ES_31"},
+
+ {0x2c0c, "SPI_SHADER_USER_DATA_PS_0"},
+ {0x2c0d, "SPI_SHADER_USER_DATA_PS_1"},
+ {0x2c0e, "SPI_SHADER_USER_DATA_PS_2"},
+ {0x2c0f, "SPI_SHADER_USER_DATA_PS_3"},
+ {0x2c10, "SPI_SHADER_USER_DATA_PS_4"},
+ {0x2c11, "SPI_SHADER_USER_DATA_PS_5"},
+ {0x2c12, "SPI_SHADER_USER_DATA_PS_6"},
+ {0x2c13, "SPI_SHADER_USER_DATA_PS_7"},
+ {0x2c14, "SPI_SHADER_USER_DATA_PS_8"},
+ {0x2c15, "SPI_SHADER_USER_DATA_PS_9"},
+ {0x2c16, "SPI_SHADER_USER_DATA_PS_10"},
+ {0x2c17, "SPI_SHADER_USER_DATA_PS_11"},
+ {0x2c18, "SPI_SHADER_USER_DATA_PS_12"},
+ {0x2c19, "SPI_SHADER_USER_DATA_PS_13"},
+ {0x2c1a, "SPI_SHADER_USER_DATA_PS_14"},
+ {0x2c1b, "SPI_SHADER_USER_DATA_PS_15"},
+ {0x2c1c, "SPI_SHADER_USER_DATA_PS_16"},
+ {0x2c1d, "SPI_SHADER_USER_DATA_PS_17"},
+ {0x2c1e, "SPI_SHADER_USER_DATA_PS_18"},
+ {0x2c1f, "SPI_SHADER_USER_DATA_PS_19"},
+ {0x2c20, "SPI_SHADER_USER_DATA_PS_20"},
+ {0x2c21, "SPI_SHADER_USER_DATA_PS_21"},
+ {0x2c22, "SPI_SHADER_USER_DATA_PS_22"},
+ {0x2c23, "SPI_SHADER_USER_DATA_PS_23"},
+ {0x2c24, "SPI_SHADER_USER_DATA_PS_24"},
+ {0x2c25, "SPI_SHADER_USER_DATA_PS_25"},
+ {0x2c26, "SPI_SHADER_USER_DATA_PS_26"},
+ {0x2c27, "SPI_SHADER_USER_DATA_PS_27"},
+ {0x2c28, "SPI_SHADER_USER_DATA_PS_28"},
+ {0x2c29, "SPI_SHADER_USER_DATA_PS_29"},
+ {0x2c2a, "SPI_SHADER_USER_DATA_PS_30"},
+ {0x2c2b, "SPI_SHADER_USER_DATA_PS_31"},
+
+ {0x2e40, "COMPUTE_USER_DATA_0"},
+ {0x2e41, "COMPUTE_USER_DATA_1"},
+ {0x2e42, "COMPUTE_USER_DATA_2"},
+ {0x2e43, "COMPUTE_USER_DATA_3"},
+ {0x2e44, "COMPUTE_USER_DATA_4"},
+ {0x2e45, "COMPUTE_USER_DATA_5"},
+ {0x2e46, "COMPUTE_USER_DATA_6"},
+ {0x2e47, "COMPUTE_USER_DATA_7"},
+ {0x2e48, "COMPUTE_USER_DATA_8"},
+ {0x2e49, "COMPUTE_USER_DATA_9"},
+ {0x2e4a, "COMPUTE_USER_DATA_10"},
+ {0x2e4b, "COMPUTE_USER_DATA_11"},
+ {0x2e4c, "COMPUTE_USER_DATA_12"},
+ {0x2e4d, "COMPUTE_USER_DATA_13"},
+ {0x2e4e, "COMPUTE_USER_DATA_14"},
+ {0x2e4f, "COMPUTE_USER_DATA_15"},
+
+ {0x2e07, "COMPUTE_NUM_THREAD_X"},
+ {0x2e08, "COMPUTE_NUM_THREAD_Y"},
+ {0x2e09, "COMPUTE_NUM_THREAD_Z"},
+ {0xa2db, "VGT_TF_PARAM"},
+ {0xa2d6, "VGT_LS_HS_CONFIG"},
+ {0xa287, "VGT_HOS_MIN_TESS_LEVEL"},
+ {0xa286, "VGT_HOS_MAX_TESS_LEVEL"},
+ {0xa2f8, "PA_SC_AA_CONFIG"},
+ {0xa310, "PA_SC_SHADER_CONTROL"},
+ {0xa313, "PA_SC_CONSERVATIVE_RASTERIZATION_CNTL"},
+
+ {0x2d0c, "SPI_SHADER_USER_DATA_LS_0"},
+ {0x2d0d, "SPI_SHADER_USER_DATA_LS_1"},
+ {0x2d0e, "SPI_SHADER_USER_DATA_LS_2"},
+ {0x2d0f, "SPI_SHADER_USER_DATA_LS_3"},
+ {0x2d10, "SPI_SHADER_USER_DATA_LS_4"},
+ {0x2d11, "SPI_SHADER_USER_DATA_LS_5"},
+ {0x2d12, "SPI_SHADER_USER_DATA_LS_6"},
+ {0x2d13, "SPI_SHADER_USER_DATA_LS_7"},
+ {0x2d14, "SPI_SHADER_USER_DATA_LS_8"},
+ {0x2d15, "SPI_SHADER_USER_DATA_LS_9"},
+ {0x2d16, "SPI_SHADER_USER_DATA_LS_10"},
+ {0x2d17, "SPI_SHADER_USER_DATA_LS_11"},
+ {0x2d18, "SPI_SHADER_USER_DATA_LS_12"},
+ {0x2d19, "SPI_SHADER_USER_DATA_LS_13"},
+ {0x2d1a, "SPI_SHADER_USER_DATA_LS_14"},
+ {0x2d1b, "SPI_SHADER_USER_DATA_LS_15"},
+ {0x2d1c, "SPI_SHADER_USER_DATA_LS_16"},
+ {0x2d1d, "SPI_SHADER_USER_DATA_LS_17"},
+ {0x2d1e, "SPI_SHADER_USER_DATA_LS_18"},
+ {0x2d1f, "SPI_SHADER_USER_DATA_LS_19"},
+ {0x2d20, "SPI_SHADER_USER_DATA_LS_20"},
+ {0x2d21, "SPI_SHADER_USER_DATA_LS_21"},
+ {0x2d22, "SPI_SHADER_USER_DATA_LS_22"},
+ {0x2d23, "SPI_SHADER_USER_DATA_LS_23"},
+ {0x2d24, "SPI_SHADER_USER_DATA_LS_24"},
+ {0x2d25, "SPI_SHADER_USER_DATA_LS_25"},
+ {0x2d26, "SPI_SHADER_USER_DATA_LS_26"},
+ {0x2d27, "SPI_SHADER_USER_DATA_LS_27"},
+ {0x2d28, "SPI_SHADER_USER_DATA_LS_28"},
+ {0x2d29, "SPI_SHADER_USER_DATA_LS_29"},
+ {0x2d2a, "SPI_SHADER_USER_DATA_LS_30"},
+ {0x2d2b, "SPI_SHADER_USER_DATA_LS_31"},
+
+ {0xa2aa, "IA_MULTI_VGT_PARAM"},
+ {0xa2a5, "VGT_GS_MAX_PRIMS_PER_SUBGROUP"},
+ {0xa2e6, "VGT_STRMOUT_BUFFER_CONFIG"},
+ {0xa2e5, "VGT_STRMOUT_CONFIG"},
+ {0xa2b5, "VGT_STRMOUT_VTX_STRIDE_0"},
+ {0xa2b9, "VGT_STRMOUT_VTX_STRIDE_1"},
+ {0xa2bd, "VGT_STRMOUT_VTX_STRIDE_2"},
+ {0xa2c1, "VGT_STRMOUT_VTX_STRIDE_3"},
+ {0xa316, "VGT_VERTEX_REUSE_BLOCK_CNTL"},
+
+ {0, nullptr}};
+ auto Entry = RegInfoTable;
+ for (; Entry->Num && Entry->Num != RegNum; ++Entry)
+ ;
+ return Entry->Name;
+}
+
+// Convert the accumulated PAL metadata into an asm directive.
+void AMDGPUPALMetadata::toString(std::string &String) {
+ String.clear();
+ if (!BlobType)
+ return;
+ raw_string_ostream Stream(String);
+ if (isLegacy()) {
+ if (MsgPackDoc.getRoot().getKind() == msgpack::Type::Nil)
+ return;
+ // Old linear reg=val format.
+ Stream << '\t' << AMDGPU::PALMD::AssemblerDirective << ' ';
+ auto Regs = getRegisters();
+ for (auto I = Regs.begin(), E = Regs.end(); I != E; ++I) {
+ if (I != Regs.begin())
+ Stream << ',';
+ unsigned Reg = I->first.getUInt();
+ unsigned Val = I->second.getUInt();
+ Stream << "0x" << Twine::utohexstr(Reg) << ",0x" << Twine::utohexstr(Val);
+ }
+ Stream << '\n';
+ return;
+ }
+
+ // New msgpack-based format -- output as YAML (with unsigned numbers in hex),
+ // but first change the registers map to use names.
+ MsgPackDoc.setHexMode();
+ auto &RegsObj = refRegisters();
+ auto OrigRegs = RegsObj.getMap();
+ RegsObj = MsgPackDoc.getMapNode();
+ for (auto I : OrigRegs) {
+ auto Key = I.first;
+ if (const char *RegName = getRegisterName(Key.getUInt())) {
+ std::string KeyName = Key.toString();
+ KeyName += " (";
+ KeyName += RegName;
+ KeyName += ')';
+ Key = MsgPackDoc.getNode(KeyName, /*Copy=*/true);
+ }
+ RegsObj.getMap()[Key] = I.second;
+ }
+
+ // Output as YAML.
+ Stream << '\t' << AMDGPU::PALMD::AssemblerDirectiveBegin << '\n';
+ MsgPackDoc.toYAML(Stream);
+ Stream << '\t' << AMDGPU::PALMD::AssemblerDirectiveEnd << '\n';
+
+ // Restore original registers map.
+ RegsObj = OrigRegs;
+}
+
+// Convert the accumulated PAL metadata into a binary blob for writing as
+// a .note record of the specified AMD type. Returns an empty blob if
+// there is no PAL metadata.
+void AMDGPUPALMetadata::toBlob(unsigned Type, std::string &Blob) {
+ if (Type == ELF::NT_AMD_AMDGPU_PAL_METADATA)
+ toLegacyBlob(Blob);
+ else if (Type)
+ toMsgPackBlob(Blob);
+}
+
+void AMDGPUPALMetadata::toLegacyBlob(std::string &Blob) {
+ Blob.clear();
+ auto Registers = getRegisters();
+ if (Registers.getMap().empty())
+ return;
+ raw_string_ostream OS(Blob);
+ support::endian::Writer EW(OS, support::endianness::little);
+ for (auto I : Registers.getMap()) {
+ EW.write(uint32_t(I.first.getUInt()));
+ EW.write(uint32_t(I.second.getUInt()));
+ }
+}
+
+void AMDGPUPALMetadata::toMsgPackBlob(std::string &Blob) {
+ Blob.clear();
+ MsgPackDoc.writeToBlob(Blob);
+}
+
+// Set PAL metadata from YAML text. Returns false if failed.
+bool AMDGPUPALMetadata::setFromString(StringRef S) {
+ BlobType = ELF::NT_AMDGPU_METADATA;
+ if (!MsgPackDoc.fromYAML(S))
+ return false;
+
+ // In the registers map, some keys may be of the form "0xa191
+ // (SPI_PS_INPUT_CNTL_0)", in which case the YAML input code made it a
+ // string. We need to turn it into a number.
+ auto &RegsObj = refRegisters();
+ auto OrigRegs = RegsObj;
+ RegsObj = MsgPackDoc.getMapNode();
+ Registers = RegsObj.getMap();
+ bool Ok = true;
+ for (auto I : OrigRegs.getMap()) {
+ auto Key = I.first;
+ if (Key.getKind() == msgpack::Type::String) {
+ StringRef S = Key.getString();
+ uint64_t Val;
+ if (S.consumeInteger(0, Val)) {
+ Ok = false;
+ errs() << "Unrecognized PAL metadata register key '" << S << "'\n";
+ continue;
+ }
+ Key = MsgPackDoc.getNode(uint64_t(Val));
+ }
+ Registers.getMap()[Key] = I.second;
+ }
+ return Ok;
+}
+
+// Reference (create if necessary) the node for the registers map.
+msgpack::DocNode &AMDGPUPALMetadata::refRegisters() {
+ auto &N =
+ MsgPackDoc.getRoot()
+ .getMap(/*Convert=*/true)[MsgPackDoc.getNode("amdpal.pipelines")]
+ .getArray(/*Convert=*/true)[0]
+ .getMap(/*Convert=*/true)[MsgPackDoc.getNode(".registers")];
+ N.getMap(/*Convert=*/true);
+ return N;
+}
+
+// Get (create if necessary) the registers map.
+msgpack::MapDocNode AMDGPUPALMetadata::getRegisters() {
+ if (Registers.isEmpty())
+ Registers = refRegisters();
+ return Registers.getMap();
+}
+
+// Return the PAL metadata hardware shader stage name.
+static const char *getStageName(CallingConv::ID CC) {
+ switch (CC) {
+ case CallingConv::AMDGPU_PS:
+ return ".ps";
+ case CallingConv::AMDGPU_VS:
+ return ".vs";
+ case CallingConv::AMDGPU_GS:
+ return ".gs";
+ case CallingConv::AMDGPU_ES:
+ return ".es";
+ case CallingConv::AMDGPU_HS:
+ return ".hs";
+ case CallingConv::AMDGPU_LS:
+ return ".ls";
+ default:
+ return ".cs";
+ }
+}
+
+// Get (create if necessary) the .hardware_stages entry for the given calling
+// convention.
+msgpack::MapDocNode AMDGPUPALMetadata::getHwStage(unsigned CC) {
+ if (HwStages.isEmpty())
+ HwStages = MsgPackDoc.getRoot()
+ .getMap(/*Convert=*/true)["amdpal.pipelines"]
+ .getArray(/*Convert=*/true)[0]
+ .getMap(/*Convert=*/true)[".hardware_stages"]
+ .getMap(/*Convert=*/true);
+ return HwStages.getMap()[getStageName(CC)].getMap(/*Convert=*/true);
+}
+
+// Get .note record vendor name of metadata blob to be emitted.
+const char *AMDGPUPALMetadata::getVendor() const {
+ return isLegacy() ? ElfNote::NoteNameV2 : ElfNote::NoteNameV3;
+}
+
+// Get .note record type of metadata blob to be emitted:
+// ELF::NT_AMD_AMDGPU_PAL_METADATA (legacy key=val format), or
+// ELF::NT_AMDGPU_METADATA (MsgPack format), or
+// 0 (no PAL metadata).
+unsigned AMDGPUPALMetadata::getType() const {
+ return BlobType;
+}
+
+// Return whether the blob type is legacy PAL metadata.
+bool AMDGPUPALMetadata::isLegacy() const {
+ return BlobType == ELF::NT_AMD_AMDGPU_PAL_METADATA;
+}
+
+// Set legacy PAL metadata format.
+void AMDGPUPALMetadata::setLegacy() {
+ BlobType = ELF::NT_AMD_AMDGPU_PAL_METADATA;
+}
+
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h b/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
new file mode 100644
index 000000000000..0f17c157b206
--- /dev/null
+++ b/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
@@ -0,0 +1,135 @@
+//===-- AMDGPUPALMetadata.h - PAL metadata handling -------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// PAL metadata handling
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUPALMETADATA_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUPALMETADATA_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/BinaryFormat/MsgPackDocument.h"
+#include <map>
+
+namespace llvm {
+
+class AMDGPUTargetStreamer;
+class formatted_raw_ostream;
+class MCStreamer;
+class Module;
+
+class AMDGPUPALMetadata {
+ unsigned BlobType = 0;
+ msgpack::Document MsgPackDoc;
+ msgpack::DocNode Registers;
+ msgpack::DocNode HwStages;
+
+public:
+ // Read the amdgpu.pal.metadata supplied by the frontend, ready for
+ // per-function modification.
+ void readFromIR(Module &M);
+
+ // Set PAL metadata from a binary blob from the applicable .note record.
+ // Returns false if bad format. Blob must remain valid for the lifetime of
+ // the Metadata.
+ bool setFromBlob(unsigned Type, StringRef Blob);
+
+ // Set the rsrc1 register in the metadata for a particular shader stage.
+ // In fact this ORs the value into any previous setting of the register.
+ void setRsrc1(unsigned CC, unsigned Val);
+
+ // Set the rsrc2 register in the metadata for a particular shader stage.
+ // In fact this ORs the value into any previous setting of the register.
+ void setRsrc2(unsigned CC, unsigned Val);
+
+ // Set the SPI_PS_INPUT_ENA register in the metadata.
+ // In fact this ORs the value into any previous setting of the register.
+ void setSpiPsInputEna(unsigned Val);
+
+ // Set the SPI_PS_INPUT_ADDR register in the metadata.
+ // In fact this ORs the value into any previous setting of the register.
+ void setSpiPsInputAddr(unsigned Val);
+
+ // Get a register from the metadata, or 0 if not currently set.
+ unsigned getRegister(unsigned Reg);
+
+ // Set a register in the metadata.
+ // In fact this ORs the value into any previous setting of the register.
+ void setRegister(unsigned Reg, unsigned Val);
+
+ // Set the entry point name for one shader.
+ void setEntryPoint(unsigned CC, StringRef Name);
+
+ // Set the number of used vgprs in the metadata. This is an optional advisory
+ // record for logging etc; wave dispatch actually uses the rsrc1 register for
+ // the shader stage to determine the number of vgprs to allocate.
+ void setNumUsedVgprs(unsigned CC, unsigned Val);
+
+ // Set the number of used sgprs in the metadata. This is an optional advisory
+ // record for logging etc; wave dispatch actually uses the rsrc1 register for
+ // the shader stage to determine the number of sgprs to allocate.
+ void setNumUsedSgprs(unsigned CC, unsigned Val);
+
+ // Set the scratch size in the metadata.
+ void setScratchSize(unsigned CC, unsigned Val);
+
+ // Set the hardware register bit in PAL metadata to enable wave32 on the
+ // shader of the given calling convention.
+ void setWave32(unsigned CC);
+
+ // Emit the accumulated PAL metadata as asm directives.
+ // This is called from AMDGPUTargetAsmStreamer::Finish().
+ void toString(std::string &S);
+
+ // Set PAL metadata from YAML text.
+ bool setFromString(StringRef S);
+
+ // Get .note record vendor name of metadata blob to be emitted.
+ const char *getVendor() const;
+
+ // Get .note record type of metadata blob to be emitted:
+ // ELF::NT_AMD_AMDGPU_PAL_METADATA (legacy key=val format), or
+ // ELF::NT_AMDGPU_METADATA (MsgPack format), or
+ // 0 (no PAL metadata).
+ unsigned getType() const;
+
+ // Emit the accumulated PAL metadata as a binary blob.
+ // This is called from AMDGPUTargetELFStreamer::Finish().
+ void toBlob(unsigned Type, std::string &S);
+
+ // Get the msgpack::Document for the PAL metadata.
+ msgpack::Document *getMsgPackDoc() { return &MsgPackDoc; }
+
+ // Set legacy PAL metadata format.
+ void setLegacy();
+
+private:
+ // Return whether the blob type is legacy PAL metadata.
+ bool isLegacy() const;
+
+ // Reference (create if necessary) the node for the registers map.
+ msgpack::DocNode &refRegisters();
+
+ // Get (create if necessary) the registers map.
+ msgpack::MapDocNode getRegisters();
+
+ // Get (create if necessary) the .hardware_stages entry for the given calling
+ // convention.
+ msgpack::MapDocNode getHwStage(unsigned CC);
+
+ bool setFromLegacyBlob(StringRef Blob);
+ bool setFromMsgPackBlob(StringRef Blob);
+ void toLegacyBlob(std::string &Blob);
+ void toMsgPackBlob(std::string &Blob);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUPALMETADATA_H
diff --git a/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h b/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h
index 82ffdef8e674..95ad3f35d18f 100644
--- a/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h
+++ b/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h
@@ -1,9 +1,8 @@
//===--------------------- AMDKernelCodeTInfo.h ---------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -83,6 +82,9 @@ COMPPGM1(priv, compute_pgm_rsrc1_priv, PRIV
COMPPGM1(enable_dx10_clamp, compute_pgm_rsrc1_dx10_clamp, DX10_CLAMP),
COMPPGM1(debug_mode, compute_pgm_rsrc1_debug_mode, DEBUG_MODE),
COMPPGM1(enable_ieee_mode, compute_pgm_rsrc1_ieee_mode, IEEE_MODE),
+COMPPGM1(enable_wgp_mode, compute_pgm_rsrc1_wgp_mode, WGP_MODE),
+COMPPGM1(enable_mem_ordered, compute_pgm_rsrc1_mem_ordered, MEM_ORDERED),
+COMPPGM1(enable_fwd_progress, compute_pgm_rsrc1_fwd_progress, FWD_PROGRESS),
// TODO: bulky
// TODO: cdbg_user
COMPPGM2(enable_sgpr_private_segment_wave_byte_offset, compute_pgm_rsrc2_scratch_en, SCRATCH_EN),
@@ -107,6 +109,7 @@ CODEPROP(enable_sgpr_private_segment_size, ENABLE_SGPR_PRIVATE_SEGMENT_SIZE),
CODEPROP(enable_sgpr_grid_workgroup_count_x, ENABLE_SGPR_GRID_WORKGROUP_COUNT_X),
CODEPROP(enable_sgpr_grid_workgroup_count_y, ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y),
CODEPROP(enable_sgpr_grid_workgroup_count_z, ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z),
+CODEPROP(enable_wavefront_size32, ENABLE_WAVEFRONT_SIZE32),
CODEPROP(enable_ordered_append_gds, ENABLE_ORDERED_APPEND_GDS),
CODEPROP(private_element_size, PRIVATE_ELEMENT_SIZE),
CODEPROP(is_ptr64, IS_PTR64),
diff --git a/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp b/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp
index 20059f4a1ed7..443e2cc45ac0 100644
--- a/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp
+++ b/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp
@@ -1,9 +1,8 @@
//===- AMDKernelCodeTUtils.cpp --------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h b/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h
index ef9f9bdb6bcb..a87325a78df3 100644
--- a/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h
+++ b/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h
@@ -1,9 +1,8 @@
//===- AMDGPUKernelCodeTUtils.h - helpers for amd_kernel_code_t -*- C++ -*-===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/VIInstrFormats.td b/lib/Target/AMDGPU/VIInstrFormats.td
index 1fd1c1e21527..bd65a495fa72 100644
--- a/lib/Target/AMDGPU/VIInstrFormats.td
+++ b/lib/Target/AMDGPU/VIInstrFormats.td
@@ -1,9 +1,8 @@
//===-- VIInstrFormats.td - VI Instruction Encodings ----------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/VIInstructions.td b/lib/Target/AMDGPU/VIInstructions.td
index b45c8fc9c7d5..ec7d8875a746 100644
--- a/lib/Target/AMDGPU/VIInstructions.td
+++ b/lib/Target/AMDGPU/VIInstructions.td
@@ -1,9 +1,8 @@
//===-- VIInstructions.td - VI Instruction Definitions --------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// Instruction definitions for VI and newer.
diff --git a/lib/Target/AMDGPU/VOP1Instructions.td b/lib/Target/AMDGPU/VOP1Instructions.td
index 68446ab79720..6bc416ed7d4b 100644
--- a/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/lib/Target/AMDGPU/VOP1Instructions.td
@@ -1,9 +1,8 @@
//===-- VOP1Instructions.td - Vector Instruction Definitions --------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -15,7 +14,7 @@ class VOP1e <bits<8> op, VOPProfile P> : Enc32 {
bits<8> vdst;
bits<9> src0;
- let Inst{8-0} = !if(P.HasSrc0, src0{8-0}, 0);
+ let Inst{8-0} = !if(P.HasSrc0, src0{8-0}, ?);
let Inst{16-9} = op;
let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0);
let Inst{31-25} = 0x3f; //encoding
@@ -48,7 +47,6 @@ class VOP1_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], bit VOP1On
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
- let SubtargetPredicate = isGCN;
let VOP1 = 1;
let VALU = 1;
@@ -144,7 +142,7 @@ defm V_MOV_B32 : VOP1Inst <"v_mov_b32", VOP_I32_I32>;
// TODO: Make profile for this, there is VOP3 encoding also
def V_READFIRSTLANE_B32 :
InstSI <(outs SReg_32:$vdst),
- (ins VGPR_32:$src0),
+ (ins VRegOrLds_32:$src0),
"v_readfirstlane_b32 $vdst, $src0",
[(set i32:$vdst, (int_amdgcn_readfirstlane i32:$src0))]>,
Enc32 {
@@ -156,7 +154,6 @@ def V_READFIRSTLANE_B32 :
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
- let SubtargetPredicate = isGCN;
let VOP1 = 1;
let VALU = 1;
@@ -172,9 +169,16 @@ def V_READFIRSTLANE_B32 :
let Inst{31-25} = 0x3f; //encoding
}
-let SchedRW = [WriteQuarterRate32] in {
-defm V_CVT_I32_F64 : VOP1Inst <"v_cvt_i32_f64", VOP_I32_F64, fp_to_sint>;
+let SchedRW = [WriteDoubleCvt] in {
+defm V_CVT_I32_F64 : VOP1Inst <"v_cvt_i32_f64", VOP_I32_F64, fp_to_sint>;
defm V_CVT_F64_I32 : VOP1Inst <"v_cvt_f64_i32", VOP1_F64_I32, sint_to_fp>;
+defm V_CVT_F32_F64 : VOP1Inst <"v_cvt_f32_f64", VOP_F32_F64, fpround>;
+defm V_CVT_F64_F32 : VOP1Inst <"v_cvt_f64_f32", VOP_F64_F32, fpextend>;
+defm V_CVT_U32_F64 : VOP1Inst <"v_cvt_u32_f64", VOP_I32_F64, fp_to_uint>;
+defm V_CVT_F64_U32 : VOP1Inst <"v_cvt_f64_u32", VOP1_F64_I32, uint_to_fp>;
+} // End SchedRW = [WriteDoubleCvt]
+
+let SchedRW = [WriteQuarterRate32] in {
defm V_CVT_F32_I32 : VOP1Inst <"v_cvt_f32_i32", VOP1_F32_I32, sint_to_fp>;
defm V_CVT_F32_U32 : VOP1Inst <"v_cvt_f32_u32", VOP1_F32_I32, uint_to_fp>;
defm V_CVT_U32_F32 : VOP1Inst <"v_cvt_u32_f32", VOP_I32_F32, fp_to_uint>;
@@ -186,15 +190,12 @@ defm V_CVT_F32_F16 : VOP1Inst <"v_cvt_f32_f16", VOP_F32_F16, fpextend>;
defm V_CVT_RPI_I32_F32 : VOP1Inst <"v_cvt_rpi_i32_f32", VOP_I32_F32, cvt_rpi_i32_f32>;
defm V_CVT_FLR_I32_F32 : VOP1Inst <"v_cvt_flr_i32_f32", VOP_I32_F32, cvt_flr_i32_f32>;
defm V_CVT_OFF_F32_I4 : VOP1Inst <"v_cvt_off_f32_i4", VOP1_F32_I32>;
-defm V_CVT_F32_F64 : VOP1Inst <"v_cvt_f32_f64", VOP_F32_F64, fpround>;
-defm V_CVT_F64_F32 : VOP1Inst <"v_cvt_f64_f32", VOP_F64_F32, fpextend>;
+} // End SchedRW = [WriteQuarterRate32]
+
defm V_CVT_F32_UBYTE0 : VOP1Inst <"v_cvt_f32_ubyte0", VOP1_F32_I32, AMDGPUcvt_f32_ubyte0>;
defm V_CVT_F32_UBYTE1 : VOP1Inst <"v_cvt_f32_ubyte1", VOP1_F32_I32, AMDGPUcvt_f32_ubyte1>;
defm V_CVT_F32_UBYTE2 : VOP1Inst <"v_cvt_f32_ubyte2", VOP1_F32_I32, AMDGPUcvt_f32_ubyte2>;
defm V_CVT_F32_UBYTE3 : VOP1Inst <"v_cvt_f32_ubyte3", VOP1_F32_I32, AMDGPUcvt_f32_ubyte3>;
-defm V_CVT_U32_F64 : VOP1Inst <"v_cvt_u32_f64", VOP_I32_F64, fp_to_uint>;
-defm V_CVT_F64_U32 : VOP1Inst <"v_cvt_f64_u32", VOP1_F64_I32, uint_to_fp>;
-} // End SchedRW = [WriteQuarterRate32]
defm V_FRACT_F32 : VOP1Inst <"v_fract_f32", VOP_F32_F32, AMDGPUfract>;
defm V_TRUNC_F32 : VOP1Inst <"v_trunc_f32", VOP_F32_F32, ftrunc>;
@@ -271,6 +272,7 @@ def VOP_MOVRELD : VOPProfile<[untyped, i32, untyped, untyped]> {
let InsDPP = (ins DstRC:$vdst, DstRC:$old, Src0RC32:$src0,
dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
+ let InsDPP16 = !con(InsDPP, (ins FI:$fi));
let InsSDWA = (ins Src0RC32:$vdst, Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0,
clampmod:$clamp, omod:$omod, dst_sel:$dst_sel, dst_unused:$dst_unused,
@@ -279,6 +281,7 @@ def VOP_MOVRELD : VOPProfile<[untyped, i32, untyped, untyped]> {
let Asm32 = getAsm32<1, 1>.ret;
let Asm64 = getAsm64<1, 1, 0, 0, 1>.ret;
let AsmDPP = getAsmDPP<1, 1, 0>.ret;
+ let AsmDPP16 = getAsmDPP16<1, 1, 0>.ret;
let AsmSDWA = getAsmSDWA<1, 1>.ret;
let AsmSDWA9 = getAsmSDWA9<1, 0, 1>.ret;
@@ -305,41 +308,43 @@ defm V_MOVRELSD_B32 : VOP1Inst <"v_movrelsd_b32", VOP_NO_EXT<VOP_I32_I32>>;
defm V_MOV_FED_B32 : VOP1Inst <"v_mov_fed_b32", VOP_I32_I32>;
-// These instruction only exist on SI and CI
-let SubtargetPredicate = isSICI in {
-
-let SchedRW = [WriteQuarterRate32] in {
-defm V_LOG_CLAMP_F32 : VOP1Inst <"v_log_clamp_f32", VOP_F32_F32, int_amdgcn_log_clamp>;
-defm V_RCP_CLAMP_F32 : VOP1Inst <"v_rcp_clamp_f32", VOP_F32_F32>;
-defm V_RCP_LEGACY_F32 : VOP1Inst <"v_rcp_legacy_f32", VOP_F32_F32, AMDGPUrcp_legacy>;
-defm V_RSQ_CLAMP_F32 : VOP1Inst <"v_rsq_clamp_f32", VOP_F32_F32, AMDGPUrsq_clamp>;
-defm V_RSQ_LEGACY_F32 : VOP1Inst <"v_rsq_legacy_f32", VOP_F32_F32, AMDGPUrsq_legacy>;
-} // End SchedRW = [WriteQuarterRate32]
-
-let SchedRW = [WriteDouble] in {
-defm V_RCP_CLAMP_F64 : VOP1Inst <"v_rcp_clamp_f64", VOP_F64_F64>;
-defm V_RSQ_CLAMP_F64 : VOP1Inst <"v_rsq_clamp_f64", VOP_F64_F64, AMDGPUrsq_clamp>;
-} // End SchedRW = [WriteDouble]
-
-} // End SubtargetPredicate = isSICI
-
-
-let SubtargetPredicate = isCIVI in {
-
-let SchedRW = [WriteDoubleAdd] in {
-defm V_TRUNC_F64 : VOP1Inst <"v_trunc_f64", VOP_F64_F64, ftrunc>;
-defm V_CEIL_F64 : VOP1Inst <"v_ceil_f64", VOP_F64_F64, fceil>;
-defm V_FLOOR_F64 : VOP1Inst <"v_floor_f64", VOP_F64_F64, ffloor>;
-defm V_RNDNE_F64 : VOP1Inst <"v_rndne_f64", VOP_F64_F64, frint>;
-} // End SchedRW = [WriteDoubleAdd]
-
-let SchedRW = [WriteQuarterRate32] in {
-defm V_LOG_LEGACY_F32 : VOP1Inst <"v_log_legacy_f32", VOP_F32_F32>;
-defm V_EXP_LEGACY_F32 : VOP1Inst <"v_exp_legacy_f32", VOP_F32_F32>;
-} // End SchedRW = [WriteQuarterRate32]
-
-} // End SubtargetPredicate = isCIVI
-
+let SubtargetPredicate = isGFX6GFX7 in {
+ let SchedRW = [WriteQuarterRate32] in {
+ defm V_LOG_CLAMP_F32 :
+ VOP1Inst<"v_log_clamp_f32", VOP_F32_F32, int_amdgcn_log_clamp>;
+ defm V_RCP_CLAMP_F32 :
+ VOP1Inst<"v_rcp_clamp_f32", VOP_F32_F32>;
+ defm V_RCP_LEGACY_F32 :
+ VOP1Inst<"v_rcp_legacy_f32", VOP_F32_F32, AMDGPUrcp_legacy>;
+ defm V_RSQ_CLAMP_F32 :
+ VOP1Inst<"v_rsq_clamp_f32", VOP_F32_F32, AMDGPUrsq_clamp>;
+ defm V_RSQ_LEGACY_F32 :
+ VOP1Inst<"v_rsq_legacy_f32", VOP_F32_F32, AMDGPUrsq_legacy>;
+ } // End SchedRW = [WriteQuarterRate32]
+
+ let SchedRW = [WriteDouble] in {
+ defm V_RCP_CLAMP_F64 :
+ VOP1Inst<"v_rcp_clamp_f64", VOP_F64_F64>;
+ defm V_RSQ_CLAMP_F64 :
+ VOP1Inst<"v_rsq_clamp_f64", VOP_F64_F64, AMDGPUrsq_clamp>;
+ } // End SchedRW = [WriteDouble]
+} // End SubtargetPredicate = isGFX6GFX7
+
+let SubtargetPredicate = isGFX7GFX8GFX9 in {
+ let SchedRW = [WriteQuarterRate32] in {
+ defm V_LOG_LEGACY_F32 : VOP1Inst<"v_log_legacy_f32", VOP_F32_F32>;
+ defm V_EXP_LEGACY_F32 : VOP1Inst<"v_exp_legacy_f32", VOP_F32_F32>;
+ } // End SchedRW = [WriteQuarterRate32]
+} // End SubtargetPredicate = isGFX7GFX8GFX9
+
+let SubtargetPredicate = isGFX7Plus in {
+ let SchedRW = [WriteDoubleAdd] in {
+ defm V_TRUNC_F64 : VOP1Inst<"v_trunc_f64", VOP_F64_F64, ftrunc>;
+ defm V_CEIL_F64 : VOP1Inst<"v_ceil_f64", VOP_F64_F64, fceil>;
+ defm V_RNDNE_F64 : VOP1Inst<"v_rndne_f64", VOP_F64_F64, frint>;
+ defm V_FLOOR_F64 : VOP1Inst<"v_floor_f64", VOP_F64_F64, ffloor>;
+ } // End SchedRW = [WriteDoubleAdd]
+} // End SubtargetPredicate = isGFX7Plus
let SubtargetPredicate = Has16BitInsts in {
@@ -393,125 +398,279 @@ def VOP_SWAP_I32 : VOPProfile<[i32, i32, i32, untyped]> {
let Ins64 = (ins);
}
-let SubtargetPredicate = isGFX9 in {
- let Constraints = "$vdst = $src1, $vdst1 = $src0",
- DisableEncoding="$vdst1,$src1",
- SchedRW = [Write64Bit, Write64Bit] in {
-// Never VOP3. Takes as long as 2 v_mov_b32s
-def V_SWAP_B32 : VOP1_Pseudo <"v_swap_b32", VOP_SWAP_I32, [], 1>;
+let SubtargetPredicate = isGFX9Plus in {
+ def V_SWAP_B32 : VOP1_Pseudo<"v_swap_b32", VOP_SWAP_I32, [], 1> {
+ let Constraints = "$vdst = $src1, $vdst1 = $src0";
+ let DisableEncoding = "$vdst1,$src1";
+ let SchedRW = [Write64Bit, Write64Bit];
+ }
+
+ defm V_SAT_PK_U8_I16 : VOP1Inst<"v_sat_pk_u8_i16", VOP_I32_I32>;
+ defm V_CVT_NORM_I16_F16 : VOP1Inst<"v_cvt_norm_i16_f16", VOP_I16_F16>;
+ defm V_CVT_NORM_U16_F16 : VOP1Inst<"v_cvt_norm_u16_f16", VOP_I16_F16>;
+} // End SubtargetPredicate = isGFX9Plus
+
+let SubtargetPredicate = isGFX9Only in {
+ defm V_SCREEN_PARTITION_4SE_B32 : VOP1Inst <"v_screen_partition_4se_b32", VOP_I32_I32>;
+} // End SubtargetPredicate = isGFX9Only
+
+let SubtargetPredicate = isGFX10Plus in {
+ defm V_PIPEFLUSH : VOP1Inst<"v_pipeflush", VOP_NONE>;
+
+ let Uses = [M0] in {
+ // FIXME-GFX10: Should V_MOVRELSD_2_B32 be VOP_NO_EXT?
+ defm V_MOVRELSD_2_B32 :
+ VOP1Inst<"v_movrelsd_2_b32", VOP_NO_EXT<VOP_I32_I32>>;
+
+ def V_SWAPREL_B32 : VOP1_Pseudo<"v_swaprel_b32", VOP_SWAP_I32, [], 1> {
+ let Constraints = "$vdst = $src1, $vdst1 = $src0";
+ let DisableEncoding = "$vdst1,$src1";
+ let SchedRW = [Write64Bit, Write64Bit];
+ }
+ } // End Uses = [M0]
+} // End SubtargetPredicate = isGFX10Plus
+
+//===----------------------------------------------------------------------===//
+// Target-specific instruction encodings.
+//===----------------------------------------------------------------------===//
+
+class VOP1_DPP<bits<8> op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl, bit isDPP16 = 0> :
+ VOP_DPP<ps.OpName, p, isDPP16> {
+ let hasSideEffects = ps.hasSideEffects;
+ let Defs = ps.Defs;
+ let SchedRW = ps.SchedRW;
+ let Uses = ps.Uses;
+
+ bits<8> vdst;
+ let Inst{8-0} = 0xfa;
+ let Inst{16-9} = op;
+ let Inst{24-17} = !if(p.EmitDst, vdst{7-0}, 0);
+ let Inst{31-25} = 0x3f;
}
-defm V_SCREEN_PARTITION_4SE_B32 : VOP1Inst <"v_screen_partition_4se_b32", VOP_I32_I32>;
+class VOP1_DPP16<bits<8> op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl> :
+ VOP1_DPP<op, ps, p, 1> {
+ let AssemblerPredicate = !if(p.HasExt, HasDPP16, DisableInst);
+ let SubtargetPredicate = HasDPP16;
+}
-defm V_SAT_PK_U8_I16 : VOP1Inst<"v_sat_pk_u8_i16", VOP_I32_I32>;
-defm V_CVT_NORM_I16_F16 : VOP1Inst<"v_cvt_norm_i16_f16", VOP_I16_F16>;
-defm V_CVT_NORM_U16_F16 : VOP1Inst<"v_cvt_norm_u16_f16", VOP_I16_F16>;
+class VOP1_DPP8<bits<8> op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl> :
+ VOP_DPP8<ps.OpName, p> {
+ let hasSideEffects = ps.hasSideEffects;
+ let Defs = ps.Defs;
+ let SchedRW = ps.SchedRW;
+ let Uses = ps.Uses;
-} // End SubtargetPredicate = isGFX9
+ bits<8> vdst;
+ let Inst{8-0} = fi;
+ let Inst{16-9} = op;
+ let Inst{24-17} = !if(p.EmitDst, vdst{7-0}, 0);
+ let Inst{31-25} = 0x3f;
+
+ let AssemblerPredicate = !if(p.HasExt, HasDPP8, DisableInst);
+ let SubtargetPredicate = HasDPP8;
+}
//===----------------------------------------------------------------------===//
-// Target
+// GFX10.
//===----------------------------------------------------------------------===//
+let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
+ multiclass VOP1Only_Real_gfx10<bits<9> op> {
+ def _gfx10 :
+ VOP1_Real<!cast<VOP1_Pseudo>(NAME), SIEncodingFamily.GFX10>,
+ VOP1e<op{7-0}, !cast<VOP1_Pseudo>(NAME).Pfl>;
+ }
+ multiclass VOP1_Real_e32_gfx10<bits<9> op> {
+ def _e32_gfx10 :
+ VOP1_Real<!cast<VOP1_Pseudo>(NAME#"_e32"), SIEncodingFamily.GFX10>,
+ VOP1e<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32").Pfl>;
+ }
+ multiclass VOP1_Real_e64_gfx10<bits<9> op> {
+ def _e64_gfx10 :
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX10>,
+ VOP3e_gfx10<{0, 1, 1, op{6-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
+ }
+ multiclass VOP1_Real_sdwa_gfx10<bits<9> op> {
+ def _sdwa_gfx10 :
+ VOP_SDWA10_Real<!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
+ VOP1_SDWA9Ae<op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl> {
+ let DecoderNamespace = "SDWA10";
+ }
+ }
+ multiclass VOP1_Real_dpp_gfx10<bits<9> op> {
+ def _dpp_gfx10 : VOP1_DPP16<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")> {
+ let DecoderNamespace = "SDWA10";
+ }
+ }
+ multiclass VOP1_Real_dpp8_gfx10<bits<9> op> {
+ def _dpp8_gfx10 : VOP1_DPP8<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")> {
+ let DecoderNamespace = "DPP8";
+ }
+ }
+} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10"
+
+multiclass VOP1_Real_gfx10_no_dpp<bits<9> op> :
+ VOP1_Real_e32_gfx10<op>, VOP1_Real_e64_gfx10<op>,
+ VOP1_Real_sdwa_gfx10<op>;
+
+multiclass VOP1_Real_gfx10_no_dpp8<bits<9> op> :
+ VOP1_Real_e32_gfx10<op>, VOP1_Real_e64_gfx10<op>,
+ VOP1_Real_sdwa_gfx10<op>, VOP1_Real_dpp_gfx10<op>;
+
+multiclass VOP1_Real_gfx10<bits<9> op> :
+ VOP1_Real_gfx10_no_dpp8<op>, VOP1_Real_dpp8_gfx10<op>;
+
+defm V_PIPEFLUSH : VOP1_Real_gfx10<0x01b>;
+defm V_MOVRELSD_2_B32 : VOP1_Real_gfx10<0x048>;
+defm V_CVT_F16_U16 : VOP1_Real_gfx10<0x050>;
+defm V_CVT_F16_I16 : VOP1_Real_gfx10<0x051>;
+defm V_CVT_U16_F16 : VOP1_Real_gfx10<0x052>;
+defm V_CVT_I16_F16 : VOP1_Real_gfx10<0x053>;
+defm V_RCP_F16 : VOP1_Real_gfx10<0x054>;
+defm V_SQRT_F16 : VOP1_Real_gfx10<0x055>;
+defm V_RSQ_F16 : VOP1_Real_gfx10<0x056>;
+defm V_LOG_F16 : VOP1_Real_gfx10<0x057>;
+defm V_EXP_F16 : VOP1_Real_gfx10<0x058>;
+defm V_FREXP_MANT_F16 : VOP1_Real_gfx10<0x059>;
+defm V_FREXP_EXP_I16_F16 : VOP1_Real_gfx10<0x05a>;
+defm V_FLOOR_F16 : VOP1_Real_gfx10<0x05b>;
+defm V_CEIL_F16 : VOP1_Real_gfx10<0x05c>;
+defm V_TRUNC_F16 : VOP1_Real_gfx10<0x05d>;
+defm V_RNDNE_F16 : VOP1_Real_gfx10<0x05e>;
+defm V_FRACT_F16 : VOP1_Real_gfx10<0x05f>;
+defm V_SIN_F16 : VOP1_Real_gfx10<0x060>;
+defm V_COS_F16 : VOP1_Real_gfx10<0x061>;
+defm V_SAT_PK_U8_I16 : VOP1_Real_gfx10<0x062>;
+defm V_CVT_NORM_I16_F16 : VOP1_Real_gfx10<0x063>;
+defm V_CVT_NORM_U16_F16 : VOP1_Real_gfx10<0x064>;
+
+defm V_SWAP_B32 : VOP1Only_Real_gfx10<0x065>;
+defm V_SWAPREL_B32 : VOP1Only_Real_gfx10<0x068>;
+
//===----------------------------------------------------------------------===//
-// SI
+// GFX7, GFX10.
//===----------------------------------------------------------------------===//
-multiclass VOP1_Real_si <bits<9> op> {
- let AssemblerPredicates = [isSICI], DecoderNamespace = "SICI" in {
- def _e32_si :
+let AssemblerPredicate = isGFX7Only, DecoderNamespace = "GFX7" in {
+ multiclass VOP1_Real_e32_gfx7<bits<9> op> {
+ def _e32_gfx7 :
VOP1_Real<!cast<VOP1_Pseudo>(NAME#"_e32"), SIEncodingFamily.SI>,
VOP1e<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32").Pfl>;
- def _e64_si :
+ }
+ multiclass VOP1_Real_e64_gfx7<bits<9> op> {
+ def _e64_gfx7 :
VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>,
- VOP3e_si <{1, 1, op{6-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
+ VOP3e_gfx6_gfx7<{1, 1, op{6-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
}
-}
+} // End AssemblerPredicate = isGFX7Only, DecoderNamespace = "GFX7"
-defm V_NOP : VOP1_Real_si <0x0>;
-defm V_MOV_B32 : VOP1_Real_si <0x1>;
-defm V_CVT_I32_F64 : VOP1_Real_si <0x3>;
-defm V_CVT_F64_I32 : VOP1_Real_si <0x4>;
-defm V_CVT_F32_I32 : VOP1_Real_si <0x5>;
-defm V_CVT_F32_U32 : VOP1_Real_si <0x6>;
-defm V_CVT_U32_F32 : VOP1_Real_si <0x7>;
-defm V_CVT_I32_F32 : VOP1_Real_si <0x8>;
-defm V_MOV_FED_B32 : VOP1_Real_si <0x9>;
-defm V_CVT_F16_F32 : VOP1_Real_si <0xa>;
-defm V_CVT_F32_F16 : VOP1_Real_si <0xb>;
-defm V_CVT_RPI_I32_F32 : VOP1_Real_si <0xc>;
-defm V_CVT_FLR_I32_F32 : VOP1_Real_si <0xd>;
-defm V_CVT_OFF_F32_I4 : VOP1_Real_si <0xe>;
-defm V_CVT_F32_F64 : VOP1_Real_si <0xf>;
-defm V_CVT_F64_F32 : VOP1_Real_si <0x10>;
-defm V_CVT_F32_UBYTE0 : VOP1_Real_si <0x11>;
-defm V_CVT_F32_UBYTE1 : VOP1_Real_si <0x12>;
-defm V_CVT_F32_UBYTE2 : VOP1_Real_si <0x13>;
-defm V_CVT_F32_UBYTE3 : VOP1_Real_si <0x14>;
-defm V_CVT_U32_F64 : VOP1_Real_si <0x15>;
-defm V_CVT_F64_U32 : VOP1_Real_si <0x16>;
-defm V_FRACT_F32 : VOP1_Real_si <0x20>;
-defm V_TRUNC_F32 : VOP1_Real_si <0x21>;
-defm V_CEIL_F32 : VOP1_Real_si <0x22>;
-defm V_RNDNE_F32 : VOP1_Real_si <0x23>;
-defm V_FLOOR_F32 : VOP1_Real_si <0x24>;
-defm V_EXP_F32 : VOP1_Real_si <0x25>;
-defm V_LOG_CLAMP_F32 : VOP1_Real_si <0x26>;
-defm V_LOG_F32 : VOP1_Real_si <0x27>;
-defm V_RCP_CLAMP_F32 : VOP1_Real_si <0x28>;
-defm V_RCP_LEGACY_F32 : VOP1_Real_si <0x29>;
-defm V_RCP_F32 : VOP1_Real_si <0x2a>;
-defm V_RCP_IFLAG_F32 : VOP1_Real_si <0x2b>;
-defm V_RSQ_CLAMP_F32 : VOP1_Real_si <0x2c>;
-defm V_RSQ_LEGACY_F32 : VOP1_Real_si <0x2d>;
-defm V_RSQ_F32 : VOP1_Real_si <0x2e>;
-defm V_RCP_F64 : VOP1_Real_si <0x2f>;
-defm V_RCP_CLAMP_F64 : VOP1_Real_si <0x30>;
-defm V_RSQ_F64 : VOP1_Real_si <0x31>;
-defm V_RSQ_CLAMP_F64 : VOP1_Real_si <0x32>;
-defm V_SQRT_F32 : VOP1_Real_si <0x33>;
-defm V_SQRT_F64 : VOP1_Real_si <0x34>;
-defm V_SIN_F32 : VOP1_Real_si <0x35>;
-defm V_COS_F32 : VOP1_Real_si <0x36>;
-defm V_NOT_B32 : VOP1_Real_si <0x37>;
-defm V_BFREV_B32 : VOP1_Real_si <0x38>;
-defm V_FFBH_U32 : VOP1_Real_si <0x39>;
-defm V_FFBL_B32 : VOP1_Real_si <0x3a>;
-defm V_FFBH_I32 : VOP1_Real_si <0x3b>;
-defm V_FREXP_EXP_I32_F64 : VOP1_Real_si <0x3c>;
-defm V_FREXP_MANT_F64 : VOP1_Real_si <0x3d>;
-defm V_FRACT_F64 : VOP1_Real_si <0x3e>;
-defm V_FREXP_EXP_I32_F32 : VOP1_Real_si <0x3f>;
-defm V_FREXP_MANT_F32 : VOP1_Real_si <0x40>;
-defm V_CLREXCP : VOP1_Real_si <0x41>;
-defm V_MOVRELD_B32 : VOP1_Real_si <0x42>;
-defm V_MOVRELS_B32 : VOP1_Real_si <0x43>;
-defm V_MOVRELSD_B32 : VOP1_Real_si <0x44>;
+multiclass VOP1_Real_gfx7<bits<9> op> :
+ VOP1_Real_e32_gfx7<op>, VOP1_Real_e64_gfx7<op>;
+
+multiclass VOP1_Real_gfx7_gfx10<bits<9> op> :
+ VOP1_Real_gfx7<op>, VOP1_Real_gfx10<op>;
+
+defm V_LOG_LEGACY_F32 : VOP1_Real_gfx7<0x045>;
+defm V_EXP_LEGACY_F32 : VOP1_Real_gfx7<0x046>;
+
+defm V_TRUNC_F64 : VOP1_Real_gfx7_gfx10<0x017>;
+defm V_CEIL_F64 : VOP1_Real_gfx7_gfx10<0x018>;
+defm V_RNDNE_F64 : VOP1_Real_gfx7_gfx10<0x019>;
+defm V_FLOOR_F64 : VOP1_Real_gfx7_gfx10<0x01a>;
//===----------------------------------------------------------------------===//
-// CI
+// GFX6, GFX7, GFX10.
//===----------------------------------------------------------------------===//
-multiclass VOP1_Real_ci <bits<9> op> {
- let AssemblerPredicates = [isCIOnly], DecoderNamespace = "CI" in {
- def _e32_ci :
+let AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" in {
+ multiclass VOP1_Real_e32_gfx6_gfx7<bits<9> op> {
+ def _e32_gfx6_gfx7 :
VOP1_Real<!cast<VOP1_Pseudo>(NAME#"_e32"), SIEncodingFamily.SI>,
VOP1e<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32").Pfl>;
- def _e64_ci :
+ }
+ multiclass VOP1_Real_e64_gfx6_gfx7<bits<9> op> {
+ def _e64_gfx6_gfx7 :
VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>,
- VOP3e_si <{1, 1, op{6-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
+ VOP3e_gfx6_gfx7<{1, 1, op{6-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
}
-}
-
-defm V_TRUNC_F64 : VOP1_Real_ci <0x17>;
-defm V_CEIL_F64 : VOP1_Real_ci <0x18>;
-defm V_FLOOR_F64 : VOP1_Real_ci <0x1A>;
-defm V_RNDNE_F64 : VOP1_Real_ci <0x19>;
-defm V_LOG_LEGACY_F32 : VOP1_Real_ci <0x45>;
-defm V_EXP_LEGACY_F32 : VOP1_Real_ci <0x46>;
+} // End AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7"
+
+multiclass VOP1_Real_gfx6_gfx7<bits<9> op> :
+ VOP1_Real_e32_gfx6_gfx7<op>, VOP1_Real_e64_gfx6_gfx7<op>;
+
+multiclass VOP1_Real_gfx6_gfx7_gfx10<bits<9> op> :
+ VOP1_Real_gfx6_gfx7<op>, VOP1_Real_gfx10<op>;
+
+multiclass VOP1_Real_gfx6_gfx7_gfx10_no_dpp8<bits<9> op> :
+ VOP1_Real_gfx6_gfx7<op>, VOP1_Real_gfx10_no_dpp8<op>;
+
+multiclass VOP1_Real_gfx6_gfx7_gfx10_no_dpp<bits<9> op> :
+ VOP1_Real_gfx6_gfx7<op>, VOP1_Real_gfx10_no_dpp<op>;
+
+defm V_LOG_CLAMP_F32 : VOP1_Real_gfx6_gfx7<0x026>;
+defm V_RCP_CLAMP_F32 : VOP1_Real_gfx6_gfx7<0x028>;
+defm V_RCP_LEGACY_F32 : VOP1_Real_gfx6_gfx7<0x029>;
+defm V_RSQ_CLAMP_F32 : VOP1_Real_gfx6_gfx7<0x02c>;
+defm V_RSQ_LEGACY_F32 : VOP1_Real_gfx6_gfx7<0x02d>;
+defm V_RCP_CLAMP_F64 : VOP1_Real_gfx6_gfx7<0x030>;
+defm V_RSQ_CLAMP_F64 : VOP1_Real_gfx6_gfx7<0x032>;
+
+defm V_NOP : VOP1_Real_gfx6_gfx7_gfx10<0x000>;
+defm V_MOV_B32 : VOP1_Real_gfx6_gfx7_gfx10<0x001>;
+defm V_CVT_I32_F64 : VOP1_Real_gfx6_gfx7_gfx10<0x003>;
+defm V_CVT_F64_I32 : VOP1_Real_gfx6_gfx7_gfx10<0x004>;
+defm V_CVT_F32_I32 : VOP1_Real_gfx6_gfx7_gfx10<0x005>;
+defm V_CVT_F32_U32 : VOP1_Real_gfx6_gfx7_gfx10<0x006>;
+defm V_CVT_U32_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x007>;
+defm V_CVT_I32_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x008>;
+defm V_MOV_FED_B32 : VOP1_Real_gfx6_gfx7_gfx10<0x009>;
+defm V_CVT_F16_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x00a>;
+defm V_CVT_F32_F16 : VOP1_Real_gfx6_gfx7_gfx10<0x00b>;
+defm V_CVT_RPI_I32_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x00c>;
+defm V_CVT_FLR_I32_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x00d>;
+defm V_CVT_OFF_F32_I4 : VOP1_Real_gfx6_gfx7_gfx10<0x00e>;
+defm V_CVT_F32_F64 : VOP1_Real_gfx6_gfx7_gfx10<0x00f>;
+defm V_CVT_F64_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x010>;
+defm V_CVT_F32_UBYTE0 : VOP1_Real_gfx6_gfx7_gfx10<0x011>;
+defm V_CVT_F32_UBYTE1 : VOP1_Real_gfx6_gfx7_gfx10<0x012>;
+defm V_CVT_F32_UBYTE2 : VOP1_Real_gfx6_gfx7_gfx10<0x013>;
+defm V_CVT_F32_UBYTE3 : VOP1_Real_gfx6_gfx7_gfx10<0x014>;
+defm V_CVT_U32_F64 : VOP1_Real_gfx6_gfx7_gfx10<0x015>;
+defm V_CVT_F64_U32 : VOP1_Real_gfx6_gfx7_gfx10<0x016>;
+defm V_FRACT_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x020>;
+defm V_TRUNC_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x021>;
+defm V_CEIL_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x022>;
+defm V_RNDNE_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x023>;
+defm V_FLOOR_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x024>;
+defm V_EXP_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x025>;
+defm V_LOG_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x027>;
+defm V_RCP_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x02a>;
+defm V_RCP_IFLAG_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x02b>;
+defm V_RSQ_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x02e>;
+defm V_RCP_F64 : VOP1_Real_gfx6_gfx7_gfx10<0x02f>;
+defm V_RSQ_F64 : VOP1_Real_gfx6_gfx7_gfx10<0x031>;
+defm V_SQRT_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x033>;
+defm V_SQRT_F64 : VOP1_Real_gfx6_gfx7_gfx10<0x034>;
+defm V_SIN_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x035>;
+defm V_COS_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x036>;
+defm V_NOT_B32 : VOP1_Real_gfx6_gfx7_gfx10<0x037>;
+defm V_BFREV_B32 : VOP1_Real_gfx6_gfx7_gfx10<0x038>;
+defm V_FFBH_U32 : VOP1_Real_gfx6_gfx7_gfx10<0x039>;
+defm V_FFBL_B32 : VOP1_Real_gfx6_gfx7_gfx10<0x03a>;
+defm V_FFBH_I32 : VOP1_Real_gfx6_gfx7_gfx10<0x03b>;
+defm V_FREXP_EXP_I32_F64 : VOP1_Real_gfx6_gfx7_gfx10<0x03c>;
+defm V_FREXP_MANT_F64 : VOP1_Real_gfx6_gfx7_gfx10<0x03d>;
+defm V_FRACT_F64 : VOP1_Real_gfx6_gfx7_gfx10<0x03e>;
+defm V_FREXP_EXP_I32_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x03f>;
+defm V_FREXP_MANT_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x040>;
+defm V_CLREXCP : VOP1_Real_gfx6_gfx7_gfx10<0x041>;
+defm V_MOVRELD_B32 : VOP1_Real_gfx6_gfx7_gfx10_no_dpp<0x042>;
+defm V_MOVRELS_B32 : VOP1_Real_gfx6_gfx7_gfx10_no_dpp8<0x043>;
+defm V_MOVRELSD_B32 : VOP1_Real_gfx6_gfx7_gfx10_no_dpp8<0x044>;
//===----------------------------------------------------------------------===//
-// VI
+// GFX8, GFX9 (VI).
//===----------------------------------------------------------------------===//
class VOP1_DPPe <bits<8> op, VOP1_DPP_Pseudo ps, VOPProfile P = ps.Pfl> :
@@ -524,7 +683,7 @@ class VOP1_DPPe <bits<8> op, VOP1_DPP_Pseudo ps, VOPProfile P = ps.Pfl> :
}
multiclass VOP1Only_Real_vi <bits<10> op> {
- let AssemblerPredicates = [isVI], DecoderNamespace = "VI" in {
+ let AssemblerPredicates = [isGFX8GFX9], DecoderNamespace = "GFX8" in {
def _vi :
VOP1_Real<!cast<VOP1_Pseudo>(NAME), SIEncodingFamily.VI>,
VOP1e<op{7-0}, !cast<VOP1_Pseudo>(NAME).Pfl>;
@@ -532,7 +691,7 @@ multiclass VOP1Only_Real_vi <bits<10> op> {
}
multiclass VOP1_Real_e32e64_vi <bits<10> op> {
- let AssemblerPredicates = [isVI], DecoderNamespace = "VI" in {
+ let AssemblerPredicates = [isGFX8GFX9], DecoderNamespace = "GFX8" in {
def _e32_vi :
VOP1_Real<!cast<VOP1_Pseudo>(NAME#"_e32"), SIEncodingFamily.VI>,
VOP1e<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32").Pfl>;
@@ -649,7 +808,7 @@ def V_MOV_B32_indirect : VPseudoInstSI<(outs),
PseudoInstExpansion<(V_MOV_B32_e32_vi getVALUDstForVT<i32>.ret:$vdst,
getVOPSrc0ForVT<i32>.ret:$src0)> {
let VOP1 = 1;
- let SubtargetPredicate = isVI;
+ let SubtargetPredicate = isGFX8GFX9;
}
// This is a pseudo variant of the v_movreld_b32 instruction in which the
@@ -672,7 +831,7 @@ def V_MOVRELD_B32_V4 : V_MOVRELD_B32_pseudo<VReg_128>;
def V_MOVRELD_B32_V8 : V_MOVRELD_B32_pseudo<VReg_256>;
def V_MOVRELD_B32_V16 : V_MOVRELD_B32_pseudo<VReg_512>;
-let OtherPredicates = [isVI] in {
+let OtherPredicates = [isGFX8GFX9] in {
def : GCNPat <
(i32 (int_amdgcn_mov_dpp i32:$src, imm:$dpp_ctrl, imm:$row_mask, imm:$bank_mask,
@@ -690,6 +849,9 @@ def : GCNPat <
(as_i1imm $bound_ctrl))
>;
+} // End OtherPredicates = [isGFX8GFX9]
+
+let OtherPredicates = [isGFX8Plus] in {
def : GCNPat<
(i32 (anyext i16:$src)),
(COPY $src)
@@ -712,14 +874,14 @@ def : GCNPat <
(EXTRACT_SUBREG $src, sub0)
>;
-} // End OtherPredicates = [isVI]
+} // End OtherPredicates = [isGFX8Plus]
//===----------------------------------------------------------------------===//
// GFX9
//===----------------------------------------------------------------------===//
multiclass VOP1_Real_gfx9 <bits<10> op> {
- let AssemblerPredicates = [isGFX9], DecoderNamespace = "GFX9" in {
+ let AssemblerPredicates = [isGFX9Only], DecoderNamespace = "GFX9" in {
defm NAME : VOP1_Real_e32e64_vi <op>;
}
@@ -735,3 +897,30 @@ multiclass VOP1_Real_gfx9 <bits<10> op> {
}
defm V_SCREEN_PARTITION_4SE_B32 : VOP1_Real_gfx9 <0x37>;
+
+//===----------------------------------------------------------------------===//
+// GFX10
+//===----------------------------------------------------------------------===//
+
+let OtherPredicates = [isGFX10Plus] in {
+def : GCNPat <
+ (i32 (int_amdgcn_mov_dpp8 i32:$src, imm:$dpp8)),
+ (V_MOV_B32_dpp8_gfx10 $src, $src, (as_i32imm $dpp8), (i32 DPP8Mode.FI_0))
+>;
+
+def : GCNPat <
+ (i32 (int_amdgcn_mov_dpp i32:$src, imm:$dpp_ctrl, imm:$row_mask, imm:$bank_mask,
+ imm:$bound_ctrl)),
+ (V_MOV_B32_dpp_gfx10 $src, $src, (as_i32imm $dpp_ctrl),
+ (as_i32imm $row_mask), (as_i32imm $bank_mask),
+ (as_i1imm $bound_ctrl), (i32 0))
+>;
+
+def : GCNPat <
+ (i32 (int_amdgcn_update_dpp i32:$old, i32:$src, imm:$dpp_ctrl, imm:$row_mask,
+ imm:$bank_mask, imm:$bound_ctrl)),
+ (V_MOV_B32_dpp_gfx10 $old, $src, (as_i32imm $dpp_ctrl),
+ (as_i32imm $row_mask), (as_i32imm $bank_mask),
+ (as_i1imm $bound_ctrl), (i32 0))
+>;
+} // End OtherPredicates = [isGFX10Plus]
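// Illustrative sketch (editorial note, operands are hypothetical): the three
// GFX10 patterns above rewrite the DPP intrinsics into the v_mov_b32 reals
// defined earlier in this file. An IR call such as
//   %v = call i32 @llvm.amdgcn.update.dpp.i32(i32 %old, i32 %src,
//            i32 %dpp_ctrl, i32 %row_mask, i32 %bank_mask, i1 %bound_ctrl)
// is selected to
//   (V_MOV_B32_dpp_gfx10 $old, $src, dpp_ctrl, row_mask, bank_mask,
//                        bound_ctrl, 0)
// where the trailing 0 fills the new FI operand; the dpp8 intrinsic instead
// gets DPP8Mode.FI_0, exactly as written in the patterns above.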
diff --git a/lib/Target/AMDGPU/VOP2Instructions.td b/lib/Target/AMDGPU/VOP2Instructions.td
index e3fd7b5f9fad..1b30cd2ed516 100644
--- a/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/lib/Target/AMDGPU/VOP2Instructions.td
@@ -1,9 +1,8 @@
//===-- VOP2Instructions.td - Vector Instruction Definitions --------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -69,7 +68,6 @@ class VOP2_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], string suf
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
- let SubtargetPredicate = isGCN;
let VOP2 = 1;
let VALU = 1;
@@ -177,7 +175,9 @@ multiclass VOP2bInst <string opName,
let SchedRW = [Write32Bit, WriteSALU] in {
let Uses = !if(useSGPRInput, [VCC, EXEC], [EXEC]), Defs = [VCC] in {
def _e32 : VOP2_Pseudo <opName, P, VOPPatOrNull<node,P>.ret>,
- Commutable_REV<revOp#"_e32", !eq(revOp, opName)>;
+ Commutable_REV<revOp#"_e32", !eq(revOp, opName)> {
+ let usesCustomInserter = !eq(P.NumSrcArgs, 2);
+ }
def _sdwa : VOP2_SDWA_Pseudo <opName, P> {
let AsmMatchConverter = "cvtSdwaVOP2b";
@@ -192,6 +192,23 @@ multiclass VOP2bInst <string opName,
}
}
+class VOP2bInstAlias <VOP2_Pseudo ps, Instruction inst,
+ string OpName, string opnd> :
+ InstAlias <OpName#" "#!subst("vcc", opnd, ps.Pfl.Asm32),
+ (inst ps.Pfl.DstRC:$vdst, ps.Pfl.Src0RC32:$src0,
+ ps.Pfl.Src1RC32:$src1)>,
+ PredicateControl {
+}
+
+multiclass VOP2bInstAliases<VOP2_Pseudo ps, VOP2_Real inst, string OpName> {
+ let WaveSizePredicate = isWave32 in {
+ def : VOP2bInstAlias<ps, inst, OpName, "vcc_lo">;
+ }
+ let WaveSizePredicate = isWave64 in {
+ def : VOP2bInstAlias<ps, inst, OpName, "vcc">;
+ }
+}
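// Illustrative sketch (editorial note, concrete registers are hypothetical):
// given the defm added later in this patch,
//   defm : VOP2bInstAliases<V_ADDC_U32_e32, V_ADD_CO_CI_U32_e32_gfx10,
//                           "v_add_co_ci_u32">;
// the wave32 alias accepts
//   v_add_co_ci_u32 v0, vcc_lo, v1, v2, vcc_lo
// and the wave64 alias accepts the same operands spelled with vcc, both
// resolving to the gfx10 e32 encoding.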
+
multiclass VOP2eInst <string opName,
VOPProfile P,
SDPatternOperator node = null_frag,
@@ -216,6 +233,22 @@ multiclass VOP2eInst <string opName,
}
}
+class VOP2eInstAlias <VOP2_Pseudo ps, Instruction inst, string opnd> :
+ InstAlias <ps.OpName#" "#ps.Pfl.Asm32#", "#opnd,
+ (inst ps.Pfl.DstRC:$vdst, ps.Pfl.Src0RC32:$src0,
+ ps.Pfl.Src1RC32:$src1)>,
+ PredicateControl {
+}
+
+multiclass VOP2eInstAliases<VOP2_Pseudo ps, VOP2_Real inst> {
+ let WaveSizePredicate = isWave32 in {
+ def : VOP2eInstAlias<ps, inst, "vcc_lo">;
+ }
+ let WaveSizePredicate = isWave64 in {
+ def : VOP2eInstAlias<ps, inst, "vcc">;
+ }
+}
+
class VOP_MADAK <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
field Operand ImmOpType = !if(!eq(vt.Size, 32), f32kimm, f16kimm);
field dag Ins32 = (ins VCSrc_f32:$src0, VGPR_32:$src1, ImmOpType:$imm);
@@ -244,15 +277,22 @@ def VOP_MADMK_F32 : VOP_MADMK <f32>;
// FIXME: Remove src2_modifiers. It isn't used, so is wasting memory
// and processing time but it makes it easier to convert to mad.
-class VOP_MAC <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
+class VOP_MAC <ValueType vt0, ValueType vt1=vt0> : VOPProfile <[vt0, vt1, vt1, vt0]> {
let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VGPR_32:$src2);
let Ins64 = getIns64<Src0RC64, Src1RC64, RegisterOperand<VGPR_32>, 3,
- 0, HasModifiers, HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret;
+ 0, HasModifiers, HasModifiers, HasOMod,
+ Src0Mod, Src1Mod, Src2Mod>.ret;
let InsDPP = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0,
Src1ModDPP:$src1_modifiers, Src1DPP:$src1,
VGPR_32:$src2, // stub argument
dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
+ let InsDPP16 = !con(InsDPP, (ins FI:$fi));
+
+ let InsDPP8 = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0,
+ Src1ModDPP:$src1_modifiers, Src1DPP:$src1,
+ VGPR_32:$src2, // stub argument
+ dpp8:$dpp8, FI:$fi);
let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0,
Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1,
@@ -260,11 +300,13 @@ class VOP_MAC <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
clampmod:$clamp, omod:$omod,
dst_sel:$dst_sel, dst_unused:$dst_unused,
src0_sel:$src0_sel, src1_sel:$src1_sel);
- let Asm32 = getAsm32<1, 2, vt>.ret;
- let Asm64 = getAsm64<1, 2, 0, HasModifiers, HasOMod, vt>.ret;
- let AsmDPP = getAsmDPP<1, 2, HasModifiers, vt>.ret;
- let AsmSDWA = getAsmSDWA<1, 2, vt>.ret;
- let AsmSDWA9 = getAsmSDWA9<1, 1, 2, vt>.ret;
+ let Asm32 = getAsm32<1, 2, vt0>.ret;
+ let Asm64 = getAsm64<1, 2, 0, HasModifiers, HasOMod, vt0>.ret;
+ let AsmDPP = getAsmDPP<1, 2, HasModifiers, vt0>.ret;
+ let AsmDPP16 = getAsmDPP16<1, 2, HasModifiers, vt0>.ret;
+ let AsmDPP8 = getAsmDPP8<1, 2, 0, vt0>.ret;
+ let AsmSDWA = getAsmSDWA<1, 2, vt0>.ret;
+ let AsmSDWA9 = getAsmSDWA9<1, 1, 2, vt0>.ret;
let HasSrc2 = 0;
let HasSrc2Mods = 0;
@@ -272,38 +314,51 @@ class VOP_MAC <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
let HasExtDPP = 1;
let HasExtSDWA = 1;
let HasExtSDWA9 = 0;
+ let TieRegDPP = "$src2";
}
def VOP_MAC_F16 : VOP_MAC <f16>;
def VOP_MAC_F32 : VOP_MAC <f32>;
+class VOP_DOT_ACC<ValueType vt0, ValueType vt1> : VOP_MAC<vt0, vt1> {
+ let HasClamp = 0;
+ let HasExtSDWA = 0;
+ let HasModifiers = 1;
+ let HasOpSel = 0;
+ let IsPacked = 0;
+}
+
+def VOP_DOT_ACC_F32_V2F16 : VOP_DOT_ACC<f32, v2f16> {
+ let Src0ModDPP = FPVRegInputMods;
+ let Src1ModDPP = FPVRegInputMods;
+}
+def VOP_DOT_ACC_I32_I32 : VOP_DOT_ACC<i32, i32>;
+
// Write out to vcc or arbitrary SGPR.
-def VOP2b_I32_I1_I32_I32 : VOPProfile<[i32, i32, i32, untyped]> {
+def VOP2b_I32_I1_I32_I32 : VOPProfile<[i32, i32, i32, untyped], 0, /*EnableClamp=*/1> {
let Asm32 = "$vdst, vcc, $src0, $src1";
- let Asm64 = "$vdst, $sdst, $src0, $src1";
+ let Asm64 = "$vdst, $sdst, $src0, $src1$clamp";
let AsmSDWA = "$vdst, vcc, $src0_modifiers, $src1_modifiers$clamp $dst_sel $dst_unused $src0_sel $src1_sel";
let AsmSDWA9 = "$vdst, vcc, $src0_modifiers, $src1_modifiers$clamp $dst_sel $dst_unused $src0_sel $src1_sel";
let AsmDPP = "$vdst, vcc, $src0, $src1 $dpp_ctrl$row_mask$bank_mask$bound_ctrl";
+ let AsmDPP8 = "$vdst, vcc, $src0, $src1 $dpp8$fi";
+ let AsmDPP16 = AsmDPP#"$fi";
let Outs32 = (outs DstRC:$vdst);
- let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst);
+ let Outs64 = (outs DstRC:$vdst, VOPDstS64orS32:$sdst);
}
// Write out to vcc or arbitrary SGPR and read in from vcc or
// arbitrary SGPR.
-def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> {
- // We use VCSrc_b32 to exclude literal constants, even though the
- // encoding normally allows them since the implicit VCC use means
- // using one would always violate the constant bus
- // restriction. SGPRs are still allowed because it should
- // technically be possible to use VCC again as src0.
- let Src0RC32 = VCSrc_b32;
+def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1], 0, /*EnableClamp=*/1> {
let Asm32 = "$vdst, vcc, $src0, $src1, vcc";
- let Asm64 = "$vdst, $sdst, $src0, $src1, $src2";
+ let Asm64 = "$vdst, $sdst, $src0, $src1, $src2$clamp";
let AsmSDWA = "$vdst, vcc, $src0_modifiers, $src1_modifiers, vcc $clamp $dst_sel $dst_unused $src0_sel $src1_sel";
let AsmSDWA9 = "$vdst, vcc, $src0_modifiers, $src1_modifiers, vcc $clamp $dst_sel $dst_unused $src0_sel $src1_sel";
let AsmDPP = "$vdst, vcc, $src0, $src1, vcc $dpp_ctrl$row_mask$bank_mask$bound_ctrl";
+ let AsmDPP8 = "$vdst, vcc, $src0, $src1, vcc $dpp8$fi";
+ let AsmDPP16 = AsmDPP#"$fi";
let Outs32 = (outs DstRC:$vdst);
- let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst);
+ let Outs64 = (outs DstRC:$vdst, VOPDstS64orS32:$sdst);
// Suppress src2 implied by type since the 32-bit encoding uses an
// implicit VCC use.
@@ -320,20 +375,23 @@ def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> {
Src1DPP:$src1,
dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
+ let InsDPP16 = !con(InsDPP, (ins FI:$fi));
+
let HasExt = 1;
let HasExtDPP = 1;
let HasExtSDWA = 1;
let HasExtSDWA9 = 1;
}
-// Read in from vcc or arbitrary SGPR
-def VOP2e_I32_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> {
- let Src0RC32 = VCSrc_b32; // See comment in def VOP2b_I32_I1_I32_I32_I1 above.
- let Asm32 = "$vdst, $src0, $src1, vcc";
- let Asm64 = "$vdst, $src0, $src1, $src2";
+// Read in from vcc or arbitrary SGPR.
+def VOP2e_I32_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1], /*EnableF32SrcMods=*/1> {
+ let Asm32 = "$vdst, $src0, $src1";
+ let Asm64 = "$vdst, $src0_modifiers, $src1_modifiers, $src2";
let AsmSDWA = "$vdst, $src0_modifiers, $src1_modifiers, vcc $clamp $dst_sel $dst_unused $src0_sel $src1_sel";
let AsmSDWA9 = "$vdst, $src0_modifiers, $src1_modifiers, vcc $clamp $dst_sel $dst_unused $src0_sel $src1_sel";
let AsmDPP = "$vdst, $src0, $src1, vcc $dpp_ctrl$row_mask$bank_mask$bound_ctrl";
+ let AsmDPP8 = "$vdst, $src0, $src1, vcc $dpp8$fi";
+ let AsmDPP16 = AsmDPP#"$fi";
let Outs32 = (outs DstRC:$vdst);
let Outs64 = (outs DstRC:$vdst);
@@ -349,10 +407,12 @@ def VOP2e_I32_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> {
src0_sel:$src0_sel, src1_sel:$src1_sel);
let InsDPP = (ins DstRCDPP:$old,
- Src0DPP:$src0,
- Src1DPP:$src1,
+ Src0ModDPP:$src0_modifiers, Src0DPP:$src0,
+ Src1ModDPP:$src1_modifiers, Src1DPP:$src1,
dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
+ let InsDPP16 = !con(InsDPP, (ins FI:$fi));
+
let HasExt = 1;
let HasExtDPP = 1;
let HasExtSDWA = 1;
@@ -362,7 +422,7 @@ def VOP2e_I32_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> {
def VOP_READLANE : VOPProfile<[i32, i32, i32]> {
let Outs32 = (outs SReg_32:$vdst);
let Outs64 = Outs32;
- let Ins32 = (ins VGPR_32:$src0, SCSrc_b32:$src1);
+ let Ins32 = (ins VRegOrLds_32:$src0, SCSrc_b32:$src1);
let Ins64 = Ins32;
let Asm32 = " $vdst, $src0, $src1";
let Asm64 = Asm32;
@@ -393,8 +453,6 @@ def VOP_WRITELANE : VOPProfile<[i32, i32, i32, i32]> {
// VOP2 Instructions
//===----------------------------------------------------------------------===//
-let SubtargetPredicate = isGCN, Predicates = [isGCN] in {
-
defm V_CNDMASK_B32 : VOP2eInst <"v_cndmask_b32", VOP2e_I32_I32_I32_I1>;
def V_MADMK_F32 : VOP2_Pseudo <"v_madmk_f32", VOP_MADMK_F32, []>;
@@ -414,9 +472,9 @@ defm V_MIN_I32 : VOP2Inst <"v_min_i32", VOP_PAT_GEN<VOP_I32_I32_I32>, smin>;
defm V_MAX_I32 : VOP2Inst <"v_max_i32", VOP_PAT_GEN<VOP_I32_I32_I32>, smax>;
defm V_MIN_U32 : VOP2Inst <"v_min_u32", VOP_PAT_GEN<VOP_I32_I32_I32>, umin>;
defm V_MAX_U32 : VOP2Inst <"v_max_u32", VOP_PAT_GEN<VOP_I32_I32_I32>, umax>;
-defm V_LSHRREV_B32 : VOP2Inst <"v_lshrrev_b32", VOP_I32_I32_I32, null_frag, "v_lshr_b32">;
-defm V_ASHRREV_I32 : VOP2Inst <"v_ashrrev_i32", VOP_I32_I32_I32, null_frag, "v_ashr_i32">;
-defm V_LSHLREV_B32 : VOP2Inst <"v_lshlrev_b32", VOP_I32_I32_I32, null_frag, "v_lshl_b32">;
+defm V_LSHRREV_B32 : VOP2Inst <"v_lshrrev_b32", VOP_I32_I32_I32, lshr_rev, "v_lshr_b32">;
+defm V_ASHRREV_I32 : VOP2Inst <"v_ashrrev_i32", VOP_I32_I32_I32, ashr_rev, "v_ashr_i32">;
+defm V_LSHLREV_B32 : VOP2Inst <"v_lshlrev_b32", VOP_I32_I32_I32, lshl_rev, "v_lshl_b32">;
defm V_AND_B32 : VOP2Inst <"v_and_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, and>;
defm V_OR_B32 : VOP2Inst <"v_or_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, or>;
defm V_XOR_B32 : VOP2Inst <"v_xor_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, xor>;
@@ -442,9 +500,9 @@ defm V_SUBBREV_U32 : VOP2bInst <"v_subbrev_u32", VOP2b_I32_I1_I32_I32_I1, null_f
let SubtargetPredicate = HasAddNoCarryInsts in {
-defm V_ADD_U32 : VOP2Inst <"v_add_u32", VOP_I32_I32_I32, null_frag, "v_add_u32", 1>;
-defm V_SUB_U32 : VOP2Inst <"v_sub_u32", VOP_I32_I32_I32, null_frag, "v_sub_u32", 1>;
-defm V_SUBREV_U32 : VOP2Inst <"v_subrev_u32", VOP_I32_I32_I32, null_frag, "v_sub_u32", 1>;
+defm V_ADD_U32 : VOP2Inst <"v_add_u32", VOP_I32_I32_I32_ARITH, null_frag, "v_add_u32", 1>;
+defm V_SUB_U32 : VOP2Inst <"v_sub_u32", VOP_I32_I32_I32_ARITH, null_frag, "v_sub_u32", 1>;
+defm V_SUBREV_U32 : VOP2Inst <"v_subrev_u32", VOP_I32_I32_I32_ARITH, null_frag, "v_sub_u32", 1>;
}
} // End isCommutable = 1
@@ -472,32 +530,20 @@ defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <"v_cvt_pkrtz_f16_f32", VOP_NO_EXT<VOP_V2F16
defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_NO_EXT<VOP_V2I16_I32_I32>, AMDGPUpk_u16_u32>;
defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_NO_EXT<VOP_V2I16_I32_I32>, AMDGPUpk_i16_i32>;
-} // End SubtargetPredicate = isGCN, Predicates = [isGCN]
-
-def : GCNPat<
- (AMDGPUadde i32:$src0, i32:$src1, i1:$src2),
- (V_ADDC_U32_e64 $src0, $src1, $src2)
->;
-
-def : GCNPat<
- (AMDGPUsube i32:$src0, i32:$src1, i1:$src2),
- (V_SUBB_U32_e64 $src0, $src1, $src2)
->;
-
-// These instructions only exist on SI and CI
-let SubtargetPredicate = isSICI, Predicates = [isSICI] in {
+let SubtargetPredicate = isGFX6GFX7 in {
defm V_MIN_LEGACY_F32 : VOP2Inst <"v_min_legacy_f32", VOP_F32_F32_F32, AMDGPUfmin_legacy>;
defm V_MAX_LEGACY_F32 : VOP2Inst <"v_max_legacy_f32", VOP_F32_F32_F32, AMDGPUfmax_legacy>;
+} // End SubtargetPredicate = isGFX6GFX7
+let SubtargetPredicate = isGFX6GFX7GFX10 in {
let isCommutable = 1 in {
defm V_MAC_LEGACY_F32 : VOP2Inst <"v_mac_legacy_f32", VOP_F32_F32_F32>;
-defm V_LSHR_B32 : VOP2Inst <"v_lshr_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, srl>;
-defm V_ASHR_I32 : VOP2Inst <"v_ashr_i32", VOP_PAT_GEN<VOP_I32_I32_I32>, sra>;
-defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, shl>;
+defm V_LSHR_B32 : VOP2Inst <"v_lshr_b32", VOP_I32_I32_I32>;
+defm V_ASHR_I32 : VOP2Inst <"v_ashr_i32", VOP_I32_I32_I32>;
+defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_I32_I32_I32>;
} // End isCommutable = 1
-
-} // End let SubtargetPredicate = SICI, Predicates = [isSICI]
+} // End SubtargetPredicate = isGFX6GFX7GFX10
class DivergentBinOp<SDPatternOperator Op, VOP_Pseudo Inst> :
GCNPat<
@@ -508,29 +554,29 @@ class DivergentBinOp<SDPatternOperator Op, VOP_Pseudo Inst> :
)
>;
-let AddedComplexity = 1 in {
- def : DivergentBinOp<srl, V_LSHRREV_B32_e64>;
- def : DivergentBinOp<sra, V_ASHRREV_I32_e64>;
- def : DivergentBinOp<shl, V_LSHLREV_B32_e64>;
-}
+class DivergentClampingBinOp<SDPatternOperator Op, VOP_Pseudo Inst> :
+ GCNPat<
+ (getDivergentFrag<Op>.ret Inst.Pfl.Src0VT:$src0, Inst.Pfl.Src1VT:$src1),
+ !if(!cast<Commutable_REV>(Inst).IsOrig,
+ (Inst $src0, $src1, 0),
+ (Inst $src1, $src0, 0)
+ )
+ >;
+
+def : DivergentBinOp<srl, V_LSHRREV_B32_e64>;
+def : DivergentBinOp<sra, V_ASHRREV_I32_e64>;
+def : DivergentBinOp<shl, V_LSHLREV_B32_e64>;
let SubtargetPredicate = HasAddNoCarryInsts in {
- def : DivergentBinOp<add, V_ADD_U32_e32>;
- def : DivergentBinOp<sub, V_SUB_U32_e32>;
- def : DivergentBinOp<sub, V_SUBREV_U32_e32>;
+ def : DivergentClampingBinOp<add, V_ADD_U32_e64>;
+ def : DivergentClampingBinOp<sub, V_SUB_U32_e64>;
}
+let SubtargetPredicate = isGFX6GFX7GFX8GFX9, Predicates = [isGFX6GFX7GFX8GFX9] in {
+def : DivergentClampingBinOp<add, V_ADD_I32_e64>;
+def : DivergentClampingBinOp<sub, V_SUB_I32_e64>;
+}
-def : DivergentBinOp<add, V_ADD_I32_e32>;
-
-def : DivergentBinOp<add, V_ADD_I32_e64>;
-def : DivergentBinOp<sub, V_SUB_I32_e32>;
-
-def : DivergentBinOp<sub, V_SUBREV_I32_e32>;
-
-def : DivergentBinOp<srl, V_LSHRREV_B32_e32>;
-def : DivergentBinOp<sra, V_ASHRREV_I32_e32>;
-def : DivergentBinOp<shl, V_LSHLREV_B32_e32>;
def : DivergentBinOp<adde, V_ADDC_U32_e32>;
def : DivergentBinOp<sube, V_SUBB_U32_e32>;
@@ -604,56 +650,133 @@ defm V_FMAC_F32 : VOP2Inst <"v_fmac_f32", VOP_MAC_F32>;
} // End SubtargetPredicate = HasDLInsts
-// Note: 16-bit instructions produce a 0 result in the high 16-bits.
-multiclass Arithmetic_i16_Pats <SDPatternOperator op, Instruction inst> {
+let Constraints = "$vdst = $src2",
+ DisableEncoding="$src2",
+ isConvertibleToThreeAddress = 1,
+ isCommutable = 1 in {
+ let SubtargetPredicate = HasDot5Insts in
+ defm V_DOT2C_F32_F16 : VOP2Inst_e32<"v_dot2c_f32_f16", VOP_DOT_ACC_F32_V2F16>;
+ let SubtargetPredicate = HasDot6Insts in
+ defm V_DOT4C_I32_I8 : VOP2Inst_e32<"v_dot4c_i32_i8", VOP_DOT_ACC_I32_I32>;
+
+ let SubtargetPredicate = HasDot4Insts in
+ defm V_DOT2C_I32_I16 : VOP2Inst_e32<"v_dot2c_i32_i16", VOP_DOT_ACC_I32_I32>;
+ let SubtargetPredicate = HasDot3Insts in
+ defm V_DOT8C_I32_I4 : VOP2Inst_e32<"v_dot8c_i32_i4", VOP_DOT_ACC_I32_I32>;
+}
+
+let AddedComplexity = 30 in {
+ def : GCNPat<
+ (f32 (AMDGPUfdot2 v2f16:$src0, v2f16:$src1, f32:$src2, (i1 DSTCLAMP.NONE))),
+ (f32 (V_DOT2C_F32_F16_e32 $src0, $src1, $src2))
+ > {
+ let SubtargetPredicate = HasDot5Insts;
+ }
+ def : GCNPat<
+ (i32 (int_amdgcn_sdot4 i32:$src0, i32:$src1, i32:$src2, (i1 DSTCLAMP.NONE))),
+ (i32 (V_DOT4C_I32_I8_e32 $src0, $src1, $src2))
+ > {
+ let SubtargetPredicate = HasDot6Insts;
+ }
+ def : GCNPat<
+ (i32 (int_amdgcn_sdot2 v2i16:$src0, v2i16:$src1, i32:$src2, (i1 DSTCLAMP.NONE))),
+ (i32 (V_DOT2C_I32_I16_e32 $src0, $src1, $src2))
+ > {
+ let SubtargetPredicate = HasDot4Insts;
+ }
+ def : GCNPat<
+ (i32 (int_amdgcn_sdot8 i32:$src0, i32:$src1, i32:$src2, (i1 DSTCLAMP.NONE))),
+ (i32 (V_DOT8C_I32_I4_e32 $src0, $src1, $src2))
+ > {
+ let SubtargetPredicate = HasDot3Insts;
+ }
+} // End AddedComplexity = 30
+
+let SubtargetPredicate = isGFX10Plus in {
+
+def V_FMAMK_F32 : VOP2_Pseudo<"v_fmamk_f32", VOP_MADMK_F32, [], "">;
+let FPDPRounding = 1 in
+def V_FMAMK_F16 : VOP2_Pseudo <"v_fmamk_f16", VOP_MADMK_F16, [], "">;
+
+let isCommutable = 1 in {
+def V_FMAAK_F32 : VOP2_Pseudo<"v_fmaak_f32", VOP_MADAK_F32, [], "">;
+let FPDPRounding = 1 in
+def V_FMAAK_F16 : VOP2_Pseudo <"v_fmaak_f16", VOP_MADAK_F16, [], "">;
+} // End isCommutable = 1
+
+let Constraints = "$vdst = $src2",
+ DisableEncoding="$src2",
+ isConvertibleToThreeAddress = 1,
+ isCommutable = 1 in {
+defm V_FMAC_F16 : VOP2Inst <"v_fmac_f16", VOP_MAC_F16>;
+}
+
+} // End SubtargetPredicate = isGFX10Plus
+
+let SubtargetPredicate = HasPkFmacF16Inst in {
+defm V_PK_FMAC_F16 : VOP2Inst<"v_pk_fmac_f16", VOP_V2F16_V2F16_V2F16>;
+} // End SubtargetPredicate = HasPkFmacF16Inst
+
+// Note: 16-bit instructions produce a 0 result in the high 16-bits
+// on GFX8 and GFX9 and preserve high 16 bits on GFX10+
+def ClearHI16 : OutPatFrag<(ops node:$op),
+ (V_AND_B32_e64 $op, (V_MOV_B32_e32 (i32 0xffff)))>;
+
+multiclass Arithmetic_i16_Pats <SDPatternOperator op, Instruction inst,
+ bit PreservesHI16 = 0> {
def : GCNPat<
(op i16:$src0, i16:$src1),
- (inst $src0, $src1)
+ !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src0, $src1)), (inst $src0, $src1))
>;
def : GCNPat<
(i32 (zext (op i16:$src0, i16:$src1))),
- (inst $src0, $src1)
+ !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src0, $src1)), (inst $src0, $src1))
>;
def : GCNPat<
(i64 (zext (op i16:$src0, i16:$src1))),
(REG_SEQUENCE VReg_64,
- (inst $src0, $src1), sub0,
+ !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src0, $src1)), (inst $src0, $src1)),
+ sub0,
(V_MOV_B32_e32 (i32 0)), sub1)
>;
-
}
-multiclass Bits_OpsRev_i16_Pats <SDPatternOperator op, Instruction inst> {
+multiclass Bits_OpsRev_i16_Pats <SDPatternOperator op, Instruction inst,
+ bit PreservesHI16 = 0> {
def : GCNPat<
(op i16:$src0, i16:$src1),
- (inst $src1, $src0)
+ !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src1, $src0)), (inst $src1, $src0))
>;
def : GCNPat<
(i32 (zext (op i16:$src0, i16:$src1))),
- (inst $src1, $src0)
+ !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src1, $src0)), (inst $src1, $src0))
>;
def : GCNPat<
(i64 (zext (op i16:$src0, i16:$src1))),
(REG_SEQUENCE VReg_64,
- (inst $src1, $src0), sub0,
+ !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src1, $src0)), (inst $src1, $src0)),
+ sub0,
(V_MOV_B32_e32 (i32 0)), sub1)
>;
}
class ZExt_i16_i1_Pat <SDNode ext> : GCNPat <
(i16 (ext i1:$src)),
- (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src)
+ (V_CNDMASK_B32_e64 (i32 0/*src0mod*/), (i32 0/*src0*/),
+ (i32 0/*src1mod*/), (i32 1/*src1*/),
+ $src)
>;
let Predicates = [Has16BitInsts] in {
+let Predicates = [Has16BitInsts, isGFX7GFX8GFX9] in {
defm : Arithmetic_i16_Pats<add, V_ADD_U16_e64>;
defm : Arithmetic_i16_Pats<mul, V_MUL_LO_U16_e64>;
defm : Arithmetic_i16_Pats<sub, V_SUB_U16_e64>;
@@ -661,6 +784,17 @@ defm : Arithmetic_i16_Pats<smin, V_MIN_I16_e64>;
defm : Arithmetic_i16_Pats<smax, V_MAX_I16_e64>;
defm : Arithmetic_i16_Pats<umin, V_MIN_U16_e64>;
defm : Arithmetic_i16_Pats<umax, V_MAX_U16_e64>;
+}
+
+let Predicates = [Has16BitInsts, isGFX10Plus] in {
+defm : Arithmetic_i16_Pats<add, V_ADD_U16_e64, 1>;
+defm : Arithmetic_i16_Pats<mul, V_MUL_LO_U16_e64, 1>;
+defm : Arithmetic_i16_Pats<sub, V_SUB_U16_e64, 1>;
+defm : Arithmetic_i16_Pats<smin, V_MIN_I16_e64, 1>;
+defm : Arithmetic_i16_Pats<smax, V_MAX_I16_e64, 1>;
+defm : Arithmetic_i16_Pats<umin, V_MIN_U16_e64, 1>;
+defm : Arithmetic_i16_Pats<umax, V_MAX_U16_e64, 1>;
+}
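// Illustrative sketch (editorial note): with PreservesHI16 = 1 the first
// pattern in Arithmetic_i16_Pats instantiates as, for example,
//   (add i16:$src0, i16:$src1) ->
//     (V_AND_B32_e64 (V_ADD_U16_e64 $src0, $src1),
//                    (V_MOV_B32_e32 (i32 0xffff)))
// i.e. ClearHI16 explicitly zeroes the high half that GFX10 would otherwise
// preserve, keeping the 32-bit result consistent with the GFX8/GFX9 forms.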
def : GCNPat <
(and i16:$src0, i16:$src1),
@@ -677,16 +811,25 @@ def : GCNPat <
(V_XOR_B32_e64 $src0, $src1)
>;
+let Predicates = [Has16BitInsts, isGFX7GFX8GFX9] in {
defm : Bits_OpsRev_i16_Pats<shl, V_LSHLREV_B16_e64>;
defm : Bits_OpsRev_i16_Pats<srl, V_LSHRREV_B16_e64>;
defm : Bits_OpsRev_i16_Pats<sra, V_ASHRREV_I16_e64>;
+}
+
+let Predicates = [Has16BitInsts, isGFX10Plus] in {
+defm : Bits_OpsRev_i16_Pats<shl, V_LSHLREV_B16_e64, 1>;
+defm : Bits_OpsRev_i16_Pats<srl, V_LSHRREV_B16_e64, 1>;
+defm : Bits_OpsRev_i16_Pats<sra, V_ASHRREV_I16_e64, 1>;
+}
def : ZExt_i16_i1_Pat<zext>;
def : ZExt_i16_i1_Pat<anyext>;
def : GCNPat <
(i16 (sext i1:$src)),
- (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src)
+ (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
+ /*src1mod*/(i32 0), /*src1*/(i32 -1), $src)
>;
// Undo sub x, c -> add x, -c canonicalization since c is more likely
@@ -697,105 +840,334 @@ def : GCNPat<
(V_SUB_U16_e64 $src0, NegSubInlineConst16:$src1)
>;
-} // End Predicates = [Has16BitInsts]
+} // End Predicates = [Has16BitInsts, isGFX7GFX8GFX9]
+
//===----------------------------------------------------------------------===//
-// SI
+// Target-specific instruction encodings.
//===----------------------------------------------------------------------===//
-let AssemblerPredicates = [isSICI], DecoderNamespace = "SICI" in {
+class VOP2_DPP<bits<6> op, VOP2_Pseudo ps,
+ string opName = ps.OpName, VOPProfile p = ps.Pfl,
+ bit IsDPP16 = 0> :
+ VOP_DPP<opName, p, IsDPP16> {
+ let hasSideEffects = ps.hasSideEffects;
+ let Defs = ps.Defs;
+ let SchedRW = ps.SchedRW;
+ let Uses = ps.Uses;
-multiclass VOP2_Real_si <bits<6> op> {
- def _si :
- VOP2_Real<!cast<VOP2_Pseudo>(NAME), SIEncodingFamily.SI>,
- VOP2e<op{5-0}, !cast<VOP2_Pseudo>(NAME).Pfl>;
+ bits<8> vdst;
+ bits<8> src1;
+ let Inst{8-0} = 0xfa;
+ let Inst{16-9} = !if(p.HasSrc1, src1{7-0}, 0);
+ let Inst{24-17} = !if(p.EmitDst, vdst{7-0}, 0);
+ let Inst{30-25} = op;
+ let Inst{31} = 0x0;
}
-multiclass VOP2_Real_MADK_si <bits<6> op> {
- def _si : VOP2_Real<!cast<VOP2_Pseudo>(NAME), SIEncodingFamily.SI>,
- VOP2_MADKe<op{5-0}, !cast<VOP2_Pseudo>(NAME).Pfl>;
+class VOP2_DPP16<bits<6> op, VOP2_Pseudo ps,
+ string opName = ps.OpName, VOPProfile p = ps.Pfl> :
+ VOP2_DPP<op, ps, opName, p, 1> {
+ let AssemblerPredicate = !if(p.HasExt, HasDPP16, DisableInst);
+ let SubtargetPredicate = HasDPP16;
}
-multiclass VOP2_Real_e32_si <bits<6> op> {
- def _e32_si :
- VOP2_Real<!cast<VOP2_Pseudo>(NAME#"_e32"), SIEncodingFamily.SI>,
- VOP2e<op{5-0}, !cast<VOP2_Pseudo>(NAME#"_e32").Pfl>;
+class VOP2_DPP8<bits<6> op, VOP2_Pseudo ps,
+ string opName = ps.OpName, VOPProfile p = ps.Pfl> :
+ VOP_DPP8<ps.OpName, p> {
+ let hasSideEffects = ps.hasSideEffects;
+ let Defs = ps.Defs;
+ let SchedRW = ps.SchedRW;
+ let Uses = ps.Uses;
+
+ bits<8> vdst;
+ bits<8> src1;
+
+ let Inst{8-0} = fi;
+ let Inst{16-9} = !if(p.HasSrc1, src1{7-0}, 0);
+ let Inst{24-17} = !if(p.EmitDst, vdst{7-0}, 0);
+ let Inst{30-25} = op;
+ let Inst{31} = 0x0;
+
+ let AssemblerPredicate = !if(p.HasExt, HasDPP8, DisableInst);
+ let SubtargetPredicate = HasDPP8;
}
-multiclass VOP2_Real_e32e64_si <bits<6> op> : VOP2_Real_e32_si<op> {
- def _e64_si :
- VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>,
- VOP3e_si <{1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
-}
-
-multiclass VOP2be_Real_e32e64_si <bits<6> op> : VOP2_Real_e32_si<op> {
- def _e64_si :
- VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>,
- VOP3be_si <{1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
-}
-
-} // End AssemblerPredicates = [isSICI], DecoderNamespace = "SICI"
-
-defm V_CNDMASK_B32 : VOP2_Real_e32e64_si <0x0>;
-defm V_ADD_F32 : VOP2_Real_e32e64_si <0x3>;
-defm V_SUB_F32 : VOP2_Real_e32e64_si <0x4>;
-defm V_SUBREV_F32 : VOP2_Real_e32e64_si <0x5>;
-defm V_MUL_LEGACY_F32 : VOP2_Real_e32e64_si <0x7>;
-defm V_MUL_F32 : VOP2_Real_e32e64_si <0x8>;
-defm V_MUL_I32_I24 : VOP2_Real_e32e64_si <0x9>;
-defm V_MUL_HI_I32_I24 : VOP2_Real_e32e64_si <0xa>;
-defm V_MUL_U32_U24 : VOP2_Real_e32e64_si <0xb>;
-defm V_MUL_HI_U32_U24 : VOP2_Real_e32e64_si <0xc>;
-defm V_MIN_F32 : VOP2_Real_e32e64_si <0xf>;
-defm V_MAX_F32 : VOP2_Real_e32e64_si <0x10>;
-defm V_MIN_I32 : VOP2_Real_e32e64_si <0x11>;
-defm V_MAX_I32 : VOP2_Real_e32e64_si <0x12>;
-defm V_MIN_U32 : VOP2_Real_e32e64_si <0x13>;
-defm V_MAX_U32 : VOP2_Real_e32e64_si <0x14>;
-defm V_LSHRREV_B32 : VOP2_Real_e32e64_si <0x16>;
-defm V_ASHRREV_I32 : VOP2_Real_e32e64_si <0x18>;
-defm V_LSHLREV_B32 : VOP2_Real_e32e64_si <0x1a>;
-defm V_AND_B32 : VOP2_Real_e32e64_si <0x1b>;
-defm V_OR_B32 : VOP2_Real_e32e64_si <0x1c>;
-defm V_XOR_B32 : VOP2_Real_e32e64_si <0x1d>;
-defm V_MAC_F32 : VOP2_Real_e32e64_si <0x1f>;
-defm V_MADMK_F32 : VOP2_Real_MADK_si <0x20>;
-defm V_MADAK_F32 : VOP2_Real_MADK_si <0x21>;
-defm V_ADD_I32 : VOP2be_Real_e32e64_si <0x25>;
-defm V_SUB_I32 : VOP2be_Real_e32e64_si <0x26>;
-defm V_SUBREV_I32 : VOP2be_Real_e32e64_si <0x27>;
-defm V_ADDC_U32 : VOP2be_Real_e32e64_si <0x28>;
-defm V_SUBB_U32 : VOP2be_Real_e32e64_si <0x29>;
-defm V_SUBBREV_U32 : VOP2be_Real_e32e64_si <0x2a>;
-
-defm V_READLANE_B32 : VOP2_Real_si <0x01>;
-
-let InOperandList = (ins SSrc_b32:$src0, SCSrc_b32:$src1, VSrc_b32:$vdst_in) in {
-defm V_WRITELANE_B32 : VOP2_Real_si <0x02>;
-}
-
-defm V_MAC_LEGACY_F32 : VOP2_Real_e32e64_si <0x6>;
-defm V_MIN_LEGACY_F32 : VOP2_Real_e32e64_si <0xd>;
-defm V_MAX_LEGACY_F32 : VOP2_Real_e32e64_si <0xe>;
-defm V_LSHR_B32 : VOP2_Real_e32e64_si <0x15>;
-defm V_ASHR_I32 : VOP2_Real_e32e64_si <0x17>;
-defm V_LSHL_B32 : VOP2_Real_e32e64_si <0x19>;
-
-defm V_BFM_B32 : VOP2_Real_e32e64_si <0x1e>;
-defm V_BCNT_U32_B32 : VOP2_Real_e32e64_si <0x22>;
-defm V_MBCNT_LO_U32_B32 : VOP2_Real_e32e64_si <0x23>;
-defm V_MBCNT_HI_U32_B32 : VOP2_Real_e32e64_si <0x24>;
-defm V_LDEXP_F32 : VOP2_Real_e32e64_si <0x2b>;
-defm V_CVT_PKACCUM_U8_F32 : VOP2_Real_e32e64_si <0x2c>;
-defm V_CVT_PKNORM_I16_F32 : VOP2_Real_e32e64_si <0x2d>;
-defm V_CVT_PKNORM_U16_F32 : VOP2_Real_e32e64_si <0x2e>;
-defm V_CVT_PKRTZ_F16_F32 : VOP2_Real_e32e64_si <0x2f>;
-defm V_CVT_PK_U16_U32 : VOP2_Real_e32e64_si <0x30>;
-defm V_CVT_PK_I16_I32 : VOP2_Real_e32e64_si <0x31>;
+//===----------------------------------------------------------------------===//
+// GFX10.
+//===----------------------------------------------------------------------===//
+
+let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
+ //===------------------------------- VOP2 -------------------------------===//
+ multiclass VOP2Only_Real_MADK_gfx10<bits<6> op> {
+ def _gfx10 :
+ VOP2_Real<!cast<VOP2_Pseudo>(NAME), SIEncodingFamily.GFX10>,
+ VOP2_MADKe<op{5-0}, !cast<VOP2_Pseudo>(NAME).Pfl>;
+ }
+ multiclass VOP2Only_Real_MADK_gfx10_with_name<bits<6> op, string opName,
+ string asmName> {
+ def _gfx10 :
+ VOP2_Real<!cast<VOP2_Pseudo>(opName), SIEncodingFamily.GFX10>,
+ VOP2_MADKe<op{5-0}, !cast<VOP2_Pseudo>(opName).Pfl> {
+ VOP2_Pseudo ps = !cast<VOP2_Pseudo>(opName);
+ let AsmString = asmName # ps.AsmOperands;
+ }
+ }
+ multiclass VOP2_Real_e32_gfx10<bits<6> op> {
+ def _e32_gfx10 :
+ VOP2_Real<!cast<VOP2_Pseudo>(NAME#"_e32"), SIEncodingFamily.GFX10>,
+ VOP2e<op{5-0}, !cast<VOP2_Pseudo>(NAME#"_e32").Pfl>;
+ }
+ multiclass VOP2_Real_e64_gfx10<bits<6> op> {
+ def _e64_gfx10 :
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX10>,
+ VOP3e_gfx10<{0, 1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
+ }
+ multiclass VOP2_Real_sdwa_gfx10<bits<6> op> {
+ def _sdwa_gfx10 :
+ VOP_SDWA10_Real<!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>,
+ VOP2_SDWA9Ae<op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl> {
+ let DecoderNamespace = "SDWA10";
+ }
+ }
+ multiclass VOP2_Real_dpp_gfx10<bits<6> op> {
+ def _dpp_gfx10 : VOP2_DPP16<op, !cast<VOP2_Pseudo>(NAME#"_e32")> {
+ let DecoderNamespace = "SDWA10";
+ }
+ }
+ multiclass VOP2_Real_dpp8_gfx10<bits<6> op> {
+ def _dpp8_gfx10 : VOP2_DPP8<op, !cast<VOP2_Pseudo>(NAME#"_e32")> {
+ let DecoderNamespace = "DPP8";
+ }
+ }
+
+ //===------------------------- VOP2 (with name) -------------------------===//
+ multiclass VOP2_Real_e32_gfx10_with_name<bits<6> op, string opName,
+ string asmName> {
+ def _e32_gfx10 :
+ VOP2_Real<!cast<VOP2_Pseudo>(opName#"_e32"), SIEncodingFamily.GFX10>,
+ VOP2e<op{5-0}, !cast<VOP2_Pseudo>(opName#"_e32").Pfl> {
+ VOP2_Pseudo ps = !cast<VOP2_Pseudo>(opName#"_e32");
+ let AsmString = asmName # ps.AsmOperands;
+ }
+ }
+ multiclass VOP2_Real_e64_gfx10_with_name<bits<6> op, string opName,
+ string asmName> {
+ def _e64_gfx10 :
+ VOP3_Real<!cast<VOP3_Pseudo>(opName#"_e64"), SIEncodingFamily.GFX10>,
+ VOP3e_gfx10<{0, 1, 0, 0, op{5-0}},
+ !cast<VOP3_Pseudo>(opName#"_e64").Pfl> {
+ VOP3_Pseudo ps = !cast<VOP3_Pseudo>(opName#"_e64");
+ let AsmString = asmName # ps.AsmOperands;
+ }
+ }
+ let DecoderNamespace = "SDWA10" in {
+ multiclass VOP2_Real_sdwa_gfx10_with_name<bits<6> op, string opName,
+ string asmName> {
+ def _sdwa_gfx10 :
+ VOP_SDWA10_Real<!cast<VOP2_SDWA_Pseudo>(opName#"_sdwa")>,
+ VOP2_SDWA9Ae<op{5-0}, !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa").Pfl> {
+ VOP2_SDWA_Pseudo ps = !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa");
+ let AsmString = asmName # ps.AsmOperands;
+ }
+ }
+ multiclass VOP2_Real_dpp_gfx10_with_name<bits<6> op, string opName,
+ string asmName> {
+ def _dpp_gfx10 : VOP2_DPP16<op, !cast<VOP2_Pseudo>(opName#"_e32")> {
+ VOP2_Pseudo ps = !cast<VOP2_Pseudo>(opName#"_e32");
+ let AsmString = asmName # ps.Pfl.AsmDPP16;
+ }
+ }
+ multiclass VOP2_Real_dpp8_gfx10_with_name<bits<6> op, string opName,
+ string asmName> {
+ def _dpp8_gfx10 : VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32")> {
+ VOP2_Pseudo ps = !cast<VOP2_Pseudo>(opName#"_e32");
+ let AsmString = asmName # ps.Pfl.AsmDPP8;
+ let DecoderNamespace = "DPP8";
+ }
+ }
+ } // End DecoderNamespace = "SDWA10"
+
+ //===------------------------------ VOP2be ------------------------------===//
+ multiclass VOP2be_Real_gfx10<bits<6> op, string opName, string asmName> {
+ def _e32_gfx10 :
+ VOP2_Real<!cast<VOP2_Pseudo>(opName#"_e32"), SIEncodingFamily.GFX10>,
+ VOP2e<op{5-0}, !cast<VOP2_Pseudo>(opName#"_e32").Pfl> {
+ VOP2_Pseudo Ps = !cast<VOP2_Pseudo>(opName#"_e32");
+ let AsmString = asmName # !subst(", vcc", "", Ps.AsmOperands);
+ }
+ def _e64_gfx10 :
+ VOP3_Real<!cast<VOP3_Pseudo>(opName#"_e64"), SIEncodingFamily.GFX10>,
+ VOP3be_gfx10<{0, 1, 0, 0, op{5-0}},
+ !cast<VOP3_Pseudo>(opName#"_e64").Pfl> {
+ VOP3_Pseudo Ps = !cast<VOP3_Pseudo>(opName#"_e64");
+ let AsmString = asmName # Ps.AsmOperands;
+ }
+ def _sdwa_gfx10 :
+ VOP_SDWA10_Real<!cast<VOP2_SDWA_Pseudo>(opName#"_sdwa")>,
+ VOP2_SDWA9Ae<op{5-0}, !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa").Pfl> {
+ VOP2_SDWA_Pseudo Ps = !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa");
+ let AsmString = asmName # !subst(", vcc", "", Ps.AsmOperands);
+ let DecoderNamespace = "SDWA10";
+ }
+ def _dpp_gfx10 :
+ VOP2_DPP16<op, !cast<VOP2_Pseudo>(opName#"_e32"), asmName> {
+ string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16;
+ let AsmString = asmName # !subst(", vcc", "", AsmDPP);
+ let DecoderNamespace = "SDWA10";
+ }
+ def _dpp8_gfx10 :
+ VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32"), asmName> {
+ string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8;
+ let AsmString = asmName # !subst(", vcc", "", AsmDPP8);
+ let DecoderNamespace = "DPP8";
+ }
+
+ let WaveSizePredicate = isWave32 in {
+ def _sdwa_w32_gfx10 :
+ Base_VOP_SDWA10_Real<!cast<VOP2_SDWA_Pseudo>(opName#"_sdwa")>,
+ VOP2_SDWA9Ae<op{5-0}, !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa").Pfl> {
+ VOP2_SDWA_Pseudo Ps = !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa");
+ let AsmString = asmName # !subst("vcc", "vcc_lo", Ps.AsmOperands);
+ let isAsmParserOnly = 1;
+ let DecoderNamespace = "SDWA10";
+ }
+ def _dpp_w32_gfx10 :
+ VOP2_DPP16<op, !cast<VOP2_Pseudo>(opName#"_e32"), asmName> {
+ string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16;
+ let AsmString = asmName # !subst("vcc", "vcc_lo", AsmDPP);
+ let isAsmParserOnly = 1;
+ }
+ def _dpp8_w32_gfx10 :
+ VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32"), asmName> {
+ string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8;
+ let AsmString = asmName # !subst("vcc", "vcc_lo", AsmDPP8);
+ let isAsmParserOnly = 1;
+ }
+ } // End WaveSizePredicate = isWave32
+
+ let WaveSizePredicate = isWave64 in {
+ def _sdwa_w64_gfx10 :
+ Base_VOP_SDWA10_Real<!cast<VOP2_SDWA_Pseudo>(opName#"_sdwa")>,
+ VOP2_SDWA9Ae<op{5-0}, !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa").Pfl> {
+ VOP2_SDWA_Pseudo Ps = !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa");
+ let AsmString = asmName # Ps.AsmOperands;
+ let isAsmParserOnly = 1;
+ let DecoderNamespace = "SDWA10";
+ }
+ def _dpp_w64_gfx10 :
+ VOP2_DPP16<op, !cast<VOP2_Pseudo>(opName#"_e32"), asmName> {
+ string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16;
+ let AsmString = asmName # AsmDPP;
+ let isAsmParserOnly = 1;
+ }
+ def _dpp8_w64_gfx10 :
+ VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32"), asmName> {
+ string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8;
+ let AsmString = asmName # AsmDPP8;
+ let isAsmParserOnly = 1;
+ }
+ } // End WaveSizePredicate = isWave64
+ }
+ //===----------------------------- VOP3Only -----------------------------===//
+ multiclass VOP3Only_Real_gfx10<bits<10> op> {
+ def _e64_gfx10 :
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX10>,
+ VOP3e_gfx10<op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
+ }
+
+ //===---------------------------- VOP3beOnly ----------------------------===//
+ multiclass VOP3beOnly_Real_gfx10<bits<10> op, string opName, string asmName> {
+ def _e64_gfx10 :
+ VOP3_Real<!cast<VOP3_Pseudo>(opName#"_e64"), SIEncodingFamily.GFX10>,
+ VOP3be_gfx10<op, !cast<VOP3_Pseudo>(opName#"_e64").Pfl> {
+ VOP3_Pseudo Ps = !cast<VOP3_Pseudo>(opName#"_e64");
+ let AsmString = asmName # Ps.AsmOperands;
+ }
+ }
+} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10"
+
+multiclass Base_VOP2_Real_gfx10<bits<6> op> :
+ VOP2_Real_e32_gfx10<op>, VOP2_Real_e64_gfx10<op>;
+
+multiclass VOP2_Real_gfx10<bits<6> op> :
+ VOP2_Real_e32_gfx10<op>, VOP2_Real_e64_gfx10<op>,
+ VOP2_Real_sdwa_gfx10<op>, VOP2_Real_dpp_gfx10<op>, VOP2_Real_dpp8_gfx10<op>;
+
+multiclass VOP2_Real_gfx10_with_name<bits<6> op, string opName,
+ string asmName> :
+ VOP2_Real_e32_gfx10_with_name<op, opName, asmName>,
+ VOP2_Real_e64_gfx10_with_name<op, opName, asmName>,
+ VOP2_Real_sdwa_gfx10_with_name<op, opName, asmName>,
+ VOP2_Real_dpp_gfx10_with_name<op, opName, asmName>,
+ VOP2_Real_dpp8_gfx10_with_name<op, opName, asmName>;
+
+defm V_CNDMASK_B32 : Base_VOP2_Real_gfx10<0x001>;
+defm V_XNOR_B32 : VOP2_Real_gfx10<0x01e>;
+defm V_FMAC_F32 : VOP2_Real_gfx10<0x02b>;
+defm V_FMAMK_F32 : VOP2Only_Real_MADK_gfx10<0x02c>;
+defm V_FMAAK_F32 : VOP2Only_Real_MADK_gfx10<0x02d>;
+defm V_ADD_F16 : VOP2_Real_gfx10<0x032>;
+defm V_SUB_F16 : VOP2_Real_gfx10<0x033>;
+defm V_SUBREV_F16 : VOP2_Real_gfx10<0x034>;
+defm V_MUL_F16 : VOP2_Real_gfx10<0x035>;
+defm V_FMAC_F16 : VOP2_Real_gfx10<0x036>;
+defm V_FMAMK_F16 : VOP2Only_Real_MADK_gfx10<0x037>;
+defm V_FMAAK_F16 : VOP2Only_Real_MADK_gfx10<0x038>;
+defm V_MAX_F16 : VOP2_Real_gfx10<0x039>;
+defm V_MIN_F16 : VOP2_Real_gfx10<0x03a>;
+defm V_LDEXP_F16 : VOP2_Real_gfx10<0x03b>;
+defm V_PK_FMAC_F16 : VOP2_Real_e32_gfx10<0x03c>;
+
+// VOP2 no carry-in, carry-out.
+defm V_ADD_NC_U32 :
+ VOP2_Real_gfx10_with_name<0x025, "V_ADD_U32", "v_add_nc_u32">;
+defm V_SUB_NC_U32 :
+ VOP2_Real_gfx10_with_name<0x026, "V_SUB_U32", "v_sub_nc_u32">;
+defm V_SUBREV_NC_U32 :
+ VOP2_Real_gfx10_with_name<0x027, "V_SUBREV_U32", "v_subrev_nc_u32">;
+
+// VOP2 carry-in, carry-out.
+defm V_ADD_CO_CI_U32 :
+ VOP2be_Real_gfx10<0x028, "V_ADDC_U32", "v_add_co_ci_u32">;
+defm V_SUB_CO_CI_U32 :
+ VOP2be_Real_gfx10<0x029, "V_SUBB_U32", "v_sub_co_ci_u32">;
+defm V_SUBREV_CO_CI_U32 :
+ VOP2be_Real_gfx10<0x02a, "V_SUBBREV_U32", "v_subrev_co_ci_u32">;
+
+// VOP3 only.
+defm V_BFM_B32 : VOP3Only_Real_gfx10<0x363>;
+defm V_BCNT_U32_B32 : VOP3Only_Real_gfx10<0x364>;
+defm V_MBCNT_LO_U32_B32 : VOP3Only_Real_gfx10<0x365>;
+defm V_MBCNT_HI_U32_B32 : VOP3Only_Real_gfx10<0x366>;
+defm V_LDEXP_F32 : VOP3Only_Real_gfx10<0x362>;
+defm V_CVT_PKNORM_I16_F32 : VOP3Only_Real_gfx10<0x368>;
+defm V_CVT_PKNORM_U16_F32 : VOP3Only_Real_gfx10<0x369>;
+defm V_CVT_PK_U16_U32 : VOP3Only_Real_gfx10<0x36a>;
+defm V_CVT_PK_I16_I32 : VOP3Only_Real_gfx10<0x36b>;
+
+// VOP3 carry-in, carry-out.
+defm V_ADD_CO_U32 :
+ VOP3beOnly_Real_gfx10<0x30f, "V_ADD_I32", "v_add_co_u32">;
+defm V_SUB_CO_U32 :
+ VOP3beOnly_Real_gfx10<0x310, "V_SUB_I32", "v_sub_co_u32">;
+defm V_SUBREV_CO_U32 :
+ VOP3beOnly_Real_gfx10<0x319, "V_SUBREV_I32", "v_subrev_co_u32">;
+
+let SubtargetPredicate = isGFX10Plus in {
+ defm : VOP2eInstAliases<V_CNDMASK_B32_e32, V_CNDMASK_B32_e32_gfx10>;
+
+ defm : VOP2bInstAliases<
+ V_ADDC_U32_e32, V_ADD_CO_CI_U32_e32_gfx10, "v_add_co_ci_u32">;
+ defm : VOP2bInstAliases<
+ V_SUBB_U32_e32, V_SUB_CO_CI_U32_e32_gfx10, "v_sub_co_ci_u32">;
+ defm : VOP2bInstAliases<
+ V_SUBBREV_U32_e32, V_SUBREV_CO_CI_U32_e32_gfx10, "v_subrev_co_ci_u32">;
+} // End SubtargetPredicate = isGFX10Plus
//===----------------------------------------------------------------------===//
-// VI
+// GFX6, GFX7, GFX10.
//===----------------------------------------------------------------------===//
class VOP2_DPPe <bits<6> op, VOP2_DPP_Pseudo ps, VOPProfile P = ps.Pfl> :
@@ -809,7 +1181,111 @@ class VOP2_DPPe <bits<6> op, VOP2_DPP_Pseudo ps, VOPProfile P = ps.Pfl> :
let Inst{31} = 0x0; //encoding
}
-let AssemblerPredicates = [isVI], DecoderNamespace = "VI" in {
+let AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" in {
+ multiclass VOP2Only_Real_gfx6_gfx7<bits<6> op> {
+ def _gfx6_gfx7 :
+ VOP2_Real<!cast<VOP2_Pseudo>(NAME), SIEncodingFamily.SI>,
+ VOP2e<op{5-0}, !cast<VOP2_Pseudo>(NAME).Pfl>;
+ }
+ multiclass VOP2Only_Real_MADK_gfx6_gfx7<bits<6> op> {
+ def _gfx6_gfx7 :
+ VOP2_Real<!cast<VOP2_Pseudo>(NAME), SIEncodingFamily.SI>,
+ VOP2_MADKe<op{5-0}, !cast<VOP2_Pseudo>(NAME).Pfl>;
+ }
+ multiclass VOP2_Real_e32_gfx6_gfx7<bits<6> op> {
+ def _e32_gfx6_gfx7 :
+ VOP2_Real<!cast<VOP2_Pseudo>(NAME#"_e32"), SIEncodingFamily.SI>,
+ VOP2e<op{5-0}, !cast<VOP2_Pseudo>(NAME#"_e32").Pfl>;
+ }
+ multiclass VOP2_Real_e64_gfx6_gfx7<bits<6> op> {
+ def _e64_gfx6_gfx7 :
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>,
+ VOP3e_gfx6_gfx7<{1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
+ }
+ multiclass VOP2be_Real_e64_gfx6_gfx7<bits<6> op> {
+ def _e64_gfx6_gfx7 :
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>,
+ VOP3be_gfx6_gfx7<{1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
+ }
+} // End AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7"
+
+multiclass VOP2Only_Real_MADK_gfx6_gfx7_gfx10<bits<6> op> :
+ VOP2Only_Real_MADK_gfx6_gfx7<op>, VOP2Only_Real_MADK_gfx10<op>;
+
+multiclass VOP2_Real_gfx6_gfx7<bits<6> op> :
+ VOP2_Real_e32_gfx6_gfx7<op>, VOP2_Real_e64_gfx6_gfx7<op>;
+
+multiclass VOP2_Real_gfx6_gfx7_gfx10<bits<6> op> :
+ VOP2_Real_gfx6_gfx7<op>, VOP2_Real_gfx10<op>;
+
+multiclass VOP2be_Real_gfx6_gfx7<bits<6> op> :
+ VOP2_Real_e32_gfx6_gfx7<op>, VOP2be_Real_e64_gfx6_gfx7<op>;
+
+defm V_CNDMASK_B32 : VOP2_Real_gfx6_gfx7<0x000>;
+defm V_MIN_LEGACY_F32 : VOP2_Real_gfx6_gfx7<0x00d>;
+defm V_MAX_LEGACY_F32 : VOP2_Real_gfx6_gfx7<0x00e>;
+defm V_LSHR_B32 : VOP2_Real_gfx6_gfx7<0x015>;
+defm V_ASHR_I32 : VOP2_Real_gfx6_gfx7<0x017>;
+defm V_LSHL_B32 : VOP2_Real_gfx6_gfx7<0x019>;
+defm V_BFM_B32 : VOP2_Real_gfx6_gfx7<0x01e>;
+defm V_BCNT_U32_B32 : VOP2_Real_gfx6_gfx7<0x022>;
+defm V_MBCNT_LO_U32_B32 : VOP2_Real_gfx6_gfx7<0x023>;
+defm V_MBCNT_HI_U32_B32 : VOP2_Real_gfx6_gfx7<0x024>;
+defm V_LDEXP_F32 : VOP2_Real_gfx6_gfx7<0x02b>;
+defm V_CVT_PKACCUM_U8_F32 : VOP2_Real_gfx6_gfx7<0x02c>;
+defm V_CVT_PKNORM_I16_F32 : VOP2_Real_gfx6_gfx7<0x02d>;
+defm V_CVT_PKNORM_U16_F32 : VOP2_Real_gfx6_gfx7<0x02e>;
+defm V_CVT_PK_U16_U32 : VOP2_Real_gfx6_gfx7<0x030>;
+defm V_CVT_PK_I16_I32 : VOP2_Real_gfx6_gfx7<0x031>;
+defm V_ADD_I32 : VOP2be_Real_gfx6_gfx7<0x025>;
+defm V_SUB_I32 : VOP2be_Real_gfx6_gfx7<0x026>;
+defm V_SUBREV_I32 : VOP2be_Real_gfx6_gfx7<0x027>;
+defm V_ADDC_U32 : VOP2be_Real_gfx6_gfx7<0x028>;
+defm V_SUBB_U32 : VOP2be_Real_gfx6_gfx7<0x029>;
+defm V_SUBBREV_U32 : VOP2be_Real_gfx6_gfx7<0x02a>;
+
+defm V_READLANE_B32 : VOP2Only_Real_gfx6_gfx7<0x001>;
+
+let InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VSrc_b32:$vdst_in) in {
+ defm V_WRITELANE_B32 : VOP2Only_Real_gfx6_gfx7<0x002>;
+} // End InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VSrc_b32:$vdst_in)
+
+let SubtargetPredicate = isGFX6GFX7 in {
+ defm : VOP2eInstAliases<V_CNDMASK_B32_e32, V_CNDMASK_B32_e32_gfx6_gfx7>;
+} // End SubtargetPredicate = isGFX6GFX7
+
+defm V_ADD_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x003>;
+defm V_SUB_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x004>;
+defm V_SUBREV_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x005>;
+defm V_MAC_LEGACY_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x006>;
+defm V_MUL_LEGACY_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x007>;
+defm V_MUL_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x008>;
+defm V_MUL_I32_I24 : VOP2_Real_gfx6_gfx7_gfx10<0x009>;
+defm V_MUL_HI_I32_I24 : VOP2_Real_gfx6_gfx7_gfx10<0x00a>;
+defm V_MUL_U32_U24 : VOP2_Real_gfx6_gfx7_gfx10<0x00b>;
+defm V_MUL_HI_U32_U24 : VOP2_Real_gfx6_gfx7_gfx10<0x00c>;
+defm V_MIN_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x00f>;
+defm V_MAX_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x010>;
+defm V_MIN_I32 : VOP2_Real_gfx6_gfx7_gfx10<0x011>;
+defm V_MAX_I32 : VOP2_Real_gfx6_gfx7_gfx10<0x012>;
+defm V_MIN_U32 : VOP2_Real_gfx6_gfx7_gfx10<0x013>;
+defm V_MAX_U32 : VOP2_Real_gfx6_gfx7_gfx10<0x014>;
+defm V_LSHRREV_B32 : VOP2_Real_gfx6_gfx7_gfx10<0x016>;
+defm V_ASHRREV_I32 : VOP2_Real_gfx6_gfx7_gfx10<0x018>;
+defm V_LSHLREV_B32 : VOP2_Real_gfx6_gfx7_gfx10<0x01a>;
+defm V_AND_B32 : VOP2_Real_gfx6_gfx7_gfx10<0x01b>;
+defm V_OR_B32 : VOP2_Real_gfx6_gfx7_gfx10<0x01c>;
+defm V_XOR_B32 : VOP2_Real_gfx6_gfx7_gfx10<0x01d>;
+defm V_MAC_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x01f>;
+defm V_CVT_PKRTZ_F16_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x02f>;
+defm V_MADMK_F32 : VOP2Only_Real_MADK_gfx6_gfx7_gfx10<0x020>;
+defm V_MADAK_F32 : VOP2Only_Real_MADK_gfx6_gfx7_gfx10<0x021>;
+
+//===----------------------------------------------------------------------===//
+// GFX8, GFX9 (VI).
+//===----------------------------------------------------------------------===//
+
+let AssemblerPredicates = [isGFX8GFX9], DecoderNamespace = "GFX8" in {
multiclass VOP2_Real_MADK_vi <bits<6> op> {
def _vi : VOP2_Real<!cast<VOP2_Pseudo>(NAME), SIEncodingFamily.VI>,
@@ -843,7 +1319,7 @@ multiclass Base_VOP2_Real_e32e64_vi <bits<6> op> :
VOP2_Real_e32_vi<op>,
VOP2_Real_e64_vi<{0, 1, 0, 0, op{5-0}}>;
-} // End AssemblerPredicates = [isVI], DecoderNamespace = "VI"
+} // End AssemblerPredicates = [isGFX8GFX9], DecoderNamespace = "GFX8"
multiclass VOP2_SDWA_Real <bits<6> op> {
def _sdwa_vi :
@@ -857,7 +1333,7 @@ multiclass VOP2_SDWA9_Real <bits<6> op> {
VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
}
-let AssemblerPredicates = [isVIOnly] in {
+let AssemblerPredicates = [isGFX8Only] in {
multiclass VOP2be_Real_e32e64_vi_only <bits<6> op, string OpName, string AsmName> {
def _e32_vi :
@@ -865,14 +1341,14 @@ multiclass VOP2be_Real_e32e64_vi_only <bits<6> op, string OpName, string AsmName
VOP2e<op{5-0}, !cast<VOP2_Pseudo>(OpName#"_e32").Pfl> {
VOP2_Pseudo ps = !cast<VOP2_Pseudo>(OpName#"_e32");
let AsmString = AsmName # ps.AsmOperands;
- let DecoderNamespace = "VI";
+ let DecoderNamespace = "GFX8";
}
def _e64_vi :
VOP3_Real<!cast<VOP3_Pseudo>(OpName#"_e64"), SIEncodingFamily.VI>,
VOP3be_vi <{0, 1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(OpName#"_e64").Pfl> {
VOP3_Pseudo ps = !cast<VOP3_Pseudo>(OpName#"_e64");
let AsmString = AsmName # ps.AsmOperands;
- let DecoderNamespace = "VI";
+ let DecoderNamespace = "GFX8";
}
def _sdwa_vi :
VOP_SDWA_Real <!cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa")>,
@@ -890,7 +1366,7 @@ multiclass VOP2be_Real_e32e64_vi_only <bits<6> op, string OpName, string AsmName
}
}
-let AssemblerPredicates = [isGFX9] in {
+let AssemblerPredicates = [isGFX9Only] in {
multiclass VOP2be_Real_e32e64_gfx9 <bits<6> op, string OpName, string AsmName> {
def _e32_gfx9 :
@@ -946,7 +1422,7 @@ multiclass VOP2_Real_e32e64_gfx9 <bits<6> op> {
}
}
-} // AssemblerPredicates = [isGFX9]
+} // AssemblerPredicates = [isGFX9Only]
multiclass VOP2_Real_e32e64_vi <bits<6> op> :
Base_VOP2_Real_e32e64_vi<op>, VOP2_SDWA_Real<op>, VOP2_SDWA9_Real<op> {
@@ -1035,7 +1511,7 @@ defm V_MIN_U16 : VOP2_Real_e32e64_vi <0x31>;
defm V_MIN_I16 : VOP2_Real_e32e64_vi <0x32>;
defm V_LDEXP_F16 : VOP2_Real_e32e64_vi <0x33>;
-let SubtargetPredicate = isVI in {
+let SubtargetPredicate = isGFX8GFX9 in {
// Aliases to simplify matching of floating-point instructions that
// are VOP2 on SI and VOP3 on VI.
@@ -1055,7 +1531,20 @@ def : SI2_VI3Alias <"v_cvt_pknorm_i16_f32", V_CVT_PKNORM_I16_F32_e64_vi>;
def : SI2_VI3Alias <"v_cvt_pknorm_u16_f32", V_CVT_PKNORM_U16_F32_e64_vi>;
def : SI2_VI3Alias <"v_cvt_pkrtz_f16_f32", V_CVT_PKRTZ_F16_F32_e64_vi>;
-} // End SubtargetPredicate = isVI
+defm : VOP2eInstAliases<V_CNDMASK_B32_e32, V_CNDMASK_B32_e32_vi>;
+
+} // End SubtargetPredicate = isGFX8GFX9
+
+let SubtargetPredicate = isGFX9Only in {
+
+defm : VOP2bInstAliases<V_ADD_I32_e32, V_ADD_CO_U32_e32_gfx9, "v_add_co_u32">;
+defm : VOP2bInstAliases<V_ADDC_U32_e32, V_ADDC_CO_U32_e32_gfx9, "v_addc_co_u32">;
+defm : VOP2bInstAliases<V_SUB_I32_e32, V_SUB_CO_U32_e32_gfx9, "v_sub_co_u32">;
+defm : VOP2bInstAliases<V_SUBB_U32_e32, V_SUBB_CO_U32_e32_gfx9, "v_subb_co_u32">;
+defm : VOP2bInstAliases<V_SUBREV_I32_e32, V_SUBREV_CO_U32_e32_gfx9, "v_subrev_co_u32">;
+defm : VOP2bInstAliases<V_SUBBREV_U32_e32, V_SUBBREV_CO_U32_e32_gfx9, "v_subbrev_co_u32">;
+
+} // End SubtargetPredicate = isGFX9Only
let SubtargetPredicate = HasDLInsts in {
@@ -1063,3 +1552,35 @@ defm V_FMAC_F32 : VOP2_Real_e32e64_vi <0x3b>;
defm V_XNOR_B32 : VOP2_Real_e32e64_vi <0x3d>;
} // End SubtargetPredicate = HasDLInsts
+
+multiclass VOP2_Real_DOT_ACC_gfx9<bits<6> op> : VOP2_Real_e32_vi<op> {
+ def _dpp : VOP2_DPP<op, !cast<VOP2_Pseudo>(NAME#"_e32")>;
+}
+
+multiclass VOP2_Real_DOT_ACC_gfx10<bits<6> op> :
+ VOP2_Real_e32_gfx10<op>,
+ VOP2_Real_dpp_gfx10<op>,
+ VOP2_Real_dpp8_gfx10<op>;
+
+let SubtargetPredicate = HasDot5Insts in {
+ defm V_DOT2C_F32_F16 : VOP2_Real_DOT_ACC_gfx9<0x37>;
+ // NB: Opcode conflicts with V_DOT8C_I32_I4
+ // This opcode exists in gfx 10.1* only
+ defm V_DOT2C_F32_F16 : VOP2_Real_DOT_ACC_gfx10<0x02>;
+}
+
+let SubtargetPredicate = HasDot6Insts in {
+ defm V_DOT4C_I32_I8 : VOP2_Real_DOT_ACC_gfx9<0x39>;
+ defm V_DOT4C_I32_I8 : VOP2_Real_DOT_ACC_gfx10<0x0d>;
+}
+
+let SubtargetPredicate = HasDot4Insts in {
+ defm V_DOT2C_I32_I16 : VOP2_Real_DOT_ACC_gfx9<0x38>;
+}
+let SubtargetPredicate = HasDot3Insts in {
+ defm V_DOT8C_I32_I4 : VOP2_Real_DOT_ACC_gfx9<0x3a>;
+}
+
+let SubtargetPredicate = HasPkFmacF16Inst in {
+defm V_PK_FMAC_F16 : VOP2_Real_e32_vi<0x3c>;
+} // End SubtargetPredicate = HasPkFmacF16Inst
diff --git a/lib/Target/AMDGPU/VOP3Instructions.td b/lib/Target/AMDGPU/VOP3Instructions.td
index 4b8c1f208a0e..21dbef9240e1 100644
--- a/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/lib/Target/AMDGPU/VOP3Instructions.td
@@ -1,9 +1,8 @@
//===-- VOP3Instructions.td - Vector Instruction Definitions --------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -111,6 +110,11 @@ class getVOP3ClampPat<VOPProfile P, SDPatternOperator node> {
ret1));
}
+class getVOP3MAIPat<VOPProfile P, SDPatternOperator node> {
+ list<dag> ret = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2,
+ imm:$cbsz, imm:$abid, imm:$blgp))];
+}
+
class VOP3Inst<string OpName, VOPProfile P, SDPatternOperator node = null_frag, bit VOP3Only = 0> :
VOP3_Pseudo<OpName, P,
!if(P.HasOpSel,
@@ -121,7 +125,9 @@ class VOP3Inst<string OpName, VOPProfile P, SDPatternOperator node = null_frag,
getVOP3ModPat<P, node>.ret,
!if(P.HasIntClamp,
getVOP3ClampPat<P, node>.ret,
- getVOP3Pat<P, node>.ret))),
+ !if (P.IsMAI,
+ getVOP3MAIPat<P, node>.ret,
+ getVOP3Pat<P, node>.ret)))),
VOP3Only, 0, P.HasOpSel> {
let IntClamp = P.HasIntClamp;
@@ -144,33 +150,27 @@ def VOP_F64_F64_F64_F64_VCC : VOPProfile<[f64, f64, f64, f64]> {
}
}
-class getVOP3VCC<VOPProfile P, SDPatternOperator node> {
- list<dag> ret =
- [(set P.DstVT:$vdst,
- (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)),
- (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)),
- (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers)),
- (i1 VCC)))];
-}
-
-class VOP3Features<bit Clamp, bit OpSel, bit Packed> {
+class VOP3Features<bit Clamp, bit OpSel, bit Packed, bit MAI> {
bit HasClamp = Clamp;
bit HasOpSel = OpSel;
bit IsPacked = Packed;
+ bit IsMAI = MAI;
}
-def VOP3_REGULAR : VOP3Features<0, 0, 0>;
-def VOP3_CLAMP : VOP3Features<1, 0, 0>;
-def VOP3_OPSEL : VOP3Features<1, 1, 0>;
-def VOP3_PACKED : VOP3Features<1, 1, 1>;
+def VOP3_REGULAR : VOP3Features<0, 0, 0, 0>;
+def VOP3_CLAMP : VOP3Features<1, 0, 0, 0>;
+def VOP3_OPSEL : VOP3Features<1, 1, 0, 0>;
+def VOP3_PACKED : VOP3Features<1, 1, 1, 0>;
+def VOP3_MAI : VOP3Features<0, 0, 0, 1>;
class VOP3_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VOPProfile<P.ArgVT> {
let HasClamp = !if(Features.HasClamp, 1, P.HasClamp);
let HasOpSel = !if(Features.HasOpSel, 1, P.HasOpSel);
+ let IsMAI = !if(Features.IsMAI, 1, P.IsMAI);
let IsPacked = !if(Features.IsPacked, 1, P.IsPacked);
- let HasModifiers = !if(Features.IsPacked, 1, P.HasModifiers);
+ let HasModifiers = !if(Features.IsPacked, !if(Features.IsMAI, 0, 1), P.HasModifiers);
// FIXME: Hack to stop printing _e64
let Outs64 = (outs DstRC.RegClass:$vdst);
@@ -191,8 +191,9 @@ class VOP3_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VOPProf
class VOP3b_Profile<ValueType vt> : VOPProfile<[vt, vt, vt, vt]> {
// v_div_scale_{f32|f64} do not support input modifiers.
let HasModifiers = 0;
+ let HasClamp = 0;
let HasOMod = 0;
- let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst);
+ let Outs64 = (outs DstRC:$vdst, VOPDstS64orS32:$sdst);
let Asm64 = " $vdst, $sdst, $src0, $src1, $src2";
}
@@ -212,7 +213,7 @@ def VOP3b_I64_I1_I32_I32_I64 : VOPProfile<[i64, i32, i32, i64]> {
// FIXME: Hack to stop printing _e64
let DstRC = RegisterOperand<VReg_64>;
- let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst);
+ let Outs64 = (outs DstRC:$vdst, VOPDstS64orS32:$sdst);
let Asm64 = " $vdst, $sdst, $src0, $src1, $src2$clamp";
}
@@ -303,7 +304,7 @@ def V_MAX_F64 : VOP3Inst <"v_max_f64", VOP3_Profile<VOP_F64_F64_F64>, fmaxnum_li
} // End SchedRW = [WriteDoubleAdd]
let SchedRW = [WriteQuarterRate32] in {
-def V_MUL_LO_U32 : VOP3Inst <"v_mul_lo_u32", VOP3_Profile<VOP_I32_I32_I32>>;
+def V_MUL_LO_U32 : VOP3Inst <"v_mul_lo_u32", VOP3_Profile<VOP_I32_I32_I32>, mul>;
def V_MUL_HI_U32 : VOP3Inst <"v_mul_hi_u32", VOP3_Profile<VOP_I32_I32_I32>, mulhu>;
def V_MUL_LO_I32 : VOP3Inst <"v_mul_lo_i32", VOP3_Profile<VOP_I32_I32_I32>>;
def V_MUL_HI_I32 : VOP3Inst <"v_mul_hi_i32", VOP3_Profile<VOP_I32_I32_I32>, mulhs>;
@@ -315,8 +316,7 @@ let Uses = [VCC, EXEC] in {
// if (vcc)
// result *= 2^32
//
-def V_DIV_FMAS_F32 : VOP3_Pseudo <"v_div_fmas_f32", VOP_F32_F32_F32_F32_VCC,
- getVOP3VCC<VOP_F32_F32_F32_F32_VCC, AMDGPUdiv_fmas>.ret> {
+def V_DIV_FMAS_F32 : VOP3_Pseudo <"v_div_fmas_f32", VOP_F32_F32_F32_F32_VCC, []> {
let SchedRW = [WriteFloatFMA];
}
// v_div_fmas_f64:
@@ -324,8 +324,7 @@ def V_DIV_FMAS_F32 : VOP3_Pseudo <"v_div_fmas_f32", VOP_F32_F32_F32_F32_VCC,
// if (vcc)
// result *= 2^64
//
-def V_DIV_FMAS_F64 : VOP3_Pseudo <"v_div_fmas_f64", VOP_F64_F64_F64_F64_VCC,
- getVOP3VCC<VOP_F64_F64_F64_F64_VCC, AMDGPUdiv_fmas>.ret> {
+def V_DIV_FMAS_F64 : VOP3_Pseudo <"v_div_fmas_f64", VOP_F64_F64_F64_F64_VCC, []> {
let SchedRW = [WriteDouble];
let FPDPRounding = 1;
}
@@ -386,22 +385,21 @@ def V_TRIG_PREOP_F64 : VOP3Inst <"v_trig_preop_f64", VOP3_Profile<VOP_F64_F64_I3
}
let SchedRW = [Write64Bit] in {
-// These instructions only exist on SI and CI
-let SubtargetPredicate = isSICI, Predicates = [isSICI] in {
+let SubtargetPredicate = isGFX6GFX7GFX10, Predicates = [isGFX6GFX7GFX10] in {
def V_LSHL_B64 : VOP3Inst <"v_lshl_b64", VOP3_Profile<VOP_PAT_GEN<VOP_I64_I64_I32>>, shl>;
def V_LSHR_B64 : VOP3Inst <"v_lshr_b64", VOP3_Profile<VOP_PAT_GEN<VOP_I64_I64_I32>>, srl>;
def V_ASHR_I64 : VOP3Inst <"v_ashr_i64", VOP3_Profile<VOP_PAT_GEN<VOP_I64_I64_I32>>, sra>;
def V_MULLIT_F32 : VOP3Inst <"v_mullit_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
-} // End SubtargetPredicate = isSICI, Predicates = [isSICI]
+} // End SubtargetPredicate = isGFX6GFX7GFX10, Predicates = [isGFX6GFX7GFX10]
-let SubtargetPredicate = isVI in {
-def V_LSHLREV_B64 : VOP3Inst <"v_lshlrev_b64", VOP3_Profile<VOP_I64_I32_I64>>;
-def V_LSHRREV_B64 : VOP3Inst <"v_lshrrev_b64", VOP3_Profile<VOP_I64_I32_I64>>;
-def V_ASHRREV_I64 : VOP3Inst <"v_ashrrev_i64", VOP3_Profile<VOP_I64_I32_I64>>;
-} // End SubtargetPredicate = isVI
+let SubtargetPredicate = isGFX8Plus in {
+def V_LSHLREV_B64 : VOP3Inst <"v_lshlrev_b64", VOP3_Profile<VOP_I64_I32_I64>, lshl_rev>;
+def V_LSHRREV_B64 : VOP3Inst <"v_lshrrev_b64", VOP3_Profile<VOP_I64_I32_I64>, lshr_rev>;
+def V_ASHRREV_I64 : VOP3Inst <"v_ashrrev_i64", VOP3_Profile<VOP_I64_I32_I64>, ashr_rev>;
+} // End SubtargetPredicate = isGFX8Plus
} // End SchedRW = [Write64Bit]
-let Predicates = [isVI] in {
+let Predicates = [isGFX8Plus] in {
def : GCNPat <
(getDivergentFrag<shl>.ret i64:$x, i32:$y),
(V_LSHLREV_B64 $y, $x)
@@ -417,7 +415,13 @@ def : AMDGPUPat <
}
-let SubtargetPredicate = isCIVI in {
+let SchedRW = [Write32Bit] in {
+let SubtargetPredicate = isGFX8Plus in {
+def V_PERM_B32 : VOP3Inst <"v_perm_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUperm>;
+} // End SubtargetPredicate = isGFX8Plus
+} // End SchedRW = [Write32Bit]
+
+let SubtargetPredicate = isGFX7Plus in {
let Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32] in {
def V_QSAD_PK_U16_U8 : VOP3Inst <"v_qsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64, VOP3_CLAMP>>;
@@ -431,27 +435,27 @@ def V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>;
} // End SchedRW = [WriteDouble, WriteSALU]
} // End isCommutable = 1
-} // End SubtargetPredicate = isCIVI
+} // End SubtargetPredicate = isGFX7Plus
def V_DIV_FIXUP_F16 : VOP3Inst <"v_div_fixup_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUdiv_fixup> {
- let Predicates = [Has16BitInsts, isVIOnly];
+ let Predicates = [Has16BitInsts, isGFX8Only];
let FPDPRounding = 1;
}
def V_DIV_FIXUP_F16_gfx9 : VOP3Inst <"v_div_fixup_f16_gfx9",
VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUdiv_fixup> {
let renamedInGFX9 = 1;
- let Predicates = [Has16BitInsts, isGFX9];
+ let Predicates = [Has16BitInsts, isGFX9Plus];
let FPDPRounding = 1;
}
def V_FMA_F16 : VOP3Inst <"v_fma_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fma> {
- let Predicates = [Has16BitInsts, isVIOnly];
+ let Predicates = [Has16BitInsts, isGFX8Only];
let FPDPRounding = 1;
}
def V_FMA_F16_gfx9 : VOP3Inst <"v_fma_f16_gfx9", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, fma> {
let renamedInGFX9 = 1;
- let Predicates = [Has16BitInsts, isGFX9];
+ let Predicates = [Has16BitInsts, isGFX9Plus];
let FPDPRounding = 1;
}
@@ -463,36 +467,58 @@ def V_MAD_I16 : VOP3Inst <"v_mad_i16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_CL
let FPDPRounding = 1 in {
def V_MAD_F16 : VOP3Inst <"v_mad_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fmad>;
let Uses = [M0, EXEC] in {
-def V_INTERP_P2_F16 : VOP3Interp <"v_interp_p2_f16", VOP3_INTERP16<[f16, f32, i32, f32]>>;
+def V_INTERP_P2_F16 : VOP3Interp <"v_interp_p2_f16", VOP3_INTERP16<[f16, f32, i32, f32]>,
+ [(set f16:$vdst, (AMDGPUinterp_p2_f16 f32:$src0, (i32 imm:$attrchan),
+ (i32 imm:$attr),
+ (i32 imm:$src0_modifiers),
+ (f32 VRegSrc_32:$src2),
+ (i32 imm:$src2_modifiers),
+ (i1 imm:$high),
+ (i1 imm:$clamp)))]>;
} // End Uses = [M0, EXEC]
} // End FPDPRounding = 1
} // End renamedInGFX9 = 1
-let SubtargetPredicate = isGFX9 in {
+let SubtargetPredicate = isGFX9Only in {
def V_MAD_F16_gfx9 : VOP3Inst <"v_mad_f16_gfx9", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>> {
let FPDPRounding = 1;
}
+} // End SubtargetPredicate = isGFX9Only
+
+let SubtargetPredicate = isGFX9Plus in {
def V_MAD_U16_gfx9 : VOP3Inst <"v_mad_u16_gfx9", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>>;
def V_MAD_I16_gfx9 : VOP3Inst <"v_mad_i16_gfx9", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>>;
def V_INTERP_P2_F16_gfx9 : VOP3Interp <"v_interp_p2_f16_gfx9", VOP3_INTERP16<[f16, f32, i32, f32]>>;
-} // End SubtargetPredicate = isGFX9
+} // End SubtargetPredicate = isGFX9Plus
let Uses = [M0, EXEC], FPDPRounding = 1 in {
-def V_INTERP_P1LL_F16 : VOP3Interp <"v_interp_p1ll_f16", VOP3_INTERP16<[f32, f32, i32, untyped]>>;
-def V_INTERP_P1LV_F16 : VOP3Interp <"v_interp_p1lv_f16", VOP3_INTERP16<[f32, f32, i32, f16]>>;
+def V_INTERP_P1LL_F16 : VOP3Interp <"v_interp_p1ll_f16", VOP3_INTERP16<[f32, f32, i32, untyped]>,
+ [(set f32:$vdst, (AMDGPUinterp_p1ll_f16 f32:$src0, (i32 imm:$attrchan),
+ (i32 imm:$attr),
+ (i32 imm:$src0_modifiers),
+ (i1 imm:$high),
+ (i1 imm:$clamp),
+ (i32 imm:$omod)))]>;
+def V_INTERP_P1LV_F16 : VOP3Interp <"v_interp_p1lv_f16", VOP3_INTERP16<[f32, f32, i32, f16]>,
+ [(set f32:$vdst, (AMDGPUinterp_p1lv_f16 f32:$src0, (i32 imm:$attrchan),
+ (i32 imm:$attr),
+ (i32 imm:$src0_modifiers),
+ (f32 VRegSrc_32:$src2),
+ (i32 imm:$src2_modifiers),
+ (i1 imm:$high),
+ (i1 imm:$clamp),
+ (i32 imm:$omod)))]>;
} // End Uses = [M0, EXEC], FPDPRounding = 1
} // End SubtargetPredicate = Has16BitInsts, isCommutable = 1
-let SubtargetPredicate = isVI in {
+let SubtargetPredicate = isGFX8GFX9 in {
def V_INTERP_P1_F32_e64 : VOP3Interp <"v_interp_p1_f32", VOP3_INTERP>;
def V_INTERP_P2_F32_e64 : VOP3Interp <"v_interp_p2_f32", VOP3_INTERP>;
def V_INTERP_MOV_F32_e64 : VOP3Interp <"v_interp_mov_f32", VOP3_INTERP_MOV>;
+} // End SubtargetPredicate = isGFX8GFX9
-def V_PERM_B32 : VOP3Inst <"v_perm_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUperm>;
-} // End SubtargetPredicate = isVI
-
-let Predicates = [Has16BitInsts] in {
+let Predicates = [Has16BitInsts, isGFX6GFX7GFX8GFX9] in {
multiclass Ternary_i16_Pats <SDPatternOperator op1, SDPatternOperator op2,
Instruction inst, SDPatternOperator op3> {
@@ -506,7 +532,23 @@ def : GCNPat <
defm: Ternary_i16_Pats<mul, add, V_MAD_U16, zext>;
defm: Ternary_i16_Pats<mul, add, V_MAD_I16, sext>;
-} // End Predicates = [Has16BitInsts]
+} // End Predicates = [Has16BitInsts, isGFX6GFX7GFX8GFX9]
+
+let Predicates = [Has16BitInsts, isGFX10Plus] in {
+
+multiclass Ternary_i16_Pats_gfx9<SDPatternOperator op1, SDPatternOperator op2,
+ Instruction inst, SDPatternOperator op3> {
+def : GCNPat <
+ (op2 (op1 i16:$src0, i16:$src1), i16:$src2),
+ (inst SRCMODS.NONE, $src0, SRCMODS.NONE, $src1, SRCMODS.NONE, $src2, DSTCLAMP.NONE)
+>;
+
+}
+
+defm: Ternary_i16_Pats_gfx9<mul, add, V_MAD_U16_gfx9, zext>;
+defm: Ternary_i16_Pats_gfx9<mul, add, V_MAD_I16_gfx9, sext>;
+
+} // End Predicates = [Has16BitInsts, isGFX10Plus]
class ThreeOpFrag<SDPatternOperator op1, SDPatternOperator op2> : PatFrag<
(ops node:$x, node:$y, node:$z),
@@ -528,7 +570,9 @@ class ThreeOpFrag<SDPatternOperator op1, SDPatternOperator op2> : PatFrag<
if (!Operands[i]->isDivergent() &&
!isInlineImmediate(Operands[i].getNode())) {
ConstantBusUses++;
- if (ConstantBusUses >= 2)
+ // This uses AMDGPU::V_ADD3_U32, but all three operand instructions
+ // have the same constant bus limit.
+ if (ConstantBusUses > Subtarget->getConstantBusLimit(AMDGPU::V_ADD3_U32))
return false;
}
}
@@ -539,7 +583,7 @@ class ThreeOpFrag<SDPatternOperator op1, SDPatternOperator op2> : PatFrag<
let PredicateCodeUsesOperands = 1;
}
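The PredicateCode above rejects a fold when too many uniform, non-inline operands would compete for the constant bus. A standalone sketch of that rule, assuming a bus limit of 1 on pre-GFX10 targets and 2 on GFX10 (the struct and helper below are illustrative, not LLVM API):

#include <vector>

// Illustrative operand summary; not LLVM's MachineOperand/SDNode.
struct OperandInfo {
  bool IsDivergent;   // lives in a VGPR, does not use the constant bus
  bool IsInlineImm;   // encodable inline constant, does not use the bus
};

bool fitsConstantBus(const std::vector<OperandInfo> &Ops, unsigned BusLimit) {
  unsigned Uses = 0;
  for (const OperandInfo &Op : Ops)
    if (!Op.IsDivergent && !Op.IsInlineImm && ++Uses > BusLimit)
      return false;   // mirrors the early "return false" in the predicate
  return true;
}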
-let SubtargetPredicate = isGFX9 in {
+let SubtargetPredicate = isGFX9Plus in {
def V_PACK_B32_F16 : VOP3Inst <"v_pack_b32_f16", VOP3_Profile<VOP_B32_F16_F16, VOP3_OPSEL>>;
def V_LSHL_ADD_U32 : VOP3Inst <"v_lshl_add_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
def V_ADD_LSHL_U32 : VOP3Inst <"v_add_lshl_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
@@ -589,7 +633,38 @@ def : ThreeOp_i32_Pats<and, or, V_AND_OR_B32>;
def : ThreeOp_i32_Pats<or, or, V_OR3_B32>;
def : ThreeOp_i32_Pats<xor, add, V_XAD_U32>;
-} // End SubtargetPredicate = isGFX9
+} // End SubtargetPredicate = isGFX9Plus
+
+def VOP3_PERMLANE_Profile : VOP3_Profile<VOPProfile <[i32, i32, i32, i32]>, VOP3_OPSEL> {
+ let Src0RC64 = VRegSrc_32;
+ let Src1RC64 = SCSrc_b32;
+ let Src2RC64 = SCSrc_b32;
+ let InsVOP3OpSel = (ins IntOpSelMods:$src0_modifiers, VRegSrc_32:$src0,
+ IntOpSelMods:$src1_modifiers, SCSrc_b32:$src1,
+ IntOpSelMods:$src2_modifiers, SCSrc_b32:$src2,
+ VGPR_32:$vdst_in, op_sel:$op_sel);
+ let HasClamp = 0;
+ let HasOMod = 0;
+}
+
+let SubtargetPredicate = isGFX10Plus in {
+ def V_XOR3_B32 : VOP3Inst <"v_xor3_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+ def : ThreeOp_i32_Pats<xor, xor, V_XOR3_B32>;
+
+ let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
+ def V_PERMLANE16_B32 : VOP3Inst <"v_permlane16_b32", VOP3_PERMLANE_Profile>;
+ def V_PERMLANEX16_B32 : VOP3Inst <"v_permlanex16_b32", VOP3_PERMLANE_Profile>;
+ } // End $vdst = $vdst_in, DisableEncoding $vdst_in
+
+ def : GCNPat<
+ (int_amdgcn_permlane16 i32:$vdst_in, i32:$src0, i32:$src1, i32:$src2, imm:$fi, imm:$bc),
+ (V_PERMLANE16_B32 (as_i1imm $fi), $src0, (as_i1imm $bc), $src1, 0, $src2, $vdst_in)
+ >;
+ def : GCNPat<
+ (int_amdgcn_permlanex16 i32:$vdst_in, i32:$src0, i32:$src1, i32:$src2, imm:$fi, imm:$bc),
+ (V_PERMLANEX16_B32 (as_i1imm $fi), $src0, (as_i1imm $bc), $src1, 0, $src2, $vdst_in)
+ >;
+} // End SubtargetPredicate = isGFX10Plus
//===----------------------------------------------------------------------===//
// Integer Clamp Patterns
@@ -631,111 +706,239 @@ def : IntClampPat<V_MQSAD_PK_U16_U8, int_amdgcn_mqsad_pk_u16_u8>;
def : IntClampPat<V_QSAD_PK_U16_U8, int_amdgcn_qsad_pk_u16_u8>;
def : IntClampPat<V_MQSAD_U32_U8, int_amdgcn_mqsad_u32_u8>;
+
//===----------------------------------------------------------------------===//
-// Target
+// Target-specific instruction encodings.
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
-// SI
+// GFX10.
//===----------------------------------------------------------------------===//
-let AssemblerPredicates = [isSICI], DecoderNamespace = "SICI" in {
-
-multiclass VOP3_Real_si<bits<9> op> {
- def _si : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.SI>,
- VOP3e_si <op, !cast<VOP3_Pseudo>(NAME).Pfl>;
-}
-
-multiclass VOP3be_Real_si<bits<9> op> {
- def _si : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.SI>,
- VOP3be_si <op, !cast<VOP3_Pseudo>(NAME).Pfl>;
-}
-
-} // End AssemblerPredicates = [isSICI], DecoderNamespace = "SICI"
-
-defm V_MAD_LEGACY_F32 : VOP3_Real_si <0x140>;
-defm V_MAD_F32 : VOP3_Real_si <0x141>;
-defm V_MAD_I32_I24 : VOP3_Real_si <0x142>;
-defm V_MAD_U32_U24 : VOP3_Real_si <0x143>;
-defm V_CUBEID_F32 : VOP3_Real_si <0x144>;
-defm V_CUBESC_F32 : VOP3_Real_si <0x145>;
-defm V_CUBETC_F32 : VOP3_Real_si <0x146>;
-defm V_CUBEMA_F32 : VOP3_Real_si <0x147>;
-defm V_BFE_U32 : VOP3_Real_si <0x148>;
-defm V_BFE_I32 : VOP3_Real_si <0x149>;
-defm V_BFI_B32 : VOP3_Real_si <0x14a>;
-defm V_FMA_F32 : VOP3_Real_si <0x14b>;
-defm V_FMA_F64 : VOP3_Real_si <0x14c>;
-defm V_LERP_U8 : VOP3_Real_si <0x14d>;
-defm V_ALIGNBIT_B32 : VOP3_Real_si <0x14e>;
-defm V_ALIGNBYTE_B32 : VOP3_Real_si <0x14f>;
-defm V_MULLIT_F32 : VOP3_Real_si <0x150>;
-defm V_MIN3_F32 : VOP3_Real_si <0x151>;
-defm V_MIN3_I32 : VOP3_Real_si <0x152>;
-defm V_MIN3_U32 : VOP3_Real_si <0x153>;
-defm V_MAX3_F32 : VOP3_Real_si <0x154>;
-defm V_MAX3_I32 : VOP3_Real_si <0x155>;
-defm V_MAX3_U32 : VOP3_Real_si <0x156>;
-defm V_MED3_F32 : VOP3_Real_si <0x157>;
-defm V_MED3_I32 : VOP3_Real_si <0x158>;
-defm V_MED3_U32 : VOP3_Real_si <0x159>;
-defm V_SAD_U8 : VOP3_Real_si <0x15a>;
-defm V_SAD_HI_U8 : VOP3_Real_si <0x15b>;
-defm V_SAD_U16 : VOP3_Real_si <0x15c>;
-defm V_SAD_U32 : VOP3_Real_si <0x15d>;
-defm V_CVT_PK_U8_F32 : VOP3_Real_si <0x15e>;
-defm V_DIV_FIXUP_F32 : VOP3_Real_si <0x15f>;
-defm V_DIV_FIXUP_F64 : VOP3_Real_si <0x160>;
-defm V_LSHL_B64 : VOP3_Real_si <0x161>;
-defm V_LSHR_B64 : VOP3_Real_si <0x162>;
-defm V_ASHR_I64 : VOP3_Real_si <0x163>;
-defm V_ADD_F64 : VOP3_Real_si <0x164>;
-defm V_MUL_F64 : VOP3_Real_si <0x165>;
-defm V_MIN_F64 : VOP3_Real_si <0x166>;
-defm V_MAX_F64 : VOP3_Real_si <0x167>;
-defm V_LDEXP_F64 : VOP3_Real_si <0x168>;
-defm V_MUL_LO_U32 : VOP3_Real_si <0x169>;
-defm V_MUL_HI_U32 : VOP3_Real_si <0x16a>;
-defm V_MUL_LO_I32 : VOP3_Real_si <0x16b>;
-defm V_MUL_HI_I32 : VOP3_Real_si <0x16c>;
-defm V_DIV_SCALE_F32 : VOP3be_Real_si <0x16d>;
-defm V_DIV_SCALE_F64 : VOP3be_Real_si <0x16e>;
-defm V_DIV_FMAS_F32 : VOP3_Real_si <0x16f>;
-defm V_DIV_FMAS_F64 : VOP3_Real_si <0x170>;
-defm V_MSAD_U8 : VOP3_Real_si <0x171>;
-defm V_MQSAD_PK_U16_U8 : VOP3_Real_si <0x173>;
-defm V_TRIG_PREOP_F64 : VOP3_Real_si <0x174>;
+let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
+ multiclass VOP3_Real_gfx10<bits<10> op> {
+ def _gfx10 :
+ VOP3_Real<!cast<VOP_Pseudo>(NAME), SIEncodingFamily.GFX10>,
+ VOP3e_gfx10<op, !cast<VOP_Pseudo>(NAME).Pfl>;
+ }
+ multiclass VOP3_Real_gfx10_with_name<bits<10> op, string opName,
+ string asmName> {
+ def _gfx10 :
+ VOP3_Real<!cast<VOP3_Pseudo>(opName), SIEncodingFamily.GFX10>,
+ VOP3e_gfx10<op, !cast<VOP3_Pseudo>(opName).Pfl> {
+ VOP3_Pseudo ps = !cast<VOP3_Pseudo>(opName);
+ let AsmString = asmName # ps.AsmOperands;
+ }
+ }
+ multiclass VOP3be_Real_gfx10<bits<10> op> {
+ def _gfx10 :
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.GFX10>,
+ VOP3be_gfx10<op, !cast<VOP3_Pseudo>(NAME).Pfl>;
+ }
+ multiclass VOP3Interp_Real_gfx10<bits<10> op> {
+ def _gfx10 :
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.GFX10>,
+ VOP3Interp_gfx10<op, !cast<VOP3_Pseudo>(NAME).Pfl>;
+ }
+ multiclass VOP3OpSel_Real_gfx10<bits<10> op> {
+ def _gfx10 :
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.GFX10>,
+ VOP3OpSel_gfx10<op, !cast<VOP3_Pseudo>(NAME).Pfl>;
+ }
+ multiclass VOP3OpSel_Real_gfx10_with_name<bits<10> op, string opName,
+ string asmName> {
+ def _gfx10 :
+ VOP3_Real<!cast<VOP3_Pseudo>(opName), SIEncodingFamily.GFX10>,
+ VOP3OpSel_gfx10<op, !cast<VOP3_Pseudo>(opName).Pfl> {
+ VOP3_Pseudo ps = !cast<VOP3_Pseudo>(opName);
+ let AsmString = asmName # ps.AsmOperands;
+ }
+ }
+} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10"
+
+defm V_READLANE_B32 : VOP3_Real_gfx10<0x360>;
+
+let InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VSrc_b32:$vdst_in) in {
+ defm V_WRITELANE_B32 : VOP3_Real_gfx10<0x361>;
+} // End InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VSrc_b32:$vdst_in)
+
+defm V_XOR3_B32 : VOP3_Real_gfx10<0x178>;
+defm V_LSHLREV_B64 : VOP3_Real_gfx10<0x2ff>;
+defm V_LSHRREV_B64 : VOP3_Real_gfx10<0x300>;
+defm V_ASHRREV_I64 : VOP3_Real_gfx10<0x301>;
+defm V_PERM_B32 : VOP3_Real_gfx10<0x344>;
+defm V_XAD_U32 : VOP3_Real_gfx10<0x345>;
+defm V_LSHL_ADD_U32 : VOP3_Real_gfx10<0x346>;
+defm V_ADD_LSHL_U32 : VOP3_Real_gfx10<0x347>;
+defm V_ADD3_U32 : VOP3_Real_gfx10<0x36d>;
+defm V_LSHL_OR_B32 : VOP3_Real_gfx10<0x36f>;
+defm V_AND_OR_B32 : VOP3_Real_gfx10<0x371>;
+defm V_OR3_B32 : VOP3_Real_gfx10<0x372>;
+
+// TODO-GFX10: add MC tests for v_add/sub_nc_i16
+defm V_ADD_NC_I16 :
+ VOP3OpSel_Real_gfx10_with_name<0x30d, "V_ADD_I16", "v_add_nc_i16">;
+defm V_SUB_NC_I16 :
+ VOP3OpSel_Real_gfx10_with_name<0x30e, "V_SUB_I16", "v_sub_nc_i16">;
+defm V_SUB_NC_I32 :
+ VOP3_Real_gfx10_with_name<0x376, "V_SUB_I32_gfx9", "v_sub_nc_i32">;
+defm V_ADD_NC_I32 :
+ VOP3_Real_gfx10_with_name<0x37f, "V_ADD_I32_gfx9", "v_add_nc_i32">;
+
+defm V_INTERP_P1LL_F16 : VOP3Interp_Real_gfx10<0x342>;
+defm V_INTERP_P1LV_F16 : VOP3Interp_Real_gfx10<0x343>;
+defm V_INTERP_P2_F16 : VOP3Interp_Real_gfx10<0x35a>;
+
+defm V_PACK_B32_F16 : VOP3OpSel_Real_gfx10<0x311>;
+defm V_CVT_PKNORM_I16_F16 : VOP3OpSel_Real_gfx10<0x312>;
+defm V_CVT_PKNORM_U16_F16 : VOP3OpSel_Real_gfx10<0x313>;
+
+defm V_MIN3_F16 : VOP3OpSel_Real_gfx10<0x351>;
+defm V_MIN3_I16 : VOP3OpSel_Real_gfx10<0x352>;
+defm V_MIN3_U16 : VOP3OpSel_Real_gfx10<0x353>;
+defm V_MAX3_F16 : VOP3OpSel_Real_gfx10<0x354>;
+defm V_MAX3_I16 : VOP3OpSel_Real_gfx10<0x355>;
+defm V_MAX3_U16 : VOP3OpSel_Real_gfx10<0x356>;
+defm V_MED3_F16 : VOP3OpSel_Real_gfx10<0x357>;
+defm V_MED3_I16 : VOP3OpSel_Real_gfx10<0x358>;
+defm V_MED3_U16 : VOP3OpSel_Real_gfx10<0x359>;
+defm V_MAD_U32_U16 : VOP3OpSel_Real_gfx10<0x373>;
+defm V_MAD_I32_I16 : VOP3OpSel_Real_gfx10<0x375>;
+
+defm V_MAD_U16 :
+ VOP3OpSel_Real_gfx10_with_name<0x340, "V_MAD_U16_gfx9", "v_mad_u16">;
+defm V_FMA_F16 :
+ VOP3OpSel_Real_gfx10_with_name<0x34b, "V_FMA_F16_gfx9", "v_fma_f16">;
+defm V_MAD_I16 :
+ VOP3OpSel_Real_gfx10_with_name<0x35e, "V_MAD_I16_gfx9", "v_mad_i16">;
+defm V_DIV_FIXUP_F16 :
+ VOP3OpSel_Real_gfx10_with_name<0x35f, "V_DIV_FIXUP_F16_gfx9", "v_div_fixup_f16">;
+
+// FIXME-GFX10-OPSEL: Need to add "selective" opsel support to some of these
+// (they do not support SDWA or DPP).
+defm V_ADD_NC_U16 : VOP3_Real_gfx10_with_name<0x303, "V_ADD_U16_e64", "v_add_nc_u16">;
+defm V_SUB_NC_U16 : VOP3_Real_gfx10_with_name<0x304, "V_SUB_U16_e64", "v_sub_nc_u16">;
+defm V_MUL_LO_U16 : VOP3_Real_gfx10_with_name<0x305, "V_MUL_LO_U16_e64", "v_mul_lo_u16">;
+defm V_LSHRREV_B16 : VOP3_Real_gfx10_with_name<0x307, "V_LSHRREV_B16_e64", "v_lshrrev_b16">;
+defm V_ASHRREV_I16 : VOP3_Real_gfx10_with_name<0x308, "V_ASHRREV_I16_e64", "v_ashrrev_i16">;
+defm V_MAX_U16 : VOP3_Real_gfx10_with_name<0x309, "V_MAX_U16_e64", "v_max_u16">;
+defm V_MAX_I16 : VOP3_Real_gfx10_with_name<0x30a, "V_MAX_I16_e64", "v_max_i16">;
+defm V_MIN_U16 : VOP3_Real_gfx10_with_name<0x30b, "V_MIN_U16_e64", "v_min_u16">;
+defm V_MIN_I16 : VOP3_Real_gfx10_with_name<0x30c, "V_MIN_I16_e64", "v_min_i16">;
+defm V_LSHLREV_B16 : VOP3_Real_gfx10_with_name<0x314, "V_LSHLREV_B16_e64", "v_lshlrev_b16">;
+defm V_PERMLANE16_B32 : VOP3OpSel_Real_gfx10<0x377>;
+defm V_PERMLANEX16_B32 : VOP3OpSel_Real_gfx10<0x378>;
//===----------------------------------------------------------------------===//
-// CI
+// GFX7, GFX10.
//===----------------------------------------------------------------------===//
-multiclass VOP3_Real_ci<bits<9> op> {
- def _ci : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.SI>,
- VOP3e_si <op, !cast<VOP3_Pseudo>(NAME).Pfl> {
- let AssemblerPredicates = [isCIOnly];
- let DecoderNamespace = "CI";
+let AssemblerPredicate = isGFX7Only, DecoderNamespace = "GFX7" in {
+ multiclass VOP3_Real_gfx7<bits<10> op> {
+ def _gfx7 :
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.SI>,
+ VOP3e_gfx6_gfx7<op{8-0}, !cast<VOP3_Pseudo>(NAME).Pfl>;
}
-}
-
-multiclass VOP3be_Real_ci<bits<9> op> {
- def _ci : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.SI>,
- VOP3be_si <op, !cast<VOP3_Pseudo>(NAME).Pfl> {
- let AssemblerPredicates = [isCIOnly];
- let DecoderNamespace = "CI";
+ multiclass VOP3be_Real_gfx7<bits<10> op> {
+ def _gfx7 :
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.SI>,
+ VOP3be_gfx6_gfx7<op{8-0}, !cast<VOP3_Pseudo>(NAME).Pfl>;
}
-}
+} // End AssemblerPredicate = isGFX7Only, DecoderNamespace = "GFX7"
+
+multiclass VOP3_Real_gfx7_gfx10<bits<10> op> :
+ VOP3_Real_gfx7<op>, VOP3_Real_gfx10<op>;
+
+multiclass VOP3be_Real_gfx7_gfx10<bits<10> op> :
+ VOP3be_Real_gfx7<op>, VOP3be_Real_gfx10<op>;
+
+defm V_QSAD_PK_U16_U8 : VOP3_Real_gfx7_gfx10<0x172>;
+defm V_MQSAD_U32_U8 : VOP3_Real_gfx7_gfx10<0x175>;
+defm V_MAD_U64_U32 : VOP3be_Real_gfx7_gfx10<0x176>;
+defm V_MAD_I64_I32 : VOP3be_Real_gfx7_gfx10<0x177>;
-defm V_QSAD_PK_U16_U8 : VOP3_Real_ci <0x172>;
-defm V_MQSAD_U32_U8 : VOP3_Real_ci <0x175>;
-defm V_MAD_U64_U32 : VOP3be_Real_ci <0x176>;
-defm V_MAD_I64_I32 : VOP3be_Real_ci <0x177>;
+//===----------------------------------------------------------------------===//
+// GFX6, GFX7, GFX10.
+//===----------------------------------------------------------------------===//
+
+let AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" in {
+ multiclass VOP3_Real_gfx6_gfx7<bits<10> op> {
+ def _gfx6_gfx7 :
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.SI>,
+ VOP3e_gfx6_gfx7<op{8-0}, !cast<VOP3_Pseudo>(NAME).Pfl>;
+ }
+ multiclass VOP3be_Real_gfx6_gfx7<bits<10> op> {
+ def _gfx6_gfx7 :
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.SI>,
+ VOP3be_gfx6_gfx7<op{8-0}, !cast<VOP3_Pseudo>(NAME).Pfl>;
+ }
+} // End AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7"
+
+multiclass VOP3_Real_gfx6_gfx7_gfx10<bits<10> op> :
+ VOP3_Real_gfx6_gfx7<op>, VOP3_Real_gfx10<op>;
+
+multiclass VOP3be_Real_gfx6_gfx7_gfx10<bits<10> op> :
+ VOP3be_Real_gfx6_gfx7<op>, VOP3be_Real_gfx10<op>;
+
+defm V_LSHL_B64 : VOP3_Real_gfx6_gfx7<0x161>;
+defm V_LSHR_B64 : VOP3_Real_gfx6_gfx7<0x162>;
+defm V_ASHR_I64 : VOP3_Real_gfx6_gfx7<0x163>;
+
+defm V_MAD_LEGACY_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x140>;
+defm V_MAD_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x141>;
+defm V_MAD_I32_I24 : VOP3_Real_gfx6_gfx7_gfx10<0x142>;
+defm V_MAD_U32_U24 : VOP3_Real_gfx6_gfx7_gfx10<0x143>;
+defm V_CUBEID_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x144>;
+defm V_CUBESC_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x145>;
+defm V_CUBETC_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x146>;
+defm V_CUBEMA_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x147>;
+defm V_BFE_U32 : VOP3_Real_gfx6_gfx7_gfx10<0x148>;
+defm V_BFE_I32 : VOP3_Real_gfx6_gfx7_gfx10<0x149>;
+defm V_BFI_B32 : VOP3_Real_gfx6_gfx7_gfx10<0x14a>;
+defm V_FMA_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x14b>;
+defm V_FMA_F64 : VOP3_Real_gfx6_gfx7_gfx10<0x14c>;
+defm V_LERP_U8 : VOP3_Real_gfx6_gfx7_gfx10<0x14d>;
+defm V_ALIGNBIT_B32 : VOP3_Real_gfx6_gfx7_gfx10<0x14e>;
+defm V_ALIGNBYTE_B32 : VOP3_Real_gfx6_gfx7_gfx10<0x14f>;
+defm V_MULLIT_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x150>;
+defm V_MIN3_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x151>;
+defm V_MIN3_I32 : VOP3_Real_gfx6_gfx7_gfx10<0x152>;
+defm V_MIN3_U32 : VOP3_Real_gfx6_gfx7_gfx10<0x153>;
+defm V_MAX3_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x154>;
+defm V_MAX3_I32 : VOP3_Real_gfx6_gfx7_gfx10<0x155>;
+defm V_MAX3_U32 : VOP3_Real_gfx6_gfx7_gfx10<0x156>;
+defm V_MED3_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x157>;
+defm V_MED3_I32 : VOP3_Real_gfx6_gfx7_gfx10<0x158>;
+defm V_MED3_U32 : VOP3_Real_gfx6_gfx7_gfx10<0x159>;
+defm V_SAD_U8 : VOP3_Real_gfx6_gfx7_gfx10<0x15a>;
+defm V_SAD_HI_U8 : VOP3_Real_gfx6_gfx7_gfx10<0x15b>;
+defm V_SAD_U16 : VOP3_Real_gfx6_gfx7_gfx10<0x15c>;
+defm V_SAD_U32 : VOP3_Real_gfx6_gfx7_gfx10<0x15d>;
+defm V_CVT_PK_U8_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x15e>;
+defm V_DIV_FIXUP_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x15f>;
+defm V_DIV_FIXUP_F64 : VOP3_Real_gfx6_gfx7_gfx10<0x160>;
+defm V_ADD_F64 : VOP3_Real_gfx6_gfx7_gfx10<0x164>;
+defm V_MUL_F64 : VOP3_Real_gfx6_gfx7_gfx10<0x165>;
+defm V_MIN_F64 : VOP3_Real_gfx6_gfx7_gfx10<0x166>;
+defm V_MAX_F64 : VOP3_Real_gfx6_gfx7_gfx10<0x167>;
+defm V_LDEXP_F64 : VOP3_Real_gfx6_gfx7_gfx10<0x168>;
+defm V_MUL_LO_U32 : VOP3_Real_gfx6_gfx7_gfx10<0x169>;
+defm V_MUL_HI_U32 : VOP3_Real_gfx6_gfx7_gfx10<0x16a>;
+defm V_MUL_LO_I32 : VOP3_Real_gfx6_gfx7_gfx10<0x16b>;
+defm V_MUL_HI_I32 : VOP3_Real_gfx6_gfx7_gfx10<0x16c>;
+defm V_DIV_FMAS_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x16f>;
+defm V_DIV_FMAS_F64 : VOP3_Real_gfx6_gfx7_gfx10<0x170>;
+defm V_MSAD_U8 : VOP3_Real_gfx6_gfx7_gfx10<0x171>;
+defm V_MQSAD_PK_U16_U8 : VOP3_Real_gfx6_gfx7_gfx10<0x173>;
+defm V_TRIG_PREOP_F64 : VOP3_Real_gfx6_gfx7_gfx10<0x174>;
+defm V_DIV_SCALE_F32 : VOP3be_Real_gfx6_gfx7_gfx10<0x16d>;
+defm V_DIV_SCALE_F64 : VOP3be_Real_gfx6_gfx7_gfx10<0x16e>;
//===----------------------------------------------------------------------===//
-// VI
+// GFX8, GFX9 (VI).
//===----------------------------------------------------------------------===//
-let AssemblerPredicates = [isVI], DecoderNamespace = "VI" in {
+let AssemblerPredicates = [isGFX8GFX9], DecoderNamespace = "GFX8" in {
multiclass VOP3_Real_vi<bits<10> op> {
def _vi : VOP3_Real<!cast<VOP_Pseudo>(NAME), SIEncodingFamily.VI>,
@@ -757,9 +960,9 @@ multiclass VOP3Interp_Real_vi<bits<10> op> {
VOP3Interp_vi <op, !cast<VOP_Pseudo>(NAME).Pfl>;
}
-} // End AssemblerPredicates = [isVI], DecoderNamespace = "VI"
+} // End AssemblerPredicates = [isGFX8GFX9], DecoderNamespace = "GFX8"
-let AssemblerPredicates = [isVIOnly], DecoderNamespace = "VI" in {
+let AssemblerPredicates = [isGFX8Only], DecoderNamespace = "GFX8" in {
multiclass VOP3_F16_Real_vi<bits<10> op> {
def _vi : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.VI>,
@@ -771,9 +974,9 @@ multiclass VOP3Interp_F16_Real_vi<bits<10> op> {
VOP3Interp_vi <op, !cast<VOP3_Pseudo>(NAME).Pfl>;
}
-} // End AssemblerPredicates = [isVIOnly], DecoderNamespace = "VI"
+} // End AssemblerPredicates = [isGFX8Only], DecoderNamespace = "GFX8"
-let AssemblerPredicates = [isGFX9], DecoderNamespace = "GFX9" in {
+let AssemblerPredicates = [isGFX9Only], DecoderNamespace = "GFX9" in {
multiclass VOP3_F16_Real_gfx9<bits<10> op, string OpName, string AsmName> {
def _gfx9 : VOP3_Real<!cast<VOP3_Pseudo>(OpName), SIEncodingFamily.GFX9>,
@@ -807,7 +1010,7 @@ multiclass VOP3_Real_gfx9<bits<10> op, string AsmName> {
}
}
-} // End AssemblerPredicates = [isGFX9], DecoderNamespace = "GFX9"
+} // End AssemblerPredicates = [isGFX9Only], DecoderNamespace = "GFX9"
defm V_MAD_U64_U32 : VOP3be_Real_vi <0x1E8>;
defm V_MAD_I64_I32 : VOP3be_Real_vi <0x1E9>;
diff --git a/lib/Target/AMDGPU/VOP3PInstructions.td b/lib/Target/AMDGPU/VOP3PInstructions.td
index 91b45583c848..55ee5f6577cf 100644
--- a/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -1,9 +1,8 @@
//===-- VOP3PInstructions.td - Vector Instruction Definitions -------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -70,6 +69,16 @@ def V_PK_LSHLREV_B16 : VOP3PInst<"v_pk_lshlrev_b16", VOP3_Profile<VOP_V2I16_V2I1
def V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, ashr_rev>;
def V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, lshr_rev>;
+
+// Undo sub x, c -> add x, -c canonicalization since c is more likely
+// an inline immediate than -c.
+// The constant will be emitted as a mov, and folded later.
+// TODO: We could directly encode the immediate now
+def : GCNPat<
+ (add (v2i16 (VOP3PMods0 v2i16:$src0, i32:$src0_modifiers, i1:$clamp)), NegSubInlineConstV216:$src1),
+ (V_PK_SUB_U16 $src0_modifiers, $src0, SRCMODS.OP_SEL_1, NegSubInlineConstV216:$src1, $clamp)
+>;
+
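The asymmetry behind this pattern: integer inline constants cover -16..64, so for a typical positive constant c in (16, 64] the canonicalized -c no longer encodes inline and would need a literal. A simplified sketch of that check (illustrative helper; floating-point inline constants and the packed-splat handling are ignored):

#include <cstdint>

// Integer inline-constant test used for the reasoning above.
bool isIntInlineImm(int32_t v) {
  return v >= -16 && v <= 64;
}

// Example: "sub x, 40" is canonicalized to "add x, -40".
//   isIntInlineImm(40)  -> true   (original constant encodes inline)
//   isIntInlineImm(-40) -> false  (negated constant would need a literal)
// Matching the add back to v_pk_sub_u16 with +40 avoids the literal/mov.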
multiclass MadFmaMixPats<SDPatternOperator fma_like,
Instruction mix_inst,
Instruction mixlo_inst,
@@ -239,29 +248,39 @@ class UDot2Pat<Instruction Inst> : GCNPat <
(AMDGPUmul_u24_oneuse (and i32:$src0, (i32 65535)),
(and i32:$src1, (i32 65535)))
),
- (Inst (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))
->;
+ (Inst (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))> {
+ let SubtargetPredicate = !cast<VOP_Pseudo>(Inst).SubtargetPredicate;
+}
class SDot2Pat<Instruction Inst> : GCNPat <
(add (add_oneuse (AMDGPUmul_i24_oneuse (sra i32:$src0, (i32 16)),
(sra i32:$src1, (i32 16))), i32:$src2),
(AMDGPUmul_i24_oneuse (sext_inreg i32:$src0, i16),
(sext_inreg i32:$src1, i16))),
- (Inst (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))
->;
+ (Inst (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))> {
+ let SubtargetPredicate = !cast<VOP_Pseudo>(Inst).SubtargetPredicate;
+}
-let SubtargetPredicate = HasDotInsts in {
+let SubtargetPredicate = HasDot2Insts in {
def V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", VOP3_Profile<VOP_F32_V2F16_V2F16_F32>>;
def V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>>;
def V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>>;
-def V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>;
def V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>;
-def V_DOT8_I32_I4 : VOP3PInst<"v_dot8_i32_i4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>;
def V_DOT8_U32_U4 : VOP3PInst<"v_dot8_u32_u4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>;
+} // End SubtargetPredicate = HasDot2Insts
+
+let SubtargetPredicate = HasDot1Insts in {
+
+def V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>;
+def V_DOT8_I32_I4 : VOP3PInst<"v_dot8_i32_i4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>;
+
+} // End SubtargetPredicate = HasDot1Insts
+
multiclass DotPats<SDPatternOperator dot_op,
VOP3PInst dot_inst> {
+ let SubtargetPredicate = dot_inst.SubtargetPredicate in
def : GCNPat <
(dot_op (dot_inst.Pfl.Src0VT (VOP3PMods0 dot_inst.Pfl.Src0VT:$src0, i32:$src0_modifiers)),
(dot_inst.Pfl.Src1VT (VOP3PMods dot_inst.Pfl.Src1VT:$src1, i32:$src1_modifiers)),
@@ -281,12 +300,14 @@ def : UDot2Pat<V_DOT2_U32_U16>;
def : SDot2Pat<V_DOT2_I32_I16>;
foreach Type = ["U", "I"] in
+ let SubtargetPredicate = !cast<VOP_Pseudo>("V_DOT4_"#Type#"32_"#Type#8).SubtargetPredicate in
def : GCNPat <
!cast<dag>(!foldl((i32 i32:$src2), [0, 1, 2, 3], lhs, y,
(add_oneuse lhs, (!cast<PatFrag>("Mul"#Type#"_Elt"#y) i32:$src0, i32:$src1)))),
(!cast<VOP3PInst>("V_DOT4_"#Type#"32_"#Type#8) (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>;
foreach Type = ["U", "I"] in
+ let SubtargetPredicate = !cast<VOP_Pseudo>("V_DOT8_"#Type#"32_"#Type#4).SubtargetPredicate in
def : GCNPat <
!cast<dag>(!foldl((add_oneuse i32:$src2, (!cast<PatFrag>("Mul"#Type#"0_4bit") i32:$src0, i32:$src1)),
[1, 2, 3, 4, 5, 6, 7], lhs, y,
@@ -296,19 +317,101 @@ foreach Type = ["U", "I"] in
// Different variants of dot8 code-gen dag patterns are not generated through table-gen due to a huge increase
// in compile time. Directly handle the pattern generated by the frontend (FE) here.
foreach Type = ["U", "I"] in
+ let SubtargetPredicate = !cast<VOP_Pseudo>("V_DOT8_"#Type#"32_"#Type#4).SubtargetPredicate in
def : GCNPat <
!cast<dag>(!foldl((add_oneuse i32:$src2, (!cast<PatFrag>("Mul"#Type#"0_4bit") i32:$src0, i32:$src1)),
[7, 1, 2, 3, 4, 5, 6], lhs, y,
(NonACAdd_oneuse lhs, (!cast<PatFrag>("Mul"#Type#y#"_4bit") i32:$src0, i32:$src1)))),
(!cast<VOP3PInst>("V_DOT8_"#Type#"32_"#Type#4) (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>;
-} // End SubtargetPredicate = HasDotInsts
+def ADst_32 : VOPDstOperand<AGPR_32>;
+def ADst_128 : VOPDstOperand<AReg_128>;
+def ADst_512 : VOPDstOperand<AReg_512>;
+def ADst_1024 : VOPDstOperand<AReg_1024>;
+
+def VOPProfileAccRead : VOP3_Profile<VOP_I32_I32, VOP3_MAI> {
+ let Src0RC64 = ARegSrc_32;
+}
+
+def VOPProfileAccWrite : VOP3_Profile<VOP_I32_I32, VOP3_MAI> {
+ let DstRC = ADst_32;
+ let Src0RC64 = VISrc_b32;
+}
+
+class VOPProfileMAI<VOPProfile P, RegisterOperand _SrcRC, RegisterOperand _DstRC,
+ RegisterOperand SrcABRC = AVSrc_32>
+ : VOP3_Profile<P, VOP3_MAI> {
+ let DstRC = _DstRC;
+ let Src0RC64 = SrcABRC;
+ let Src1RC64 = SrcABRC;
+ let Src2RC64 = _SrcRC;
+ let HasOpSel = 0;
+ let HasClamp = 0;
+ let HasModifiers = 0;
+ let Asm64 = " $vdst, $src0, $src1, $src2$cbsz$abid$blgp";
+ let Ins64 = (ins Src0RC64:$src0, Src1RC64:$src1, Src2RC64:$src2, cbsz:$cbsz, abid:$abid, blgp:$blgp);
+}
+
+def VOPProfileMAI_F32_F32_X4 : VOPProfileMAI<VOP_V4F32_F32_F32_V4F32, AISrc_128_f32, ADst_128>;
+def VOPProfileMAI_F32_F32_X16 : VOPProfileMAI<VOP_V16F32_F32_F32_V16F32, AISrc_512_f32, ADst_512>;
+def VOPProfileMAI_F32_F32_X32 : VOPProfileMAI<VOP_V32F32_F32_F32_V32F32, AISrc_1024_f32, ADst_1024>;
+def VOPProfileMAI_I32_I32_X4 : VOPProfileMAI<VOP_V4I32_I32_I32_V4I32, AISrc_128_b32, ADst_128>;
+def VOPProfileMAI_I32_I32_X16 : VOPProfileMAI<VOP_V16I32_I32_I32_V16I32, AISrc_512_b32, ADst_512>;
+def VOPProfileMAI_I32_I32_X32 : VOPProfileMAI<VOP_V32I32_I32_I32_V32I32, AISrc_1024_b32, ADst_1024>;
+def VOPProfileMAI_F32_V2I16_X4 : VOPProfileMAI<VOP_V4F32_V2I16_V2I16_V4F32, AISrc_128_b32, ADst_128>;
+def VOPProfileMAI_F32_V2I16_X16 : VOPProfileMAI<VOP_V16F32_V2I16_V2I16_V16F32, AISrc_512_b32, ADst_512>;
+def VOPProfileMAI_F32_V2I16_X32 : VOPProfileMAI<VOP_V32F32_V2I16_V2I16_V32F32, AISrc_1024_b32, ADst_1024>;
+def VOPProfileMAI_F32_V4F16_X4 : VOPProfileMAI<VOP_V4F32_V4F16_V4F16_V4F32, AISrc_128_b32, ADst_128, AVSrc_64>;
+def VOPProfileMAI_F32_V4F16_X16 : VOPProfileMAI<VOP_V16F32_V4F16_V4F16_V16F32, AISrc_512_b32, ADst_512, AVSrc_64>;
+def VOPProfileMAI_F32_V4F16_X32 : VOPProfileMAI<VOP_V32F32_V4F16_V4F16_V32F32, AISrc_1024_b32, ADst_1024, AVSrc_64>;
+
+let Predicates = [HasMAIInsts] in {
+def V_ACCVGPR_READ_B32 : VOP3Inst<"v_accvgpr_read_b32", VOPProfileAccRead>;
+def V_ACCVGPR_WRITE_B32 : VOP3Inst<"v_accvgpr_write_b32", VOPProfileAccWrite> {
+ let isMoveImm = 1;
+}
+
+let isConvergent = 1 in {
+def V_MFMA_F32_4X4X1F32 : VOP3Inst<"v_mfma_f32_4x4x1f32", VOPProfileMAI_F32_F32_X4, int_amdgcn_mfma_f32_4x4x1f32>;
+def V_MFMA_F32_4X4X4F16 : VOP3Inst<"v_mfma_f32_4x4x4f16", VOPProfileMAI_F32_V4F16_X4, int_amdgcn_mfma_f32_4x4x4f16>;
+def V_MFMA_I32_4X4X4I8 : VOP3Inst<"v_mfma_i32_4x4x4i8", VOPProfileMAI_I32_I32_X4, int_amdgcn_mfma_i32_4x4x4i8>;
+def V_MFMA_F32_4X4X2BF16 : VOP3Inst<"v_mfma_f32_4x4x2bf16", VOPProfileMAI_F32_V2I16_X4, int_amdgcn_mfma_f32_4x4x2bf16>;
+def V_MFMA_F32_16X16X1F32 : VOP3Inst<"v_mfma_f32_16x16x1f32", VOPProfileMAI_F32_F32_X16, int_amdgcn_mfma_f32_16x16x1f32>;
+def V_MFMA_F32_16X16X4F32 : VOP3Inst<"v_mfma_f32_16x16x4f32", VOPProfileMAI_F32_F32_X4, int_amdgcn_mfma_f32_16x16x4f32>;
+def V_MFMA_F32_16X16X4F16 : VOP3Inst<"v_mfma_f32_16x16x4f16", VOPProfileMAI_F32_V4F16_X16, int_amdgcn_mfma_f32_16x16x4f16>;
+def V_MFMA_F32_16X16X16F16 : VOP3Inst<"v_mfma_f32_16x16x16f16", VOPProfileMAI_F32_V4F16_X4, int_amdgcn_mfma_f32_16x16x16f16>;
+def V_MFMA_I32_16X16X4I8 : VOP3Inst<"v_mfma_i32_16x16x4i8", VOPProfileMAI_I32_I32_X16, int_amdgcn_mfma_i32_16x16x4i8>;
+def V_MFMA_I32_16X16X16I8 : VOP3Inst<"v_mfma_i32_16x16x16i8", VOPProfileMAI_I32_I32_X4, int_amdgcn_mfma_i32_16x16x16i8>;
+def V_MFMA_F32_16X16X2BF16 : VOP3Inst<"v_mfma_f32_16x16x2bf16", VOPProfileMAI_F32_V2I16_X16, int_amdgcn_mfma_f32_16x16x2bf16>;
+def V_MFMA_F32_16X16X8BF16 : VOP3Inst<"v_mfma_f32_16x16x8bf16", VOPProfileMAI_F32_V2I16_X4, int_amdgcn_mfma_f32_16x16x8bf16>;
+def V_MFMA_F32_32X32X1F32 : VOP3Inst<"v_mfma_f32_32x32x1f32", VOPProfileMAI_F32_F32_X32, int_amdgcn_mfma_f32_32x32x1f32>;
+def V_MFMA_F32_32X32X2F32 : VOP3Inst<"v_mfma_f32_32x32x2f32", VOPProfileMAI_F32_F32_X16, int_amdgcn_mfma_f32_32x32x2f32>;
+def V_MFMA_F32_32X32X4F16 : VOP3Inst<"v_mfma_f32_32x32x4f16", VOPProfileMAI_F32_V4F16_X32, int_amdgcn_mfma_f32_32x32x4f16>;
+def V_MFMA_F32_32X32X8F16 : VOP3Inst<"v_mfma_f32_32x32x8f16", VOPProfileMAI_F32_V4F16_X16, int_amdgcn_mfma_f32_32x32x8f16>;
+def V_MFMA_I32_32X32X4I8 : VOP3Inst<"v_mfma_i32_32x32x4i8", VOPProfileMAI_I32_I32_X32, int_amdgcn_mfma_i32_32x32x4i8>;
+def V_MFMA_I32_32X32X8I8 : VOP3Inst<"v_mfma_i32_32x32x8i8", VOPProfileMAI_I32_I32_X16, int_amdgcn_mfma_i32_32x32x8i8>;
+def V_MFMA_F32_32X32X2BF16 : VOP3Inst<"v_mfma_f32_32x32x2bf16", VOPProfileMAI_F32_V2I16_X32, int_amdgcn_mfma_f32_32x32x2bf16>;
+def V_MFMA_F32_32X32X4BF16 : VOP3Inst<"v_mfma_f32_32x32x4bf16", VOPProfileMAI_F32_V2I16_X16, int_amdgcn_mfma_f32_32x32x4bf16>;
+} // End isConvergent = 1
+
+} // End Predicates = [HasMAIInsts]
+
+def : MnemonicAlias<"v_accvgpr_read", "v_accvgpr_read_b32">;
+def : MnemonicAlias<"v_accvgpr_write", "v_accvgpr_write_b32">;
multiclass VOP3P_Real_vi<bits<10> op> {
def _vi : VOP3P_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.VI>,
VOP3Pe <op, !cast<VOP3_Pseudo>(NAME).Pfl> {
let AssemblerPredicates = [HasVOP3PInsts];
- let DecoderNamespace = "VI";
+ let DecoderNamespace = "GFX8";
+ }
+}
+
+multiclass VOP3P_Real_MAI<bits<10> op> {
+ def _vi : VOP3P_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.VI>,
+ VOP3Pe_MAI <op, !cast<VOP3_Pseudo>(NAME).Pfl> {
+ let AssemblerPredicates = [HasMAIInsts];
+ let DecoderNamespace = "GFX8";
}
}
@@ -352,14 +455,97 @@ defm V_FMA_MIXHI_F16 : VOP3P_Real_vi <0x3a2>;
}
-let SubtargetPredicate = HasDotInsts in {
+let SubtargetPredicate = HasDot2Insts in {
defm V_DOT2_F32_F16 : VOP3P_Real_vi <0x3a3>;
defm V_DOT2_I32_I16 : VOP3P_Real_vi <0x3a6>;
defm V_DOT2_U32_U16 : VOP3P_Real_vi <0x3a7>;
-defm V_DOT4_I32_I8 : VOP3P_Real_vi <0x3a8>;
defm V_DOT4_U32_U8 : VOP3P_Real_vi <0x3a9>;
-defm V_DOT8_I32_I4 : VOP3P_Real_vi <0x3aa>;
defm V_DOT8_U32_U4 : VOP3P_Real_vi <0x3ab>;
-} // End SubtargetPredicate = HasDotInsts
+} // End SubtargetPredicate = HasDot2Insts
+
+let SubtargetPredicate = HasDot1Insts in {
+
+defm V_DOT4_I32_I8 : VOP3P_Real_vi <0x3a8>;
+defm V_DOT8_I32_I4 : VOP3P_Real_vi <0x3aa>;
+
+} // End SubtargetPredicate = HasDot1Insts
+
+let SubtargetPredicate = HasMAIInsts in {
+
+defm V_ACCVGPR_READ_B32 : VOP3P_Real_MAI <0x3d8>;
+defm V_ACCVGPR_WRITE_B32 : VOP3P_Real_MAI <0x3d9>;
+defm V_MFMA_F32_32X32X1F32 : VOP3P_Real_MAI <0x3c0>;
+defm V_MFMA_F32_16X16X1F32 : VOP3P_Real_MAI <0x3c1>;
+defm V_MFMA_F32_4X4X1F32 : VOP3P_Real_MAI <0x3c2>;
+defm V_MFMA_F32_32X32X2F32 : VOP3P_Real_MAI <0x3c4>;
+defm V_MFMA_F32_16X16X4F32 : VOP3P_Real_MAI <0x3c5>;
+defm V_MFMA_F32_32X32X4F16 : VOP3P_Real_MAI <0x3c8>;
+defm V_MFMA_F32_16X16X4F16 : VOP3P_Real_MAI <0x3c9>;
+defm V_MFMA_F32_4X4X4F16 : VOP3P_Real_MAI <0x3ca>;
+defm V_MFMA_F32_32X32X8F16 : VOP3P_Real_MAI <0x3cc>;
+defm V_MFMA_F32_16X16X16F16 : VOP3P_Real_MAI <0x3cd>;
+defm V_MFMA_I32_32X32X4I8 : VOP3P_Real_MAI <0x3d0>;
+defm V_MFMA_I32_16X16X4I8 : VOP3P_Real_MAI <0x3d1>;
+defm V_MFMA_I32_4X4X4I8 : VOP3P_Real_MAI <0x3d2>;
+defm V_MFMA_I32_32X32X8I8 : VOP3P_Real_MAI <0x3d4>;
+defm V_MFMA_I32_16X16X16I8 : VOP3P_Real_MAI <0x3d5>;
+defm V_MFMA_F32_32X32X2BF16 : VOP3P_Real_MAI <0x3e8>;
+defm V_MFMA_F32_16X16X2BF16 : VOP3P_Real_MAI <0x3e9>;
+defm V_MFMA_F32_4X4X2BF16 : VOP3P_Real_MAI <0x3eb>;
+defm V_MFMA_F32_32X32X4BF16 : VOP3P_Real_MAI <0x3ec>;
+defm V_MFMA_F32_16X16X8BF16 : VOP3P_Real_MAI <0x3ed>;
+
+} // End SubtargetPredicate = HasMAIInsts
+
+//===----------------------------------------------------------------------===//
+// GFX10.
+//===----------------------------------------------------------------------===//
+
+let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
+ multiclass VOP3P_Real_gfx10<bits<10> op> {
+ def _gfx10 : VOP3P_Real<!cast<VOP3P_Pseudo>(NAME), SIEncodingFamily.GFX10>,
+ VOP3Pe_gfx10 <op, !cast<VOP3P_Pseudo>(NAME).Pfl>;
+ }
+} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10"
+
+defm V_PK_MAD_I16 : VOP3P_Real_gfx10<0x000>;
+defm V_PK_MUL_LO_U16 : VOP3P_Real_gfx10<0x001>;
+defm V_PK_ADD_I16 : VOP3P_Real_gfx10<0x002>;
+defm V_PK_SUB_I16 : VOP3P_Real_gfx10<0x003>;
+defm V_PK_LSHLREV_B16 : VOP3P_Real_gfx10<0x004>;
+defm V_PK_LSHRREV_B16 : VOP3P_Real_gfx10<0x005>;
+defm V_PK_ASHRREV_I16 : VOP3P_Real_gfx10<0x006>;
+defm V_PK_MAX_I16 : VOP3P_Real_gfx10<0x007>;
+defm V_PK_MIN_I16 : VOP3P_Real_gfx10<0x008>;
+defm V_PK_MAD_U16 : VOP3P_Real_gfx10<0x009>;
+defm V_PK_ADD_U16 : VOP3P_Real_gfx10<0x00a>;
+defm V_PK_SUB_U16 : VOP3P_Real_gfx10<0x00b>;
+defm V_PK_MAX_U16 : VOP3P_Real_gfx10<0x00c>;
+defm V_PK_MIN_U16 : VOP3P_Real_gfx10<0x00d>;
+defm V_PK_FMA_F16 : VOP3P_Real_gfx10<0x00e>;
+defm V_PK_ADD_F16 : VOP3P_Real_gfx10<0x00f>;
+defm V_PK_MUL_F16 : VOP3P_Real_gfx10<0x010>;
+defm V_PK_MIN_F16 : VOP3P_Real_gfx10<0x011>;
+defm V_PK_MAX_F16 : VOP3P_Real_gfx10<0x012>;
+defm V_FMA_MIX_F32 : VOP3P_Real_gfx10<0x020>;
+defm V_FMA_MIXLO_F16 : VOP3P_Real_gfx10<0x021>;
+defm V_FMA_MIXHI_F16 : VOP3P_Real_gfx10<0x022>;
+
+let SubtargetPredicate = HasDot2Insts in {
+
+defm V_DOT2_F32_F16 : VOP3P_Real_gfx10 <0x013>;
+defm V_DOT2_I32_I16 : VOP3P_Real_gfx10 <0x014>;
+defm V_DOT2_U32_U16 : VOP3P_Real_gfx10 <0x015>;
+defm V_DOT4_U32_U8 : VOP3P_Real_gfx10 <0x017>;
+defm V_DOT8_U32_U4 : VOP3P_Real_gfx10 <0x019>;
+
+} // End SubtargetPredicate = HasDot2Insts
+
+let SubtargetPredicate = HasDot1Insts in {
+
+defm V_DOT4_I32_I8 : VOP3P_Real_gfx10 <0x016>;
+defm V_DOT8_I32_I4 : VOP3P_Real_gfx10 <0x018>;
+
+} // End SubtargetPredicate = HasDot1Insts
diff --git a/lib/Target/AMDGPU/VOPCInstructions.td b/lib/Target/AMDGPU/VOPCInstructions.td
index 091cac8cd35c..b3513e383d10 100644
--- a/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/lib/Target/AMDGPU/VOPCInstructions.td
@@ -1,9 +1,8 @@
//===-- VOPCInstructions.td - Vector Instruction Definitions --------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -54,14 +53,29 @@ class VOPC_SDWA9e <bits<8> op, VOPProfile P> : VOP_SDWA9Be <P> {
// an explicit $dst.
class VOPC_Profile<list<SchedReadWrite> sched, ValueType vt0, ValueType vt1 = vt0> :
VOPProfile <[i1, vt0, vt1, untyped]> {
- let Asm32 = "vcc, $src0, $src1";
+ let Asm32 = "$src0, $src1";
// The destination for 32-bit encoding is implicit.
let HasDst32 = 0;
- let Outs64 = (outs VOPDstS64:$sdst);
+ let Outs64 = (outs VOPDstS64orS32:$sdst);
list<SchedReadWrite> Schedule = sched;
}
-class VOPC_Pseudo <string opName, VOPC_Profile P, list<dag> pattern=[]> :
+class VOPC_NoSdst_Profile<list<SchedReadWrite> sched, ValueType vt0,
+ ValueType vt1 = vt0> :
+ VOPC_Profile<sched, vt0, vt1> {
+ let Outs64 = (outs );
+ let OutsSDWA = (outs );
+ let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0,
+ Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1,
+ src0_sel:$src0_sel, src1_sel:$src1_sel);
+ let Asm64 = !if(isFloatType<Src0VT>.ret, "$src0_modifiers, $src1_modifiers$clamp",
+ "$src0, $src1");
+ let AsmSDWA9 = "$src0_modifiers, $src1_modifiers $src0_sel $src1_sel";
+ let EmitDst = 0;
+}
+
+class VOPC_Pseudo <string opName, VOPC_Profile P, list<dag> pattern=[],
+ bit DefVcc = 1> :
InstSI<(outs), P.Ins32, "", pattern>,
VOP <opName>,
SIMCInstr<opName#"_e32", SIEncodingFamily.NONE> {
@@ -81,9 +95,7 @@ class VOPC_Pseudo <string opName, VOPC_Profile P, list<dag> pattern=[]> :
let VALU = 1;
let VOPC = 1;
let Uses = [EXEC];
- let Defs = [VCC];
-
- let SubtargetPredicate = isGCN;
+ let Defs = !if(DefVcc, [VCC], []);
VOPProfile Pfl = P;
}
@@ -115,8 +127,9 @@ class VOPC_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
}
// This class is used only with VOPC instructions. Use $sdst for out operand
-class VOPCInstAlias <VOP3_Pseudo ps, Instruction inst, VOPProfile p = ps.Pfl> :
- InstAlias <ps.OpName#" "#p.Asm32, (inst)>, PredicateControl {
+class VOPCInstAlias <VOP3_Pseudo ps, Instruction inst,
+ string Asm32 = ps.Pfl.Asm32, VOPProfile p = ps.Pfl> :
+ InstAlias <ps.OpName#" "#Asm32, (inst)>, PredicateControl {
field bit isCompare;
field bit isCommutable;
@@ -149,6 +162,27 @@ class VOPCInstAlias <VOP3_Pseudo ps, Instruction inst, VOPProfile p = ps.Pfl> :
let SubtargetPredicate = AssemblerPredicate;
}
+multiclass VOPCInstAliases <string OpName, string Arch> {
+ def : VOPCInstAlias <!cast<VOP3_Pseudo>(OpName#"_e64"),
+ !cast<Instruction>(OpName#"_e32_"#Arch)>;
+ let WaveSizePredicate = isWave32 in {
+ def : VOPCInstAlias <!cast<VOP3_Pseudo>(OpName#"_e64"),
+ !cast<Instruction>(OpName#"_e32_"#Arch),
+ "vcc_lo, "#!cast<VOP3_Pseudo>(OpName#"_e64").Pfl.Asm32>;
+ }
+ let WaveSizePredicate = isWave64 in {
+ def : VOPCInstAlias <!cast<VOP3_Pseudo>(OpName#"_e64"),
+ !cast<Instruction>(OpName#"_e32_"#Arch),
+ "vcc, "#!cast<VOP3_Pseudo>(OpName#"_e64").Pfl.Asm32>;
+ }
+}
+
+multiclass VOPCXInstAliases <string OpName, string Arch> {
+ def : VOPCInstAlias <!cast<VOP3_Pseudo>(OpName#"_e64"),
+ !cast<Instruction>(OpName#"_e32_"#Arch)>;
+}
+
+
class getVOPCPat64 <PatLeaf cond, VOPProfile P> : LetDummies {
list<dag> ret = !if(P.HasModifiers,
[(set i1:$sdst,
@@ -161,6 +195,10 @@ class getVOPCPat64 <PatLeaf cond, VOPProfile P> : LetDummies {
[(set i1:$sdst, (setcc P.Src0VT:$src0, P.Src1VT:$src1, cond))]);
}
+class VCMPXNoSDstTable <bit has_sdst, string Name> {
+ bit HasSDst = has_sdst;
+ string NoSDstOp = Name;
+}
multiclass VOPC_Pseudos <string opName,
VOPC_Profile P,
@@ -169,7 +207,8 @@ multiclass VOPC_Pseudos <string opName,
bit DefExec = 0> {
def _e32 : VOPC_Pseudo <opName, P>,
- Commutable_REV<revOp#"_e32", !eq(revOp, opName)> {
+ Commutable_REV<revOp#"_e32", !eq(revOp, opName)>,
+ VCMPXNoSDstTable<1, opName#"_e32"> {
let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
let SchedRW = P.Schedule;
let isConvergent = DefExec;
@@ -178,7 +217,8 @@ multiclass VOPC_Pseudos <string opName,
}
def _e64 : VOP3_Pseudo<opName, P, getVOPCPat64<cond, P>.ret>,
- Commutable_REV<revOp#"_e64", !eq(revOp, opName)> {
+ Commutable_REV<revOp#"_e64", !eq(revOp, opName)>,
+ VCMPXNoSDstTable<1, opName#"_e64"> {
let Defs = !if(DefExec, [EXEC], []);
let SchedRW = P.Schedule;
let isCompare = 1;
@@ -193,6 +233,44 @@ multiclass VOPC_Pseudos <string opName,
}
}
+let SubtargetPredicate = HasSdstCMPX in {
+multiclass VOPCX_Pseudos <string opName,
+ VOPC_Profile P, VOPC_Profile P_NoSDst,
+ PatLeaf cond = COND_NULL,
+ string revOp = opName> :
+ VOPC_Pseudos <opName, P, cond, revOp, 1> {
+
+ def _nosdst_e32 : VOPC_Pseudo <opName#"_nosdst", P_NoSDst, [], 0>,
+ Commutable_REV<revOp#"_nosdst_e32", !eq(revOp, opName)>,
+ VCMPXNoSDstTable<0, opName#"_e32"> {
+ let Defs = [EXEC];
+ let SchedRW = P_NoSDst.Schedule;
+ let isConvergent = 1;
+ let isCompare = 1;
+ let isCommutable = 1;
+ let SubtargetPredicate = HasNoSdstCMPX;
+ }
+
+ def _nosdst_e64 : VOP3_Pseudo<opName#"_nosdst", P_NoSDst>,
+ Commutable_REV<revOp#"_nosdst_e64", !eq(revOp, opName)>,
+ VCMPXNoSDstTable<0, opName#"_e64"> {
+ let Defs = [EXEC];
+ let SchedRW = P_NoSDst.Schedule;
+ let isCompare = 1;
+ let isCommutable = 1;
+ let SubtargetPredicate = HasNoSdstCMPX;
+ }
+
+ def _nosdst_sdwa : VOPC_SDWA_Pseudo <opName#"_nosdst", P_NoSDst> {
+ let Defs = [EXEC];
+ let SchedRW = P_NoSDst.Schedule;
+ let isConvergent = 1;
+ let isCompare = 1;
+ let SubtargetPredicate = HasNoSdstCMPX;
+ }
+}
+} // End SubtargetPredicate = HasSdstCMPX
+
def VOPC_I1_F16_F16 : VOPC_Profile<[Write32Bit], f16>;
def VOPC_I1_F32_F32 : VOPC_Profile<[Write32Bit], f32>;
def VOPC_I1_F64_F64 : VOPC_Profile<[WriteDoubleAdd], f64>;
@@ -200,6 +278,13 @@ def VOPC_I1_I16_I16 : VOPC_Profile<[Write32Bit], i16>;
def VOPC_I1_I32_I32 : VOPC_Profile<[Write32Bit], i32>;
def VOPC_I1_I64_I64 : VOPC_Profile<[Write64Bit], i64>;
+def VOPC_F16_F16 : VOPC_NoSdst_Profile<[Write32Bit], f16>;
+def VOPC_F32_F32 : VOPC_NoSdst_Profile<[Write32Bit], f32>;
+def VOPC_F64_F64 : VOPC_NoSdst_Profile<[Write64Bit], f64>;
+def VOPC_I16_I16 : VOPC_NoSdst_Profile<[Write32Bit], i16>;
+def VOPC_I32_I32 : VOPC_NoSdst_Profile<[Write32Bit], i32>;
+def VOPC_I64_I64 : VOPC_NoSdst_Profile<[Write64Bit], i64>;
+
multiclass VOPC_F16 <string opName, PatLeaf cond = COND_NULL, string revOp = opName> :
VOPC_Pseudos <opName, VOPC_I1_F16_F16, cond, revOp, 0>;
@@ -219,22 +304,22 @@ multiclass VOPC_I64 <string opName, PatLeaf cond = COND_NULL, string revOp = opN
VOPC_Pseudos <opName, VOPC_I1_I64_I64, cond, revOp, 0>;
multiclass VOPCX_F16 <string opName, string revOp = opName> :
- VOPC_Pseudos <opName, VOPC_I1_F16_F16, COND_NULL, revOp, 1>;
+ VOPCX_Pseudos <opName, VOPC_I1_F16_F16, VOPC_F16_F16, COND_NULL, revOp>;
multiclass VOPCX_F32 <string opName, string revOp = opName> :
- VOPC_Pseudos <opName, VOPC_I1_F32_F32, COND_NULL, revOp, 1>;
+ VOPCX_Pseudos <opName, VOPC_I1_F32_F32, VOPC_F32_F32, COND_NULL, revOp>;
multiclass VOPCX_F64 <string opName, string revOp = opName> :
- VOPC_Pseudos <opName, VOPC_I1_F64_F64, COND_NULL, revOp, 1>;
+ VOPCX_Pseudos <opName, VOPC_I1_F64_F64, VOPC_F64_F64, COND_NULL, revOp>;
multiclass VOPCX_I16 <string opName, string revOp = opName> :
- VOPC_Pseudos <opName, VOPC_I1_I16_I16, COND_NULL, revOp, 1>;
+ VOPCX_Pseudos <opName, VOPC_I1_I16_I16, VOPC_I16_I16, COND_NULL, revOp>;
multiclass VOPCX_I32 <string opName, string revOp = opName> :
- VOPC_Pseudos <opName, VOPC_I1_I32_I32, COND_NULL, revOp, 1>;
+ VOPCX_Pseudos <opName, VOPC_I1_I32_I32, VOPC_I32_I32, COND_NULL, revOp>;
multiclass VOPCX_I64 <string opName, string revOp = opName> :
- VOPC_Pseudos <opName, VOPC_I1_I64_I64, COND_NULL, revOp, 1>;
+ VOPCX_Pseudos <opName, VOPC_I1_I64_I64, VOPC_I64_I64, COND_NULL, revOp>;
//===----------------------------------------------------------------------===//
@@ -309,7 +394,7 @@ defm V_CMPX_NEQ_F64 : VOPCX_F64 <"v_cmpx_neq_f64">;
defm V_CMPX_NLT_F64 : VOPCX_F64 <"v_cmpx_nlt_f64">;
defm V_CMPX_TRU_F64 : VOPCX_F64 <"v_cmpx_tru_f64">;
-let SubtargetPredicate = isSICI in {
+let SubtargetPredicate = isGFX6GFX7 in {
defm V_CMPS_F_F32 : VOPC_F32 <"v_cmps_f_f32">;
defm V_CMPS_LT_F32 : VOPC_F32 <"v_cmps_lt_f32", COND_NULL, "v_cmps_gt_f32">;
@@ -379,7 +464,7 @@ defm V_CMPSX_NEQ_F64 : VOPCX_F64 <"v_cmpsx_neq_f64">;
defm V_CMPSX_NLT_F64 : VOPCX_F64 <"v_cmpsx_nlt_f64">;
defm V_CMPSX_TRU_F64 : VOPCX_F64 <"v_cmpsx_tru_f64">;
-} // End SubtargetPredicate = isSICI
+} // End SubtargetPredicate = isGFX6GFX7
let SubtargetPredicate = Has16BitInsts in {
@@ -546,6 +631,18 @@ class VOPC_Class_Profile<list<SchedReadWrite> sched, ValueType vt> :
let HasOMod = 0;
}
+class VOPC_Class_NoSdst_Profile<list<SchedReadWrite> sched, ValueType vt> :
+ VOPC_Class_Profile<sched, vt> {
+ let Outs64 = (outs );
+ let OutsSDWA = (outs );
+ let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0,
+ Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1,
+ src0_sel:$src0_sel, src1_sel:$src1_sel);
+ let Asm64 = "$src0_modifiers, $src1";
+ let AsmSDWA9 = "$src0_modifiers, $src1_modifiers $src0_sel $src1_sel";
+ let EmitDst = 0;
+}
+
class getVOPCClassPat64 <VOPProfile P> {
list<dag> ret =
[(set i1:$sdst,
@@ -556,46 +653,85 @@ class getVOPCClassPat64 <VOPProfile P> {
// Special case for class instructions which only have modifiers on
// the 1st source operand.
-multiclass VOPC_Class_Pseudos <string opName, VOPC_Profile p, bit DefExec> {
- def _e32 : VOPC_Pseudo <opName, p> {
- let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
+multiclass VOPC_Class_Pseudos <string opName, VOPC_Profile p, bit DefExec,
+ bit DefVcc = 1> {
+ def _e32 : VOPC_Pseudo <opName, p>,
+ VCMPXNoSDstTable<1, opName#"_e32"> {
+ let Defs = !if(DefExec, !if(DefVcc, [VCC, EXEC], [EXEC]),
+ !if(DefVcc, [VCC], []));
let SchedRW = p.Schedule;
let isConvergent = DefExec;
}
- def _e64 : VOP3_Pseudo<opName, p, getVOPCClassPat64<p>.ret> {
+ def _e64 : VOP3_Pseudo<opName, p, getVOPCClassPat64<p>.ret>,
+ VCMPXNoSDstTable<1, opName#"_e64"> {
let Defs = !if(DefExec, [EXEC], []);
let SchedRW = p.Schedule;
}
def _sdwa : VOPC_SDWA_Pseudo <opName, p> {
- let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
+ let Defs = !if(DefExec, !if(DefVcc, [VCC, EXEC], [EXEC]),
+ !if(DefVcc, [VCC], []));
let SchedRW = p.Schedule;
let isConvergent = DefExec;
}
}
+let SubtargetPredicate = HasSdstCMPX in {
+multiclass VOPCX_Class_Pseudos <string opName,
+ VOPC_Profile P,
+ VOPC_Profile P_NoSDst> :
+ VOPC_Class_Pseudos <opName, P, 1, 1> {
+
+ def _nosdst_e32 : VOPC_Pseudo <opName#"_nosdst", P_NoSDst, [], 0>,
+ VCMPXNoSDstTable<0, opName#"_e32"> {
+ let Defs = [EXEC];
+ let SchedRW = P_NoSDst.Schedule;
+ let isConvergent = 1;
+ let SubtargetPredicate = HasNoSdstCMPX;
+ }
+
+ def _nosdst_e64 : VOP3_Pseudo<opName#"_nosdst", P_NoSDst>,
+ VCMPXNoSDstTable<0, opName#"_e64"> {
+ let Defs = [EXEC];
+ let SchedRW = P_NoSDst.Schedule;
+ let SubtargetPredicate = HasNoSdstCMPX;
+ }
+
+ def _nosdst_sdwa : VOPC_SDWA_Pseudo <opName#"_nosdst", P_NoSDst> {
+ let Defs = [EXEC];
+ let SchedRW = P_NoSDst.Schedule;
+ let isConvergent = 1;
+ let SubtargetPredicate = HasNoSdstCMPX;
+ }
+}
+} // End SubtargetPredicate = HasSdstCMPX
+
def VOPC_I1_F16_I32 : VOPC_Class_Profile<[Write32Bit], f16>;
def VOPC_I1_F32_I32 : VOPC_Class_Profile<[Write32Bit], f32>;
def VOPC_I1_F64_I32 : VOPC_Class_Profile<[WriteDoubleAdd], f64>;
+def VOPC_F16_I32 : VOPC_Class_NoSdst_Profile<[Write32Bit], f16>;
+def VOPC_F32_I32 : VOPC_Class_NoSdst_Profile<[Write32Bit], f32>;
+def VOPC_F64_I32 : VOPC_Class_NoSdst_Profile<[Write64Bit], f64>;
+
multiclass VOPC_CLASS_F16 <string opName> :
VOPC_Class_Pseudos <opName, VOPC_I1_F16_I32, 0>;
multiclass VOPCX_CLASS_F16 <string opName> :
- VOPC_Class_Pseudos <opName, VOPC_I1_F16_I32, 1>;
+ VOPCX_Class_Pseudos <opName, VOPC_I1_F16_I32, VOPC_F16_I32>;
multiclass VOPC_CLASS_F32 <string opName> :
VOPC_Class_Pseudos <opName, VOPC_I1_F32_I32, 0>;
multiclass VOPCX_CLASS_F32 <string opName> :
- VOPC_Class_Pseudos <opName, VOPC_I1_F32_I32, 1>;
+ VOPCX_Class_Pseudos <opName, VOPC_I1_F32_I32, VOPC_F32_I32>;
multiclass VOPC_CLASS_F64 <string opName> :
VOPC_Class_Pseudos <opName, VOPC_I1_F64_I32, 0>;
multiclass VOPCX_CLASS_F64 <string opName> :
- VOPC_Class_Pseudos <opName, VOPC_I1_F64_I32, 1>;
+ VOPCX_Class_Pseudos <opName, VOPC_I1_F64_I32, VOPC_F64_I32>;
defm V_CMP_CLASS_F32 : VOPC_CLASS_F32 <"v_cmp_class_f32">;
defm V_CMPX_CLASS_F32 : VOPCX_CLASS_F32 <"v_cmpx_class_f32">;
@@ -608,342 +744,471 @@ defm V_CMPX_CLASS_F16 : VOPCX_CLASS_F16 <"v_cmpx_class_f16">;
// V_ICMPIntrinsic Pattern.
//===----------------------------------------------------------------------===//
-class ICMP_Pattern <PatLeaf cond, Instruction inst, ValueType vt> : GCNPat <
- (AMDGPUsetcc vt:$src0, vt:$src1, cond),
- (inst $src0, $src1)
->;
-
-def : ICMP_Pattern <COND_EQ, V_CMP_EQ_U32_e64, i32>;
-def : ICMP_Pattern <COND_NE, V_CMP_NE_U32_e64, i32>;
-def : ICMP_Pattern <COND_UGT, V_CMP_GT_U32_e64, i32>;
-def : ICMP_Pattern <COND_UGE, V_CMP_GE_U32_e64, i32>;
-def : ICMP_Pattern <COND_ULT, V_CMP_LT_U32_e64, i32>;
-def : ICMP_Pattern <COND_ULE, V_CMP_LE_U32_e64, i32>;
-def : ICMP_Pattern <COND_SGT, V_CMP_GT_I32_e64, i32>;
-def : ICMP_Pattern <COND_SGE, V_CMP_GE_I32_e64, i32>;
-def : ICMP_Pattern <COND_SLT, V_CMP_LT_I32_e64, i32>;
-def : ICMP_Pattern <COND_SLE, V_CMP_LE_I32_e64, i32>;
-
-def : ICMP_Pattern <COND_EQ, V_CMP_EQ_U64_e64, i64>;
-def : ICMP_Pattern <COND_NE, V_CMP_NE_U64_e64, i64>;
-def : ICMP_Pattern <COND_UGT, V_CMP_GT_U64_e64, i64>;
-def : ICMP_Pattern <COND_UGE, V_CMP_GE_U64_e64, i64>;
-def : ICMP_Pattern <COND_ULT, V_CMP_LT_U64_e64, i64>;
-def : ICMP_Pattern <COND_ULE, V_CMP_LE_U64_e64, i64>;
-def : ICMP_Pattern <COND_SGT, V_CMP_GT_I64_e64, i64>;
-def : ICMP_Pattern <COND_SGE, V_CMP_GE_I64_e64, i64>;
-def : ICMP_Pattern <COND_SLT, V_CMP_LT_I64_e64, i64>;
-def : ICMP_Pattern <COND_SLE, V_CMP_LE_I64_e64, i64>;
-
-def : ICMP_Pattern <COND_EQ, V_CMP_EQ_U16_e64, i16>;
-def : ICMP_Pattern <COND_NE, V_CMP_NE_U16_e64, i16>;
-def : ICMP_Pattern <COND_UGT, V_CMP_GT_U16_e64, i16>;
-def : ICMP_Pattern <COND_UGE, V_CMP_GE_U16_e64, i16>;
-def : ICMP_Pattern <COND_ULT, V_CMP_LT_U16_e64, i16>;
-def : ICMP_Pattern <COND_ULE, V_CMP_LE_U16_e64, i16>;
-def : ICMP_Pattern <COND_SGT, V_CMP_GT_I16_e64, i16>;
-def : ICMP_Pattern <COND_SGE, V_CMP_GE_I16_e64, i16>;
-def : ICMP_Pattern <COND_SLT, V_CMP_LT_I16_e64, i16>;
-def : ICMP_Pattern <COND_SLE, V_CMP_LE_I16_e64, i16>;
-
-class FCMP_Pattern <PatLeaf cond, Instruction inst, ValueType vt> : GCNPat <
- (i64 (AMDGPUsetcc (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)),
- (vt (VOP3Mods vt:$src1, i32:$src1_modifiers)), cond)),
- (inst $src0_modifiers, $src0, $src1_modifiers, $src1,
- DSTCLAMP.NONE)
->;
-
-def : FCMP_Pattern <COND_OEQ, V_CMP_EQ_F32_e64, f32>;
-def : FCMP_Pattern <COND_ONE, V_CMP_NEQ_F32_e64, f32>;
-def : FCMP_Pattern <COND_OGT, V_CMP_GT_F32_e64, f32>;
-def : FCMP_Pattern <COND_OGE, V_CMP_GE_F32_e64, f32>;
-def : FCMP_Pattern <COND_OLT, V_CMP_LT_F32_e64, f32>;
-def : FCMP_Pattern <COND_OLE, V_CMP_LE_F32_e64, f32>;
-
-def : FCMP_Pattern <COND_OEQ, V_CMP_EQ_F64_e64, f64>;
-def : FCMP_Pattern <COND_ONE, V_CMP_NEQ_F64_e64, f64>;
-def : FCMP_Pattern <COND_OGT, V_CMP_GT_F64_e64, f64>;
-def : FCMP_Pattern <COND_OGE, V_CMP_GE_F64_e64, f64>;
-def : FCMP_Pattern <COND_OLT, V_CMP_LT_F64_e64, f64>;
-def : FCMP_Pattern <COND_OLE, V_CMP_LE_F64_e64, f64>;
-
-def : FCMP_Pattern <COND_OEQ, V_CMP_EQ_F16_e64, f16>;
-def : FCMP_Pattern <COND_ONE, V_CMP_NEQ_F16_e64, f16>;
-def : FCMP_Pattern <COND_OGT, V_CMP_GT_F16_e64, f16>;
-def : FCMP_Pattern <COND_OGE, V_CMP_GE_F16_e64, f16>;
-def : FCMP_Pattern <COND_OLT, V_CMP_LT_F16_e64, f16>;
-def : FCMP_Pattern <COND_OLE, V_CMP_LE_F16_e64, f16>;
-
-
-def : FCMP_Pattern <COND_UEQ, V_CMP_NLG_F32_e64, f32>;
-def : FCMP_Pattern <COND_UNE, V_CMP_NEQ_F32_e64, f32>;
-def : FCMP_Pattern <COND_UGT, V_CMP_NLE_F32_e64, f32>;
-def : FCMP_Pattern <COND_UGE, V_CMP_NLT_F32_e64, f32>;
-def : FCMP_Pattern <COND_ULT, V_CMP_NGE_F32_e64, f32>;
-def : FCMP_Pattern <COND_ULE, V_CMP_NGT_F32_e64, f32>;
-
-def : FCMP_Pattern <COND_UEQ, V_CMP_NLG_F64_e64, f64>;
-def : FCMP_Pattern <COND_UNE, V_CMP_NEQ_F64_e64, f64>;
-def : FCMP_Pattern <COND_UGT, V_CMP_NLE_F64_e64, f64>;
-def : FCMP_Pattern <COND_UGE, V_CMP_NLT_F64_e64, f64>;
-def : FCMP_Pattern <COND_ULT, V_CMP_NGE_F64_e64, f64>;
-def : FCMP_Pattern <COND_ULE, V_CMP_NGT_F64_e64, f64>;
-
-def : FCMP_Pattern <COND_UEQ, V_CMP_NLG_F16_e64, f16>;
-def : FCMP_Pattern <COND_UNE, V_CMP_NEQ_F16_e64, f16>;
-def : FCMP_Pattern <COND_UGT, V_CMP_NLE_F16_e64, f16>;
-def : FCMP_Pattern <COND_UGE, V_CMP_NLT_F16_e64, f16>;
-def : FCMP_Pattern <COND_ULT, V_CMP_NGE_F16_e64, f16>;
-def : FCMP_Pattern <COND_ULE, V_CMP_NGT_F16_e64, f16>;
+// We need to use COPY_TO_REGCLASS to work around the problem when ReplaceAllUsesWith()
+// complains it cannot replace i1 <-> i64/i32 if node was not morphed in place.
+multiclass ICMP_Pattern <PatLeaf cond, Instruction inst, ValueType vt> {
+ let WaveSizePredicate = isWave64 in
+ def : GCNPat <
+ (i64 (AMDGPUsetcc vt:$src0, vt:$src1, cond)),
+ (i64 (COPY_TO_REGCLASS (inst $src0, $src1), SReg_64))
+ >;
+
+ let WaveSizePredicate = isWave32 in
+ def : GCNPat <
+ (i32 (AMDGPUsetcc vt:$src0, vt:$src1, cond)),
+ (i32 (COPY_TO_REGCLASS (inst $src0, $src1), SReg_32))
+ >;
+}
+
+defm : ICMP_Pattern <COND_EQ, V_CMP_EQ_U32_e64, i32>;
+defm : ICMP_Pattern <COND_NE, V_CMP_NE_U32_e64, i32>;
+defm : ICMP_Pattern <COND_UGT, V_CMP_GT_U32_e64, i32>;
+defm : ICMP_Pattern <COND_UGE, V_CMP_GE_U32_e64, i32>;
+defm : ICMP_Pattern <COND_ULT, V_CMP_LT_U32_e64, i32>;
+defm : ICMP_Pattern <COND_ULE, V_CMP_LE_U32_e64, i32>;
+defm : ICMP_Pattern <COND_SGT, V_CMP_GT_I32_e64, i32>;
+defm : ICMP_Pattern <COND_SGE, V_CMP_GE_I32_e64, i32>;
+defm : ICMP_Pattern <COND_SLT, V_CMP_LT_I32_e64, i32>;
+defm : ICMP_Pattern <COND_SLE, V_CMP_LE_I32_e64, i32>;
+
+defm : ICMP_Pattern <COND_EQ, V_CMP_EQ_U64_e64, i64>;
+defm : ICMP_Pattern <COND_NE, V_CMP_NE_U64_e64, i64>;
+defm : ICMP_Pattern <COND_UGT, V_CMP_GT_U64_e64, i64>;
+defm : ICMP_Pattern <COND_UGE, V_CMP_GE_U64_e64, i64>;
+defm : ICMP_Pattern <COND_ULT, V_CMP_LT_U64_e64, i64>;
+defm : ICMP_Pattern <COND_ULE, V_CMP_LE_U64_e64, i64>;
+defm : ICMP_Pattern <COND_SGT, V_CMP_GT_I64_e64, i64>;
+defm : ICMP_Pattern <COND_SGE, V_CMP_GE_I64_e64, i64>;
+defm : ICMP_Pattern <COND_SLT, V_CMP_LT_I64_e64, i64>;
+defm : ICMP_Pattern <COND_SLE, V_CMP_LE_I64_e64, i64>;
+
+defm : ICMP_Pattern <COND_EQ, V_CMP_EQ_U16_e64, i16>;
+defm : ICMP_Pattern <COND_NE, V_CMP_NE_U16_e64, i16>;
+defm : ICMP_Pattern <COND_UGT, V_CMP_GT_U16_e64, i16>;
+defm : ICMP_Pattern <COND_UGE, V_CMP_GE_U16_e64, i16>;
+defm : ICMP_Pattern <COND_ULT, V_CMP_LT_U16_e64, i16>;
+defm : ICMP_Pattern <COND_ULE, V_CMP_LE_U16_e64, i16>;
+defm : ICMP_Pattern <COND_SGT, V_CMP_GT_I16_e64, i16>;
+defm : ICMP_Pattern <COND_SGE, V_CMP_GE_I16_e64, i16>;
+defm : ICMP_Pattern <COND_SLT, V_CMP_LT_I16_e64, i16>;
+defm : ICMP_Pattern <COND_SLE, V_CMP_LE_I16_e64, i16>;
+
+multiclass FCMP_Pattern <PatLeaf cond, Instruction inst, ValueType vt> {
+ let WaveSizePredicate = isWave64 in
+ def : GCNPat <
+ (i64 (AMDGPUsetcc (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)),
+ (vt (VOP3Mods vt:$src1, i32:$src1_modifiers)), cond)),
+ (i64 (COPY_TO_REGCLASS (inst $src0_modifiers, $src0, $src1_modifiers, $src1,
+ DSTCLAMP.NONE), SReg_64))
+ >;
+
+ let WaveSizePredicate = isWave32 in
+ def : GCNPat <
+ (i32 (AMDGPUsetcc (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)),
+ (vt (VOP3Mods vt:$src1, i32:$src1_modifiers)), cond)),
+ (i32 (COPY_TO_REGCLASS (inst $src0_modifiers, $src0, $src1_modifiers, $src1,
+ DSTCLAMP.NONE), SReg_32))
+ >;
+}
+
+defm : FCMP_Pattern <COND_OEQ, V_CMP_EQ_F32_e64, f32>;
+defm : FCMP_Pattern <COND_ONE, V_CMP_NEQ_F32_e64, f32>;
+defm : FCMP_Pattern <COND_OGT, V_CMP_GT_F32_e64, f32>;
+defm : FCMP_Pattern <COND_OGE, V_CMP_GE_F32_e64, f32>;
+defm : FCMP_Pattern <COND_OLT, V_CMP_LT_F32_e64, f32>;
+defm : FCMP_Pattern <COND_OLE, V_CMP_LE_F32_e64, f32>;
+
+defm : FCMP_Pattern <COND_OEQ, V_CMP_EQ_F64_e64, f64>;
+defm : FCMP_Pattern <COND_ONE, V_CMP_NEQ_F64_e64, f64>;
+defm : FCMP_Pattern <COND_OGT, V_CMP_GT_F64_e64, f64>;
+defm : FCMP_Pattern <COND_OGE, V_CMP_GE_F64_e64, f64>;
+defm : FCMP_Pattern <COND_OLT, V_CMP_LT_F64_e64, f64>;
+defm : FCMP_Pattern <COND_OLE, V_CMP_LE_F64_e64, f64>;
+
+defm : FCMP_Pattern <COND_OEQ, V_CMP_EQ_F16_e64, f16>;
+defm : FCMP_Pattern <COND_ONE, V_CMP_NEQ_F16_e64, f16>;
+defm : FCMP_Pattern <COND_OGT, V_CMP_GT_F16_e64, f16>;
+defm : FCMP_Pattern <COND_OGE, V_CMP_GE_F16_e64, f16>;
+defm : FCMP_Pattern <COND_OLT, V_CMP_LT_F16_e64, f16>;
+defm : FCMP_Pattern <COND_OLE, V_CMP_LE_F16_e64, f16>;
+
+
+defm : FCMP_Pattern <COND_UEQ, V_CMP_NLG_F32_e64, f32>;
+defm : FCMP_Pattern <COND_UNE, V_CMP_NEQ_F32_e64, f32>;
+defm : FCMP_Pattern <COND_UGT, V_CMP_NLE_F32_e64, f32>;
+defm : FCMP_Pattern <COND_UGE, V_CMP_NLT_F32_e64, f32>;
+defm : FCMP_Pattern <COND_ULT, V_CMP_NGE_F32_e64, f32>;
+defm : FCMP_Pattern <COND_ULE, V_CMP_NGT_F32_e64, f32>;
+
+defm : FCMP_Pattern <COND_UEQ, V_CMP_NLG_F64_e64, f64>;
+defm : FCMP_Pattern <COND_UNE, V_CMP_NEQ_F64_e64, f64>;
+defm : FCMP_Pattern <COND_UGT, V_CMP_NLE_F64_e64, f64>;
+defm : FCMP_Pattern <COND_UGE, V_CMP_NLT_F64_e64, f64>;
+defm : FCMP_Pattern <COND_ULT, V_CMP_NGE_F64_e64, f64>;
+defm : FCMP_Pattern <COND_ULE, V_CMP_NGT_F64_e64, f64>;
+
+defm : FCMP_Pattern <COND_UEQ, V_CMP_NLG_F16_e64, f16>;
+defm : FCMP_Pattern <COND_UNE, V_CMP_NEQ_F16_e64, f16>;
+defm : FCMP_Pattern <COND_UGT, V_CMP_NLE_F16_e64, f16>;
+defm : FCMP_Pattern <COND_UGE, V_CMP_NLT_F16_e64, f16>;
+defm : FCMP_Pattern <COND_ULT, V_CMP_NGE_F16_e64, f16>;
+defm : FCMP_Pattern <COND_ULE, V_CMP_NGT_F16_e64, f16>;
//===----------------------------------------------------------------------===//
-// Target
+// Target-specific instruction encodings.
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
-// SI
+// GFX10.
//===----------------------------------------------------------------------===//
-multiclass VOPC_Real_si <bits<9> op> {
- let AssemblerPredicates = [isSICI], DecoderNamespace = "SICI" in {
- def _e32_si :
- VOPC_Real<!cast<VOPC_Pseudo>(NAME#"_e32"), SIEncodingFamily.SI>,
- VOPCe<op{7-0}>;
-
- def _e64_si :
- VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>,
- VOP3a_si <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> {
- // Encoding used for VOPC instructions encoded as VOP3
- // Differs from VOP3e by destination name (sdst) as VOPC doesn't have vector dst
- bits<8> sdst;
- let Inst{7-0} = sdst;
- }
+let AssemblerPredicate = isGFX10Plus in {
+ multiclass VOPC_Real_gfx10<bits<9> op> {
+ let DecoderNamespace = "GFX10" in {
+ def _e32_gfx10 :
+ VOPC_Real<!cast<VOPC_Pseudo>(NAME#"_e32"), SIEncodingFamily.GFX10>,
+ VOPCe<op{7-0}>;
+ def _e64_gfx10 :
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX10>,
+ VOP3a_gfx10<{0, op}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> {
+ // Encoding used for VOPC instructions encoded as VOP3 differs from
+ // VOP3e by destination name (sdst) as VOPC doesn't have vector dst.
+ bits<8> sdst;
+ let Inst{7-0} = sdst;
+ }
+ } // End DecoderNamespace = "GFX10"
+
+ def _sdwa_gfx10 :
+ VOP_SDWA10_Real<!cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa")>,
+ VOPC_SDWA9e<op{7-0}, !cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
+
+ defm : VOPCInstAliases<NAME, "gfx10">;
}
- def : VOPCInstAlias <!cast<VOP3_Pseudo>(NAME#"_e64"),
- !cast<Instruction>(NAME#"_e32_si")> {
- let AssemblerPredicate = isSICI;
+
+ multiclass VOPCX_Real_gfx10<bits<9> op> {
+ let DecoderNamespace = "GFX10" in {
+ def _e32_gfx10 :
+ VOPC_Real<!cast<VOPC_Pseudo>(NAME#"_nosdst_e32"), SIEncodingFamily.GFX10>,
+ VOPCe<op{7-0}> {
+ let AsmString = !subst("_nosdst", "", !cast<VOPC_Pseudo>(NAME#"_nosdst_e32").PseudoInstr)
+ # " " # !cast<VOPC_Pseudo>(NAME#"_nosdst_e32").AsmOperands;
+ }
+
+ def _e64_gfx10 :
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_nosdst_e64"), SIEncodingFamily.GFX10>,
+ VOP3a_gfx10<{0, op}, !cast<VOP3_Pseudo>(NAME#"_nosdst_e64").Pfl> {
+ let Inst{7-0} = ?; // sdst
+ let AsmString = !subst("_nosdst", "", !cast<VOP3_Pseudo>(NAME#"_nosdst_e64").Mnemonic)
+ # "{_e64} " # !cast<VOP3_Pseudo>(NAME#"_nosdst_e64").AsmOperands;
+ }
+ } // End DecoderNamespace = "GFX10"
+
+ def _sdwa_gfx10 :
+ VOP_SDWA10_Real<!cast<VOPC_SDWA_Pseudo>(NAME#"_nosdst_sdwa")>,
+ VOPC_SDWA9e<op{7-0}, !cast<VOPC_SDWA_Pseudo>(NAME#"_nosdst_sdwa").Pfl> {
+ let AsmString = !subst("_nosdst", "", !cast<VOPC_SDWA_Pseudo>(NAME#"_nosdst_sdwa").Mnemonic)
+ # "{_sdwa} " # !cast<VOPC_SDWA_Pseudo>(NAME#"_nosdst_sdwa").AsmOperands9;
+ }
+
+ defm : VOPCXInstAliases<NAME, "gfx10">;
}
-}
+} // End AssemblerPredicate = isGFX10Plus
+
+defm V_CMP_LT_I16 : VOPC_Real_gfx10<0x089>;
+defm V_CMP_EQ_I16 : VOPC_Real_gfx10<0x08a>;
+defm V_CMP_LE_I16 : VOPC_Real_gfx10<0x08b>;
+defm V_CMP_GT_I16 : VOPC_Real_gfx10<0x08c>;
+defm V_CMP_NE_I16 : VOPC_Real_gfx10<0x08d>;
+defm V_CMP_GE_I16 : VOPC_Real_gfx10<0x08e>;
+defm V_CMP_CLASS_F16 : VOPC_Real_gfx10<0x08f>;
+defm V_CMPX_LT_I16 : VOPCX_Real_gfx10<0x099>;
+defm V_CMPX_EQ_I16 : VOPCX_Real_gfx10<0x09a>;
+defm V_CMPX_LE_I16 : VOPCX_Real_gfx10<0x09b>;
+defm V_CMPX_GT_I16 : VOPCX_Real_gfx10<0x09c>;
+defm V_CMPX_NE_I16 : VOPCX_Real_gfx10<0x09d>;
+defm V_CMPX_GE_I16 : VOPCX_Real_gfx10<0x09e>;
+defm V_CMPX_CLASS_F16 : VOPCX_Real_gfx10<0x09f>;
+defm V_CMP_LT_U16 : VOPC_Real_gfx10<0x0a9>;
+defm V_CMP_EQ_U16 : VOPC_Real_gfx10<0x0aa>;
+defm V_CMP_LE_U16 : VOPC_Real_gfx10<0x0ab>;
+defm V_CMP_GT_U16 : VOPC_Real_gfx10<0x0ac>;
+defm V_CMP_NE_U16 : VOPC_Real_gfx10<0x0ad>;
+defm V_CMP_GE_U16 : VOPC_Real_gfx10<0x0ae>;
+defm V_CMPX_LT_U16 : VOPCX_Real_gfx10<0x0b9>;
+defm V_CMPX_EQ_U16 : VOPCX_Real_gfx10<0x0ba>;
+defm V_CMPX_LE_U16 : VOPCX_Real_gfx10<0x0bb>;
+defm V_CMPX_GT_U16 : VOPCX_Real_gfx10<0x0bc>;
+defm V_CMPX_NE_U16 : VOPCX_Real_gfx10<0x0bd>;
+defm V_CMPX_GE_U16 : VOPCX_Real_gfx10<0x0be>;
+defm V_CMP_F_F16 : VOPC_Real_gfx10<0x0c8>;
+defm V_CMP_LT_F16 : VOPC_Real_gfx10<0x0c9>;
+defm V_CMP_EQ_F16 : VOPC_Real_gfx10<0x0ca>;
+defm V_CMP_LE_F16 : VOPC_Real_gfx10<0x0cb>;
+defm V_CMP_GT_F16 : VOPC_Real_gfx10<0x0cc>;
+defm V_CMP_LG_F16 : VOPC_Real_gfx10<0x0cd>;
+defm V_CMP_GE_F16 : VOPC_Real_gfx10<0x0ce>;
+defm V_CMP_O_F16 : VOPC_Real_gfx10<0x0cf>;
+defm V_CMPX_F_F16 : VOPCX_Real_gfx10<0x0d8>;
+defm V_CMPX_LT_F16 : VOPCX_Real_gfx10<0x0d9>;
+defm V_CMPX_EQ_F16 : VOPCX_Real_gfx10<0x0da>;
+defm V_CMPX_LE_F16 : VOPCX_Real_gfx10<0x0db>;
+defm V_CMPX_GT_F16 : VOPCX_Real_gfx10<0x0dc>;
+defm V_CMPX_LG_F16 : VOPCX_Real_gfx10<0x0dd>;
+defm V_CMPX_GE_F16 : VOPCX_Real_gfx10<0x0de>;
+defm V_CMPX_O_F16 : VOPCX_Real_gfx10<0x0df>;
+defm V_CMP_U_F16 : VOPC_Real_gfx10<0x0e8>;
+defm V_CMP_NGE_F16 : VOPC_Real_gfx10<0x0e9>;
+defm V_CMP_NLG_F16 : VOPC_Real_gfx10<0x0ea>;
+defm V_CMP_NGT_F16 : VOPC_Real_gfx10<0x0eb>;
+defm V_CMP_NLE_F16 : VOPC_Real_gfx10<0x0ec>;
+defm V_CMP_NEQ_F16 : VOPC_Real_gfx10<0x0ed>;
+defm V_CMP_NLT_F16 : VOPC_Real_gfx10<0x0ee>;
+defm V_CMP_TRU_F16 : VOPC_Real_gfx10<0x0ef>;
+defm V_CMPX_U_F16 : VOPCX_Real_gfx10<0x0f8>;
+defm V_CMPX_NGE_F16 : VOPCX_Real_gfx10<0x0f9>;
+defm V_CMPX_NLG_F16 : VOPCX_Real_gfx10<0x0fa>;
+defm V_CMPX_NGT_F16 : VOPCX_Real_gfx10<0x0fb>;
+defm V_CMPX_NLE_F16 : VOPCX_Real_gfx10<0x0fc>;
+defm V_CMPX_NEQ_F16 : VOPCX_Real_gfx10<0x0fd>;
+defm V_CMPX_NLT_F16 : VOPCX_Real_gfx10<0x0fe>;
+defm V_CMPX_TRU_F16 : VOPCX_Real_gfx10<0x0ff>;
-defm V_CMP_F_F32 : VOPC_Real_si <0x0>;
-defm V_CMP_LT_F32 : VOPC_Real_si <0x1>;
-defm V_CMP_EQ_F32 : VOPC_Real_si <0x2>;
-defm V_CMP_LE_F32 : VOPC_Real_si <0x3>;
-defm V_CMP_GT_F32 : VOPC_Real_si <0x4>;
-defm V_CMP_LG_F32 : VOPC_Real_si <0x5>;
-defm V_CMP_GE_F32 : VOPC_Real_si <0x6>;
-defm V_CMP_O_F32 : VOPC_Real_si <0x7>;
-defm V_CMP_U_F32 : VOPC_Real_si <0x8>;
-defm V_CMP_NGE_F32 : VOPC_Real_si <0x9>;
-defm V_CMP_NLG_F32 : VOPC_Real_si <0xa>;
-defm V_CMP_NGT_F32 : VOPC_Real_si <0xb>;
-defm V_CMP_NLE_F32 : VOPC_Real_si <0xc>;
-defm V_CMP_NEQ_F32 : VOPC_Real_si <0xd>;
-defm V_CMP_NLT_F32 : VOPC_Real_si <0xe>;
-defm V_CMP_TRU_F32 : VOPC_Real_si <0xf>;
-
-defm V_CMPX_F_F32 : VOPC_Real_si <0x10>;
-defm V_CMPX_LT_F32 : VOPC_Real_si <0x11>;
-defm V_CMPX_EQ_F32 : VOPC_Real_si <0x12>;
-defm V_CMPX_LE_F32 : VOPC_Real_si <0x13>;
-defm V_CMPX_GT_F32 : VOPC_Real_si <0x14>;
-defm V_CMPX_LG_F32 : VOPC_Real_si <0x15>;
-defm V_CMPX_GE_F32 : VOPC_Real_si <0x16>;
-defm V_CMPX_O_F32 : VOPC_Real_si <0x17>;
-defm V_CMPX_U_F32 : VOPC_Real_si <0x18>;
-defm V_CMPX_NGE_F32 : VOPC_Real_si <0x19>;
-defm V_CMPX_NLG_F32 : VOPC_Real_si <0x1a>;
-defm V_CMPX_NGT_F32 : VOPC_Real_si <0x1b>;
-defm V_CMPX_NLE_F32 : VOPC_Real_si <0x1c>;
-defm V_CMPX_NEQ_F32 : VOPC_Real_si <0x1d>;
-defm V_CMPX_NLT_F32 : VOPC_Real_si <0x1e>;
-defm V_CMPX_TRU_F32 : VOPC_Real_si <0x1f>;
-
-defm V_CMP_F_F64 : VOPC_Real_si <0x20>;
-defm V_CMP_LT_F64 : VOPC_Real_si <0x21>;
-defm V_CMP_EQ_F64 : VOPC_Real_si <0x22>;
-defm V_CMP_LE_F64 : VOPC_Real_si <0x23>;
-defm V_CMP_GT_F64 : VOPC_Real_si <0x24>;
-defm V_CMP_LG_F64 : VOPC_Real_si <0x25>;
-defm V_CMP_GE_F64 : VOPC_Real_si <0x26>;
-defm V_CMP_O_F64 : VOPC_Real_si <0x27>;
-defm V_CMP_U_F64 : VOPC_Real_si <0x28>;
-defm V_CMP_NGE_F64 : VOPC_Real_si <0x29>;
-defm V_CMP_NLG_F64 : VOPC_Real_si <0x2a>;
-defm V_CMP_NGT_F64 : VOPC_Real_si <0x2b>;
-defm V_CMP_NLE_F64 : VOPC_Real_si <0x2c>;
-defm V_CMP_NEQ_F64 : VOPC_Real_si <0x2d>;
-defm V_CMP_NLT_F64 : VOPC_Real_si <0x2e>;
-defm V_CMP_TRU_F64 : VOPC_Real_si <0x2f>;
-
-defm V_CMPX_F_F64 : VOPC_Real_si <0x30>;
-defm V_CMPX_LT_F64 : VOPC_Real_si <0x31>;
-defm V_CMPX_EQ_F64 : VOPC_Real_si <0x32>;
-defm V_CMPX_LE_F64 : VOPC_Real_si <0x33>;
-defm V_CMPX_GT_F64 : VOPC_Real_si <0x34>;
-defm V_CMPX_LG_F64 : VOPC_Real_si <0x35>;
-defm V_CMPX_GE_F64 : VOPC_Real_si <0x36>;
-defm V_CMPX_O_F64 : VOPC_Real_si <0x37>;
-defm V_CMPX_U_F64 : VOPC_Real_si <0x38>;
-defm V_CMPX_NGE_F64 : VOPC_Real_si <0x39>;
-defm V_CMPX_NLG_F64 : VOPC_Real_si <0x3a>;
-defm V_CMPX_NGT_F64 : VOPC_Real_si <0x3b>;
-defm V_CMPX_NLE_F64 : VOPC_Real_si <0x3c>;
-defm V_CMPX_NEQ_F64 : VOPC_Real_si <0x3d>;
-defm V_CMPX_NLT_F64 : VOPC_Real_si <0x3e>;
-defm V_CMPX_TRU_F64 : VOPC_Real_si <0x3f>;
-
-defm V_CMPS_F_F32 : VOPC_Real_si <0x40>;
-defm V_CMPS_LT_F32 : VOPC_Real_si <0x41>;
-defm V_CMPS_EQ_F32 : VOPC_Real_si <0x42>;
-defm V_CMPS_LE_F32 : VOPC_Real_si <0x43>;
-defm V_CMPS_GT_F32 : VOPC_Real_si <0x44>;
-defm V_CMPS_LG_F32 : VOPC_Real_si <0x45>;
-defm V_CMPS_GE_F32 : VOPC_Real_si <0x46>;
-defm V_CMPS_O_F32 : VOPC_Real_si <0x47>;
-defm V_CMPS_U_F32 : VOPC_Real_si <0x48>;
-defm V_CMPS_NGE_F32 : VOPC_Real_si <0x49>;
-defm V_CMPS_NLG_F32 : VOPC_Real_si <0x4a>;
-defm V_CMPS_NGT_F32 : VOPC_Real_si <0x4b>;
-defm V_CMPS_NLE_F32 : VOPC_Real_si <0x4c>;
-defm V_CMPS_NEQ_F32 : VOPC_Real_si <0x4d>;
-defm V_CMPS_NLT_F32 : VOPC_Real_si <0x4e>;
-defm V_CMPS_TRU_F32 : VOPC_Real_si <0x4f>;
-
-defm V_CMPSX_F_F32 : VOPC_Real_si <0x50>;
-defm V_CMPSX_LT_F32 : VOPC_Real_si <0x51>;
-defm V_CMPSX_EQ_F32 : VOPC_Real_si <0x52>;
-defm V_CMPSX_LE_F32 : VOPC_Real_si <0x53>;
-defm V_CMPSX_GT_F32 : VOPC_Real_si <0x54>;
-defm V_CMPSX_LG_F32 : VOPC_Real_si <0x55>;
-defm V_CMPSX_GE_F32 : VOPC_Real_si <0x56>;
-defm V_CMPSX_O_F32 : VOPC_Real_si <0x57>;
-defm V_CMPSX_U_F32 : VOPC_Real_si <0x58>;
-defm V_CMPSX_NGE_F32 : VOPC_Real_si <0x59>;
-defm V_CMPSX_NLG_F32 : VOPC_Real_si <0x5a>;
-defm V_CMPSX_NGT_F32 : VOPC_Real_si <0x5b>;
-defm V_CMPSX_NLE_F32 : VOPC_Real_si <0x5c>;
-defm V_CMPSX_NEQ_F32 : VOPC_Real_si <0x5d>;
-defm V_CMPSX_NLT_F32 : VOPC_Real_si <0x5e>;
-defm V_CMPSX_TRU_F32 : VOPC_Real_si <0x5f>;
-
-defm V_CMPS_F_F64 : VOPC_Real_si <0x60>;
-defm V_CMPS_LT_F64 : VOPC_Real_si <0x61>;
-defm V_CMPS_EQ_F64 : VOPC_Real_si <0x62>;
-defm V_CMPS_LE_F64 : VOPC_Real_si <0x63>;
-defm V_CMPS_GT_F64 : VOPC_Real_si <0x64>;
-defm V_CMPS_LG_F64 : VOPC_Real_si <0x65>;
-defm V_CMPS_GE_F64 : VOPC_Real_si <0x66>;
-defm V_CMPS_O_F64 : VOPC_Real_si <0x67>;
-defm V_CMPS_U_F64 : VOPC_Real_si <0x68>;
-defm V_CMPS_NGE_F64 : VOPC_Real_si <0x69>;
-defm V_CMPS_NLG_F64 : VOPC_Real_si <0x6a>;
-defm V_CMPS_NGT_F64 : VOPC_Real_si <0x6b>;
-defm V_CMPS_NLE_F64 : VOPC_Real_si <0x6c>;
-defm V_CMPS_NEQ_F64 : VOPC_Real_si <0x6d>;
-defm V_CMPS_NLT_F64 : VOPC_Real_si <0x6e>;
-defm V_CMPS_TRU_F64 : VOPC_Real_si <0x6f>;
-
-defm V_CMPSX_F_F64 : VOPC_Real_si <0x70>;
-defm V_CMPSX_LT_F64 : VOPC_Real_si <0x71>;
-defm V_CMPSX_EQ_F64 : VOPC_Real_si <0x72>;
-defm V_CMPSX_LE_F64 : VOPC_Real_si <0x73>;
-defm V_CMPSX_GT_F64 : VOPC_Real_si <0x74>;
-defm V_CMPSX_LG_F64 : VOPC_Real_si <0x75>;
-defm V_CMPSX_GE_F64 : VOPC_Real_si <0x76>;
-defm V_CMPSX_O_F64 : VOPC_Real_si <0x77>;
-defm V_CMPSX_U_F64 : VOPC_Real_si <0x78>;
-defm V_CMPSX_NGE_F64 : VOPC_Real_si <0x79>;
-defm V_CMPSX_NLG_F64 : VOPC_Real_si <0x7a>;
-defm V_CMPSX_NGT_F64 : VOPC_Real_si <0x7b>;
-defm V_CMPSX_NLE_F64 : VOPC_Real_si <0x7c>;
-defm V_CMPSX_NEQ_F64 : VOPC_Real_si <0x7d>;
-defm V_CMPSX_NLT_F64 : VOPC_Real_si <0x7e>;
-defm V_CMPSX_TRU_F64 : VOPC_Real_si <0x7f>;
-
-defm V_CMP_F_I32 : VOPC_Real_si <0x80>;
-defm V_CMP_LT_I32 : VOPC_Real_si <0x81>;
-defm V_CMP_EQ_I32 : VOPC_Real_si <0x82>;
-defm V_CMP_LE_I32 : VOPC_Real_si <0x83>;
-defm V_CMP_GT_I32 : VOPC_Real_si <0x84>;
-defm V_CMP_NE_I32 : VOPC_Real_si <0x85>;
-defm V_CMP_GE_I32 : VOPC_Real_si <0x86>;
-defm V_CMP_T_I32 : VOPC_Real_si <0x87>;
-
-defm V_CMPX_F_I32 : VOPC_Real_si <0x90>;
-defm V_CMPX_LT_I32 : VOPC_Real_si <0x91>;
-defm V_CMPX_EQ_I32 : VOPC_Real_si <0x92>;
-defm V_CMPX_LE_I32 : VOPC_Real_si <0x93>;
-defm V_CMPX_GT_I32 : VOPC_Real_si <0x94>;
-defm V_CMPX_NE_I32 : VOPC_Real_si <0x95>;
-defm V_CMPX_GE_I32 : VOPC_Real_si <0x96>;
-defm V_CMPX_T_I32 : VOPC_Real_si <0x97>;
-
-defm V_CMP_F_I64 : VOPC_Real_si <0xa0>;
-defm V_CMP_LT_I64 : VOPC_Real_si <0xa1>;
-defm V_CMP_EQ_I64 : VOPC_Real_si <0xa2>;
-defm V_CMP_LE_I64 : VOPC_Real_si <0xa3>;
-defm V_CMP_GT_I64 : VOPC_Real_si <0xa4>;
-defm V_CMP_NE_I64 : VOPC_Real_si <0xa5>;
-defm V_CMP_GE_I64 : VOPC_Real_si <0xa6>;
-defm V_CMP_T_I64 : VOPC_Real_si <0xa7>;
-
-defm V_CMPX_F_I64 : VOPC_Real_si <0xb0>;
-defm V_CMPX_LT_I64 : VOPC_Real_si <0xb1>;
-defm V_CMPX_EQ_I64 : VOPC_Real_si <0xb2>;
-defm V_CMPX_LE_I64 : VOPC_Real_si <0xb3>;
-defm V_CMPX_GT_I64 : VOPC_Real_si <0xb4>;
-defm V_CMPX_NE_I64 : VOPC_Real_si <0xb5>;
-defm V_CMPX_GE_I64 : VOPC_Real_si <0xb6>;
-defm V_CMPX_T_I64 : VOPC_Real_si <0xb7>;
-
-defm V_CMP_F_U32 : VOPC_Real_si <0xc0>;
-defm V_CMP_LT_U32 : VOPC_Real_si <0xc1>;
-defm V_CMP_EQ_U32 : VOPC_Real_si <0xc2>;
-defm V_CMP_LE_U32 : VOPC_Real_si <0xc3>;
-defm V_CMP_GT_U32 : VOPC_Real_si <0xc4>;
-defm V_CMP_NE_U32 : VOPC_Real_si <0xc5>;
-defm V_CMP_GE_U32 : VOPC_Real_si <0xc6>;
-defm V_CMP_T_U32 : VOPC_Real_si <0xc7>;
-
-defm V_CMPX_F_U32 : VOPC_Real_si <0xd0>;
-defm V_CMPX_LT_U32 : VOPC_Real_si <0xd1>;
-defm V_CMPX_EQ_U32 : VOPC_Real_si <0xd2>;
-defm V_CMPX_LE_U32 : VOPC_Real_si <0xd3>;
-defm V_CMPX_GT_U32 : VOPC_Real_si <0xd4>;
-defm V_CMPX_NE_U32 : VOPC_Real_si <0xd5>;
-defm V_CMPX_GE_U32 : VOPC_Real_si <0xd6>;
-defm V_CMPX_T_U32 : VOPC_Real_si <0xd7>;
-
-defm V_CMP_F_U64 : VOPC_Real_si <0xe0>;
-defm V_CMP_LT_U64 : VOPC_Real_si <0xe1>;
-defm V_CMP_EQ_U64 : VOPC_Real_si <0xe2>;
-defm V_CMP_LE_U64 : VOPC_Real_si <0xe3>;
-defm V_CMP_GT_U64 : VOPC_Real_si <0xe4>;
-defm V_CMP_NE_U64 : VOPC_Real_si <0xe5>;
-defm V_CMP_GE_U64 : VOPC_Real_si <0xe6>;
-defm V_CMP_T_U64 : VOPC_Real_si <0xe7>;
-
-defm V_CMPX_F_U64 : VOPC_Real_si <0xf0>;
-defm V_CMPX_LT_U64 : VOPC_Real_si <0xf1>;
-defm V_CMPX_EQ_U64 : VOPC_Real_si <0xf2>;
-defm V_CMPX_LE_U64 : VOPC_Real_si <0xf3>;
-defm V_CMPX_GT_U64 : VOPC_Real_si <0xf4>;
-defm V_CMPX_NE_U64 : VOPC_Real_si <0xf5>;
-defm V_CMPX_GE_U64 : VOPC_Real_si <0xf6>;
-defm V_CMPX_T_U64 : VOPC_Real_si <0xf7>;
-
-defm V_CMP_CLASS_F32 : VOPC_Real_si <0x88>;
-defm V_CMPX_CLASS_F32 : VOPC_Real_si <0x98>;
-defm V_CMP_CLASS_F64 : VOPC_Real_si <0xa8>;
-defm V_CMPX_CLASS_F64 : VOPC_Real_si <0xb8>;
+//===----------------------------------------------------------------------===//
+// GFX6, GFX7, GFX10.
+//===----------------------------------------------------------------------===//
+
+let AssemblerPredicate = isGFX6GFX7 in {
+ multiclass VOPC_Real_gfx6_gfx7<bits<9> op> {
+ let DecoderNamespace = "GFX6GFX7" in {
+ def _e32_gfx6_gfx7 :
+ VOPC_Real<!cast<VOPC_Pseudo>(NAME#"_e32"), SIEncodingFamily.SI>,
+ VOPCe<op{7-0}>;
+ def _e64_gfx6_gfx7 :
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>,
+ VOP3a_gfx6_gfx7<op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> {
+ // Encoding used for VOPC instructions encoded as VOP3 differs from
+ // VOP3e by destination name (sdst) as VOPC doesn't have vector dst.
+ bits<8> sdst;
+ let Inst{7-0} = sdst;
+ }
+ } // End DecoderNamespace = "GFX6GFX7"
+
+ defm : VOPCInstAliases<NAME, "gfx6_gfx7">;
+ }
+} // End AssemblerPredicate = isGFX6GFX7
+
+multiclass VOPC_Real_gfx6_gfx7_gfx10<bits<9> op> :
+ VOPC_Real_gfx6_gfx7<op>, VOPC_Real_gfx10<op>;
+
+multiclass VOPCX_Real_gfx6_gfx7<bits<9> op> :
+ VOPC_Real_gfx6_gfx7<op>;
+
+multiclass VOPCX_Real_gfx6_gfx7_gfx10 <bits<9> op> :
+ VOPC_Real_gfx6_gfx7<op>, VOPCX_Real_gfx10<op>;
+
+defm V_CMP_F_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x000>;
+defm V_CMP_LT_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x001>;
+defm V_CMP_EQ_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x002>;
+defm V_CMP_LE_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x003>;
+defm V_CMP_GT_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x004>;
+defm V_CMP_LG_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x005>;
+defm V_CMP_GE_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x006>;
+defm V_CMP_O_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x007>;
+defm V_CMP_U_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x008>;
+defm V_CMP_NGE_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x009>;
+defm V_CMP_NLG_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x00a>;
+defm V_CMP_NGT_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x00b>;
+defm V_CMP_NLE_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x00c>;
+defm V_CMP_NEQ_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x00d>;
+defm V_CMP_NLT_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x00e>;
+defm V_CMP_TRU_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x00f>;
+defm V_CMPX_F_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x010>;
+defm V_CMPX_LT_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x011>;
+defm V_CMPX_EQ_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x012>;
+defm V_CMPX_LE_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x013>;
+defm V_CMPX_GT_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x014>;
+defm V_CMPX_LG_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x015>;
+defm V_CMPX_GE_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x016>;
+defm V_CMPX_O_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x017>;
+defm V_CMPX_U_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x018>;
+defm V_CMPX_NGE_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x019>;
+defm V_CMPX_NLG_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x01a>;
+defm V_CMPX_NGT_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x01b>;
+defm V_CMPX_NLE_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x01c>;
+defm V_CMPX_NEQ_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x01d>;
+defm V_CMPX_NLT_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x01e>;
+defm V_CMPX_TRU_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x01f>;
+defm V_CMP_F_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x020>;
+defm V_CMP_LT_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x021>;
+defm V_CMP_EQ_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x022>;
+defm V_CMP_LE_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x023>;
+defm V_CMP_GT_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x024>;
+defm V_CMP_LG_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x025>;
+defm V_CMP_GE_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x026>;
+defm V_CMP_O_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x027>;
+defm V_CMP_U_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x028>;
+defm V_CMP_NGE_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x029>;
+defm V_CMP_NLG_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x02a>;
+defm V_CMP_NGT_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x02b>;
+defm V_CMP_NLE_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x02c>;
+defm V_CMP_NEQ_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x02d>;
+defm V_CMP_NLT_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x02e>;
+defm V_CMP_TRU_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x02f>;
+defm V_CMPX_F_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x030>;
+defm V_CMPX_LT_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x031>;
+defm V_CMPX_EQ_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x032>;
+defm V_CMPX_LE_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x033>;
+defm V_CMPX_GT_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x034>;
+defm V_CMPX_LG_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x035>;
+defm V_CMPX_GE_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x036>;
+defm V_CMPX_O_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x037>;
+defm V_CMPX_U_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x038>;
+defm V_CMPX_NGE_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x039>;
+defm V_CMPX_NLG_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x03a>;
+defm V_CMPX_NGT_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x03b>;
+defm V_CMPX_NLE_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x03c>;
+defm V_CMPX_NEQ_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x03d>;
+defm V_CMPX_NLT_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x03e>;
+defm V_CMPX_TRU_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x03f>;
+defm V_CMPS_F_F32 : VOPC_Real_gfx6_gfx7<0x040>;
+defm V_CMPS_LT_F32 : VOPC_Real_gfx6_gfx7<0x041>;
+defm V_CMPS_EQ_F32 : VOPC_Real_gfx6_gfx7<0x042>;
+defm V_CMPS_LE_F32 : VOPC_Real_gfx6_gfx7<0x043>;
+defm V_CMPS_GT_F32 : VOPC_Real_gfx6_gfx7<0x044>;
+defm V_CMPS_LG_F32 : VOPC_Real_gfx6_gfx7<0x045>;
+defm V_CMPS_GE_F32 : VOPC_Real_gfx6_gfx7<0x046>;
+defm V_CMPS_O_F32 : VOPC_Real_gfx6_gfx7<0x047>;
+defm V_CMPS_U_F32 : VOPC_Real_gfx6_gfx7<0x048>;
+defm V_CMPS_NGE_F32 : VOPC_Real_gfx6_gfx7<0x049>;
+defm V_CMPS_NLG_F32 : VOPC_Real_gfx6_gfx7<0x04a>;
+defm V_CMPS_NGT_F32 : VOPC_Real_gfx6_gfx7<0x04b>;
+defm V_CMPS_NLE_F32 : VOPC_Real_gfx6_gfx7<0x04c>;
+defm V_CMPS_NEQ_F32 : VOPC_Real_gfx6_gfx7<0x04d>;
+defm V_CMPS_NLT_F32 : VOPC_Real_gfx6_gfx7<0x04e>;
+defm V_CMPS_TRU_F32 : VOPC_Real_gfx6_gfx7<0x04f>;
+defm V_CMPSX_F_F32 : VOPCX_Real_gfx6_gfx7<0x050>;
+defm V_CMPSX_LT_F32 : VOPCX_Real_gfx6_gfx7<0x051>;
+defm V_CMPSX_EQ_F32 : VOPCX_Real_gfx6_gfx7<0x052>;
+defm V_CMPSX_LE_F32 : VOPCX_Real_gfx6_gfx7<0x053>;
+defm V_CMPSX_GT_F32 : VOPCX_Real_gfx6_gfx7<0x054>;
+defm V_CMPSX_LG_F32 : VOPCX_Real_gfx6_gfx7<0x055>;
+defm V_CMPSX_GE_F32 : VOPCX_Real_gfx6_gfx7<0x056>;
+defm V_CMPSX_O_F32 : VOPCX_Real_gfx6_gfx7<0x057>;
+defm V_CMPSX_U_F32 : VOPCX_Real_gfx6_gfx7<0x058>;
+defm V_CMPSX_NGE_F32 : VOPCX_Real_gfx6_gfx7<0x059>;
+defm V_CMPSX_NLG_F32 : VOPCX_Real_gfx6_gfx7<0x05a>;
+defm V_CMPSX_NGT_F32 : VOPCX_Real_gfx6_gfx7<0x05b>;
+defm V_CMPSX_NLE_F32 : VOPCX_Real_gfx6_gfx7<0x05c>;
+defm V_CMPSX_NEQ_F32 : VOPCX_Real_gfx6_gfx7<0x05d>;
+defm V_CMPSX_NLT_F32 : VOPCX_Real_gfx6_gfx7<0x05e>;
+defm V_CMPSX_TRU_F32 : VOPCX_Real_gfx6_gfx7<0x05f>;
+defm V_CMPS_F_F64 : VOPC_Real_gfx6_gfx7<0x060>;
+defm V_CMPS_LT_F64 : VOPC_Real_gfx6_gfx7<0x061>;
+defm V_CMPS_EQ_F64 : VOPC_Real_gfx6_gfx7<0x062>;
+defm V_CMPS_LE_F64 : VOPC_Real_gfx6_gfx7<0x063>;
+defm V_CMPS_GT_F64 : VOPC_Real_gfx6_gfx7<0x064>;
+defm V_CMPS_LG_F64 : VOPC_Real_gfx6_gfx7<0x065>;
+defm V_CMPS_GE_F64 : VOPC_Real_gfx6_gfx7<0x066>;
+defm V_CMPS_O_F64 : VOPC_Real_gfx6_gfx7<0x067>;
+defm V_CMPS_U_F64 : VOPC_Real_gfx6_gfx7<0x068>;
+defm V_CMPS_NGE_F64 : VOPC_Real_gfx6_gfx7<0x069>;
+defm V_CMPS_NLG_F64 : VOPC_Real_gfx6_gfx7<0x06a>;
+defm V_CMPS_NGT_F64 : VOPC_Real_gfx6_gfx7<0x06b>;
+defm V_CMPS_NLE_F64 : VOPC_Real_gfx6_gfx7<0x06c>;
+defm V_CMPS_NEQ_F64 : VOPC_Real_gfx6_gfx7<0x06d>;
+defm V_CMPS_NLT_F64 : VOPC_Real_gfx6_gfx7<0x06e>;
+defm V_CMPS_TRU_F64 : VOPC_Real_gfx6_gfx7<0x06f>;
+defm V_CMPSX_F_F64 : VOPCX_Real_gfx6_gfx7<0x070>;
+defm V_CMPSX_LT_F64 : VOPCX_Real_gfx6_gfx7<0x071>;
+defm V_CMPSX_EQ_F64 : VOPCX_Real_gfx6_gfx7<0x072>;
+defm V_CMPSX_LE_F64 : VOPCX_Real_gfx6_gfx7<0x073>;
+defm V_CMPSX_GT_F64 : VOPCX_Real_gfx6_gfx7<0x074>;
+defm V_CMPSX_LG_F64 : VOPCX_Real_gfx6_gfx7<0x075>;
+defm V_CMPSX_GE_F64 : VOPCX_Real_gfx6_gfx7<0x076>;
+defm V_CMPSX_O_F64 : VOPCX_Real_gfx6_gfx7<0x077>;
+defm V_CMPSX_U_F64 : VOPCX_Real_gfx6_gfx7<0x078>;
+defm V_CMPSX_NGE_F64 : VOPCX_Real_gfx6_gfx7<0x079>;
+defm V_CMPSX_NLG_F64 : VOPCX_Real_gfx6_gfx7<0x07a>;
+defm V_CMPSX_NGT_F64 : VOPCX_Real_gfx6_gfx7<0x07b>;
+defm V_CMPSX_NLE_F64 : VOPCX_Real_gfx6_gfx7<0x07c>;
+defm V_CMPSX_NEQ_F64 : VOPCX_Real_gfx6_gfx7<0x07d>;
+defm V_CMPSX_NLT_F64 : VOPCX_Real_gfx6_gfx7<0x07e>;
+defm V_CMPSX_TRU_F64 : VOPCX_Real_gfx6_gfx7<0x07f>;
+defm V_CMP_F_I32 : VOPC_Real_gfx6_gfx7_gfx10<0x080>;
+defm V_CMP_LT_I32 : VOPC_Real_gfx6_gfx7_gfx10<0x081>;
+defm V_CMP_EQ_I32 : VOPC_Real_gfx6_gfx7_gfx10<0x082>;
+defm V_CMP_LE_I32 : VOPC_Real_gfx6_gfx7_gfx10<0x083>;
+defm V_CMP_GT_I32 : VOPC_Real_gfx6_gfx7_gfx10<0x084>;
+defm V_CMP_NE_I32 : VOPC_Real_gfx6_gfx7_gfx10<0x085>;
+defm V_CMP_GE_I32 : VOPC_Real_gfx6_gfx7_gfx10<0x086>;
+defm V_CMP_T_I32 : VOPC_Real_gfx6_gfx7_gfx10<0x087>;
+defm V_CMP_CLASS_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x088>;
+defm V_CMPX_F_I32 : VOPCX_Real_gfx6_gfx7_gfx10<0x090>;
+defm V_CMPX_LT_I32 : VOPCX_Real_gfx6_gfx7_gfx10<0x091>;
+defm V_CMPX_EQ_I32 : VOPCX_Real_gfx6_gfx7_gfx10<0x092>;
+defm V_CMPX_LE_I32 : VOPCX_Real_gfx6_gfx7_gfx10<0x093>;
+defm V_CMPX_GT_I32 : VOPCX_Real_gfx6_gfx7_gfx10<0x094>;
+defm V_CMPX_NE_I32 : VOPCX_Real_gfx6_gfx7_gfx10<0x095>;
+defm V_CMPX_GE_I32 : VOPCX_Real_gfx6_gfx7_gfx10<0x096>;
+defm V_CMPX_T_I32 : VOPCX_Real_gfx6_gfx7_gfx10<0x097>;
+defm V_CMPX_CLASS_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x098>;
+defm V_CMP_F_I64 : VOPC_Real_gfx6_gfx7_gfx10<0x0a0>;
+defm V_CMP_LT_I64 : VOPC_Real_gfx6_gfx7_gfx10<0x0a1>;
+defm V_CMP_EQ_I64 : VOPC_Real_gfx6_gfx7_gfx10<0x0a2>;
+defm V_CMP_LE_I64 : VOPC_Real_gfx6_gfx7_gfx10<0x0a3>;
+defm V_CMP_GT_I64 : VOPC_Real_gfx6_gfx7_gfx10<0x0a4>;
+defm V_CMP_NE_I64 : VOPC_Real_gfx6_gfx7_gfx10<0x0a5>;
+defm V_CMP_GE_I64 : VOPC_Real_gfx6_gfx7_gfx10<0x0a6>;
+defm V_CMP_T_I64 : VOPC_Real_gfx6_gfx7_gfx10<0x0a7>;
+defm V_CMP_CLASS_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x0a8>;
+defm V_CMPX_F_I64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0b0>;
+defm V_CMPX_LT_I64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0b1>;
+defm V_CMPX_EQ_I64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0b2>;
+defm V_CMPX_LE_I64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0b3>;
+defm V_CMPX_GT_I64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0b4>;
+defm V_CMPX_NE_I64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0b5>;
+defm V_CMPX_GE_I64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0b6>;
+defm V_CMPX_T_I64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0b7>;
+defm V_CMPX_CLASS_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0b8>;
+defm V_CMP_F_U32 : VOPC_Real_gfx6_gfx7_gfx10<0x0c0>;
+defm V_CMP_LT_U32 : VOPC_Real_gfx6_gfx7_gfx10<0x0c1>;
+defm V_CMP_EQ_U32 : VOPC_Real_gfx6_gfx7_gfx10<0x0c2>;
+defm V_CMP_LE_U32 : VOPC_Real_gfx6_gfx7_gfx10<0x0c3>;
+defm V_CMP_GT_U32 : VOPC_Real_gfx6_gfx7_gfx10<0x0c4>;
+defm V_CMP_NE_U32 : VOPC_Real_gfx6_gfx7_gfx10<0x0c5>;
+defm V_CMP_GE_U32 : VOPC_Real_gfx6_gfx7_gfx10<0x0c6>;
+defm V_CMP_T_U32 : VOPC_Real_gfx6_gfx7_gfx10<0x0c7>;
+defm V_CMPX_F_U32 : VOPCX_Real_gfx6_gfx7_gfx10<0x0d0>;
+defm V_CMPX_LT_U32 : VOPCX_Real_gfx6_gfx7_gfx10<0x0d1>;
+defm V_CMPX_EQ_U32 : VOPCX_Real_gfx6_gfx7_gfx10<0x0d2>;
+defm V_CMPX_LE_U32 : VOPCX_Real_gfx6_gfx7_gfx10<0x0d3>;
+defm V_CMPX_GT_U32 : VOPCX_Real_gfx6_gfx7_gfx10<0x0d4>;
+defm V_CMPX_NE_U32 : VOPCX_Real_gfx6_gfx7_gfx10<0x0d5>;
+defm V_CMPX_GE_U32 : VOPCX_Real_gfx6_gfx7_gfx10<0x0d6>;
+defm V_CMPX_T_U32 : VOPCX_Real_gfx6_gfx7_gfx10<0x0d7>;
+defm V_CMP_F_U64 : VOPC_Real_gfx6_gfx7_gfx10<0x0e0>;
+defm V_CMP_LT_U64 : VOPC_Real_gfx6_gfx7_gfx10<0x0e1>;
+defm V_CMP_EQ_U64 : VOPC_Real_gfx6_gfx7_gfx10<0x0e2>;
+defm V_CMP_LE_U64 : VOPC_Real_gfx6_gfx7_gfx10<0x0e3>;
+defm V_CMP_GT_U64 : VOPC_Real_gfx6_gfx7_gfx10<0x0e4>;
+defm V_CMP_NE_U64 : VOPC_Real_gfx6_gfx7_gfx10<0x0e5>;
+defm V_CMP_GE_U64 : VOPC_Real_gfx6_gfx7_gfx10<0x0e6>;
+defm V_CMP_T_U64 : VOPC_Real_gfx6_gfx7_gfx10<0x0e7>;
+defm V_CMPX_F_U64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0f0>;
+defm V_CMPX_LT_U64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0f1>;
+defm V_CMPX_EQ_U64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0f2>;
+defm V_CMPX_LE_U64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0f3>;
+defm V_CMPX_GT_U64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0f4>;
+defm V_CMPX_NE_U64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0f5>;
+defm V_CMPX_GE_U64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0f6>;
+defm V_CMPX_T_U64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0f7>;
//===----------------------------------------------------------------------===//
-// VI
+// GFX8, GFX9 (VI).
//===----------------------------------------------------------------------===//
multiclass VOPC_Real_vi <bits<10> op> {
- let AssemblerPredicates = [isVI], DecoderNamespace = "VI" in {
+ let AssemblerPredicates = [isGFX8GFX9], DecoderNamespace = "GFX8" in {
def _e32_vi :
VOPC_Real<!cast<VOPC_Pseudo>(NAME#"_e32"), SIEncodingFamily.VI>,
VOPCe<op{7-0}>;
@@ -966,9 +1231,8 @@ multiclass VOPC_Real_vi <bits<10> op> {
VOP_SDWA9_Real <!cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa")>,
VOPC_SDWA9e <op{7-0}, !cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
- def : VOPCInstAlias <!cast<VOP3_Pseudo>(NAME#"_e64"),
- !cast<Instruction>(NAME#"_e32_vi")> {
- let AssemblerPredicate = isVI;
+ let AssemblerPredicate = isGFX8GFX9 in {
+ defm : VOPCInstAliases<NAME, "vi">;
}
}
diff --git a/lib/Target/AMDGPU/VOPInstructions.td b/lib/Target/AMDGPU/VOPInstructions.td
index 7de7d90d27b3..677095a354be 100644
--- a/lib/Target/AMDGPU/VOPInstructions.td
+++ b/lib/Target/AMDGPU/VOPInstructions.td
@@ -1,9 +1,8 @@
//===-- VOPInstructions.td - Vector Instruction Definitions ---------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -91,6 +90,7 @@ class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [],
let VOP3_OPSEL = isVop3OpSel;
let IsPacked = P.IsPacked;
+ let IsMAI = P.IsMAI;
let AsmOperands = !if(isVop3OpSel,
P.AsmVOP3OpSel,
@@ -100,7 +100,6 @@ class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [],
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
- let SubtargetPredicate = isGCN;
// Because SGPRs may be allowed if there are multiple operands, we
// need a post-isel hook to insert copies in order to avoid
@@ -190,9 +189,15 @@ class VOP3a<VOPProfile P> : Enc64 {
let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0);
}
-class VOP3a_si <bits<9> op, VOPProfile P> : VOP3a<P> {
+class VOP3a_gfx6_gfx7<bits<9> op, VOPProfile p> : VOP3a<p> {
+ let Inst{11} = !if(p.HasClamp, clamp{0}, 0);
let Inst{25-17} = op;
- let Inst{11} = !if(P.HasClamp, clamp{0}, 0);
+}
+
+class VOP3a_gfx10<bits<10> op, VOPProfile p> : VOP3a<p> {
+ let Inst{15} = !if(p.HasClamp, clamp{0}, 0);
+ let Inst{25-16} = op;
+ let Inst{31-26} = 0x35;
}
class VOP3a_vi <bits<10> op, VOPProfile P> : VOP3a<P> {
@@ -200,9 +205,14 @@ class VOP3a_vi <bits<10> op, VOPProfile P> : VOP3a<P> {
let Inst{15} = !if(P.HasClamp, clamp{0}, 0);
}
-class VOP3e_si <bits<9> op, VOPProfile P> : VOP3a_si <op, P> {
+class VOP3e_gfx6_gfx7<bits<9> op, VOPProfile p> : VOP3a_gfx6_gfx7<op, p> {
bits<8> vdst;
- let Inst{7-0} = !if(P.EmitDst, vdst{7-0}, 0);
+ let Inst{7-0} = !if(p.EmitDst, vdst{7-0}, 0);
+}
+
+class VOP3e_gfx10<bits<10> op, VOPProfile p> : VOP3a_gfx10<op, p> {
+ bits<8> vdst;
+ let Inst{7-0} = !if(p.EmitDst, vdst{7-0}, 0);
}
class VOP3e_vi <bits<10> op, VOPProfile P> : VOP3a_vi <op, P> {
@@ -217,6 +227,13 @@ class VOP3OpSel_gfx9 <bits<10> op, VOPProfile P> : VOP3e_vi <op, P> {
let Inst{14} = !if(P.HasDst, src0_modifiers{3}, 0);
}
+class VOP3OpSel_gfx10<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> {
+ let Inst{11} = !if(p.HasSrc0, src0_modifiers{2}, 0);
+ let Inst{12} = !if(p.HasSrc1, src1_modifiers{2}, 0);
+ let Inst{13} = !if(p.HasSrc2, src2_modifiers{2}, 0);
+ let Inst{14} = !if(p.HasDst, src0_modifiers{3}, 0);
+}
+
// NB: For V_INTERP* opcodes, src0 is encoded as src1 and vice versa
class VOP3Interp_vi <bits<10> op, VOPProfile P> : VOP3e_vi <op, P> {
bits<2> attrchan;
@@ -236,6 +253,21 @@ class VOP3Interp_vi <bits<10> op, VOPProfile P> : VOP3e_vi <op, P> {
let Inst{49-41} = src0;
}
+class VOP3Interp_gfx10<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> {
+ bits<6> attr;
+ bits<2> attrchan;
+ bits<1> high;
+
+ let Inst{8} = 0;
+ let Inst{9} = !if(p.HasSrc0Mods, src0_modifiers{1}, 0);
+ let Inst{37-32} = attr;
+ let Inst{39-38} = attrchan;
+ let Inst{40} = !if(p.HasHigh, high, 0);
+ let Inst{49-41} = src0;
+ let Inst{61} = 0;
+ let Inst{62} = !if(p.HasSrc0Mods, src0_modifiers{0}, 0);
+}
+
class VOP3be <VOPProfile P> : Enc64 {
bits<8> vdst;
bits<2> src0_modifiers;
@@ -295,10 +327,51 @@ class VOP3Pe <bits<10> op, VOPProfile P> : Enc64 {
let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0); // neg (lo)
}
-class VOP3be_si <bits<9> op, VOPProfile P> : VOP3be<P> {
+class VOP3Pe_MAI <bits<10> op, VOPProfile P> : Enc64 {
+ bits<8> vdst;
+ bits<10> src0;
+ bits<10> src1;
+ bits<9> src2;
+ bits<3> blgp;
+ bits<3> cbsz;
+ bits<4> abid;
+ bits<1> clamp;
+
+ let Inst{7-0} = vdst;
+
+ let Inst{10-8} = !if(P.HasSrc1, cbsz, 0);
+ let Inst{14-11} = !if(P.HasSrc1, abid, 0);
+
+ let Inst{15} = !if(P.HasClamp, clamp{0}, 0);
+
+ let Inst{25-16} = op;
+ let Inst{31-26} = 0x34; //encoding
+ let Inst{40-32} = !if(P.HasSrc0, src0{8-0}, 0);
+ let Inst{49-41} = !if(P.HasSrc1, src1{8-0}, 0);
+ let Inst{58-50} = !if(P.HasSrc2, src2, 0);
+
+ let Inst{59} = !if(P.HasSrc0, src0{9}, 0); // acc(0)
+ let Inst{60} = !if(P.HasSrc1, src1{9}, 0); // acc(1)
+
+ let Inst{63-61} = !if(P.HasSrc1, blgp, 0);
+}
+
+
+class VOP3Pe_gfx10 <bits<10> op, VOPProfile P> : VOP3Pe<op, P> {
+ let Inst{31-26} = 0x33; //encoding
+}
+
+class VOP3be_gfx6_gfx7<bits<9> op, VOPProfile p> : VOP3be<p> {
let Inst{25-17} = op;
}
+class VOP3be_gfx10<bits<10> op, VOPProfile p> : VOP3be<p> {
+ bits<1> clamp;
+ let Inst{15} = !if(p.HasClamp, clamp{0}, 0);
+ let Inst{25-16} = op;
+ let Inst{31-26} = 0x35;
+}
+
class VOP3be_vi <bits<10> op, VOPProfile P> : VOP3be<P> {
bits<1> clamp;
let Inst{25-16} = op;
@@ -393,7 +466,7 @@ class VOP_SDWA9Ae<VOPProfile P> : VOP_SDWA9e<P> {
class VOP_SDWA9Be<VOPProfile P> : VOP_SDWA9e<P> {
bits<8> sdst; // {vcc_sdst{0}, sdst{6-0}}
- let Inst{46-40} = !if(P.EmitDst, sdst{6-0}, 0);
+ let Inst{46-40} = !if(P.EmitDst, sdst{6-0}, ?);
let Inst{47} = !if(P.EmitDst, sdst{7}, 0);
}
@@ -456,9 +529,8 @@ class VOP_SDWA_Real <VOP_SDWA_Pseudo ps> :
let TSFlags = ps.TSFlags;
}
-class VOP_SDWA9_Real <VOP_SDWA_Pseudo ps> :
- InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands9, []>,
- SIMCInstr <ps.PseudoInstr, SIEncodingFamily.SDWA9> {
+class Base_VOP_SDWA9_Real <VOP_SDWA_Pseudo ps> :
+ InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands9, []> {
let isPseudo = 0;
let isCodeGenOnly = 0;
@@ -485,7 +557,20 @@ class VOP_SDWA9_Real <VOP_SDWA_Pseudo ps> :
let TSFlags = ps.TSFlags;
}
-class VOP_DPPe<VOPProfile P> : Enc64 {
+class VOP_SDWA9_Real <VOP_SDWA_Pseudo ps> :
+ Base_VOP_SDWA9_Real <ps >,
+ SIMCInstr <ps.PseudoInstr, SIEncodingFamily.SDWA9>;
+
+class Base_VOP_SDWA10_Real<VOP_SDWA_Pseudo ps> : Base_VOP_SDWA9_Real<ps> {
+ let SubtargetPredicate = !if(ps.Pfl.HasExtSDWA9, HasSDWA10, DisableInst);
+ let AssemblerPredicate = !if(ps.Pfl.HasExtSDWA9, HasSDWA10, DisableInst);
+ let DecoderNamespace = "SDWA10";
+}
+
+class VOP_SDWA10_Real<VOP_SDWA_Pseudo ps> :
+ Base_VOP_SDWA10_Real<ps>, SIMCInstr<ps.PseudoInstr, SIEncodingFamily.SDWA10>;
+
+class VOP_DPPe<VOPProfile P, bit IsDPP16=0> : Enc64 {
bits<2> src0_modifiers;
bits<8> src0;
bits<2> src1_modifiers;
@@ -493,9 +578,11 @@ class VOP_DPPe<VOPProfile P> : Enc64 {
bits<1> bound_ctrl;
bits<4> bank_mask;
bits<4> row_mask;
+ bit fi;
let Inst{39-32} = !if(P.HasSrc0, src0{7-0}, 0);
let Inst{48-40} = dpp_ctrl;
+ let Inst{50} = !if(IsDPP16, fi, ?);
let Inst{51} = bound_ctrl;
let Inst{52} = !if(P.HasSrc0Mods, src0_modifiers{0}, 0); // src0_neg
let Inst{53} = !if(P.HasSrc0Mods, src0_modifiers{1}, 0); // src0_abs
@@ -533,8 +620,8 @@ class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
let AssemblerPredicate = !if(P.HasExtDPP, HasDPP, DisableInst);
let AsmVariantName = !if(P.HasExtDPP, AMDGPUAsmVariants.DPP,
AMDGPUAsmVariants.Disable);
- let Constraints = !if(P.NumSrcArgs, "$old = $vdst", "");
- let DisableEncoding = !if(P.NumSrcArgs, "$old", "");
+ let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $vdst", "");
+ let DisableEncoding = !if(P.NumSrcArgs, P.TieRegDPP, "");
let DecoderNamespace = "DPP";
VOPProfile Pfl = P;
@@ -568,6 +655,67 @@ class VOP_DPP_Real <VOP_DPP_Pseudo ps, int EncodingFamily> :
let TSFlags = ps.TSFlags;
}
+class VOP_DPP <string OpName, VOPProfile P, bit IsDPP16,
+ dag InsDPP = !if(IsDPP16, P.InsDPP16, P.InsDPP),
+ string AsmDPP = !if(IsDPP16, P.AsmDPP16, P.AsmDPP)> :
+ InstSI <P.OutsDPP, InsDPP, OpName#AsmDPP, []>,
+ VOP_DPPe<P, IsDPP16> {
+
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let UseNamedOperandTable = 1;
+
+ let VALU = 1;
+ let DPP = 1;
+ let Size = 8;
+
+ let AsmMatchConverter = !if(!eq(P.HasModifiers,1), "cvtDPP", "");
+ let SubtargetPredicate = HasDPP;
+ let AssemblerPredicate = !if(P.HasExtDPP, HasDPP, DisableInst);
+ let AsmVariantName = !if(P.HasExtDPP, AMDGPUAsmVariants.DPP,
+ AMDGPUAsmVariants.Disable);
+ let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $vdst", "");
+ let DisableEncoding = !if(P.NumSrcArgs, P.TieRegDPP, "");
+ let DecoderNamespace = "DPP";
+}
+
+class VOP_DPP8e<VOPProfile P> : Enc64 {
+ bits<8> src0;
+ bits<24> dpp8;
+ bits<9> fi;
+
+ let Inst{39-32} = !if(P.HasSrc0, src0{7-0}, 0);
+ let Inst{63-40} = dpp8{23-0};
+}
+
+class VOP_DPP8<string OpName, VOPProfile P> :
+ InstSI<P.OutsDPP8, P.InsDPP8, OpName#P.AsmDPP8, []>,
+ VOP_DPP8e<P> {
+
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let UseNamedOperandTable = 1;
+
+ let VALU = 1;
+ let DPP = 1;
+ let Size = 8;
+
+ let AsmMatchConverter = "cvtDPP8";
+ let SubtargetPredicate = HasDPP8;
+ let AssemblerPredicate = !if(P.HasExt, HasDPP8, DisableInst);
+ let AsmVariantName = !if(P.HasExt, AMDGPUAsmVariants.DPP,
+ AMDGPUAsmVariants.Disable);
+ let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $vdst", "");
+ let DisableEncoding = !if(P.NumSrcArgs, P.TieRegDPP, "");
+}
+
+def DPP8Mode {
+ int FI_0 = 0xE9;
+ int FI_1 = 0xEA;
+}
+
class getNumNodeArgs<SDPatternOperator Op> {
SDNode N = !cast<SDNode>(Op);
SDTypeProfile TP = N.TypeProfile;