author     Dimitry Andric <dim@FreeBSD.org>    2019-08-20 20:50:12 +0000
committer  Dimitry Andric <dim@FreeBSD.org>    2019-08-20 20:50:12 +0000
commit     e6d1592492a3a379186bfb02bd0f4eda0669c0d5
tree       599ab169a01f1c86eda9adc774edaedde2f2db5b /lib/Target/AMDGPU
parent     1a56a5ead7a2e84bee8240f5f6b033b5f1707154
Diffstat (limited to 'lib/Target/AMDGPU')
212 files changed, 29044 insertions, 9242 deletions
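One change below is easy to miss in the raw diff: AMDGPU.h raises MAX_AMDGPU_ADDRESS from 6 to 7 and adds the 160-bit BUFFER_FAT_POINTER address space, which is why the alias-rule table in the AMDGPUAliasAnalysis.cpp hunks further down grows from 7x7 to 8x8. The following is a minimal standalone sketch of that table lookup, not code from the commit; it is abbreviated to the first three address spaces, with the enum range check and the out-of-range MayAlias fallback mirroring the patch.

    // Sketch only: an abbreviated 3x3 slice of the patch's 8x8 alias-rule
    // table. Address spaces index the table directly; anything above
    // MAX_AMDGPU_ADDRESS is conservatively MayAlias.
    #include <cassert>

    enum AliasResult { NoAlias, MayAlias };

    constexpr unsigned MAX_AMDGPU_ADDRESS = 7; // raised from 6 by this commit

    // Rows/columns: FLAT_ADDRESS (0), GLOBAL_ADDRESS (1), REGION_ADDRESS (2).
    // Entries match the updated table, where flat no longer aliases region.
    static const AliasResult Rules[3][3] = {
        /* Flat   */ {MayAlias, MayAlias, NoAlias},
        /* Global */ {MayAlias, MayAlias, NoAlias},
        /* Region */ {NoAlias,  NoAlias,  MayAlias},
    };

    AliasResult getAliasResult(unsigned AS1, unsigned AS2) {
      if (AS1 > MAX_AMDGPU_ADDRESS || AS2 > MAX_AMDGPU_ADDRESS)
        return MayAlias; // unknown address spaces: assume the worst
      assert(AS1 < 3 && AS2 < 3 && "sketch covers only the first three spaces");
      return Rules[AS1][AS2];
    }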
diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h index bb7801c172f6..19a8bd901629 100644 --- a/lib/Target/AMDGPU/AMDGPU.h +++ b/lib/Target/AMDGPU/AMDGPU.h @@ -1,9 +1,8 @@ //===-- AMDGPU.h - MachineFunction passes hw codegen --------------*- C++ -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // /// \file //===----------------------------------------------------------------------===// @@ -51,14 +50,16 @@ FunctionPass *createSIFixControlFlowLiveIntervalsPass(); FunctionPass *createSIOptimizeExecMaskingPreRAPass(); FunctionPass *createSIFixSGPRCopiesPass(); FunctionPass *createSIMemoryLegalizerPass(); -FunctionPass *createSIDebuggerInsertNopsPass(); FunctionPass *createSIInsertWaitcntsPass(); -FunctionPass *createSIFixWWMLivenessPass(); +FunctionPass *createSIPreAllocateWWMRegsPass(); FunctionPass *createSIFormMemoryClausesPass(); -FunctionPass *createAMDGPUSimplifyLibCallsPass(const TargetOptions &); +FunctionPass *createAMDGPUSimplifyLibCallsPass(const TargetOptions &, + const TargetMachine *); FunctionPass *createAMDGPUUseNativeCallsPass(); FunctionPass *createAMDGPUCodeGenPreparePass(); FunctionPass *createAMDGPUMachineCFGStructurizerPass(); +FunctionPass *createAMDGPUPropagateAttributesEarlyPass(const TargetMachine *); +ModulePass *createAMDGPUPropagateAttributesLatePass(const TargetMachine *); FunctionPass *createAMDGPURewriteOutArgumentsPass(); FunctionPass *createSIModeRegisterPass(); @@ -93,6 +94,12 @@ ModulePass *createAMDGPULowerKernelAttributesPass(); void initializeAMDGPULowerKernelAttributesPass(PassRegistry &); extern char &AMDGPULowerKernelAttributesID; +void initializeAMDGPUPropagateAttributesEarlyPass(PassRegistry &); +extern char &AMDGPUPropagateAttributesEarlyID; + +void initializeAMDGPUPropagateAttributesLatePass(PassRegistry &); +extern char &AMDGPUPropagateAttributesLateID; + void initializeAMDGPURewriteOutArgumentsPass(PassRegistry &); extern char &AMDGPURewriteOutArgumentsID; @@ -135,6 +142,9 @@ extern char &SIFixupVectorISelID; void initializeSILowerI1CopiesPass(PassRegistry &); extern char &SILowerI1CopiesID; +void initializeSILowerSGPRSpillsPass(PassRegistry &); +extern char &SILowerSGPRSpillsID; + void initializeSILoadStoreOptimizerPass(PassRegistry &); extern char &SILoadStoreOptimizerID; @@ -150,8 +160,8 @@ extern char &SIInsertSkipsPassID; void initializeSIOptimizeExecMaskingPass(PassRegistry &); extern char &SIOptimizeExecMaskingID; -void initializeSIFixWWMLivenessPass(PassRegistry &); -extern char &SIFixWWMLivenessID; +void initializeSIPreAllocateWWMRegsPass(PassRegistry &); +extern char &SIPreAllocateWWMRegsID; void initializeAMDGPUSimplifyLibCallsPass(PassRegistry &); extern char &AMDGPUSimplifyLibCallsID; @@ -197,9 +207,6 @@ extern char &SIAnnotateControlFlowPassID; void initializeSIMemoryLegalizerPass(PassRegistry&); extern char &SIMemoryLegalizerID; -void initializeSIDebuggerInsertNopsPass(PassRegistry&); -extern char &SIDebuggerInsertNopsID; - void initializeSIModeRegisterPass(PassRegistry&); extern char &SIModeRegisterID; @@ -226,8 +233,11 @@ ModulePass *createAMDGPUOpenCLEnqueuedBlockLoweringPass(); void initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(PassRegistry &); extern char 
&AMDGPUOpenCLEnqueuedBlockLoweringID; -Target &getTheAMDGPUTarget(); -Target &getTheGCNTarget(); +void initializeGCNRegBankReassignPass(PassRegistry &); +extern char &GCNRegBankReassignID; + +void initializeGCNNSAReassignPass(PassRegistry &); +extern char &GCNNSAReassignID; namespace AMDGPU { enum TargetIndex { @@ -250,21 +260,23 @@ enum TargetIndex { namespace AMDGPUAS { enum : unsigned { // The maximum value for flat, generic, local, private, constant and region. - MAX_AMDGPU_ADDRESS = 6, + MAX_AMDGPU_ADDRESS = 7, FLAT_ADDRESS = 0, ///< Address space for flat memory. GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0). - REGION_ADDRESS = 2, ///< Address space for region memory. + REGION_ADDRESS = 2, ///< Address space for region memory. (GDS) - CONSTANT_ADDRESS = 4, ///< Address space for constant memory (VTX2) + CONSTANT_ADDRESS = 4, ///< Address space for constant memory (VTX2). LOCAL_ADDRESS = 3, ///< Address space for local memory. PRIVATE_ADDRESS = 5, ///< Address space for private memory. - CONSTANT_ADDRESS_32BIT = 6, ///< Address space for 32-bit constant memory + CONSTANT_ADDRESS_32BIT = 6, ///< Address space for 32-bit constant memory. + + BUFFER_FAT_POINTER = 7, ///< Address space for 160-bit buffer fat pointers. - /// Address space for direct addressible parameter memory (CONST0) + /// Address space for direct addressible parameter memory (CONST0). PARAM_D_ADDRESS = 6, - /// Address space for indirect addressible parameter memory (VTX1) + /// Address space for indirect addressible parameter memory (VTX1). PARAM_I_ADDRESS = 7, // Do not re-order the CONSTANT_BUFFER_* enums. Several places depend on diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td index 6a4cfe08e491..baeba534012c 100644 --- a/lib/Target/AMDGPU/AMDGPU.td +++ b/lib/Target/AMDGPU/AMDGPU.td @@ -1,9 +1,8 @@ //===-- AMDGPU.td - AMDGPU Tablegen files --------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===------------------------------------------------------------===// @@ -61,6 +60,12 @@ def FeatureFlatScratchInsts : SubtargetFeature<"flat-scratch-insts", "Have scratch_* flat memory instructions" >; +def FeatureScalarFlatScratchInsts : SubtargetFeature<"scalar-flat-scratch-insts", + "ScalarFlatScratchInsts", + "true", + "Have s_scratch_* flat memory instructions" +>; + def FeatureAddNoCarryInsts : SubtargetFeature<"add-no-carry-insts", "AddNoCarryInsts", "true", @@ -103,6 +108,12 @@ def FeatureFmaMixInsts : SubtargetFeature<"fma-mix-insts", "Has v_fma_mix_f32, v_fma_mixlo_f16, v_fma_mixhi_f16 instructions" >; +def FeatureDoesNotSupportXNACK : SubtargetFeature<"no-xnack-support", + "DoesNotSupportXNACK", + "true", + "Hardware does not support XNACK" +>; + // XNACK is disabled if SH_MEM_CONFIG.ADDRESS_MODE = GPUVM on chips that support // XNACK. 
The current default kernel driver setting is: // - graphics ring: XNACK disabled @@ -116,12 +127,78 @@ def FeatureXNACK : SubtargetFeature<"xnack", "Enable XNACK support" >; +def FeatureCuMode : SubtargetFeature<"cumode", + "EnableCuMode", + "true", + "Enable CU wavefront execution mode" +>; + def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug", "SGPRInitBug", "true", "VI SGPR initialization bug requiring a fixed SGPR allocation size" >; +def FeatureLdsMisalignedBug : SubtargetFeature<"lds-misaligned-bug", + "LDSMisalignedBug", + "true", + "Some GFX10 bug with misaligned multi-dword LDS access in WGP mode" +>; + +def FeatureVcmpxPermlaneHazard : SubtargetFeature<"vcmpx-permlane-hazard", + "HasVcmpxPermlaneHazard", + "true", + "TODO: describe me" +>; + +def FeatureVMEMtoScalarWriteHazard : SubtargetFeature<"vmem-to-scalar-write-hazard", + "HasVMEMtoScalarWriteHazard", + "true", + "VMEM instruction followed by scalar writing to EXEC mask, M0 or SGPR leads to incorrect execution." +>; + +def FeatureSMEMtoVectorWriteHazard : SubtargetFeature<"smem-to-vector-write-hazard", + "HasSMEMtoVectorWriteHazard", + "true", + "s_load_dword followed by v_cmp page faults" +>; + +def FeatureInstFwdPrefetchBug : SubtargetFeature<"inst-fwd-prefetch-bug", + "HasInstFwdPrefetchBug", + "true", + "S_INST_PREFETCH instruction causes shader to hang" +>; + +def FeatureVcmpxExecWARHazard : SubtargetFeature<"vcmpx-exec-war-hazard", + "HasVcmpxExecWARHazard", + "true", + "V_CMPX WAR hazard on EXEC (V_CMPX issue ONLY)" +>; + +def FeatureLdsBranchVmemWARHazard : SubtargetFeature<"lds-branch-vmem-war-hazard", + "HasLdsBranchVmemWARHazard", + "true", + "Switching between LDS and VMEM-tex not waiting VM_VSRC=0" +>; + +def FeatureNSAtoVMEMBug : SubtargetFeature<"nsa-to-vmem-bug", + "HasNSAtoVMEMBug", + "true", + "MIMG-NSA followed by VMEM fail if EXEC_LO or EXEC_HI equals zero" +>; + +def FeatureFlatSegmentOffsetBug : SubtargetFeature<"flat-segment-offset-bug", + "HasFlatSegmentOffsetBug", + "true", + "GFX10 bug, inst_offset ignored in flat segment" +>; + +def FeatureOffset3fBug : SubtargetFeature<"offset-3f-bug", + "HasOffset3fBug", + "true", + "Branch offset of 3f hardware bug" +>; + class SubtargetFeatureLDSBankCount <int Value> : SubtargetFeature < "ldsbankcount"#Value, "LDSBankCount", @@ -144,10 +221,10 @@ def FeatureCIInsts : SubtargetFeature<"ci-insts", "Additional instructions for CI+" >; -def FeatureVIInsts : SubtargetFeature<"vi-insts", - "VIInsts", +def FeatureGFX8Insts : SubtargetFeature<"gfx8-insts", + "GFX8Insts", "true", - "Additional instructions for VI+" + "Additional instructions for GFX8+" >; def FeatureGFX9Insts : SubtargetFeature<"gfx9-insts", @@ -156,6 +233,18 @@ def FeatureGFX9Insts : SubtargetFeature<"gfx9-insts", "Additional instructions for GFX9+" >; +def FeatureGFX10Insts : SubtargetFeature<"gfx10-insts", + "GFX10Insts", + "true", + "Additional instructions for GFX10+" +>; + +def FeatureGFX7GFX8GFX9Insts : SubtargetFeature<"gfx7-gfx8-gfx9-insts", + "GFX7GFX8GFX9Insts", + "true", + "Instructions shared in GFX7, GFX8, GFX9" +>; + def FeatureSMemRealTime : SubtargetFeature<"s-memrealtime", "HasSMemRealTime", "true", @@ -246,12 +335,25 @@ def FeatureDPP : SubtargetFeature<"dpp", "Support DPP (Data Parallel Primitives) extension" >; +// DPP8 allows arbitrary cross-lane swizzling withing groups of 8 lanes. 
+def FeatureDPP8 : SubtargetFeature<"dpp8", + "HasDPP8", + "true", + "Support DPP8 (Data Parallel Primitives) extension" +>; + def FeatureR128A16 : SubtargetFeature<"r128-a16", "HasR128A16", "true", "Support 16 bit coordindates/gradients/lod/clamp/mip types on gfx9" >; +def FeatureNSAEncoding : SubtargetFeature<"nsa-encoding", + "HasNSAEncoding", + "true", + "Support NSA encoding for image instructions" +>; + def FeatureIntClamp : SubtargetFeature<"int-clamp-insts", "HasIntClamp", "true", @@ -270,10 +372,65 @@ def FeatureDLInsts : SubtargetFeature<"dl-insts", "Has v_fmac_f32 and v_xnor_b32 instructions" >; -def FeatureDotInsts : SubtargetFeature<"dot-insts", - "HasDotInsts", +def FeatureDot1Insts : SubtargetFeature<"dot1-insts", + "HasDot1Insts", + "true", + "Has v_dot4_i32_i8 and v_dot8_i32_i4 instructions" +>; + +def FeatureDot2Insts : SubtargetFeature<"dot2-insts", + "HasDot2Insts", + "true", + "Has v_dot2_f32_f16, v_dot2_i32_i16, v_dot2_u32_u16, v_dot4_u32_u8, v_dot8_u32_u4 instructions" +>; + +def FeatureDot3Insts : SubtargetFeature<"dot3-insts", + "HasDot3Insts", + "true", + "Has v_dot8c_i32_i4 instruction" +>; + +def FeatureDot4Insts : SubtargetFeature<"dot4-insts", + "HasDot4Insts", + "true", + "Has v_dot2c_i32_i16 instruction" +>; + +def FeatureDot5Insts : SubtargetFeature<"dot5-insts", + "HasDot5Insts", "true", - "Has v_dot* instructions" + "Has v_dot2c_f32_f16 instruction" +>; + +def FeatureDot6Insts : SubtargetFeature<"dot6-insts", + "HasDot6Insts", + "true", + "Has v_dot4c_i32_i8 instruction" +>; + +def FeatureMAIInsts : SubtargetFeature<"mai-insts", + "HasMAIInsts", + "true", + "Has mAI instructions" +>; + +def FeaturePkFmacF16Inst : SubtargetFeature<"pk-fmac-f16-inst", + "HasPkFmacF16Inst", + "true", + "Has v_pk_fmac_f16 instruction" +>; + +def FeatureAtomicFaddInsts : SubtargetFeature<"atomic-fadd-insts", + "HasAtomicFaddInsts", + "true", + "Has buffer_atomic_add_f32, buffer_atomic_pk_add_f16, global_atomic_add_f32, " + "global_atomic_pk_add_f16 instructions" +>; + +def FeatureDoesNotSupportSRAMECC : SubtargetFeature<"no-sram-ecc-support", + "DoesNotSupportSRAMECC", + "true", + "Hardware does not support SRAM ECC" >; def FeatureSRAMECC : SubtargetFeature<"sram-ecc", @@ -282,6 +439,36 @@ def FeatureSRAMECC : SubtargetFeature<"sram-ecc", "Enable SRAM ECC" >; +def FeatureNoSdstCMPX : SubtargetFeature<"no-sdst-cmpx", + "HasNoSdstCMPX", + "true", + "V_CMPX does not write VCC/SGPR in addition to EXEC" +>; + +def FeatureVscnt : SubtargetFeature<"vscnt", + "HasVscnt", + "true", + "Has separate store vscnt counter" +>; + +def FeatureRegisterBanking : SubtargetFeature<"register-banking", + "HasRegisterBanking", + "true", + "Has register banking" +>; + +def FeatureVOP3Literal : SubtargetFeature<"vop3-literal", + "HasVOP3Literal", + "true", + "Can use one literal in VOP3" +>; + +def FeatureNoDataDepHazard : SubtargetFeature<"no-data-dep-hazard", + "HasNoDataDepHazard", + "true", + "Does not need SW waitstates" +>; + //===------------------------------------------------------------===// // Subtarget Features (options and debugging) //===------------------------------------------------------------===// @@ -327,13 +514,6 @@ def FeatureMaxPrivateElementSize4 : FeatureMaxPrivateElementSize<4>; def FeatureMaxPrivateElementSize8 : FeatureMaxPrivateElementSize<8>; def FeatureMaxPrivateElementSize16 : FeatureMaxPrivateElementSize<16>; -def FeatureEnableHugePrivateBuffer : SubtargetFeature< - "huge-private-buffer", - "EnableHugePrivateBuffer", - "true", - "Enable private/scratch buffer sizes 
greater than 128 GB" ->; - def FeatureDumpCode : SubtargetFeature <"DumpCode", "DumpCode", "true", @@ -425,103 +605,123 @@ def FeatureDisable : SubtargetFeature<"", "Dummy feature to disable assembler instructions" >; -def FeatureGCN : SubtargetFeature<"gcn", - "IsGCN", - "true", - "GCN or newer GPU" ->; - class GCNSubtargetFeatureGeneration <string Value, - list<SubtargetFeature> Implies> : - SubtargetFeatureGeneration <Value, "GCNSubtarget", Implies>; + string FeatureName, + list<SubtargetFeature> Implies> : + SubtargetFeatureGeneration <Value, FeatureName, "GCNSubtarget", Implies>; def FeatureSouthernIslands : GCNSubtargetFeatureGeneration<"SOUTHERN_ISLANDS", + "southern-islands", [FeatureFP64, FeatureLocalMemorySize32768, FeatureMIMG_R128, - FeatureWavefrontSize64, FeatureGCN, - FeatureLDSBankCount32, FeatureMovrel, FeatureTrigReducedRange] + FeatureWavefrontSize64, + FeatureLDSBankCount32, FeatureMovrel, FeatureTrigReducedRange, + FeatureDoesNotSupportSRAMECC, FeatureDoesNotSupportXNACK] >; def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS", + "sea-islands", [FeatureFP64, FeatureLocalMemorySize65536, FeatureMIMG_R128, - FeatureWavefrontSize64, FeatureGCN, FeatureFlatAddressSpace, - FeatureCIInsts, FeatureMovrel, FeatureTrigReducedRange] + FeatureWavefrontSize64, FeatureFlatAddressSpace, + FeatureCIInsts, FeatureMovrel, FeatureTrigReducedRange, + FeatureGFX7GFX8GFX9Insts, FeatureDoesNotSupportSRAMECC] >; def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS", + "volcanic-islands", [FeatureFP64, FeatureLocalMemorySize65536, FeatureMIMG_R128, - FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN, - FeatureGCN3Encoding, FeatureCIInsts, FeatureVIInsts, Feature16BitInsts, + FeatureWavefrontSize64, FeatureFlatAddressSpace, + FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts, FeatureSMemRealTime, FeatureVGPRIndexMode, FeatureMovrel, FeatureScalarStores, FeatureInv2PiInlineImm, FeatureSDWA, FeatureSDWAOutModsVOPC, FeatureSDWAMac, FeatureDPP, - FeatureIntClamp, FeatureTrigReducedRange + FeatureIntClamp, FeatureTrigReducedRange, FeatureDoesNotSupportSRAMECC, + FeatureGFX8Insts, FeatureGFX7GFX8GFX9Insts ] >; def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9", + "gfx9", [FeatureFP64, FeatureLocalMemorySize65536, - FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN, - FeatureGCN3Encoding, FeatureCIInsts, FeatureVIInsts, Feature16BitInsts, + FeatureWavefrontSize64, FeatureFlatAddressSpace, + FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts, FeatureSMemRealTime, FeatureScalarStores, FeatureInv2PiInlineImm, FeatureApertureRegs, FeatureGFX9Insts, FeatureVOP3P, FeatureVGPRIndexMode, FeatureFastFMAF32, FeatureDPP, FeatureIntClamp, FeatureSDWA, FeatureSDWAOmod, FeatureSDWAScalar, FeatureSDWASdst, FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts, - FeatureAddNoCarryInsts, FeatureScalarAtomics, FeatureR128A16 + FeatureAddNoCarryInsts, FeatureGFX8Insts, FeatureGFX7GFX8GFX9Insts, + FeatureScalarFlatScratchInsts, FeatureScalarAtomics, FeatureR128A16 ] >; -class SubtargetFeatureISAVersion <int Major, int Minor, int Stepping, - list<SubtargetFeature> Implies> - : SubtargetFeature < - "isaver"#Major#"."#Minor#"."#Stepping, - "IsaVersion", - "ISAVersion"#Major#"_"#Minor#"_"#Stepping, - "Instruction set version number", - Implies +def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10", + "gfx10", + [FeatureFP64, FeatureLocalMemorySize65536, FeatureMIMG_R128, + FeatureFlatAddressSpace, + FeatureCIInsts, 
Feature16BitInsts, + FeatureSMemRealTime, FeatureInv2PiInlineImm, + FeatureApertureRegs, FeatureGFX9Insts, FeatureGFX10Insts, FeatureVOP3P, + FeatureMovrel, FeatureFastFMAF32, FeatureDPP, FeatureIntClamp, + FeatureSDWA, FeatureSDWAOmod, FeatureSDWAScalar, FeatureSDWASdst, + FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts, + FeatureAddNoCarryInsts, FeatureFmaMixInsts, FeatureGFX8Insts, + FeatureNoSdstCMPX, FeatureVscnt, FeatureRegisterBanking, + FeatureVOP3Literal, FeatureDPP8, + FeatureNoDataDepHazard, FeaturePkFmacF16Inst, FeatureDoesNotSupportSRAMECC + ] >; -def FeatureISAVersion6_0_0 : SubtargetFeatureISAVersion <6,0,0, - [FeatureSouthernIslands, +class FeatureSet<list<SubtargetFeature> Features_> { + list<SubtargetFeature> Features = Features_; +} + +def FeatureISAVersion6_0_0 : FeatureSet<[FeatureSouthernIslands, FeatureFastFMAF32, HalfRate64Ops, FeatureLDSBankCount32, + FeatureDoesNotSupportXNACK, FeatureCodeObjectV3]>; -def FeatureISAVersion6_0_1 : SubtargetFeatureISAVersion <6,0,1, +def FeatureISAVersion6_0_1 : FeatureSet< [FeatureSouthernIslands, FeatureLDSBankCount32, + FeatureDoesNotSupportXNACK, FeatureCodeObjectV3]>; -def FeatureISAVersion7_0_0 : SubtargetFeatureISAVersion <7,0,0, +def FeatureISAVersion7_0_0 : FeatureSet< [FeatureSeaIslands, FeatureLDSBankCount32, + FeatureDoesNotSupportXNACK, FeatureCodeObjectV3]>; -def FeatureISAVersion7_0_1 : SubtargetFeatureISAVersion <7,0,1, +def FeatureISAVersion7_0_1 : FeatureSet< [FeatureSeaIslands, HalfRate64Ops, FeatureLDSBankCount32, FeatureFastFMAF32, + FeatureDoesNotSupportXNACK, FeatureCodeObjectV3]>; -def FeatureISAVersion7_0_2 : SubtargetFeatureISAVersion <7,0,2, +def FeatureISAVersion7_0_2 : FeatureSet< [FeatureSeaIslands, FeatureLDSBankCount16, FeatureFastFMAF32, + FeatureDoesNotSupportXNACK, FeatureCodeObjectV3]>; -def FeatureISAVersion7_0_3 : SubtargetFeatureISAVersion <7,0,3, +def FeatureISAVersion7_0_3 : FeatureSet< [FeatureSeaIslands, FeatureLDSBankCount16, + FeatureDoesNotSupportXNACK, FeatureCodeObjectV3]>; -def FeatureISAVersion7_0_4 : SubtargetFeatureISAVersion <7,0,4, +def FeatureISAVersion7_0_4 : FeatureSet< [FeatureSeaIslands, FeatureLDSBankCount32, + FeatureDoesNotSupportXNACK, FeatureCodeObjectV3]>; -def FeatureISAVersion8_0_1 : SubtargetFeatureISAVersion <8,0,1, +def FeatureISAVersion8_0_1 : FeatureSet< [FeatureVolcanicIslands, FeatureFastFMAF32, HalfRate64Ops, @@ -530,78 +730,151 @@ def FeatureISAVersion8_0_1 : SubtargetFeatureISAVersion <8,0,1, FeatureUnpackedD16VMem, FeatureCodeObjectV3]>; -def FeatureISAVersion8_0_2 : SubtargetFeatureISAVersion <8,0,2, +def FeatureISAVersion8_0_2 : FeatureSet< [FeatureVolcanicIslands, FeatureLDSBankCount32, FeatureSGPRInitBug, FeatureUnpackedD16VMem, + FeatureDoesNotSupportXNACK, FeatureCodeObjectV3]>; -def FeatureISAVersion8_0_3 : SubtargetFeatureISAVersion <8,0,3, +def FeatureISAVersion8_0_3 : FeatureSet< [FeatureVolcanicIslands, FeatureLDSBankCount32, FeatureUnpackedD16VMem, + FeatureDoesNotSupportXNACK, FeatureCodeObjectV3]>; -def FeatureISAVersion8_1_0 : SubtargetFeatureISAVersion <8,1,0, +def FeatureISAVersion8_1_0 : FeatureSet< [FeatureVolcanicIslands, FeatureLDSBankCount16, FeatureXNACK, FeatureCodeObjectV3]>; -def FeatureISAVersion9_0_0 : SubtargetFeatureISAVersion <9,0,0, +def FeatureISAVersion9_0_0 : FeatureSet< [FeatureGFX9, FeatureMadMixInsts, FeatureLDSBankCount32, - FeatureCodeObjectV3]>; + FeatureCodeObjectV3, + FeatureDoesNotSupportXNACK, + FeatureDoesNotSupportSRAMECC]>; -def FeatureISAVersion9_0_2 : SubtargetFeatureISAVersion 
<9,0,2, +def FeatureISAVersion9_0_2 : FeatureSet< [FeatureGFX9, FeatureMadMixInsts, FeatureLDSBankCount32, FeatureXNACK, + FeatureDoesNotSupportSRAMECC, FeatureCodeObjectV3]>; -def FeatureISAVersion9_0_4 : SubtargetFeatureISAVersion <9,0,4, +def FeatureISAVersion9_0_4 : FeatureSet< [FeatureGFX9, FeatureLDSBankCount32, FeatureFmaMixInsts, + FeatureDoesNotSupportXNACK, + FeatureDoesNotSupportSRAMECC, FeatureCodeObjectV3]>; -def FeatureISAVersion9_0_6 : SubtargetFeatureISAVersion <9,0,6, +def FeatureISAVersion9_0_6 : FeatureSet< [FeatureGFX9, HalfRate64Ops, FeatureFmaMixInsts, FeatureLDSBankCount32, FeatureDLInsts, - FeatureDotInsts, + FeatureDot1Insts, + FeatureDot2Insts, + FeatureDoesNotSupportXNACK, + FeatureCodeObjectV3]>; + +def FeatureISAVersion9_0_8 : FeatureSet< + [FeatureGFX9, + HalfRate64Ops, + FeatureFmaMixInsts, + FeatureLDSBankCount32, + FeatureDLInsts, + FeatureDot1Insts, + FeatureDot2Insts, + FeatureDot3Insts, + FeatureDot4Insts, + FeatureDot5Insts, + FeatureDot6Insts, + FeatureMAIInsts, + FeaturePkFmacF16Inst, + FeatureAtomicFaddInsts, FeatureSRAMECC, FeatureCodeObjectV3]>; -def FeatureISAVersion9_0_9 : SubtargetFeatureISAVersion <9,0,9, +def FeatureISAVersion9_0_9 : FeatureSet< [FeatureGFX9, FeatureMadMixInsts, FeatureLDSBankCount32, FeatureXNACK, FeatureCodeObjectV3]>; -//===----------------------------------------------------------------------===// -// Debugger related subtarget features. -//===----------------------------------------------------------------------===// - -def FeatureDebuggerInsertNops : SubtargetFeature< - "amdgpu-debugger-insert-nops", - "DebuggerInsertNops", - "true", - "Insert one nop instruction for each high level source statement" ->; +// TODO: Organize more features into groups. +def FeatureGroup { + // Bugs present on gfx10.1. 
+ list<SubtargetFeature> GFX10_1_Bugs = [ + FeatureVcmpxPermlaneHazard, + FeatureVMEMtoScalarWriteHazard, + FeatureSMEMtoVectorWriteHazard, + FeatureInstFwdPrefetchBug, + FeatureVcmpxExecWARHazard, + FeatureLdsBranchVmemWARHazard, + FeatureNSAtoVMEMBug, + FeatureOffset3fBug, + FeatureFlatSegmentOffsetBug + ]; +} -def FeatureDebuggerEmitPrologue : SubtargetFeature< - "amdgpu-debugger-emit-prologue", - "DebuggerEmitPrologue", - "true", - "Emit debugger prologue" ->; +def FeatureISAVersion10_1_0 : FeatureSet< + !listconcat(FeatureGroup.GFX10_1_Bugs, + [FeatureGFX10, + FeatureLDSBankCount32, + FeatureDLInsts, + FeatureNSAEncoding, + FeatureWavefrontSize32, + FeatureScalarStores, + FeatureScalarAtomics, + FeatureScalarFlatScratchInsts, + FeatureLdsMisalignedBug, + FeatureDoesNotSupportXNACK, + FeatureCodeObjectV3])>; + +def FeatureISAVersion10_1_1 : FeatureSet< + !listconcat(FeatureGroup.GFX10_1_Bugs, + [FeatureGFX10, + FeatureLDSBankCount32, + FeatureDLInsts, + FeatureDot1Insts, + FeatureDot2Insts, + FeatureDot5Insts, + FeatureDot6Insts, + FeatureNSAEncoding, + FeatureWavefrontSize32, + FeatureScalarStores, + FeatureScalarAtomics, + FeatureScalarFlatScratchInsts, + FeatureDoesNotSupportXNACK, + FeatureCodeObjectV3])>; + +def FeatureISAVersion10_1_2 : FeatureSet< + !listconcat(FeatureGroup.GFX10_1_Bugs, + [FeatureGFX10, + FeatureLDSBankCount32, + FeatureDLInsts, + FeatureDot1Insts, + FeatureDot2Insts, + FeatureDot5Insts, + FeatureDot6Insts, + FeatureNSAEncoding, + FeatureWavefrontSize32, + FeatureScalarStores, + FeatureScalarAtomics, + FeatureScalarFlatScratchInsts, + FeatureLdsMisalignedBug, + FeatureDoesNotSupportXNACK, + FeatureCodeObjectV3])>; //===----------------------------------------------------------------------===// @@ -682,23 +955,71 @@ def NullALU : InstrItinClass; // Predicate helper class //===----------------------------------------------------------------------===// -def isSICI : Predicate< - "Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||" - "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS" ->, AssemblerPredicate<"!FeatureGCN3Encoding">; +def isGFX6 : + Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS">, + AssemblerPredicate<"FeatureSouthernIslands">; + +def isGFX6GFX7 : + Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||" + "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS">, + AssemblerPredicate<"!FeatureGCN3Encoding,!FeatureGFX10Insts">; + +def isGFX6GFX7GFX10 : + Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||" + "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||" + "Subtarget->getGeneration() == AMDGPUSubtarget::GFX10">, + AssemblerPredicate<"!FeatureGCN3Encoding">; + +def isGFX7Only : + Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS">, + AssemblerPredicate<"!FeatureGCN3Encoding,FeatureCIInsts,!FeatureGFX10Insts">; + +def isGFX7GFX10 : + Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||" + "Subtarget->getGeneration() == AMDGPUSubtarget::GFX10">, + AssemblerPredicate<"!FeatureGCN3Encoding,FeatureCIInsts">; + +def isGFX7GFX8GFX9 : + Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||" + "Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||" + "Subtarget->getGeneration() == AMDGPUSubtarget::GFX9">, + AssemblerPredicate<"FeatureGFX7GFX8GFX9Insts">; + +def isGFX6GFX7GFX8GFX9 : + Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||" + 
"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||" + "Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||" + "Subtarget->getGeneration() == AMDGPUSubtarget::GFX9">, + AssemblerPredicate<"!FeatureGFX10Insts">; + +def isGFX7Plus : + Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS">, + AssemblerPredicate<"FeatureCIInsts">; + +def isGFX8Plus : + Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS">, + AssemblerPredicate<"FeatureGFX8Insts">; -def isVI : Predicate < - "Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS">, - AssemblerPredicate<"FeatureGCN3Encoding">; +def isGFX8Only : Predicate<"Subtarget->getGeneration() ==" + "AMDGPUSubtarget::VOLCANIC_ISLANDS">, + AssemblerPredicate <"FeatureVolcanicIslands">; -def isGFX9 : Predicate < - "Subtarget->getGeneration() >= AMDGPUSubtarget::GFX9">, +def isGFX9Plus : + Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX9">, AssemblerPredicate<"FeatureGFX9Insts">; -// TODO: Either the name to be changed or we simply use IsCI! -def isCIVI : Predicate < - "Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS">, - AssemblerPredicate<"FeatureCIInsts">; +def isGFX9Only : Predicate < + "Subtarget->getGeneration() == AMDGPUSubtarget::GFX9">, + AssemblerPredicate<"FeatureGCN3Encoding,FeatureGFX9Insts">; + +def isGFX8GFX9 : + Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||" + "Subtarget->getGeneration() == AMDGPUSubtarget::GFX9">, + AssemblerPredicate<"FeatureGFX8Insts,FeatureGCN3Encoding">; + +def isGFX10Plus : + Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10">, + AssemblerPredicate<"FeatureGFX10Insts">; def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">, AssemblerPredicate<"FeatureFlatAddressSpace">; @@ -707,6 +1028,8 @@ def HasFlatGlobalInsts : Predicate<"Subtarget->hasFlatGlobalInsts()">, AssemblerPredicate<"FeatureFlatGlobalInsts">; def HasFlatScratchInsts : Predicate<"Subtarget->hasFlatScratchInsts()">, AssemblerPredicate<"FeatureFlatScratchInsts">; +def HasScalarFlatScratchInsts : Predicate<"Subtarget->hasScalarFlatScratchInsts()">, + AssemblerPredicate<"FeatureScalarFlatScratchInsts">; def HasD16LoadStore : Predicate<"Subtarget->hasD16LoadStore()">, AssemblerPredicate<"FeatureGFX9Insts">; @@ -716,7 +1039,7 @@ def HasPackedD16VMem : Predicate<"!Subtarget->hasUnpackedD16VMem()">, AssemblerPredicate<"!FeatureUnpackedD16VMem">; def D16PreservesUnusedBits : - Predicate<"Subtarget->hasD16LoadStore() && !Subtarget->isSRAMECCEnabled()">, + Predicate<"Subtarget->d16PreservesUnusedBits()">, AssemblerPredicate<"FeatureGFX9Insts,!FeatureSRAMECC">; def LDSRequiresM0Init : Predicate<"Subtarget->ldsRequiresM0Init()">; @@ -728,38 +1051,54 @@ def HasDSAddTid : Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX9 def HasAddNoCarryInsts : Predicate<"Subtarget->hasAddNoCarry()">, AssemblerPredicate<"FeatureAddNoCarryInsts">; -def NotHasAddNoCarryInsts : Predicate<"!Subtarget->hasAddNoCarry()">, - AssemblerPredicate<"!FeatureAddNoCarryInsts">; +def NotHasAddNoCarryInsts : Predicate<"!Subtarget->hasAddNoCarry()">; def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">, AssemblerPredicate<"Feature16BitInsts">; def HasVOP3PInsts : Predicate<"Subtarget->hasVOP3PInsts()">, AssemblerPredicate<"FeatureVOP3P">; -def NotHasVOP3PInsts : Predicate<"!Subtarget->hasVOP3PInsts()">, - AssemblerPredicate<"!FeatureVOP3P">; - def HasSDWA : Predicate<"Subtarget->hasSDWA()">, 
AssemblerPredicate<"FeatureSDWA,FeatureVolcanicIslands">; -def HasSDWA9 : Predicate<"Subtarget->hasSDWA()">, - AssemblerPredicate<"FeatureSDWA,FeatureGFX9">; +def HasSDWA9 : + Predicate<"Subtarget->hasSDWA()">, + AssemblerPredicate<"FeatureGCN3Encoding,FeatureGFX9Insts,FeatureSDWA">; + +def HasSDWA10 : + Predicate<"Subtarget->hasSDWA()">, + AssemblerPredicate<"!FeatureGCN3Encoding,FeatureGFX10Insts,FeatureSDWA">; def HasDPP : Predicate<"Subtarget->hasDPP()">, - AssemblerPredicate<"FeatureDPP">; + AssemblerPredicate<"FeatureGCN3Encoding,FeatureDPP">; + +def HasDPP8 : Predicate<"Subtarget->hasDPP8()">, + AssemblerPredicate<"!FeatureGCN3Encoding,FeatureGFX10Insts,FeatureDPP8">; def HasR128A16 : Predicate<"Subtarget->hasR128A16()">, AssemblerPredicate<"FeatureR128A16">; +def HasDPP16 : Predicate<"Subtarget->hasDPP()">, + AssemblerPredicate<"!FeatureGCN3Encoding,FeatureGFX10Insts,FeatureDPP">; + def HasIntClamp : Predicate<"Subtarget->hasIntClamp()">, AssemblerPredicate<"FeatureIntClamp">; def HasMadMixInsts : Predicate<"Subtarget->hasMadMixInsts()">, AssemblerPredicate<"FeatureMadMixInsts">; +def HasScalarStores : Predicate<"Subtarget->hasScalarStores()">, + AssemblerPredicate<"FeatureScalarStores">; + def HasScalarAtomics : Predicate<"Subtarget->hasScalarAtomics()">, AssemblerPredicate<"FeatureScalarAtomics">; +def HasNoSdstCMPX : Predicate<"Subtarget->hasNoSdstCMPX()">, + AssemblerPredicate<"FeatureNoSdstCMPX">; + +def HasSdstCMPX : Predicate<"!Subtarget->hasNoSdstCMPX()">, + AssemblerPredicate<"!FeatureNoSdstCMPX">; + def has16BankLDS : Predicate<"Subtarget->getLDSBankCount() == 16">; def has32BankLDS : Predicate<"Subtarget->getLDSBankCount() == 32">; def HasVGPRIndexMode : Predicate<"Subtarget->hasVGPRIndexMode()">, @@ -773,9 +1112,35 @@ def HasFmaMixInsts : Predicate<"Subtarget->hasFmaMixInsts()">, def HasDLInsts : Predicate<"Subtarget->hasDLInsts()">, AssemblerPredicate<"FeatureDLInsts">; -def HasDotInsts : Predicate<"Subtarget->hasDotInsts()">, - AssemblerPredicate<"FeatureDotInsts">; +def HasDot1Insts : Predicate<"Subtarget->hasDot1Insts()">, + AssemblerPredicate<"FeatureDot1Insts">; + +def HasDot2Insts : Predicate<"Subtarget->hasDot2Insts()">, + AssemblerPredicate<"FeatureDot2Insts">; + +def HasDot3Insts : Predicate<"Subtarget->hasDot3Insts()">, + AssemblerPredicate<"FeatureDot3Insts">; + +def HasDot4Insts : Predicate<"Subtarget->hasDot4Insts()">, + AssemblerPredicate<"FeatureDot4Insts">; + +def HasDot5Insts : Predicate<"Subtarget->hasDot5Insts()">, + AssemblerPredicate<"FeatureDot5Insts">; + +def HasDot6Insts : Predicate<"Subtarget->hasDot6Insts()">, + AssemblerPredicate<"FeatureDot6Insts">; + +def HasMAIInsts : Predicate<"Subtarget->hasMAIInsts()">, + AssemblerPredicate<"FeatureMAIInsts">; + +def HasPkFmacF16Inst : Predicate<"Subtarget->hasPkFmacF16Inst()">, + AssemblerPredicate<"FeaturePkFmacF16Inst">; + +def HasAtomicFaddInsts : Predicate<"Subtarget->hasAtomicFaddInsts()">, + AssemblerPredicate<"FeatureAtomicFaddInsts">; +def HasOffset3fBug : Predicate<"!Subtarget->hasOffset3fBug()">, + AssemblerPredicate<"FeatureOffset3fBug">; def EnableLateCFGStructurize : Predicate< "EnableLateStructurizeCFG">; @@ -784,7 +1149,6 @@ def EnableLateCFGStructurize : Predicate< include "SISchedule.td" include "GCNProcessors.td" include "AMDGPUInstrInfo.td" -include "SIIntrinsics.td" include "AMDGPURegisterInfo.td" include "AMDGPURegisterBanks.td" include "AMDGPUInstructions.td" diff --git a/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp index 
73709ba13643..bba132c3bc46 100644 --- a/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp +++ b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp @@ -1,9 +1,8 @@ //===- AMDGPUAliasAnalysis ------------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file @@ -54,20 +53,21 @@ void AMDGPUAAWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); } -// These arrays are indexed by address space value enum elements 0 ... to 6 -static const AliasResult ASAliasRules[7][7] = { - /* Flat Global Region Group Constant Private Constant 32-bit */ - /* Flat */ {MayAlias, MayAlias, MayAlias, MayAlias, MayAlias, MayAlias, MayAlias}, - /* Global */ {MayAlias, MayAlias, NoAlias , NoAlias , MayAlias, NoAlias , MayAlias}, - /* Region */ {MayAlias, NoAlias , NoAlias , NoAlias, MayAlias, NoAlias , MayAlias}, - /* Group */ {MayAlias, NoAlias , NoAlias , MayAlias, NoAlias , NoAlias , NoAlias}, - /* Constant */ {MayAlias, MayAlias, MayAlias, NoAlias , NoAlias, NoAlias , MayAlias}, - /* Private */ {MayAlias, NoAlias , NoAlias , NoAlias , NoAlias , MayAlias, NoAlias}, - /* Constant 32-bit */ {MayAlias, MayAlias, MayAlias, NoAlias , MayAlias, NoAlias , NoAlias} +// These arrays are indexed by address space value enum elements 0 ... to 7 +static const AliasResult ASAliasRules[8][8] = { + /* Flat Global Region Group Constant Private Constant 32-bit Buffer Fat Ptr */ + /* Flat */ {MayAlias, MayAlias, NoAlias, MayAlias, MayAlias, MayAlias, MayAlias, MayAlias}, + /* Global */ {MayAlias, MayAlias, NoAlias , NoAlias , MayAlias, NoAlias , MayAlias, MayAlias}, + /* Region */ {NoAlias, NoAlias , MayAlias, NoAlias , NoAlias, NoAlias , NoAlias, NoAlias}, + /* Group */ {MayAlias, NoAlias , NoAlias , MayAlias, NoAlias , NoAlias , NoAlias , NoAlias}, + /* Constant */ {MayAlias, MayAlias, NoAlias, NoAlias , NoAlias , NoAlias , MayAlias, MayAlias}, + /* Private */ {MayAlias, NoAlias , NoAlias , NoAlias , NoAlias , MayAlias, NoAlias , NoAlias}, + /* Constant 32-bit */ {MayAlias, MayAlias, NoAlias, NoAlias , MayAlias, NoAlias , NoAlias , MayAlias}, + /* Buffer Fat Ptr */ {MayAlias, MayAlias, NoAlias , NoAlias , MayAlias, NoAlias , MayAlias, MayAlias} }; static AliasResult getAliasResult(unsigned AS1, unsigned AS2) { - static_assert(AMDGPUAS::MAX_AMDGPU_ADDRESS <= 6, "Addr space out of range"); + static_assert(AMDGPUAS::MAX_AMDGPU_ADDRESS <= 7, "Addr space out of range"); if (AS1 > AMDGPUAS::MAX_AMDGPU_ADDRESS || AS2 > AMDGPUAS::MAX_AMDGPU_ADDRESS) return MayAlias; @@ -76,7 +76,8 @@ static AliasResult getAliasResult(unsigned AS1, unsigned AS2) { } AliasResult AMDGPUAAResult::alias(const MemoryLocation &LocA, - const MemoryLocation &LocB) { + const MemoryLocation &LocB, + AAQueryInfo &AAQI) { unsigned asA = LocA.Ptr->getType()->getPointerAddressSpace(); unsigned asB = LocB.Ptr->getType()->getPointerAddressSpace(); @@ -85,11 +86,11 @@ AliasResult AMDGPUAAResult::alias(const MemoryLocation &LocA, return Result; // Forward the query to the next alias analysis. 
- return AAResultBase::alias(LocA, LocB); + return AAResultBase::alias(LocA, LocB, AAQI); } bool AMDGPUAAResult::pointsToConstantMemory(const MemoryLocation &Loc, - bool OrLocal) { + AAQueryInfo &AAQI, bool OrLocal) { const Value *Base = GetUnderlyingObject(Loc.Ptr, DL); unsigned AS = Base->getType()->getPointerAddressSpace(); if (AS == AMDGPUAS::CONSTANT_ADDRESS || @@ -106,7 +107,7 @@ bool AMDGPUAAResult::pointsToConstantMemory(const MemoryLocation &Loc, // Only assume constant memory for arguments on kernels. switch (F->getCallingConv()) { default: - return AAResultBase::pointsToConstantMemory(Loc, OrLocal); + return AAResultBase::pointsToConstantMemory(Loc, AAQI, OrLocal); case CallingConv::AMDGPU_LS: case CallingConv::AMDGPU_HS: case CallingConv::AMDGPU_ES: @@ -133,5 +134,5 @@ bool AMDGPUAAResult::pointsToConstantMemory(const MemoryLocation &Loc, return true; } } - return AAResultBase::pointsToConstantMemory(Loc, OrLocal); + return AAResultBase::pointsToConstantMemory(Loc, AAQI, OrLocal); } diff --git a/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h index d76c9fc48199..fb722920900f 100644 --- a/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h +++ b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h @@ -1,9 +1,8 @@ //===- AMDGPUAliasAnalysis --------------------------------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file @@ -45,8 +44,10 @@ public: /// By definition, this result is stateless and so remains valid. bool invalidate(Function &, const PreservedAnalyses &) { return false; } - AliasResult alias(const MemoryLocation &LocA, const MemoryLocation &LocB); - bool pointsToConstantMemory(const MemoryLocation &Loc, bool OrLocal); + AliasResult alias(const MemoryLocation &LocA, const MemoryLocation &LocB, + AAQueryInfo &AAQI); + bool pointsToConstantMemory(const MemoryLocation &Loc, AAQueryInfo &AAQI, + bool OrLocal); private: bool Aliases(const MDNode *A, const MDNode *B) const; diff --git a/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp index fc65430b745f..4c1dbd4c5304 100644 --- a/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp +++ b/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp @@ -1,9 +1,8 @@ //===-- AMDGPUAlwaysInlinePass.cpp - Promote Allocas ----------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp index 896ac9c87779..419ebb2240ad 100644 --- a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp +++ b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp @@ -1,9 +1,8 @@ //===- AMDGPUAnnotateKernelFeaturesPass.cpp -------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -46,8 +45,11 @@ namespace { class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass { private: const TargetMachine *TM = nullptr; + SmallVector<CallGraphNode*, 8> NodeList; bool addFeatureAttributes(Function &F); + bool processUniformWorkGroupAttribute(); + bool propagateUniformWorkGroupAttribute(Function &Caller, Function &Callee); public: static char ID; @@ -186,7 +188,6 @@ static bool handleAttr(Function &Parent, const Function &Callee, Parent.addFnAttr(Name); return true; } - return false; } @@ -213,6 +214,56 @@ static void copyFeaturesToFunction(Function &Parent, const Function &Callee, handleAttr(Parent, Callee, AttrName); } +bool AMDGPUAnnotateKernelFeatures::processUniformWorkGroupAttribute() { + bool Changed = false; + + for (auto *Node : reverse(NodeList)) { + Function *Caller = Node->getFunction(); + + for (auto I : *Node) { + Function *Callee = std::get<1>(I)->getFunction(); + if (Callee) + Changed = propagateUniformWorkGroupAttribute(*Caller, *Callee); + } + } + + return Changed; +} + +bool AMDGPUAnnotateKernelFeatures::propagateUniformWorkGroupAttribute( + Function &Caller, Function &Callee) { + + // Check for externally defined function + if (!Callee.hasExactDefinition()) { + Callee.addFnAttr("uniform-work-group-size", "false"); + if (!Caller.hasFnAttribute("uniform-work-group-size")) + Caller.addFnAttr("uniform-work-group-size", "false"); + + return true; + } + // Check if the Caller has the attribute + if (Caller.hasFnAttribute("uniform-work-group-size")) { + // Check if the value of the attribute is true + if (Caller.getFnAttribute("uniform-work-group-size") + .getValueAsString().equals("true")) { + // Propagate the attribute to the Callee, if it does not have it + if (!Callee.hasFnAttribute("uniform-work-group-size")) { + Callee.addFnAttr("uniform-work-group-size", "true"); + return true; + } + } else { + Callee.addFnAttr("uniform-work-group-size", "false"); + return true; + } + } else { + // If the attribute is absent, set it as false + Caller.addFnAttr("uniform-work-group-size", "false"); + Callee.addFnAttr("uniform-work-group-size", "false"); + return true; + } + return false; +} + bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) { const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F); bool HasFlat = ST.hasFlatAddressSpace(); @@ -293,15 +344,21 @@ bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) { } bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) { - Module &M = SCC.getCallGraph().getModule(); - Triple TT(M.getTargetTriple()); - bool Changed = false; + 
for (CallGraphNode *I : SCC) { + // Build a list of CallGraphNodes from most number of uses to least + if (I->getNumReferences()) + NodeList.push_back(I); + else { + processUniformWorkGroupAttribute(); + NodeList.clear(); + } + Function *F = I->getFunction(); + // Add feature attributes if (!F || F->isDeclaration()) continue; - Changed |= addFeatureAttributes(*F); } diff --git a/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp b/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp index f88e3b0dac86..71121ade0a49 100644 --- a/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp +++ b/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp @@ -1,9 +1,8 @@ //===-- AMDGPUAnnotateUniformValues.cpp - ---------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -14,7 +13,6 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" -#include "AMDGPUIntrinsicInfo.h" #include "llvm/ADT/SetVector.h" #include "llvm/Analysis/LegacyDivergenceAnalysis.h" #include "llvm/Analysis/LoopInfo.h" diff --git a/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp b/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp index 7465cf22b5a4..99a01ca3a2fd 100644 --- a/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp @@ -1,15 +1,15 @@ //===----------------------------------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #include "AMDGPU.h" #include "AMDGPUArgumentUsageInfo.h" #include "SIRegisterInfo.h" +#include "llvm/Support/NativeFormatting.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -27,9 +27,16 @@ void ArgDescriptor::print(raw_ostream &OS, } if (isRegister()) - OS << "Reg " << printReg(getRegister(), TRI) << '\n'; + OS << "Reg " << printReg(getRegister(), TRI); else - OS << "Stack offset " << getStackOffset() << '\n'; + OS << "Stack offset " << getStackOffset(); + + if (isMasked()) { + OS << " & "; + llvm::write_hex(OS, Mask, llvm::HexPrintStyle::PrefixLower); + } + + OS << '\n'; } char AMDGPUArgumentUsageInfo::ID = 0; diff --git a/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h index f0e6d1b83f15..097730441ed8 100644 --- a/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h +++ b/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h @@ -1,9 +1,8 @@ //==- AMDGPUArgumentrUsageInfo.h - Function Arg Usage Info -------*- C++ -*-==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -11,6 +10,7 @@ #define LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H #include "llvm/ADT/DenseMap.h" +#include "llvm/CodeGen/Register.h" #include "llvm/IR/Function.h" #include "llvm/Pass.h" @@ -29,22 +29,31 @@ private: friend class AMDGPUArgumentUsageInfo; union { - unsigned Register; + Register Reg; unsigned StackOffset; }; + // Bitmask to locate argument within the register. + unsigned Mask; + bool IsStack : 1; bool IsSet : 1; - ArgDescriptor(unsigned Val = 0, bool IsStack = false, bool IsSet = false) - : Register(Val), IsStack(IsStack), IsSet(IsSet) {} public: - static ArgDescriptor createRegister(unsigned Reg) { - return ArgDescriptor(Reg, false, true); + ArgDescriptor(unsigned Val = 0, unsigned Mask = ~0u, + bool IsStack = false, bool IsSet = false) + : Reg(Val), Mask(Mask), IsStack(IsStack), IsSet(IsSet) {} + + static ArgDescriptor createRegister(Register Reg, unsigned Mask = ~0u) { + return ArgDescriptor(Reg, Mask, false, true); + } + + static ArgDescriptor createStack(Register Reg, unsigned Mask = ~0u) { + return ArgDescriptor(Reg, Mask, true, true); } - static ArgDescriptor createStack(unsigned Reg) { - return ArgDescriptor(Reg, true, true); + static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask) { + return ArgDescriptor(Arg.Reg, Mask, Arg.IsStack, Arg.IsSet); } bool isSet() const { @@ -59,9 +68,9 @@ public: return !IsStack; } - unsigned getRegister() const { + Register getRegister() const { assert(!IsStack); - return Register; + return Reg; } unsigned getStackOffset() const { @@ -69,6 +78,14 @@ public: return StackOffset; } + unsigned getMask() const { + return Mask; + } + + bool isMasked() const { + return Mask != ~0u; + } + void print(raw_ostream &OS, const TargetRegisterInfo *TRI = nullptr) const; }; diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 2ded7cdb6489..743ac64b8f10 100644 --- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -1,9 +1,8 @@ //===-- AMDGPUAsmPrinter.cpp - AMDGPU assembly printer -------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -20,7 +19,7 @@ #include "AMDGPU.h" #include "AMDGPUSubtarget.h" #include "AMDGPUTargetMachine.h" -#include "InstPrinter/AMDGPUInstPrinter.h" +#include "MCTargetDesc/AMDGPUInstPrinter.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "MCTargetDesc/AMDGPUTargetStreamer.h" #include "R600AsmPrinter.h" @@ -31,10 +30,12 @@ #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" +#include "TargetInfo/AMDGPUTargetInfo.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/BinaryFormat/ELF.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/IR/DiagnosticInfo.h" +#include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCStreamer.h" @@ -100,7 +101,7 @@ extern "C" void LLVMInitializeAMDGPUAsmPrinter() { AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer) : AsmPrinter(TM, std::move(Streamer)) { - if (IsaInfo::hasCodeObjectV3(getSTI())) + if (IsaInfo::hasCodeObjectV3(getGlobalSTI())) HSAMetadataStream.reset(new MetadataStreamerV3()); else HSAMetadataStream.reset(new MetadataStreamerV2()); @@ -110,7 +111,7 @@ StringRef AMDGPUAsmPrinter::getPassName() const { return "AMDGPU Assembly Printer"; } -const MCSubtargetInfo* AMDGPUAsmPrinter::getSTI() const { +const MCSubtargetInfo *AMDGPUAsmPrinter::getGlobalSTI() const { return TM.getMCSubtargetInfo(); } @@ -121,10 +122,10 @@ AMDGPUTargetStreamer* AMDGPUAsmPrinter::getTargetStreamer() const { } void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) { - if (IsaInfo::hasCodeObjectV3(getSTI())) { + if (IsaInfo::hasCodeObjectV3(getGlobalSTI())) { std::string ExpectedTarget; raw_string_ostream ExpectedTargetOS(ExpectedTarget); - IsaInfo::streamIsaVersion(getSTI(), ExpectedTargetOS); + IsaInfo::streamIsaVersion(getGlobalSTI(), ExpectedTargetOS); getTargetStreamer()->EmitDirectiveAMDGCNTarget(ExpectedTarget); } @@ -137,9 +138,9 @@ void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) { HSAMetadataStream->begin(M); if (TM.getTargetTriple().getOS() == Triple::AMDPAL) - readPALMetadata(M); + getTargetStreamer()->getPALMetadata()->readFromIR(M); - if (IsaInfo::hasCodeObjectV3(getSTI())) + if (IsaInfo::hasCodeObjectV3(getGlobalSTI())) return; // HSA emits NT_AMDGPU_HSA_CODE_OBJECT_VERSION for code objects v2. @@ -147,7 +148,7 @@ void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) { getTargetStreamer()->EmitDirectiveHSACodeObjectVersion(2, 1); // HSA and PAL emit NT_AMDGPU_HSA_ISA for code objects v2. - IsaVersion Version = getIsaVersion(getSTI()->getCPU()); + IsaVersion Version = getIsaVersion(getGlobalSTI()->getCPU()); getTargetStreamer()->EmitDirectiveHSACodeObjectISA( Version.Major, Version.Minor, Version.Stepping, "AMD", "AMDGPU"); } @@ -157,11 +158,11 @@ void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) { if (!getTargetStreamer()) return; - if (!IsaInfo::hasCodeObjectV3(getSTI())) { + if (!IsaInfo::hasCodeObjectV3(getGlobalSTI())) { // Emit ISA Version (NT_AMD_AMDGPU_ISA). 
std::string ISAVersionString; raw_string_ostream ISAVersionStream(ISAVersionString); - IsaInfo::streamIsaVersion(getSTI(), ISAVersionStream); + IsaInfo::streamIsaVersion(getGlobalSTI(), ISAVersionStream); getTargetStreamer()->EmitISAVersion(ISAVersionStream.str()); } @@ -172,20 +173,6 @@ void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) { (void)Success; assert(Success && "Malformed HSA Metadata"); } - - if (!IsaInfo::hasCodeObjectV3(getSTI())) { - // Emit PAL Metadata (NT_AMD_AMDGPU_PAL_METADATA). - if (TM.getTargetTriple().getOS() == Triple::AMDPAL) { - // Copy the PAL metadata from the map where we collected it into a vector, - // then write it as a .note. - PALMD::Metadata PALMetadataVector; - for (auto i : PALMetadataMap) { - PALMetadataVector.push_back(i.first); - PALMetadataVector.push_back(i.second); - } - getTargetStreamer()->EmitPALMetadata(PALMetadataVector); - } - } } bool AMDGPUAsmPrinter::isBlockOnlyReachableByFallthrough( @@ -225,7 +212,8 @@ void AMDGPUAsmPrinter::EmitFunctionBodyEnd() { const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>(); if (!MFI.isEntryFunction()) return; - if (!IsaInfo::hasCodeObjectV3(getSTI()) || + + if (!IsaInfo::hasCodeObjectV3(getGlobalSTI()) || TM.getTargetTriple().getOS() != Triple::AMDHSA) return; @@ -243,23 +231,25 @@ void AMDGPUAsmPrinter::EmitFunctionBodyEnd() { if (ReadOnlySection.getAlignment() < 64) ReadOnlySection.setAlignment(64); + const MCSubtargetInfo &STI = MF->getSubtarget(); + SmallString<128> KernelName; getNameWithPrefix(KernelName, &MF->getFunction()); getTargetStreamer()->EmitAmdhsaKernelDescriptor( - *getSTI(), KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo), + STI, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo), CurrentProgramInfo.NumVGPRsForWavesPerEU, CurrentProgramInfo.NumSGPRsForWavesPerEU - - IsaInfo::getNumExtraSGPRs(getSTI(), + IsaInfo::getNumExtraSGPRs(&STI, CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed), CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed, - hasXNACK(*getSTI())); + hasXNACK(STI)); Streamer.PopSection(); } void AMDGPUAsmPrinter::EmitFunctionEntryLabel() { - if (IsaInfo::hasCodeObjectV3(getSTI()) && + if (IsaInfo::hasCodeObjectV3(getGlobalSTI()) && TM.getTargetTriple().getOS() == Triple::AMDHSA) { AsmPrinter::EmitFunctionEntryLabel(); return; @@ -273,8 +263,7 @@ void AMDGPUAsmPrinter::EmitFunctionEntryLabel() { getTargetStreamer()->EmitAMDGPUSymbolType( SymbolName, ELF::STT_AMDGPU_HSA_KERNEL); } - const GCNSubtarget &STI = MF->getSubtarget<GCNSubtarget>(); - if (STI.dumpCode()) { + if (DumpCodeInstEmitter) { // Disassemble function name label to text. DisasmLines.push_back(MF->getName().str() + ":"); DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size()); @@ -285,8 +274,7 @@ void AMDGPUAsmPrinter::EmitFunctionEntryLabel() { } void AMDGPUAsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) const { - const GCNSubtarget &STI = MBB.getParent()->getSubtarget<GCNSubtarget>(); - if (STI.dumpCode() && !isBlockOnlyReachableByFallthrough(&MBB)) { + if (DumpCodeInstEmitter && !isBlockOnlyReachableByFallthrough(&MBB)) { // Write a line for the basic block label if it is not only fallthrough. 
DisasmLines.push_back( (Twine("BB") + Twine(getFunctionNumber()) @@ -298,38 +286,57 @@ void AMDGPUAsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) const { } void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { + if (GV->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { + if (GV->hasInitializer() && !isa<UndefValue>(GV->getInitializer())) { + OutContext.reportError({}, + Twine(GV->getName()) + + ": unsupported initializer for address space"); + return; + } + + // LDS variables aren't emitted in HSA or PAL yet. + const Triple::OSType OS = TM.getTargetTriple().getOS(); + if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) + return; - // Group segment variables aren't emitted in HSA. - if (AMDGPU::isGroupSegment(GV)) + MCSymbol *GVSym = getSymbol(GV); + + GVSym->redefineIfPossible(); + if (GVSym->isDefined() || GVSym->isVariable()) + report_fatal_error("symbol '" + Twine(GVSym->getName()) + + "' is already defined"); + + const DataLayout &DL = GV->getParent()->getDataLayout(); + uint64_t Size = DL.getTypeAllocSize(GV->getValueType()); + unsigned Align = GV->getAlignment(); + if (!Align) + Align = 4; + + EmitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration()); + EmitLinkage(GV, GVSym); + if (auto TS = getTargetStreamer()) + TS->emitAMDGPULDS(GVSym, Size, Align); return; + } AsmPrinter::EmitGlobalVariable(GV); } bool AMDGPUAsmPrinter::doFinalization(Module &M) { CallGraphResourceInfo.clear(); - return AsmPrinter::doFinalization(M); -} -// For the amdpal OS type, read the amdgpu.pal.metadata supplied by the -// frontend into our PALMetadataMap, ready for per-function modification. It -// is a NamedMD containing an MDTuple containing a number of MDNodes each of -// which is an integer value, and each two integer values forms a key=value -// pair that we store as PALMetadataMap[key]=value in the map. -void AMDGPUAsmPrinter::readPALMetadata(Module &M) { - auto NamedMD = M.getNamedMetadata("amdgpu.pal.metadata"); - if (!NamedMD || !NamedMD->getNumOperands()) - return; - auto Tuple = dyn_cast<MDTuple>(NamedMD->getOperand(0)); - if (!Tuple) - return; - for (unsigned I = 0, E = Tuple->getNumOperands() & -2; I != E; I += 2) { - auto Key = mdconst::dyn_extract<ConstantInt>(Tuple->getOperand(I)); - auto Val = mdconst::dyn_extract<ConstantInt>(Tuple->getOperand(I + 1)); - if (!Key || !Val) - continue; - PALMetadataMap[Key->getZExtValue()] = Val->getZExtValue(); + // Pad with s_code_end to help tools and guard against instruction prefetch + // causing stale data in caches. Arguably this should be done by the linker, + // which is why this isn't done for Mesa. + const MCSubtargetInfo &STI = *getGlobalSTI(); + if (AMDGPU::isGFX10(STI) && + (STI.getTargetTriple().getOS() == Triple::AMDHSA || + STI.getTargetTriple().getOS() == Triple::AMDPAL)) { + OutStreamer->SwitchSection(getObjFileLowering().getTextSection()); + getTargetStreamer()->EmitCodeEnd(); } + + return AsmPrinter::doFinalization(M); } // Print comments that apply to both callable functions and entry points. 
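The next hunk ORs a new wave32 bit into the kernel code properties when the subtarget runs in wavefront-size-32 mode (new with the gfx10 support in this commit). A hedged sketch of that accumulate-flags pattern follows; the bit positions here are invented for illustration, while the real amdhsa:: constants live in llvm/Support/AMDHSAKernelDescriptor.h.

    // Sketch of the flag accumulation in getAmdhsaKernelCodeProperties.
    #include <cstdint>

    enum : uint16_t {
      PROP_ENABLE_SGPR_FLAT_SCRATCH_INIT = 1u << 0, // illustrative bit
      PROP_ENABLE_WAVEFRONT_SIZE32       = 1u << 1, // illustrative bit
    };

    uint16_t kernelCodeProperties(bool UsesFlatScratch, bool IsWave32) {
      uint16_t Props = 0;
      if (UsesFlatScratch)
        Props |= PROP_ENABLE_SGPR_FLAT_SCRATCH_INIT;
      if (IsWave32) // the condition this hunk adds for gfx10 wave32 mode
        Props |= PROP_ENABLE_WAVEFRONT_SIZE32;
      return Props;
    }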
@@ -376,6 +383,10 @@ uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties( KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT; } + if (MF.getSubtarget<GCNSubtarget>().isWave32()) { + KernelCodeProperties |= + amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32; + } return KernelCodeProperties; } @@ -435,6 +446,18 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { EmitProgramInfoSI(MF, CurrentProgramInfo); } + DumpCodeInstEmitter = nullptr; + if (STM.dumpCode()) { + // For -dumpcode, get the assembler out of the streamer, even if it does + // not really want to let us have it. This only works with -filetype=obj. + bool SaveFlag = OutStreamer->getUseAssemblerInfoForParsing(); + OutStreamer->setUseAssemblerInfoForParsing(true); + MCAssembler *Assembler = OutStreamer->getAssemblerPtr(); + OutStreamer->setUseAssemblerInfoForParsing(SaveFlag); + if (Assembler) + DumpCodeInstEmitter = Assembler->getEmitterPtr(); + } + DisasmLines.clear(); HexLines.clear(); DisasmLineMaxLen = 0; @@ -486,15 +509,6 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { OutStreamer->emitRawComment( " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false); - if (MF.getSubtarget<GCNSubtarget>().debuggerEmitPrologue()) { - OutStreamer->emitRawComment( - " DebuggerWavefrontPrivateSegmentOffsetSGPR: s" + - Twine(CurrentProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR), false); - OutStreamer->emitRawComment( - " DebuggerPrivateSegmentBufferSGPR: s" + - Twine(CurrentProgramInfo.DebuggerPrivateSegmentBufferSGPR), false); - } - OutStreamer->emitRawComment( " COMPUTE_PGM_RSRC2:USER_SGPR: " + Twine(G_00B84C_USER_SGPR(CurrentProgramInfo.ComputePGMRSrc2)), false); @@ -516,7 +530,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { false); } - if (STM.dumpCode()) { + if (DumpCodeInstEmitter) { OutStreamer->SwitchSection( Context.getELFSection(".AMDGPU.disasm", ELF::SHT_NOTE, 0)); @@ -620,6 +634,11 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( HighestVGPRReg = Reg; break; } + MCPhysReg AReg = AMDGPU::AGPR0 + TRI.getHWRegIndex(Reg); + if (MRI.isPhysRegUsed(AReg)) { + HighestVGPRReg = AReg; + break; + } } MCPhysReg HighestSGPRReg = AMDGPU::NoRegister; @@ -665,8 +684,12 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( case AMDGPU::SRC_SHARED_LIMIT: case AMDGPU::SRC_PRIVATE_BASE: case AMDGPU::SRC_PRIVATE_LIMIT: + case AMDGPU::SGPR_NULL: continue; + case AMDGPU::SRC_POPS_EXITING_WAVE_ID: + llvm_unreachable("src_pops_exiting_wave_id should not be used"); + case AMDGPU::NoRegister: assert(MI.isDebugInstr()); continue; @@ -687,6 +710,9 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( case AMDGPU::XNACK_MASK_HI: llvm_unreachable("xnack_mask registers should not be used"); + case AMDGPU::LDS_DIRECT: + llvm_unreachable("lds_direct register should not be used"); + case AMDGPU::TBA: case AMDGPU::TBA_LO: case AMDGPU::TBA_HI: @@ -695,6 +721,15 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( case AMDGPU::TMA_HI: llvm_unreachable("trap handler registers should not be used"); + case AMDGPU::SRC_VCCZ: + llvm_unreachable("src_vccz register should not be used"); + + case AMDGPU::SRC_EXECZ: + llvm_unreachable("src_execz register should not be used"); + + case AMDGPU::SRC_SCC: + llvm_unreachable("src_scc register should not be used"); + default: break; } @@ -707,6 +742,9 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo 
AMDGPUAsmPrinter::analyzeResourceUsage( } else if (AMDGPU::VGPR_32RegClass.contains(Reg)) { IsSGPR = false; Width = 1; + } else if (AMDGPU::AGPR_32RegClass.contains(Reg)) { + IsSGPR = false; + Width = 1; } else if (AMDGPU::SReg_64RegClass.contains(Reg)) { assert(!AMDGPU::TTMP_64RegClass.contains(Reg) && "trap handler registers should not be used"); @@ -715,9 +753,14 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( } else if (AMDGPU::VReg_64RegClass.contains(Reg)) { IsSGPR = false; Width = 2; + } else if (AMDGPU::AReg_64RegClass.contains(Reg)) { + IsSGPR = false; + Width = 2; } else if (AMDGPU::VReg_96RegClass.contains(Reg)) { IsSGPR = false; Width = 3; + } else if (AMDGPU::SReg_96RegClass.contains(Reg)) { + Width = 3; } else if (AMDGPU::SReg_128RegClass.contains(Reg)) { assert(!AMDGPU::TTMP_128RegClass.contains(Reg) && "trap handler registers should not be used"); @@ -726,6 +769,9 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( } else if (AMDGPU::VReg_128RegClass.contains(Reg)) { IsSGPR = false; Width = 4; + } else if (AMDGPU::AReg_128RegClass.contains(Reg)) { + IsSGPR = false; + Width = 4; } else if (AMDGPU::SReg_256RegClass.contains(Reg)) { assert(!AMDGPU::TTMP_256RegClass.contains(Reg) && "trap handler registers should not be used"); @@ -742,6 +788,18 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( } else if (AMDGPU::VReg_512RegClass.contains(Reg)) { IsSGPR = false; Width = 16; + } else if (AMDGPU::AReg_512RegClass.contains(Reg)) { + IsSGPR = false; + Width = 16; + } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) { + IsSGPR = true; + Width = 32; + } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) { + IsSGPR = false; + Width = 32; + } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) { + IsSGPR = false; + Width = 32; } else { llvm_unreachable("Unknown register class"); } @@ -767,8 +825,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( // 48 SGPRs - vcc, - flat_scr, -xnack int MaxSGPRGuess = - 47 - IsaInfo::getNumExtraSGPRs(getSTI(), true, - ST.hasFlatAddressSpace()); + 47 - IsaInfo::getNumExtraSGPRs(&ST, true, ST.hasFlatAddressSpace()); MaxSGPR = std::max(MaxSGPR, MaxSGPRGuess); MaxVGPR = std::max(MaxVGPR, 23); @@ -779,9 +836,19 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( } else { // We force CodeGen to run in SCC order, so the callee's register // usage etc. should be the cumulative usage of all callees. + auto I = CallGraphResourceInfo.find(Callee); - assert(I != CallGraphResourceInfo.end() && - "callee should have been handled before caller"); + if (I == CallGraphResourceInfo.end()) { + // Avoid crashing on undefined behavior with an illegal call to a + // kernel. If a callsite's calling convention doesn't match the + // function's, it's undefined behavior. If the callsite calling + // convention does match, that would have errored earlier. + // FIXME: The verifier shouldn't allow this. 
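// (Illustrative IR for the guarded case, with hypothetical names @k/@f:
//    define amdgpu_kernel void @k() { ret void }
//    define void @f() { call void @k()  ; UB: the callsite's default CC
//                       ret void }      ; does not match amdgpu_kernel
// Such a call used to trip the assertion this check replaces.)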
+ if (AMDGPU::isEntryFunctionCC(Callee->getCallingConv())) + report_fatal_error("invalid call to entry function"); + + llvm_unreachable("callee should have been handled before caller"); + } MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR); MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR); @@ -825,14 +892,12 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - const SIInstrInfo *TII = STM.getInstrInfo(); - const SIRegisterInfo *RI = &TII->getRegisterInfo(); // TODO(scott.linder): The calculations related to SGPR/VGPR blocks are // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be // unified. unsigned ExtraSGPRs = IsaInfo::getNumExtraSGPRs( - getSTI(), ProgInfo.VCCUsed, ProgInfo.FlatUsed); + &STM, ProgInfo.VCCUsed, ProgInfo.FlatUsed); // Check the addressable register limit before we add ExtraSGPRs. if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && @@ -918,24 +983,15 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, ProgInfo.VGPRBlocks = IsaInfo::getNumVGPRBlocks( &STM, ProgInfo.NumVGPRsForWavesPerEU); - // Update DebuggerWavefrontPrivateSegmentOffsetSGPR and - // DebuggerPrivateSegmentBufferSGPR fields if "amdgpu-debugger-emit-prologue" - // attribute was requested. - if (STM.debuggerEmitPrologue()) { - ProgInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR = - RI->getHWRegIndex(MFI->getScratchWaveOffsetReg()); - ProgInfo.DebuggerPrivateSegmentBufferSGPR = - RI->getHWRegIndex(MFI->getScratchRSrcReg()); - } - // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode // register. ProgInfo.FloatMode = getFPMode(MF); - ProgInfo.IEEEMode = STM.enableIEEEBit(MF); + const SIModeRegisterDefaults Mode = MFI->getMode(); + ProgInfo.IEEEMode = Mode.IEEE; // Make clamp modifier on NaN input returns 0. - ProgInfo.DX10Clamp = STM.enableDX10Clamp(); + ProgInfo.DX10Clamp = Mode.DX10Clamp; unsigned LDSAlignShift; if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) { @@ -963,6 +1019,11 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, 1ULL << ScratchAlignShift) >> ScratchAlignShift; + if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) { + ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1; + ProgInfo.MemOrdered = 1; + } + ProgInfo.ComputePGMRSrc1 = S_00B848_VGPRS(ProgInfo.VGPRBlocks) | S_00B848_SGPRS(ProgInfo.SGPRBlocks) | @@ -971,7 +1032,9 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, S_00B848_PRIV(ProgInfo.Priv) | S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp) | S_00B848_DEBUG_MODE(ProgInfo.DebugMode) | - S_00B848_IEEE_MODE(ProgInfo.IEEEMode); + S_00B848_IEEE_MODE(ProgInfo.IEEEMode) | + S_00B848_WGP_MODE(ProgInfo.WgpMode) | + S_00B848_MEM_ORDERED(ProgInfo.MemOrdered); // 0 = X, 1 = XY, 2 = XYZ unsigned TIDIGCompCnt = 0; @@ -1053,71 +1116,38 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, // This is the equivalent of EmitProgramInfoSI above, but for when the OS type // is AMDPAL. It stores each compute/SPI register setting and other PAL -// metadata items into the PALMetadataMap, combining with any provided by the -// frontend as LLVM metadata. Once all functions are written, PALMetadataMap is -// then written as a single block in the .note section. +// metadata items into the PALMD::Metadata, combining with any provided by the +// frontend as LLVM metadata. 
Once all functions are written, the PAL metadata +// is then written as a single block in the .note section. void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo) { const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - // Given the calling convention, calculate the register number for rsrc1. In - // principle the register number could change in future hardware, but we know - // it is the same for gfx6-9 (except that LS and ES don't exist on gfx9), so - // we can use the same fixed value that .AMDGPU.config has for Mesa. Note - // that we use a register number rather than a byte offset, so we need to - // divide by 4. - unsigned Rsrc1Reg = getRsrcReg(MF.getFunction().getCallingConv()) / 4; - unsigned Rsrc2Reg = Rsrc1Reg + 1; - // Also calculate the PAL metadata key for *S_SCRATCH_SIZE. It can be used - // with a constant offset to access any non-register shader-specific PAL - // metadata key. - unsigned ScratchSizeKey = PALMD::Key::CS_SCRATCH_SIZE; - switch (MF.getFunction().getCallingConv()) { - case CallingConv::AMDGPU_PS: - ScratchSizeKey = PALMD::Key::PS_SCRATCH_SIZE; - break; - case CallingConv::AMDGPU_VS: - ScratchSizeKey = PALMD::Key::VS_SCRATCH_SIZE; - break; - case CallingConv::AMDGPU_GS: - ScratchSizeKey = PALMD::Key::GS_SCRATCH_SIZE; - break; - case CallingConv::AMDGPU_ES: - ScratchSizeKey = PALMD::Key::ES_SCRATCH_SIZE; - break; - case CallingConv::AMDGPU_HS: - ScratchSizeKey = PALMD::Key::HS_SCRATCH_SIZE; - break; - case CallingConv::AMDGPU_LS: - ScratchSizeKey = PALMD::Key::LS_SCRATCH_SIZE; - break; - } - unsigned NumUsedVgprsKey = ScratchSizeKey + - PALMD::Key::VS_NUM_USED_VGPRS - PALMD::Key::VS_SCRATCH_SIZE; - unsigned NumUsedSgprsKey = ScratchSizeKey + - PALMD::Key::VS_NUM_USED_SGPRS - PALMD::Key::VS_SCRATCH_SIZE; - PALMetadataMap[NumUsedVgprsKey] = CurrentProgramInfo.NumVGPRsForWavesPerEU; - PALMetadataMap[NumUsedSgprsKey] = CurrentProgramInfo.NumSGPRsForWavesPerEU; + auto CC = MF.getFunction().getCallingConv(); + auto MD = getTargetStreamer()->getPALMetadata(); + + MD->setEntryPoint(CC, MF.getFunction().getName()); + MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU); + MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU); if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) { - PALMetadataMap[Rsrc1Reg] |= CurrentProgramInfo.ComputePGMRSrc1; - PALMetadataMap[Rsrc2Reg] |= CurrentProgramInfo.ComputePGMRSrc2; - // ScratchSize is in bytes, 16 aligned. - PALMetadataMap[ScratchSizeKey] |= - alignTo(CurrentProgramInfo.ScratchSize, 16); + MD->setRsrc1(CC, CurrentProgramInfo.ComputePGMRSrc1); + MD->setRsrc2(CC, CurrentProgramInfo.ComputePGMRSrc2); } else { - PALMetadataMap[Rsrc1Reg] |= S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) | - S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks); + MD->setRsrc1(CC, S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) | + S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks)); if (CurrentProgramInfo.ScratchBlocks > 0) - PALMetadataMap[Rsrc2Reg] |= S_00B84C_SCRATCH_EN(1); - // ScratchSize is in bytes, 16 aligned. - PALMetadataMap[ScratchSizeKey] |= - alignTo(CurrentProgramInfo.ScratchSize, 16); + MD->setRsrc2(CC, S_00B84C_SCRATCH_EN(1)); } + // ScratchSize is in bytes, 16 aligned. 
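// For example, alignTo(1000, 16) == 1008, so a 1000-byte scratch request
// is reported as 1008 bytes.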
+ MD->setScratchSize(CC, alignTo(CurrentProgramInfo.ScratchSize, 16)); if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) { - PALMetadataMap[Rsrc2Reg] |= - S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks); - PALMetadataMap[R_0286CC_SPI_PS_INPUT_ENA / 4] |= MFI->getPSInputEnable(); - PALMetadataMap[R_0286D0_SPI_PS_INPUT_ADDR / 4] |= MFI->getPSInputAddr(); + MD->setRsrc2(CC, S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks)); + MD->setSpiPsInputEna(MFI->getPSInputEnable()); + MD->setSpiPsInputAddr(MFI->getPSInputAddr()); } + + const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); + if (STM.isWave32()) + MD->setWave32(MF.getFunction().getCallingConv()); } // This is supposed to be log2(Size) @@ -1144,12 +1174,12 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out, const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); - AMDGPU::initDefaultAMDKernelCodeT(Out, getSTI()); + AMDGPU::initDefaultAMDKernelCodeT(Out, &STM); Out.compute_pgm_resource_registers = CurrentProgramInfo.ComputePGMRSrc1 | (CurrentProgramInfo.ComputePGMRSrc2 << 32); - Out.code_properties = AMD_CODE_PROPERTY_IS_PTR64; + Out.code_properties |= AMD_CODE_PROPERTY_IS_PTR64; if (CurrentProgramInfo.DynamicCallStack) Out.code_properties |= AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK; @@ -1181,9 +1211,6 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out, if (MFI->hasDispatchPtr()) Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; - if (STM.debuggerSupported()) - Out.code_properties |= AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED; - if (STM.isXNACKEnabled()) Out.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED; @@ -1196,22 +1223,14 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out, // These alignment values are specified in powers of two, so alignment = // 2^n. The minimum alignment is 2^4 = 16. - Out.kernarg_segment_alignment = std::max((size_t)4, + Out.kernarg_segment_alignment = std::max<size_t>(4, countTrailingZeros(MaxKernArgAlign)); - - if (STM.debuggerEmitPrologue()) { - Out.debug_wavefront_private_segment_offset_sgpr = - CurrentProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR; - Out.debug_private_segment_buffer_sgpr = - CurrentProgramInfo.DebuggerPrivateSegmentBufferSGPR; - } } bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, - unsigned AsmVariant, const char *ExtraCode, raw_ostream &O) { // First try the generic code, which knows about modifiers like 'c' and 'n'. - if (!AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O)) + if (!AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O)) return false; if (ExtraCode && ExtraCode[0]) { diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h index 167ac4b21e1e..cf77034329ef 100644 --- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h @@ -1,9 +1,8 @@ //===-- AMDGPUAsmPrinter.h - Print AMDGPU assembly code ---------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -33,6 +32,7 @@ namespace llvm { class AMDGPUMachineFunction; class AMDGPUTargetStreamer; +class MCCodeEmitter; class MCOperand; class GCNSubtarget; @@ -57,12 +57,12 @@ private: DenseMap<const Function *, SIFunctionResourceInfo> CallGraphResourceInfo; std::unique_ptr<AMDGPU::HSAMD::MetadataStreamer> HSAMetadataStream; - std::map<uint32_t, uint32_t> PALMetadataMap; + + MCCodeEmitter *DumpCodeInstEmitter = nullptr; uint64_t getFunctionCodeSize(const MachineFunction &MF) const; SIFunctionResourceInfo analyzeResourceUsage(const MachineFunction &MF) const; - void readPALMetadata(Module &M); void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF); void getAmdKernelCode(amd_kernel_code_t &Out, const SIProgramInfo &KernelInfo, const MachineFunction &MF) const; @@ -95,7 +95,7 @@ public: StringRef getPassName() const override; - const MCSubtargetInfo* getSTI() const; + const MCSubtargetInfo* getGlobalSTI() const; AMDGPUTargetStreamer* getTargetStreamer() const; @@ -137,8 +137,7 @@ public: const MachineBasicBlock *MBB) const override; bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, - unsigned AsmVariant, const char *ExtraCode, - raw_ostream &O) override; + const char *ExtraCode, raw_ostream &O) override; protected: mutable std::vector<std::string> DisasmLines, HexLines; diff --git a/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp index 644e4fd558ba..8a92e7d923fb 100644 --- a/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp +++ b/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp @@ -1,9 +1,8 @@ //===-- AMDGPUAtomicOptimizer.cpp -----------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -31,6 +30,7 @@ namespace { enum DPP_CTRL { DPP_ROW_SR1 = 0x111, DPP_ROW_SR2 = 0x112, + DPP_ROW_SR3 = 0x113, DPP_ROW_SR4 = 0x114, DPP_ROW_SR8 = 0x118, DPP_WF_SR1 = 0x138, @@ -40,7 +40,7 @@ enum DPP_CTRL { struct ReplacementInfo { Instruction *I; - Instruction::BinaryOps Op; + AtomicRMWInst::BinOp Op; unsigned ValIdx; bool ValDivergent; }; @@ -55,10 +55,8 @@ private: bool HasDPP; bool IsPixelShader; - void optimizeAtomic(Instruction &I, Instruction::BinaryOps Op, - unsigned ValIdx, bool ValDivergent) const; - - void setConvergent(CallInst *const CI) const; + void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx, + bool ValDivergent) const; public: static char ID; @@ -122,16 +120,20 @@ void AMDGPUAtomicOptimizer::visitAtomicRMWInst(AtomicRMWInst &I) { break; } - Instruction::BinaryOps Op; + AtomicRMWInst::BinOp Op = I.getOperation(); - switch (I.getOperation()) { + switch (Op) { default: return; case AtomicRMWInst::Add: - Op = Instruction::Add; - break; case AtomicRMWInst::Sub: - Op = Instruction::Sub; + case AtomicRMWInst::And: + case AtomicRMWInst::Or: + case AtomicRMWInst::Xor: + case AtomicRMWInst::Max: + case AtomicRMWInst::Min: + case AtomicRMWInst::UMax: + case AtomicRMWInst::UMin: break; } @@ -163,7 +165,7 @@ void AMDGPUAtomicOptimizer::visitAtomicRMWInst(AtomicRMWInst &I) { } void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) { - Instruction::BinaryOps Op; + AtomicRMWInst::BinOp Op; switch (I.getIntrinsicID()) { default: @@ -171,12 +173,47 @@ void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) { case Intrinsic::amdgcn_buffer_atomic_add: case Intrinsic::amdgcn_struct_buffer_atomic_add: case Intrinsic::amdgcn_raw_buffer_atomic_add: - Op = Instruction::Add; + Op = AtomicRMWInst::Add; break; case Intrinsic::amdgcn_buffer_atomic_sub: case Intrinsic::amdgcn_struct_buffer_atomic_sub: case Intrinsic::amdgcn_raw_buffer_atomic_sub: - Op = Instruction::Sub; + Op = AtomicRMWInst::Sub; + break; + case Intrinsic::amdgcn_buffer_atomic_and: + case Intrinsic::amdgcn_struct_buffer_atomic_and: + case Intrinsic::amdgcn_raw_buffer_atomic_and: + Op = AtomicRMWInst::And; + break; + case Intrinsic::amdgcn_buffer_atomic_or: + case Intrinsic::amdgcn_struct_buffer_atomic_or: + case Intrinsic::amdgcn_raw_buffer_atomic_or: + Op = AtomicRMWInst::Or; + break; + case Intrinsic::amdgcn_buffer_atomic_xor: + case Intrinsic::amdgcn_struct_buffer_atomic_xor: + case Intrinsic::amdgcn_raw_buffer_atomic_xor: + Op = AtomicRMWInst::Xor; + break; + case Intrinsic::amdgcn_buffer_atomic_smin: + case Intrinsic::amdgcn_struct_buffer_atomic_smin: + case Intrinsic::amdgcn_raw_buffer_atomic_smin: + Op = AtomicRMWInst::Min; + break; + case Intrinsic::amdgcn_buffer_atomic_umin: + case Intrinsic::amdgcn_struct_buffer_atomic_umin: + case Intrinsic::amdgcn_raw_buffer_atomic_umin: + Op = AtomicRMWInst::UMin; + break; + case Intrinsic::amdgcn_buffer_atomic_smax: + case Intrinsic::amdgcn_struct_buffer_atomic_smax: + case Intrinsic::amdgcn_raw_buffer_atomic_smax: + Op = AtomicRMWInst::Max; + break; + case Intrinsic::amdgcn_buffer_atomic_umax: + case Intrinsic::amdgcn_struct_buffer_atomic_umax: + case Intrinsic::amdgcn_raw_buffer_atomic_umax: + Op = AtomicRMWInst::UMax; break; } @@ -208,12 +245,68 @@ void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) { ToReplace.push_back(Info); } +// Use the builder to create the non-atomic 
counterpart of the specified +// atomicrmw binary op. +static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op, + Value *LHS, Value *RHS) { + CmpInst::Predicate Pred; + + switch (Op) { + default: + llvm_unreachable("Unhandled atomic op"); + case AtomicRMWInst::Add: + return B.CreateBinOp(Instruction::Add, LHS, RHS); + case AtomicRMWInst::Sub: + return B.CreateBinOp(Instruction::Sub, LHS, RHS); + case AtomicRMWInst::And: + return B.CreateBinOp(Instruction::And, LHS, RHS); + case AtomicRMWInst::Or: + return B.CreateBinOp(Instruction::Or, LHS, RHS); + case AtomicRMWInst::Xor: + return B.CreateBinOp(Instruction::Xor, LHS, RHS); + + case AtomicRMWInst::Max: + Pred = CmpInst::ICMP_SGT; + break; + case AtomicRMWInst::Min: + Pred = CmpInst::ICMP_SLT; + break; + case AtomicRMWInst::UMax: + Pred = CmpInst::ICMP_UGT; + break; + case AtomicRMWInst::UMin: + Pred = CmpInst::ICMP_ULT; + break; + } + Value *Cond = B.CreateICmp(Pred, LHS, RHS); + return B.CreateSelect(Cond, LHS, RHS); +} + +static APInt getIdentityValueForAtomicOp(AtomicRMWInst::BinOp Op, + unsigned BitWidth) { + switch (Op) { + default: + llvm_unreachable("Unhandled atomic op"); + case AtomicRMWInst::Add: + case AtomicRMWInst::Sub: + case AtomicRMWInst::Or: + case AtomicRMWInst::Xor: + case AtomicRMWInst::UMax: + return APInt::getMinValue(BitWidth); + case AtomicRMWInst::And: + case AtomicRMWInst::UMin: + return APInt::getMaxValue(BitWidth); + case AtomicRMWInst::Max: + return APInt::getSignedMinValue(BitWidth); + case AtomicRMWInst::Min: + return APInt::getSignedMaxValue(BitWidth); + } +} + void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I, - Instruction::BinaryOps Op, + AtomicRMWInst::BinOp Op, unsigned ValIdx, bool ValDivergent) const { - LLVMContext &Context = I.getContext(); - // Start building just before the instruction. IRBuilder<> B(&I); @@ -251,115 +344,130 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I, Value *const V = I.getOperand(ValIdx); // We need to know how many lanes are active within the wavefront, and we do - // this by getting the exec register, which tells us all the lanes that are - // active. - MDNode *const RegName = - llvm::MDNode::get(Context, llvm::MDString::get(Context, "exec")); - Value *const Metadata = llvm::MetadataAsValue::get(Context, RegName); - CallInst *const Exec = - B.CreateIntrinsic(Intrinsic::read_register, {B.getInt64Ty()}, {Metadata}); - setConvergent(Exec); + // this by doing a ballot of active lanes. + CallInst *const Ballot = B.CreateIntrinsic( + Intrinsic::amdgcn_icmp, {B.getInt64Ty(), B.getInt32Ty()}, + {B.getInt32(1), B.getInt32(0), B.getInt32(CmpInst::ICMP_NE)}); // We need to know how many lanes are active within the wavefront that are // below us. If we counted each lane linearly starting from 0, a lane is // below us only if its associated index was less than ours. We do this by // using the mbcnt intrinsic. 
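// A minimal scalar model of what the mbcnt_lo/mbcnt_hi pair computes on a
// 64-wide wavefront (lanesActiveBelow is a hypothetical helper, not part
// of this patch): count the ballot bits strictly below our lane index.
static unsigned lanesActiveBelow(unsigned long long Ballot, unsigned Lane) {
  // Keep only the bits for lanes 0 .. Lane-1, then count them.
  unsigned long long BelowMask = Lane == 0 ? 0 : (~0ULL >> (64 - Lane));
  return __builtin_popcountll(Ballot & BelowMask);
}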
- Value *const BitCast = B.CreateBitCast(Exec, VecTy); + Value *const BitCast = B.CreateBitCast(Ballot, VecTy); Value *const ExtractLo = B.CreateExtractElement(BitCast, B.getInt32(0)); Value *const ExtractHi = B.CreateExtractElement(BitCast, B.getInt32(1)); CallInst *const PartialMbcnt = B.CreateIntrinsic( Intrinsic::amdgcn_mbcnt_lo, {}, {ExtractLo, B.getInt32(0)}); - CallInst *const Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, - {ExtractHi, PartialMbcnt}); + Value *const Mbcnt = + B.CreateIntCast(B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, + {ExtractHi, PartialMbcnt}), + Ty, false); - Value *const MbcntCast = B.CreateIntCast(Mbcnt, Ty, false); + Value *const Identity = B.getInt(getIdentityValueForAtomicOp(Op, TyBitWidth)); - Value *LaneOffset = nullptr; + Value *ExclScan = nullptr; Value *NewV = nullptr; // If we have a divergent value in each lane, we need to combine the value // using DPP. if (ValDivergent) { - // First we need to set all inactive invocations to 0, so that they can - // correctly contribute to the final result. - CallInst *const SetInactive = B.CreateIntrinsic( - Intrinsic::amdgcn_set_inactive, Ty, {V, B.getIntN(TyBitWidth, 0)}); - setConvergent(SetInactive); - NewV = SetInactive; - - const unsigned Iters = 6; - const unsigned DPPCtrl[Iters] = {DPP_ROW_SR1, DPP_ROW_SR2, - DPP_ROW_SR4, DPP_ROW_SR8, - DPP_ROW_BCAST15, DPP_ROW_BCAST31}; - const unsigned RowMask[Iters] = {0xf, 0xf, 0xf, 0xf, 0xa, 0xc}; - - // This loop performs an inclusive scan across the wavefront, with all lanes + // First we need to set all inactive invocations to the identity value, so + // that they can correctly contribute to the final result. + CallInst *const SetInactive = + B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity}); + + CallInst *const FirstDPP = + B.CreateIntrinsic(Intrinsic::amdgcn_update_dpp, Ty, + {Identity, SetInactive, B.getInt32(DPP_WF_SR1), + B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}); + ExclScan = FirstDPP; + + const unsigned Iters = 7; + const unsigned DPPCtrl[Iters] = { + DPP_ROW_SR1, DPP_ROW_SR2, DPP_ROW_SR3, DPP_ROW_SR4, + DPP_ROW_SR8, DPP_ROW_BCAST15, DPP_ROW_BCAST31}; + const unsigned RowMask[Iters] = {0xf, 0xf, 0xf, 0xf, 0xf, 0xa, 0xc}; + const unsigned BankMask[Iters] = {0xf, 0xf, 0xf, 0xe, 0xc, 0xf, 0xf}; + + // This loop performs an exclusive scan across the wavefront, with all lanes // active (by using the WWM intrinsic). for (unsigned Idx = 0; Idx < Iters; Idx++) { - CallInst *const DPP = B.CreateIntrinsic(Intrinsic::amdgcn_mov_dpp, Ty, - {NewV, B.getInt32(DPPCtrl[Idx]), - B.getInt32(RowMask[Idx]), - B.getInt32(0xf), B.getFalse()}); - setConvergent(DPP); - Value *const WWM = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, DPP); - - NewV = B.CreateBinOp(Op, NewV, WWM); - NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, NewV); + Value *const UpdateValue = Idx < 3 ? FirstDPP : ExclScan; + CallInst *const DPP = B.CreateIntrinsic( + Intrinsic::amdgcn_update_dpp, Ty, + {Identity, UpdateValue, B.getInt32(DPPCtrl[Idx]), + B.getInt32(RowMask[Idx]), B.getInt32(BankMask[Idx]), B.getFalse()}); + + ExclScan = buildNonAtomicBinOp(B, Op, ExclScan, DPP); } - // NewV has returned the inclusive scan of V, but for the lane offset we - // require an exclusive scan. We do this by shifting the values from the - // entire wavefront right by 1, and by setting the bound_ctrl (last argument - // to the intrinsic below) to true, we can guarantee that 0 will be shifted - // into the 0'th invocation. 
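// Concretely: for per-lane values {a, b, c, d} an inclusive scan gives
// {a, a+b, a+b+c, a+b+c+d}; shifting the whole wavefront right by one lane
// with bound_ctrl forcing 0 into lane 0 turns that into the exclusive scan
// {0, a, a+b, a+b+c}, which is exactly the per-lane offset wanted here.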
- CallInst *const DPP = - B.CreateIntrinsic(Intrinsic::amdgcn_mov_dpp, {Ty}, - {NewV, B.getInt32(DPP_WF_SR1), B.getInt32(0xf), - B.getInt32(0xf), B.getTrue()}); - setConvergent(DPP); - LaneOffset = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, DPP); + NewV = buildNonAtomicBinOp(B, Op, SetInactive, ExclScan); // Read the value from the last lane, which has accumlated the values of - // each active lane in the wavefront. This will be our new value with which - // we will provide to the atomic operation. + // each active lane in the wavefront. This will be our new value which we + // will provide to the atomic operation. if (TyBitWidth == 64) { Value *const ExtractLo = B.CreateTrunc(NewV, B.getInt32Ty()); Value *const ExtractHi = B.CreateTrunc(B.CreateLShr(NewV, B.getInt64(32)), B.getInt32Ty()); CallInst *const ReadLaneLo = B.CreateIntrinsic( Intrinsic::amdgcn_readlane, {}, {ExtractLo, B.getInt32(63)}); - setConvergent(ReadLaneLo); CallInst *const ReadLaneHi = B.CreateIntrinsic( Intrinsic::amdgcn_readlane, {}, {ExtractHi, B.getInt32(63)}); - setConvergent(ReadLaneHi); Value *const PartialInsert = B.CreateInsertElement( UndefValue::get(VecTy), ReadLaneLo, B.getInt32(0)); Value *const Insert = B.CreateInsertElement(PartialInsert, ReadLaneHi, B.getInt32(1)); NewV = B.CreateBitCast(Insert, Ty); } else if (TyBitWidth == 32) { - CallInst *const ReadLane = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, - {}, {NewV, B.getInt32(63)}); - setConvergent(ReadLane); - NewV = ReadLane; + NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {}, + {NewV, B.getInt32(63)}); } else { llvm_unreachable("Unhandled atomic bit width"); } + + // Finally mark the readlanes in the WWM section. + NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, NewV); } else { - // Get the total number of active lanes we have by using popcount. - Instruction *const Ctpop = B.CreateUnaryIntrinsic(Intrinsic::ctpop, Exec); - Value *const CtpopCast = B.CreateIntCast(Ctpop, Ty, false); - - // Calculate the new value we will be contributing to the atomic operation - // for the entire wavefront. - NewV = B.CreateMul(V, CtpopCast); - LaneOffset = B.CreateMul(V, MbcntCast); + switch (Op) { + default: + llvm_unreachable("Unhandled atomic op"); + + case AtomicRMWInst::Add: + case AtomicRMWInst::Sub: { + // The new value we will be contributing to the atomic operation is the + // old value times the number of active lanes. + Value *const Ctpop = B.CreateIntCast( + B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false); + NewV = B.CreateMul(V, Ctpop); + break; + } + + case AtomicRMWInst::And: + case AtomicRMWInst::Or: + case AtomicRMWInst::Max: + case AtomicRMWInst::Min: + case AtomicRMWInst::UMax: + case AtomicRMWInst::UMin: + // These operations with a uniform value are idempotent: doing the atomic + // operation multiple times has the same effect as doing it once. + NewV = V; + break; + + case AtomicRMWInst::Xor: + // The new value we will be contributing to the atomic operation is the + // old value times the parity of the number of active lanes. + Value *const Ctpop = B.CreateIntCast( + B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false); + NewV = B.CreateMul(V, B.CreateAnd(Ctpop, 1)); + break; + } } // We only want a single lane to enter our new control flow, and we do this // by checking if there are any active lanes below us. Only one lane will // have 0 active lanes below us, so that will be the only one to progress. 
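// For illustration: with a ballot of 0b10110 (lanes 1, 2 and 4 active),
// mbcnt yields 0, 1 and 2 on those lanes, so lane 1 is the only one with
// mbcnt == 0 and the only one that falls through into the atomic block.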
- Value *const Cond = B.CreateICmpEQ(MbcntCast, B.getIntN(TyBitWidth, 0)); + Value *const Cond = B.CreateICmpEQ(Mbcnt, B.getIntN(TyBitWidth, 0)); // Store I's original basic block before we split the block. BasicBlock *const EntryBB = I.getParent(); @@ -401,20 +509,16 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I, B.CreateTrunc(B.CreateLShr(PHI, B.getInt64(32)), B.getInt32Ty()); CallInst *const ReadFirstLaneLo = B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo); - setConvergent(ReadFirstLaneLo); CallInst *const ReadFirstLaneHi = B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi); - setConvergent(ReadFirstLaneHi); Value *const PartialInsert = B.CreateInsertElement( UndefValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0)); Value *const Insert = B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1)); BroadcastI = B.CreateBitCast(Insert, Ty); } else if (TyBitWidth == 32) { - CallInst *const ReadFirstLane = - B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI); - setConvergent(ReadFirstLane); - BroadcastI = ReadFirstLane; + + BroadcastI = B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI); } else { llvm_unreachable("Unhandled atomic bit width"); } @@ -423,7 +527,31 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I, // get our individual lane's slice into the result. We use the lane offset we // previously calculated combined with the atomic result value we got from the // first lane, to get our lane's index into the atomic result. - Value *const Result = B.CreateBinOp(Op, BroadcastI, LaneOffset); + Value *LaneOffset = nullptr; + if (ValDivergent) { + LaneOffset = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, ExclScan); + } else { + switch (Op) { + default: + llvm_unreachable("Unhandled atomic op"); + case AtomicRMWInst::Add: + case AtomicRMWInst::Sub: + LaneOffset = B.CreateMul(V, Mbcnt); + break; + case AtomicRMWInst::And: + case AtomicRMWInst::Or: + case AtomicRMWInst::Max: + case AtomicRMWInst::Min: + case AtomicRMWInst::UMax: + case AtomicRMWInst::UMin: + LaneOffset = B.CreateSelect(Cond, Identity, V); + break; + case AtomicRMWInst::Xor: + LaneOffset = B.CreateMul(V, B.CreateAnd(Mbcnt, 1)); + break; + } + } + Value *const Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset); if (IsPixelShader) { // Need a final PHI to reconverge to above the helper lane branch mask. @@ -442,10 +570,6 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I, I.eraseFromParent(); } -void AMDGPUAtomicOptimizer::setConvergent(CallInst *const CI) const { - CI->addAttribute(AttributeList::FunctionIndex, Attribute::Convergent); -} - INITIALIZE_PASS_BEGIN(AMDGPUAtomicOptimizer, DEBUG_TYPE, "AMDGPU atomic optimizations", false, false) INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis) diff --git a/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/lib/Target/AMDGPU/AMDGPUCallLowering.cpp index daef37f9c21f..b107c357196d 100644 --- a/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -1,9 +1,8 @@ //===-- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp - Call lowering -----===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -21,28 +20,98 @@ #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/Support/LowLevelTypeImpl.h" using namespace llvm; +namespace { + +struct OutgoingArgHandler : public CallLowering::ValueHandler { + OutgoingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI, + MachineInstrBuilder MIB, CCAssignFn *AssignFn) + : ValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {} + + MachineInstrBuilder MIB; + + Register getStackAddress(uint64_t Size, int64_t Offset, + MachinePointerInfo &MPO) override { + llvm_unreachable("not implemented"); + } + + void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size, + MachinePointerInfo &MPO, CCValAssign &VA) override { + llvm_unreachable("not implemented"); + } + + void assignValueToReg(Register ValVReg, Register PhysReg, + CCValAssign &VA) override { + MIB.addUse(PhysReg); + MIRBuilder.buildCopy(PhysReg, ValVReg); + } + + bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, + const CallLowering::ArgInfo &Info, + CCState &State) override { + return AssignFn(ValNo, ValVT, LocVT, LocInfo, Info.Flags, State); + } +}; + +} + AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI) : CallLowering(&TLI) { } bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val, - ArrayRef<unsigned> VRegs) const { - // FIXME: Add support for non-void returns. 
- if (Val) + ArrayRef<Register> VRegs) const { + + MachineFunction &MF = MIRBuilder.getMF(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + MFI->setIfReturnsVoid(!Val); + + if (!Val) { + MIRBuilder.buildInstr(AMDGPU::S_ENDPGM).addImm(0); + return true; + } + + Register VReg = VRegs[0]; + + const Function &F = MF.getFunction(); + auto &DL = F.getParent()->getDataLayout(); + if (!AMDGPU::isShader(F.getCallingConv())) return false; - MIRBuilder.buildInstr(AMDGPU::S_ENDPGM); + + const AMDGPUTargetLowering &TLI = *getTLI<AMDGPUTargetLowering>(); + SmallVector<EVT, 4> SplitVTs; + SmallVector<uint64_t, 4> Offsets; + ArgInfo OrigArg{VReg, Val->getType()}; + setArgFlags(OrigArg, AttributeList::ReturnIndex, DL, F); + ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs, &Offsets, 0); + + SmallVector<ArgInfo, 8> SplitArgs; + CCAssignFn *AssignFn = CCAssignFnForReturn(F.getCallingConv(), false); + for (unsigned i = 0, e = Offsets.size(); i != e; ++i) { + Type *SplitTy = SplitVTs[i].getTypeForEVT(F.getContext()); + SplitArgs.push_back({VRegs[i], SplitTy, OrigArg.Flags, OrigArg.IsFixed}); + } + auto RetInstr = MIRBuilder.buildInstrNoInsert(AMDGPU::SI_RETURN_TO_EPILOG); + OutgoingArgHandler Handler(MIRBuilder, MRI, RetInstr, AssignFn); + if (!handleAssignments(MIRBuilder, SplitArgs, Handler)) + return false; + MIRBuilder.insertInstr(RetInstr); + return true; } -unsigned AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &MIRBuilder, +Register AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &MIRBuilder, Type *ParamTy, uint64_t Offset) const { @@ -53,12 +122,12 @@ unsigned AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &MIRBuilder, const DataLayout &DL = F.getParent()->getDataLayout(); PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUAS::CONSTANT_ADDRESS); LLT PtrType = getLLTForType(*PtrTy, DL); - unsigned DstReg = MRI.createGenericVirtualRegister(PtrType); - unsigned KernArgSegmentPtr = + Register DstReg = MRI.createGenericVirtualRegister(PtrType); + Register KernArgSegmentPtr = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); - unsigned KernArgSegmentVReg = MRI.getLiveInVirtReg(KernArgSegmentPtr); + Register KernArgSegmentVReg = MRI.getLiveInVirtReg(KernArgSegmentPtr); - unsigned OffsetReg = MRI.createGenericVirtualRegister(LLT::scalar(64)); + Register OffsetReg = MRI.createGenericVirtualRegister(LLT::scalar(64)); MIRBuilder.buildConstant(OffsetReg, Offset); MIRBuilder.buildGEP(DstReg, KernArgSegmentVReg, OffsetReg); @@ -69,14 +138,14 @@ unsigned AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &MIRBuilder, void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &MIRBuilder, Type *ParamTy, uint64_t Offset, unsigned Align, - unsigned DstReg) const { + Register DstReg) const { MachineFunction &MF = MIRBuilder.getMF(); const Function &F = MF.getFunction(); const DataLayout &DL = F.getParent()->getDataLayout(); PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUAS::CONSTANT_ADDRESS); MachinePointerInfo PtrInfo(UndefValue::get(PtrTy)); unsigned TypeSize = DL.getTypeStoreSize(ParamTy); - unsigned PtrReg = lowerParameterPtr(MIRBuilder, ParamTy, Offset); + Register PtrReg = lowerParameterPtr(MIRBuilder, ParamTy, Offset); MachineMemOperand *MMO = MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad | @@ -87,93 +156,233 @@ void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &MIRBuilder, MIRBuilder.buildLoad(DstReg, PtrReg, *MMO); } -bool 
AMDGPUCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, - const Function &F, - ArrayRef<unsigned> VRegs) const { - // AMDGPU_GS and AMDGP_HS are not supported yet. - if (F.getCallingConv() == CallingConv::AMDGPU_GS || - F.getCallingConv() == CallingConv::AMDGPU_HS) - return false; +static Register findFirstFreeSGPR(CCState &CCInfo) { + unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); + for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) { + if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) { + return AMDGPU::SGPR0 + Reg; + } + } + llvm_unreachable("Cannot allocate sgpr"); +} - MachineFunction &MF = MIRBuilder.getMF(); - const GCNSubtarget *Subtarget = &MF.getSubtarget<GCNSubtarget>(); +static void allocateSpecialEntryInputVGPRs(CCState &CCInfo, + MachineFunction &MF, + const SIRegisterInfo &TRI, + SIMachineFunctionInfo &Info) { + const LLT S32 = LLT::scalar(32); MachineRegisterInfo &MRI = MF.getRegInfo(); - SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); - const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo(); - const DataLayout &DL = F.getParent()->getDataLayout(); - SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext()); + if (Info.hasWorkItemIDX()) { + Register Reg = AMDGPU::VGPR0; + MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32); + + CCInfo.AllocateReg(Reg); + Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg)); + } + + if (Info.hasWorkItemIDY()) { + Register Reg = AMDGPU::VGPR1; + MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32); + + CCInfo.AllocateReg(Reg); + Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg)); + } + + if (Info.hasWorkItemIDZ()) { + Register Reg = AMDGPU::VGPR2; + MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32); + + CCInfo.AllocateReg(Reg); + Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg)); + } +} +// Allocate special inputs passed in user SGPRs. +static void allocateHSAUserSGPRs(CCState &CCInfo, + MachineIRBuilder &MIRBuilder, + MachineFunction &MF, + const SIRegisterInfo &TRI, + SIMachineFunctionInfo &Info) { // FIXME: How should these inputs interact with inreg / custom SGPR inputs? 
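// For orientation (not asserted by this patch): when a kernel requests all
// of these inputs, a typical gfx9 user-SGPR layout is s[0:3] private
// segment buffer, s[4:5] dispatch ptr, s[6:7] queue ptr, s[8:9] kernarg
// segment ptr, s[10:11] dispatch id, s[12:13] flat scratch init.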
- if (Info->hasPrivateSegmentBuffer()) { - unsigned PrivateSegmentBufferReg = Info->addPrivateSegmentBuffer(*TRI); - MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SReg_128RegClass); + if (Info.hasPrivateSegmentBuffer()) { + unsigned PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI); + MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass); CCInfo.AllocateReg(PrivateSegmentBufferReg); } - if (Info->hasDispatchPtr()) { - unsigned DispatchPtrReg = Info->addDispatchPtr(*TRI); - // FIXME: Need to add reg as live-in + if (Info.hasDispatchPtr()) { + unsigned DispatchPtrReg = Info.addDispatchPtr(TRI); + MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(DispatchPtrReg); } - if (Info->hasQueuePtr()) { - unsigned QueuePtrReg = Info->addQueuePtr(*TRI); - // FIXME: Need to add reg as live-in + if (Info.hasQueuePtr()) { + unsigned QueuePtrReg = Info.addQueuePtr(TRI); + MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(QueuePtrReg); } - if (Info->hasKernargSegmentPtr()) { - unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI); - const LLT P2 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); - unsigned VReg = MRI.createGenericVirtualRegister(P2); + if (Info.hasKernargSegmentPtr()) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + Register InputPtrReg = Info.addKernargSegmentPtr(TRI); + const LLT P4 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); + Register VReg = MRI.createGenericVirtualRegister(P4); MRI.addLiveIn(InputPtrReg, VReg); MIRBuilder.getMBB().addLiveIn(InputPtrReg); MIRBuilder.buildCopy(VReg, InputPtrReg); CCInfo.AllocateReg(InputPtrReg); } - if (Info->hasDispatchID()) { - unsigned DispatchIDReg = Info->addDispatchID(*TRI); - // FIXME: Need to add reg as live-in + if (Info.hasDispatchID()) { + unsigned DispatchIDReg = Info.addDispatchID(TRI); + MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(DispatchIDReg); } - if (Info->hasFlatScratchInit()) { - unsigned FlatScratchInitReg = Info->addFlatScratchInit(*TRI); - // FIXME: Need to add reg as live-in + if (Info.hasFlatScratchInit()) { + unsigned FlatScratchInitReg = Info.addFlatScratchInit(TRI); + MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(FlatScratchInitReg); } + // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read + // these from the dispatch pointer. +} + +static void allocateSystemSGPRs(CCState &CCInfo, + MachineFunction &MF, + SIMachineFunctionInfo &Info, + CallingConv::ID CallConv, + bool IsShader) { + const LLT S32 = LLT::scalar(32); + MachineRegisterInfo &MRI = MF.getRegInfo(); + + if (Info.hasWorkGroupIDX()) { + Register Reg = Info.addWorkGroupIDX(); + MRI.setType(MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass), S32); + CCInfo.AllocateReg(Reg); + } + + if (Info.hasWorkGroupIDY()) { + Register Reg = Info.addWorkGroupIDY(); + MRI.setType(MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass), S32); + CCInfo.AllocateReg(Reg); + } + + if (Info.hasWorkGroupIDZ()) { + unsigned Reg = Info.addWorkGroupIDZ(); + MRI.setType(MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass), S32); + CCInfo.AllocateReg(Reg); + } + + if (Info.hasWorkGroupInfo()) { + unsigned Reg = Info.addWorkGroupInfo(); + MRI.setType(MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass), S32); + CCInfo.AllocateReg(Reg); + } + + if (Info.hasPrivateSegmentWaveByteOffset()) { + // Scratch wave offset passed in system SGPR. 
+ unsigned PrivateSegmentWaveByteOffsetReg; + + if (IsShader) { + PrivateSegmentWaveByteOffsetReg = + Info.getPrivateSegmentWaveByteOffsetSystemSGPR(); + + // This is true if the scratch wave byte offset doesn't have a fixed + // location. + if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) { + PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo); + Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg); + } + } else + PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset(); + + MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass); + CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg); + } +} + +bool AMDGPUCallLowering::lowerFormalArgumentsKernel( + MachineIRBuilder &MIRBuilder, const Function &F, + ArrayRef<ArrayRef<Register>> VRegs) const { + MachineFunction &MF = MIRBuilder.getMF(); + const GCNSubtarget *Subtarget = &MF.getSubtarget<GCNSubtarget>(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo(); + const DataLayout &DL = F.getParent()->getDataLayout(); + + SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext()); + + allocateHSAUserSGPRs(CCInfo, MIRBuilder, MF, *TRI, *Info); + + unsigned i = 0; + const unsigned KernArgBaseAlign = 16; + const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset(F); + uint64_t ExplicitArgOffset = 0; + + // TODO: Align down to dword alignment and extract bits for extending loads. + for (auto &Arg : F.args()) { + Type *ArgTy = Arg.getType(); + unsigned AllocSize = DL.getTypeAllocSize(ArgTy); + if (AllocSize == 0) + continue; + + unsigned ABIAlign = DL.getABITypeAlignment(ArgTy); + + uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset; + ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize; + + ArrayRef<Register> OrigArgRegs = VRegs[i]; + Register ArgReg = + OrigArgRegs.size() == 1 + ? OrigArgRegs[0] + : MRI.createGenericVirtualRegister(getLLTForType(*ArgTy, DL)); + unsigned Align = MinAlign(KernArgBaseAlign, ArgOffset); + ArgOffset = alignTo(ArgOffset, DL.getABITypeAlignment(ArgTy)); + lowerParameter(MIRBuilder, ArgTy, ArgOffset, Align, ArgReg); + if (OrigArgRegs.size() > 1) + unpackRegs(OrigArgRegs, ArgReg, ArgTy, MIRBuilder); + ++i; + } + + allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info); + allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false); + return true; +} + +bool AMDGPUCallLowering::lowerFormalArguments( + MachineIRBuilder &MIRBuilder, const Function &F, + ArrayRef<ArrayRef<Register>> VRegs) const { // The infrastructure for normal calling convention lowering is essentially // useless for kernels. We want to avoid any kind of legalization or argument // splitting. - if (F.getCallingConv() == CallingConv::AMDGPU_KERNEL) { - unsigned i = 0; - const unsigned KernArgBaseAlign = 16; - const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset(F); - uint64_t ExplicitArgOffset = 0; - - // TODO: Align down to dword alignment and extract bits for extending loads. 
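// Worked example of the shared offset math, assuming BaseOffset = 0 and a
// kernel signature (i32, i64, i16): the i32 lands at offset 0, the i64
// aligns up to 8, and the i16 follows at 16; MinAlign(16, ArgOffset) then
// gives the three loads alignments of 16, 8 and 16 respectively.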
- for (auto &Arg : F.args()) { - Type *ArgTy = Arg.getType(); - unsigned AllocSize = DL.getTypeAllocSize(ArgTy); - if (AllocSize == 0) - continue; + if (F.getCallingConv() == CallingConv::AMDGPU_KERNEL) + return lowerFormalArgumentsKernel(MIRBuilder, F, VRegs); - unsigned ABIAlign = DL.getABITypeAlignment(ArgTy); + // AMDGPU_GS and AMDGP_HS are not supported yet. + if (F.getCallingConv() == CallingConv::AMDGPU_GS || + F.getCallingConv() == CallingConv::AMDGPU_HS) + return false; + + MachineFunction &MF = MIRBuilder.getMF(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo(); + const DataLayout &DL = F.getParent()->getDataLayout(); - uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset; - ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize; + bool IsShader = AMDGPU::isShader(F.getCallingConv()); - unsigned Align = MinAlign(KernArgBaseAlign, ArgOffset); - ArgOffset = alignTo(ArgOffset, DL.getABITypeAlignment(ArgTy)); - lowerParameter(MIRBuilder, ArgTy, ArgOffset, Align, VRegs[i]); - ++i; - } + SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext()); - return true; + if (Info->hasImplicitBufferPtr()) { + unsigned ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI); + MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass); + CCInfo.AllocateReg(ImplicitBufferPtrReg); } unsigned NumArgs = F.arg_size(); @@ -186,7 +395,8 @@ bool AMDGPUCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, // We can only hanlde simple value types at the moment. ISD::ArgFlagsTy Flags; - ArgInfo OrigArg{VRegs[i], CurOrigArg->getType()}; + assert(VRegs[i].size() == 1 && "Can't lower into more than one register"); + ArgInfo OrigArg{VRegs[i][0], CurOrigArg->getType()}; setArgFlags(OrigArg, i + 1, DL, F); Flags.setOrigAlign(DL.getABITypeAlignment(CurOrigArg->getType())); @@ -239,11 +449,15 @@ bool AMDGPUCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, OrigArgIdx != NumArgs && i != ArgLocs.size(); ++Arg, ++OrigArgIdx) { if (Skipped.test(OrigArgIdx)) continue; - CCValAssign &VA = ArgLocs[i++]; - MRI.addLiveIn(VA.getLocReg(), VRegs[OrigArgIdx]); - MIRBuilder.getMBB().addLiveIn(VA.getLocReg()); - MIRBuilder.buildCopy(VRegs[OrigArgIdx], VA.getLocReg()); + assert(VRegs[OrigArgIdx].size() == 1 && + "Can't lower into more than 1 reg"); + CCValAssign &VA = ArgLocs[i++]; + MRI.addLiveIn(VA.getLocReg(), VRegs[OrigArgIdx][0]); + MIRBuilder.getMBB().addLiveIn(VA.getLocReg()); + MIRBuilder.buildCopy(VRegs[OrigArgIdx][0], VA.getLocReg()); } + + allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), IsShader); return true; } diff --git a/lib/Target/AMDGPU/AMDGPUCallLowering.h b/lib/Target/AMDGPU/AMDGPUCallLowering.h index ed859716218e..3599659cac6a 100644 --- a/lib/Target/AMDGPU/AMDGPUCallLowering.h +++ b/lib/Target/AMDGPU/AMDGPUCallLowering.h @@ -1,9 +1,8 @@ //===- lib/Target/AMDGPU/AMDGPUCallLowering.h - Call lowering -*- C++ -*---===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -23,20 +22,25 @@ namespace llvm { class AMDGPUTargetLowering; class AMDGPUCallLowering: public CallLowering { - unsigned lowerParameterPtr(MachineIRBuilder &MIRBuilder, Type *ParamTy, + Register lowerParameterPtr(MachineIRBuilder &MIRBuilder, Type *ParamTy, uint64_t Offset) const; void lowerParameter(MachineIRBuilder &MIRBuilder, Type *ParamTy, uint64_t Offset, unsigned Align, - unsigned DstReg) const; + Register DstReg) const; public: AMDGPUCallLowering(const AMDGPUTargetLowering &TLI); bool lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val, - ArrayRef<unsigned> VRegs) const override; + ArrayRef<Register> VRegs) const override; + + bool lowerFormalArgumentsKernel(MachineIRBuilder &MIRBuilder, + const Function &F, + ArrayRef<ArrayRef<Register>> VRegs) const; + bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F, - ArrayRef<unsigned> VRegs) const override; + ArrayRef<ArrayRef<Register>> VRegs) const override; static CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg); static CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg); }; diff --git a/lib/Target/AMDGPU/AMDGPUCallingConv.td b/lib/Target/AMDGPU/AMDGPUCallingConv.td index 367f120b5fa6..3688cd77542e 100644 --- a/lib/Target/AMDGPU/AMDGPUCallingConv.td +++ b/lib/Target/AMDGPU/AMDGPUCallingConv.td @@ -1,9 +1,8 @@ //===---- AMDCallingConv.td - Calling Conventions for Radeon GPUs ---------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -24,7 +23,16 @@ def CC_SI : CallingConv<[ SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15, SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23, SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29, SGPR30, SGPR31, - SGPR32, SGPR33, SGPR34, SGPR35, SGPR36, SGPR37, SGPR38, SGPR39 + SGPR32, SGPR33, SGPR34, SGPR35, SGPR36, SGPR37, SGPR38, SGPR39, + SGPR40, SGPR41, SGPR42, SGPR43, SGPR44, SGPR45, SGPR46, SGPR47, + SGPR48, SGPR49, SGPR50, SGPR51, SGPR52, SGPR53, SGPR54, SGPR55, + SGPR56, SGPR57, SGPR58, SGPR59, SGPR60, SGPR61, SGPR62, SGPR63, + SGPR64, SGPR65, SGPR66, SGPR67, SGPR68, SGPR69, SGPR70, SGPR71, + SGPR72, SGPR73, SGPR74, SGPR75, SGPR76, SGPR77, SGPR78, SGPR79, + SGPR80, SGPR81, SGPR82, SGPR83, SGPR84, SGPR85, SGPR86, SGPR87, + SGPR88, SGPR89, SGPR90, SGPR91, SGPR92, SGPR93, SGPR94, SGPR95, + SGPR96, SGPR97, SGPR98, SGPR99, SGPR100, SGPR101, SGPR102, SGPR103, + SGPR104, SGPR105 ]>>>, // We have no way of referring to the generated register tuples @@ -60,7 +68,16 @@ def RetCC_SI_Shader : CallingConv<[ SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15, SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23, SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29, SGPR30, SGPR31, - SGPR32, SGPR33, SGPR34, SGPR35, SGPR36, SGPR37, SGPR38, SGPR39 + SGPR32, SGPR33, SGPR34, SGPR35, SGPR36, SGPR37, SGPR38, SGPR39, + SGPR40, SGPR41, SGPR42, SGPR43, SGPR44, SGPR45, SGPR46, SGPR47, + SGPR48, SGPR49, SGPR50, SGPR51, SGPR52, SGPR53, SGPR54, SGPR55, + SGPR56, SGPR57, SGPR58, SGPR59, SGPR60, SGPR61, SGPR62, SGPR63, + SGPR64, SGPR65, SGPR66, SGPR67, SGPR68, SGPR69, SGPR70, SGPR71, + SGPR72, SGPR73, SGPR74, SGPR75, SGPR76, SGPR77, SGPR78, SGPR79, + SGPR80, SGPR81, SGPR82, SGPR83, SGPR84, SGPR85, SGPR86, SGPR87, + SGPR88, SGPR89, SGPR90, SGPR91, SGPR92, SGPR93, SGPR94, SGPR95, + SGPR96, SGPR97, SGPR98, SGPR99, SGPR100, SGPR101, SGPR102, SGPR103, + SGPR104, SGPR105 ]>>, // 32*4 + 4 is the minimum for a fetch shader with 32 outputs. @@ -93,12 +110,22 @@ def CSR_AMDGPU_VGPRs_32_255 : CalleeSavedRegs< (sequence "VGPR%u", 32, 255) >; -def CSR_AMDGPU_SGPRs_32_103 : CalleeSavedRegs< - (sequence "SGPR%u", 32, 103) +def CSR_AMDGPU_SGPRs_32_105 : CalleeSavedRegs< + (sequence "SGPR%u", 32, 105) +>; + +// Just to get the regmask, not for calling convention purposes. +def CSR_AMDGPU_AllVGPRs : CalleeSavedRegs< + (sequence "VGPR%u", 0, 255) +>; + +// Just to get the regmask, not for calling convention purposes. 
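// (Background: a CalleeSavedRegs list also yields a register mask; these
// two defs exist only so passes can query those masks, e.g. to mark every
// VGPR or every allocatable SGPR as preserved or clobbered in one go.)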
+def CSR_AMDGPU_AllAllocatableSRegs : CalleeSavedRegs< + (add (sequence "SGPR%u", 0, 105), VCC_LO, VCC_HI) >; def CSR_AMDGPU_HighRegs : CalleeSavedRegs< - (add CSR_AMDGPU_VGPRs_32_255, CSR_AMDGPU_SGPRs_32_103) + (add CSR_AMDGPU_VGPRs_32_255, CSR_AMDGPU_SGPRs_32_105) >; // Calling convention for leaf functions @@ -111,10 +138,12 @@ def CC_AMDGPU_Func : CallingConv<[ VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15, VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23, VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31]>>, - CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64, v4i16, v4f16], CCCustom<"allocateVGPRTuple">>, + CCIfType<[i64, f64, v2i32, v2f32, v3i32, v3f32, v4i32, v4f32, v5i32, v5f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64, v4i16, v4f16], CCCustom<"allocateVGPRTuple">>, CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>>, CCIfType<[i64, f64, v2i32, v2f32], CCAssignToStack<8, 4>>, + CCIfType<[v3i32, v3f32], CCAssignToStack<12, 4>>, CCIfType<[v4i32, v4f32, v2i64, v2f64], CCAssignToStack<16, 4>>, + CCIfType<[v5i32, v5f32], CCAssignToStack<20, 4>>, CCIfType<[v8i32, v8f32], CCAssignToStack<32, 4>>, CCIfType<[v16i32, v16f32], CCAssignToStack<64, 4>> ]>; diff --git a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index 4dc1e67c573d..b750c6b5f6d2 100644 --- a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -1,9 +1,8 @@ //===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -62,6 +61,7 @@ class AMDGPUCodeGenPrepare : public FunctionPass, AssumptionCache *AC = nullptr; LegacyDivergenceAnalysis *DA = nullptr; Module *Mod = nullptr; + const DataLayout *DL = nullptr; bool HasUnsafeFPMath = false; /// Copies exact/nsw/nuw flags (if any) from binary operation \p I to @@ -134,6 +134,16 @@ class AMDGPUCodeGenPrepare : public FunctionPass, /// \returns True. bool promoteUniformBitreverseToI32(IntrinsicInst &I) const; + + unsigned numBitsUnsigned(Value *Op, unsigned ScalarSize) const; + unsigned numBitsSigned(Value *Op, unsigned ScalarSize) const; + bool isI24(Value *V, unsigned ScalarSize) const; + bool isU24(Value *V, unsigned ScalarSize) const; + + /// Replace mul instructions with llvm.amdgcn.mul.u24 or llvm.amdgcn.mul.i24. + /// SelectionDAG has an issue exploiting an `and` that asserts which bits are + /// known, so the rewrite is done here in IR instead. + bool replaceMulWithMul24(BinaryOperator &I) const; + /// Expands 24 bit div or rem. Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I, Value *Num, Value *Den, @@ -393,6 +403,118 @@ bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32( return true; } +unsigned AMDGPUCodeGenPrepare::numBitsUnsigned(Value *Op, + unsigned ScalarSize) const { + KnownBits Known = computeKnownBits(Op, *DL, 0, AC); + return ScalarSize - Known.countMinLeadingZeros(); +} + +unsigned AMDGPUCodeGenPrepare::numBitsSigned(Value *Op, + unsigned ScalarSize) const { + // In order for this to be a signed 24-bit value, bit 23 must + // be a sign bit.
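[Editor's aside] A worked example of the bit counting that follows (values and names assumed): if an i32 value is produced by `sext` from i24, ComputeNumSignBits() returns at least 9, so numBitsSigned() reports at most 23 and the operand qualifies as a signed 24-bit input; the pass can then form llvm.amdgcn.mul.i24, which maps to the hardware's 24-bit multiply instead of a full 32-bit multiply. The unsigned side is symmetric, and can be stated standalone:

    // Same arithmetic as numBitsUnsigned()/isU24() below: significant bits
    // equal the scalar width minus the bits known to be zero from the top.
    #include "llvm/Support/KnownBits.h"
    bool fitsInU24(const llvm::KnownBits &Known, unsigned ScalarSize) {
      return ScalarSize - Known.countMinLeadingZeros() <= 24;
    }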
+ return ScalarSize - ComputeNumSignBits(Op, *DL, 0, AC); +} + +bool AMDGPUCodeGenPrepare::isI24(Value *V, unsigned ScalarSize) const { + return ScalarSize >= 24 && // Types less than 24-bit should be treated + // as unsigned 24-bit values. + numBitsSigned(V, ScalarSize) < 24; +} + +bool AMDGPUCodeGenPrepare::isU24(Value *V, unsigned ScalarSize) const { + return numBitsUnsigned(V, ScalarSize) <= 24; +} + +static void extractValues(IRBuilder<> &Builder, + SmallVectorImpl<Value *> &Values, Value *V) { + VectorType *VT = dyn_cast<VectorType>(V->getType()); + if (!VT) { + Values.push_back(V); + return; + } + + for (int I = 0, E = VT->getNumElements(); I != E; ++I) + Values.push_back(Builder.CreateExtractElement(V, I)); +} + +static Value *insertValues(IRBuilder<> &Builder, + Type *Ty, + SmallVectorImpl<Value *> &Values) { + if (Values.size() == 1) + return Values[0]; + + Value *NewVal = UndefValue::get(Ty); + for (int I = 0, E = Values.size(); I != E; ++I) + NewVal = Builder.CreateInsertElement(NewVal, Values[I], I); + + return NewVal; +} + +bool AMDGPUCodeGenPrepare::replaceMulWithMul24(BinaryOperator &I) const { + if (I.getOpcode() != Instruction::Mul) + return false; + + Type *Ty = I.getType(); + unsigned Size = Ty->getScalarSizeInBits(); + if (Size <= 16 && ST->has16BitInsts()) + return false; + + // Prefer scalar if this could be s_mul_i32 + if (DA->isUniform(&I)) + return false; + + Value *LHS = I.getOperand(0); + Value *RHS = I.getOperand(1); + IRBuilder<> Builder(&I); + Builder.SetCurrentDebugLocation(I.getDebugLoc()); + + Intrinsic::ID IntrID = Intrinsic::not_intrinsic; + + // TODO: Should this try to match mulhi24? + if (ST->hasMulU24() && isU24(LHS, Size) && isU24(RHS, Size)) { + IntrID = Intrinsic::amdgcn_mul_u24; + } else if (ST->hasMulI24() && isI24(LHS, Size) && isI24(RHS, Size)) { + IntrID = Intrinsic::amdgcn_mul_i24; + } else + return false; + + SmallVector<Value *, 4> LHSVals; + SmallVector<Value *, 4> RHSVals; + SmallVector<Value *, 4> ResultVals; + extractValues(Builder, LHSVals, LHS); + extractValues(Builder, RHSVals, RHS); + + + IntegerType *I32Ty = Builder.getInt32Ty(); + FunctionCallee Intrin = Intrinsic::getDeclaration(Mod, IntrID); + for (int I = 0, E = LHSVals.size(); I != E; ++I) { + Value *LHS, *RHS; + if (IntrID == Intrinsic::amdgcn_mul_u24) { + LHS = Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty); + RHS = Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty); + } else { + LHS = Builder.CreateSExtOrTrunc(LHSVals[I], I32Ty); + RHS = Builder.CreateSExtOrTrunc(RHSVals[I], I32Ty); + } + + Value *Result = Builder.CreateCall(Intrin, {LHS, RHS}); + + if (IntrID == Intrinsic::amdgcn_mul_u24) { + ResultVals.push_back(Builder.CreateZExtOrTrunc(Result, + LHSVals[I]->getType())); + } else { + ResultVals.push_back(Builder.CreateSExtOrTrunc(Result, + LHSVals[I]->getType())); + } + } + + I.replaceAllUsesWith(insertValues(Builder, Ty, ResultVals)); + I.eraseFromParent(); + + return true; +} + static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv, bool HasDenormals) { const ConstantFP *CNum = dyn_cast<ConstantFP>(Num); if (!CNum) @@ -757,6 +879,9 @@ bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) { DA->isUniform(&I) && promoteUniformOpToI32(I)) return true; + if (replaceMulWithMul24(I)) + return true; + bool Changed = false; Instruction::BinaryOps Opc = I.getOpcode(); Type *Ty = I.getType(); @@ -807,7 +932,7 @@ bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) { Type *I32Ty = Builder.getInt32Ty(); Type *PT = PointerType::get(I32Ty, I.getPointerAddressSpace()); Value 
*BitCast= Builder.CreateBitCast(I.getPointerOperand(), PT); - LoadInst *WidenLoad = Builder.CreateLoad(BitCast); + LoadInst *WidenLoad = Builder.CreateLoad(I32Ty, BitCast); WidenLoad->copyMetadata(I); // If we have range metadata, we need to convert the type, and not make @@ -883,6 +1008,7 @@ bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst &I) { bool AMDGPUCodeGenPrepare::doInitialization(Module &M) { Mod = &M; + DL = &Mod->getDataLayout(); return false; } diff --git a/lib/Target/AMDGPU/AMDGPUFeatures.td b/lib/Target/AMDGPU/AMDGPUFeatures.td index 3c7d8a8fc550..ea3952c316e4 100644 --- a/lib/Target/AMDGPU/AMDGPUFeatures.td +++ b/lib/Target/AMDGPU/AMDGPUFeatures.td @@ -1,9 +1,8 @@ //===-- AMDGPUFeatures.td - AMDGPU Feature Definitions -----*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -50,17 +49,12 @@ def FeatureWavefrontSize16 : SubtargetFeatureWavefrontSize<16>; def FeatureWavefrontSize32 : SubtargetFeatureWavefrontSize<32>; def FeatureWavefrontSize64 : SubtargetFeatureWavefrontSize<64>; -class SubtargetFeatureGeneration <string Value, string Subtarget, +class SubtargetFeatureGeneration <string Value, string FeatureName, + string Subtarget, list<SubtargetFeature> Implies> : - SubtargetFeature <Value, "Gen", Subtarget#"::"#Value, + SubtargetFeature <FeatureName, "Gen", Subtarget#"::"#Value, Value#" GPU generation", Implies>; -def FeatureDX10Clamp : SubtargetFeature<"dx10-clamp", - "DX10Clamp", - "true", - "clamp modifier clamps NaNs to 0.0" ->; - def FeaturePromoteAlloca : SubtargetFeature <"promote-alloca", "EnablePromoteAlloca", "true", diff --git a/lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp b/lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp index 6e2a981d3396..9ba04d113c70 100644 --- a/lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp +++ b/lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp @@ -1,9 +1,8 @@ //===-- AMDGPUFixFunctionBitcasts.cpp - Fix function bitcasts -------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// diff --git a/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp b/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp index e32ca9653b3a..e80797736363 100644 --- a/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp @@ -1,9 +1,8 @@ //===----------------------- AMDGPUFrameLowering.cpp ----------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //==-----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/AMDGPUFrameLowering.h b/lib/Target/AMDGPU/AMDGPUFrameLowering.h index ee836bf8a631..48b64488303e 100644 --- a/lib/Target/AMDGPU/AMDGPUFrameLowering.h +++ b/lib/Target/AMDGPU/AMDGPUFrameLowering.h @@ -1,9 +1,8 @@ //===--------------------- AMDGPUFrameLowering.h ----------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/AMDGPUGISel.td b/lib/Target/AMDGPU/AMDGPUGISel.td index 59bb2a16e0f3..cad4c2ef404c 100644 --- a/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/lib/Target/AMDGPU/AMDGPUGISel.td @@ -1,9 +1,8 @@ //===-- AMDGPUGIsel.td - AMDGPU GlobalISel Patterns---------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // This file contains patterns that should only be used by GlobalISel. For @@ -13,6 +12,10 @@ include "AMDGPU.td" +def p0 : PtrValueType<i64, 0>; +def p1 : PtrValueType<i64, 1>; +def p4 : PtrValueType<i64, 4>; + def sd_vsrc0 : ComplexPattern<i32, 1, "">; def gi_vsrc0 : GIComplexOperandMatcher<s32, "selectVSRC0">, @@ -35,6 +38,33 @@ def gi_vop3omods : GIComplexOperandMatcher<s32, "selectVOP3OMods">, GIComplexPatternEquiv<VOP3OMods>; +def gi_smrd_imm : + GIComplexOperandMatcher<s64, "selectSmrdImm">, + GIComplexPatternEquiv<SMRDImm>; + +def gi_smrd_imm32 : + GIComplexOperandMatcher<s64, "selectSmrdImm32">, + GIComplexPatternEquiv<SMRDImm32>; + +def gi_smrd_sgpr : + GIComplexOperandMatcher<s64, "selectSmrdSgpr">, + GIComplexPatternEquiv<SMRDSgpr>; + +def gi_flat_offset : + GIComplexOperandMatcher<s64, "selectFlatOffset">, + GIComplexPatternEquiv<FLATOffset>; +def gi_flat_offset_signed : + GIComplexOperandMatcher<s64, "selectFlatOffsetSigned">, + GIComplexPatternEquiv<FLATOffsetSigned>; + +def gi_mubuf_scratch_offset : + GIComplexOperandMatcher<s32, "selectMUBUFScratchOffset">, + GIComplexPatternEquiv<MUBUFScratchOffset>; +def gi_mubuf_scratch_offen : + GIComplexOperandMatcher<s32, "selectMUBUFScratchOffen">, + GIComplexPatternEquiv<MUBUFScratchOffen>; + + class GISelSop2Pat < SDPatternOperator node, Instruction inst, @@ -113,15 +143,6 @@ multiclass GISelVop2IntrPat < def : GISelSop2Pat <or, S_OR_B32, i32>; def : GISelVop2Pat <or, V_OR_B32_e32, i32>; -def : GISelSop2Pat <sra, S_ASHR_I32, i32>; -let AddedComplexity = 100 in { -let SubtargetPredicate = isSICI in { -def : GISelVop2Pat <sra, V_ASHR_I32_e32, i32>; -} -def : GISelVop2CommutePat <sra, V_ASHRREV_I32_e32, i32>; -} -def : GISelVop3Pat2CommutePat <sra, V_ASHRREV_I32_e64, i32>; - // FIXME: We can't re-use SelectionDAG patterns here because they match // against a custom SDNode and we would need to create
a generic machine // instruction that is equivalent to the custom SDNode. This would also require @@ -135,3 +156,11 @@ defm : GISelVop2IntrPat <int_maxnum, V_MAX_F32_e32, f32>; def : GISelVop3Pat2ModsPat <int_maxnum, V_MAX_F64, f64>; defm : GISelVop2IntrPat <int_minnum, V_MIN_F32_e32, f32>; def : GISelVop3Pat2ModsPat <int_minnum, V_MIN_F64, f64>; + +// Since GlobalISel is more flexible than SelectionDAG, I think we can get +// away with adding patterns for integer types and not legalizing all +// loads and stores to vector types. This should help simplify the load/store +// legalization. +foreach Ty = [i64, p0, p1, p4] in { + defm : SMRD_Pattern <"S_LOAD_DWORDX2", Ty>; +} diff --git a/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def b/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def index 6eab59ab4e09..0a1f48231b18 100644 --- a/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def +++ b/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def @@ -1,9 +1,8 @@ //===- AMDGPUGenRegisterBankInfo.def -----------------------------*- C++ -*-==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file @@ -92,6 +91,28 @@ const RegisterBankInfo::ValueMapping ValMappings[] { {&PartMappings[17], 1} }; +const RegisterBankInfo::PartialMapping SGPROnly64BreakDown[] { + /*32-bit op*/ {0, 32, SGPRRegBank}, + /*2x32-bit op*/ {0, 32, SGPRRegBank}, + {32, 32, SGPRRegBank}, +/*<2x32-bit> op*/ {0, 64, SGPRRegBank}, + + /*32-bit op*/ {0, 32, VGPRRegBank}, + /*2x32-bit op*/ {0, 32, VGPRRegBank}, + {32, 32, VGPRRegBank}, +}; + + +// For some instructions which can operate on 64 bits only in the scalar version.
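[Editor's aside] The foreach just above stamps out one scalar-load pattern per type, so 64-bit integers and 64-bit pointers all reuse S_LOAD_DWORDX2 directly. On the C++ side this only works if the legalizer declares those pointer-typed loads legal; a hedged sketch of what such rules look like (a fragment that would live in a LegalizerInfo constructor; the exact in-tree rule set and alignment values differ):

    const LLT S64 = LLT::scalar(64);
    const LLT P1 = LLT::pointer(1, 64);   // global
    const LLT P4 = LLT::pointer(4, 64);   // constant
    // Entries are {result type, pointer type, memory size in bits, alignment}.
    getActionDefinitionsBuilder(G_LOAD)
        .legalForTypesWithMemDesc({{S64, P4, 64, 8},
                                   {P1, P4, 64, 8},
                                   {P4, P4, 64, 8}});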
+const RegisterBankInfo::ValueMapping ValMappingsSGPR64OnlyVGPR32[] { + /*32-bit sgpr*/ {&SGPROnly64BreakDown[0], 1}, + /*2 x 32-bit sgpr*/ {&SGPROnly64BreakDown[1], 2}, + /*64-bit sgpr */ {&SGPROnly64BreakDown[3], 1}, + + /*32-bit vgpr*/ {&SGPROnly64BreakDown[4], 1}, + /*2 x 32-bit vgpr*/ {&SGPROnly64BreakDown[5], 2} +}; + enum ValueMappingIdx { SCCStartIdx = 0, SGPRStartIdx = 2, @@ -128,5 +149,89 @@ const RegisterBankInfo::ValueMapping *getValueMapping(unsigned BankID, return &ValMappings[Idx]; } +const RegisterBankInfo::ValueMapping *getValueMappingSGPR64Only(unsigned BankID, + unsigned Size) { + if (Size != 64) + return getValueMapping(BankID, Size); + + if (BankID == AMDGPU::VGPRRegBankID) + return &ValMappingsSGPR64OnlyVGPR32[4]; + + assert(BankID == AMDGPU::SGPRRegBankID); + return &ValMappingsSGPR64OnlyVGPR32[2]; +} + +const RegisterBankInfo::PartialMapping LoadSGPROnlyBreakDown[] { + /* 256-bit load */ {0, 256, SGPRRegBank}, + /* 512-bit load */ {0, 512, SGPRRegBank}, + /* 8 32-bit loads */ {0, 32, VGPRRegBank}, {32, 32, VGPRRegBank}, + {64, 32, VGPRRegBank}, {96, 32, VGPRRegBank}, + {128, 32, VGPRRegBank}, {160, 32, VGPRRegBank}, + {192, 32, VGPRRegBank}, {224, 32, VGPRRegBank}, + /* 16 32-bit loads */ {0, 32, VGPRRegBank}, {32, 32, VGPRRegBank}, + {64, 32, VGPRRegBank}, {96, 32, VGPRRegBank}, + {128, 32, VGPRRegBank}, {160, 32, VGPRRegBank}, + {192, 32, VGPRRegBank}, {224, 32, VGPRRegBank}, + {256, 32, VGPRRegBank}, {288, 32, VGPRRegBank}, + {320, 32, VGPRRegBank}, {352, 32, VGPRRegBank}, + {384, 32, VGPRRegBank}, {416, 32, VGPRRegBank}, + {448, 32, VGPRRegBank}, {480, 32, VGPRRegBank}, + /* 4 64-bit loads */ {0, 64, VGPRRegBank}, {64, 64, VGPRRegBank}, + {128, 64, VGPRRegBank}, {192, 64, VGPRRegBank}, + /* 8 64-bit loads */ {0, 64, VGPRRegBank}, {64, 64, VGPRRegBank}, + {128, 64, VGPRRegBank}, {192, 64, VGPRRegBank}, + {256, 64, VGPRRegBank}, {320, 64, VGPRRegBank}, + {384, 64, VGPRRegBank}, {448, 64, VGPRRegBank}, + + /* FIXME: The generic register bank select does not support complex + * break downs where the number of vector elements does not equal the + * number of breakdowns. + * FIXME: register bank select now tries to handle complex break downs, + * but it emits an illegal instruction: + * %1:vgpr(<8 x s32>) = G_CONCAT_VECTORS %2:vgpr(s128), %3:vgpr(s128) + */ + /* 2 128-bit loads */ {0, 128, VGPRRegBank}, {128, 128, VGPRRegBank}, + /* 4 128-bit loads */ {0, 128, VGPRRegBank}, {128, 128, VGPRRegBank}, + {256, 128, VGPRRegBank}, {384, 128, VGPRRegBank} +}; + +const RegisterBankInfo::ValueMapping ValMappingsLoadSGPROnly[] { + /* 256-bit load */ {&LoadSGPROnlyBreakDown[0], 1}, + /* 512-bit load */ {&LoadSGPROnlyBreakDown[1], 1}, + /* <8 x i32> load */ {&LoadSGPROnlyBreakDown[2], 8}, + /* <16 x i32> load */ {&LoadSGPROnlyBreakDown[10], 16}, + /* <4 x i64> load */ {&LoadSGPROnlyBreakDown[26], 4}, + /* <8 x i64> load */ {&LoadSGPROnlyBreakDown[30], 8} +}; + +const RegisterBankInfo::ValueMapping * +getValueMappingLoadSGPROnly(unsigned BankID, LLT SizeTy) { + unsigned Size = SizeTy.getSizeInBits(); + if (Size < 256 || BankID == AMDGPU::SGPRRegBankID) + return getValueMapping(BankID, Size); + + assert((Size == 256 || Size == 512) && BankID == AMDGPU::VGPRRegBankID); + + // Default to using the non-split ValueMappings, we will use these if + // the register bank is SGPR or if we don't know how to handle the vector + // type. + unsigned Idx = Size == 256 ? 0 : 1; + + // We need to split this load if it has a vgpr pointer. 
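[Editor's aside] For orientation, a ValueMapping is a (pointer, count) pair into a table of PartialMappings, each of which is {StartIdx, Length, Bank}. So the SGPR64-only helper above returns, for a VGPR operand, a two-entry breakdown meaning "perform this 64-bit operation as two 32-bit halves on the vector unit". An illustrative check using the names from this file:

    const RegisterBankInfo::ValueMapping *M =
        getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, 64);
    assert(M->NumBreakDowns == 2);                               // split in two
    assert(M->BreakDown[0].StartIdx == 0  && M->BreakDown[0].Length == 32);
    assert(M->BreakDown[1].StartIdx == 32 && M->BreakDown[1].Length == 32);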
+ if (BankID == AMDGPU::VGPRRegBankID) { + if (SizeTy == LLT::vector(8, 32)) + Idx = 2; + else if (SizeTy == LLT::vector(16, 32)) + Idx = 3; + else if (SizeTy == LLT::vector(4, 64)) + Idx = 4; + else if (SizeTy == LLT::vector(8, 64)) + Idx = 5; + } + + return &ValMappingsLoadSGPROnly[Idx]; +} + + } // End AMDGPU namespace. } // End llvm namespace. diff --git a/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp index c38b0e61558b..b31de0af5018 100644 --- a/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp +++ b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp @@ -1,9 +1,8 @@ //===--- AMDGPUHSAMetadataStreamer.cpp --------------------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -240,23 +239,7 @@ MetadataStreamerV2::getHSACodeProps(const MachineFunction &MF, Kernel::DebugProps::Metadata MetadataStreamerV2::getHSADebugProps(const MachineFunction &MF, const SIProgramInfo &ProgramInfo) const { - const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); - HSAMD::Kernel::DebugProps::Metadata HSADebugProps; - - if (!STM.debuggerSupported()) - return HSADebugProps; - - HSADebugProps.mDebuggerABIVersion.push_back(1); - HSADebugProps.mDebuggerABIVersion.push_back(0); - - if (STM.debuggerEmitPrologue()) { - HSADebugProps.mPrivateSegmentBufferSGPR = - ProgramInfo.DebuggerPrivateSegmentBufferSGPR; - HSADebugProps.mWavefrontPrivateSegmentOffsetSGPR = - ProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR; - } - - return HSADebugProps; + return HSAMD::Kernel::DebugProps::Metadata(); } void MetadataStreamerV2::emitVersion() { @@ -452,6 +435,10 @@ void MetadataStreamerV2::emitHiddenKernelArgs(const Function &Func) { emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenNone); } } + + // Emit the pointer argument for multi-grid object. + if (HiddenArgNumBytes >= 56) + emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenMultiGridSyncArg); } bool MetadataStreamerV2::emitTo(AMDGPUTargetStreamer &TargetStreamer) { @@ -506,20 +493,16 @@ void MetadataStreamerV3::dump(StringRef HSAMetadataString) const { void MetadataStreamerV3::verify(StringRef HSAMetadataString) const { errs() << "AMDGPU HSA Metadata Parser Test: "; - std::shared_ptr<msgpack::Node> FromHSAMetadataString = - std::make_shared<msgpack::MapNode>(); + msgpack::Document FromHSAMetadataString; - yaml::Input YIn(HSAMetadataString); - YIn >> FromHSAMetadataString; - if (YIn.error()) { + if (!FromHSAMetadataString.fromYAML(HSAMetadataString)) { errs() << "FAIL\n"; return; } std::string ToHSAMetadataString; raw_string_ostream StrOS(ToHSAMetadataString); - yaml::Output YOut(StrOS); - YOut << FromHSAMetadataString; + FromHSAMetadataString.toYAML(StrOS); errs() << (HSAMetadataString == StrOS.str() ? 
"PASS" : "FAIL") << '\n'; if (HSAMetadataString != ToHSAMetadataString) { @@ -653,23 +636,23 @@ std::string MetadataStreamerV3::getTypeName(Type *Ty, bool Signed) const { } } -std::shared_ptr<msgpack::ArrayNode> +msgpack::ArrayDocNode MetadataStreamerV3::getWorkGroupDimensions(MDNode *Node) const { - auto Dims = std::make_shared<msgpack::ArrayNode>(); + auto Dims = HSAMetadataDoc->getArrayNode(); if (Node->getNumOperands() != 3) return Dims; for (auto &Op : Node->operands()) - Dims->push_back(std::make_shared<msgpack::ScalarNode>( - mdconst::extract<ConstantInt>(Op)->getZExtValue())); + Dims.push_back(Dims.getDocument()->getNode( + uint64_t(mdconst::extract<ConstantInt>(Op)->getZExtValue()))); return Dims; } void MetadataStreamerV3::emitVersion() { - auto Version = std::make_shared<msgpack::ArrayNode>(); - Version->push_back(std::make_shared<msgpack::ScalarNode>(V3::VersionMajor)); - Version->push_back(std::make_shared<msgpack::ScalarNode>(V3::VersionMinor)); - getRootMetadata("amdhsa.version") = std::move(Version); + auto Version = HSAMetadataDoc->getArrayNode(); + Version.push_back(Version.getDocument()->getNode(VersionMajor)); + Version.push_back(Version.getDocument()->getNode(VersionMinor)); + getRootMetadata("amdhsa.version") = Version; } void MetadataStreamerV3::emitPrintf(const Module &Mod) { @@ -677,16 +660,16 @@ void MetadataStreamerV3::emitPrintf(const Module &Mod) { if (!Node) return; - auto Printf = std::make_shared<msgpack::ArrayNode>(); + auto Printf = HSAMetadataDoc->getArrayNode(); for (auto Op : Node->operands()) if (Op->getNumOperands()) - Printf->push_back(std::make_shared<msgpack::ScalarNode>( - cast<MDString>(Op->getOperand(0))->getString())); - getRootMetadata("amdhsa.printf") = std::move(Printf); + Printf.push_back(Printf.getDocument()->getNode( + cast<MDString>(Op->getOperand(0))->getString(), /*Copy=*/true)); + getRootMetadata("amdhsa.printf") = Printf; } void MetadataStreamerV3::emitKernelLanguage(const Function &Func, - msgpack::MapNode &Kern) { + msgpack::MapDocNode Kern) { // TODO: What about other languages? 
auto Node = Func.getParent()->getNamedMetadata("opencl.ocl.version"); if (!Node || !Node->getNumOperands()) @@ -695,77 +678,50 @@ void MetadataStreamerV3::emitKernelLanguage(const Function &Func, if (Op0->getNumOperands() <= 1) return; - Kern[".language"] = std::make_shared<msgpack::ScalarNode>("OpenCL C"); - auto LanguageVersion = std::make_shared<msgpack::ArrayNode>(); - LanguageVersion->push_back(std::make_shared<msgpack::ScalarNode>( + Kern[".language"] = Kern.getDocument()->getNode("OpenCL C"); + auto LanguageVersion = Kern.getDocument()->getArrayNode(); + LanguageVersion.push_back(Kern.getDocument()->getNode( mdconst::extract<ConstantInt>(Op0->getOperand(0))->getZExtValue())); - LanguageVersion->push_back(std::make_shared<msgpack::ScalarNode>( + LanguageVersion.push_back(Kern.getDocument()->getNode( mdconst::extract<ConstantInt>(Op0->getOperand(1))->getZExtValue())); - Kern[".language_version"] = std::move(LanguageVersion); + Kern[".language_version"] = LanguageVersion; } void MetadataStreamerV3::emitKernelAttrs(const Function &Func, - msgpack::MapNode &Kern) { + msgpack::MapDocNode Kern) { if (auto Node = Func.getMetadata("reqd_work_group_size")) Kern[".reqd_workgroup_size"] = getWorkGroupDimensions(Node); if (auto Node = Func.getMetadata("work_group_size_hint")) Kern[".workgroup_size_hint"] = getWorkGroupDimensions(Node); if (auto Node = Func.getMetadata("vec_type_hint")) { - Kern[".vec_type_hint"] = std::make_shared<msgpack::ScalarNode>(getTypeName( - cast<ValueAsMetadata>(Node->getOperand(0))->getType(), - mdconst::extract<ConstantInt>(Node->getOperand(1))->getZExtValue())); + Kern[".vec_type_hint"] = Kern.getDocument()->getNode( + getTypeName( + cast<ValueAsMetadata>(Node->getOperand(0))->getType(), + mdconst::extract<ConstantInt>(Node->getOperand(1))->getZExtValue()), + /*Copy=*/true); } if (Func.hasFnAttribute("runtime-handle")) { - Kern[".device_enqueue_symbol"] = std::make_shared<msgpack::ScalarNode>( - Func.getFnAttribute("runtime-handle").getValueAsString().str()); + Kern[".device_enqueue_symbol"] = Kern.getDocument()->getNode( + Func.getFnAttribute("runtime-handle").getValueAsString().str(), + /*Copy=*/true); } } void MetadataStreamerV3::emitKernelArgs(const Function &Func, - msgpack::MapNode &Kern) { + msgpack::MapDocNode Kern) { unsigned Offset = 0; - auto Args = std::make_shared<msgpack::ArrayNode>(); + auto Args = HSAMetadataDoc->getArrayNode(); for (auto &Arg : Func.args()) - emitKernelArg(Arg, Offset, *Args); - - emitHiddenKernelArgs(Func, Offset, *Args); - - // TODO: What about other languages? - if (Func.getParent()->getNamedMetadata("opencl.ocl.version")) { - auto &DL = Func.getParent()->getDataLayout(); - auto Int64Ty = Type::getInt64Ty(Func.getContext()); - - emitKernelArg(DL, Int64Ty, "hidden_global_offset_x", Offset, *Args); - emitKernelArg(DL, Int64Ty, "hidden_global_offset_y", Offset, *Args); - emitKernelArg(DL, Int64Ty, "hidden_global_offset_z", Offset, *Args); - - auto Int8PtrTy = - Type::getInt8PtrTy(Func.getContext(), AMDGPUAS::GLOBAL_ADDRESS); + emitKernelArg(Arg, Offset, Args); - // Emit "printf buffer" argument if printf is used, otherwise emit dummy - // "none" argument. - if (Func.getParent()->getNamedMetadata("llvm.printf.fmts")) - emitKernelArg(DL, Int8PtrTy, "hidden_printf_buffer", Offset, *Args); - else - emitKernelArg(DL, Int8PtrTy, "hidden_none", Offset, *Args); + emitHiddenKernelArgs(Func, Offset, Args); - // Emit "default queue" and "completion action" arguments if enqueue kernel - // is used, otherwise emit dummy "none" arguments. 
- if (Func.hasFnAttribute("calls-enqueue-kernel")) { - emitKernelArg(DL, Int8PtrTy, "hidden_default_queue", Offset, *Args); - emitKernelArg(DL, Int8PtrTy, "hidden_completion_action", Offset, *Args); - } else { - emitKernelArg(DL, Int8PtrTy, "hidden_none", Offset, *Args); - emitKernelArg(DL, Int8PtrTy, "hidden_none", Offset, *Args); - } - } - - Kern[".args"] = std::move(Args); + Kern[".args"] = Args; } void MetadataStreamerV3::emitKernelArg(const Argument &Arg, unsigned &Offset, - msgpack::ArrayNode &Args) { + msgpack::ArrayDocNode Args) { auto Func = Arg.getParent(); auto ArgNo = Arg.getArgNo(); const MDNode *Node; @@ -822,36 +778,35 @@ void MetadataStreamerV3::emitKernelArg(const Argument &Arg, unsigned &Offset, void MetadataStreamerV3::emitKernelArg(const DataLayout &DL, Type *Ty, StringRef ValueKind, unsigned &Offset, - msgpack::ArrayNode &Args, + msgpack::ArrayDocNode Args, unsigned PointeeAlign, StringRef Name, StringRef TypeName, StringRef BaseTypeName, StringRef AccQual, StringRef TypeQual) { - auto ArgPtr = std::make_shared<msgpack::MapNode>(); - auto &Arg = *ArgPtr; + auto Arg = Args.getDocument()->getMapNode(); if (!Name.empty()) - Arg[".name"] = std::make_shared<msgpack::ScalarNode>(Name); + Arg[".name"] = Arg.getDocument()->getNode(Name, /*Copy=*/true); if (!TypeName.empty()) - Arg[".type_name"] = std::make_shared<msgpack::ScalarNode>(TypeName); + Arg[".type_name"] = Arg.getDocument()->getNode(TypeName, /*Copy=*/true); auto Size = DL.getTypeAllocSize(Ty); auto Align = DL.getABITypeAlignment(Ty); - Arg[".size"] = std::make_shared<msgpack::ScalarNode>(Size); + Arg[".size"] = Arg.getDocument()->getNode(Size); Offset = alignTo(Offset, Align); - Arg[".offset"] = std::make_shared<msgpack::ScalarNode>(Offset); + Arg[".offset"] = Arg.getDocument()->getNode(Offset); Offset += Size; - Arg[".value_kind"] = std::make_shared<msgpack::ScalarNode>(ValueKind); + Arg[".value_kind"] = Arg.getDocument()->getNode(ValueKind, /*Copy=*/true); Arg[".value_type"] = - std::make_shared<msgpack::ScalarNode>(getValueType(Ty, BaseTypeName)); + Arg.getDocument()->getNode(getValueType(Ty, BaseTypeName), /*Copy=*/true); if (PointeeAlign) - Arg[".pointee_align"] = std::make_shared<msgpack::ScalarNode>(PointeeAlign); + Arg[".pointee_align"] = Arg.getDocument()->getNode(PointeeAlign); if (auto PtrTy = dyn_cast<PointerType>(Ty)) if (auto Qualifier = getAddressSpaceQualifier(PtrTy->getAddressSpace())) - Arg[".address_space"] = std::make_shared<msgpack::ScalarNode>(*Qualifier); + Arg[".address_space"] = Arg.getDocument()->getNode(*Qualifier, /*Copy=*/true); if (auto AQ = getAccessQualifier(AccQual)) - Arg[".access"] = std::make_shared<msgpack::ScalarNode>(*AQ); + Arg[".access"] = Arg.getDocument()->getNode(*AQ, /*Copy=*/true); // TODO: Emit Arg[".actual_access"]. 
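[Editor's aside] The rewrite running through these hunks replaces the shared_ptr-based msgpack node tree with llvm::msgpack::Document, where nodes are lightweight handles owned by the document, so all the std::make_shared plumbing disappears. A minimal self-contained sketch of that API (the kernel name and field values here are invented for illustration):

    #include "llvm/BinaryFormat/MsgPackDocument.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    void emitExample(raw_ostream &OS) {
      msgpack::Document Doc;
      auto Root = Doc.getRoot().getMap(/*Convert=*/true);
      auto Kern = Doc.getMapNode();
      Kern[".name"] = Doc.getNode("my_kernel", /*Copy=*/true);
      Kern[".wavefront_size"] = Doc.getNode(uint64_t(64));
      auto Kernels = Doc.getArrayNode();
      Kernels.push_back(Kern);
      Root["amdhsa.kernels"] = Kernels;
      Doc.toYAML(OS); // YAML form, as emitted for assembly output
    }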
@@ -859,21 +814,21 @@ void MetadataStreamerV3::emitKernelArg(const DataLayout &DL, Type *Ty, TypeQual.split(SplitTypeQuals, " ", -1, false); for (StringRef Key : SplitTypeQuals) { if (Key == "const") - Arg[".is_const"] = std::make_shared<msgpack::ScalarNode>(true); + Arg[".is_const"] = Arg.getDocument()->getNode(true); else if (Key == "restrict") - Arg[".is_restrict"] = std::make_shared<msgpack::ScalarNode>(true); + Arg[".is_restrict"] = Arg.getDocument()->getNode(true); else if (Key == "volatile") - Arg[".is_volatile"] = std::make_shared<msgpack::ScalarNode>(true); + Arg[".is_volatile"] = Arg.getDocument()->getNode(true); else if (Key == "pipe") - Arg[".is_pipe"] = std::make_shared<msgpack::ScalarNode>(true); + Arg[".is_pipe"] = Arg.getDocument()->getNode(true); } - Args.push_back(std::move(ArgPtr)); + Args.push_back(Arg); } void MetadataStreamerV3::emitHiddenKernelArgs(const Function &Func, unsigned &Offset, - msgpack::ArrayNode &Args) { + msgpack::ArrayDocNode Args) { int HiddenArgNumBytes = getIntegerAttribute(Func, "amdgpu-implicitarg-num-bytes", 0); @@ -913,56 +868,58 @@ void MetadataStreamerV3::emitHiddenKernelArgs(const Function &Func, emitKernelArg(DL, Int8PtrTy, "hidden_none", Offset, Args); } } + + // Emit the pointer argument for multi-grid object. + if (HiddenArgNumBytes >= 56) + emitKernelArg(DL, Int8PtrTy, "hidden_multigrid_sync_arg", Offset, Args); } -std::shared_ptr<msgpack::MapNode> +msgpack::MapDocNode MetadataStreamerV3::getHSAKernelProps(const MachineFunction &MF, const SIProgramInfo &ProgramInfo) const { const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); const Function &F = MF.getFunction(); - auto HSAKernelProps = std::make_shared<msgpack::MapNode>(); - auto &Kern = *HSAKernelProps; + auto Kern = HSAMetadataDoc->getMapNode(); unsigned MaxKernArgAlign; - Kern[".kernarg_segment_size"] = std::make_shared<msgpack::ScalarNode>( + Kern[".kernarg_segment_size"] = Kern.getDocument()->getNode( STM.getKernArgSegmentSize(F, MaxKernArgAlign)); Kern[".group_segment_fixed_size"] = - std::make_shared<msgpack::ScalarNode>(ProgramInfo.LDSSize); + Kern.getDocument()->getNode(ProgramInfo.LDSSize); Kern[".private_segment_fixed_size"] = - std::make_shared<msgpack::ScalarNode>(ProgramInfo.ScratchSize); + Kern.getDocument()->getNode(ProgramInfo.ScratchSize); Kern[".kernarg_segment_align"] = - std::make_shared<msgpack::ScalarNode>(std::max(uint32_t(4), MaxKernArgAlign)); + Kern.getDocument()->getNode(std::max(uint32_t(4), MaxKernArgAlign)); Kern[".wavefront_size"] = - std::make_shared<msgpack::ScalarNode>(STM.getWavefrontSize()); - Kern[".sgpr_count"] = std::make_shared<msgpack::ScalarNode>(ProgramInfo.NumSGPR); - Kern[".vgpr_count"] = std::make_shared<msgpack::ScalarNode>(ProgramInfo.NumVGPR); + Kern.getDocument()->getNode(STM.getWavefrontSize()); + Kern[".sgpr_count"] = Kern.getDocument()->getNode(ProgramInfo.NumSGPR); + Kern[".vgpr_count"] = Kern.getDocument()->getNode(ProgramInfo.NumVGPR); Kern[".max_flat_workgroup_size"] = - std::make_shared<msgpack::ScalarNode>(MFI.getMaxFlatWorkGroupSize()); + Kern.getDocument()->getNode(MFI.getMaxFlatWorkGroupSize()); Kern[".sgpr_spill_count"] = - std::make_shared<msgpack::ScalarNode>(MFI.getNumSpilledSGPRs()); + Kern.getDocument()->getNode(MFI.getNumSpilledSGPRs()); Kern[".vgpr_spill_count"] = - std::make_shared<msgpack::ScalarNode>(MFI.getNumSpilledVGPRs()); + Kern.getDocument()->getNode(MFI.getNumSpilledVGPRs()); - return HSAKernelProps; + return Kern; } bool 
MetadataStreamerV3::emitTo(AMDGPUTargetStreamer &TargetStreamer) { - return TargetStreamer.EmitHSAMetadata(getHSAMetadataRoot(), true); + return TargetStreamer.EmitHSAMetadata(*HSAMetadataDoc, true); } void MetadataStreamerV3::begin(const Module &Mod) { emitVersion(); emitPrintf(Mod); - getRootMetadata("amdhsa.kernels").reset(new msgpack::ArrayNode()); + getRootMetadata("amdhsa.kernels") = HSAMetadataDoc->getArrayNode(); } void MetadataStreamerV3::end() { std::string HSAMetadataString; raw_string_ostream StrOS(HSAMetadataString); - yaml::Output YOut(StrOS); - YOut << HSAMetadataRoot; + HSAMetadataDoc->toYAML(StrOS); if (DumpHSAMetadata) dump(StrOS.str()); @@ -973,25 +930,24 @@ void MetadataStreamerV3::end() { void MetadataStreamerV3::emitKernel(const MachineFunction &MF, const SIProgramInfo &ProgramInfo) { auto &Func = MF.getFunction(); - auto KernelProps = getHSAKernelProps(MF, ProgramInfo); + auto Kern = getHSAKernelProps(MF, ProgramInfo); assert(Func.getCallingConv() == CallingConv::AMDGPU_KERNEL || Func.getCallingConv() == CallingConv::SPIR_KERNEL); - auto &KernelsNode = getRootMetadata("amdhsa.kernels"); - auto Kernels = cast<msgpack::ArrayNode>(KernelsNode.get()); + auto Kernels = + getRootMetadata("amdhsa.kernels").getArray(/*Convert=*/true); { - auto &Kern = *KernelProps; - Kern[".name"] = std::make_shared<msgpack::ScalarNode>(Func.getName()); - Kern[".symbol"] = std::make_shared<msgpack::ScalarNode>( - (Twine(Func.getName()) + Twine(".kd")).str()); + Kern[".name"] = Kern.getDocument()->getNode(Func.getName()); + Kern[".symbol"] = Kern.getDocument()->getNode( + (Twine(Func.getName()) + Twine(".kd")).str(), /*Copy=*/true); emitKernelLanguage(Func, Kern); emitKernelAttrs(Func, Kern); emitKernelArgs(Func, Kern); } - Kernels->push_back(std::move(KernelProps)); + Kernels.push_back(Kern); } } // end namespace HSAMD diff --git a/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h index afc09baf952d..2eecddbd7b01 100644 --- a/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h +++ b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h @@ -1,9 +1,8 @@ //===--- AMDGPUHSAMetadataStreamer.h ----------------------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -19,7 +18,7 @@ #include "AMDGPU.h" #include "AMDKernelCodeT.h" #include "llvm/ADT/StringRef.h" -#include "llvm/BinaryFormat/MsgPackTypes.h" +#include "llvm/BinaryFormat/MsgPackDocument.h" #include "llvm/Support/AMDGPUMetadata.h" namespace llvm { @@ -52,8 +51,8 @@ public: class MetadataStreamerV3 final : public MetadataStreamer { private: - std::shared_ptr<msgpack::Node> HSAMetadataRoot = - std::make_shared<msgpack::MapNode>(); + std::unique_ptr<msgpack::Document> HSAMetadataDoc = + llvm::make_unique<msgpack::Document>(); void dump(StringRef HSAMetadataString) const; @@ -70,41 +69,39 @@ private: std::string getTypeName(Type *Ty, bool Signed) const; - std::shared_ptr<msgpack::ArrayNode> - getWorkGroupDimensions(MDNode *Node) const; + msgpack::ArrayDocNode getWorkGroupDimensions(MDNode *Node) const; - std::shared_ptr<msgpack::MapNode> - getHSAKernelProps(const MachineFunction &MF, - const SIProgramInfo &ProgramInfo) const; + msgpack::MapDocNode getHSAKernelProps(const MachineFunction &MF, + const SIProgramInfo &ProgramInfo) const; void emitVersion(); void emitPrintf(const Module &Mod); - void emitKernelLanguage(const Function &Func, msgpack::MapNode &Kern); + void emitKernelLanguage(const Function &Func, msgpack::MapDocNode Kern); - void emitKernelAttrs(const Function &Func, msgpack::MapNode &Kern); + void emitKernelAttrs(const Function &Func, msgpack::MapDocNode Kern); - void emitKernelArgs(const Function &Func, msgpack::MapNode &Kern); + void emitKernelArgs(const Function &Func, msgpack::MapDocNode Kern); void emitKernelArg(const Argument &Arg, unsigned &Offset, - msgpack::ArrayNode &Args); + msgpack::ArrayDocNode Args); void emitKernelArg(const DataLayout &DL, Type *Ty, StringRef ValueKind, - unsigned &Offset, msgpack::ArrayNode &Args, + unsigned &Offset, msgpack::ArrayDocNode Args, unsigned PointeeAlign = 0, StringRef Name = "", StringRef TypeName = "", StringRef BaseTypeName = "", StringRef AccQual = "", StringRef TypeQual = ""); void emitHiddenKernelArgs(const Function &Func, unsigned &Offset, - msgpack::ArrayNode &Args); + msgpack::ArrayDocNode Args); - std::shared_ptr<msgpack::Node> &getRootMetadata(StringRef Key) { - return (*cast<msgpack::MapNode>(HSAMetadataRoot.get()))[Key]; + msgpack::DocNode &getRootMetadata(StringRef Key) { + return HSAMetadataDoc->getRoot().getMap(/*Convert=*/true)[Key]; } - std::shared_ptr<msgpack::Node> &getHSAMetadataRoot() { - return HSAMetadataRoot; + msgpack::DocNode &getHSAMetadataRoot() { + return HSAMetadataDoc->getRoot(); } public: diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index a0a045e72a58..ea730539f834 100644 --- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1,9 +1,8 @@ //===-- AMDGPUISelDAGToDAG.cpp - A dag to dag inst selector for AMDGPU ----===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //==-----------------------------------------------------------------------===// // @@ -40,6 +39,9 @@ #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/BasicBlock.h" +#ifdef EXPENSIVE_CHECKS +#include "llvm/IR/Dominators.h" +#endif #include "llvm/IR/Instruction.h" #include "llvm/MC/MCInstrDesc.h" #include "llvm/Support/Casting.h" @@ -52,6 +54,8 @@ #include <new> #include <vector> +#define DEBUG_TYPE "isel" + using namespace llvm; namespace llvm { @@ -66,6 +70,57 @@ class R600InstrInfo; namespace { +static bool isNullConstantOrUndef(SDValue V) { + if (V.isUndef()) + return true; + + ConstantSDNode *Const = dyn_cast<ConstantSDNode>(V); + return Const != nullptr && Const->isNullValue(); +} + +static bool getConstantValue(SDValue N, uint32_t &Out) { + // This is only used for packed vectors, where using 0 for undef should + // always be good. + if (N.isUndef()) { + Out = 0; + return true; + } + + if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) { + Out = C->getAPIntValue().getSExtValue(); + return true; + } + + if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N)) { + Out = C->getValueAPF().bitcastToAPInt().getSExtValue(); + return true; + } + + return false; +} + +// TODO: Handle undef as zero +static SDNode *packConstantV2I16(const SDNode *N, SelectionDAG &DAG, + bool Negate = false) { + assert(N->getOpcode() == ISD::BUILD_VECTOR && N->getNumOperands() == 2); + uint32_t LHSVal, RHSVal; + if (getConstantValue(N->getOperand(0), LHSVal) && + getConstantValue(N->getOperand(1), RHSVal)) { + SDLoc SL(N); + uint32_t K = Negate ? + (-LHSVal & 0xffff) | (-RHSVal << 16) : + (LHSVal & 0xffff) | (RHSVal << 16); + return DAG.getMachineNode(AMDGPU::S_MOV_B32, SL, N->getValueType(0), + DAG.getTargetConstant(K, SL, MVT::i32)); + } + + return nullptr; +} + +static SDNode *packNegConstantV2I16(const SDNode *N, SelectionDAG &DAG) { + return packConstantV2I16(N, DAG, true); +} + /// AMDGPU specific code to select AMDGPU machine instructions for /// SelectionDAG operations.
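[Editor's aside] A worked example of the constant packing above, with assumed input constants: for build_vector (f16 1.0, f16 2.0), the two halves bitcast to 0x3C00 and 0x4000, so the whole vector folds to a single s_mov_b32 immediate:

    // K = (lo & 0xffff) | (hi << 16)
    static_assert(((0x3C00u & 0xffffu) | (0x4000u << 16)) == 0x40003C00u,
                  "v2f16 (1.0, 2.0) packs to one 32-bit immediate");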
class AMDGPUDAGToDAGISel : public SelectionDAGISel { @@ -84,12 +139,18 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<AMDGPUArgumentUsageInfo>(); - AU.addRequired<AMDGPUPerfHintAnalysis>(); AU.addRequired<LegacyDivergenceAnalysis>(); +#ifdef EXPENSIVE_CHECKS + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<LoopInfoWrapperPass>(); +#endif SelectionDAGISel::getAnalysisUsage(AU); } + bool matchLoadD16FromBuildVector(SDNode *N) const; + bool runOnMachineFunction(MachineFunction &MF) override; + void PreprocessISelDAG() override; void Select(SDNode *N) override; StringRef getPassName() const override; void PostprocessISelDAG() override; @@ -100,19 +161,24 @@ protected: private: std::pair<SDValue, SDValue> foldFrameIndex(SDValue N) const; bool isNoNanSrc(SDValue N) const; - bool isInlineImmediate(const SDNode *N) const; + bool isInlineImmediate(const SDNode *N, bool Negated = false) const; + bool isNegInlineImmediate(const SDNode *N) const { + return isInlineImmediate(N, true); + } + bool isVGPRImm(const SDNode *N) const; bool isUniformLoad(const SDNode *N) const; bool isUniformBr(const SDNode *N) const; MachineSDNode *buildSMovImm64(SDLoc &DL, uint64_t Val, EVT VT) const; - SDNode *glueCopyToM0(SDNode *N) const; + SDNode *glueCopyToM0LDSInit(SDNode *N) const; + SDNode *glueCopyToM0(SDNode *N, SDValue Val) const; const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const; virtual bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset); virtual bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset); - bool isDSOffsetLegal(const SDValue &Base, unsigned Offset, + bool isDSOffsetLegal(SDValue Base, unsigned Offset, unsigned OffsetBits) const; bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const; bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0, @@ -120,10 +186,10 @@ private: bool SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, SDValue &SOffset, SDValue &Offset, SDValue &Offen, SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC, - SDValue &TFE) const; + SDValue &TFE, SDValue &DLC) const; bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, SDValue &SOffset, SDValue &Offset, SDValue &GLC, - SDValue &SLC, SDValue &TFE) const; + SDValue &SLC, SDValue &TFE, SDValue &DLC) const; bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, SDValue &SOffset, SDValue &Offset, SDValue &SLC) const; @@ -136,19 +202,19 @@ private: bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset, SDValue &Offset, SDValue &GLC, SDValue &SLC, - SDValue &TFE) const; + SDValue &TFE, SDValue &DLC) const; bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, SDValue &Offset, SDValue &SLC) const; bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, SDValue &Offset) const; - bool SelectFlatAtomic(SDValue Addr, SDValue &VAddr, + bool SelectFlatAtomic(SDNode *N, SDValue Addr, SDValue &VAddr, SDValue &Offset, SDValue &SLC) const; - bool SelectFlatAtomicSigned(SDValue Addr, SDValue &VAddr, + bool SelectFlatAtomicSigned(SDNode *N, SDValue Addr, SDValue &VAddr, SDValue &Offset, SDValue &SLC) const; template <bool IsSigned> - bool SelectFlatOffset(SDValue Addr, SDValue &VAddr, + bool SelectFlatOffset(SDNode *N, SDValue Addr, SDValue &VAddr, SDValue &Offset, SDValue &SLC) const; bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset, @@ -164,6 +230,7 @@ private: bool SelectMOVRELOffset(SDValue 
Index, SDValue &Base, SDValue &Offset) const; bool SelectVOP3Mods_NNaN(SDValue In, SDValue &Src, SDValue &SrcMods) const; + bool SelectVOP3Mods_f32(SDValue In, SDValue &Src, SDValue &SrcMods) const; bool SelectVOP3ModsImpl(SDValue In, SDValue &Src, unsigned &SrcMods) const; bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const; bool SelectVOP3NoMods(SDValue In, SDValue &Src) const; @@ -193,11 +260,13 @@ private: bool SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src, unsigned &Mods) const; bool SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const; - bool SelectHi16Elt(SDValue In, SDValue &Src) const; + SDValue getHi16Elt(SDValue In) const; void SelectADD_SUB_I64(SDNode *N); + void SelectAddcSubb(SDNode *N); void SelectUADDO_USUBO(SDNode *N); void SelectDIV_SCALE(SDNode *N); + void SelectDIV_FMAS(SDNode *N); void SelectMAD_64_32(SDNode *N); void SelectFMA_W_CHAIN(SDNode *N); void SelectFMUL_W_CHAIN(SDNode *N); @@ -210,6 +279,10 @@ private: void SelectBRCOND(SDNode *N); void SelectFMAD_FMA(SDNode *N); void SelectATOMIC_CMP_SWAP(SDNode *N); + void SelectDSAppendConsume(SDNode *N, unsigned IntrID); + void SelectDS_GWS(SDNode *N, unsigned IntrID); + void SelectINTRINSIC_W_CHAIN(SDNode *N); + void SelectINTRINSIC_VOID(SDNode *N); protected: // Include the pieces autogenerated from the target description. @@ -235,11 +308,49 @@ public: SDValue &Offset) override; bool runOnMachineFunction(MachineFunction &MF) override; + + void PreprocessISelDAG() override {} + protected: // Include the pieces autogenerated from the target description. #include "R600GenDAGISel.inc" }; +static SDValue stripBitcast(SDValue Val) { + return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val; +} + +// Figure out if this is really an extract of the high 16-bits of a dword. +static bool isExtractHiElt(SDValue In, SDValue &Out) { + In = stripBitcast(In); + if (In.getOpcode() != ISD::TRUNCATE) + return false; + + SDValue Srl = In.getOperand(0); + if (Srl.getOpcode() == ISD::SRL) { + if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) { + if (ShiftAmt->getZExtValue() == 16) { + Out = stripBitcast(Srl.getOperand(0)); + return true; + } + } + } + + return false; +} + +// Look through operations that obscure just looking at the low 16-bits of the +// same register. 
+static SDValue stripExtractLoElt(SDValue In) { + if (In.getOpcode() == ISD::TRUNCATE) { + SDValue Src = In.getOperand(0); + if (Src.getValueType().getSizeInBits() == 32) + return stripBitcast(Src); + } + + return In; +} + } // end anonymous namespace INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISel, "amdgpu-isel", @@ -247,6 +358,10 @@ INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISel, "amdgpu-isel", INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfo) INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysis) INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis) +#ifdef EXPENSIVE_CHECKS +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +#endif INITIALIZE_PASS_END(AMDGPUDAGToDAGISel, "amdgpu-isel", "AMDGPU DAG->DAG Pattern Instruction Selection", false, false) @@ -265,10 +380,125 @@ FunctionPass *llvm::createR600ISelDag(TargetMachine *TM, } bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { +#ifdef EXPENSIVE_CHECKS + DominatorTree & DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + LoopInfo * LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + for (auto &L : LI->getLoopsInPreorder()) { + assert(L->isLCSSAForm(DT)); + } +#endif Subtarget = &MF.getSubtarget<GCNSubtarget>(); return SelectionDAGISel::runOnMachineFunction(MF); } +bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const { + assert(Subtarget->d16PreservesUnusedBits()); + MVT VT = N->getValueType(0).getSimpleVT(); + if (VT != MVT::v2i16 && VT != MVT::v2f16) + return false; + + SDValue Lo = N->getOperand(0); + SDValue Hi = N->getOperand(1); + + LoadSDNode *LdHi = dyn_cast<LoadSDNode>(stripBitcast(Hi)); + + // build_vector lo, (load ptr) -> load_d16_hi ptr, lo + // build_vector lo, (zextload ptr from i8) -> load_d16_hi_u8 ptr, lo + // build_vector lo, (sextload ptr from i8) -> load_d16_hi_i8 ptr, lo + + // Need to check for possible indirect dependencies on the other half of the + // vector to avoid introducing a cycle. + if (LdHi && Hi.hasOneUse() && !LdHi->isPredecessorOf(Lo.getNode())) { + SDVTList VTList = CurDAG->getVTList(VT, MVT::Other); + + SDValue TiedIn = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Lo); + SDValue Ops[] = { + LdHi->getChain(), LdHi->getBasePtr(), TiedIn + }; + + unsigned LoadOp = AMDGPUISD::LOAD_D16_HI; + if (LdHi->getMemoryVT() == MVT::i8) { + LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD ? + AMDGPUISD::LOAD_D16_HI_I8 : AMDGPUISD::LOAD_D16_HI_U8; + } else { + assert(LdHi->getMemoryVT() == MVT::i16); + } + + SDValue NewLoadHi = + CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdHi), VTList, + Ops, LdHi->getMemoryVT(), + LdHi->getMemOperand()); + + CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadHi); + CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdHi, 1), NewLoadHi.getValue(1)); + return true; + } + + // build_vector (load ptr), hi -> load_d16_lo ptr, hi + // build_vector (zextload ptr from i8), hi -> load_d16_lo_u8 ptr, hi + // build_vector (sextload ptr from i8), hi -> load_d16_lo_i8 ptr, hi + LoadSDNode *LdLo = dyn_cast<LoadSDNode>(stripBitcast(Lo)); + if (LdLo && Lo.hasOneUse()) { + SDValue TiedIn = getHi16Elt(Hi); + if (!TiedIn || LdLo->isPredecessorOf(TiedIn.getNode())) + return false; + + SDVTList VTList = CurDAG->getVTList(VT, MVT::Other); + unsigned LoadOp = AMDGPUISD::LOAD_D16_LO; + if (LdLo->getMemoryVT() == MVT::i8) { + LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD ? 
+ AMDGPUISD::LOAD_D16_LO_I8 : AMDGPUISD::LOAD_D16_LO_U8; + } else { + assert(LdLo->getMemoryVT() == MVT::i16); + } + + TiedIn = CurDAG->getNode(ISD::BITCAST, SDLoc(N), VT, TiedIn); + + SDValue Ops[] = { + LdLo->getChain(), LdLo->getBasePtr(), TiedIn + }; + + SDValue NewLoadLo = + CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdLo), VTList, + Ops, LdLo->getMemoryVT(), + LdLo->getMemOperand()); + + CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadLo); + CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdLo, 1), NewLoadLo.getValue(1)); + return true; + } + + return false; +} + +void AMDGPUDAGToDAGISel::PreprocessISelDAG() { + if (!Subtarget->d16PreservesUnusedBits()) + return; + + SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end(); + + bool MadeChange = false; + while (Position != CurDAG->allnodes_begin()) { + SDNode *N = &*--Position; + if (N->use_empty()) + continue; + + switch (N->getOpcode()) { + case ISD::BUILD_VECTOR: + MadeChange |= matchLoadD16FromBuildVector(N); + break; + default: + break; + } + } + + if (MadeChange) { + CurDAG->RemoveDeadNodes(); + LLVM_DEBUG(dbgs() << "After PreProcess:\n"; + CurDAG->dump();); + } +} + bool AMDGPUDAGToDAGISel::isNoNanSrc(SDValue N) const { if (TM.Options.NoNaNsFPMath) return true; @@ -280,14 +510,26 @@ bool AMDGPUDAGToDAGISel::isNoNanSrc(SDValue N) const { return CurDAG->isKnownNeverNaN(N); } -bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const { +bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N, + bool Negated) const { + if (N->isUndef()) + return true; + const SIInstrInfo *TII = Subtarget->getInstrInfo(); + if (Negated) { + if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) + return TII->isInlineConstant(-C->getAPIntValue()); + + if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N)) + return TII->isInlineConstant(-C->getValueAPF().bitcastToAPInt()); - if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) - return TII->isInlineConstant(C->getAPIntValue()); + } else { + if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) + return TII->isInlineConstant(C->getAPIntValue()); - if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N)) - return TII->isInlineConstant(C->getValueAPF().bitcastToAPInt()); + if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N)) + return TII->isInlineConstant(C->getValueAPF().bitcastToAPInt()); + } return false; } @@ -340,37 +582,48 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N, } } -SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N) const { - if (cast<MemSDNode>(N)->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS || - !Subtarget->ldsRequiresM0Init()) - return N; - +SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const { const SITargetLowering& Lowering = - *static_cast<const SITargetLowering*>(getTargetLowering()); + *static_cast<const SITargetLowering*>(getTargetLowering()); - // Write max value to m0 before each load operation + assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain"); - SDValue M0 = Lowering.copyToM0(*CurDAG, CurDAG->getEntryNode(), SDLoc(N), - CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32)); + SDValue M0 = Lowering.copyToM0(*CurDAG, N->getOperand(0), SDLoc(N), + Val); SDValue Glue = M0.getValue(1); SmallVector <SDValue, 8> Ops; - for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { - Ops.push_back(N->getOperand(i)); - } + Ops.push_back(M0); // Replace the chain. 
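[Editor's aside] The matchLoadD16FromBuildVector/PreprocessISelDAG machinery above rewrites build_vector nodes in place before selection. Conceptually, in DAG notation (node numbers illustrative):

    // before:  t3 = load<(load 2 from %p)> t0, t2
    //          t4: v2i16 = BUILD_VECTOR t1, t3       ; (lo half, loaded hi half)
    // after:   t5: v2i16 = LOAD_D16_HI t0, t2, (scalar_to_vector t1)
    // The d16 load writes only the high 16 bits of the 32-bit register and
    // preserves the low half, so the separate vector-insert step disappears.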
+ for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i) + Ops.push_back(N->getOperand(i)); + Ops.push_back(Glue); return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops); } +SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const { + unsigned AS = cast<MemSDNode>(N)->getAddressSpace(); + if (AS == AMDGPUAS::LOCAL_ADDRESS) { + if (Subtarget->ldsRequiresM0Init()) + return glueCopyToM0(N, CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32)); + } else if (AS == AMDGPUAS::REGION_ADDRESS) { + MachineFunction &MF = CurDAG->getMachineFunction(); + unsigned Value = MF.getInfo<SIMachineFunctionInfo>()->getGDSSize(); + return + glueCopyToM0(N, CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i32)); + } + return N; +} + MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm, EVT VT) const { SDNode *Lo = CurDAG->getMachineNode( AMDGPU::S_MOV_B32, DL, MVT::i32, - CurDAG->getConstant(Imm & 0xFFFFFFFF, DL, MVT::i32)); + CurDAG->getTargetConstant(Imm & 0xFFFFFFFF, DL, MVT::i32)); SDNode *Hi = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, - CurDAG->getConstant(Imm >> 32, DL, MVT::i32)); + CurDAG->getTargetConstant(Imm >> 32, DL, MVT::i32)); const SDValue Ops[] = { CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32), SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32), @@ -385,31 +638,23 @@ static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) { return AMDGPU::SReg_32_XM0RegClassID; case 2: return AMDGPU::SReg_64RegClassID; + case 3: + return AMDGPU::SGPR_96RegClassID; case 4: return AMDGPU::SReg_128RegClassID; + case 5: + return AMDGPU::SGPR_160RegClassID; case 8: return AMDGPU::SReg_256RegClassID; case 16: return AMDGPU::SReg_512RegClassID; + case 32: + return AMDGPU::SReg_1024RegClassID; } llvm_unreachable("invalid vector size"); } -static bool getConstantValue(SDValue N, uint32_t &Out) { - if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) { - Out = C->getAPIntValue().getZExtValue(); - return true; - } - - if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N)) { - Out = C->getValueAPF().bitcastToAPInt().getZExtValue(); - return true; - } - - return false; -} - void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) { EVT VT = N->getValueType(0); unsigned NumVectorElts = VT.getVectorNumElements(); @@ -423,12 +668,12 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) { return; } - assert(NumVectorElts <= 16 && "Vectors with more than 16 elements not " + assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not " "supported yet"); - // 16 = Max Num Vector Elements + // 32 = Max Num Vector Elements // 2 = 2 REG_SEQUENCE operands per element (value, subreg index) // 1 = Vector Register Class - SmallVector<SDValue, 16 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1); + SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1); RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32); bool IsRegSeq = true; @@ -470,10 +715,10 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { if (isa<AtomicSDNode>(N) || (Opc == AMDGPUISD::ATOMIC_INC || Opc == AMDGPUISD::ATOMIC_DEC || - Opc == AMDGPUISD::ATOMIC_LOAD_FADD || + Opc == ISD::ATOMIC_LOAD_FADD || Opc == AMDGPUISD::ATOMIC_LOAD_FMIN || Opc == AMDGPUISD::ATOMIC_LOAD_FMAX)) - N = glueCopyToM0(N); + N = glueCopyToM0LDSInit(N); switch (Opc) { default: @@ -491,6 +736,13 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { SelectADD_SUB_I64(N); return; } + case ISD::ADDCARRY: + case ISD::SUBCARRY: + if 
(N->getValueType(0) != MVT::i32) + break; + + SelectAddcSubb(N); + return; case ISD::UADDO: case ISD::USUBO: { SelectUADDO_USUBO(N); @@ -511,12 +763,8 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { unsigned NumVectorElts = VT.getVectorNumElements(); if (VT.getScalarSizeInBits() == 16) { if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) { - uint32_t LHSVal, RHSVal; - if (getConstantValue(N->getOperand(0), LHSVal) && - getConstantValue(N->getOperand(1), RHSVal)) { - uint32_t K = LHSVal | (RHSVal << 16); - CurDAG->SelectNodeTo(N, AMDGPU::S_MOV_B32, VT, - CurDAG->getTargetConstant(K, SDLoc(N), MVT::i32)); + if (SDNode *Packed = packConstantV2I16(N, *CurDAG)) { + ReplaceNode(N, Packed); return; } } @@ -571,7 +819,7 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { case ISD::STORE: case ISD::ATOMIC_LOAD: case ISD::ATOMIC_STORE: { - N = glueCopyToM0(N); + N = glueCopyToM0LDSInit(N); break; } @@ -606,6 +854,10 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { SelectDIV_SCALE(N); return; } + case AMDGPUISD::DIV_FMAS: { + SelectDIV_FMAS(N); + return; + } case AMDGPUISD::MAD_I64_I32: case AMDGPUISD::MAD_U64_U32: { SelectMAD_64_32(N); @@ -649,6 +901,16 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { SelectCode(N); return; } + + break; + } + case ISD::INTRINSIC_W_CHAIN: { + SelectINTRINSIC_W_CHAIN(N); + return; + } + case ISD::INTRINSIC_VOID: { + SelectINTRINSIC_VOID(N); + return; } } @@ -763,6 +1025,19 @@ void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) { ReplaceNode(N, RegSequence); } +void AMDGPUDAGToDAGISel::SelectAddcSubb(SDNode *N) { + SDLoc DL(N); + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + SDValue CI = N->getOperand(2); + + unsigned Opc = N->getOpcode() == ISD::ADDCARRY ? AMDGPU::V_ADDC_U32_e64 + : AMDGPU::V_SUBB_U32_e64; + CurDAG->SelectNodeTo( + N, Opc, N->getVTList(), + {LHS, RHS, CI, CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/}); +} + void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) { // The name of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned // carry out despite the _i32 name. These were renamed in VI to _U32. @@ -770,8 +1045,10 @@ void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) { unsigned Opc = N->getOpcode() == ISD::UADDO ? AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64; - CurDAG->SelectNodeTo(N, Opc, N->getVTList(), - { N->getOperand(0), N->getOperand(1) }); + CurDAG->SelectNodeTo( + N, Opc, N->getVTList(), + {N->getOperand(0), N->getOperand(1), + CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/}); } void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) { @@ -816,6 +1093,35 @@ void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) { CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops); } +void AMDGPUDAGToDAGISel::SelectDIV_FMAS(SDNode *N) { + const GCNSubtarget *ST = static_cast<const GCNSubtarget *>(Subtarget); + const SIRegisterInfo *TRI = ST->getRegisterInfo(); + + SDLoc SL(N); + EVT VT = N->getValueType(0); + + assert(VT == MVT::f32 || VT == MVT::f64); + + unsigned Opc + = (VT == MVT::f64) ? AMDGPU::V_DIV_FMAS_F64 : AMDGPU::V_DIV_FMAS_F32; + + SDValue CarryIn = N->getOperand(3); + // V_DIV_FMAS implicitly reads VCC. 
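Because the carry-in is consumed through an implicit physical-register read rather than a normal operand, the selector has to pin it into VCC with a CopyToReg and thread the resulting glue into the machine node, so the copy cannot be scheduled away from its consumer. A minimal sketch of that idiom, assuming a SelectionDAG `DAG`, a location `DL`, a physical register `PhysReg`, and a value `Val` (names are illustrative, not from this patch):

    // Copy Val into PhysReg; passing an empty SDValue() as the final
    // operand asks for a glue result in addition to the chain.
    SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, PhysReg,
                                    Val, SDValue());
    SDValue Chain = Copy;              // result 0: the chain
    SDValue Glue  = Copy.getValue(1);  // result 1: the glue

The chain and glue then become the trailing operands of the selected node, which is exactly what Ops[8] and Ops[9] carry below.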
+ SDValue VCC = CurDAG->getCopyToReg(CurDAG->getEntryNode(), SL, + TRI->getVCC(), CarryIn, SDValue()); + + SDValue Ops[10]; + + SelectVOP3Mods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]); + SelectVOP3Mods(N->getOperand(1), Ops[3], Ops[2]); + SelectVOP3Mods(N->getOperand(2), Ops[5], Ops[4]); + + Ops[8] = VCC; + Ops[9] = VCC.getValue(1); + + CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops); +} + // We need to handle this here because tablegen doesn't support matching // instructions with multiple outputs. void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) { @@ -829,13 +1135,13 @@ void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) { CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops); } -bool AMDGPUDAGToDAGISel::isDSOffsetLegal(const SDValue &Base, unsigned Offset, +bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset, unsigned OffsetBits) const { if ((OffsetBits == 16 && !isUInt<16>(Offset)) || (OffsetBits == 8 && !isUInt<8>(Offset))) return false; - if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS || + if (Subtarget->hasUsableDSOffset() || Subtarget->unsafeDSOffsetFoldingEnabled()) return true; @@ -871,13 +1177,20 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base, Zero, Addr.getOperand(1)); if (isDSOffsetLegal(Sub, ByteOffset, 16)) { + SmallVector<SDValue, 3> Opnds; + Opnds.push_back(Zero); + Opnds.push_back(Addr.getOperand(1)); + // FIXME: Select to VOP3 version for with-carry. - unsigned SubOp = Subtarget->hasAddNoCarry() ? - AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_I32_e32; + unsigned SubOp = AMDGPU::V_SUB_I32_e32; + if (Subtarget->hasAddNoCarry()) { + SubOp = AMDGPU::V_SUB_U32_e64; + Opnds.push_back( + CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit + } - MachineSDNode *MachineSub - = CurDAG->getMachineNode(SubOp, DL, MVT::i32, - Zero, Addr.getOperand(1)); + MachineSDNode *MachineSub = + CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds); Base = SDValue(MachineSub, 0); Offset = CurDAG->getTargetConstant(ByteOffset, DL, MVT::i16); @@ -945,12 +1258,18 @@ bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base, Zero, Addr.getOperand(1)); if (isDSOffsetLegal(Sub, DWordOffset1, 8)) { - unsigned SubOp = Subtarget->hasAddNoCarry() ? 
- AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_I32_e32; + SmallVector<SDValue, 3> Opnds; + Opnds.push_back(Zero); + Opnds.push_back(Addr.getOperand(1)); + unsigned SubOp = AMDGPU::V_SUB_I32_e32; + if (Subtarget->hasAddNoCarry()) { + SubOp = AMDGPU::V_SUB_U32_e64; + Opnds.push_back( + CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit + } MachineSDNode *MachineSub - = CurDAG->getMachineNode(SubOp, DL, MVT::i32, - Zero, Addr.getOperand(1)); + = CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds); Base = SDValue(MachineSub, 0); Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8); @@ -989,7 +1308,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &Offset, SDValue &Offen, SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC, - SDValue &TFE) const { + SDValue &TFE, SDValue &DLC) const { // Subtarget prefers to use flat instruction if (Subtarget->useFlatForGlobal()) return false; @@ -1001,6 +1320,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, if (!SLC.getNode()) SLC = CurDAG->getTargetConstant(0, DL, MVT::i1); TFE = CurDAG->getTargetConstant(0, DL, MVT::i1); + DLC = CurDAG->getTargetConstant(0, DL, MVT::i1); Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1); Offen = CurDAG->getTargetConstant(0, DL, MVT::i1); @@ -1079,15 +1399,16 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, SDValue &SOffset, SDValue &Offset, SDValue &GLC, - SDValue &SLC, SDValue &TFE) const { + SDValue &SLC, SDValue &TFE, + SDValue &DLC) const { SDValue Ptr, Offen, Idxen, Addr64; // addr64 bit was removed for volcanic islands. - if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) + if (!Subtarget->hasAddr64()) return false; if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, - GLC, SLC, TFE)) + GLC, SLC, TFE, DLC)) return false; ConstantSDNode *C = cast<ConstantSDNode>(Addr64); @@ -1109,9 +1430,9 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &Offset, SDValue &SLC) const { SLC = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i1); - SDValue GLC, TFE; + SDValue GLC, TFE, DLC; - return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE); + return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE, DLC); } static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) { @@ -1127,10 +1448,10 @@ std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0)); - // If we can resolve this to a frame index access, this is relative to the - // frame pointer SGPR. - return std::make_pair(TFI, CurDAG->getRegister(Info->getFrameOffsetReg(), - MVT::i32)); + // If we can resolve this to a frame index access, this will be relative to + // either the stack or frame pointer SGPR. 
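Both DS addressing selectors above gate folding on isDSOffsetLegal: the offset must fit the instruction encoding (16 unsigned bits for the single-offset DS forms, 8 bits for each half of the two-offset forms such as ds_read2_b32), and on the oldest targets, where the offset field is not usable with an arbitrary base, folding only proceeds when it can be proven safe or the unsafe-folding flag is set. A standalone restatement of the encoding half, as a sketch (the function name is illustrative):

    #include "llvm/Support/MathExtras.h"
    // True if Offset fits in the DS instruction's offset field.
    static bool fitsDSOffsetEncoding(uint64_t Offset, unsigned OffsetBits) {
      assert(OffsetBits == 8 || OffsetBits == 16);
      return OffsetBits == 16 ? llvm::isUInt<16>(Offset)
                              : llvm::isUInt<8>(Offset);
    }

In the 64-bit 4-byte-aligned case the offsets are in dword units, so a pair ending at dword offset 255 (byte offset 1020) is still encodable, while 256 is not.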
+ return std::make_pair( + TFI, CurDAG->getRegister(Info->getStackPtrOffsetReg(), MVT::i32)); } // If we don't know this private access is a local stack object, it needs to @@ -1236,13 +1557,13 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent, bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset, SDValue &Offset, SDValue &GLC, SDValue &SLC, - SDValue &TFE) const { + SDValue &TFE, SDValue &DLC) const { SDValue Ptr, VAddr, Offen, Idxen, Addr64; const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, - GLC, SLC, TFE)) + GLC, SLC, TFE, DLC)) return false; if (!cast<ConstantSDNode>(Offen)->getSExtValue() && @@ -1264,57 +1585,42 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, SDValue &Offset ) const { - SDValue GLC, SLC, TFE; + SDValue GLC, SLC, TFE, DLC; - return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE); + return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC); } bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, SDValue &Offset, SDValue &SLC) const { - SDValue GLC, TFE; + SDValue GLC, TFE, DLC; - return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE); + return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC); } template <bool IsSigned> -bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDValue Addr, +bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N, + SDValue Addr, SDValue &VAddr, SDValue &Offset, SDValue &SLC) const { - int64_t OffsetVal = 0; - - if (Subtarget->hasFlatInstOffsets() && - CurDAG->isBaseWithConstantOffset(Addr)) { - SDValue N0 = Addr.getOperand(0); - SDValue N1 = Addr.getOperand(1); - int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue(); - - if ((IsSigned && isInt<13>(COffsetVal)) || - (!IsSigned && isUInt<12>(COffsetVal))) { - Addr = N0; - OffsetVal = COffsetVal; - } - } - - VAddr = Addr; - Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i16); - SLC = CurDAG->getTargetConstant(0, SDLoc(), MVT::i1); - - return true; + return static_cast<const SITargetLowering*>(getTargetLowering())-> + SelectFlatOffset(IsSigned, *CurDAG, N, Addr, VAddr, Offset, SLC); } -bool AMDGPUDAGToDAGISel::SelectFlatAtomic(SDValue Addr, +bool AMDGPUDAGToDAGISel::SelectFlatAtomic(SDNode *N, + SDValue Addr, SDValue &VAddr, SDValue &Offset, SDValue &SLC) const { - return SelectFlatOffset<false>(Addr, VAddr, Offset, SLC); + return SelectFlatOffset<false>(N, Addr, VAddr, Offset, SLC); } -bool AMDGPUDAGToDAGISel::SelectFlatAtomicSigned(SDValue Addr, +bool AMDGPUDAGToDAGISel::SelectFlatAtomicSigned(SDNode *N, + SDValue Addr, SDValue &VAddr, SDValue &Offset, SDValue &SLC) const { - return SelectFlatOffset<true>(Addr, VAddr, Offset, SLC); + return SelectFlatOffset<true>(N, Addr, VAddr, Offset, SLC); } bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode, @@ -1619,9 +1925,12 @@ void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) { return; } + const GCNSubtarget *ST = static_cast<const GCNSubtarget *>(Subtarget); + const SIRegisterInfo *TRI = ST->getRegisterInfo(); + bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N); unsigned BrOp = UseSCCBr ? AMDGPU::S_CBRANCH_SCC1 : AMDGPU::S_CBRANCH_VCCNZ; - unsigned CondReg = UseSCCBr ? AMDGPU::SCC : AMDGPU::VCC; + unsigned CondReg = UseSCCBr ? 
(unsigned)AMDGPU::SCC : TRI->getVCC(); SDLoc SL(N); if (!UseSCCBr) { @@ -1638,9 +1947,13 @@ void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) { // the S_AND when is unnecessary. But it would be better to add a separate // pass after SIFixSGPRCopies to do the unnecessary S_AND removal, so it // catches both cases. - Cond = SDValue(CurDAG->getMachineNode(AMDGPU::S_AND_B64, SL, MVT::i1, - CurDAG->getRegister(AMDGPU::EXEC, MVT::i1), - Cond), + Cond = SDValue(CurDAG->getMachineNode(ST->isWave32() ? AMDGPU::S_AND_B32 + : AMDGPU::S_AND_B64, + SL, MVT::i1, + CurDAG->getRegister(ST->isWave32() ? AMDGPU::EXEC_LO + : AMDGPU::EXEC, + MVT::i1), + Cond), 0); } @@ -1761,6 +2074,183 @@ void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) { CurDAG->RemoveDeadNode(N); } +void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) { + // The address is assumed to be uniform, so if it ends up in a VGPR, it will + // be copied to an SGPR with readfirstlane. + unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ? + AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME; + + SDValue Chain = N->getOperand(0); + SDValue Ptr = N->getOperand(2); + MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N); + MachineMemOperand *MMO = M->getMemOperand(); + bool IsGDS = M->getAddressSpace() == AMDGPUAS::REGION_ADDRESS; + + SDValue Offset; + if (CurDAG->isBaseWithConstantOffset(Ptr)) { + SDValue PtrBase = Ptr.getOperand(0); + SDValue PtrOffset = Ptr.getOperand(1); + + const APInt &OffsetVal = cast<ConstantSDNode>(PtrOffset)->getAPIntValue(); + if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue(), 16)) { + N = glueCopyToM0(N, PtrBase); + Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32); + } + } + + if (!Offset) { + N = glueCopyToM0(N, Ptr); + Offset = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32); + } + + SDValue Ops[] = { + Offset, + CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32), + Chain, + N->getOperand(N->getNumOperands() - 1) // New glue + }; + + SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops); + CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO}); +} + +static unsigned gwsIntrinToOpcode(unsigned IntrID) { + switch (IntrID) { + case Intrinsic::amdgcn_ds_gws_init: + return AMDGPU::DS_GWS_INIT; + case Intrinsic::amdgcn_ds_gws_barrier: + return AMDGPU::DS_GWS_BARRIER; + case Intrinsic::amdgcn_ds_gws_sema_v: + return AMDGPU::DS_GWS_SEMA_V; + case Intrinsic::amdgcn_ds_gws_sema_br: + return AMDGPU::DS_GWS_SEMA_BR; + case Intrinsic::amdgcn_ds_gws_sema_p: + return AMDGPU::DS_GWS_SEMA_P; + case Intrinsic::amdgcn_ds_gws_sema_release_all: + return AMDGPU::DS_GWS_SEMA_RELEASE_ALL; + default: + llvm_unreachable("not a gws intrinsic"); + } +} + +void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) { + if (IntrID == Intrinsic::amdgcn_ds_gws_sema_release_all && + !Subtarget->hasGWSSemaReleaseAll()) { + // Let this error. + SelectCode(N); + return; + } + + // Chain, intrinsic ID, vsrc, offset + const bool HasVSrc = N->getNumOperands() == 4; + assert(HasVSrc || N->getNumOperands() == 3); + + SDLoc SL(N); + SDValue BaseOffset = N->getOperand(HasVSrc ? 3 : 2); + int ImmOffset = 0; + MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N); + MachineMemOperand *MMO = M->getMemOperand(); + + // Don't worry if the offset ends up in a VGPR. Only one lane will have + // effect, so SIFixSGPRCopies will validly insert readfirstlane. + + // The resource id offset is computed as (<isa opaque base> + M0[21:16] + + // offset field) % 64. 
Some versions of the programming guide omit the m0 + // part, or claim it's from offset 0. + if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(BaseOffset)) { + // If we have a constant offset, try to use the default value for m0 as a + // base to possibly avoid setting it up. + glueCopyToM0(N, CurDAG->getTargetConstant(-1, SL, MVT::i32)); + ImmOffset = ConstOffset->getZExtValue() + 1; + } else { + if (CurDAG->isBaseWithConstantOffset(BaseOffset)) { + ImmOffset = BaseOffset.getConstantOperandVal(1); + BaseOffset = BaseOffset.getOperand(0); + } + + // Prefer to do the shift in an SGPR since it should be possible to use m0 + // as the result directly. If it's already an SGPR, it will be eliminated + // later. + SDNode *SGPROffset + = CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL, MVT::i32, + BaseOffset); + // Shift to offset in m0 + SDNode *M0Base + = CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32, + SDValue(SGPROffset, 0), + CurDAG->getTargetConstant(16, SL, MVT::i32)); + glueCopyToM0(N, SDValue(M0Base, 0)); + } + + SDValue V0; + SDValue Chain = N->getOperand(0); + SDValue Glue; + if (HasVSrc) { + SDValue VSrc0 = N->getOperand(2); + + // The manual doesn't mention this, but it seems only v0 works. + V0 = CurDAG->getRegister(AMDGPU::VGPR0, MVT::i32); + + SDValue CopyToV0 = CurDAG->getCopyToReg( + N->getOperand(0), SL, V0, VSrc0, + N->getOperand(N->getNumOperands() - 1)); + Chain = CopyToV0; + Glue = CopyToV0.getValue(1); + } + + SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32); + + // TODO: Can this just be removed from the instruction? + SDValue GDS = CurDAG->getTargetConstant(1, SL, MVT::i1); + + const unsigned Opc = gwsIntrinToOpcode(IntrID); + SmallVector<SDValue, 5> Ops; + if (HasVSrc) + Ops.push_back(V0); + Ops.push_back(OffsetField); + Ops.push_back(GDS); + Ops.push_back(Chain); + + if (HasVSrc) + Ops.push_back(Glue); + + SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops); + CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO}); +} + +void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) { + unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); + switch (IntrID) { + case Intrinsic::amdgcn_ds_append: + case Intrinsic::amdgcn_ds_consume: { + if (N->getValueType(0) != MVT::i32) + break; + SelectDSAppendConsume(N, IntrID); + return; + } + } + + SelectCode(N); +} + +void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) { + unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); + switch (IntrID) { + case Intrinsic::amdgcn_ds_gws_init: + case Intrinsic::amdgcn_ds_gws_barrier: + case Intrinsic::amdgcn_ds_gws_sema_v: + case Intrinsic::amdgcn_ds_gws_sema_br: + case Intrinsic::amdgcn_ds_gws_sema_p: + case Intrinsic::amdgcn_ds_gws_sema_release_all: + SelectDS_GWS(N, IntrID); + return; + default: + break; + } + + SelectCode(N); +} + bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src, unsigned &Mods) const { Mods = 0; @@ -1796,6 +2286,15 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods_NNaN(SDValue In, SDValue &Src, return isNoNanSrc(Src); } +bool AMDGPUDAGToDAGISel::SelectVOP3Mods_f32(SDValue In, SDValue &Src, + SDValue &SrcMods) const { + if (In.getValueType() == MVT::f32) + return SelectVOP3Mods(In, Src, SrcMods); + Src = In; + SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);; + return true; +} + bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const { if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG) return false; @@ 
-1833,41 +2332,6 @@ bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src, return true; } -static SDValue stripBitcast(SDValue Val) { - return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val; -} - -// Figure out if this is really an extract of the high 16-bits of a dword. -static bool isExtractHiElt(SDValue In, SDValue &Out) { - In = stripBitcast(In); - if (In.getOpcode() != ISD::TRUNCATE) - return false; - - SDValue Srl = In.getOperand(0); - if (Srl.getOpcode() == ISD::SRL) { - if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) { - if (ShiftAmt->getZExtValue() == 16) { - Out = stripBitcast(Srl.getOperand(0)); - return true; - } - } - } - - return false; -} - -// Look through operations that obscure just looking at the low 16-bits of the -// same register. -static SDValue stripExtractLoElt(SDValue In) { - if (In.getOpcode() == ISD::TRUNCATE) { - SDValue Src = In.getOperand(0); - if (Src.getValueType().getSizeInBits() == 32) - return stripBitcast(Src); - } - - return In; -} - bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src, SDValue &SrcMods) const { unsigned Mods = 0; @@ -2020,39 +2484,31 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src, return true; } -// TODO: Can we identify things like v_mad_mixhi_f16? -bool AMDGPUDAGToDAGISel::SelectHi16Elt(SDValue In, SDValue &Src) const { - if (In.isUndef()) { - Src = In; - return true; - } +SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const { + if (In.isUndef()) + return CurDAG->getUNDEF(MVT::i32); if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In)) { SDLoc SL(In); - SDValue K = CurDAG->getTargetConstant(C->getZExtValue() << 16, SL, MVT::i32); - MachineSDNode *MovK = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, - SL, MVT::i32, K); - Src = SDValue(MovK, 0); - return true; + return CurDAG->getConstant(C->getZExtValue() << 16, SL, MVT::i32); } if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(In)) { SDLoc SL(In); - SDValue K = CurDAG->getTargetConstant( + return CurDAG->getConstant( C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32); - MachineSDNode *MovK = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, - SL, MVT::i32, K); - Src = SDValue(MovK, 0); - return true; } - return isExtractHiElt(In, Src); + SDValue Src; + if (isExtractHiElt(In, Src)) + return Src; + + return SDValue(); } bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const { - if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) { - return false; - } + assert(CurDAG->getTarget().getTargetTriple().getArch() == Triple::amdgcn); + const SIRegisterInfo *SIRI = static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo()); const SIInstrInfo * SII = diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 6951c915b177..39016ed37193 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -1,9 +1,8 @@ //===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -21,7 +20,6 @@ #include "AMDGPU.h" #include "AMDGPUCallLowering.h" #include "AMDGPUFrameLowering.h" -#include "AMDGPUIntrinsicInfo.h" #include "AMDGPURegisterInfo.h" #include "AMDGPUSubtarget.h" #include "AMDGPUTargetMachine.h" @@ -65,9 +63,9 @@ static bool allocateSGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT, case MVT::v2f32: case MVT::v4i16: case MVT::v4f16: { - // Up to SGPR0-SGPR39 + // Up to SGPR0-SGPR105 return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State, - &AMDGPU::SGPR_64RegClass, 20); + &AMDGPU::SGPR_64RegClass, 53); } default: return false; @@ -152,15 +150,24 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::LOAD, MVT::v2f32, Promote); AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32); + setOperationAction(ISD::LOAD, MVT::v3f32, Promote); + AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32); + setOperationAction(ISD::LOAD, MVT::v4f32, Promote); AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32); + setOperationAction(ISD::LOAD, MVT::v5f32, Promote); + AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32); + setOperationAction(ISD::LOAD, MVT::v8f32, Promote); AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32); setOperationAction(ISD::LOAD, MVT::v16f32, Promote); AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32); + setOperationAction(ISD::LOAD, MVT::v32f32, Promote); + AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32); + setOperationAction(ISD::LOAD, MVT::i64, Promote); AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32); @@ -237,15 +244,24 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::STORE, MVT::v2f32, Promote); AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32); + setOperationAction(ISD::STORE, MVT::v3f32, Promote); + AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32); + setOperationAction(ISD::STORE, MVT::v4f32, Promote); AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32); + setOperationAction(ISD::STORE, MVT::v5f32, Promote); + AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32); + setOperationAction(ISD::STORE, MVT::v8f32, Promote); AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32); setOperationAction(ISD::STORE, MVT::v16f32, Promote); AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32); + setOperationAction(ISD::STORE, MVT::v32f32, Promote); + AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32); + setOperationAction(ISD::STORE, MVT::i64, Promote); AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32); @@ -327,16 +343,28 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, // Expand to fneg + fadd. 
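Two of the bumps above are worth unpacking. The SGPR tuple allocator now reaches SGPR0-SGPR105: 106 registers form 53 aligned 64-bit pairs, hence the 53 passed to allocateCCRegs (the old 20 covered only SGPR0-SGPR39). And each newly supported floating-point vector type gets the usual promote-to-integer treatment, so a single set of integer load/store patterns serves both element types. The recurring pair, shown for one representative type as a condensed restatement of the table above:

    // Lower v3f32 memory ops as v3i32: same width in bits, so one
    // integer pattern set handles both, with bitcasts at the edges.
    setOperationAction(ISD::LOAD, MVT::v3f32, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32);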
setOperationAction(ISD::FSUB, MVT::f64, Expand); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v3i32, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v3f32, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v5i32, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v5f32, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3f32, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3i32, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5f32, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5i32, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f32, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16i32, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32f32, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32i32, Custom); setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom); @@ -394,7 +422,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom); static const MVT::SimpleValueType VectorIntTypes[] = { - MVT::v2i32, MVT::v4i32 + MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32 }; for (MVT VT : VectorIntTypes) { @@ -436,7 +464,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, } static const MVT::SimpleValueType FloatVectorTypes[] = { - MVT::v2f32, MVT::v4f32 + MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32 }; for (MVT VT : FloatVectorTypes) { @@ -478,9 +506,15 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SELECT, MVT::v2f32, Promote); AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32); + setOperationAction(ISD::SELECT, MVT::v3f32, Promote); + AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32); + setOperationAction(ISD::SELECT, MVT::v4f32, Promote); AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32); + setOperationAction(ISD::SELECT, MVT::v5f32, Promote); + AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32); + // There are no libcalls of any kind. for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I) setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr); @@ -499,6 +533,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, // vector compares until that is fixed. 
setHasMultipleConditionRegisters(true); + setMinCmpXchgSizeInBits(32); + setSupportsUnalignedAtomics(false); + PredictableSelectIsExpensive = false; // We want to find all load dependencies for long chains of stores to enable @@ -592,6 +629,7 @@ static bool hasSourceMods(const SDNode *N) { case ISD::FDIV: case ISD::FREM: case ISD::INLINEASM: + case ISD::INLINEASM_BR: case AMDGPUISD::INTERP_P1: case AMDGPUISD::INTERP_P2: case AMDGPUISD::DIV_SCALE: @@ -640,7 +678,8 @@ bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const { // The backend supports 32 and 64 bit floating point immediates. // FIXME: Why are we reporting vectors of FP immediates as legal? -bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { +bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, + bool ForCodeSize) const { EVT ScalarVT = VT.getScalarType(); return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 || (ScalarVT == MVT::f16 && Subtarget->has16BitInsts())); @@ -690,8 +729,9 @@ bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N, return (OldSize < 32); } -bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, - EVT CastTy) const { +bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy, + const SelectionDAG &DAG, + const MachineMemOperand &MMO) const { assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits()); @@ -701,8 +741,12 @@ bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, unsigned LScalarSize = LoadTy.getScalarSizeInBits(); unsigned CastScalarSize = CastTy.getScalarSizeInBits(); - return (LScalarSize < CastScalarSize) || - (CastScalarSize >= 32); + if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32)) + return false; + + bool Fast = false; + return allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), CastTy, + MMO, &Fast) && Fast; } // SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also @@ -849,9 +893,6 @@ bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const { CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) { switch (CC) { - case CallingConv::AMDGPU_KERNEL: - case CallingConv::SPIR_KERNEL: - llvm_unreachable("kernels should not be handled here"); case CallingConv::AMDGPU_VS: case CallingConv::AMDGPU_GS: case CallingConv::AMDGPU_PS: @@ -864,8 +905,10 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC, case CallingConv::Fast: case CallingConv::Cold: return CC_AMDGPU_Func; + case CallingConv::AMDGPU_KERNEL: + case CallingConv::SPIR_KERNEL: default: - report_fatal_error("Unsupported calling convention."); + report_fatal_error("Unsupported calling convention for call"); } } @@ -1010,9 +1053,10 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute( if (MemVT.isVector() && MemVT.getVectorNumElements() == 1) MemVT = MemVT.getScalarType(); - if (MemVT.isExtended()) { - // This should really only happen if we have vec3 arguments - assert(MemVT.isVector() && MemVT.getVectorNumElements() == 3); + // Round up vec3/vec5 argument. + if (MemVT.isVector() && !MemVT.isPow2VectorType()) { + assert(MemVT.getVectorNumElements() == 3 || + MemVT.getVectorNumElements() == 5); MemVT = MemVT.getPow2VectorType(State.getContext()); } @@ -1372,6 +1416,41 @@ SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One); } +// Split a vector type into two parts. The first part is a power of two vector. 
+// The second part is whatever is left over, and is a scalar if it would +// otherwise be a 1-vector. +std::pair<EVT, EVT> +AMDGPUTargetLowering::getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const { + EVT LoVT, HiVT; + EVT EltVT = VT.getVectorElementType(); + unsigned NumElts = VT.getVectorNumElements(); + unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2); + LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts); + HiVT = NumElts - LoNumElts == 1 + ? EltVT + : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts); + return std::make_pair(LoVT, HiVT); +} + +// Split a vector value into two parts of types LoVT and HiVT. HiVT could be +// scalar. +std::pair<SDValue, SDValue> +AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL, + const EVT &LoVT, const EVT &HiVT, + SelectionDAG &DAG) const { + assert(LoVT.getVectorNumElements() + + (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <= + N.getValueType().getVectorNumElements() && + "More vector elements requested than available!"); + auto IdxTy = getVectorIdxTy(DAG.getDataLayout()); + SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N, + DAG.getConstant(0, DL, IdxTy)); + SDValue Hi = DAG.getNode( + HiVT.isVector() ? ISD::EXTRACT_SUBVECTOR : ISD::EXTRACT_VECTOR_ELT, DL, + HiVT, N, DAG.getConstant(LoVT.getVectorNumElements(), DL, IdxTy)); + return std::make_pair(Lo, Hi); +} + SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op, SelectionDAG &DAG) const { LoadSDNode *Load = cast<LoadSDNode>(Op); @@ -1393,9 +1472,9 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op, EVT LoMemVT, HiMemVT; SDValue Lo, Hi; - std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); - std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT); - std::tie(Lo, Hi) = DAG.SplitVector(Op, SL, LoVT, HiVT); + std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG); + std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG); + std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG); unsigned Size = LoMemVT.getStoreSize(); unsigned BaseAlign = Load->getAlignment(); @@ -1410,15 +1489,52 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op, HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()), HiMemVT, HiAlign, Load->getMemOperand()->getFlags()); - SDValue Ops[] = { - DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad), - DAG.getNode(ISD::TokenFactor, SL, MVT::Other, - LoLoad.getValue(1), HiLoad.getValue(1)) - }; + auto IdxTy = getVectorIdxTy(DAG.getDataLayout()); + SDValue Join; + if (LoVT == HiVT) { + // This is the case that the vector is power of two so was evenly split. + Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad); + } else { + Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getUNDEF(VT), LoLoad, + DAG.getConstant(0, SL, IdxTy)); + Join = DAG.getNode(HiVT.isVector() ? ISD::INSERT_SUBVECTOR + : ISD::INSERT_VECTOR_ELT, + SL, VT, Join, HiLoad, + DAG.getConstant(LoVT.getVectorNumElements(), SL, IdxTy)); + } + + SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other, + LoLoad.getValue(1), HiLoad.getValue(1))}; return DAG.getMergeValues(Ops, SL); } +// Widen a vector load from vec3 to vec4. 
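The split in getSplitDestVTs above is deliberately lopsided: the low half is rounded up to a power of two so an odd vector splits once rather than recursively, and a leftover single element degrades to a scalar. A small sketch of the arithmetic, using llvm::PowerOf2Ceil just as the function itself does (helper name is illustrative):

    #include "llvm/Support/MathExtras.h"
    #include <utility>
    // Returns {low-half elements, leftover elements}.
    static std::pair<unsigned, unsigned> splitCounts(unsigned NumElts) {
      unsigned Lo = llvm::PowerOf2Ceil((NumElts + 1) / 2);
      return {Lo, NumElts - Lo};
    }
    // splitCounts(3) == {2, 1}   -> (v2, scalar)
    // splitCounts(5) == {4, 1}   -> (v4, scalar)
    // splitCounts(6) == {4, 2}   -> (v4, v2)

So a v5f32 load becomes one v4f32 load plus one scalar load, rejoined in SplitVectorLoad above with INSERT_SUBVECTOR / INSERT_VECTOR_ELT instead of the power-of-two-only CONCAT_VECTORS.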
+SDValue AMDGPUTargetLowering::WidenVectorLoad(SDValue Op, + SelectionDAG &DAG) const { + LoadSDNode *Load = cast<LoadSDNode>(Op); + EVT VT = Op.getValueType(); + assert(VT.getVectorNumElements() == 3); + SDValue BasePtr = Load->getBasePtr(); + EVT MemVT = Load->getMemoryVT(); + SDLoc SL(Op); + const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo(); + unsigned BaseAlign = Load->getAlignment(); + + EVT WideVT = + EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4); + EVT WideMemVT = + EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(), 4); + SDValue WideLoad = DAG.getExtLoad( + Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue, + WideMemVT, BaseAlign, Load->getMemOperand()->getFlags()); + return DAG.getMergeValues( + {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad, + DAG.getConstant(0, SL, getVectorIdxTy(DAG.getDataLayout()))), + WideLoad.getValue(1)}, + SL); +} + SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op, SelectionDAG &DAG) const { StoreSDNode *Store = cast<StoreSDNode>(Op); @@ -1439,9 +1555,9 @@ SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op, EVT LoMemVT, HiMemVT; SDValue Lo, Hi; - std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); - std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT); - std::tie(Lo, Hi) = DAG.SplitVector(Val, SL, LoVT, HiVT); + std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG); + std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG); + std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG); SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize()); @@ -2788,6 +2904,54 @@ bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const { return true; } +// Find a load or store from corresponding pattern root. +// Roots may be build_vector, bitconvert or their combinations. +static MemSDNode* findMemSDNode(SDNode *N) { + N = AMDGPUTargetLowering::stripBitcast(SDValue(N,0)).getNode(); + if (MemSDNode *MN = dyn_cast<MemSDNode>(N)) + return MN; + assert(isa<BuildVectorSDNode>(N)); + for (SDValue V : N->op_values()) + if (MemSDNode *MN = + dyn_cast<MemSDNode>(AMDGPUTargetLowering::stripBitcast(V))) + return MN; + llvm_unreachable("cannot find MemSDNode in the pattern!"); +} + +bool AMDGPUTargetLowering::SelectFlatOffset(bool IsSigned, + SelectionDAG &DAG, + SDNode *N, + SDValue Addr, + SDValue &VAddr, + SDValue &Offset, + SDValue &SLC) const { + const GCNSubtarget &ST = + DAG.getMachineFunction().getSubtarget<GCNSubtarget>(); + int64_t OffsetVal = 0; + + if (ST.hasFlatInstOffsets() && + (!ST.hasFlatSegmentOffsetBug() || + findMemSDNode(N)->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS) && + DAG.isBaseWithConstantOffset(Addr)) { + SDValue N0 = Addr.getOperand(0); + SDValue N1 = Addr.getOperand(1); + int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue(); + + const SIInstrInfo *TII = ST.getInstrInfo(); + if (TII->isLegalFLATOffset(COffsetVal, findMemSDNode(N)->getAddressSpace(), + IsSigned)) { + Addr = N0; + OffsetVal = COffsetVal; + } + } + + VAddr = Addr; + Offset = DAG.getTargetConstant(OffsetVal, SDLoc(), MVT::i16); + SLC = DAG.getTargetConstant(0, SDLoc(), MVT::i1); + + return true; +} + // Replace load of an illegal type with a store of a bitcast to a friendlier // type. SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N, @@ -2812,7 +2976,8 @@ SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N, // Expand unaligned loads earlier than legalization. 
Due to visitation order // problems during legalization, the emitted instructions to pack and unpack // the bytes again are not eliminated in the case of an unaligned copy. - if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast)) { + if (!allowsMisalignedMemoryAccesses( + VT, AS, Align, LN->getMemOperand()->getFlags(), &IsFast)) { if (VT.isVector()) return scalarizeVectorLoad(LN, DAG); @@ -2864,7 +3029,8 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N, // order problems during legalization, the emitted instructions to pack and // unpack the bytes again are not eliminated in the case of an unaligned // copy. - if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast)) { + if (!allowsMisalignedMemoryAccesses( + VT, AS, Align, SN->getMemOperand()->getFlags(), &IsFast)) { if (VT.isVector()) return scalarizeVectorStore(SN, DAG); @@ -3049,30 +3215,44 @@ SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N, SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N, DAGCombinerInfo &DCI) const { - if (N->getValueType(0) != MVT::i64) - return SDValue(); - - const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1)); + auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1)); if (!RHS) return SDValue(); + EVT VT = N->getValueType(0); + SDValue LHS = N->getOperand(0); unsigned ShiftAmt = RHS->getZExtValue(); + SelectionDAG &DAG = DCI.DAG; + SDLoc SL(N); + + // fold (srl (and x, c1 << c2), c2) -> (and (srl(x, c2), c1) + // this improves the ability to match BFE patterns in isel. + if (LHS.getOpcode() == ISD::AND) { + if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) { + if (Mask->getAPIntValue().isShiftedMask() && + Mask->getAPIntValue().countTrailingZeros() == ShiftAmt) { + return DAG.getNode( + ISD::AND, SL, VT, + DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0), N->getOperand(1)), + DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1), N->getOperand(1))); + } + } + } + + if (VT != MVT::i64) + return SDValue(); + if (ShiftAmt < 32) return SDValue(); // srl i64:x, C for C >= 32 // => // build_pair (srl hi_32(x), C - 32), 0 - - SelectionDAG &DAG = DCI.DAG; - SDLoc SL(N); - SDValue One = DAG.getConstant(1, SL, MVT::i32); SDValue Zero = DAG.getConstant(0, SL, MVT::i32); - SDValue VecOp = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, N->getOperand(0)); - SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, - VecOp, One); + SDValue VecOp = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, LHS); + SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecOp, One); SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32); SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst); @@ -3090,7 +3270,7 @@ SDValue AMDGPUTargetLowering::performTruncateCombine( SDValue Src = N->getOperand(0); // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x) - if (Src.getOpcode() == ISD::BITCAST) { + if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) { SDValue Vec = Src.getOperand(0); if (Vec.getOpcode() == ISD::BUILD_VECTOR) { SDValue Elt0 = Vec.getOperand(0); @@ -3478,13 +3658,11 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N, if (Cond.hasOneUse()) { // TODO: Look for multiple select uses. 
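The new srl fold above only fires when the AND mask is a contiguous run of bits beginning exactly at the shift amount, so the shift can move through the mask without disturbing any set bit; as the comment notes, this exposes BFE (bitfield-extract) patterns to isel. A concrete before/after, written as plain C++ over a 32-bit value to show the two forms are equivalent:

    #include <cstdint>
    // 0x00FF0000 is a shifted mask whose trailing-zero count is 16,
    // matching the shift amount, so the guard in performSrlCombine holds.
    uint32_t before(uint32_t X) { return (X & 0x00FF0000u) >> 16; }
    uint32_t after(uint32_t X)  { return (X >> 16) & 0x000000FFu; }
    // before(X) == after(X) for all X; the second form is a clean
    // 8-bit field extract starting at bit 16.

With the same mask but a shift of 8, the trailing-zero count (16) no longer matches the shift amount, and the combine correctly stays away.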
SelectionDAG &DAG = DCI.DAG; - if ((DAG.isConstantValueOfAnyType(True) || - DAG.isConstantValueOfAnyType(True)) && - (!DAG.isConstantValueOfAnyType(False) && - !DAG.isConstantValueOfAnyType(False))) { + if (DAG.isConstantValueOfAnyType(True) && + !DAG.isConstantValueOfAnyType(False)) { // Swap cmp + select pair to move constant to false input. // This will allow using VOPC cndmasks more often. - // select (setcc x, y), k, x -> select (setcc y, x) x, x + // select (setcc x, y), k, x -> select (setccinv x, y), x, k SDLoc SL(N); ISD::CondCode NewCC = getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), @@ -3594,6 +3772,8 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N, RHS = RHS.getOperand(0); SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags()); + if (Res.getOpcode() != ISD::FADD) + return SDValue(); // Op got folded away. if (!N0.hasOneUse()) DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); return Res; @@ -3613,6 +3793,8 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N, RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags()); + if (Res.getOpcode() != Opc) + return SDValue(); // Op got folded away. if (!N0.hasOneUse()) DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); return Res; @@ -3640,6 +3822,8 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N, RHS = RHS.getOperand(0); SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS); + if (Res.getOpcode() != Opc) + return SDValue(); // Op got folded away. if (!N0.hasOneUse()) DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); return Res; @@ -3668,6 +3852,8 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N, unsigned Opposite = inverseMinMax(Opc); SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags()); + if (Res.getOpcode() != Opposite) + return SDValue(); // Op got folded away. if (!N0.hasOneUse()) DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); return Res; @@ -3678,6 +3864,8 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N, Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags()); SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags()); + if (Res.getOpcode() != AMDGPUISD::FMED3) + return SDValue(); // Op got folded away. if (!N0.hasOneUse()) DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); return Res; @@ -4051,9 +4239,19 @@ SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG, const ArgDescriptor &Arg) const { assert(Arg && "Attempting to load missing argument"); - if (Arg.isRegister()) - return CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL); - return loadStackInputValue(DAG, VT, SL, Arg.getStackOffset()); + SDValue V = Arg.isRegister() ? 
+ CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL) : + loadStackInputValue(DAG, VT, SL, Arg.getStackOffset()); + + if (!Arg.isMasked()) + return V; + + unsigned Mask = Arg.getMask(); + unsigned Shift = countTrailingZeros<unsigned>(Mask); + V = DAG.getNode(ISD::SRL, SL, VT, V, + DAG.getShiftAmountConstant(Shift, VT, SL)); + return DAG.getNode(ISD::AND, SL, VT, V, + DAG.getConstant(Mask >> Shift, SL, VT)); } uint32_t AMDGPUTargetLowering::getImplicitParameterOffset( @@ -4175,6 +4373,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(BUILD_VERTICAL_VECTOR) NODE_NAME_CASE(CONST_DATA_PTR) NODE_NAME_CASE(PC_ADD_REL_OFFSET) + NODE_NAME_CASE(LDS) NODE_NAME_CASE(KILL) NODE_NAME_CASE(DUMMY_CHAIN) case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break; @@ -4185,24 +4384,38 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(INTERP_MOV) NODE_NAME_CASE(INTERP_P1) NODE_NAME_CASE(INTERP_P2) + NODE_NAME_CASE(INTERP_P1LL_F16) + NODE_NAME_CASE(INTERP_P1LV_F16) + NODE_NAME_CASE(INTERP_P2_F16) + NODE_NAME_CASE(LOAD_D16_HI) + NODE_NAME_CASE(LOAD_D16_LO) + NODE_NAME_CASE(LOAD_D16_HI_I8) + NODE_NAME_CASE(LOAD_D16_HI_U8) + NODE_NAME_CASE(LOAD_D16_LO_I8) + NODE_NAME_CASE(LOAD_D16_LO_U8) NODE_NAME_CASE(STORE_MSKOR) NODE_NAME_CASE(LOAD_CONSTANT) NODE_NAME_CASE(TBUFFER_STORE_FORMAT) - NODE_NAME_CASE(TBUFFER_STORE_FORMAT_X3) NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16) NODE_NAME_CASE(TBUFFER_LOAD_FORMAT) NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16) + NODE_NAME_CASE(DS_ORDERED_COUNT) NODE_NAME_CASE(ATOMIC_CMP_SWAP) NODE_NAME_CASE(ATOMIC_INC) NODE_NAME_CASE(ATOMIC_DEC) - NODE_NAME_CASE(ATOMIC_LOAD_FADD) NODE_NAME_CASE(ATOMIC_LOAD_FMIN) NODE_NAME_CASE(ATOMIC_LOAD_FMAX) NODE_NAME_CASE(BUFFER_LOAD) + NODE_NAME_CASE(BUFFER_LOAD_UBYTE) + NODE_NAME_CASE(BUFFER_LOAD_USHORT) + NODE_NAME_CASE(BUFFER_LOAD_BYTE) + NODE_NAME_CASE(BUFFER_LOAD_SHORT) NODE_NAME_CASE(BUFFER_LOAD_FORMAT) NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16) NODE_NAME_CASE(SBUFFER_LOAD) NODE_NAME_CASE(BUFFER_STORE) + NODE_NAME_CASE(BUFFER_STORE_BYTE) + NODE_NAME_CASE(BUFFER_STORE_SHORT) NODE_NAME_CASE(BUFFER_STORE_FORMAT) NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16) NODE_NAME_CASE(BUFFER_ATOMIC_SWAP) @@ -4216,6 +4429,10 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(BUFFER_ATOMIC_OR) NODE_NAME_CASE(BUFFER_ATOMIC_XOR) NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP) + NODE_NAME_CASE(BUFFER_ATOMIC_FADD) + NODE_NAME_CASE(BUFFER_ATOMIC_PK_FADD) + NODE_NAME_CASE(ATOMIC_FADD) + NODE_NAME_CASE(ATOMIC_PK_FADD) case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break; } @@ -4367,6 +4584,23 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode( } break; } + case AMDGPUISD::BUFFER_LOAD_UBYTE: { + Known.Zero.setHighBits(24); + break; + } + case AMDGPUISD::BUFFER_LOAD_USHORT: { + Known.Zero.setHighBits(16); + break; + } + case AMDGPUISD::LDS: { + auto GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode()); + unsigned Align = GA->getGlobal()->getAlignment(); + + Known.Zero.setHighBits(16); + if (Align) + Known.Zero.setLowBits(Log2_32(Align)); + break; + } case ISD::INTRINSIC_WO_CHAIN: { unsigned IID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); switch (IID) { @@ -4412,6 +4646,14 @@ unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode( case AMDGPUISD::CARRY: case AMDGPUISD::BORROW: return 31; + case AMDGPUISD::BUFFER_LOAD_BYTE: + return 25; + case AMDGPUISD::BUFFER_LOAD_SHORT: + return 17; + case AMDGPUISD::BUFFER_LOAD_UBYTE: + return 24; + case 
AMDGPUISD::BUFFER_LOAD_USHORT: + return 16; case AMDGPUISD::FP_TO_FP16: case AMDGPUISD::FP16_ZEXT: return 16; @@ -4519,7 +4761,12 @@ bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op, TargetLowering::AtomicExpansionKind AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { - if (RMW->getOperation() == AtomicRMWInst::Nand) + switch (RMW->getOperation()) { + case AtomicRMWInst::Nand: + case AtomicRMWInst::FAdd: + case AtomicRMWInst::FSub: return AtomicExpansionKind::CmpXChg; - return AtomicExpansionKind::None; + default: + return AtomicExpansionKind::None; + } } diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h index 0d22cb2e3e20..fe7ad694943d 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -1,9 +1,8 @@ //===-- AMDGPUISelLowering.h - AMDGPU Lowering Interface --------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -111,9 +110,23 @@ protected: SDValue getLoHalf64(SDValue Op, SelectionDAG &DAG) const; SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const; + /// Split a vector type into two parts. The first part is a power of two + /// vector. The second part is whatever is left over, and is a scalar if it + /// would otherwise be a 1-vector. + std::pair<EVT, EVT> getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const; + + /// Split a vector value into two parts of types LoVT and HiVT. HiVT could be + /// scalar. + std::pair<SDValue, SDValue> splitVector(const SDValue &N, const SDLoc &DL, + const EVT &LoVT, const EVT &HighVT, + SelectionDAG &DAG) const; + /// Split a vector load into 2 loads of half the vector. SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const; + /// Widen a vector load from vec3 to vec4. + SDValue WidenVectorLoad(SDValue Op, SelectionDAG &DAG) const; + /// Split a vector store into 2 stores of half the vector. SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const; @@ -162,13 +175,15 @@ public: MVT getVectorIdxTy(const DataLayout &) const override; bool isSelectSupported(SelectSupportKind) const override; - bool isFPImmLegal(const APFloat &Imm, EVT VT) const override; + bool isFPImmLegal(const APFloat &Imm, EVT VT, + bool ForCodeSize) const override; bool ShouldShrinkFPConstant(EVT VT) const override; bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtType, EVT ExtVT) const override; - bool isLoadBitCastBeneficial(EVT, EVT) const final; + bool isLoadBitCastBeneficial(EVT, EVT, const SelectionDAG &DAG, + const MachineMemOperand &MMO) const final; bool storeOfVectorConstantIsCheap(EVT MemVT, unsigned NumElem, @@ -212,15 +227,15 @@ public: const char* getTargetNodeName(unsigned Opcode) const override; - // FIXME: Turn off MergeConsecutiveStores() before Instruction Selection - // for AMDGPU. - // A commit ( git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@319036 - // 91177308-0d34-0410-b5e6-96231b3b80d8 ) turned on - // MergeConsecutiveStores() before Instruction Selection for all targets. 
- // Enough AMDGPU compiles go into an infinite loop ( MergeConsecutiveStores() - // merges two stores; LegalizeStoreOps() un-merges; MergeConsecutiveStores() - // re-merges, etc. ) to warrant turning it off for now. - bool mergeStoresAfterLegalization() const override { return false; } + // FIXME: Turn off MergeConsecutiveStores() before Instruction Selection for + // AMDGPU. Commit r319036, + // (https://github.com/llvm/llvm-project/commit/db77e57ea86d941a4262ef60261692f4cb6893e6) + // turned on MergeConsecutiveStores() before Instruction Selection for all + // targets. Enough AMDGPU compiles go into an infinite loop ( + // MergeConsecutiveStores() merges two stores; LegalizeStoreOps() un-merges; + // MergeConsecutiveStores() re-merges, etc. ) to warrant turning it off for + // now. + bool mergeStoresAfterLegalization(EVT) const override { return false; } bool isFsqrtCheap(SDValue Operand, SelectionDAG &DAG) const override { return true; @@ -309,6 +324,10 @@ public: } AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override; + + bool SelectFlatOffset(bool IsSigned, SelectionDAG &DAG, SDNode *N, + SDValue Addr, SDValue &VAddr, SDValue &Offset, + SDValue &SLC) const; }; namespace AMDGPUISD { @@ -463,28 +482,44 @@ enum NodeType : unsigned { INTERP_MOV, INTERP_P1, INTERP_P2, + INTERP_P1LL_F16, + INTERP_P1LV_F16, + INTERP_P2_F16, PC_ADD_REL_OFFSET, + LDS, KILL, DUMMY_CHAIN, FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE, + LOAD_D16_HI, + LOAD_D16_LO, + LOAD_D16_HI_I8, + LOAD_D16_HI_U8, + LOAD_D16_LO_I8, + LOAD_D16_LO_U8, + STORE_MSKOR, LOAD_CONSTANT, TBUFFER_STORE_FORMAT, - TBUFFER_STORE_FORMAT_X3, TBUFFER_STORE_FORMAT_D16, TBUFFER_LOAD_FORMAT, TBUFFER_LOAD_FORMAT_D16, + DS_ORDERED_COUNT, ATOMIC_CMP_SWAP, ATOMIC_INC, ATOMIC_DEC, - ATOMIC_LOAD_FADD, ATOMIC_LOAD_FMIN, ATOMIC_LOAD_FMAX, BUFFER_LOAD, + BUFFER_LOAD_UBYTE, + BUFFER_LOAD_USHORT, + BUFFER_LOAD_BYTE, + BUFFER_LOAD_SHORT, BUFFER_LOAD_FORMAT, BUFFER_LOAD_FORMAT_D16, SBUFFER_LOAD, BUFFER_STORE, + BUFFER_STORE_BYTE, + BUFFER_STORE_SHORT, BUFFER_STORE_FORMAT, BUFFER_STORE_FORMAT_D16, BUFFER_ATOMIC_SWAP, @@ -498,6 +533,10 @@ enum NodeType : unsigned { BUFFER_ATOMIC_OR, BUFFER_ATOMIC_XOR, BUFFER_ATOMIC_CMPSWAP, + BUFFER_ATOMIC_FADD, + BUFFER_ATOMIC_PK_FADD, + ATOMIC_FADD, + ATOMIC_PK_FADD, LAST_AMDGPU_ISD_NUMBER }; diff --git a/lib/Target/AMDGPU/AMDGPUInline.cpp b/lib/Target/AMDGPU/AMDGPUInline.cpp index 945c9acd379a..f4df20b8f03e 100644 --- a/lib/Target/AMDGPU/AMDGPUInline.cpp +++ b/lib/Target/AMDGPU/AMDGPUInline.cpp @@ -1,9 +1,8 @@ //===- AMDGPUInline.cpp - Code to perform simple function inlining --------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -40,7 +39,7 @@ using namespace llvm; #define DEBUG_TYPE "inline" static cl::opt<int> -ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(2200), +ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(1500), cl::desc("Cost of alloca argument")); // If the amount of scratch memory to eliminate exceeds our ability to allocate @@ -50,6 +49,12 @@ static cl::opt<unsigned> ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden, cl::init(256), cl::desc("Maximum alloca size to use for inline cost")); +// Inliner constraint to achieve reasonable compilation time +static cl::opt<size_t> +MaxBB("amdgpu-inline-max-bb", cl::Hidden, cl::init(300), + cl::desc("Maximum BB number allowed in a function after inlining" + " (compile time constraint)")); + namespace { class AMDGPUInliner : public LegacyInlinerBase { @@ -112,7 +117,8 @@ unsigned AMDGPUInliner::getInlineThreshold(CallSite CS) const { Callee->hasFnAttribute(Attribute::InlineHint); if (InlineHint && Params.HintThreshold && Params.HintThreshold > Thres && !Caller->hasFnAttribute(Attribute::MinSize)) - Thres = Params.HintThreshold.getValue(); + Thres = Params.HintThreshold.getValue() * + TTIWP->getTTI(*Callee).getInliningThresholdMultiplier(); const DataLayout &DL = Caller->getParent()->getDataLayout(); if (!Callee) @@ -124,10 +130,11 @@ unsigned AMDGPUInliner::getInlineThreshold(CallSite CS) const { uint64_t AllocaSize = 0; SmallPtrSet<const AllocaInst *, 8> AIVisited; for (Value *PtrArg : CS.args()) { - Type *Ty = PtrArg->getType(); - if (!Ty->isPointerTy() || - Ty->getPointerAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) + PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType()); + if (!Ty || (Ty->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS && + Ty->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS)) continue; + PtrArg = GetUnderlyingObject(PtrArg, DL); if (const AllocaInst *AI = dyn_cast<AllocaInst>(PtrArg)) { if (!AI->isStaticAlloca() || !AIVisited.insert(AI).second) @@ -170,7 +177,6 @@ static bool isWrapperOnlyCall(CallSite CS) { InlineCost AMDGPUInliner::getInlineCost(CallSite CS) { Function *Callee = CS.getCalledFunction(); Function *Caller = CS.getCaller(); - TargetTransformInfo &TTI = TTIWP->getTTI(*Callee); if (!Callee || Callee->isDeclaration()) return llvm::InlineCost::getNever("undefined callee"); @@ -178,13 +184,15 @@ InlineCost AMDGPUInliner::getInlineCost(CallSite CS) { if (CS.isNoInline()) return llvm::InlineCost::getNever("noinline"); + TargetTransformInfo &TTI = TTIWP->getTTI(*Callee); if (!TTI.areInlineCompatible(Caller, Callee)) return llvm::InlineCost::getNever("incompatible"); if (CS.hasFnAttr(Attribute::AlwaysInline)) { - if (isInlineViable(*Callee)) + auto IsViable = isInlineViable(*Callee); + if (IsViable) return llvm::InlineCost::getAlways("alwaysinline viable"); - return llvm::InlineCost::getNever("alwaysinline unviable"); + return llvm::InlineCost::getNever(IsViable.message); } if (isWrapperOnlyCall(CS)) @@ -206,6 +214,15 @@ InlineCost AMDGPUInliner::getInlineCost(CallSite CS) { return ACT->getAssumptionCache(F); }; - return llvm::getInlineCost(CS, Callee, LocalParams, TTI, GetAssumptionCache, - None, PSI, RemarksEnabled ? &ORE : nullptr); + auto IC = llvm::getInlineCost(cast<CallBase>(*CS.getInstruction()), Callee, + LocalParams, TTI, GetAssumptionCache, None, PSI, + RemarksEnabled ? 
&ORE : nullptr); + + if (IC && !IC.isAlways() && !Callee->hasFnAttribute(Attribute::InlineHint)) { + // Single BB does not increase total BB amount, thus subtract 1 + size_t Size = Caller->size() + Callee->size() - 1; + if (MaxBB && Size > MaxBB) + return llvm::InlineCost::getNever("max number of bb exceeded"); + } + return IC; } diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp index 07aa7c2cc8ad..9951cbf2326e 100644 --- a/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp @@ -1,9 +1,8 @@ //===-- AMDGPUInstrInfo.cpp - Base class for AMD GPU InstrInfo ------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/lib/Target/AMDGPU/AMDGPUInstrInfo.h index 2f8166da0d33..698189e14c21 100644 --- a/lib/Target/AMDGPU/AMDGPUInstrInfo.h +++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.h @@ -1,9 +1,8 @@ //===-- AMDGPUInstrInfo.h - AMDGPU Instruction Information ------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/lib/Target/AMDGPU/AMDGPUInstrInfo.td index 82644be26563..4a8446955496 100644 --- a/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -1,9 +1,8 @@ //===-- AMDGPUInstrInfo.td - AMDGPU DAG nodes --------------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -51,27 +50,21 @@ def AMDGPUFmasOp : SDTypeProfile<1, 4, def AMDGPUKillSDT : SDTypeProfile<0, 1, [SDTCisInt<0>]>; def AMDGPUIfOp : SDTypeProfile<1, 2, - [SDTCisVT<0, i64>, SDTCisVT<1, i1>, SDTCisVT<2, OtherVT>] + [SDTCisVT<0, i1>, SDTCisVT<1, i1>, SDTCisVT<2, OtherVT>] >; def AMDGPUElseOp : SDTypeProfile<1, 2, - [SDTCisVT<0, i64>, SDTCisVT<1, i64>, SDTCisVT<2, OtherVT>] + [SDTCisVT<0, i1>, SDTCisVT<1, i1>, SDTCisVT<2, OtherVT>] >; def AMDGPULoopOp : SDTypeProfile<0, 2, - [SDTCisVT<0, i64>, SDTCisVT<1, OtherVT>] + [SDTCisVT<0, i1>, SDTCisVT<1, OtherVT>] >; def AMDGPUIfBreakOp : SDTypeProfile<1, 2, - [SDTCisVT<0, i64>, SDTCisVT<1, i1>, SDTCisVT<2, i64>] ->; - -def AMDGPUAddeSubeOp : SDTypeProfile<2, 3, - [SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisVT<0, i32>, SDTCisVT<1, i1>, SDTCisVT<4, i1>] + [SDTCisVT<0, i1>, SDTCisVT<1, i1>, SDTCisVT<2, i1>] >; -def SDT_AMDGPUTCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>]>; - //===----------------------------------------------------------------------===// // AMDGPU DAG Nodes // @@ -96,7 +89,8 @@ def AMDGPUcall : SDNode<"AMDGPUISD::CALL", SDNPVariadic] >; -def AMDGPUtc_return: SDNode<"AMDGPUISD::TC_RETURN", SDT_AMDGPUTCRET, +def AMDGPUtc_return: SDNode<"AMDGPUISD::TC_RETURN", + SDTypeProfile<0, 3, [SDTCisPtrTy<0>]>, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic] >; @@ -205,14 +199,8 @@ def AMDGPUcarry : SDNode<"AMDGPUISD::CARRY", SDTIntBinOp, []>; // out = (src1 > src0) ? 1 : 0 def AMDGPUborrow : SDNode<"AMDGPUISD::BORROW", SDTIntBinOp, []>; -// TODO: remove AMDGPUadde/AMDGPUsube when ADDCARRY/SUBCARRY get their own -// nodes in TargetSelectionDAG.td. -def AMDGPUadde : SDNode<"ISD::ADDCARRY", AMDGPUAddeSubeOp, []>; - -def AMDGPUsube : SDNode<"ISD::SUBCARRY", AMDGPUAddeSubeOp, []>; - def AMDGPUSetCCOp : SDTypeProfile<1, 3, [ // setcc - SDTCisVT<0, i64>, SDTCisSameAs<1, 2>, SDTCisVT<3, OtherVT> + SDTCisInt<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, OtherVT> ]>; def AMDGPUsetcc : SDNode<"AMDGPUISD::SETCC", AMDGPUSetCCOp>; @@ -251,7 +239,8 @@ def AMDGPUdiv_scale : SDNode<"AMDGPUISD::DIV_SCALE", AMDGPUDivScaleOp>; // Special case divide FMA with scale and flags (src0 = Quotient, // src1 = Denominator, src2 = Numerator). -def AMDGPUdiv_fmas : SDNode<"AMDGPUISD::DIV_FMAS", AMDGPUFmasOp>; +def AMDGPUdiv_fmas : SDNode<"AMDGPUISD::DIV_FMAS", AMDGPUFmasOp, + [SDNPOptInGlue]>; // Single or double precision division fixup. 
// Special case divide fixup and flags(src0 = Quotient, src1 = @@ -370,6 +359,17 @@ def AMDGPUinterp_p2 : SDNode<"AMDGPUISD::INTERP_P2", SDTypeProfile<1, 4, [SDTCisFP<0>]>, [SDNPInGlue]>; +def AMDGPUinterp_p1ll_f16 : SDNode<"AMDGPUISD::INTERP_P1LL_F16", + SDTypeProfile<1, 7, [SDTCisFP<0>]>, + [SDNPInGlue, SDNPOutGlue]>; + +def AMDGPUinterp_p1lv_f16 : SDNode<"AMDGPUISD::INTERP_P1LV_F16", + SDTypeProfile<1, 9, [SDTCisFP<0>]>, + [SDNPInGlue, SDNPOutGlue]>; + +def AMDGPUinterp_p2_f16 : SDNode<"AMDGPUISD::INTERP_P2_F16", + SDTypeProfile<1, 8, [SDTCisFP<0>]>, + [SDNPInGlue]>; def AMDGPUkill : SDNode<"AMDGPUISD::KILL", AMDGPUKillSDT, [SDNPHasChain, SDNPSideEffect]>; diff --git a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 8eb49d49b2e0..901a2eaa8829 100644 --- a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -1,9 +1,8 @@ //===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file @@ -18,10 +17,11 @@ #include "AMDGPURegisterInfo.h" #include "AMDGPUSubtarget.h" #include "AMDGPUTargetMachine.h" -#include "SIMachineFunctionInfo.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" #include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h" +#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" @@ -35,6 +35,7 @@ #define DEBUG_TYPE "amdgpu-isel" using namespace llvm; +using namespace MIPatternMatch; #define GET_GLOBALISEL_IMPL #define AMDGPUSubtarget GCNSubtarget @@ -60,11 +61,101 @@ AMDGPUInstructionSelector::AMDGPUInstructionSelector( const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; } +static bool isSCC(Register Reg, const MachineRegisterInfo &MRI) { + if (TargetRegisterInfo::isPhysicalRegister(Reg)) + return Reg == AMDGPU::SCC; + + auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg); + const TargetRegisterClass *RC = + RegClassOrBank.dyn_cast<const TargetRegisterClass*>(); + if (RC) { + // FIXME: This is ambiguous for wave32. This could be SCC or VCC, but the + // context of the register bank has been lost. 
+ if (RC->getID() != AMDGPU::SReg_32_XM0RegClassID) + return false; + const LLT Ty = MRI.getType(Reg); + return Ty.isValid() && Ty.getSizeInBits() == 1; + } + + const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>(); + return RB->getID() == AMDGPU::SCCRegBankID; +} + +bool AMDGPUInstructionSelector::isVCC(Register Reg, + const MachineRegisterInfo &MRI) const { + if (TargetRegisterInfo::isPhysicalRegister(Reg)) + return Reg == TRI.getVCC(); + + auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg); + const TargetRegisterClass *RC = + RegClassOrBank.dyn_cast<const TargetRegisterClass*>(); + if (RC) { + const LLT Ty = MRI.getType(Reg); + return RC->hasSuperClassEq(TRI.getBoolRC()) && + Ty.isValid() && Ty.getSizeInBits() == 1; + } + + const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>(); + return RB->getID() == AMDGPU::VCCRegBankID; +} + bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const { + const DebugLoc &DL = I.getDebugLoc(); MachineBasicBlock *BB = I.getParent(); MachineFunction *MF = BB->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); I.setDesc(TII.get(TargetOpcode::COPY)); + + const MachineOperand &Src = I.getOperand(1); + MachineOperand &Dst = I.getOperand(0); + Register DstReg = Dst.getReg(); + Register SrcReg = Src.getReg(); + + if (isVCC(DstReg, MRI)) { + if (SrcReg == AMDGPU::SCC) { + const TargetRegisterClass *RC + = TRI.getConstrainedRegClassForOperand(Dst, MRI); + if (!RC) + return true; + return RBI.constrainGenericRegister(DstReg, *RC, MRI); + } + + if (!isVCC(SrcReg, MRI)) { + // TODO: Should probably leave the copy and let copyPhysReg expand it. + if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), MRI)) + return false; + + BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg) + .addImm(0) + .addReg(SrcReg); + + if (!MRI.getRegClassOrNull(SrcReg)) + MRI.setRegClass(SrcReg, TRI.getConstrainedRegClassForOperand(Src, MRI)); + I.eraseFromParent(); + return true; + } + + const TargetRegisterClass *RC = + TRI.getConstrainedRegClassForOperand(Dst, MRI); + if (RC && !RBI.constrainGenericRegister(DstReg, *RC, MRI)) + return false; + + // Don't constrain the source register to a class so the def instruction + // handles it (unless it's undef). + // + // FIXME: This is a hack. When selecting the def, we need to + specifically know that the result is VCCRegBank, and not just an SGPR + with size 1. An SReg_32 with size 1 is ambiguous with wave32. + if (Src.isUndef()) { + const TargetRegisterClass *SrcRC = + TRI.getConstrainedRegClassForOperand(Src, MRI); + if (SrcRC && !RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI)) + return false; + } + + return true; + } + for (const MachineOperand &MO : I.operands()) { if (TargetRegisterInfo::isPhysicalRegister(MO.getReg())) continue; @@ -78,15 +169,54 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const { return true; } +bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const { + MachineBasicBlock *BB = I.getParent(); + MachineFunction *MF = BB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + + const Register DefReg = I.getOperand(0).getReg(); + const LLT DefTy = MRI.getType(DefReg); + + // TODO: Verify this doesn't have insane operands (i.e.
VGPR to SGPR copy) + + const RegClassOrRegBank &RegClassOrBank = + MRI.getRegClassOrRegBank(DefReg); + + const TargetRegisterClass *DefRC + = RegClassOrBank.dyn_cast<const TargetRegisterClass *>(); + if (!DefRC) { + if (!DefTy.isValid()) { + LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n"); + return false; + } + + const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>(); + if (RB.getID() == AMDGPU::SCCRegBankID) { + LLVM_DEBUG(dbgs() << "illegal scc phi\n"); + return false; + } + + DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB, MRI); + if (!DefRC) { + LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n"); + return false; + } + } + + I.setDesc(TII.get(TargetOpcode::PHI)); + return RBI.constrainGenericRegister(DefReg, *DefRC, MRI); +} + MachineOperand AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO, + const TargetRegisterClass &SubRC, unsigned SubIdx) const { MachineInstr *MI = MO.getParent(); MachineBasicBlock *BB = MO.getParent()->getParent(); MachineFunction *MF = BB->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); - unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + Register DstReg = MRI.createVirtualRegister(&SubRC); if (MO.isReg()) { unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx); @@ -118,51 +248,273 @@ static int64_t getConstant(const MachineInstr *MI) { return MI->getOperand(1).getCImm()->getSExtValue(); } -bool AMDGPUInstructionSelector::selectG_ADD(MachineInstr &I) const { +static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) { + switch (Opc) { + case AMDGPU::G_AND: + return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32; + case AMDGPU::G_OR: + return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32; + case AMDGPU::G_XOR: + return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32; + default: + llvm_unreachable("not a bit op"); + } +} + +bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const { MachineBasicBlock *BB = I.getParent(); MachineFunction *MF = BB->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); - unsigned Size = RBI.getSizeInBits(I.getOperand(0).getReg(), MRI, TRI); - unsigned DstLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); - unsigned DstHi = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + MachineOperand &Dst = I.getOperand(0); + MachineOperand &Src0 = I.getOperand(1); + MachineOperand &Src1 = I.getOperand(2); + Register DstReg = Dst.getReg(); + unsigned Size = RBI.getSizeInBits(DstReg, MRI, TRI); + + const RegisterBank *DstRB = RBI.getRegBank(DstReg, MRI, TRI); + if (DstRB->getID() == AMDGPU::VCCRegBankID) { + const TargetRegisterClass *RC = TRI.getBoolRC(); + unsigned InstOpc = getLogicalBitOpcode(I.getOpcode(), + RC == &AMDGPU::SReg_64RegClass); + I.setDesc(TII.get(InstOpc)); + + // FIXME: Hack to avoid turning the register bank into a register class. + // The selector for G_ICMP relies on seeing that the register bank for the + // result is VCC. In wave32 if we constrain the registers to SReg_32 here, + // it will be ambiguous whether it's a scalar or vector bool. + if (Src0.isUndef() && !MRI.getRegClassOrNull(Src0.getReg())) + MRI.setRegClass(Src0.getReg(), RC); + if (Src1.isUndef() && !MRI.getRegClassOrNull(Src1.getReg())) + MRI.setRegClass(Src1.getReg(), RC); + + return RBI.constrainGenericRegister(DstReg, *RC, MRI); + } - if (Size != 64) - return false; + // TODO: Should this allow an SCC bank result, and produce a copy from SCC for + // the result?
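// [Illustrative sketch, not part of the commit] Concrete instances of the
// getLogicalBitOpcode() mapping above, assuming a wave64 subtarget where
// TRI.getBoolRC() == &AMDGPU::SReg_64RegClass: a 1-bit G_AND whose result
// lives on the VCC bank selects getLogicalBitOpcode(G_AND, /*Is64=*/true)
// == S_AND_B64 and operates on the whole 64-lane wave mask, while a 32-bit
// G_XOR on the SGPR bank selects S_XOR_B32 and a 64-bit one selects
// S_XOR_B64 (the Size > 32 case handled just below).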
+ if (DstRB->getID() == AMDGPU::SGPRRegBankID) { + unsigned InstOpc = getLogicalBitOpcode(I.getOpcode(), Size > 32); + I.setDesc(TII.get(InstOpc)); - DebugLoc DL = I.getDebugLoc(); + const TargetRegisterClass *RC + = TRI.getConstrainedRegClassForOperand(Dst, MRI); + if (!RC) + return false; + return RBI.constrainGenericRegister(DstReg, *RC, MRI) && + RBI.constrainGenericRegister(Src0.getReg(), *RC, MRI) && + RBI.constrainGenericRegister(Src1.getReg(), *RC, MRI); + } - MachineOperand Lo1(getSubOperand64(I.getOperand(1), AMDGPU::sub0)); - MachineOperand Lo2(getSubOperand64(I.getOperand(2), AMDGPU::sub0)); + return false; +} - BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo) - .add(Lo1) - .add(Lo2); +bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const { + MachineBasicBlock *BB = I.getParent(); + MachineFunction *MF = BB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + Register DstReg = I.getOperand(0).getReg(); + const DebugLoc &DL = I.getDebugLoc(); + unsigned Size = RBI.getSizeInBits(DstReg, MRI, TRI); + const RegisterBank *DstRB = RBI.getRegBank(DstReg, MRI, TRI); + const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID; + const bool Sub = I.getOpcode() == TargetOpcode::G_SUB; - MachineOperand Hi1(getSubOperand64(I.getOperand(1), AMDGPU::sub1)); - MachineOperand Hi2(getSubOperand64(I.getOperand(2), AMDGPU::sub1)); + if (Size == 32) { + if (IsSALU) { + const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32; + MachineInstr *Add = + BuildMI(*BB, &I, DL, TII.get(Opc), DstReg) + .add(I.getOperand(1)) + .add(I.getOperand(2)); + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI); + } - BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi) - .add(Hi1) - .add(Hi2); + if (STI.hasAddNoCarry()) { + const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64; + I.setDesc(TII.get(Opc)); + I.addOperand(*MF, MachineOperand::CreateImm(0)); + I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); + } - BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), I.getOperand(0).getReg()) - .addReg(DstLo) - .addImm(AMDGPU::sub0) - .addReg(DstHi) - .addImm(AMDGPU::sub1); + const unsigned Opc = Sub ? AMDGPU::V_SUB_I32_e64 : AMDGPU::V_ADD_I32_e64; - for (MachineOperand &MO : I.explicit_operands()) { - if (!MO.isReg() || TargetRegisterInfo::isPhysicalRegister(MO.getReg())) - continue; - RBI.constrainGenericRegister(MO.getReg(), AMDGPU::SReg_64RegClass, MRI); + Register UnusedCarry = MRI.createVirtualRegister(TRI.getWaveMaskRegClass()); + MachineInstr *Add + = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg) + .addDef(UnusedCarry, RegState::Dead) + .add(I.getOperand(1)) + .add(I.getOperand(2)) + .addImm(0); + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI); } + assert(!Sub && "illegal sub should not reach here"); + + const TargetRegisterClass &RC + = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass; + const TargetRegisterClass &HalfRC + = IsSALU ? 
AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass; + + MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0)); + MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0)); + MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1)); + MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1)); + + Register DstLo = MRI.createVirtualRegister(&HalfRC); + Register DstHi = MRI.createVirtualRegister(&HalfRC); + + if (IsSALU) { + BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo) + .add(Lo1) + .add(Lo2); + BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi) + .add(Hi1) + .add(Hi2); + } else { + const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass(); + Register CarryReg = MRI.createVirtualRegister(CarryRC); + BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_I32_e64), DstLo) + .addDef(CarryReg) + .add(Lo1) + .add(Lo2) + .addImm(0); + MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi) + .addDef(MRI.createVirtualRegister(CarryRC), RegState::Dead) + .add(Hi1) + .add(Hi2) + .addReg(CarryReg, RegState::Kill) + .addImm(0); + + if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI)) + return false; + } + + BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) + .addReg(DstLo) + .addImm(AMDGPU::sub0) + .addReg(DstHi) + .addImm(AMDGPU::sub1); + + + if (!RBI.constrainGenericRegister(DstReg, RC, MRI)) + return false; + + I.eraseFromParent(); + return true; +} + +bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const { + MachineBasicBlock *BB = I.getParent(); + MachineFunction *MF = BB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + assert(I.getOperand(2).getImm() % 32 == 0); + unsigned SubReg = TRI.getSubRegFromChannel(I.getOperand(2).getImm() / 32); + const DebugLoc &DL = I.getDebugLoc(); + MachineInstr *Copy = BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), + I.getOperand(0).getReg()) + .addReg(I.getOperand(1).getReg(), 0, SubReg); + + for (const MachineOperand &MO : Copy->operands()) { + const TargetRegisterClass *RC = + TRI.getConstrainedRegClassForOperand(MO, MRI); + if (!RC) + continue; + RBI.constrainGenericRegister(MO.getReg(), *RC, MRI); + } I.eraseFromParent(); return true; } +bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const { + MachineBasicBlock *BB = MI.getParent(); + MachineFunction *MF = BB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + Register DstReg = MI.getOperand(0).getReg(); + LLT DstTy = MRI.getType(DstReg); + LLT SrcTy = MRI.getType(MI.getOperand(1).getReg()); + + const unsigned SrcSize = SrcTy.getSizeInBits(); + if (SrcSize < 32) + return false; + + const DebugLoc &DL = MI.getDebugLoc(); + const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, TRI); + const unsigned DstSize = DstTy.getSizeInBits(); + const TargetRegisterClass *DstRC = + TRI.getRegClassForSizeOnBank(DstSize, *DstBank, MRI); + if (!DstRC) + return false; + + ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8); + MachineInstrBuilder MIB = + BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg); + for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) { + MachineOperand &Src = MI.getOperand(I + 1); + MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef())); + MIB.addImm(SubRegs[I]); + + const TargetRegisterClass *SrcRC + = TRI.getConstrainedRegClassForOperand(Src, MRI); + if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, MRI)) + return false; + } + + if 
(!RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) + return false; + + MI.eraseFromParent(); + return true; +} + +bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const { + MachineBasicBlock *BB = MI.getParent(); + MachineFunction *MF = BB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + const int NumDst = MI.getNumOperands() - 1; + + MachineOperand &Src = MI.getOperand(NumDst); + + Register SrcReg = Src.getReg(); + Register DstReg0 = MI.getOperand(0).getReg(); + LLT DstTy = MRI.getType(DstReg0); + LLT SrcTy = MRI.getType(SrcReg); + + const unsigned DstSize = DstTy.getSizeInBits(); + const unsigned SrcSize = SrcTy.getSizeInBits(); + const DebugLoc &DL = MI.getDebugLoc(); + const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, TRI); + + const TargetRegisterClass *SrcRC = + TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, MRI); + if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI)) + return false; + + const unsigned SrcFlags = getUndefRegState(Src.isUndef()); + + // Note we could have mixed SGPR and VGPR destination banks for an SGPR + // source, and this relies on the fact that the same subregister indices are + // used for both. + ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8); + for (int I = 0, E = NumDst; I != E; ++I) { + MachineOperand &Dst = MI.getOperand(I); + BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg()) + .addReg(SrcReg, SrcFlags, SubRegs[I]); + + const TargetRegisterClass *DstRC = + TRI.getConstrainedRegClassForOperand(Dst, MRI); + if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, MRI)) + return false; + } + + MI.eraseFromParent(); + return true; +} + bool AMDGPUInstructionSelector::selectG_GEP(MachineInstr &I) const { - return selectG_ADD(I); + return selectG_ADD_SUB(I); } bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const { @@ -170,47 +522,200 @@ bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const { MachineFunction *MF = BB->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); const MachineOperand &MO = I.getOperand(0); - const TargetRegisterClass *RC = - TRI.getConstrainedRegClassForOperand(MO, MRI); - if (RC) + + // FIXME: Interface for getConstrainedRegClassForOperand needs work. The + // regbank check here is to know why getConstrainedRegClassForOperand failed. 
+ const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, MRI); + if ((!RC && !MRI.getRegBankOrNull(MO.getReg())) || + (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, MRI))) { + I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF)); + return true; + } + + return false; +} + +bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const { + MachineBasicBlock *BB = I.getParent(); + MachineFunction *MF = BB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + unsigned SubReg = TRI.getSubRegFromChannel(I.getOperand(3).getImm() / 32); + DebugLoc DL = I.getDebugLoc(); + MachineInstr *Ins = BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG)) + .addDef(I.getOperand(0).getReg()) + .addReg(I.getOperand(1).getReg()) + .addReg(I.getOperand(2).getReg()) + .addImm(SubReg); + + for (const MachineOperand &MO : Ins->operands()) { + if (!MO.isReg()) + continue; + if (TargetRegisterInfo::isPhysicalRegister(MO.getReg())) + continue; + + const TargetRegisterClass *RC = + TRI.getConstrainedRegClassForOperand(MO, MRI); + if (!RC) + continue; RBI.constrainGenericRegister(MO.getReg(), *RC, MRI); - I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF)); + } + I.eraseFromParent(); return true; } -bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I, - CodeGenCoverage &CoverageInfo) const { - unsigned IntrinsicID = I.getOperand(1).getIntrinsicID(); - +bool AMDGPUInstructionSelector::selectG_INTRINSIC( + MachineInstr &I, CodeGenCoverage &CoverageInfo) const { + unsigned IntrinsicID = I.getOperand(I.getNumExplicitDefs()).getIntrinsicID(); switch (IntrinsicID) { - default: - break; case Intrinsic::maxnum: case Intrinsic::minnum: case Intrinsic::amdgcn_cvt_pkrtz: return selectImpl(I, CoverageInfo); - - case Intrinsic::amdgcn_kernarg_segment_ptr: { - MachineFunction *MF = I.getParent()->getParent(); + case Intrinsic::amdgcn_if_break: { + MachineBasicBlock *BB = I.getParent(); + MachineFunction *MF = BB->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); - const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); - const ArgDescriptor *InputPtrReg; - const TargetRegisterClass *RC; - const DebugLoc &DL = I.getDebugLoc(); - - std::tie(InputPtrReg, RC) - = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); - if (!InputPtrReg) - report_fatal_error("missing kernarg segment ptr"); - BuildMI(*I.getParent(), &I, DL, TII.get(AMDGPU::COPY)) + // FIXME: Manually selecting to avoid dealing with the SReg_1 trick + // SelectionDAG uses for wave32 vs wave64. + BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK)) .add(I.getOperand(0)) - .addReg(MRI.getLiveInVirtReg(InputPtrReg->getRegister())); + .add(I.getOperand(2)) + .add(I.getOperand(3)); + + Register DstReg = I.getOperand(0).getReg(); + Register Src0Reg = I.getOperand(2).getReg(); + Register Src1Reg = I.getOperand(3).getReg(); + I.eraseFromParent(); + + for (Register Reg : { DstReg, Src0Reg, Src1Reg }) { + if (!MRI.getRegClassOrNull(Reg)) + MRI.setRegClass(Reg, TRI.getWaveMaskRegClass()); + } + + return true; } + default: + return selectImpl(I, CoverageInfo); + } +} + +static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size) { + if (Size != 32 && Size != 64) + return -1; + switch (P) { + default: + llvm_unreachable("Unknown condition code!"); + case CmpInst::ICMP_NE: + return Size == 32 ? AMDGPU::V_CMP_NE_U32_e64 : AMDGPU::V_CMP_NE_U64_e64; + case CmpInst::ICMP_EQ: + return Size == 32 ?
AMDGPU::V_CMP_EQ_U32_e64 : AMDGPU::V_CMP_EQ_U64_e64; + case CmpInst::ICMP_SGT: + return Size == 32 ? AMDGPU::V_CMP_GT_I32_e64 : AMDGPU::V_CMP_GT_I64_e64; + case CmpInst::ICMP_SGE: + return Size == 32 ? AMDGPU::V_CMP_GE_I32_e64 : AMDGPU::V_CMP_GE_I64_e64; + case CmpInst::ICMP_SLT: + return Size == 32 ? AMDGPU::V_CMP_LT_I32_e64 : AMDGPU::V_CMP_LT_I64_e64; + case CmpInst::ICMP_SLE: + return Size == 32 ? AMDGPU::V_CMP_LE_I32_e64 : AMDGPU::V_CMP_LE_I64_e64; + case CmpInst::ICMP_UGT: + return Size == 32 ? AMDGPU::V_CMP_GT_U32_e64 : AMDGPU::V_CMP_GT_U64_e64; + case CmpInst::ICMP_UGE: + return Size == 32 ? AMDGPU::V_CMP_GE_U32_e64 : AMDGPU::V_CMP_GE_U64_e64; + case CmpInst::ICMP_ULT: + return Size == 32 ? AMDGPU::V_CMP_LT_U32_e64 : AMDGPU::V_CMP_LT_U64_e64; + case CmpInst::ICMP_ULE: + return Size == 32 ? AMDGPU::V_CMP_LE_U32_e64 : AMDGPU::V_CMP_LE_U64_e64; } - return false; +} + +int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P, + unsigned Size) const { + if (Size == 64) { + if (!STI.hasScalarCompareEq64()) + return -1; + + switch (P) { + case CmpInst::ICMP_NE: + return AMDGPU::S_CMP_LG_U64; + case CmpInst::ICMP_EQ: + return AMDGPU::S_CMP_EQ_U64; + default: + return -1; + } + } + + if (Size != 32) + return -1; + + switch (P) { + case CmpInst::ICMP_NE: + return AMDGPU::S_CMP_LG_U32; + case CmpInst::ICMP_EQ: + return AMDGPU::S_CMP_EQ_U32; + case CmpInst::ICMP_SGT: + return AMDGPU::S_CMP_GT_I32; + case CmpInst::ICMP_SGE: + return AMDGPU::S_CMP_GE_I32; + case CmpInst::ICMP_SLT: + return AMDGPU::S_CMP_LT_I32; + case CmpInst::ICMP_SLE: + return AMDGPU::S_CMP_LE_I32; + case CmpInst::ICMP_UGT: + return AMDGPU::S_CMP_GT_U32; + case CmpInst::ICMP_UGE: + return AMDGPU::S_CMP_GE_U32; + case CmpInst::ICMP_ULT: + return AMDGPU::S_CMP_LT_U32; + case CmpInst::ICMP_ULE: + return AMDGPU::S_CMP_LE_U32; + default: + llvm_unreachable("Unknown condition code!"); + } +} + +bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const { + MachineBasicBlock *BB = I.getParent(); + MachineFunction *MF = BB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + const DebugLoc &DL = I.getDebugLoc(); + + unsigned SrcReg = I.getOperand(2).getReg(); + unsigned Size = RBI.getSizeInBits(SrcReg, MRI, TRI); + + auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate(); + + unsigned CCReg = I.getOperand(0).getReg(); + if (isSCC(CCReg, MRI)) { + int Opcode = getS_CMPOpcode(Pred, Size); + if (Opcode == -1) + return false; + MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode)) + .add(I.getOperand(2)) + .add(I.getOperand(3)); + BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg) + .addReg(AMDGPU::SCC); + bool Ret = + constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) && + RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, MRI); + I.eraseFromParent(); + return Ret; + } + + int Opcode = getV_CMPOpcode(Pred, Size); + if (Opcode == -1) + return false; + + MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), + I.getOperand(0).getReg()) + .add(I.getOperand(2)) + .add(I.getOperand(3)); + RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(), + *TRI.getBoolRC(), MRI); + bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI); + I.eraseFromParent(); + return Ret; } static MachineInstr * @@ -232,8 +737,7 @@ buildEXP(const TargetInstrInfo &TII, MachineInstr *Insert, unsigned Tgt, } bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( - MachineInstr &I, - CodeGenCoverage &CoverageInfo) const { + MachineInstr &I, CodeGenCoverage &CoverageInfo) const { 
MachineBasicBlock *BB = I.getParent(); MachineFunction *MF = BB->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); @@ -272,8 +776,72 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( I.eraseFromParent(); return constrainSelectedInstRegOperands(*Exp, TII, TRI, RBI); } + case Intrinsic::amdgcn_end_cf: { + // FIXME: Manually selecting to avoid dealing with the SReg_1 trick + // SelectionDAG uses for wave32 vs wave64. + BuildMI(*BB, &I, I.getDebugLoc(), + TII.get(AMDGPU::SI_END_CF)) + .add(I.getOperand(1)); + + Register Reg = I.getOperand(1).getReg(); + I.eraseFromParent(); + + if (!MRI.getRegClassOrNull(Reg)) + MRI.setRegClass(Reg, TRI.getWaveMaskRegClass()); + return true; } - return false; + default: + return selectImpl(I, CoverageInfo); + } +} + +bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const { + MachineBasicBlock *BB = I.getParent(); + MachineFunction *MF = BB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + const DebugLoc &DL = I.getDebugLoc(); + + unsigned DstReg = I.getOperand(0).getReg(); + unsigned Size = RBI.getSizeInBits(DstReg, MRI, TRI); + assert(Size <= 32 || Size == 64); + const MachineOperand &CCOp = I.getOperand(1); + unsigned CCReg = CCOp.getReg(); + if (isSCC(CCReg, MRI)) { + unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 : + AMDGPU::S_CSELECT_B32; + MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC) + .addReg(CCReg); + + // The generic constrainSelectedInstRegOperands doesn't work for the scc register + // bank, because it does not cover the register class that we use to represent + // it. So we need to manually set the register class here. + if (!MRI.getRegClassOrNull(CCReg)) + MRI.setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, MRI)); + MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg) + .add(I.getOperand(2)) + .add(I.getOperand(3)); + + bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI) | + constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI); + I.eraseFromParent(); + return Ret; + } + + // Wide VGPR select should have been split in RegBankSelect. + if (Size > 32) + return false; + + MachineInstr *Select = + BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg) + .addImm(0) + .add(I.getOperand(3)) + .addImm(0) + .add(I.getOperand(2)) + .add(I.getOperand(1)); + + bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI); + I.eraseFromParent(); + return Ret; } bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const { @@ -281,10 +849,16 @@ bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const { MachineFunction *MF = BB->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); DebugLoc DL = I.getDebugLoc(); + unsigned PtrSize = RBI.getSizeInBits(I.getOperand(1).getReg(), MRI, TRI); + if (PtrSize != 64) { + LLVM_DEBUG(dbgs() << "Unhandled address space\n"); + return false; + } + unsigned StoreSize = RBI.getSizeInBits(I.getOperand(0).getReg(), MRI, TRI); unsigned Opcode; - // FIXME: Select store instruction based on address space + // FIXME: Remove this when integers > s32 are naturally selected.
switch (StoreSize) { default: return false; @@ -307,7 +881,8 @@ bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const { .add(I.getOperand(0)) .addImm(0) // offset .addImm(0) // glc - .addImm(0); // slc + .addImm(0) // slc + .addImm(0); // dlc // Now that we selected an opcode, we need to constrain the register @@ -318,6 +893,218 @@ bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const { return Ret; } +static int sizeToSubRegIndex(unsigned Size) { + switch (Size) { + case 32: + return AMDGPU::sub0; + case 64: + return AMDGPU::sub0_sub1; + case 96: + return AMDGPU::sub0_sub1_sub2; + case 128: + return AMDGPU::sub0_sub1_sub2_sub3; + case 256: + return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7; + default: + if (Size < 32) + return AMDGPU::sub0; + if (Size > 256) + return -1; + return sizeToSubRegIndex(PowerOf2Ceil(Size)); + } +} + +bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const { + MachineBasicBlock *BB = I.getParent(); + MachineFunction *MF = BB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + + unsigned DstReg = I.getOperand(0).getReg(); + unsigned SrcReg = I.getOperand(1).getReg(); + const LLT DstTy = MRI.getType(DstReg); + const LLT SrcTy = MRI.getType(SrcReg); + if (!DstTy.isScalar()) + return false; + + const RegisterBank *DstRB = RBI.getRegBank(DstReg, MRI, TRI); + const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, MRI, TRI); + if (SrcRB != DstRB) + return false; + + unsigned DstSize = DstTy.getSizeInBits(); + unsigned SrcSize = SrcTy.getSizeInBits(); + + const TargetRegisterClass *SrcRC + = TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, MRI); + const TargetRegisterClass *DstRC + = TRI.getRegClassForSizeOnBank(DstSize, *DstRB, MRI); + + if (SrcSize > 32) { + int SubRegIdx = sizeToSubRegIndex(DstSize); + if (SubRegIdx == -1) + return false; + + // Deal with weird cases where the class only partially supports the subreg + // index. + SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx); + if (!SrcRC) + return false; + + I.getOperand(1).setSubReg(SubRegIdx); + } + + if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) || + !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) { + LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n"); + return false; + } + + I.setDesc(TII.get(TargetOpcode::COPY)); + return true; +} + +/// \returns true if a bitmask for \p Size bits will be an inline immediate. +static bool shouldUseAndMask(unsigned Size, unsigned &Mask) { + Mask = maskTrailingOnes<unsigned>(Size); + int SignedMask = static_cast<int>(Mask); + return SignedMask >= -16 && SignedMask <= 64; +} + +bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { + bool Signed = I.getOpcode() == AMDGPU::G_SEXT; + const DebugLoc &DL = I.getDebugLoc(); + MachineBasicBlock &MBB = *I.getParent(); + MachineFunction &MF = *MBB.getParent(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + const unsigned DstReg = I.getOperand(0).getReg(); + const unsigned SrcReg = I.getOperand(1).getReg(); + + const LLT DstTy = MRI.getType(DstReg); + const LLT SrcTy = MRI.getType(SrcReg); + const LLT S1 = LLT::scalar(1); + const unsigned SrcSize = SrcTy.getSizeInBits(); + const unsigned DstSize = DstTy.getSizeInBits(); + if (!DstTy.isScalar()) + return false; + + const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, TRI); + + if (SrcBank->getID() == AMDGPU::SCCRegBankID) { + if (SrcTy != S1 || DstSize > 64) // Invalid + return false; + + unsigned Opcode = + DstSize > 32 ? 
AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32; + const TargetRegisterClass *DstRC = + DstSize > 32 ? &AMDGPU::SReg_64RegClass : &AMDGPU::SReg_32RegClass; + + // FIXME: Create an extra copy to avoid incorrectly constraining the result + // of the scc producer. + unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + BuildMI(MBB, I, DL, TII.get(AMDGPU::COPY), TmpReg) + .addReg(SrcReg); + BuildMI(MBB, I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC) + .addReg(TmpReg); + + // The instruction operands are backwards from what you would expect. + BuildMI(MBB, I, DL, TII.get(Opcode), DstReg) + .addImm(0) + .addImm(Signed ? -1 : 1); + return RBI.constrainGenericRegister(DstReg, *DstRC, MRI); + } + + if (SrcBank->getID() == AMDGPU::VCCRegBankID && DstSize <= 32) { + if (SrcTy != S1) // Invalid + return false; + + MachineInstr *ExtI = + BuildMI(MBB, I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg) + .addImm(0) // src0_modifiers + .addImm(0) // src0 + .addImm(0) // src1_modifiers + .addImm(Signed ? -1 : 1) // src1 + .addUse(SrcReg); + return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); + } + + if (I.getOpcode() == AMDGPU::G_ANYEXT) + return selectCOPY(I); + + if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) { + // 64-bit should have been split up in RegBankSelect + + // Try to use an and with a mask if it will save code size. + unsigned Mask; + if (!Signed && shouldUseAndMask(SrcSize, Mask)) { + MachineInstr *ExtI = + BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg) + .addImm(Mask) + .addReg(SrcReg); + return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); + } + + const unsigned BFE = Signed ? AMDGPU::V_BFE_I32 : AMDGPU::V_BFE_U32; + MachineInstr *ExtI = + BuildMI(MBB, I, DL, TII.get(BFE), DstReg) + .addReg(SrcReg) + .addImm(0) // Offset + .addImm(SrcSize); // Width + return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); + } + + if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) { + if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI)) + return false; + + if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) { + const unsigned SextOpc = SrcSize == 8 ? + AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16; + BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg) + .addReg(SrcReg); + return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, MRI); + } + + const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64; + const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32; + + // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width. + if (DstSize > 32 && SrcSize <= 32) { + // We need a 64-bit register source, but the high bits don't matter. 
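// [Illustrative sketch, not part of the commit] Given the encoding above, a
// zero-offset extract needs only (SrcSize << 16) as the immediate: a
// u16 -> u64 zero-extend through this path emits S_BFE_U64 with the
// immediate (16 << 16) == 0x100000 (offset 0, width 16). The
// shouldUseAndMask() shortcut applies only when the mask is an inline
// immediate in [-16, 64]: SrcSize == 6 gives Mask == 63 (inline, plain AND),
// while SrcSize == 8 gives Mask == 255 and must use BFE.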
+ unsigned ExtReg + = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + unsigned UndefReg + = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg); + BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg) + .addReg(SrcReg) + .addImm(AMDGPU::sub0) + .addReg(UndefReg) + .addImm(AMDGPU::sub1); + + BuildMI(MBB, I, DL, TII.get(BFE64), DstReg) + .addReg(ExtReg) + .addImm(SrcSize << 16); + + return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, MRI); + } + + unsigned Mask; + if (!Signed && shouldUseAndMask(SrcSize, Mask)) { + BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg) + .addReg(SrcReg) + .addImm(Mask); + } else { + BuildMI(MBB, I, DL, TII.get(BFE32), DstReg) + .addReg(SrcReg) + .addImm(SrcSize << 16); + } + + return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, MRI); + } + + return false; +} + bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const { MachineBasicBlock *BB = I.getParent(); MachineFunction *MF = BB->getParent(); @@ -423,7 +1210,7 @@ void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load, getAddrModeInfo(*PtrMI, MRI, AddrInfo); } -static bool isInstrUniform(const MachineInstr &MI) { +bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const { if (!MI.hasOneMemOperand()) return false; @@ -445,52 +1232,6 @@ static bool isInstrUniform(const MachineInstr &MI) { return I && I->getMetadata("amdgpu.uniform"); } -static unsigned getSmrdOpcode(unsigned BaseOpcode, unsigned LoadSize) { - - if (LoadSize == 32) - return BaseOpcode; - - switch (BaseOpcode) { - case AMDGPU::S_LOAD_DWORD_IMM: - switch (LoadSize) { - case 64: - return AMDGPU::S_LOAD_DWORDX2_IMM; - case 128: - return AMDGPU::S_LOAD_DWORDX4_IMM; - case 256: - return AMDGPU::S_LOAD_DWORDX8_IMM; - case 512: - return AMDGPU::S_LOAD_DWORDX16_IMM; - } - break; - case AMDGPU::S_LOAD_DWORD_IMM_ci: - switch (LoadSize) { - case 64: - return AMDGPU::S_LOAD_DWORDX2_IMM_ci; - case 128: - return AMDGPU::S_LOAD_DWORDX4_IMM_ci; - case 256: - return AMDGPU::S_LOAD_DWORDX8_IMM_ci; - case 512: - return AMDGPU::S_LOAD_DWORDX16_IMM_ci; - } - break; - case AMDGPU::S_LOAD_DWORD_SGPR: - switch (LoadSize) { - case 64: - return AMDGPU::S_LOAD_DWORDX2_SGPR; - case 128: - return AMDGPU::S_LOAD_DWORDX4_SGPR; - case 256: - return AMDGPU::S_LOAD_DWORDX8_SGPR; - case 512: - return AMDGPU::S_LOAD_DWORDX16_SGPR; - } - break; - } - llvm_unreachable("Invalid base smrd opcode or size"); -} - bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const { for (const GEPInfo &GEPInfo : AddrInfo) { if (!GEPInfo.VgprParts.empty()) @@ -499,125 +1240,77 @@ bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const { return false; } -bool AMDGPUInstructionSelector::selectSMRD(MachineInstr &I, - ArrayRef<GEPInfo> AddrInfo) const { - - if (!I.hasOneMemOperand()) - return false; - - if ((*I.memoperands_begin())->getAddrSpace() != AMDGPUAS::CONSTANT_ADDRESS && - (*I.memoperands_begin())->getAddrSpace() != AMDGPUAS::CONSTANT_ADDRESS_32BIT) - return false; - - if (!isInstrUniform(I)) - return false; - - if (hasVgprParts(AddrInfo)) - return false; +bool AMDGPUInstructionSelector::selectG_LOAD(MachineInstr &I) const { + // TODO: Can/should we insert m0 initialization here for DS instructions and + // call the normal selector? 
+ return false; +} +bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const { MachineBasicBlock *BB = I.getParent(); MachineFunction *MF = BB->getParent(); - const GCNSubtarget &Subtarget = MF->getSubtarget<GCNSubtarget>(); MachineRegisterInfo &MRI = MF->getRegInfo(); - unsigned DstReg = I.getOperand(0).getReg(); + MachineOperand &CondOp = I.getOperand(0); + Register CondReg = CondOp.getReg(); const DebugLoc &DL = I.getDebugLoc(); - unsigned Opcode; - unsigned LoadSize = RBI.getSizeInBits(DstReg, MRI, TRI); - - if (!AddrInfo.empty() && AddrInfo[0].SgprParts.size() == 1) { - - const GEPInfo &GEPInfo = AddrInfo[0]; - - unsigned PtrReg = GEPInfo.SgprParts[0]; - int64_t EncodedImm = AMDGPU::getSMRDEncodedOffset(Subtarget, GEPInfo.Imm); - if (AMDGPU::isLegalSMRDImmOffset(Subtarget, GEPInfo.Imm)) { - Opcode = getSmrdOpcode(AMDGPU::S_LOAD_DWORD_IMM, LoadSize); - MachineInstr *SMRD = BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg) - .addReg(PtrReg) - .addImm(EncodedImm) - .addImm(0); // glc - return constrainSelectedInstRegOperands(*SMRD, TII, TRI, RBI); - } + unsigned BrOpcode; + Register CondPhysReg; + const TargetRegisterClass *ConstrainRC; + + // In SelectionDAG, we inspect the IR block for uniformity metadata to decide + // whether the branch is uniform when selecting the instruction. In + // GlobalISel, we should push that decision into RegBankSelect. Assume for now + // RegBankSelect knows what it's doing if the branch condition is scc, even + // though it currently does not. + if (isSCC(CondReg, MRI)) { + CondPhysReg = AMDGPU::SCC; + BrOpcode = AMDGPU::S_CBRANCH_SCC1; + ConstrainRC = &AMDGPU::SReg_32_XM0RegClass; + } else if (isVCC(CondReg, MRI)) { + // FIXME: Do we have to insert an and with exec here, like in SelectionDAG? + // We sort of know, based on the register bank, that a VCC producer ands + // inactive lanes with 0. What if there was a logical operation with vcc + // producers in different blocks/with different exec masks? + // FIXME: Should scc->vcc copies and with exec?
+ CondPhysReg = TRI.getVCC(); + BrOpcode = AMDGPU::S_CBRANCH_VCCNZ; + ConstrainRC = TRI.getBoolRC(); + } else + return false; - if (Subtarget.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS && - isUInt<32>(EncodedImm)) { - Opcode = getSmrdOpcode(AMDGPU::S_LOAD_DWORD_IMM_ci, LoadSize); - MachineInstr *SMRD = BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg) - .addReg(PtrReg) - .addImm(EncodedImm) - .addImm(0); // glc - return constrainSelectedInstRegOperands(*SMRD, TII, TRI, RBI); - } + if (!MRI.getRegClassOrNull(CondReg)) + MRI.setRegClass(CondReg, ConstrainRC); - if (isUInt<32>(GEPInfo.Imm)) { - Opcode = getSmrdOpcode(AMDGPU::S_LOAD_DWORD_SGPR, LoadSize); - unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); - BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), OffsetReg) - .addImm(GEPInfo.Imm); - - MachineInstr *SMRD = BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg) - .addReg(PtrReg) - .addReg(OffsetReg) - .addImm(0); // glc - return constrainSelectedInstRegOperands(*SMRD, TII, TRI, RBI); - } - } + BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg) + .addReg(CondReg); + BuildMI(*BB, &I, DL, TII.get(BrOpcode)) + .addMBB(I.getOperand(1).getMBB()); - unsigned PtrReg = I.getOperand(1).getReg(); - Opcode = getSmrdOpcode(AMDGPU::S_LOAD_DWORD_IMM, LoadSize); - MachineInstr *SMRD = BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg) - .addReg(PtrReg) - .addImm(0) - .addImm(0); // glc - return constrainSelectedInstRegOperands(*SMRD, TII, TRI, RBI); + I.eraseFromParent(); + return true; } - -bool AMDGPUInstructionSelector::selectG_LOAD(MachineInstr &I) const { +bool AMDGPUInstructionSelector::selectG_FRAME_INDEX(MachineInstr &I) const { MachineBasicBlock *BB = I.getParent(); MachineFunction *MF = BB->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); - DebugLoc DL = I.getDebugLoc(); - unsigned DstReg = I.getOperand(0).getReg(); - unsigned PtrReg = I.getOperand(1).getReg(); - unsigned LoadSize = RBI.getSizeInBits(DstReg, MRI, TRI); - unsigned Opcode; - - SmallVector<GEPInfo, 4> AddrInfo; - - getAddrModeInfo(I, MRI, AddrInfo); - - if (selectSMRD(I, AddrInfo)) { - I.eraseFromParent(); - return true; - } - switch (LoadSize) { - default: - llvm_unreachable("Load size not supported\n"); - case 32: - Opcode = AMDGPU::FLAT_LOAD_DWORD; - break; - case 64: - Opcode = AMDGPU::FLAT_LOAD_DWORDX2; - break; - } + Register DstReg = I.getOperand(0).getReg(); + const RegisterBank *DstRB = RBI.getRegBank(DstReg, MRI, TRI); + const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID; + I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32)); + if (IsVGPR) + I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); - MachineInstr *Flat = BuildMI(*BB, &I, DL, TII.get(Opcode)) - .add(I.getOperand(0)) - .addReg(PtrReg) - .addImm(0) // offset - .addImm(0) // glc - .addImm(0); // slc - - bool Ret = constrainSelectedInstRegOperands(*Flat, TII, TRI, RBI); - I.eraseFromParent(); - return Ret; + return RBI.constrainGenericRegister( + DstReg, IsVGPR ? 
AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, MRI); } bool AMDGPUInstructionSelector::select(MachineInstr &I, CodeGenCoverage &CoverageInfo) const { + if (I.isPHI()) + return selectPHI(I); if (!isPreISelGenericOpcode(I.getOpcode())) { if (I.isCopy()) @@ -626,28 +1319,75 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I, } switch (I.getOpcode()) { - default: + case TargetOpcode::G_AND: + case TargetOpcode::G_OR: + case TargetOpcode::G_XOR: + if (selectG_AND_OR_XOR(I)) + return true; return selectImpl(I, CoverageInfo); case TargetOpcode::G_ADD: - return selectG_ADD(I); + case TargetOpcode::G_SUB: + if (selectG_ADD_SUB(I)) + return true; + LLVM_FALLTHROUGH; + default: + return selectImpl(I, CoverageInfo); case TargetOpcode::G_INTTOPTR: case TargetOpcode::G_BITCAST: return selectCOPY(I); case TargetOpcode::G_CONSTANT: case TargetOpcode::G_FCONSTANT: return selectG_CONSTANT(I); + case TargetOpcode::G_EXTRACT: + return selectG_EXTRACT(I); + case TargetOpcode::G_MERGE_VALUES: + case TargetOpcode::G_BUILD_VECTOR: + case TargetOpcode::G_CONCAT_VECTORS: + return selectG_MERGE_VALUES(I); + case TargetOpcode::G_UNMERGE_VALUES: + return selectG_UNMERGE_VALUES(I); case TargetOpcode::G_GEP: return selectG_GEP(I); case TargetOpcode::G_IMPLICIT_DEF: return selectG_IMPLICIT_DEF(I); + case TargetOpcode::G_INSERT: + return selectG_INSERT(I); case TargetOpcode::G_INTRINSIC: return selectG_INTRINSIC(I, CoverageInfo); case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: return selectG_INTRINSIC_W_SIDE_EFFECTS(I, CoverageInfo); + case TargetOpcode::G_ICMP: + if (selectG_ICMP(I)) + return true; + return selectImpl(I, CoverageInfo); case TargetOpcode::G_LOAD: - return selectG_LOAD(I); + return selectImpl(I, CoverageInfo); + case TargetOpcode::G_SELECT: + return selectG_SELECT(I); case TargetOpcode::G_STORE: + if (selectImpl(I, CoverageInfo)) + return true; return selectG_STORE(I); + case TargetOpcode::G_TRUNC: + return selectG_TRUNC(I); + case TargetOpcode::G_SEXT: + case TargetOpcode::G_ZEXT: + case TargetOpcode::G_ANYEXT: + if (selectG_SZA_EXT(I)) { + I.eraseFromParent(); + return true; + } + + return false; + case TargetOpcode::G_BRCOND: + return selectG_BRCOND(I); + case TargetOpcode::G_FRAME_INDEX: + return selectG_FRAME_INDEX(I); + case TargetOpcode::G_FENCE: + // FIXME: Tablegen importer doesn't handle the imm operands correctly, and + // is checking for G_CONSTANT + I.setDesc(TII.get(AMDGPU::ATOMIC_FENCE)); + return true; } return false; } @@ -660,6 +1400,26 @@ AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const { } +std::pair<Register, unsigned> +AMDGPUInstructionSelector::selectVOP3ModsImpl( + Register Src, const MachineRegisterInfo &MRI) const { + unsigned Mods = 0; + MachineInstr *MI = MRI.getVRegDef(Src); + + if (MI && MI->getOpcode() == AMDGPU::G_FNEG) { + Src = MI->getOperand(1).getReg(); + Mods |= SISrcMods::NEG; + MI = MRI.getVRegDef(Src); + } + + if (MI && MI->getOpcode() == AMDGPU::G_FABS) { + Src = MI->getOperand(1).getReg(); + Mods |= SISrcMods::ABS; + } + + return std::make_pair(Src, Mods); +} + /// /// This will select either an SGPR or VGPR operand and will save us from /// having to write an extra tablegen pattern. 
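(Illustrative sketch, not part of the commit: the folding in selectVOP3ModsImpl() above walks the def chain of the source operand, so for generic MIR like

  %abs:vgpr(s32) = G_FABS %x
  %neg:vgpr(s32) = G_FNEG %abs

calling selectVOP3ModsImpl(%neg, MRI) strips the G_FNEG first, then the G_FABS, and returns {%x, SISrcMods::NEG | SISrcMods::ABS}. The VOP3 render functions below then emit %x together with that modifier immediate, so the negated absolute value is applied by the VALU source modifiers rather than by separate instructions.)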
@@ -672,11 +1432,18 @@ AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const { InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const { + MachineRegisterInfo &MRI + = Root.getParent()->getParent()->getParent()->getRegInfo(); + + Register Src; + unsigned Mods; + std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(), MRI); + return {{ - [=](MachineInstrBuilder &MIB) { MIB.add(Root); }, - [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // src0_mods - [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp - [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod + [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods + [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp + [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod }}; } InstructionSelector::ComplexRendererFns @@ -690,8 +1457,274 @@ AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const { InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const { + MachineRegisterInfo &MRI + = Root.getParent()->getParent()->getParent()->getRegInfo(); + + Register Src; + unsigned Mods; + std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(), MRI); + return {{ - [=](MachineInstrBuilder &MIB) { MIB.add(Root); }, - [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // src_mods + [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods + }}; +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const { + MachineRegisterInfo &MRI = + Root.getParent()->getParent()->getParent()->getRegInfo(); + + SmallVector<GEPInfo, 4> AddrInfo; + getAddrModeInfo(*Root.getParent(), MRI, AddrInfo); + + if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1) + return None; + + const GEPInfo &GEPInfo = AddrInfo[0]; + + if (!AMDGPU::isLegalSMRDImmOffset(STI, GEPInfo.Imm)) + return None; + + unsigned PtrReg = GEPInfo.SgprParts[0]; + int64_t EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm); + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(EncodedImm); } + }}; +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const { + MachineRegisterInfo &MRI = + Root.getParent()->getParent()->getParent()->getRegInfo(); + + SmallVector<GEPInfo, 4> AddrInfo; + getAddrModeInfo(*Root.getParent(), MRI, AddrInfo); + + if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1) + return None; + + const GEPInfo &GEPInfo = AddrInfo[0]; + unsigned PtrReg = GEPInfo.SgprParts[0]; + int64_t EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm); + if (!isUInt<32>(EncodedImm)) + return None; + + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(EncodedImm); } + }}; +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const { + MachineInstr *MI = Root.getParent(); + MachineBasicBlock *MBB = MI->getParent(); + MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + + SmallVector<GEPInfo, 4> AddrInfo; + getAddrModeInfo(*MI, MRI, AddrInfo); + + // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits, + // then we can select all ptr + 32-bit offsets not just immediate offsets. 
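// [Illustrative sketch, not part of the commit] What this fallback renders,
// assuming a uniform load whose GEP carries a 32-bit immediate offset that
// the _IMM variants already rejected:
//   %off:sreg_32_xm0 = S_MOV_B32 <imm>
//   ... S_LOAD_*_SGPR %ptr, %off ...
// i.e. the offset is materialized into an SGPR and the _SGPR addressing
// pattern is matched instead.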
+ if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1) + return None; + + const GEPInfo &GEPInfo = AddrInfo[0]; + if (!GEPInfo.Imm || !isUInt<32>(GEPInfo.Imm)) + return None; + + // If we make it this far we have a load with a 32-bit immediate offset. + // It is OK to select this using a sgpr offset, because we have already + // failed trying to select this load into one of the _IMM variants since + // the _IMM Patterns are considered before the _SGPR patterns. + unsigned PtrReg = GEPInfo.SgprParts[0]; + unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg) + .addImm(GEPInfo.Imm); + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); }, + [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); } + }}; +} + +template <bool Signed> +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const { + MachineInstr *MI = Root.getParent(); + MachineBasicBlock *MBB = MI->getParent(); + MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + + InstructionSelector::ComplexRendererFns Default = {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // offset + [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // slc + }}; + + if (!STI.hasFlatInstOffsets()) + return Default; + + const MachineInstr *OpDef = MRI.getVRegDef(Root.getReg()); + if (!OpDef || OpDef->getOpcode() != AMDGPU::G_GEP) + return Default; + + Optional<int64_t> Offset = + getConstantVRegVal(OpDef->getOperand(2).getReg(), MRI); + if (!Offset.hasValue()) + return Default; + + unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace(); + if (!TII.isLegalFLATOffset(Offset.getValue(), AddrSpace, Signed)) + return Default; + + Register BasePtr = OpDef->getOperand(1).getReg(); + + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(BasePtr); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset.getValue()); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // slc + }}; +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const { + return selectFlatOffsetImpl<false>(Root); +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectFlatOffsetSigned(MachineOperand &Root) const { + return selectFlatOffsetImpl<true>(Root); +} + +// FIXME: Implement +static bool signBitIsZero(const MachineOperand &Op, + const MachineRegisterInfo &MRI) { + return false; +} + +static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) { + auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>(); + return PSV && PSV->isStack(); +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const { + MachineInstr *MI = Root.getParent(); + MachineBasicBlock *MBB = MI->getParent(); + MachineFunction *MF = MBB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); + + int64_t Offset = 0; + if (mi_match(Root.getReg(), MRI, m_ICst(Offset))) { + Register HighBits = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + + // TODO: Should this be inside the render function? The iterator seems to + // move.
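// [Illustrative note, not part of the commit] The split below keeps only the
// low 12 bits in the MUBUF immediate offset field and materializes the
// 4096-aligned remainder in a VGPR; e.g. for Offset = 5000:
//   HighBits  = 5000 & ~4095 == 4096  (the V_MOV_B32 below)
//   immediate = 5000 &  4095 == 904   (the offset render function)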
+ BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), + HighBits) + .addImm(Offset & ~4095); + + return {{[=](MachineInstrBuilder &MIB) { // rsrc + MIB.addReg(Info->getScratchRSrcReg()); + }, + [=](MachineInstrBuilder &MIB) { // vaddr + MIB.addReg(HighBits); + }, + [=](MachineInstrBuilder &MIB) { // soffset + const MachineMemOperand *MMO = *MI->memoperands_begin(); + const MachinePointerInfo &PtrInfo = MMO->getPointerInfo(); + + Register SOffsetReg = isStackPtrRelative(PtrInfo) + ? Info->getStackPtrOffsetReg() + : Info->getScratchWaveOffsetReg(); + MIB.addReg(SOffsetReg); + }, + [=](MachineInstrBuilder &MIB) { // offset + MIB.addImm(Offset & 4095); + }}}; + } + + assert(Offset == 0); + + // Try to fold a frame index directly into the MUBUF vaddr field, and any + // offsets. + Optional<int> FI; + Register VAddr = Root.getReg(); + if (const MachineInstr *RootDef = MRI.getVRegDef(Root.getReg())) { + if (isBaseWithConstantOffset(Root, MRI)) { + const MachineOperand &LHS = RootDef->getOperand(1); + const MachineOperand &RHS = RootDef->getOperand(2); + const MachineInstr *LHSDef = MRI.getVRegDef(LHS.getReg()); + const MachineInstr *RHSDef = MRI.getVRegDef(RHS.getReg()); + if (LHSDef && RHSDef) { + int64_t PossibleOffset = + RHSDef->getOperand(1).getCImm()->getSExtValue(); + if (SIInstrInfo::isLegalMUBUFImmOffset(PossibleOffset) && + (!STI.privateMemoryResourceIsRangeChecked() || + signBitIsZero(LHS, MRI))) { + if (LHSDef->getOpcode() == AMDGPU::G_FRAME_INDEX) + FI = LHSDef->getOperand(1).getIndex(); + else + VAddr = LHS.getReg(); + Offset = PossibleOffset; + } + } + } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) { + FI = RootDef->getOperand(1).getIndex(); + } + } + + // If we don't know this private access is a local stack object, it needs to + // be relative to the entry point's scratch wave offset register. + // TODO: Should split large offsets that don't fit like above. + // TODO: Don't use scratch wave offset just because the offset didn't fit. + Register SOffset = FI.hasValue() ? Info->getStackPtrOffsetReg() + : Info->getScratchWaveOffsetReg(); + + return {{[=](MachineInstrBuilder &MIB) { // rsrc + MIB.addReg(Info->getScratchRSrcReg()); + }, + [=](MachineInstrBuilder &MIB) { // vaddr + if (FI.hasValue()) + MIB.addFrameIndex(FI.getValue()); + else + MIB.addReg(VAddr); + }, + [=](MachineInstrBuilder &MIB) { // soffset + MIB.addReg(SOffset); + }, + [=](MachineInstrBuilder &MIB) { // offset + MIB.addImm(Offset); + }}}; +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectMUBUFScratchOffset( + MachineOperand &Root) const { + MachineInstr *MI = Root.getParent(); + MachineBasicBlock *MBB = MI->getParent(); + MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + + int64_t Offset = 0; + if (!mi_match(Root.getReg(), MRI, m_ICst(Offset)) || + !SIInstrInfo::isLegalMUBUFImmOffset(Offset)) + return {}; + + const MachineFunction *MF = MBB->getParent(); + const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); + const MachineMemOperand *MMO = *MI->memoperands_begin(); + const MachinePointerInfo &PtrInfo = MMO->getPointerInfo(); + + Register SOffsetReg = isStackPtrRelative(PtrInfo) + ? 
Info->getStackPtrOffsetReg() + : Info->getScratchWaveOffsetReg(); + return {{ + [=](MachineInstrBuilder &MIB) { + MIB.addReg(Info->getScratchRSrcReg()); + }, // rsrc + [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffsetReg); }, // soffset + [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset }}; } diff --git a/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 449431adc561..4f489ddfb23d 100644 --- a/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -1,9 +1,8 @@ //===- AMDGPUInstructionSelector --------------------------------*- C++ -*-==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file @@ -18,7 +17,9 @@ #include "AMDGPUArgumentUsageInfo.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/Register.h" #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" +#include "llvm/IR/InstrTypes.h" namespace { #define GET_GLOBALISEL_PREDICATE_BITSET @@ -58,24 +59,45 @@ private: GEPInfo(const MachineInstr &GEP) : GEP(GEP), Imm(0) { } }; + bool isInstrUniform(const MachineInstr &MI) const; + bool isVCC(Register Reg, const MachineRegisterInfo &MRI) const; + /// tblgen-erated 'select' implementation. bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const; - MachineOperand getSubOperand64(MachineOperand &MO, unsigned SubIdx) const; + MachineOperand getSubOperand64(MachineOperand &MO, + const TargetRegisterClass &SubRC, + unsigned SubIdx) const; bool selectCOPY(MachineInstr &I) const; + bool selectPHI(MachineInstr &I) const; + bool selectG_TRUNC(MachineInstr &I) const; + bool selectG_SZA_EXT(MachineInstr &I) const; bool selectG_CONSTANT(MachineInstr &I) const; - bool selectG_ADD(MachineInstr &I) const; + bool selectG_AND_OR_XOR(MachineInstr &I) const; + bool selectG_ADD_SUB(MachineInstr &I) const; + bool selectG_EXTRACT(MachineInstr &I) const; + bool selectG_MERGE_VALUES(MachineInstr &I) const; + bool selectG_UNMERGE_VALUES(MachineInstr &I) const; bool selectG_GEP(MachineInstr &I) const; bool selectG_IMPLICIT_DEF(MachineInstr &I) const; + bool selectG_INSERT(MachineInstr &I) const; bool selectG_INTRINSIC(MachineInstr &I, CodeGenCoverage &CoverageInfo) const; bool selectG_INTRINSIC_W_SIDE_EFFECTS(MachineInstr &I, CodeGenCoverage &CoverageInfo) const; + int getS_CMPOpcode(CmpInst::Predicate P, unsigned Size) const; + bool selectG_ICMP(MachineInstr &I) const; bool hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const; void getAddrModeInfo(const MachineInstr &Load, const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const; bool selectSMRD(MachineInstr &I, ArrayRef<GEPInfo> AddrInfo) const; bool selectG_LOAD(MachineInstr &I) const; + bool selectG_SELECT(MachineInstr &I) const; bool selectG_STORE(MachineInstr &I) const; + bool selectG_BRCOND(MachineInstr &I) const; + bool selectG_FRAME_INDEX(MachineInstr &I) const; + + std::pair<Register, unsigned> + selectVOP3ModsImpl(Register Src, const MachineRegisterInfo &MRI) const; InstructionSelector::ComplexRendererFns selectVCSRC(MachineOperand &Root) const; @@ -90,6 +112,27 @@ private: 
InstructionSelector::ComplexRendererFns selectVOP3Mods(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectSmrdImm(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectSmrdImm32(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectSmrdSgpr(MachineOperand &Root) const; + + template <bool Signed> + InstructionSelector::ComplexRendererFns + selectFlatOffsetImpl(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectFlatOffset(MachineOperand &Root) const; + + InstructionSelector::ComplexRendererFns + selectFlatOffsetSigned(MachineOperand &Root) const; + + InstructionSelector::ComplexRendererFns + selectMUBUFScratchOffen(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectMUBUFScratchOffset(MachineOperand &Root) const; + const SIInstrInfo &TII; const SIRegisterInfo &TRI; const AMDGPURegisterBankInfo &RBI; diff --git a/lib/Target/AMDGPU/AMDGPUInstructions.td b/lib/Target/AMDGPU/AMDGPUInstructions.td index eb8f2002ff2d..61bc415c839d 100644 --- a/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -1,9 +1,8 @@ //===-- AMDGPUInstructions.td - Common instruction defs ---*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -12,6 +11,18 @@ // //===----------------------------------------------------------------------===// +class AddressSpacesImpl { + int Flat = 0; + int Global = 1; + int Region = 2; + int Local = 3; + int Constant = 4; + int Private = 5; +} + +def AddrSpaces : AddressSpacesImpl; + + class AMDGPUInst <dag outs, dag ins, string asm = "", list<dag> pattern = []> : Instruction { field bit isRegisterLoad = 0; @@ -66,17 +77,15 @@ class ILFormat<dag outs, dag ins, string asmstr, list<dag> pattern> def TruePredicate : Predicate<"true">; -// Exists to help track down where SubtargetPredicate isn't set rather -// than letting tablegen crash with an unhelpful error. 
-def InvalidPred : Predicate<"predicate not set on instruction or pattern">; - class PredicateControl { - Predicate SubtargetPredicate = InvalidPred; + Predicate SubtargetPredicate = TruePredicate; list<Predicate> AssemblerPredicates = []; Predicate AssemblerPredicate = TruePredicate; + Predicate WaveSizePredicate = TruePredicate; list<Predicate> OtherPredicates = []; list<Predicate> Predicates = !listconcat([SubtargetPredicate, - AssemblerPredicate], + AssemblerPredicate, + WaveSizePredicate], AssemblerPredicates, OtherPredicates); } @@ -326,6 +335,10 @@ def TEX_SHADOW_ARRAY : PatLeaf< // Load/Store Pattern Fragments //===----------------------------------------------------------------------===// +class AddressSpaceList<list<int> AS> { + list<int> AddrSpaces = AS; +} + class Aligned8Bytes <dag ops, dag frag> : PatFrag <ops, frag, [{ return cast<MemSDNode>(N)->getAlignment() % 8 == 0; }]>; @@ -344,21 +357,25 @@ class StoreHi16<SDPatternOperator op> : PatFrag < (ops node:$value, node:$ptr), (op (srl node:$value, (i32 16)), node:$ptr) >; -class PrivateAddress : CodePatPred<[{ - return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS; -}]>; +def LoadAddress_constant : AddressSpaceList<[ AddrSpaces.Constant ]>; +def LoadAddress_global : AddressSpaceList<[ AddrSpaces.Global, AddrSpaces.Constant ]>; +def StoreAddress_global : AddressSpaceList<[ AddrSpaces.Global ]>; -class ConstantAddress : CodePatPred<[{ - return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS; -}]>; +def LoadAddress_flat : AddressSpaceList<[ AddrSpaces.Flat, + AddrSpaces.Global, + AddrSpaces.Constant ]>; +def StoreAddress_flat : AddressSpaceList<[ AddrSpaces.Flat, AddrSpaces.Global ]>; + +def LoadAddress_private : AddressSpaceList<[ AddrSpaces.Private ]>; +def StoreAddress_private : AddressSpaceList<[ AddrSpaces.Private ]>; + +def LoadAddress_local : AddressSpaceList<[ AddrSpaces.Local ]>; +def StoreAddress_local : AddressSpaceList<[ AddrSpaces.Local ]>; + +def LoadAddress_region : AddressSpaceList<[ AddrSpaces.Region ]>; +def StoreAddress_region : AddressSpaceList<[ AddrSpaces.Region ]>; -class LocalAddress : CodePatPred<[{ - return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; -}]>; -class GlobalAddress : CodePatPred<[{ - return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS; -}]>; class GlobalLoadAddress : CodePatPred<[{ auto AS = cast<MemSDNode>(N)->getAddressSpace(); @@ -372,86 +389,126 @@ class FlatLoadAddress : CodePatPred<[{ AS == AMDGPUAS::CONSTANT_ADDRESS; }]>; -class FlatStoreAddress : CodePatPred<[{ - const auto AS = cast<MemSDNode>(N)->getAddressSpace(); - return AS == AMDGPUAS::FLAT_ADDRESS || - AS == AMDGPUAS::GLOBAL_ADDRESS; +class GlobalAddress : CodePatPred<[{ + return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS; }]>; -class AZExtLoadBase <SDPatternOperator ld_node>: PatFrag<(ops node:$ptr), - (ld_node node:$ptr), [{ - LoadSDNode *L = cast<LoadSDNode>(N); - return L->getExtensionType() == ISD::ZEXTLOAD || - L->getExtensionType() == ISD::EXTLOAD; +class PrivateAddress : CodePatPred<[{ + return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS; }]>; -def az_extload : AZExtLoadBase <unindexedload>; - -def az_extloadi8 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{ - return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8; +class LocalAddress : CodePatPred<[{ + return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; }]>; -def az_extloadi16 : PatFrag<(ops node:$ptr), (az_extload 
node:$ptr), [{ - return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16; +class RegionAddress : CodePatPred<[{ + return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::REGION_ADDRESS; }]>; -def az_extloadi32 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{ - return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i32; +class FlatStoreAddress : CodePatPred<[{ + const auto AS = cast<MemSDNode>(N)->getAddressSpace(); + return AS == AMDGPUAS::FLAT_ADDRESS || + AS == AMDGPUAS::GLOBAL_ADDRESS; }]>; -class PrivateLoad <SDPatternOperator op> : LoadFrag <op>, PrivateAddress; +// TODO: Remove these when stores to new PatFrag format. class PrivateStore <SDPatternOperator op> : StoreFrag <op>, PrivateAddress; - -class LocalLoad <SDPatternOperator op> : LoadFrag <op>, LocalAddress; class LocalStore <SDPatternOperator op> : StoreFrag <op>, LocalAddress; - -class GlobalLoad <SDPatternOperator op> : LoadFrag<op>, GlobalLoadAddress; +class RegionStore <SDPatternOperator op> : StoreFrag <op>, RegionAddress; class GlobalStore <SDPatternOperator op> : StoreFrag<op>, GlobalAddress; - -class FlatLoad <SDPatternOperator op> : LoadFrag <op>, FlatLoadAddress; class FlatStore <SDPatternOperator op> : StoreFrag <op>, FlatStoreAddress; -class ConstantLoad <SDPatternOperator op> : LoadFrag <op>, ConstantAddress; +foreach as = [ "global", "flat", "constant", "local", "private", "region" ] in { +let AddressSpaces = !cast<AddressSpaceList>("LoadAddress_"#as).AddrSpaces in { -def load_private : PrivateLoad <load>; -def az_extloadi8_private : PrivateLoad <az_extloadi8>; -def sextloadi8_private : PrivateLoad <sextloadi8>; -def az_extloadi16_private : PrivateLoad <az_extloadi16>; -def sextloadi16_private : PrivateLoad <sextloadi16>; +def load_#as : PatFrag<(ops node:$ptr), (unindexedload node:$ptr)> { + let IsLoad = 1; + let IsNonExtLoad = 1; +} -def store_private : PrivateStore <store>; -def truncstorei8_private : PrivateStore<truncstorei8>; -def truncstorei16_private : PrivateStore <truncstorei16>; -def store_hi16_private : StoreHi16 <truncstorei16>, PrivateAddress; -def truncstorei8_hi16_private : StoreHi16<truncstorei8>, PrivateAddress; +def extloadi8_#as : PatFrag<(ops node:$ptr), (extload node:$ptr)> { + let IsLoad = 1; + let MemoryVT = i8; +} + +def extloadi16_#as : PatFrag<(ops node:$ptr), (extload node:$ptr)> { + let IsLoad = 1; + let MemoryVT = i16; +} +def sextloadi8_#as : PatFrag<(ops node:$ptr), (sextload node:$ptr)> { + let IsLoad = 1; + let MemoryVT = i8; +} + +def sextloadi16_#as : PatFrag<(ops node:$ptr), (sextload node:$ptr)> { + let IsLoad = 1; + let MemoryVT = i16; +} + +def zextloadi8_#as : PatFrag<(ops node:$ptr), (zextload node:$ptr)> { + let IsLoad = 1; + let MemoryVT = i8; +} + +def zextloadi16_#as : PatFrag<(ops node:$ptr), (zextload node:$ptr)> { + let IsLoad = 1; + let MemoryVT = i16; +} + +def atomic_load_32_#as : PatFrag<(ops node:$ptr), (atomic_load_32 node:$ptr)> { + let IsAtomic = 1; + let MemoryVT = i32; +} -def load_global : GlobalLoad <load>; -def sextloadi8_global : GlobalLoad <sextloadi8>; -def az_extloadi8_global : GlobalLoad <az_extloadi8>; -def sextloadi16_global : GlobalLoad <sextloadi16>; -def az_extloadi16_global : GlobalLoad <az_extloadi16>; -def atomic_load_global : GlobalLoad<atomic_load>; +def atomic_load_64_#as : PatFrag<(ops node:$ptr), (atomic_load_64 node:$ptr)> { + let IsAtomic = 1; + let MemoryVT = i64; +} + +def store_#as : PatFrag<(ops node:$val, node:$ptr), + (unindexedstore node:$val, node:$ptr)> { + let IsStore = 1; + let IsTruncStore = 0; +} + +// truncstore 
fragments. +def truncstore_#as : PatFrag<(ops node:$val, node:$ptr), + (unindexedstore node:$val, node:$ptr)> { + let IsStore = 1; + let IsTruncStore = 1; +} + +// TODO: We don't really need the truncstore here. We can use +// unindexedstore with MemoryVT directly, which will save an +// unnecessary check that the memory size is less than the value type +// in the generated matcher table. +def truncstorei8_#as : PatFrag<(ops node:$val, node:$ptr), + (truncstore node:$val, node:$ptr)> { + let IsStore = 1; + let MemoryVT = i8; +} + +def truncstorei16_#as : PatFrag<(ops node:$val, node:$ptr), + (truncstore node:$val, node:$ptr)> { + let IsStore = 1; + let MemoryVT = i16; +} + +defm atomic_store_#as : binary_atomic_op<atomic_store>; + +} // End let AddressSpaces = ... +} // End foreach AddrSpace + + +def store_hi16_private : StoreHi16 <truncstorei16>, PrivateAddress; +def truncstorei8_hi16_private : StoreHi16<truncstorei8>, PrivateAddress; -def store_global : GlobalStore <store>; -def truncstorei8_global : GlobalStore <truncstorei8>; -def truncstorei16_global : GlobalStore <truncstorei16>; def store_atomic_global : GlobalStore<atomic_store>; def truncstorei8_hi16_global : StoreHi16 <truncstorei8>, GlobalAddress; def truncstorei16_hi16_global : StoreHi16 <truncstorei16>, GlobalAddress; -def load_local : LocalLoad <load>; -def az_extloadi8_local : LocalLoad <az_extloadi8>; -def sextloadi8_local : LocalLoad <sextloadi8>; -def az_extloadi16_local : LocalLoad <az_extloadi16>; -def sextloadi16_local : LocalLoad <sextloadi16>; -def atomic_load_32_local : LocalLoad<atomic_load_32>; -def atomic_load_64_local : LocalLoad<atomic_load_64>; - -def store_local : LocalStore <store>; -def truncstorei8_local : LocalStore <truncstorei8>; -def truncstorei16_local : LocalStore <truncstorei16>; def store_local_hi16 : StoreHi16 <truncstorei16>, LocalAddress; def truncstorei8_local_hi16 : StoreHi16<truncstorei8>, LocalAddress; def atomic_store_local : LocalStore <atomic_store>; @@ -472,34 +529,24 @@ def store_align16_local : Aligned16Bytes < (ops node:$val, node:$ptr), (store_local node:$val, node:$ptr) >; -def load_flat : FlatLoad <load>; -def az_extloadi8_flat : FlatLoad <az_extloadi8>; -def sextloadi8_flat : FlatLoad <sextloadi8>; -def az_extloadi16_flat : FlatLoad <az_extloadi16>; -def sextloadi16_flat : FlatLoad <sextloadi16>; -def atomic_load_flat : FlatLoad<atomic_load>; - -def store_flat : FlatStore <store>; -def truncstorei8_flat : FlatStore <truncstorei8>; -def truncstorei16_flat : FlatStore <truncstorei16>; def atomic_store_flat : FlatStore <atomic_store>; def truncstorei8_hi16_flat : StoreHi16<truncstorei8>, FlatStoreAddress; def truncstorei16_hi16_flat : StoreHi16<truncstorei16>, FlatStoreAddress; -def constant_load : ConstantLoad<load>; -def sextloadi8_constant : ConstantLoad <sextloadi8>; -def az_extloadi8_constant : ConstantLoad <az_extloadi8>; -def sextloadi16_constant : ConstantLoad <sextloadi16>; -def az_extloadi16_constant : ConstantLoad <az_extloadi16>; - - class local_binary_atomic_op<SDNode atomic_op> : PatFrag<(ops node:$ptr, node:$value), (atomic_op node:$ptr, node:$value), [{ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; }]>; +class region_binary_atomic_op<SDNode atomic_op> : + PatFrag<(ops node:$ptr, node:$value), + (atomic_op node:$ptr, node:$value), [{ + return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::REGION_ADDRESS; +}]>; + + def atomic_swap_local : local_binary_atomic_op<atomic_swap>; def atomic_load_add_local : 
local_binary_atomic_op<atomic_load_add>; def atomic_load_sub_local : local_binary_atomic_op<atomic_load_sub>; @@ -524,13 +571,22 @@ class AtomicCmpSwapLocal <SDNode cmp_swap_node> : PatFrag< return AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; }]>; +class AtomicCmpSwapRegion <SDNode cmp_swap_node> : PatFrag< + (ops node:$ptr, node:$cmp, node:$swap), + (cmp_swap_node node:$ptr, node:$cmp, node:$swap), [{ + AtomicSDNode *AN = cast<AtomicSDNode>(N); + return AN->getAddressSpace() == AMDGPUAS::REGION_ADDRESS; +}]>; + def atomic_cmp_swap_local : AtomicCmpSwapLocal <atomic_cmp_swap>; +class global_binary_atomic_op_frag<SDNode atomic_op> : PatFrag< + (ops node:$ptr, node:$value), + (atomic_op node:$ptr, node:$value), + [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;}]>; + multiclass global_binary_atomic_op<SDNode atomic_op> { - def "" : PatFrag< - (ops node:$ptr, node:$value), - (atomic_op node:$ptr, node:$value), - [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;}]>; + def "" : global_binary_atomic_op_frag<atomic_op>; def _noret : PatFrag< (ops node:$ptr, node:$value), @@ -585,7 +641,6 @@ int TWO_PI_INV = 0x3e22f983; int FP_UINT_MAX_PLUS_1 = 0x4f800000; // 1 << 32 in floating point encoding int FP16_ONE = 0x3C00; int FP16_NEG_ONE = 0xBC00; -int V2FP16_ONE = 0x3C003C00; int FP32_ONE = 0x3f800000; int FP32_NEG_ONE = 0xbf800000; int FP64_ONE = 0x3ff0000000000000; @@ -626,9 +681,7 @@ class Extract_Element <ValueType sub_type, ValueType vec_type, int sub_idx, : AMDGPUPat< (sub_type (extractelt vec_type:$src, sub_idx)), (EXTRACT_SUBREG $src, sub_reg) -> { - let SubtargetPredicate = TruePredicate; -} +>; /* Insert element pattern */ class Insert_Element <ValueType elem_type, ValueType vec_type, @@ -636,9 +689,7 @@ class Insert_Element <ValueType elem_type, ValueType vec_type, : AMDGPUPat < (insertelt vec_type:$vec, elem_type:$elem, sub_idx), (INSERT_SUBREG $vec, $elem, sub_reg) -> { - let SubtargetPredicate = TruePredicate; -} +>; // XXX: Convert to new syntax and use COPY_TO_REG, once the DFAPacketizer // can handle COPY instructions. @@ -811,7 +862,7 @@ multiclass IntMed3Pat<Instruction med3Inst, SDPatternOperator max_oneuse, ValueType vt = i32> { - // This matches 16 permutations of + // This matches 16 permutations of // min(max(a, b), max(min(a, b), c)) def : AMDGPUPat < (min (max_oneuse vt:$src0, vt:$src1), @@ -819,7 +870,7 @@ multiclass IntMed3Pat<Instruction med3Inst, (med3Inst vt:$src0, vt:$src1, vt:$src2) >; - // This matches 16 permutations of + // This matches 16 permutations of // max(min(x, y), min(max(x, y), z)) def : AMDGPUPat < (max (min_oneuse vt:$src0, vt:$src1), @@ -827,7 +878,7 @@ multiclass IntMed3Pat<Instruction med3Inst, (med3Inst $src0, $src1, $src2) >; } - + // Special conversion patterns def cvt_rpi_i32_f32 : PatFrag < diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp deleted file mode 100644 index 02108ca3ddd7..000000000000 --- a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp +++ /dev/null @@ -1,103 +0,0 @@ -//===- AMDGPUIntrinsicInfo.cpp - AMDGPU Intrinsic Information ---*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//==-----------------------------------------------------------------------===// -// -/// \file -/// AMDGPU Implementation of the IntrinsicInfo class. 
-// -//===-----------------------------------------------------------------------===// - -#include "AMDGPUIntrinsicInfo.h" -#include "AMDGPUSubtarget.h" -#include "llvm/IR/DerivedTypes.h" -#include "llvm/IR/Intrinsics.h" -#include "llvm/IR/Module.h" - -using namespace llvm; - -AMDGPUIntrinsicInfo::AMDGPUIntrinsicInfo() - : TargetIntrinsicInfo() {} - -static const char *const IntrinsicNameTable[] = { -#define GET_INTRINSIC_NAME_TABLE -#include "AMDGPUGenIntrinsicImpl.inc" -#undef GET_INTRINSIC_NAME_TABLE -}; - -namespace { -#define GET_INTRINSIC_ATTRIBUTES -#include "AMDGPUGenIntrinsicImpl.inc" -#undef GET_INTRINSIC_ATTRIBUTES -} - -StringRef AMDGPUIntrinsicInfo::getName(unsigned IntrID, - ArrayRef<Type *> Tys) const { - if (IntrID < Intrinsic::num_intrinsics) - return StringRef(); - - assert(IntrID < SIIntrinsic::num_AMDGPU_intrinsics && - "Invalid intrinsic ID"); - - return IntrinsicNameTable[IntrID - Intrinsic::num_intrinsics]; -} - -std::string AMDGPUIntrinsicInfo::getName(unsigned IntrID, Type **Tys, - unsigned NumTys) const { - return getName(IntrID, makeArrayRef(Tys, NumTys)).str(); -} - -FunctionType *AMDGPUIntrinsicInfo::getType(LLVMContext &Context, unsigned ID, - ArrayRef<Type*> Tys) const { - // FIXME: Re-use Intrinsic::getType machinery - llvm_unreachable("unhandled intrinsic"); -} - -unsigned AMDGPUIntrinsicInfo::lookupName(const char *NameData, - unsigned Len) const { - StringRef Name(NameData, Len); - if (!Name.startswith("llvm.")) - return 0; // All intrinsics start with 'llvm.' - - // Look for a name match in our table. If the intrinsic is not overloaded, - // require an exact match. If it is overloaded, require a prefix match. The - // AMDGPU enum enum starts at Intrinsic::num_intrinsics. - int Idx = Intrinsic::lookupLLVMIntrinsicByName(IntrinsicNameTable, Name); - if (Idx >= 0) { - bool IsPrefixMatch = Name.size() > strlen(IntrinsicNameTable[Idx]); - return IsPrefixMatch == isOverloaded(Idx + 1) - ? Intrinsic::num_intrinsics + Idx - : 0; - } - - return 0; -} - -bool AMDGPUIntrinsicInfo::isOverloaded(unsigned id) const { -// Overload Table -#define GET_INTRINSIC_OVERLOAD_TABLE -#include "AMDGPUGenIntrinsicImpl.inc" -#undef GET_INTRINSIC_OVERLOAD_TABLE -} - -Function *AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID, - ArrayRef<Type *> Tys) const { - FunctionType *FTy = getType(M->getContext(), IntrID, Tys); - Function *F - = cast<Function>(M->getOrInsertFunction(getName(IntrID, Tys), FTy)); - - AttributeList AS = - getAttributes(M->getContext(), static_cast<SIIntrinsic::ID>(IntrID)); - F->setAttributes(AS); - return F; -} - -Function *AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID, - Type **Tys, - unsigned NumTys) const { - return getDeclaration(M, IntrID, makeArrayRef(Tys, NumTys)); -} diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h deleted file mode 100644 index a1a094dded23..000000000000 --- a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h +++ /dev/null @@ -1,58 +0,0 @@ -//===- AMDGPUIntrinsicInfo.h - AMDGPU Intrinsic Information ------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//==-----------------------------------------------------------------------===// -// -/// \file -/// Interface for the AMDGPU Implementation of the Intrinsic Info class. 
-// -//===-----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUINTRINSICINFO_H -#define LLVM_LIB_TARGET_AMDGPU_AMDGPUINTRINSICINFO_H - -#include "llvm/IR/Intrinsics.h" -#include "llvm/Target/TargetIntrinsicInfo.h" - -namespace llvm { -class TargetMachine; - -namespace SIIntrinsic { -enum ID { - last_non_AMDGPU_intrinsic = Intrinsic::num_intrinsics - 1, -#define GET_INTRINSIC_ENUM_VALUES -#include "AMDGPUGenIntrinsicEnums.inc" -#undef GET_INTRINSIC_ENUM_VALUES - , num_AMDGPU_intrinsics -}; - -} // end namespace AMDGPUIntrinsic - -class AMDGPUIntrinsicInfo final : public TargetIntrinsicInfo { -public: - AMDGPUIntrinsicInfo(); - - StringRef getName(unsigned IntrId, ArrayRef<Type *> Tys = None) const; - - std::string getName(unsigned IntrId, Type **Tys = nullptr, - unsigned NumTys = 0) const override; - - unsigned lookupName(const char *Name, unsigned Len) const override; - bool isOverloaded(unsigned IID) const override; - Function *getDeclaration(Module *M, unsigned ID, - Type **Tys = nullptr, - unsigned NumTys = 0) const override; - - Function *getDeclaration(Module *M, unsigned ID, - ArrayRef<Type *> = None) const; - - FunctionType *getType(LLVMContext &Context, unsigned ID, - ArrayRef<Type*> Tys = None) const; -}; - -} // end namespace llvm - -#endif diff --git a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index ef85c1040545..670f6225fbf7 100644 --- a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -1,9 +1,8 @@ //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file @@ -15,17 +14,93 @@ #include "AMDGPU.h" #include "AMDGPULegalizerInfo.h" #include "AMDGPUTargetMachine.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" +#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Type.h" #include "llvm/Support/Debug.h" +#define DEBUG_TYPE "amdgpu-legalinfo" + using namespace llvm; using namespace LegalizeActions; +using namespace LegalizeMutations; +using namespace LegalityPredicates; + + +static LegalityPredicate isMultiple32(unsigned TypeIdx, + unsigned MaxSize = 512) { + return [=](const LegalityQuery &Query) { + const LLT Ty = Query.Types[TypeIdx]; + const LLT EltTy = Ty.getScalarType(); + return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0; + }; +} + +static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { + return [=](const LegalityQuery &Query) { + const LLT Ty = Query.Types[TypeIdx]; + return Ty.isVector() && + Ty.getNumElements() % 2 != 0 && + Ty.getElementType().getSizeInBits() < 32; + }; +} -AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST, - const GCNTargetMachine &TM) { +static LegalizeMutation oneMoreElement(unsigned TypeIdx) { + return [=](const LegalityQuery &Query) { + const LLT Ty = Query.Types[TypeIdx]; + const LLT EltTy = Ty.getElementType(); + return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy)); + }; +} + +static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { + return [=](const LegalityQuery &Query) { + const LLT Ty = Query.Types[TypeIdx]; + const LLT EltTy = Ty.getElementType(); + unsigned Size = Ty.getSizeInBits(); + unsigned Pieces = (Size + 63) / 64; + unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; + return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy)); + }; +} + +static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { + return [=](const LegalityQuery &Query) { + const LLT QueryTy = Query.Types[TypeIdx]; + return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; + }; +} + +static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { + return [=](const LegalityQuery &Query) { + const LLT QueryTy = Query.Types[TypeIdx]; + return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0; + }; +} + +// Any combination of 32 or 64-bit elements up to 512 bits, and multiples of +// v2s16. 
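Each helper above (and isRegisterType just below) manufactures a LegalityPredicate: a closure that captures a type index and answers one question about the types in a LegalityQuery. The idiom with simplified stand-in types (TypeDesc, Query and multiple32Sketch are hypothetical, not the real LLT/LegalityQuery):

#include <functional>
#include <vector>

struct TypeDesc {
  unsigned SizeInBits;
  bool IsVector;
  unsigned EltSizeInBits; // scalar element size when IsVector
};
struct Query { std::vector<TypeDesc> Types; };
using Predicate = std::function<bool(const Query &)>;

// Mirrors isMultiple32: type TypeIdx fits in MaxSize bits and its scalar
// element size is a multiple of 32.
Predicate multiple32Sketch(unsigned TypeIdx, unsigned MaxSize = 512) {
  return [=](const Query &Q) {
    const TypeDesc &Ty = Q.Types[TypeIdx];
    unsigned EltSize = Ty.IsVector ? Ty.EltSizeInBits : Ty.SizeInBits;
    return Ty.SizeInBits <= MaxSize && EltSize % 32 == 0;
  };
}

Composing the rule sets out of such small closures keeps each condition testable in isolation, which is presumably why the file defines so many of them.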
+static LegalityPredicate isRegisterType(unsigned TypeIdx) { + return [=](const LegalityQuery &Query) { + const LLT Ty = Query.Types[TypeIdx]; + if (Ty.isVector()) { + const int EltSize = Ty.getElementType().getSizeInBits(); + return EltSize == 32 || EltSize == 64 || + (EltSize == 16 && Ty.getNumElements() % 2 == 0) || + EltSize == 128 || EltSize == 256; + } + + return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 512; + }; +} + +AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, + const GCNTargetMachine &TM) + : ST(ST_) { using namespace TargetOpcode; auto GetAddrSpacePtr = [&TM](unsigned AS) { @@ -33,13 +108,16 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST, }; const LLT S1 = LLT::scalar(1); + const LLT S8 = LLT::scalar(8); + const LLT S16 = LLT::scalar(16); const LLT S32 = LLT::scalar(32); const LLT S64 = LLT::scalar(64); + const LLT S128 = LLT::scalar(128); + const LLT S256 = LLT::scalar(256); const LLT S512 = LLT::scalar(512); const LLT V2S16 = LLT::vector(2, 16); const LLT V4S16 = LLT::vector(4, 16); - const LLT V8S16 = LLT::vector(8, 16); const LLT V2S32 = LLT::vector(2, 32); const LLT V3S32 = LLT::vector(3, 32); @@ -79,156 +157,428 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST, const LLT CodePtr = FlatPtr; - const LLT AddrSpaces[] = { - GlobalPtr, - ConstantPtr, - LocalPtr, - FlatPtr, - PrivatePtr + const std::initializer_list<LLT> AddrSpaces64 = { + GlobalPtr, ConstantPtr, FlatPtr + }; + + const std::initializer_list<LLT> AddrSpaces32 = { + LocalPtr, PrivatePtr + }; + + const std::initializer_list<LLT> FPTypesBase = { + S32, S64 + }; + + const std::initializer_list<LLT> FPTypes16 = { + S32, S64, S16 + }; + + const std::initializer_list<LLT> FPTypesPK16 = { + S32, S64, S16, V2S16 }; setAction({G_BRCOND, S1}, Legal); - setAction({G_ADD, S32}, Legal); - setAction({G_ASHR, S32}, Legal); - setAction({G_SUB, S32}, Legal); - setAction({G_MUL, S32}, Legal); + // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more + // elements for v3s16 + getActionDefinitionsBuilder(G_PHI) + .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256}) + .legalFor(AllS32Vectors) + .legalFor(AllS64Vectors) + .legalFor(AddrSpaces64) + .legalFor(AddrSpaces32) + .clampScalar(0, S32, S256) + .widenScalarToNextPow2(0, 32) + .clampMaxNumElements(0, S32, 16) + .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) + .legalIf(isPointer(0)); - // FIXME: 64-bit ones only legal for scalar + if (ST.has16BitInsts()) { + getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) + .legalFor({S32, S16}) + .clampScalar(0, S16, S32) + .scalarize(0); + } else { + getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) + .legalFor({S32}) + .clampScalar(0, S32, S32) + .scalarize(0); + } + + getActionDefinitionsBuilder({G_UMULH, G_SMULH}) + .legalFor({S32}) + .clampScalar(0, S32, S32) + .scalarize(0); + + // Report legal for any types we can handle anywhere. For the cases only legal + // on the SALU, RegBankSelect will be able to re-legalize. 
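The bit-op rules below shrink over-wide vectors with fewerEltsToSize64Vector, defined earlier, which divides the vector into roughly 64-bit pieces. Its rounding restated as standalone arithmetic (splitTo64BitPieces is a hypothetical mirror of that mutation, assuming NumElts and EltSize are nonzero):

#include <utility>

// Split a vector of NumElts x EltSize bits into ceil(Size/64) pieces and
// report how many elements each piece keeps.
std::pair<unsigned, unsigned> splitTo64BitPieces(unsigned NumElts,
                                                 unsigned EltSize) {
  unsigned Size = NumElts * EltSize;
  unsigned Pieces = (Size + 63) / 64;           // round up to 64-bit units
  unsigned NewNumElts = (NumElts + 1) / Pieces; // elements per piece
  return {Pieces, NewNumElts};
}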
getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) - .legalFor({S32, S1, S64, V2S32}); + .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) + .clampScalar(0, S32, S64) + .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) + .fewerElementsIf(vectorWiderThan(0, 32), fewerEltsToSize64Vector(0)) + .widenScalarToNextPow2(0) + .scalarize(0); getActionDefinitionsBuilder({G_UADDO, G_SADDO, G_USUBO, G_SSUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) - .legalFor({{S32, S1}}); + .legalFor({{S32, S1}}) + .clampScalar(0, S32, S32); - setAction({G_BITCAST, V2S16}, Legal); - setAction({G_BITCAST, 1, S32}, Legal); + getActionDefinitionsBuilder(G_BITCAST) + .legalForCartesianProduct({S32, V2S16}) + .legalForCartesianProduct({S64, V2S32, V4S16}) + .legalForCartesianProduct({V2S64, V4S32}) + // Don't worry about the size constraint. + .legalIf(all(isPointer(0), isPointer(1))); - setAction({G_BITCAST, S32}, Legal); - setAction({G_BITCAST, 1, V2S16}, Legal); - - getActionDefinitionsBuilder(G_FCONSTANT) - .legalFor({S32, S64}); + if (ST.has16BitInsts()) { + getActionDefinitionsBuilder(G_FCONSTANT) + .legalFor({S32, S64, S16}) + .clampScalar(0, S16, S64); + } else { + getActionDefinitionsBuilder(G_FCONSTANT) + .legalFor({S32, S64}) + .clampScalar(0, S32, S64); + } - // G_IMPLICIT_DEF is a no-op so we can make it legal for any value type that - // can fit in a register. - // FIXME: We need to legalize several more operations before we can add - // a test case for size > 512. getActionDefinitionsBuilder(G_IMPLICIT_DEF) - .legalIf([=](const LegalityQuery &Query) { - return Query.Types[0].getSizeInBits() <= 512; - }) - .clampScalar(0, S1, S512); + .legalFor({S1, S32, S64, V2S32, V4S32, V2S16, V4S16, GlobalPtr, + ConstantPtr, LocalPtr, FlatPtr, PrivatePtr}) + .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) + .clampScalarOrElt(0, S32, S512) + .legalIf(isMultiple32(0)) + .widenScalarToNextPow2(0, 32) + .clampMaxNumElements(0, S32, 16); - getActionDefinitionsBuilder(G_CONSTANT) - .legalFor({S1, S32, S64}); // FIXME: i1 operands to intrinsics should always be legal, but other i1 // values may not be legal. We need to figure out how to distinguish // between these two scenarios. 
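Several of these chains end in widenScalarToNextPow2, which rounds an odd bit width up to the next power of two, with an optional floor, before the clamp rules take effect. The requested rounding as a standalone sketch (nextPow2 is a hypothetical mirror):

// Round Bits up to the next power of two, but never below Min.
unsigned nextPow2(unsigned Bits, unsigned Min = 0) {
  unsigned P = 1;
  while (P < Bits)
    P <<= 1;
  return P < Min ? Min : P;
}
// nextPow2(33) == 64, nextPow2(20, 32) == 32, nextPow2(48) == 64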
- setAction({G_CONSTANT, S1}, Legal); + getActionDefinitionsBuilder(G_CONSTANT) + .legalFor({S1, S32, S64, GlobalPtr, + LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) + .clampScalar(0, S32, S64) + .widenScalarToNextPow2(0) + .legalIf(isPointer(0)); setAction({G_FRAME_INDEX, PrivatePtr}, Legal); - getActionDefinitionsBuilder( - { G_FADD, G_FMUL, G_FNEG, G_FABS, G_FMA}) + auto &FPOpActions = getActionDefinitionsBuilder( + { G_FADD, G_FMUL, G_FNEG, G_FABS, G_FMA, G_FCANONICALIZE}) .legalFor({S32, S64}); - getActionDefinitionsBuilder(G_FPTRUNC) - .legalFor({{S32, S64}}); + if (ST.has16BitInsts()) { + if (ST.hasVOP3PInsts()) + FPOpActions.legalFor({S16, V2S16}); + else + FPOpActions.legalFor({S16}); + } - // Use actual fsub instruction - setAction({G_FSUB, S32}, Legal); + auto &MinNumMaxNum = getActionDefinitionsBuilder({ + G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); + + if (ST.hasVOP3PInsts()) { + MinNumMaxNum.customFor(FPTypesPK16) + .clampMaxNumElements(0, S16, 2) + .clampScalar(0, S16, S64) + .scalarize(0); + } else if (ST.has16BitInsts()) { + MinNumMaxNum.customFor(FPTypes16) + .clampScalar(0, S16, S64) + .scalarize(0); + } else { + MinNumMaxNum.customFor(FPTypesBase) + .clampScalar(0, S32, S64) + .scalarize(0); + } - // Must use fadd + fneg - setAction({G_FSUB, S64}, Lower); + // TODO: Implement + getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower(); - setAction({G_FCMP, S1}, Legal); - setAction({G_FCMP, 1, S32}, Legal); - setAction({G_FCMP, 1, S64}, Legal); + if (ST.hasVOP3PInsts()) + FPOpActions.clampMaxNumElements(0, S16, 2); + FPOpActions + .scalarize(0) + .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); - setAction({G_ZEXT, S64}, Legal); - setAction({G_ZEXT, 1, S32}, Legal); + if (ST.has16BitInsts()) { + getActionDefinitionsBuilder(G_FSQRT) + .legalFor({S32, S64, S16}) + .scalarize(0) + .clampScalar(0, S16, S64); + } else { + getActionDefinitionsBuilder(G_FSQRT) + .legalFor({S32, S64}) + .scalarize(0) + .clampScalar(0, S32, S64); + } - setAction({G_SEXT, S64}, Legal); - setAction({G_SEXT, 1, S32}, Legal); + getActionDefinitionsBuilder(G_FPTRUNC) + .legalFor({{S32, S64}, {S16, S32}}) + .scalarize(0); - setAction({G_ANYEXT, S64}, Legal); - setAction({G_ANYEXT, 1, S32}, Legal); + getActionDefinitionsBuilder(G_FPEXT) + .legalFor({{S64, S32}, {S32, S16}}) + .lowerFor({{S64, S16}}) // FIXME: Implement + .scalarize(0); - setAction({G_FPTOSI, S32}, Legal); - setAction({G_FPTOSI, 1, S32}, Legal); + // TODO: Verify V_BFI_B32 is generated from expanded bit ops. 
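The lower() below expands G_FCOPYSIGN into exactly the bit operations the TODO refers to: keep every bit of the magnitude operand except the sign, then insert the sign bit of the other operand, which is the bitfield-insert shape V_BFI_B32 provides. A standalone float sketch of that expansion (copySignSketch is hypothetical):

#include <cstdint>
#include <cstring>

// copysign via bit ops: (Mag & ~SignMask) | (Sgn & SignMask).
float copySignSketch(float Mag, float Sgn) {
  uint32_t M, S;
  std::memcpy(&M, &Mag, sizeof(float));
  std::memcpy(&S, &Sgn, sizeof(float));
  const uint32_t SignMask = 0x80000000u;
  uint32_t R = (M & ~SignMask) | (S & SignMask);
  float Out;
  std::memcpy(&Out, &R, sizeof(float));
  return Out;
}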
+ getActionDefinitionsBuilder(G_FCOPYSIGN).lower(); - setAction({G_SITOFP, S32}, Legal); - setAction({G_SITOFP, 1, S32}, Legal); + getActionDefinitionsBuilder(G_FSUB) + // Use actual fsub instruction + .legalFor({S32}) + // Must use fadd + fneg + .lowerFor({S64, S16, V2S16}) + .scalarize(0) + .clampScalar(0, S32, S64); - setAction({G_UITOFP, S32}, Legal); - setAction({G_UITOFP, 1, S32}, Legal); + getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) + .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, + {S32, S1}, {S64, S1}, {S16, S1}, + // FIXME: Hack + {S64, LLT::scalar(33)}, + {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}}) + .scalarize(0); - setAction({G_FPTOUI, S32}, Legal); - setAction({G_FPTOUI, 1, S32}, Legal); + getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) + .legalFor({{S32, S32}, {S64, S32}}) + .lowerFor({{S32, S64}}) + .customFor({{S64, S64}}) + .scalarize(0); - setAction({G_FPOW, S32}, Legal); - setAction({G_FEXP2, S32}, Legal); - setAction({G_FLOG2, S32}, Legal); + getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) + .legalFor({{S32, S32}, {S32, S64}}) + .scalarize(0); - getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_INTRINSIC_ROUND}) - .legalFor({S32, S64}); + getActionDefinitionsBuilder(G_INTRINSIC_ROUND) + .legalFor({S32, S64}) + .scalarize(0); - for (LLT PtrTy : AddrSpaces) { - LLT IdxTy = LLT::scalar(PtrTy.getSizeInBits()); - setAction({G_GEP, PtrTy}, Legal); - setAction({G_GEP, 1, IdxTy}, Legal); + if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { + getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) + .legalFor({S32, S64}) + .clampScalar(0, S32, S64) + .scalarize(0); + } else { + getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) + .legalFor({S32}) + .customFor({S64}) + .clampScalar(0, S32, S64) + .scalarize(0); } + getActionDefinitionsBuilder(G_GEP) + .legalForCartesianProduct(AddrSpaces64, {S64}) + .legalForCartesianProduct(AddrSpaces32, {S32}) + .scalarize(0); + setAction({G_BLOCK_ADDR, CodePtr}, Legal); - setAction({G_ICMP, S1}, Legal); - setAction({G_ICMP, 1, S32}, Legal); + auto &CmpBuilder = + getActionDefinitionsBuilder(G_ICMP) + .legalForCartesianProduct( + {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) + .legalFor({{S1, S32}, {S1, S64}}); + if (ST.has16BitInsts()) { + CmpBuilder.legalFor({{S1, S16}}); + } + + CmpBuilder + .widenScalarToNextPow2(1) + .clampScalar(1, S32, S64) + .scalarize(0) + .legalIf(all(typeIs(0, S1), isPointer(1))); + + getActionDefinitionsBuilder(G_FCMP) + .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) + .widenScalarToNextPow2(1) + .clampScalar(1, S32, S64) + .scalarize(0); + + // FIXME: fexp, flog2, flog10 needs to be custom lowered. + getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2, + G_FLOG, G_FLOG2, G_FLOG10}) + .legalFor({S32}) + .scalarize(0); + + // The 64-bit versions produce 32-bit results, but only on the SALU. 
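Concretely: a count over a 64-bit source still fits in 32 bits (the result is at most 64), which is why the rules below clamp the result type to S32 while letting the source type reach S64. A standalone sketch of a 64-bit ctlz assembled from 32-bit counts, the kind of expansion this rule shape permits (ctlz32/ctlz64 are hypothetical):

#include <cstdint>

// Leading-zero count on 32 bits (returns 32 for zero).
unsigned ctlz32(uint32_t X) {
  unsigned N = 0;
  for (uint32_t Bit = 1u << 31; Bit && !(X & Bit); Bit >>= 1)
    ++N;
  return N;
}

// 64-bit ctlz from the two halves; the result always fits in 32 bits.
unsigned ctlz64(uint64_t X) {
  uint32_t Hi = uint32_t(X >> 32), Lo = uint32_t(X);
  return Hi ? ctlz32(Hi) : 32 + ctlz32(Lo);
}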
+ getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF, + G_CTTZ, G_CTTZ_ZERO_UNDEF, + G_CTPOP}) + .legalFor({{S32, S32}, {S32, S64}}) + .clampScalar(0, S32, S32) + .clampScalar(1, S32, S64) + .scalarize(0) + .widenScalarToNextPow2(0, 32) + .widenScalarToNextPow2(1, 32); + + // TODO: Expand for > s32 + getActionDefinitionsBuilder(G_BSWAP) + .legalFor({S32}) + .clampScalar(0, S32, S32) + .scalarize(0); + + if (ST.has16BitInsts()) { + if (ST.hasVOP3PInsts()) { + getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) + .legalFor({S32, S16, V2S16}) + .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) + .clampMaxNumElements(0, S16, 2) + .clampScalar(0, S16, S32) + .widenScalarToNextPow2(0) + .scalarize(0); + } else { + getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) + .legalFor({S32, S16}) + .widenScalarToNextPow2(0) + .clampScalar(0, S16, S32) + .scalarize(0); + } + } else { + getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) + .legalFor({S32}) + .clampScalar(0, S32, S32) + .widenScalarToNextPow2(0) + .scalarize(0); + } - setAction({G_CTLZ, S32}, Legal); - setAction({G_CTLZ_ZERO_UNDEF, S32}, Legal); - setAction({G_CTTZ, S32}, Legal); - setAction({G_CTTZ_ZERO_UNDEF, S32}, Legal); - setAction({G_BSWAP, S32}, Legal); - setAction({G_CTPOP, S32}, Legal); + auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) { + return [=](const LegalityQuery &Query) { + return Query.Types[TypeIdx0].getSizeInBits() < + Query.Types[TypeIdx1].getSizeInBits(); + }; + }; + + auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) { + return [=](const LegalityQuery &Query) { + return Query.Types[TypeIdx0].getSizeInBits() > + Query.Types[TypeIdx1].getSizeInBits(); + }; + }; getActionDefinitionsBuilder(G_INTTOPTR) - .legalIf([](const LegalityQuery &Query) { - return true; - }); + // List the common cases + .legalForCartesianProduct(AddrSpaces64, {S64}) + .legalForCartesianProduct(AddrSpaces32, {S32}) + .scalarize(0) + // Accept any address space as long as the size matches + .legalIf(sameSize(0, 1)) + .widenScalarIf(smallerThan(1, 0), + [](const LegalityQuery &Query) { + return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); + }) + .narrowScalarIf(greaterThan(1, 0), + [](const LegalityQuery &Query) { + return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); + }); getActionDefinitionsBuilder(G_PTRTOINT) - .legalIf([](const LegalityQuery &Query) { - return true; - }); + // List the common cases + .legalForCartesianProduct(AddrSpaces64, {S64}) + .legalForCartesianProduct(AddrSpaces32, {S32}) + .scalarize(0) + // Accept any address space as long as the size matches + .legalIf(sameSize(0, 1)) + .widenScalarIf(smallerThan(0, 1), + [](const LegalityQuery &Query) { + return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); + }) + .narrowScalarIf( + greaterThan(0, 1), + [](const LegalityQuery &Query) { + return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); + }); + + if (ST.hasFlatAddressSpace()) { + getActionDefinitionsBuilder(G_ADDRSPACE_CAST) + .scalarize(0) + .custom(); + } + // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we + // handle some operations by just promoting the register during + // selection. There are also d16 loads on GFX9+ which preserve the high bits. 
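The load/store rules that follow gate legality on the pair of register size and memory size. A simplified standalone predicate mirroring their shape; loadIsLegalSketch is hypothetical and omits the vector and 256/512-bit cases the real rule handles:

// RegSize/MemSize in bits; HasDwordx3 models ST.hasDwordx3LoadStores().
bool loadIsLegalSketch(unsigned RegSize, unsigned MemSize, bool HasDwordx3) {
  if (RegSize < 32 || (RegSize > 32 && MemSize < RegSize))
    return false;                        // no tiny or partially-filled regs
  switch (MemSize) {
  case 8:
  case 16:
    return RegSize == 32;                // extending loads produce s32
  case 32:
  case 64:
  case 128:
    return true;
  case 96:
    return HasDwordx3;
  default:
    return false;
  }
}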
getActionDefinitionsBuilder({G_LOAD, G_STORE}) - .legalIf([=, &ST](const LegalityQuery &Query) { + .narrowScalarIf([](const LegalityQuery &Query) { + unsigned Size = Query.Types[0].getSizeInBits(); + unsigned MemSize = Query.MMODescrs[0].SizeInBits; + return (Size > 32 && MemSize < Size); + }, + [](const LegalityQuery &Query) { + return std::make_pair(0, LLT::scalar(32)); + }) + .fewerElementsIf([=](const LegalityQuery &Query) { + unsigned MemSize = Query.MMODescrs[0].SizeInBits; + return (MemSize == 96) && + Query.Types[0].isVector() && + !ST.hasDwordx3LoadStores(); + }, + [=](const LegalityQuery &Query) { + return std::make_pair(0, V2S32); + }) + .legalIf([=](const LegalityQuery &Query) { const LLT &Ty0 = Query.Types[0]; + unsigned Size = Ty0.getSizeInBits(); + unsigned MemSize = Query.MMODescrs[0].SizeInBits; + if (Size < 32 || (Size > 32 && MemSize < Size)) + return false; + + if (Ty0.isVector() && Size != MemSize) + return false; + // TODO: Decompose private loads into 4-byte components. // TODO: Illegal flat loads on SI - switch (Ty0.getSizeInBits()) { + switch (MemSize) { + case 8: + case 16: + return Size == 32; case 32: case 64: case 128: return true; case 96: - // XXX hasLoadX3 - return (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS); + return ST.hasDwordx3LoadStores(); case 256: case 512: - // TODO: constant loads + // TODO: Possibly support loads of i256 and i512. This will require + // adding i256 and i512 types to MVT in order to be able to use + // TableGen. + // TODO: Add support for other vector types; this will require + // defining more value mappings for the new types. + return Ty0.isVector() && (Ty0.getScalarType().getSizeInBits() == 32 || + Ty0.getScalarType().getSizeInBits() == 64); + + default: return false; } - }); + }) + .clampScalar(0, S32, S64); + // FIXME: Handle alignment requirements. auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) + .legalForTypesWithMemDesc({ + {S32, GlobalPtr, 8, 8}, + {S32, GlobalPtr, 16, 8}, + {S32, LocalPtr, 8, 8}, + {S32, LocalPtr, 16, 8}, + {S32, PrivatePtr, 8, 8}, + {S32, PrivatePtr, 16, 8}}); + if (ST.hasFlatAddressSpace()) { + ExtLoads.legalForTypesWithMemDesc({{S32, FlatPtr, 8, 8}, + {S32, FlatPtr, 16, 8}}); + } + + ExtLoads.clampScalar(0, S32, S32) + .widenScalarToNextPow2(0) + .unsupportedIfMemSizeNotPow2() + .lower(); + auto &Atomics = getActionDefinitionsBuilder( {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, @@ -240,84 +590,805 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST, Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); } - setAction({G_SELECT, S32}, Legal); - setAction({G_SELECT, 1, S1}, Legal); + // TODO: Pointer types, any 32-bit or 64-bit vector + getActionDefinitionsBuilder(G_SELECT) + .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, + GlobalPtr, LocalPtr, FlatPtr, PrivatePtr, + LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1}) + .clampScalar(0, S16, S64) + .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) + .fewerElementsIf(numElementsNotEven(0), scalarize(0)) + .scalarize(1) + .clampMaxNumElements(0, S32, 2) + .clampMaxNumElements(0, LocalPtr, 2) + .clampMaxNumElements(0, PrivatePtr, 2) + .scalarize(0) + .widenScalarToNextPow2(0) + .legalIf(all(isPointer(0), typeIs(1, S1))); - setAction({G_SHL, S32}, Legal); + // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can + // be more flexible with the shift amount type.
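That 4/5/6-bit remark is the usual hardware contract: a shift reads its amount modulo the operand width, i.e. the low 4 bits for 16-bit shifts, 5 for 32-bit, 6 for 64-bit. The assumed behavior as a standalone sketch:

#include <cstdint>

// Hardware-style shifts: the amount is taken modulo the operand width.
uint16_t shl16(uint16_t V, uint32_t Amt) { return uint16_t(V << (Amt & 15)); }
uint32_t shl32(uint32_t V, uint32_t Amt) { return V << (Amt & 31); }
uint64_t shl64(uint64_t V, uint32_t Amt) { return V << (Amt & 63); }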
+ auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) + .legalFor({{S32, S32}, {S64, S32}}); + if (ST.has16BitInsts()) { + if (ST.hasVOP3PInsts()) { + Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}}) + .clampMaxNumElements(0, S16, 2); + } else + Shifts.legalFor({{S16, S32}, {S16, S16}}); - - // FIXME: When RegBankSelect inserts copies, it will only create new - // registers with scalar types. This means we can end up with - // G_LOAD/G_STORE/G_GEP instruction with scalar types for their pointer - // operands. In assert builds, the instruction selector will assert - // if it sees a generic instruction which isn't legal, so we need to - // tell it that scalar types are legal for pointer operands - setAction({G_GEP, S64}, Legal); + Shifts.clampScalar(1, S16, S32); + Shifts.clampScalar(0, S16, S64); + Shifts.widenScalarToNextPow2(0, 16); + } else { + // Make sure we legalize the shift amount type first, as the general + // expansion for the shifted type will produce much worse code if it hasn't + // been truncated already. + Shifts.clampScalar(1, S32, S32); + Shifts.clampScalar(0, S32, S64); + Shifts.widenScalarToNextPow2(0, 32); + } + Shifts.scalarize(0); for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { + unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; + unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; + unsigned IdxTypeIdx = 2; + getActionDefinitionsBuilder(Op) - .legalIf([=](const LegalityQuery &Query) { - const LLT &VecTy = Query.Types[1]; - const LLT &IdxTy = Query.Types[2]; - return VecTy.getSizeInBits() % 32 == 0 && - VecTy.getSizeInBits() <= 512 && - IdxTy.getSizeInBits() == 32; - }); + .customIf([=](const LegalityQuery &Query) { + const LLT EltTy = Query.Types[EltTypeIdx]; + const LLT VecTy = Query.Types[VecTypeIdx]; + const LLT IdxTy = Query.Types[IdxTypeIdx]; + return (EltTy.getSizeInBits() == 16 || + EltTy.getSizeInBits() % 32 == 0) && + VecTy.getSizeInBits() % 32 == 0 && + VecTy.getSizeInBits() <= 512 && + IdxTy.getSizeInBits() == 32; + }) + .clampScalar(EltTypeIdx, S32, S64) + .clampScalar(VecTypeIdx, S32, S64) + .clampScalar(IdxTypeIdx, S32, S32); } - // FIXME: Doesn't handle extract of illegal sizes. - getActionDefinitionsBuilder({G_EXTRACT, G_INSERT}) - .legalIf([=](const LegalityQuery &Query) { - const LLT &Ty0 = Query.Types[0]; - const LLT &Ty1 = Query.Types[1]; - return (Ty0.getSizeInBits() % 32 == 0) && - (Ty1.getSizeInBits() % 32 == 0); + getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) + .unsupportedIf([=](const LegalityQuery &Query) { + const LLT &EltTy = Query.Types[1].getElementType(); + return Query.Types[0] != EltTy; }); + for (unsigned Op : {G_EXTRACT, G_INSERT}) { + unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; + unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; + + // FIXME: Doesn't handle extract of illegal sizes. 
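Unlike the vector-element operations just handled, G_EXTRACT and G_INSERT below address a bit range inside a wider register. The extract semantics being legalized, as a standalone scalar sketch (extractBits is hypothetical; Offset and Width are in bits, with Offset + Width <= 64 assumed):

#include <cstdint>

// Extract Width bits of Src starting at bit Offset.
uint64_t extractBits(uint64_t Src, unsigned Offset, unsigned Width) {
  uint64_t Mask = Width == 64 ? ~0ull : (1ull << Width) - 1;
  return (Src >> Offset) & Mask;
}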
+ getActionDefinitionsBuilder(Op) + .legalIf([=](const LegalityQuery &Query) { + const LLT BigTy = Query.Types[BigTyIdx]; + const LLT LitTy = Query.Types[LitTyIdx]; + return (BigTy.getSizeInBits() % 32 == 0) && + (LitTy.getSizeInBits() % 16 == 0); + }) + .widenScalarIf( + [=](const LegalityQuery &Query) { + const LLT BigTy = Query.Types[BigTyIdx]; + return (BigTy.getScalarSizeInBits() < 16); + }, + LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16)) + .widenScalarIf( + [=](const LegalityQuery &Query) { + const LLT LitTy = Query.Types[LitTyIdx]; + return (LitTy.getScalarSizeInBits() < 16); + }, + LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16)) + .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) + .widenScalarToNextPow2(BigTyIdx, 32); + + } + getActionDefinitionsBuilder(G_BUILD_VECTOR) - .legalForCartesianProduct(AllS32Vectors, {S32}) - .legalForCartesianProduct(AllS64Vectors, {S64}) - .clampNumElements(0, V16S32, V16S32) - .clampNumElements(0, V2S64, V8S64) - .minScalarSameAs(1, 0); + .legalForCartesianProduct(AllS32Vectors, {S32}) + .legalForCartesianProduct(AllS64Vectors, {S64}) + .clampNumElements(0, V16S32, V16S32) + .clampNumElements(0, V2S64, V8S64) + .minScalarSameAs(1, 0) + .legalIf(isRegisterType(0)) + .minScalarOrElt(0, S32); - // TODO: Support any combination of v2s32 getActionDefinitionsBuilder(G_CONCAT_VECTORS) - .legalFor({{V4S32, V2S32}, - {V8S32, V2S32}, - {V8S32, V4S32}, - {V4S64, V2S64}, - {V4S16, V2S16}, - {V8S16, V2S16}, - {V8S16, V4S16}}); + .legalIf(isRegisterType(0)); // Merge/Unmerge for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; + auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { + const LLT &Ty = Query.Types[TypeIdx]; + if (Ty.isVector()) { + const LLT &EltTy = Ty.getElementType(); + if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64) + return true; + if (!isPowerOf2_32(EltTy.getSizeInBits())) + return true; + } + return false; + }; + getActionDefinitionsBuilder(Op) + .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) + // Clamp the little scalar to s8-s256 and make it a power of 2. It's not + // worth considering the multiples of 64 since 2*192 and 2*384 are not + // valid. + .clampScalar(LitTyIdx, S16, S256) + .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) + + // Break up vectors with weird elements into scalars + .fewerElementsIf( + [=](const LegalityQuery &Query) { return notValidElt(Query, 0); }, + scalarize(0)) + .fewerElementsIf( + [=](const LegalityQuery &Query) { return notValidElt(Query, 1); }, + scalarize(1)) + .clampScalar(BigTyIdx, S32, S512) + .widenScalarIf( + [=](const LegalityQuery &Query) { + const LLT &Ty = Query.Types[BigTyIdx]; + return !isPowerOf2_32(Ty.getSizeInBits()) && + Ty.getSizeInBits() % 16 != 0; + }, + [=](const LegalityQuery &Query) { + // Pick the next power of 2, or a multiple of 64 over 128. + // Whichever is smaller. 
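Restating the comment above as standalone arithmetic: take the next power of two, but once that would reach 256 bits, prefer the next multiple of 64 when it is smaller. pickWidenedSize is a hypothetical mirror of the mutation that follows:

// Next power of 2 past SizeInBits, or a multiple of 64 once that is smaller.
unsigned pickWidenedSize(unsigned SizeInBits) {
  unsigned P = 1;
  while (P < SizeInBits + 1)
    P <<= 1;                  // 1 << Log2_32_Ceil(SizeInBits + 1)
  if (P >= 256) {
    unsigned Rounded = (SizeInBits + 1 + 63) / 64 * 64; // alignTo<64>(Size+1)
    if (Rounded < P)
      P = Rounded;
  }
  return P;
}
// pickWidenedSize(65) == 128; pickWidenedSize(257) == 320 (the multiple
// of 64 beats the next power of two, 512)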
+ const LLT &Ty = Query.Types[BigTyIdx]; + unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); + if (NewSizeInBits >= 256) { + unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); + if (RoundedTo < NewSizeInBits) + NewSizeInBits = RoundedTo; + } + return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); + }) .legalIf([=](const LegalityQuery &Query) { const LLT &BigTy = Query.Types[BigTyIdx]; const LLT &LitTy = Query.Types[LitTyIdx]; - return BigTy.getSizeInBits() % 32 == 0 && - LitTy.getSizeInBits() % 32 == 0 && + + if (BigTy.isVector() && BigTy.getSizeInBits() < 32) + return false; + if (LitTy.isVector() && LitTy.getSizeInBits() < 32) + return false; + + return BigTy.getSizeInBits() % 16 == 0 && + LitTy.getSizeInBits() % 16 == 0 && BigTy.getSizeInBits() <= 512; }) // Any vectors left are the wrong size. Scalarize them. - .fewerElementsIf([](const LegalityQuery &Query) { return true; }, - [](const LegalityQuery &Query) { - return std::make_pair( - 0, Query.Types[0].getElementType()); - }) - .fewerElementsIf([](const LegalityQuery &Query) { return true; }, - [](const LegalityQuery &Query) { - return std::make_pair( - 1, Query.Types[1].getElementType()); - }); - + .scalarize(0) + .scalarize(1); } computeTables(); verify(*ST.getInstrInfo()); } + +bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder, + GISelChangeObserver &Observer) const { + switch (MI.getOpcode()) { + case TargetOpcode::G_ADDRSPACE_CAST: + return legalizeAddrSpaceCast(MI, MRI, MIRBuilder); + case TargetOpcode::G_FRINT: + return legalizeFrint(MI, MRI, MIRBuilder); + case TargetOpcode::G_FCEIL: + return legalizeFceil(MI, MRI, MIRBuilder); + case TargetOpcode::G_INTRINSIC_TRUNC: + return legalizeIntrinsicTrunc(MI, MRI, MIRBuilder); + case TargetOpcode::G_SITOFP: + return legalizeITOFP(MI, MRI, MIRBuilder, true); + case TargetOpcode::G_UITOFP: + return legalizeITOFP(MI, MRI, MIRBuilder, false); + case TargetOpcode::G_FMINNUM: + case TargetOpcode::G_FMAXNUM: + case TargetOpcode::G_FMINNUM_IEEE: + case TargetOpcode::G_FMAXNUM_IEEE: + return legalizeMinNumMaxNum(MI, MRI, MIRBuilder); + case TargetOpcode::G_EXTRACT_VECTOR_ELT: + return legalizeExtractVectorElt(MI, MRI, MIRBuilder); + case TargetOpcode::G_INSERT_VECTOR_ELT: + return legalizeInsertVectorElt(MI, MRI, MIRBuilder); + default: + return false; + } + + llvm_unreachable("expected switch to return"); +} + +Register AMDGPULegalizerInfo::getSegmentAperture( + unsigned AS, + MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder) const { + MachineFunction &MF = MIRBuilder.getMF(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + const LLT S32 = LLT::scalar(32); + + if (ST.hasApertureRegs()) { + // FIXME: Use inline constants (src_{shared, private}_base) instead of + // getreg. + unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? + AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : + AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; + unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 
+ AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : + AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; + unsigned Encoding = + AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | + Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | + WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; + + Register ApertureReg = MRI.createGenericVirtualRegister(S32); + Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + + MIRBuilder.buildInstr(AMDGPU::S_GETREG_B32) + .addDef(GetReg) + .addImm(Encoding); + MRI.setType(GetReg, S32); + + auto ShiftAmt = MIRBuilder.buildConstant(S32, WidthM1 + 1); + MIRBuilder.buildInstr(TargetOpcode::G_SHL) + .addDef(ApertureReg) + .addUse(GetReg) + .addUse(ShiftAmt.getReg(0)); + + return ApertureReg; + } + + Register QueuePtr = MRI.createGenericVirtualRegister( + LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); + + // FIXME: Placeholder until we can track the input registers. + MIRBuilder.buildConstant(QueuePtr, 0xdeadbeef); + + // Offset into amd_queue_t for group_segment_aperture_base_hi / + // private_segment_aperture_base_hi. + uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; + + // FIXME: Don't use undef + Value *V = UndefValue::get(PointerType::get( + Type::getInt8Ty(MF.getFunction().getContext()), + AMDGPUAS::CONSTANT_ADDRESS)); + + MachinePointerInfo PtrInfo(V, StructOffset); + MachineMemOperand *MMO = MF.getMachineMemOperand( + PtrInfo, + MachineMemOperand::MOLoad | + MachineMemOperand::MODereferenceable | + MachineMemOperand::MOInvariant, + 4, + MinAlign(64, StructOffset)); + + Register LoadResult = MRI.createGenericVirtualRegister(S32); + Register LoadAddr; + + MIRBuilder.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); + MIRBuilder.buildLoad(LoadResult, LoadAddr, *MMO); + return LoadResult; +} + +bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( + MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder) const { + MachineFunction &MF = MIRBuilder.getMF(); + + MIRBuilder.setInstr(MI); + + Register Dst = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(1).getReg(); + + LLT DstTy = MRI.getType(Dst); + LLT SrcTy = MRI.getType(Src); + unsigned DestAS = DstTy.getAddressSpace(); + unsigned SrcAS = SrcTy.getAddressSpace(); + + // TODO: Avoid reloading from the queue ptr for each cast, or at least each + // vector element. + assert(!DstTy.isVector()); + + const AMDGPUTargetMachine &TM + = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); + + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) { + MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::G_BITCAST)); + return true; + } + + if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { + assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || + DestAS == AMDGPUAS::PRIVATE_ADDRESS); + unsigned NullVal = TM.getNullPointerValue(DestAS); + + auto SegmentNull = MIRBuilder.buildConstant(DstTy, NullVal); + auto FlatNull = MIRBuilder.buildConstant(SrcTy, 0); + + Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy); + + // Extract low 32-bits of the pointer. 
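// For reference, a sketch of how the s_getreg_b32 immediate built above is
// assembled. The field positions are assumptions taken from the
// hwreg(id, offset, size) layout: register id in bits [5:0], bit offset in
// [10:6], width minus one in [15:11]; the concrete values in main() are
// made-up stand-ins, since the real shifts and ids live in AMDGPU::Hwreg.
#include <cassert>
#include <cstdint>

constexpr unsigned IdShift = 0, OffsetShift = 6, WidthM1Shift = 11;

constexpr uint16_t encodeHwreg(unsigned Id, unsigned Offset, unsigned WidthM1) {
  return uint16_t((Id << IdShift) | (Offset << OffsetShift) |
                  (WidthM1 << WidthM1Shift));
}

int main() {
  // Hypothetical aperture field: id 15, 16 bits starting at bit 16.
  uint16_t Imm = encodeHwreg(15, 16, 15);
  assert(Imm == (15u | (16u << 6) | (15u << 11)));
  // The legalizer then shifts the s_getreg result left by WidthM1 + 1, so
  // the extracted field lands in the high half of the 32-bit aperture value.
  (void)Imm;
}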
+ MIRBuilder.buildExtract(PtrLo32, Src, 0); + + Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1)); + MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0)); + MIRBuilder.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); + + MI.eraseFromParent(); + return true; + } + + assert(SrcAS == AMDGPUAS::LOCAL_ADDRESS || + SrcAS == AMDGPUAS::PRIVATE_ADDRESS); + + auto SegmentNull = + MIRBuilder.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); + auto FlatNull = + MIRBuilder.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); + + Register ApertureReg = getSegmentAperture(DestAS, MRI, MIRBuilder); + + Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1)); + MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0)); + + Register BuildPtr = MRI.createGenericVirtualRegister(DstTy); + + // Coerce the type of the low half of the result so we can use merge_values. + Register SrcAsInt = MRI.createGenericVirtualRegister(LLT::scalar(32)); + MIRBuilder.buildInstr(TargetOpcode::G_PTRTOINT) + .addDef(SrcAsInt) + .addUse(Src); + + // TODO: Should we allow mismatched types but matching sizes in merges to + // avoid the ptrtoint? + MIRBuilder.buildMerge(BuildPtr, {SrcAsInt, ApertureReg}); + MIRBuilder.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0)); + + MI.eraseFromParent(); + return true; +} + +bool AMDGPULegalizerInfo::legalizeFrint( + MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder) const { + MIRBuilder.setInstr(MI); + + Register Src = MI.getOperand(1).getReg(); + LLT Ty = MRI.getType(Src); + assert(Ty.isScalar() && Ty.getSizeInBits() == 64); + + APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); + APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); + + auto C1 = MIRBuilder.buildFConstant(Ty, C1Val); + auto CopySign = MIRBuilder.buildFCopysign(Ty, C1, Src); + + // TODO: Should this propagate fast-math-flags? + auto Tmp1 = MIRBuilder.buildFAdd(Ty, Src, CopySign); + auto Tmp2 = MIRBuilder.buildFSub(Ty, Tmp1, CopySign); + + auto C2 = MIRBuilder.buildFConstant(Ty, C2Val); + auto Fabs = MIRBuilder.buildFAbs(Ty, Src); + + auto Cond = MIRBuilder.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); + MIRBuilder.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); + return true; +} + +bool AMDGPULegalizerInfo::legalizeFceil( + MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { + B.setInstr(MI); + + const LLT S1 = LLT::scalar(1); + const LLT S64 = LLT::scalar(64); + + Register Src = MI.getOperand(1).getReg(); + assert(MRI.getType(Src) == S64); + + // result = trunc(src) + // if (src > 0.0 && src != result) + // result += 1.0 + + auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src}); + + const auto Zero = B.buildFConstant(S64, 0.0); + const auto One = B.buildFConstant(S64, 1.0); + auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); + auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); + auto And = B.buildAnd(S1, Lt0, NeTrunc); + auto Add = B.buildSelect(S64, And, One, Zero); + + // TODO: Should this propagate fast-math-flags? 
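// The legalizeFrint expansion above is the classic add-and-subtract trick:
// adding copysign(2^52, x) forces rounding to an integer (ties to even) and
// subtracting it back recovers the result, while anything at least 2^52 in
// magnitude is already integral. A host-double sketch, assuming the default
// round-to-nearest FP environment:
#include <cassert>
#include <cmath>

static double frintExpansion(double Src) {
  const double C1 = 0x1.0p+52;             // same constant as C1Val above
  const double C2 = 0x1.fffffffffffffp+51; // same constant as C2Val above
  double CopySign = std::copysign(C1, Src);
  double Tmp2 = (Src + CopySign) - CopySign;
  return std::fabs(Src) > C2 ? Src : Tmp2;
}

int main() {
  assert(frintExpansion(2.5) == 2.0);  // ties round to even
  assert(frintExpansion(3.5) == 4.0);
  assert(frintExpansion(-1.2) == -1.0);
  assert(frintExpansion(0x1.8p+53) == 0x1.8p+53); // already integral
}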
+ B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); + return true; +} + +static MachineInstrBuilder extractF64Exponent(unsigned Hi, + MachineIRBuilder &B) { + const unsigned FractBits = 52; + const unsigned ExpBits = 11; + LLT S32 = LLT::scalar(32); + + auto Const0 = B.buildConstant(S32, FractBits - 32); + auto Const1 = B.buildConstant(S32, ExpBits); + + auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) + .addUse(Hi) + .addUse(Const0.getReg(0)) + .addUse(Const1.getReg(0)); + + return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); +} + +bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( + MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { + B.setInstr(MI); + + const LLT S1 = LLT::scalar(1); + const LLT S32 = LLT::scalar(32); + const LLT S64 = LLT::scalar(64); + + Register Src = MI.getOperand(1).getReg(); + assert(MRI.getType(Src) == S64); + + // TODO: Should this use extract since the low half is unused? + auto Unmerge = B.buildUnmerge({S32, S32}, Src); + Register Hi = Unmerge.getReg(1); + + // Extract the upper half, since this is where we will find the sign and + // exponent. + auto Exp = extractF64Exponent(Hi, B); + + const unsigned FractBits = 52; + + // Extract the sign bit. + const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); + auto SignBit = B.buildAnd(S32, Hi, SignBitMask); + + const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); + + const auto Zero32 = B.buildConstant(S32, 0); + + // Extend back to 64-bits. + auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)}); + + auto Shr = B.buildAShr(S64, FractMask, Exp); + auto Not = B.buildNot(S64, Shr); + auto Tmp0 = B.buildAnd(S64, Src, Not); + auto FiftyOne = B.buildConstant(S32, FractBits - 1); + + auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); + auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); + + auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); + B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); + return true; +} + +bool AMDGPULegalizerInfo::legalizeITOFP( + MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B, bool Signed) const { + B.setInstr(MI); + + Register Dst = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(1).getReg(); + + const LLT S64 = LLT::scalar(64); + const LLT S32 = LLT::scalar(32); + + assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); + + auto Unmerge = B.buildUnmerge({S32, S32}, Src); + + auto CvtHi = Signed ? + B.buildSITOFP(S64, Unmerge.getReg(1)) : + B.buildUITOFP(S64, Unmerge.getReg(1)); + + auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); + + auto ThirtyTwo = B.buildConstant(S32, 32); + auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) + .addUse(CvtHi.getReg(0)) + .addUse(ThirtyTwo.getReg(0)); + + // TODO: Should this propagate fast-math-flags?
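// legalizeIntrinsicTrunc above truncates by clearing the fraction bits that
// sit below the binary point: for unbiased exponent Exp, the low (52 - Exp)
// fraction bits are dropped, a negative exponent leaves only the sign, and
// exponents above 51 have no fractional bits at all. The same computation on
// host doubles, as a sketch:
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>

static double truncExpansion(double Src) {
  uint64_t Bits;
  std::memcpy(&Bits, &Src, 8);
  int Exp = int((Bits >> 52) & 0x7ff) - 1023; // extractF64Exponent
  uint64_t SignBit = Bits & (UINT64_C(1) << 63);
  if (Exp < 0) { // |Src| < 1.0: keep only the sign (the ExpLt0 select)
    std::memcpy(&Src, &SignBit, 8);
    return Src;
  }
  if (Exp > 51) // no fraction bits left (the ExpGt51 select)
    return Src;
  uint64_t FractMask = (UINT64_C(1) << 52) - 1;
  Bits &= ~(FractMask >> Exp); // clear the fractional part
  std::memcpy(&Src, &Bits, 8);
  return Src;
}

int main() {
  assert(truncExpansion(3.75) == 3.0);
  assert(truncExpansion(-2.9) == -2.0);
  assert(truncExpansion(0.5) == 0.0);
  assert(std::signbit(truncExpansion(-0.5))); // -0.5 truncates to -0.0
}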
+ B.buildFAdd(Dst, LdExp, CvtLo); + MI.eraseFromParent(); + return true; +} + +bool AMDGPULegalizerInfo::legalizeMinNumMaxNum( + MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { + MachineFunction &MF = B.getMF(); + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + + const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || + MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; + + // With ieee_mode disabled, the instructions have the correct behavior + // already for G_FMINNUM/G_FMAXNUM + if (!MFI->getMode().IEEE) + return !IsIEEEOp; + + if (IsIEEEOp) + return true; + + MachineIRBuilder HelperBuilder(MI); + GISelObserverWrapper DummyObserver; + LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); + HelperBuilder.setMBB(*MI.getParent()); + return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; +} + +bool AMDGPULegalizerInfo::legalizeExtractVectorElt( + MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { + // TODO: Should move some of this into LegalizerHelper. + + // TODO: Promote dynamic indexing of s16 to s32 + // TODO: Dynamic s64 indexing is only legal for SGPR. + Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI); + if (!IdxVal) // Dynamic case will be selected to register indexing. + return true; + + Register Dst = MI.getOperand(0).getReg(); + Register Vec = MI.getOperand(1).getReg(); + + LLT VecTy = MRI.getType(Vec); + LLT EltTy = VecTy.getElementType(); + assert(EltTy == MRI.getType(Dst)); + + B.setInstr(MI); + + if (IdxVal.getValue() < VecTy.getNumElements()) + B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits()); + else + B.buildUndef(Dst); + + MI.eraseFromParent(); + return true; +} + +bool AMDGPULegalizerInfo::legalizeInsertVectorElt( + MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { + // TODO: Should move some of this into LegalizerHelper. + + // TODO: Promote dynamic indexing of s16 to s32 + // TODO: Dynamic s64 indexing is only legal for SGPR. + Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI); + if (!IdxVal) // Dynamic case will be selected to register indexing. + return true; + + Register Dst = MI.getOperand(0).getReg(); + Register Vec = MI.getOperand(1).getReg(); + Register Ins = MI.getOperand(2).getReg(); + + LLT VecTy = MRI.getType(Vec); + LLT EltTy = VecTy.getElementType(); + assert(EltTy == MRI.getType(Ins)); + + B.setInstr(MI); + + if (IdxVal.getValue() < VecTy.getNumElements()) + B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits()); + else + B.buildUndef(Dst); + + MI.eraseFromParent(); + return true; +} + +// Return the use branch instruction, otherwise null if the usage is invalid. +static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, + MachineRegisterInfo &MRI) { + Register CondDef = MI.getOperand(0).getReg(); + if (!MRI.hasOneNonDBGUse(CondDef)) + return nullptr; + + MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); + return UseMI.getParent() == MI.getParent() && + UseMI.getOpcode() == AMDGPU::G_BRCOND ? 
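// legalizeITOFP above converts a 64-bit integer by splitting it into 32-bit
// halves, converting each half exactly, and recombining with ldexp so that
// only the final add rounds. A plain C++ rendering, with std::ldexp standing
// in for llvm.amdgcn.ldexp:
#include <cassert>
#include <cmath>
#include <cstdint>

static double sitofp64(int64_t Src) {
  uint32_t Lo = uint32_t(Src);     // low word is always converted unsigned
  int32_t Hi = int32_t(Src >> 32); // the sign lives in the high word
  double CvtHi = double(Hi);       // G_SITOFP on the high half
  double CvtLo = double(Lo);       // G_UITOFP on the low half
  return std::ldexp(CvtHi, 32) + CvtLo;
}

int main() {
  for (int64_t V : {INT64_C(0), INT64_C(-1), INT64_MIN,
                    INT64_C(123456789012345678)})
    assert(sitofp64(V) == double(V)); // matches a direct correctly rounded cast
}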
&UseMI : nullptr; +} + +Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI, + Register Reg, LLT Ty) const { + Register LiveIn = MRI.getLiveInVirtReg(Reg); + if (LiveIn) + return LiveIn; + + Register NewReg = MRI.createGenericVirtualRegister(Ty); + MRI.addLiveIn(Reg, NewReg); + return NewReg; +} + +bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, + const ArgDescriptor *Arg) const { + if (!Arg->isRegister()) + return false; // TODO: Handle these + + assert(Arg->getRegister() != 0); + assert(Arg->getRegister().isPhysical()); + + MachineRegisterInfo &MRI = *B.getMRI(); + + LLT Ty = MRI.getType(DstReg); + Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty); + + if (Arg->isMasked()) { + // TODO: Should we try to emit this once in the entry block? + const LLT S32 = LLT::scalar(32); + const unsigned Mask = Arg->getMask(); + const unsigned Shift = countTrailingZeros<unsigned>(Mask); + + auto ShiftAmt = B.buildConstant(S32, Shift); + auto LShr = B.buildLShr(S32, LiveIn, ShiftAmt); + B.buildAnd(DstReg, LShr, B.buildConstant(S32, Mask >> Shift)); + } else + B.buildCopy(DstReg, LiveIn); + + // Insert the argument copy if it doesn't already exist. + // FIXME: It seems EmitLiveInCopies isn't called anywhere? + if (!MRI.getVRegDef(LiveIn)) { + MachineBasicBlock &EntryMBB = B.getMF().front(); + EntryMBB.addLiveIn(Arg->getRegister()); + B.setInsertPt(EntryMBB, EntryMBB.begin()); + B.buildCopy(LiveIn, Arg->getRegister()); + } + + return true; +} + +bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( + MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B, + AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { + B.setInstr(MI); + + const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); + + const ArgDescriptor *Arg; + const TargetRegisterClass *RC; + std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType); + if (!Arg) { + LLVM_DEBUG(dbgs() << "Required arg register missing\n"); + return false; + } + + if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) { + MI.eraseFromParent(); + return true; + } + + return false; +} + +bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { + const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); + if (!MFI->isEntryFunction()) { + return legalizePreloadedArgIntrin(MI, MRI, B, + AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); + } + + B.setInstr(MI); + + uint64_t Offset = + ST.getTargetLowering()->getImplicitParameterOffset( + B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); + Register DstReg = MI.getOperand(0).getReg(); + LLT DstTy = MRI.getType(DstReg); + LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); + + const ArgDescriptor *Arg; + const TargetRegisterClass *RC; + std::tie(Arg, RC) + = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); + if (!Arg) + return false; + + Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); + if (!loadInputValue(KernargPtrReg, B, Arg)) + return false; + + B.buildGEP(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); + MI.eraseFromParent(); + return true; +} + +bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { + // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
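// When an argument is packed into part of a register, loadInputValue above
// emits a right shift by the mask's trailing zero count followed by an AND
// with the shifted mask. The same decode in C++, over a hypothetical packing
// of three 10-bit ids (the actual masks come from the ArgDescriptor):
#include <cassert>

static unsigned decodeMaskedArg(unsigned Reg, unsigned Mask) {
  unsigned Shift = __builtin_ctz(Mask); // countTrailingZeros<unsigned>(Mask)
  return (Reg >> Shift) & (Mask >> Shift);
}

int main() {
  unsigned Packed = (3u << 20) | (2u << 10) | 1u;
  assert(decodeMaskedArg(Packed, 0x3FFu) == 1);       // bits [9:0]
  assert(decodeMaskedArg(Packed, 0x3FFu << 10) == 2); // bits [19:10]
  assert(decodeMaskedArg(Packed, 0x3FFu << 20) == 3); // bits [29:20]
}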
+ switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) { + case Intrinsic::amdgcn_if: { + if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) { + const SIRegisterInfo *TRI + = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); + + B.setInstr(*BrCond); + Register Def = MI.getOperand(1).getReg(); + Register Use = MI.getOperand(3).getReg(); + B.buildInstr(AMDGPU::SI_IF) + .addDef(Def) + .addUse(Use) + .addMBB(BrCond->getOperand(1).getMBB()); + + MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); + MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); + MI.eraseFromParent(); + BrCond->eraseFromParent(); + return true; + } + + return false; + } + case Intrinsic::amdgcn_loop: { + if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) { + const SIRegisterInfo *TRI + = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); + + B.setInstr(*BrCond); + Register Reg = MI.getOperand(2).getReg(); + B.buildInstr(AMDGPU::SI_LOOP) + .addUse(Reg) + .addMBB(BrCond->getOperand(1).getMBB()); + MI.eraseFromParent(); + BrCond->eraseFromParent(); + MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); + return true; + } + + return false; + } + case Intrinsic::amdgcn_kernarg_segment_ptr: + return legalizePreloadedArgIntrin( + MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); + case Intrinsic::amdgcn_implicitarg_ptr: + return legalizeImplicitArgPtr(MI, MRI, B); + case Intrinsic::amdgcn_workitem_id_x: + return legalizePreloadedArgIntrin(MI, MRI, B, + AMDGPUFunctionArgInfo::WORKITEM_ID_X); + case Intrinsic::amdgcn_workitem_id_y: + return legalizePreloadedArgIntrin(MI, MRI, B, + AMDGPUFunctionArgInfo::WORKITEM_ID_Y); + case Intrinsic::amdgcn_workitem_id_z: + return legalizePreloadedArgIntrin(MI, MRI, B, + AMDGPUFunctionArgInfo::WORKITEM_ID_Z); + case Intrinsic::amdgcn_workgroup_id_x: + return legalizePreloadedArgIntrin(MI, MRI, B, + AMDGPUFunctionArgInfo::WORKGROUP_ID_X); + case Intrinsic::amdgcn_workgroup_id_y: + return legalizePreloadedArgIntrin(MI, MRI, B, + AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); + case Intrinsic::amdgcn_workgroup_id_z: + return legalizePreloadedArgIntrin(MI, MRI, B, + AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); + case Intrinsic::amdgcn_dispatch_ptr: + return legalizePreloadedArgIntrin(MI, MRI, B, + AMDGPUFunctionArgInfo::DISPATCH_PTR); + case Intrinsic::amdgcn_queue_ptr: + return legalizePreloadedArgIntrin(MI, MRI, B, + AMDGPUFunctionArgInfo::QUEUE_PTR); + case Intrinsic::amdgcn_implicit_buffer_ptr: + return legalizePreloadedArgIntrin( + MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); + case Intrinsic::amdgcn_dispatch_id: + return legalizePreloadedArgIntrin(MI, MRI, B, + AMDGPUFunctionArgInfo::DISPATCH_ID); + default: + return true; + } + + return true; +} diff --git a/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/lib/Target/AMDGPU/AMDGPULegalizerInfo.h index 1cbd37c42c4b..3f1cc1d265dd 100644 --- a/lib/Target/AMDGPU/AMDGPULegalizerInfo.h +++ b/lib/Target/AMDGPU/AMDGPULegalizerInfo.h @@ -1,9 +1,8 @@ //===- AMDGPULegalizerInfo ---------------------------------------*- C++ -*-==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file @@ -16,6 +15,7 @@ #define LLVM_LIB_TARGET_AMDGPU_AMDGPUMACHINELEGALIZER_H #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" +#include "AMDGPUArgumentUsageInfo.h" namespace llvm { @@ -25,9 +25,51 @@ class GCNSubtarget; /// This class provides the information for the target register banks. class AMDGPULegalizerInfo : public LegalizerInfo { + const GCNSubtarget &ST; + public: AMDGPULegalizerInfo(const GCNSubtarget &ST, const GCNTargetMachine &TM); + + bool legalizeCustom(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder, + GISelChangeObserver &Observer) const override; + + Register getSegmentAperture(unsigned AddrSpace, + MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder) const; + + bool legalizeAddrSpaceCast(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder) const; + bool legalizeFrint(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder) const; + bool legalizeFceil(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder) const; + bool legalizeIntrinsicTrunc(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder) const; + bool legalizeITOFP(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder, bool Signed) const; + bool legalizeMinNumMaxNum(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder) const; + bool legalizeExtractVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder) const; + bool legalizeInsertVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder) const; + + Register getLiveInRegister(MachineRegisterInfo &MRI, + Register Reg, LLT Ty) const; + + bool loadInputValue(Register DstReg, MachineIRBuilder &B, + const ArgDescriptor *Arg) const; + bool legalizePreloadedArgIntrin( + MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, + AMDGPUFunctionArgInfo::PreloadedValue ArgType) const; + + bool legalizeImplicitArgPtr(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const; + bool legalizeIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder) const override; + }; } // End llvm namespace. #endif diff --git a/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/lib/Target/AMDGPU/AMDGPULibCalls.cpp index 14e880042691..ce0a9db7c7f4 100644 --- a/lib/Target/AMDGPU/AMDGPULibCalls.cpp +++ b/lib/Target/AMDGPU/AMDGPULibCalls.cpp @@ -1,9 +1,8 @@ //===- AMDGPULibCalls.cpp -------------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -16,6 +15,7 @@ #include "AMDGPU.h" #include "AMDGPULibFunc.h" +#include "AMDGPUSubtarget.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/Loads.h" #include "llvm/ADT/StringSet.h" @@ -23,6 +23,7 @@ #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Function.h" #include "llvm/IR/LLVMContext.h" @@ -30,6 +31,7 @@ #include "llvm/IR/ValueSymbolTable.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" #include <vector> #include <cmath> @@ -66,6 +68,8 @@ private: typedef llvm::AMDGPULibFunc FuncInfo; + const TargetMachine *TM; + // -fuse-native. bool AllNative = false; @@ -73,7 +77,7 @@ private: // Return a pointer (pointer expr) to the function if function definition with // "FuncName" exists. It may create a new function prototype in pre-link mode. - Constant *getFunction(Module *M, const FuncInfo& fInfo); + FunctionCallee getFunction(Module *M, const FuncInfo &fInfo); // Replace a normal function with its native version. bool replaceWithNative(CallInst *CI, const FuncInfo &FInfo); @@ -135,12 +139,15 @@ private: // __read_pipe/__write_pipe bool fold_read_write_pipe(CallInst *CI, IRBuilder<> &B, FuncInfo &FInfo); + // llvm.amdgcn.wavefrontsize + bool fold_wavefrontsize(CallInst *CI, IRBuilder<> &B); + // Get insertion point at entry. BasicBlock::iterator getEntryIns(CallInst * UI); // Insert an Alloc instruction. AllocaInst* insertAlloca(CallInst * UI, IRBuilder<> &B, const char *prefix); // Get a scalar native builtin single argument FP function - Constant* getNativeFunction(Module* M, const FuncInfo &FInfo); + FunctionCallee getNativeFunction(Module *M, const FuncInfo &FInfo); protected: CallInst *CI; @@ -153,6 +160,8 @@ protected: } public: + AMDGPULibCalls(const TargetMachine *TM_ = nullptr) : TM(TM_) {} + bool fold(CallInst *CI, AliasAnalysis *AA = nullptr); void initNativeFuncs(); @@ -167,15 +176,16 @@ namespace { class AMDGPUSimplifyLibCalls : public FunctionPass { - AMDGPULibCalls Simplifier; - const TargetOptions Options; + AMDGPULibCalls Simplifier; + public: static char ID; // Pass identification - AMDGPUSimplifyLibCalls(const TargetOptions &Opt = TargetOptions()) - : FunctionPass(ID), Options(Opt) { + AMDGPUSimplifyLibCalls(const TargetOptions &Opt = TargetOptions(), + const TargetMachine *TM = nullptr) + : FunctionPass(ID), Options(Opt), Simplifier(TM) { initializeAMDGPUSimplifyLibCallsPass(*PassRegistry::getPassRegistry()); } @@ -217,19 +227,19 @@ INITIALIZE_PASS(AMDGPUUseNativeCalls, "amdgpu-usenative", false, false) template <typename IRB> -static CallInst *CreateCallEx(IRB &B, Value *Callee, Value *Arg, +static CallInst *CreateCallEx(IRB &B, FunctionCallee Callee, Value *Arg, const Twine &Name = "") { CallInst *R = B.CreateCall(Callee, Arg, Name); - if (Function* F = dyn_cast<Function>(Callee)) + if (Function *F = dyn_cast<Function>(Callee.getCallee())) R->setCallingConv(F->getCallingConv()); return R; } template <typename IRB> -static CallInst *CreateCallEx2(IRB &B, Value *Callee, Value *Arg1, Value *Arg2, - const Twine &Name = "") { +static CallInst *CreateCallEx2(IRB &B, FunctionCallee Callee, Value *Arg1, + Value *Arg2, const Twine &Name = "") { CallInst *R = B.CreateCall(Callee,
{Arg1, Arg2}, Name); - if (Function* F = dyn_cast<Function>(Callee)) + if (Function *F = dyn_cast<Function>(Callee.getCallee())) R->setCallingConv(F->getCallingConv()); return R; } @@ -472,7 +482,7 @@ static inline AMDGPULibFunc::EType getArgType(const AMDGPULibFunc& FInfo) { return (AMDGPULibFunc::EType)FInfo.getLeads()[0].ArgType; } -Constant *AMDGPULibCalls::getFunction(Module *M, const FuncInfo& fInfo) { +FunctionCallee AMDGPULibCalls::getFunction(Module *M, const FuncInfo &fInfo) { // If we are doing PreLinkOpt, the function is external. So it is safe to // use getOrInsertFunction() at this stage. @@ -519,11 +529,11 @@ bool AMDGPULibCalls::sincosUseNative(CallInst *aCI, const FuncInfo &FInfo) { nf.setPrefix(AMDGPULibFunc::NATIVE); nf.setId(AMDGPULibFunc::EI_SIN); - Constant *sinExpr = getFunction(M, nf); + FunctionCallee sinExpr = getFunction(M, nf); nf.setPrefix(AMDGPULibFunc::NATIVE); nf.setId(AMDGPULibFunc::EI_COS); - Constant *cosExpr = getFunction(M, nf); + FunctionCallee cosExpr = getFunction(M, nf); if (sinExpr && cosExpr) { Value *sinval = CallInst::Create(sinExpr, opr0, "splitsin", aCI); Value *cosval = CallInst::Create(cosExpr, opr0, "splitcos", aCI); @@ -555,7 +565,7 @@ bool AMDGPULibCalls::useNative(CallInst *aCI) { return sincosUseNative(aCI, FInfo); FInfo.setPrefix(AMDGPULibFunc::NATIVE); - Constant *F = getFunction(aCI->getModule(), FInfo); + FunctionCallee F = getFunction(aCI->getModule(), FInfo); if (!F) return false; @@ -613,7 +623,7 @@ bool AMDGPULibCalls::fold_read_write_pipe(CallInst *CI, IRBuilder<> &B, auto *FTy = FunctionType::get(Callee->getReturnType(), ArrayRef<Type *>(ArgTys), false); AMDGPULibFunc NewLibFunc(Name, FTy); - auto *F = AMDGPULibFunc::getOrInsertFunction(M, NewLibFunc); + FunctionCallee F = AMDGPULibFunc::getOrInsertFunction(M, NewLibFunc); if (!F) return false; @@ -640,14 +650,6 @@ bool AMDGPULibCalls::fold(CallInst *CI, AliasAnalysis *AA) { // Ignore indirect calls. if (Callee == 0) return false; - FuncInfo FInfo; - if (!parseFunctionName(Callee->getName(), &FInfo)) - return false; - - // Further check the number of arguments to see if they match. - if (CI->getNumArgOperands() != FInfo.getNumArgs()) - return false; - BasicBlock *BB = CI->getParent(); LLVMContext &Context = CI->getParent()->getContext(); IRBuilder<> B(Context); @@ -659,6 +661,21 @@ bool AMDGPULibCalls::fold(CallInst *CI, AliasAnalysis *AA) { if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(CI)) B.setFastMathFlags(FPOp->getFastMathFlags()); + switch (Callee->getIntrinsicID()) { + default: + break; + case Intrinsic::amdgcn_wavefrontsize: + return !EnablePreLink && fold_wavefrontsize(CI, B); + } + + FuncInfo FInfo; + if (!parseFunctionName(Callee->getName(), &FInfo)) + return false; + + // Further check the number of arguments to see if they match. 
+ if (CI->getNumArgOperands() != FInfo.getNumArgs()) + return false; + if (TDOFold(CI, FInfo)) return true; @@ -795,7 +812,7 @@ bool AMDGPULibCalls::replaceWithNative(CallInst *CI, const FuncInfo &FInfo) { AMDGPULibFunc nf = FInfo; nf.setPrefix(AMDGPULibFunc::NATIVE); - if (Constant *FPExpr = getFunction(M, nf)) { + if (FunctionCallee FPExpr = getFunction(M, nf)) { LLVM_DEBUG(dbgs() << "AMDIC: " << *CI << " ---> "); CI->setCalledFunction(FPExpr); @@ -848,7 +865,7 @@ bool AMDGPULibCalls::fold_divide(CallInst *CI, IRBuilder<> &B, namespace llvm { static double log2(double V) { -#if _XOPEN_SOURCE >= 600 || _ISOC99_SOURCE || _POSIX_C_SOURCE >= 200112L +#if _XOPEN_SOURCE >= 600 || defined(_ISOC99_SOURCE) || _POSIX_C_SOURCE >= 200112L return ::log2(V); #else return log(V) / 0.693147180559945309417; @@ -934,9 +951,10 @@ bool AMDGPULibCalls::fold_pow(CallInst *CI, IRBuilder<> &B, if (CF && (CF->isExactlyValue(0.5) || CF->isExactlyValue(-0.5))) { // pow[r](x, [-]0.5) = sqrt(x) bool issqrt = CF->isExactlyValue(0.5); - if (Constant *FPExpr = getFunction(M, - AMDGPULibFunc(issqrt ? AMDGPULibFunc::EI_SQRT - : AMDGPULibFunc::EI_RSQRT, FInfo))) { + if (FunctionCallee FPExpr = + getFunction(M, AMDGPULibFunc(issqrt ? AMDGPULibFunc::EI_SQRT + : AMDGPULibFunc::EI_RSQRT, + FInfo))) { LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << FInfo.getName().c_str() << "(" << *opr0 << ")\n"); Value *nval = CreateCallEx(B,FPExpr, opr0, issqrt ? "__pow2sqrt" @@ -1003,8 +1021,8 @@ bool AMDGPULibCalls::fold_pow(CallInst *CI, IRBuilder<> &B, // powr ---> exp2(y * log2(x)) // pown/pow ---> powr(fabs(x), y) | (x & ((int)y << 31)) - Constant *ExpExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_EXP2, - FInfo)); + FunctionCallee ExpExpr = + getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_EXP2, FInfo)); if (!ExpExpr) return false; @@ -1090,8 +1108,8 @@ bool AMDGPULibCalls::fold_pow(CallInst *CI, IRBuilder<> &B, Value *nval; if (needabs) { - Constant *AbsExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_FABS, - FInfo)); + FunctionCallee AbsExpr = + getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_FABS, FInfo)); if (!AbsExpr) return false; nval = CreateCallEx(B, AbsExpr, opr0, "__fabs"); @@ -1099,8 +1117,8 @@ bool AMDGPULibCalls::fold_pow(CallInst *CI, IRBuilder<> &B, nval = cnval ? 
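// The rewrite noted above, powr(x, y) == exp2(y * log2(x)), holds on the
// x > 0 domain powr is defined for; pown/pow recover the sign separately by
// or-ing the sign bit back in. A quick numeric check of the identity:
#include <cassert>
#include <cmath>

static double powrViaExp2(double X, double Y) {
  return std::exp2(Y * std::log2(X));
}

int main() {
  for (double X : {0.5, 2.0, 10.0})
    for (double Y : {-1.5, 0.0, 3.0}) {
      double Ref = std::pow(X, Y);
      assert(std::fabs(powrViaExp2(X, Y) - Ref) <= 1e-12 * std::fabs(Ref));
    }
}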
cnval : opr0; } if (needlog) { - Constant *LogExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_LOG2, - FInfo)); + FunctionCallee LogExpr = + getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_LOG2, FInfo)); if (!LogExpr) return false; nval = CreateCallEx(B,LogExpr, nval, "__log2"); @@ -1159,8 +1177,8 @@ bool AMDGPULibCalls::fold_rootn(CallInst *CI, IRBuilder<> &B, std::vector<const Type*> ParamsTys; ParamsTys.push_back(opr0->getType()); Module *M = CI->getModule(); - if (Constant *FPExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_SQRT, - FInfo))) { + if (FunctionCallee FPExpr = + getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_SQRT, FInfo))) { LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> sqrt(" << *opr0 << ")\n"); Value *nval = CreateCallEx(B,FPExpr, opr0, "__rootn2sqrt"); replaceCall(nval); @@ -1168,8 +1186,8 @@ } else if (ci_opr1 == 3) { // rootn(x, 3) = cbrt(x) Module *M = CI->getModule(); - if (Constant *FPExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_CBRT, - FInfo))) { + if (FunctionCallee FPExpr = + getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_CBRT, FInfo))) { LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> cbrt(" << *opr0 << ")\n"); Value *nval = CreateCallEx(B,FPExpr, opr0, "__rootn2cbrt"); replaceCall(nval); @@ -1186,8 +1204,8 @@ std::vector<const Type*> ParamsTys; ParamsTys.push_back(opr0->getType()); Module *M = CI->getModule(); - if (Constant *FPExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_RSQRT, - FInfo))) { + if (FunctionCallee FPExpr = + getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_RSQRT, FInfo))) { LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> rsqrt(" << *opr0 << ")\n"); Value *nval = CreateCallEx(B,FPExpr, opr0, "__rootn2rsqrt"); @@ -1243,7 +1261,8 @@ } // Get a scalar native builtin single argument FP function -Constant* AMDGPULibCalls::getNativeFunction(Module* M, const FuncInfo& FInfo) { +FunctionCallee AMDGPULibCalls::getNativeFunction(Module *M, + const FuncInfo &FInfo) { if (getArgType(FInfo) == AMDGPULibFunc::F64 || !HasNative(FInfo.getId())) return nullptr; FuncInfo nf = FInfo; @@ -1256,8 +1275,8 @@ bool AMDGPULibCalls::fold_sqrt(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo) { if (getArgType(FInfo) == AMDGPULibFunc::F32 && (getVecSize(FInfo) == 1) && (FInfo.getPrefix() != AMDGPULibFunc::NATIVE)) { - if (Constant *FPExpr = getNativeFunction( - CI->getModule(), AMDGPULibFunc(AMDGPULibFunc::EI_SQRT, FInfo))) { + if (FunctionCallee FPExpr = getNativeFunction( + CI->getModule(), AMDGPULibFunc(AMDGPULibFunc::EI_SQRT, FInfo))) { Value *opr0 = CI->getArgOperand(0); LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << "sqrt(" << *opr0 << ")\n"); @@ -1334,7 +1353,7 @@ bool AMDGPULibCalls::fold_sincos(CallInst *CI, IRBuilder<> &B, // function.
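// The fold_rootn cases above rely on rootn(x, n) = x^(1/n), so n = 2 is
// sqrt, n = 3 is cbrt, and n = -2 is rsqrt. A reference check in C++:
#include <cassert>
#include <cmath>

static double rootnRef(double X, int N) { return std::pow(X, 1.0 / N); }

int main() {
  double X = 7.3;
  assert(std::fabs(rootnRef(X, 2) - std::sqrt(X)) < 1e-12);
  assert(std::fabs(rootnRef(X, 3) - std::cbrt(X)) < 1e-12);
  assert(std::fabs(rootnRef(X, -2) - 1.0 / std::sqrt(X)) < 1e-12);
}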
AMDGPULibFunc nf(AMDGPULibFunc::EI_SINCOS, fInfo); nf.getLeads()[0].PtrKind = AMDGPULibFunc::getEPtrKindFromAddrSpace(AMDGPUAS::FLAT_ADDRESS); - Function *Fsincos = dyn_cast_or_null<Function>(getFunction(M, nf)); + FunctionCallee Fsincos = getFunction(M, nf); if (!Fsincos) return false; BasicBlock::iterator ItOld = B.GetInsertPoint(); @@ -1342,7 +1361,7 @@ B.SetInsertPoint(UI); Value *P = Alloc; - Type *PTy = Fsincos->getFunctionType()->getParamType(1); + Type *PTy = Fsincos.getFunctionType()->getParamType(1); // The allocaInst allocates the memory in private address space. This needs // to be bitcasted to point to the address space of cos pointer type. // In OpenCL 2.0 this is generic, while in 1.2 that is private. @@ -1356,12 +1375,12 @@ if (!isSin) { // CI->cos, UI->sin B.SetInsertPoint(&*ItOld); UI->replaceAllUsesWith(&*Call); - Instruction *Reload = B.CreateLoad(Alloc); + Instruction *Reload = B.CreateLoad(Alloc->getAllocatedType(), Alloc); CI->replaceAllUsesWith(Reload); UI->eraseFromParent(); CI->eraseFromParent(); } else { // CI->sin, UI->cos - Instruction *Reload = B.CreateLoad(Alloc); + Instruction *Reload = B.CreateLoad(Alloc->getAllocatedType(), Alloc); UI->replaceAllUsesWith(Reload); CI->replaceAllUsesWith(Call); UI->eraseFromParent(); @@ -1370,6 +1389,29 @@ return true; } +bool AMDGPULibCalls::fold_wavefrontsize(CallInst *CI, IRBuilder<> &B) { + if (!TM) + return false; + + StringRef CPU = TM->getTargetCPU(); + StringRef Features = TM->getTargetFeatureString(); + if ((CPU.empty() || CPU.equals_lower("generic")) && + (Features.empty() || + Features.find_lower("wavefrontsize") == StringRef::npos)) + return false; + + Function *F = CI->getParent()->getParent(); + const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(*F); + unsigned N = ST.getWavefrontSize(); + + LLVM_DEBUG(errs() << "AMDIC: fold_wavefrontsize (" << *CI << ") with " + << N << "\n"); + + CI->replaceAllUsesWith(ConstantInt::get(B.getInt32Ty(), N)); + CI->eraseFromParent(); + return true; +} + // Get insertion point at entry. BasicBlock::iterator AMDGPULibCalls::getEntryIns(CallInst * UI) { Function * Func = UI->getParent()->getParent(); @@ -1679,8 +1721,9 @@ bool AMDGPULibCalls::evaluateCall(CallInst *aCI, FuncInfo &FInfo) { } // Public interface to the Simplify LibCalls pass. -FunctionPass *llvm::createAMDGPUSimplifyLibCallsPass(const TargetOptions &Opt) { - return new AMDGPUSimplifyLibCalls(Opt); +FunctionPass *llvm::createAMDGPUSimplifyLibCallsPass(const TargetOptions &Opt, + const TargetMachine *TM) { + return new AMDGPUSimplifyLibCalls(Opt, TM); } FunctionPass *llvm::createAMDGPUUseNativeCallsPass() { diff --git a/lib/Target/AMDGPU/AMDGPULibFunc.cpp b/lib/Target/AMDGPU/AMDGPULibFunc.cpp index 4fc3fe0f105b..a5bac25701a0 100644 --- a/lib/Target/AMDGPU/AMDGPULibFunc.cpp +++ b/lib/Target/AMDGPU/AMDGPULibFunc.cpp @@ -1,9 +1,8 @@ //===-- AMDGPULibFunc.cpp -------------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -64,6 +63,8 @@ struct ManglingRule { int getNumLeads() const { return (Lead[0] ? 1 : 0) + (Lead[1] ? 1 : 0); } unsigned getNumArgs() const; + + static StringMap<int> buildManglingRulesMap(); }; // Information about library functions with unmangled names. @@ -77,16 +78,7 @@ class UnmangledFuncInfo { // Number of entries in Table. static const unsigned TableSize; - // Map function name to index. - class NameMap : public StringMap<unsigned> { - public: - NameMap() { - for (unsigned I = 0; I != TableSize; ++I) - (*this)[Table[I].Name] = I; - } - }; - friend class NameMap; - static NameMap Map; + static StringMap<unsigned> buildNameMap(); public: using ID = AMDGPULibFunc::EFuncId; @@ -102,7 +94,8 @@ public: static_cast<unsigned>(AMDGPULibFunc::EI_LAST_MANGLED); } static ID toFuncId(unsigned Index) { - assert(Index < TableSize && "Invalid unmangled library function"); + assert(Index < TableSize && + "Invalid unmangled library function"); return static_cast<ID>( Index + 1 + static_cast<unsigned>(AMDGPULibFunc::EI_LAST_MANGLED)); } @@ -350,18 +343,7 @@ const UnmangledFuncInfo UnmangledFuncInfo::Table[] = { }; const unsigned UnmangledFuncInfo::TableSize = - sizeof(UnmangledFuncInfo::Table) / sizeof(UnmangledFuncInfo::Table[0]); - -UnmangledFuncInfo::NameMap UnmangledFuncInfo::Map; - -static const struct ManglingRulesMap : public StringMap<int> { - ManglingRulesMap() - : StringMap<int>(sizeof(manglingRules)/sizeof(manglingRules[0])) { - int Id = 0; - for (auto Rule : manglingRules) - insert({ Rule.Name, Id++ }); - } -} manglingRulesMap; + array_lengthof(UnmangledFuncInfo::Table); static AMDGPULibFunc::Param getRetType(AMDGPULibFunc::EFuncId id, const AMDGPULibFunc::Param (&Leads)[2]) { @@ -569,7 +551,17 @@ static AMDGPULibFunc::ENamePrefix parseNamePrefix(StringRef& mangledName) { return Pfx; } +StringMap<int> ManglingRule::buildManglingRulesMap() { + StringMap<int> Map(array_lengthof(manglingRules)); + int Id = 0; + for (auto Rule : manglingRules) + Map.insert({Rule.Name, Id++}); + return Map; +} + bool AMDGPUMangledLibFunc::parseUnmangledName(StringRef FullName) { + static const StringMap<int> manglingRulesMap = + ManglingRule::buildManglingRulesMap(); FuncId = static_cast<EFuncId>(manglingRulesMap.lookup(FullName)); return FuncId != EI_NONE; } @@ -961,8 +953,8 @@ Function *AMDGPULibFunc::getFunction(Module *M, const AMDGPULibFunc &fInfo) { return nullptr; } -Function *AMDGPULibFunc::getOrInsertFunction(Module *M, - const AMDGPULibFunc &fInfo) { +FunctionCallee AMDGPULibFunc::getOrInsertFunction(Module *M, + const AMDGPULibFunc &fInfo) { std::string const FuncName = fInfo.mangle(); Function *F = dyn_cast_or_null<Function>( M->getValueSymbolTable().lookup(FuncName)); @@ -988,7 +980,7 @@ Function *AMDGPULibFunc::getOrInsertFunction(Module *M, } } - Constant *C = nullptr; + FunctionCallee C; if (hasPtr) { // Do not set extra attributes for functions with pointer arguments. 
C = M->getOrInsertFunction(FuncName, FuncTy); @@ -1002,10 +994,18 @@ Function *AMDGPULibFunc::getOrInsertFunction(Module *M, C = M->getOrInsertFunction(FuncName, FuncTy, Attr); } - return cast<Function>(C); + return C; +} + +StringMap<unsigned> UnmangledFuncInfo::buildNameMap() { + StringMap<unsigned> Map; + for (unsigned I = 0; I != TableSize; ++I) + Map[Table[I].Name] = I; + return Map; } bool UnmangledFuncInfo::lookup(StringRef Name, ID &Id) { + static const StringMap<unsigned> Map = buildNameMap(); auto Loc = Map.find(Name); if (Loc != Map.end()) { Id = toFuncId(Loc->second); diff --git a/lib/Target/AMDGPU/AMDGPULibFunc.h b/lib/Target/AMDGPU/AMDGPULibFunc.h index fe062384800a..2354ed7df205 100644 --- a/lib/Target/AMDGPU/AMDGPULibFunc.h +++ b/lib/Target/AMDGPU/AMDGPULibFunc.h @@ -1,9 +1,8 @@ //===-- AMDGPULibFunc.h ----------------------------------------*- C++ -*--===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -394,8 +393,8 @@ public: } static Function *getFunction(llvm::Module *M, const AMDGPULibFunc &fInfo); - static Function *getOrInsertFunction(llvm::Module *M, - const AMDGPULibFunc &fInfo); + static FunctionCallee getOrInsertFunction(llvm::Module *M, + const AMDGPULibFunc &fInfo); static bool parse(StringRef MangledName, AMDGPULibFunc &Ptr); private: diff --git a/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp b/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp index 2cec8fe53283..15032969890e 100644 --- a/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp +++ b/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp @@ -1,9 +1,8 @@ //===-- AMDGPULowerIntrinsics.cpp -----------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp index 743dc7a0d00b..5dd5b3691e0a 100644 --- a/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp +++ b/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp @@ -1,9 +1,8 @@ //===-- AMDGPULowerKernelArguments.cpp ------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -110,8 +109,9 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) { // modes on SI to know the high bits are 0 so pointer adds don't wrap. We // can't represent this with range metadata because it's only allowed for // integer types. 
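// buildNameMap() and buildManglingRulesMap() above move table-to-map setup
// out of global constructors and into function-local statics built once, on
// first lookup. The pattern in miniature, with std::unordered_map standing
// in for llvm::StringMap and a made-up three-entry table:
#include <cassert>
#include <string>
#include <unordered_map>

struct Entry { const char *Name; };
static const Entry Table[] = {{"clamp"}, {"max"}, {"min"}};

static const std::unordered_map<std::string, unsigned> &nameMap() {
  // Thread-safe lazy initialization; no static constructor runs at startup.
  static const std::unordered_map<std::string, unsigned> Map = [] {
    std::unordered_map<std::string, unsigned> M;
    for (unsigned I = 0; I != sizeof(Table) / sizeof(Table[0]); ++I)
      M[Table[I].Name] = I;
    return M;
  }();
  return Map;
}

int main() {
  assert(nameMap().at("max") == 1);
  assert(nameMap().count("pow") == 0);
}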
- if (PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && - ST.getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) + if ((PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || + PT->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) && + !ST.hasUsableDSOffset()) continue; // FIXME: We can replace this with equivalent alias.scope/noalias @@ -132,6 +132,7 @@ KernArgBaseAlign); Value *ArgPtr; + Type *AdjustedArgTy; if (DoShiftOpt) { // FIXME: Handle aggregate types // Since we don't have sub-dword scalar loads, avoid doing an extload by // loading earlier than the argument address, and extracting the relevant // bits. // // Additionally widen any sub-dword load to i32 even if suitably aligned, // so that CSE between different argument loads works easily. - ArgPtr = Builder.CreateConstInBoundsGEP1_64( - KernArgSegment, - AlignDownOffset, - Arg.getName() + ".kernarg.offset.align.down"); - ArgPtr = Builder.CreateBitCast(ArgPtr, - Builder.getInt32Ty()->getPointerTo(AS), - ArgPtr->getName() + ".cast"); + ArgPtr = Builder.CreateConstInBoundsGEP1_64( + Builder.getInt8Ty(), KernArgSegment, AlignDownOffset, + Arg.getName() + ".kernarg.offset.align.down"); + AdjustedArgTy = Builder.getInt32Ty(); } else { ArgPtr = Builder.CreateConstInBoundsGEP1_64( - KernArgSegment, - EltOffset, - Arg.getName() + ".kernarg.offset"); - ArgPtr = Builder.CreateBitCast(ArgPtr, ArgTy->getPointerTo(AS), - ArgPtr->getName() + ".cast"); + Builder.getInt8Ty(), KernArgSegment, EltOffset, + Arg.getName() + ".kernarg.offset"); + AdjustedArgTy = ArgTy; } if (IsV3 && Size >= 32) { V4Ty = VectorType::get(VT->getVectorElementType(), 4); // Use the hack that clang uses to avoid SelectionDAG ruining v3 loads - ArgPtr = Builder.CreateBitCast(ArgPtr, V4Ty->getPointerTo(AS)); + AdjustedArgTy = V4Ty; } - LoadInst *Load = Builder.CreateAlignedLoad(ArgPtr, AdjustedAlign); + ArgPtr = Builder.CreateBitCast(ArgPtr, AdjustedArgTy->getPointerTo(AS), + ArgPtr->getName() + ".cast"); + LoadInst *Load = + Builder.CreateAlignedLoad(AdjustedArgTy, ArgPtr, AdjustedAlign); Load->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(Ctx, {})); MDBuilder MDB(Ctx); diff --git a/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp b/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp index a43dcef4cf0b..00e12f808783 100644 --- a/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp +++ b/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp @@ -1,9 +1,8 @@ //===-- AMDGPULowerKernelAttributes.cpp ------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp index f6bdbf5e9be2..ae4c32c258a7 100644 --- a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -1,9 +1,8 @@ //===- AMDGPUMCInstLower.cpp - Lower AMDGPU MachineInstr to an MCInst -----===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details.
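// The DoShiftOpt path above avoids a sub-dword extending load: it aligns the
// argument offset down to a dword, loads a full i32, and lets later shifts
// pick out the argument's bytes. A host sketch of the addressing, assuming a
// little-endian kernarg segment:
#include <cassert>
#include <cstdint>
#include <cstring>

static uint16_t loadShortKernArg(const uint8_t *KernArgSegment,
                                 uint64_t EltOffset) {
  uint64_t AlignDownOffset = EltOffset & ~UINT64_C(3); // dword-align downward
  uint32_t Dword;
  std::memcpy(&Dword, KernArgSegment + AlignDownOffset, 4); // one i32 load
  unsigned ShiftBits = unsigned(EltOffset & 3) * 8;
  return uint16_t(Dword >> ShiftBits);
}

int main() {
  const uint8_t Buf[8] = {0xAA, 0xBB, 0x34, 0x12, 0, 0, 0, 0};
  assert(loadShortKernArg(Buf, 2) == 0x1234); // i16 argument at byte offset 2
}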
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -16,7 +15,7 @@ #include "AMDGPUAsmPrinter.h" #include "AMDGPUSubtarget.h" #include "AMDGPUTargetMachine.h" -#include "InstPrinter/AMDGPUInstPrinter.h" +#include "MCTargetDesc/AMDGPUInstPrinter.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "R600AsmPrinter.h" #include "SIInstrInfo.h" @@ -91,6 +90,10 @@ static MCSymbolRefExpr::VariantKind getVariantKind(unsigned MOFlags) { return MCSymbolRefExpr::VK_AMDGPU_REL32_LO; case SIInstrInfo::MO_REL32_HI: return MCSymbolRefExpr::VK_AMDGPU_REL32_HI; + case SIInstrInfo::MO_ABS32_LO: + return MCSymbolRefExpr::VK_AMDGPU_ABS32_LO; + case SIInstrInfo::MO_ABS32_HI: + return MCSymbolRefExpr::VK_AMDGPU_ABS32_HI; } } @@ -101,17 +104,22 @@ const MCExpr *AMDGPUMCInstLower::getLongBranchBlockExpr( = MCSymbolRefExpr::create(MO.getMBB()->getSymbol(), Ctx); const MCExpr *SrcBBSym = MCSymbolRefExpr::create(SrcBB.getSymbol(), Ctx); - assert(SrcBB.front().getOpcode() == AMDGPU::S_GETPC_B64 && - ST.getInstrInfo()->get(AMDGPU::S_GETPC_B64).Size == 4); + // FIXME: The first half of this assert should be removed. This should + // probably be PC relative instead of using the source block symbol, and + // therefore the indirect branch expansion should use a bundle. + assert( + skipDebugInstructionsForward(SrcBB.begin(), SrcBB.end())->getOpcode() == + AMDGPU::S_GETPC_B64 && + ST.getInstrInfo()->get(AMDGPU::S_GETPC_B64).Size == 4); // s_getpc_b64 returns the address of next instruction. const MCConstantExpr *One = MCConstantExpr::create(4, Ctx); SrcBBSym = MCBinaryExpr::createAdd(SrcBBSym, One, Ctx); - if (MO.getTargetFlags() == AMDGPU::TF_LONG_BRANCH_FORWARD) + if (MO.getTargetFlags() == SIInstrInfo::MO_LONG_BRANCH_FORWARD) return MCBinaryExpr::createSub(DestBBSym, SrcBBSym, Ctx); - assert(MO.getTargetFlags() == AMDGPU::TF_LONG_BRANCH_BACKWARD); + assert(MO.getTargetFlags() == SIInstrInfo::MO_LONG_BRANCH_BACKWARD); return MCBinaryExpr::createSub(SrcBBSym, DestBBSym, Ctx); } @@ -142,10 +150,13 @@ bool AMDGPUMCInstLower::lowerOperand(const MachineOperand &MO, SmallString<128> SymbolName; AP.getNameWithPrefix(SymbolName, GV); MCSymbol *Sym = Ctx.getOrCreateSymbol(SymbolName); - const MCExpr *SymExpr = + const MCExpr *Expr = MCSymbolRefExpr::create(Sym, getVariantKind(MO.getTargetFlags()),Ctx); - const MCExpr *Expr = MCBinaryExpr::createAdd(SymExpr, - MCConstantExpr::create(MO.getOffset(), Ctx), Ctx); + int64_t Offset = MO.getOffset(); + if (Offset != 0) { + Expr = MCBinaryExpr::createAdd(Expr, + MCConstantExpr::create(Offset, Ctx), Ctx); + } MCOp = MCOperand::createExpr(Expr); return true; } @@ -321,14 +332,13 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { } #endif - if (STI.dumpCode()) { - // Disassemble instruction/operands to text. 
+ if (DumpCodeInstEmitter) { + // Disassemble instruction/operands to text DisasmLines.resize(DisasmLines.size() + 1); std::string &DisasmLine = DisasmLines.back(); raw_string_ostream DisasmStream(DisasmLine); - AMDGPUInstPrinter InstPrinter(*TM.getMCAsmInfo(), - *STI.getInstrInfo(), + AMDGPUInstPrinter InstPrinter(*TM.getMCAsmInfo(), *STI.getInstrInfo(), *STI.getRegisterInfo()); InstPrinter.printInst(&TmpInst, DisasmStream, StringRef(), STI); @@ -337,10 +347,8 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { SmallVector<char, 16> CodeBytes; raw_svector_ostream CodeStream(CodeBytes); - auto &ObjStreamer = static_cast<MCObjectStreamer&>(*OutStreamer); - MCCodeEmitter &InstEmitter = ObjStreamer.getAssembler().getEmitter(); - InstEmitter.encodeInstruction(TmpInst, CodeStream, Fixups, - MF->getSubtarget<MCSubtargetInfo>()); + DumpCodeInstEmitter->encodeInstruction( + TmpInst, CodeStream, Fixups, MF->getSubtarget<MCSubtargetInfo>()); HexLines.resize(HexLines.size() + 1); std::string &HexLine = HexLines.back(); raw_string_ostream HexStream(HexLine); diff --git a/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp b/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp index 6f44e2dbb2d5..237490957058 100644 --- a/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp +++ b/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp @@ -1,9 +1,8 @@ //===- AMDGPUMachineCFGStructurizer.cpp - Machine code if conversion pass. ===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp index 13b4b50149ce..0d3a1f1a769f 100644 --- a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp +++ b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp @@ -1,9 +1,8 @@ //===-- AMDGPUMachineFunctionInfo.cpp ---------------------------------------=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -30,13 +29,13 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) : // except reserved size is not correctly aligned. 
const Function &F = MF.getFunction(); - if (auto *Resolver = MF.getMMI().getResolver()) { - if (AMDGPUPerfHintAnalysis *PHA = static_cast<AMDGPUPerfHintAnalysis*>( - Resolver->getAnalysisIfAvailable(&AMDGPUPerfHintAnalysisID, true))) { - MemoryBound = PHA->isMemoryBound(&F); - WaveLimiter = PHA->needsWaveLimiter(&F); - } - } + Attribute MemBoundAttr = F.getFnAttribute("amdgpu-memory-bound"); + MemoryBound = MemBoundAttr.isStringAttribute() && + MemBoundAttr.getValueAsString() == "true"; + + Attribute WaveLimitAttr = F.getFnAttribute("amdgpu-wave-limiter"); + WaveLimiter = WaveLimitAttr.isStringAttribute() && + WaveLimitAttr.getValueAsString() == "true"; CallingConv::ID CC = F.getCallingConv(); if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) diff --git a/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/lib/Target/AMDGPU/AMDGPUMachineFunction.h index 8d6b871bc03e..52987e2fa411 100644 --- a/lib/Target/AMDGPU/AMDGPUMachineFunction.h +++ b/lib/Target/AMDGPU/AMDGPUMachineFunction.h @@ -1,9 +1,8 @@ //===-- AMDGPUMachineFunctionInfo.h -------------------------------*- C++ -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp b/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp index 7b9f673c418c..4d9f08b3af01 100644 --- a/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp @@ -1,9 +1,8 @@ //===--- AMDGPUMachineModuleInfo.cpp ----------------------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -24,6 +23,16 @@ AMDGPUMachineModuleInfo::AMDGPUMachineModuleInfo(const MachineModuleInfo &MMI) AgentSSID = CTX.getOrInsertSyncScopeID("agent"); WorkgroupSSID = CTX.getOrInsertSyncScopeID("workgroup"); WavefrontSSID = CTX.getOrInsertSyncScopeID("wavefront"); + SystemOneAddressSpaceSSID = + CTX.getOrInsertSyncScopeID("one-as"); + AgentOneAddressSpaceSSID = + CTX.getOrInsertSyncScopeID("agent-one-as"); + WorkgroupOneAddressSpaceSSID = + CTX.getOrInsertSyncScopeID("workgroup-one-as"); + WavefrontOneAddressSpaceSSID = + CTX.getOrInsertSyncScopeID("wavefront-one-as"); + SingleThreadOneAddressSpaceSSID = + CTX.getOrInsertSyncScopeID("singlethread-one-as"); } } // end namespace llvm diff --git a/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h b/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h index 1219ab26fb69..2b0b8b42acfe 100644 --- a/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h +++ b/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h @@ -1,9 +1,8 @@ //===--- AMDGPUMachineModuleInfo.h ------------------------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -30,12 +29,22 @@ private: // All supported memory/synchronization scopes can be found here: // http://llvm.org/docs/AMDGPUUsage.html#memory-scopes - /// Agent synchronization scope ID. + /// Agent synchronization scope ID (cross address space). SyncScope::ID AgentSSID; - /// Workgroup synchronization scope ID. + /// Workgroup synchronization scope ID (cross address space). SyncScope::ID WorkgroupSSID; - /// Wavefront synchronization scope ID. + /// Wavefront synchronization scope ID (cross address space). SyncScope::ID WavefrontSSID; + /// System synchronization scope ID (single address space). + SyncScope::ID SystemOneAddressSpaceSSID; + /// Agent synchronization scope ID (single address space). + SyncScope::ID AgentOneAddressSpaceSSID; + /// Workgroup synchronization scope ID (single address space). + SyncScope::ID WorkgroupOneAddressSpaceSSID; + /// Wavefront synchronization scope ID (single address space). + SyncScope::ID WavefrontOneAddressSpaceSSID; + /// Single thread synchronization scope ID (single address space). + SyncScope::ID SingleThreadOneAddressSpaceSSID; /// In AMDGPU target synchronization scopes are inclusive, meaning a /// larger synchronization scope is inclusive of a smaller synchronization @@ -44,35 +53,70 @@ private: /// \returns \p SSID's inclusion ordering, or "None" if \p SSID is not /// supported by the AMDGPU target. Optional<uint8_t> getSyncScopeInclusionOrdering(SyncScope::ID SSID) const { - if (SSID == SyncScope::SingleThread) + if (SSID == SyncScope::SingleThread || + SSID == getSingleThreadOneAddressSpaceSSID()) return 0; - else if (SSID == getWavefrontSSID()) + else if (SSID == getWavefrontSSID() || + SSID == getWavefrontOneAddressSpaceSSID()) return 1; - else if (SSID == getWorkgroupSSID()) + else if (SSID == getWorkgroupSSID() || + SSID == getWorkgroupOneAddressSpaceSSID()) return 2; - else if (SSID == getAgentSSID()) + else if (SSID == getAgentSSID() || + SSID == getAgentOneAddressSpaceSSID()) return 3; - else if (SSID == SyncScope::System) + else if (SSID == SyncScope::System || + SSID == getSystemOneAddressSpaceSSID()) return 4; return None; } + /// \returns True if \p SSID is restricted to single address space, false + /// otherwise + bool isOneAddressSpace(SyncScope::ID SSID) const { + return SSID == getSingleThreadOneAddressSpaceSSID() || + SSID == getWavefrontOneAddressSpaceSSID() || + SSID == getWorkgroupOneAddressSpaceSSID() || + SSID == getAgentOneAddressSpaceSSID() || + SSID == getSystemOneAddressSpaceSSID(); + } + public: AMDGPUMachineModuleInfo(const MachineModuleInfo &MMI); - /// \returns Agent synchronization scope ID. + /// \returns Agent synchronization scope ID (cross address space). SyncScope::ID getAgentSSID() const { return AgentSSID; } - /// \returns Workgroup synchronization scope ID. + /// \returns Workgroup synchronization scope ID (cross address space). SyncScope::ID getWorkgroupSSID() const { return WorkgroupSSID; } - /// \returns Wavefront synchronization scope ID. + /// \returns Wavefront synchronization scope ID (cross address space). SyncScope::ID getWavefrontSSID() const { return WavefrontSSID; } + /// \returns System synchronization scope ID (single address space). 
+ SyncScope::ID getSystemOneAddressSpaceSSID() const { + return SystemOneAddressSpaceSSID; + } + /// \returns Agent synchronization scope ID (single address space). + SyncScope::ID getAgentOneAddressSpaceSSID() const { + return AgentOneAddressSpaceSSID; + } + /// \returns Workgroup synchronization scope ID (single address space). + SyncScope::ID getWorkgroupOneAddressSpaceSSID() const { + return WorkgroupOneAddressSpaceSSID; + } + /// \returns Wavefront synchronization scope ID (single address space). + SyncScope::ID getWavefrontOneAddressSpaceSSID() const { + return WavefrontOneAddressSpaceSSID; + } + /// \returns Single thread synchronization scope ID (single address space). + SyncScope::ID getSingleThreadOneAddressSpaceSSID() const { + return SingleThreadOneAddressSpaceSSID; + } /// In AMDGPU target synchronization scopes are inclusive, meaning a /// larger synchronization scope is inclusive of a smaller synchronization @@ -88,7 +132,11 @@ public: if (!AIO || !BIO) return None; - return AIO.getValue() > BIO.getValue(); + bool IsAOneAddressSpace = isOneAddressSpace(A); + bool IsBOneAddressSpace = isOneAddressSpace(B); + + return AIO.getValue() >= BIO.getValue() && + (IsAOneAddressSpace == IsBOneAddressSpace || !IsAOneAddressSpace); } }; diff --git a/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp b/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp index 5e0b7d429022..8c11230f411a 100644 --- a/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp +++ b/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp @@ -1,9 +1,8 @@ //===--- AMDGPUMacroFusion.cpp - AMDGPU Macro Fusion ----------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/AMDGPUMacroFusion.h b/lib/Target/AMDGPU/AMDGPUMacroFusion.h index 844958580a65..da4b3cf8bc24 100644 --- a/lib/Target/AMDGPU/AMDGPUMacroFusion.h +++ b/lib/Target/AMDGPU/AMDGPUMacroFusion.h @@ -1,9 +1,8 @@ //===- AMDGPUMacroFusion.h - AMDGPU Macro Fusion ----------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp b/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp index 7bd8533a0ccf..f7231471c107 100644 --- a/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp @@ -1,9 +1,8 @@ //===- AMDGPUOpenCLEnqueuedBlockLowering.cpp - Lower enqueued block -------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
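The ordering comparison rewritten above (the `>=` check at the end of the AMDGPUMachineModuleInfo.h hunk) is easiest to verify against concrete cases: it yields true when A's inclusion rank is at least B's, and additionally requires that A not be a single-address-space scope unless B is one as well. A few illustrative outcomes, using the scope names registered in AMDGPUMachineModuleInfo.cpp and the ranks from getSyncScopeInclusionOrdering (not an exhaustive table):

// A = "agent" (rank 3, cross-AS),  B = "workgroup-one-as" (rank 2)  -> true
// A = "agent-one-as" (rank 3),     B = "workgroup" (rank 2, cross)  -> false
// A = "agent-one-as",              B = "agent-one-as"               -> true
// A = "workgroup" (rank 2),        B = "agent" (rank 3)             -> false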
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -120,11 +119,11 @@ bool AMDGPUOpenCLEnqueuedBlockLowering::runOnModule(Module &M) { auto T = ArrayType::get(Type::getInt64Ty(C), 2); auto *GV = new GlobalVariable( M, T, - /*IsConstant=*/false, GlobalValue::ExternalLinkage, + /*isConstant=*/false, GlobalValue::ExternalLinkage, /*Initializer=*/Constant::getNullValue(T), RuntimeHandle, /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::GLOBAL_ADDRESS, - /*IsExternallyInitialized=*/false); + /*isExternallyInitialized=*/false); LLVM_DEBUG(dbgs() << "runtime handle created: " << *GV << '\n'); for (auto U : F.users()) { diff --git a/lib/Target/AMDGPU/AMDGPUPTNote.h b/lib/Target/AMDGPU/AMDGPUPTNote.h index 2feff14d34a1..8b69f51c1a0d 100644 --- a/lib/Target/AMDGPU/AMDGPUPTNote.h +++ b/lib/Target/AMDGPU/AMDGPUPTNote.h @@ -1,9 +1,8 @@ //===-- AMDGPUNoteType.h - AMDGPU ELF PT_NOTE section info-------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp b/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp index e53a8fe7c074..9613d5a843b3 100644 --- a/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp +++ b/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp @@ -1,9 +1,8 @@ //===- AMDGPUPerfHintAnalysis.cpp - analysis of functions memory traffic --===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -18,6 +17,7 @@ #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/TargetPassConfig.h" @@ -72,7 +72,7 @@ public: const TargetLowering *TLI_) : FIM(FIM_), DL(nullptr), TLI(TLI_) {} - void runOnFunction(Function &F); + bool runOnFunction(Function &F); private: struct MemAccessInfo { @@ -101,7 +101,7 @@ private: const TargetLowering *TLI; - void visit(const Function &F); + AMDGPUPerfHintAnalysis::FuncInfo *visit(const Function &F); static bool isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &F); static bool needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &F); @@ -203,12 +203,8 @@ bool AMDGPUPerfHint::isIndirectAccess(const Instruction *Inst) const { return false; } -void AMDGPUPerfHint::visit(const Function &F) { - auto FIP = FIM.insert(std::make_pair(&F, AMDGPUPerfHintAnalysis::FuncInfo())); - if (!FIP.second) - return; - - AMDGPUPerfHintAnalysis::FuncInfo &FI = FIP.first->second; +AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) { + AMDGPUPerfHintAnalysis::FuncInfo &FI = FIM[&F]; LLVM_DEBUG(dbgs() << "[AMDGPUPerfHint] process " << F.getName() << '\n'); @@ -234,10 +230,10 @@ void AMDGPUPerfHint::visit(const Function &F) { if (&F == Callee) // Handle immediate recursion continue; - visit(*Callee); auto Loc = FIM.find(Callee); + if (Loc == FIM.end()) + continue; - assert(Loc != FIM.end() && "No func info"); FI.MemInstCount += Loc->second.MemInstCount; FI.InstCount += Loc->second.InstCount; FI.IAMInstCount += Loc->second.IAMInstCount; @@ -257,36 +253,39 @@ void AMDGPUPerfHint::visit(const Function &F) { } } } -} -void AMDGPUPerfHint::runOnFunction(Function &F) { - if (FIM.find(&F) != FIM.end()) - return; + return &FI; +} +bool AMDGPUPerfHint::runOnFunction(Function &F) { const Module &M = *F.getParent(); DL = &M.getDataLayout(); - visit(F); - auto Loc = FIM.find(&F); + if (F.hasFnAttribute("amdgpu-wave-limiter") && + F.hasFnAttribute("amdgpu-memory-bound")) + return false; + + const AMDGPUPerfHintAnalysis::FuncInfo *Info = visit(F); - assert(Loc != FIM.end() && "No func info"); - LLVM_DEBUG(dbgs() << F.getName() << " MemInst: " << Loc->second.MemInstCount + LLVM_DEBUG(dbgs() << F.getName() << " MemInst: " << Info->MemInstCount << '\n' - << " IAMInst: " << Loc->second.IAMInstCount << '\n' - << " LSMInst: " << Loc->second.LSMInstCount << '\n' - << " TotalInst: " << Loc->second.InstCount << '\n'); - - auto &FI = Loc->second; + << " IAMInst: " << Info->IAMInstCount << '\n' + << " LSMInst: " << Info->LSMInstCount << '\n' + << " TotalInst: " << Info->InstCount << '\n'); - if (isMemBound(FI)) { + if (isMemBound(*Info)) { LLVM_DEBUG(dbgs() << F.getName() << " is memory bound\n"); NumMemBound++; + F.addFnAttr("amdgpu-memory-bound", "true"); } - if (AMDGPU::isEntryFunctionCC(F.getCallingConv()) && needLimitWave(FI)) { + if (AMDGPU::isEntryFunctionCC(F.getCallingConv()) && needLimitWave(*Info)) { LLVM_DEBUG(dbgs() << F.getName() << " needs limit wave\n"); NumLimitWave++; + F.addFnAttr("amdgpu-wave-limiter", "true"); } + + return true; } bool AMDGPUPerfHint::isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &FI) { @@ -365,17 +364,27 @@ bool AMDGPUPerfHint::MemAccessInfo::isLargeStride( } } // namespace -bool 
AMDGPUPerfHintAnalysis::runOnFunction(Function &F) { +bool AMDGPUPerfHintAnalysis::runOnSCC(CallGraphSCC &SCC) { auto *TPC = getAnalysisIfAvailable<TargetPassConfig>(); if (!TPC) return false; const TargetMachine &TM = TPC->getTM<TargetMachine>(); - const TargetSubtargetInfo *ST = TM.getSubtargetImpl(F); - AMDGPUPerfHint Analyzer(FIM, ST->getTargetLowering()); - Analyzer.runOnFunction(F); - return false; + bool Changed = false; + for (CallGraphNode *I : SCC) { + Function *F = I->getFunction(); + if (!F || F->isDeclaration()) + continue; + + const TargetSubtargetInfo *ST = TM.getSubtargetImpl(*F); + AMDGPUPerfHint Analyzer(FIM, ST->getTargetLowering()); + + if (Analyzer.runOnFunction(*F)) + Changed = true; + } + + return Changed; } bool AMDGPUPerfHintAnalysis::isMemoryBound(const Function *F) const { diff --git a/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h b/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h index be7f37cb6815..9599e09fbd96 100644 --- a/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h +++ b/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h @@ -1,9 +1,8 @@ -//===- AMDGPUPerfHintAnalysis.h - analysis of functions memory traffic ----===// +//===- AMDGPUPerfHintAnalysis.h ---- analysis of memory traffic -*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -15,18 +14,20 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_MDGPUPERFHINTANALYSIS_H #define LLVM_LIB_TARGET_AMDGPU_MDGPUPERFHINTANALYSIS_H + +#include "llvm/Analysis/CallGraphSCCPass.h" #include "llvm/IR/ValueMap.h" #include "llvm/Pass.h" namespace llvm { -struct AMDGPUPerfHintAnalysis : public FunctionPass { +struct AMDGPUPerfHintAnalysis : public CallGraphSCCPass { static char ID; public: - AMDGPUPerfHintAnalysis() : FunctionPass(ID) {} + AMDGPUPerfHintAnalysis() : CallGraphSCCPass(ID) {} - bool runOnFunction(Function &F) override; + bool runOnSCC(CallGraphSCC &SCC) override; void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesAll(); diff --git a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 5d087c099184..e4c9d6685d4a 100644 --- a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -1,9 +1,8 @@ //===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
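The conversion of AMDGPUPerfHintAnalysis to a CallGraphSCCPass above relies on the pass manager visiting SCCs bottom-up, so a callee's FuncInfo is normally stored in FIM before any caller sums it; the new FIM.find(Callee) early-continue then only triggers for declarations and recursive cycles. A comment-level sketch of that traversal contract, illustrative rather than taken from the patch:

// Call graph: kern -> helperA -> helperB
// SCC visit order (bottom-up): { helperB }, { helperA }, { kern }
//   visit helperB: count its instructions, store FIM[helperB]
//   visit helperA: own counts + FIM[helperB] (already present)
//   visit kern:    own counts + FIM[helperA]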
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -163,12 +162,16 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { bool SufficientLDS = hasSufficientLocalMem(F); bool Changed = false; BasicBlock &EntryBB = *F.begin(); - for (auto I = EntryBB.begin(), E = EntryBB.end(); I != E; ) { - AllocaInst *AI = dyn_cast<AllocaInst>(I); - ++I; - if (AI) - Changed |= handleAlloca(*AI, SufficientLDS); + SmallVector<AllocaInst *, 16> Allocas; + for (Instruction &I : EntryBB) { + if (AllocaInst *AI = dyn_cast<AllocaInst>(&I)) + Allocas.push_back(AI); + } + + for (AllocaInst *AI : Allocas) { + if (handleAlloca(*AI, SufficientLDS)) + Changed = true; } return Changed; @@ -245,11 +248,11 @@ AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) { // We could do a single 64-bit load here, but it's likely that the basic // 32-bit and extract sequence is already present, and it is probably easier // to CSE this. The loads should be mergable later anyway. - Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 1); - LoadInst *LoadXY = Builder.CreateAlignedLoad(GEPXY, 4); + Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(I32Ty, CastDispatchPtr, 1); + LoadInst *LoadXY = Builder.CreateAlignedLoad(I32Ty, GEPXY, 4); - Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 2); - LoadInst *LoadZU = Builder.CreateAlignedLoad(GEPZU, 4); + Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(I32Ty, CastDispatchPtr, 2); + LoadInst *LoadZU = Builder.CreateAlignedLoad(I32Ty, GEPZU, 4); MDNode *MD = MDNode::get(Mod->getContext(), None); LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD); @@ -427,7 +430,7 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) { Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx); Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy); - Value *VecValue = Builder.CreateLoad(BitCast); + Value *VecValue = Builder.CreateLoad(VectorTy, BitCast); Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index); Inst->replaceAllUsesWith(ExtractElement); Inst->eraseFromParent(); @@ -442,7 +445,7 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) { Value *Ptr = SI->getPointerOperand(); Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx); Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy); - Value *VecValue = Builder.CreateLoad(BitCast); + Value *VecValue = Builder.CreateLoad(VectorTy, BitCast); Value *NewVecValue = Builder.CreateInsertElement(VecValue, SI->getValueOperand(), Index); @@ -919,7 +922,8 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) { ); CallInst *NewCall = Builder.CreateCall( - ObjectSize, {Src, Intr->getOperand(1), Intr->getOperand(2)}); + ObjectSize, + {Src, Intr->getOperand(1), Intr->getOperand(2), Intr->getOperand(3)}); Intr->replaceAllUsesWith(NewCall); Intr->eraseFromParent(); continue; diff --git a/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp b/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp new file mode 100644 index 000000000000..7a7addd0f5cf --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp @@ -0,0 +1,336 @@ +//===--- AMDGPUPropagateAttributes.cpp --------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief This pass propagates attributes from kernels to the non-entry +/// functions. Most of the library functions were not compiled for a specific +/// ABI, yet they will be compiled correctly if proper attributes are +/// propagated from the caller. +/// +/// The pass analyzes the call graph and propagates ABI target features +/// through it. +/// +/// It can run in two modes: as a function or module pass. A function pass +/// simply propagates attributes. A module pass clones functions if there are +/// callers with different ABIs. If a function is cloned, all call sites will +/// be updated to use the correct clone. +/// +/// A function pass is limited in functionality but can run early in the +/// pipeline. A module pass is more powerful but has to run late, so it misses +/// library folding opportunities. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Module.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include <string> + +#define DEBUG_TYPE "amdgpu-propagate-attributes" + +using namespace llvm; + +namespace llvm { +extern const SubtargetFeatureKV AMDGPUFeatureKV[AMDGPU::NumSubtargetFeatures-1]; +} + +namespace { + +class AMDGPUPropagateAttributes { + const FeatureBitset TargetFeatures = { + AMDGPU::FeatureWavefrontSize16, + AMDGPU::FeatureWavefrontSize32, + AMDGPU::FeatureWavefrontSize64 + }; + + class Clone { + public: + Clone(FeatureBitset FeatureMask, Function *OrigF, Function *NewF) : + FeatureMask(FeatureMask), OrigF(OrigF), NewF(NewF) {} + + FeatureBitset FeatureMask; + Function *OrigF; + Function *NewF; + }; + + const TargetMachine *TM; + + // Clone functions as needed or just set attributes. + bool AllowClone; + + // Attribute propagation roots. + SmallSet<Function *, 32> Roots; + + // Clones of functions with their attributes. + SmallVector<Clone, 32> Clones; + + // Find a clone with required features. + Function *findFunction(const FeatureBitset &FeaturesNeeded, + Function *OrigF); + + // Clone function F and set NewFeatures on the clone. + // The clone takes the name of the original function. + Function *cloneWithFeatures(Function &F, + const FeatureBitset &NewFeatures); + + // Set new function's features in place. + void setFeatures(Function &F, const FeatureBitset &NewFeatures); + + std::string getFeatureString(const FeatureBitset &Features) const; + + // Propagate attributes from Roots. + bool process(); + +public: + AMDGPUPropagateAttributes(const TargetMachine *TM, bool AllowClone) : + TM(TM), AllowClone(AllowClone) {} + + // Use F as a root and propagate its attributes. + bool process(Function &F); + + // Propagate attributes starting from kernel functions. + bool process(Module &M); +}; + +// Propagates attributes early, but no cloning is allowed, as this must be a +// function pass to run before any optimizations. +// TODO: We should only need one instance of the module pass, but that needs +// to be in the linker pipeline, which is currently not possible.
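For intuition before the pass machinery below: consider a kernel compiled with wave64 that calls a library helper built without an explicit wavefront size. A hypothetical before/after, not taken from the patch:

// Before propagation (function attributes, abbreviated):
//   @kern   : "target-features"="+wavefrontsize64"   <- entry function, a root
//   @helper : no wavefront-size feature
//
// After the early (function) pass: @helper also carries "+wavefrontsize64",
// applied via setFeatures() below. The late (module) pass would instead
// clone @helper if a second caller required "+wavefrontsize32", and then
// point each call site at the matching clone.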
+class AMDGPUPropagateAttributesEarly : public FunctionPass { + const TargetMachine *TM; + +public: + static char ID; // Pass identification + + AMDGPUPropagateAttributesEarly(const TargetMachine *TM = nullptr) : + FunctionPass(ID), TM(TM) { + initializeAMDGPUPropagateAttributesEarlyPass( + *PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override; +}; + +// Propagates attributes with cloning, but does that late in the pipeline. +class AMDGPUPropagateAttributesLate : public ModulePass { + const TargetMachine *TM; + +public: + static char ID; // Pass identification + + AMDGPUPropagateAttributesLate(const TargetMachine *TM = nullptr) : + ModulePass(ID), TM(TM) { + initializeAMDGPUPropagateAttributesLatePass( + *PassRegistry::getPassRegistry()); + } + + bool runOnModule(Module &M) override; +}; + +} // end anonymous namespace. + +char AMDGPUPropagateAttributesEarly::ID = 0; +char AMDGPUPropagateAttributesLate::ID = 0; + +INITIALIZE_PASS(AMDGPUPropagateAttributesEarly, + "amdgpu-propagate-attributes-early", + "Early propagate attributes from kernels to functions", + false, false) +INITIALIZE_PASS(AMDGPUPropagateAttributesLate, + "amdgpu-propagate-attributes-late", + "Late propagate attributes from kernels to functions", + false, false) + +Function * +AMDGPUPropagateAttributes::findFunction(const FeatureBitset &FeaturesNeeded, + Function *OrigF) { + // TODO: search for clone's clones. + for (Clone &C : Clones) + if (C.OrigF == OrigF && FeaturesNeeded == C.FeatureMask) + return C.NewF; + + return nullptr; +} + +bool AMDGPUPropagateAttributes::process(Module &M) { + for (auto &F : M.functions()) + if (AMDGPU::isEntryFunctionCC(F.getCallingConv())) + Roots.insert(&F); + + return process(); +} + +bool AMDGPUPropagateAttributes::process(Function &F) { + Roots.insert(&F); + return process(); +} + +bool AMDGPUPropagateAttributes::process() { + bool Changed = false; + SmallSet<Function *, 32> NewRoots; + SmallSet<Function *, 32> Replaced; + + if (Roots.empty()) + return false; + Module &M = *(*Roots.begin())->getParent(); + + do { + Roots.insert(NewRoots.begin(), NewRoots.end()); + NewRoots.clear(); + + for (auto &F : M.functions()) { + if (F.isDeclaration() || Roots.count(&F)) + continue; + + const FeatureBitset &CalleeBits = + TM->getSubtargetImpl(F)->getFeatureBits(); + SmallVector<std::pair<CallBase *, Function *>, 32> ToReplace; + + for (User *U : F.users()) { + Instruction *I = dyn_cast<Instruction>(U); + if (!I) + continue; + CallBase *CI = dyn_cast<CallBase>(I); + if (!CI) + continue; + Function *Caller = CI->getCaller(); + if (!Caller) + continue; + if (!Roots.count(Caller)) + continue; + + const FeatureBitset &CallerBits = + TM->getSubtargetImpl(*Caller)->getFeatureBits() & TargetFeatures; + + if (CallerBits == (CalleeBits & TargetFeatures)) { + NewRoots.insert(&F); + continue; + } + + Function *NewF = findFunction(CallerBits, &F); + if (!NewF) { + FeatureBitset NewFeatures((CalleeBits & ~TargetFeatures) | + CallerBits); + if (!AllowClone) { + // This may set different features on different iterations if + // there is a contradiction in callers' attributes. In this case + // we rely on a second pass running on Module, which is allowed + // to clone.
+ setFeatures(F, NewFeatures); + NewRoots.insert(&F); + Changed = true; + break; + } + + NewF = cloneWithFeatures(F, NewFeatures); + Clones.push_back(Clone(CallerBits, &F, NewF)); + NewRoots.insert(NewF); + } + + ToReplace.push_back(std::make_pair(CI, NewF)); + Replaced.insert(&F); + + Changed = true; + } + + while (!ToReplace.empty()) { + auto R = ToReplace.pop_back_val(); + R.first->setCalledFunction(R.second); + } + } + } while (!NewRoots.empty()); + + for (Function *F : Replaced) { + if (F->use_empty()) + F->eraseFromParent(); + } + + return Changed; +} + +Function * +AMDGPUPropagateAttributes::cloneWithFeatures(Function &F, + const FeatureBitset &NewFeatures) { + LLVM_DEBUG(dbgs() << "Cloning " << F.getName() << '\n'); + + ValueToValueMapTy dummy; + Function *NewF = CloneFunction(&F, dummy); + setFeatures(*NewF, NewFeatures); + + // Swap names. If this is the only clone it will retain the name of the + // now-dead original value. + if (F.hasName()) { + std::string NewName = NewF->getName(); + NewF->takeName(&F); + F.setName(NewName); + + // The name has changed, so it does not need an external symbol. + F.setVisibility(GlobalValue::DefaultVisibility); + F.setLinkage(GlobalValue::InternalLinkage); + } + + return NewF; +} + +void AMDGPUPropagateAttributes::setFeatures(Function &F, + const FeatureBitset &NewFeatures) { + std::string NewFeatureStr = getFeatureString(NewFeatures); + + LLVM_DEBUG(dbgs() << "Set features " + << getFeatureString(NewFeatures & TargetFeatures) + << " on " << F.getName() << '\n'); + + F.removeFnAttr("target-features"); + F.addFnAttr("target-features", NewFeatureStr); +} + +std::string +AMDGPUPropagateAttributes::getFeatureString(const FeatureBitset &Features) const +{ + std::string Ret; + for (const SubtargetFeatureKV &KV : AMDGPUFeatureKV) { + if (Features[KV.Value]) + Ret += (StringRef("+") + KV.Key + ",").str(); + else if (TargetFeatures[KV.Value]) + Ret += (StringRef("-") + KV.Key + ",").str(); + } + Ret.pop_back(); // Remove the trailing comma.
+ return Ret; +} + +bool AMDGPUPropagateAttributesEarly::runOnFunction(Function &F) { + if (!TM || !AMDGPU::isEntryFunctionCC(F.getCallingConv())) + return false; + + return AMDGPUPropagateAttributes(TM, false).process(F); +} + +bool AMDGPUPropagateAttributesLate::runOnModule(Module &M) { + if (!TM) + return false; + + return AMDGPUPropagateAttributes(TM, true).process(M); +} + +FunctionPass +*llvm::createAMDGPUPropagateAttributesEarlyPass(const TargetMachine *TM) { + return new AMDGPUPropagateAttributesEarly(TM); +} + +ModulePass +*llvm::createAMDGPUPropagateAttributesLatePass(const TargetMachine *TM) { + return new AMDGPUPropagateAttributesLate(TM); +} diff --git a/lib/Target/AMDGPU/AMDGPURegAsmNames.inc.cpp b/lib/Target/AMDGPU/AMDGPURegAsmNames.inc.cpp deleted file mode 100644 index 36d88f52910d..000000000000 --- a/lib/Target/AMDGPU/AMDGPURegAsmNames.inc.cpp +++ /dev/null @@ -1,353 +0,0 @@ -//===-- AMDGPURegAsmNames.inc - Register asm names ----------*- C++ -*-----===// - -#ifdef AMDGPU_REG_ASM_NAMES - -static const char *const VGPR32RegNames[] = { - "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", - "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", - "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28", "v29", "v30", "v31", "v32", "v33", "v34", "v35", - "v36", "v37", "v38", "v39", "v40", "v41", "v42", "v43", "v44", - "v45", "v46", "v47", "v48", "v49", "v50", "v51", "v52", "v53", - "v54", "v55", "v56", "v57", "v58", "v59", "v60", "v61", "v62", - "v63", "v64", "v65", "v66", "v67", "v68", "v69", "v70", "v71", - "v72", "v73", "v74", "v75", "v76", "v77", "v78", "v79", "v80", - "v81", "v82", "v83", "v84", "v85", "v86", "v87", "v88", "v89", - "v90", "v91", "v92", "v93", "v94", "v95", "v96", "v97", "v98", - "v99", "v100", "v101", "v102", "v103", "v104", "v105", "v106", "v107", - "v108", "v109", "v110", "v111", "v112", "v113", "v114", "v115", "v116", - "v117", "v118", "v119", "v120", "v121", "v122", "v123", "v124", "v125", - "v126", "v127", "v128", "v129", "v130", "v131", "v132", "v133", "v134", - "v135", "v136", "v137", "v138", "v139", "v140", "v141", "v142", "v143", - "v144", "v145", "v146", "v147", "v148", "v149", "v150", "v151", "v152", - "v153", "v154", "v155", "v156", "v157", "v158", "v159", "v160", "v161", - "v162", "v163", "v164", "v165", "v166", "v167", "v168", "v169", "v170", - "v171", "v172", "v173", "v174", "v175", "v176", "v177", "v178", "v179", - "v180", "v181", "v182", "v183", "v184", "v185", "v186", "v187", "v188", - "v189", "v190", "v191", "v192", "v193", "v194", "v195", "v196", "v197", - "v198", "v199", "v200", "v201", "v202", "v203", "v204", "v205", "v206", - "v207", "v208", "v209", "v210", "v211", "v212", "v213", "v214", "v215", - "v216", "v217", "v218", "v219", "v220", "v221", "v222", "v223", "v224", - "v225", "v226", "v227", "v228", "v229", "v230", "v231", "v232", "v233", - "v234", "v235", "v236", "v237", "v238", "v239", "v240", "v241", "v242", - "v243", "v244", "v245", "v246", "v247", "v248", "v249", "v250", "v251", - "v252", "v253", "v254", "v255" -}; - -static const char *const SGPR32RegNames[] = { - "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7", "s8", "s9", - "s10", "s11", "s12", "s13", "s14", "s15", "s16", "s17", "s18", "s19", - "s20", "s21", "s22", "s23", "s24", "s25", "s26", "s27", "s28", "s29", - "s30", "s31", "s32", "s33", "s34", "s35", "s36", "s37", "s38", "s39", - "s40", "s41", "s42", "s43", "s44", "s45", "s46", "s47", "s48", "s49", - "s50", "s51", "s52", "s53", "s54", "s55", "s56", "s57", "s58", "s59", - "s60", "s61", 
"s62", "s63", "s64", "s65", "s66", "s67", "s68", "s69", - "s70", "s71", "s72", "s73", "s74", "s75", "s76", "s77", "s78", "s79", - "s80", "s81", "s82", "s83", "s84", "s85", "s86", "s87", "s88", "s89", - "s90", "s91", "s92", "s93", "s94", "s95", "s96", "s97", "s98", "s99", - "s100", "s101", "s102", "s103" -}; - -static const char *const VGPR64RegNames[] = { - "v[0:1]", "v[1:2]", "v[2:3]", "v[3:4]", "v[4:5]", - "v[5:6]", "v[6:7]", "v[7:8]", "v[8:9]", "v[9:10]", - "v[10:11]", "v[11:12]", "v[12:13]", "v[13:14]", "v[14:15]", - "v[15:16]", "v[16:17]", "v[17:18]", "v[18:19]", "v[19:20]", - "v[20:21]", "v[21:22]", "v[22:23]", "v[23:24]", "v[24:25]", - "v[25:26]", "v[26:27]", "v[27:28]", "v[28:29]", "v[29:30]", - "v[30:31]", "v[31:32]", "v[32:33]", "v[33:34]", "v[34:35]", - "v[35:36]", "v[36:37]", "v[37:38]", "v[38:39]", "v[39:40]", - "v[40:41]", "v[41:42]", "v[42:43]", "v[43:44]", "v[44:45]", - "v[45:46]", "v[46:47]", "v[47:48]", "v[48:49]", "v[49:50]", - "v[50:51]", "v[51:52]", "v[52:53]", "v[53:54]", "v[54:55]", - "v[55:56]", "v[56:57]", "v[57:58]", "v[58:59]", "v[59:60]", - "v[60:61]", "v[61:62]", "v[62:63]", "v[63:64]", "v[64:65]", - "v[65:66]", "v[66:67]", "v[67:68]", "v[68:69]", "v[69:70]", - "v[70:71]", "v[71:72]", "v[72:73]", "v[73:74]", "v[74:75]", - "v[75:76]", "v[76:77]", "v[77:78]", "v[78:79]", "v[79:80]", - "v[80:81]", "v[81:82]", "v[82:83]", "v[83:84]", "v[84:85]", - "v[85:86]", "v[86:87]", "v[87:88]", "v[88:89]", "v[89:90]", - "v[90:91]", "v[91:92]", "v[92:93]", "v[93:94]", "v[94:95]", - "v[95:96]", "v[96:97]", "v[97:98]", "v[98:99]", "v[99:100]", - "v[100:101]", "v[101:102]", "v[102:103]", "v[103:104]", "v[104:105]", - "v[105:106]", "v[106:107]", "v[107:108]", "v[108:109]", "v[109:110]", - "v[110:111]", "v[111:112]", "v[112:113]", "v[113:114]", "v[114:115]", - "v[115:116]", "v[116:117]", "v[117:118]", "v[118:119]", "v[119:120]", - "v[120:121]", "v[121:122]", "v[122:123]", "v[123:124]", "v[124:125]", - "v[125:126]", "v[126:127]", "v[127:128]", "v[128:129]", "v[129:130]", - "v[130:131]", "v[131:132]", "v[132:133]", "v[133:134]", "v[134:135]", - "v[135:136]", "v[136:137]", "v[137:138]", "v[138:139]", "v[139:140]", - "v[140:141]", "v[141:142]", "v[142:143]", "v[143:144]", "v[144:145]", - "v[145:146]", "v[146:147]", "v[147:148]", "v[148:149]", "v[149:150]", - "v[150:151]", "v[151:152]", "v[152:153]", "v[153:154]", "v[154:155]", - "v[155:156]", "v[156:157]", "v[157:158]", "v[158:159]", "v[159:160]", - "v[160:161]", "v[161:162]", "v[162:163]", "v[163:164]", "v[164:165]", - "v[165:166]", "v[166:167]", "v[167:168]", "v[168:169]", "v[169:170]", - "v[170:171]", "v[171:172]", "v[172:173]", "v[173:174]", "v[174:175]", - "v[175:176]", "v[176:177]", "v[177:178]", "v[178:179]", "v[179:180]", - "v[180:181]", "v[181:182]", "v[182:183]", "v[183:184]", "v[184:185]", - "v[185:186]", "v[186:187]", "v[187:188]", "v[188:189]", "v[189:190]", - "v[190:191]", "v[191:192]", "v[192:193]", "v[193:194]", "v[194:195]", - "v[195:196]", "v[196:197]", "v[197:198]", "v[198:199]", "v[199:200]", - "v[200:201]", "v[201:202]", "v[202:203]", "v[203:204]", "v[204:205]", - "v[205:206]", "v[206:207]", "v[207:208]", "v[208:209]", "v[209:210]", - "v[210:211]", "v[211:212]", "v[212:213]", "v[213:214]", "v[214:215]", - "v[215:216]", "v[216:217]", "v[217:218]", "v[218:219]", "v[219:220]", - "v[220:221]", "v[221:222]", "v[222:223]", "v[223:224]", "v[224:225]", - "v[225:226]", "v[226:227]", "v[227:228]", "v[228:229]", "v[229:230]", - "v[230:231]", "v[231:232]", "v[232:233]", "v[233:234]", "v[234:235]", - "v[235:236]", "v[236:237]", 
"v[237:238]", "v[238:239]", "v[239:240]", - "v[240:241]", "v[241:242]", "v[242:243]", "v[243:244]", "v[244:245]", - "v[245:246]", "v[246:247]", "v[247:248]", "v[248:249]", "v[249:250]", - "v[250:251]", "v[251:252]", "v[252:253]", "v[253:254]", "v[254:255]" -}; - -static const char *const VGPR96RegNames[] = { - "v[0:2]", "v[1:3]", "v[2:4]", "v[3:5]", "v[4:6]", - "v[5:7]", "v[6:8]", "v[7:9]", "v[8:10]", "v[9:11]", - "v[10:12]", "v[11:13]", "v[12:14]", "v[13:15]", "v[14:16]", - "v[15:17]", "v[16:18]", "v[17:19]", "v[18:20]", "v[19:21]", - "v[20:22]", "v[21:23]", "v[22:24]", "v[23:25]", "v[24:26]", - "v[25:27]", "v[26:28]", "v[27:29]", "v[28:30]", "v[29:31]", - "v[30:32]", "v[31:33]", "v[32:34]", "v[33:35]", "v[34:36]", - "v[35:37]", "v[36:38]", "v[37:39]", "v[38:40]", "v[39:41]", - "v[40:42]", "v[41:43]", "v[42:44]", "v[43:45]", "v[44:46]", - "v[45:47]", "v[46:48]", "v[47:49]", "v[48:50]", "v[49:51]", - "v[50:52]", "v[51:53]", "v[52:54]", "v[53:55]", "v[54:56]", - "v[55:57]", "v[56:58]", "v[57:59]", "v[58:60]", "v[59:61]", - "v[60:62]", "v[61:63]", "v[62:64]", "v[63:65]", "v[64:66]", - "v[65:67]", "v[66:68]", "v[67:69]", "v[68:70]", "v[69:71]", - "v[70:72]", "v[71:73]", "v[72:74]", "v[73:75]", "v[74:76]", - "v[75:77]", "v[76:78]", "v[77:79]", "v[78:80]", "v[79:81]", - "v[80:82]", "v[81:83]", "v[82:84]", "v[83:85]", "v[84:86]", - "v[85:87]", "v[86:88]", "v[87:89]", "v[88:90]", "v[89:91]", - "v[90:92]", "v[91:93]", "v[92:94]", "v[93:95]", "v[94:96]", - "v[95:97]", "v[96:98]", "v[97:99]", "v[98:100]", "v[99:101]", - "v[100:102]", "v[101:103]", "v[102:104]", "v[103:105]", "v[104:106]", - "v[105:107]", "v[106:108]", "v[107:109]", "v[108:110]", "v[109:111]", - "v[110:112]", "v[111:113]", "v[112:114]", "v[113:115]", "v[114:116]", - "v[115:117]", "v[116:118]", "v[117:119]", "v[118:120]", "v[119:121]", - "v[120:122]", "v[121:123]", "v[122:124]", "v[123:125]", "v[124:126]", - "v[125:127]", "v[126:128]", "v[127:129]", "v[128:130]", "v[129:131]", - "v[130:132]", "v[131:133]", "v[132:134]", "v[133:135]", "v[134:136]", - "v[135:137]", "v[136:138]", "v[137:139]", "v[138:140]", "v[139:141]", - "v[140:142]", "v[141:143]", "v[142:144]", "v[143:145]", "v[144:146]", - "v[145:147]", "v[146:148]", "v[147:149]", "v[148:150]", "v[149:151]", - "v[150:152]", "v[151:153]", "v[152:154]", "v[153:155]", "v[154:156]", - "v[155:157]", "v[156:158]", "v[157:159]", "v[158:160]", "v[159:161]", - "v[160:162]", "v[161:163]", "v[162:164]", "v[163:165]", "v[164:166]", - "v[165:167]", "v[166:168]", "v[167:169]", "v[168:170]", "v[169:171]", - "v[170:172]", "v[171:173]", "v[172:174]", "v[173:175]", "v[174:176]", - "v[175:177]", "v[176:178]", "v[177:179]", "v[178:180]", "v[179:181]", - "v[180:182]", "v[181:183]", "v[182:184]", "v[183:185]", "v[184:186]", - "v[185:187]", "v[186:188]", "v[187:189]", "v[188:190]", "v[189:191]", - "v[190:192]", "v[191:193]", "v[192:194]", "v[193:195]", "v[194:196]", - "v[195:197]", "v[196:198]", "v[197:199]", "v[198:200]", "v[199:201]", - "v[200:202]", "v[201:203]", "v[202:204]", "v[203:205]", "v[204:206]", - "v[205:207]", "v[206:208]", "v[207:209]", "v[208:210]", "v[209:211]", - "v[210:212]", "v[211:213]", "v[212:214]", "v[213:215]", "v[214:216]", - "v[215:217]", "v[216:218]", "v[217:219]", "v[218:220]", "v[219:221]", - "v[220:222]", "v[221:223]", "v[222:224]", "v[223:225]", "v[224:226]", - "v[225:227]", "v[226:228]", "v[227:229]", "v[228:230]", "v[229:231]", - "v[230:232]", "v[231:233]", "v[232:234]", "v[233:235]", "v[234:236]", - "v[235:237]", "v[236:238]", "v[237:239]", "v[238:240]", "v[239:241]", - 
"v[240:242]", "v[241:243]", "v[242:244]", "v[243:245]", "v[244:246]", - "v[245:247]", "v[246:248]", "v[247:249]", "v[248:250]", "v[249:251]", - "v[250:252]", "v[251:253]", "v[252:254]", "v[253:255]" -}; - -static const char *const VGPR128RegNames[] = { - "v[0:3]", "v[1:4]", "v[2:5]", "v[3:6]", "v[4:7]", - "v[5:8]", "v[6:9]", "v[7:10]", "v[8:11]", "v[9:12]", - "v[10:13]", "v[11:14]", "v[12:15]", "v[13:16]", "v[14:17]", - "v[15:18]", "v[16:19]", "v[17:20]", "v[18:21]", "v[19:22]", - "v[20:23]", "v[21:24]", "v[22:25]", "v[23:26]", "v[24:27]", - "v[25:28]", "v[26:29]", "v[27:30]", "v[28:31]", "v[29:32]", - "v[30:33]", "v[31:34]", "v[32:35]", "v[33:36]", "v[34:37]", - "v[35:38]", "v[36:39]", "v[37:40]", "v[38:41]", "v[39:42]", - "v[40:43]", "v[41:44]", "v[42:45]", "v[43:46]", "v[44:47]", - "v[45:48]", "v[46:49]", "v[47:50]", "v[48:51]", "v[49:52]", - "v[50:53]", "v[51:54]", "v[52:55]", "v[53:56]", "v[54:57]", - "v[55:58]", "v[56:59]", "v[57:60]", "v[58:61]", "v[59:62]", - "v[60:63]", "v[61:64]", "v[62:65]", "v[63:66]", "v[64:67]", - "v[65:68]", "v[66:69]", "v[67:70]", "v[68:71]", "v[69:72]", - "v[70:73]", "v[71:74]", "v[72:75]", "v[73:76]", "v[74:77]", - "v[75:78]", "v[76:79]", "v[77:80]", "v[78:81]", "v[79:82]", - "v[80:83]", "v[81:84]", "v[82:85]", "v[83:86]", "v[84:87]", - "v[85:88]", "v[86:89]", "v[87:90]", "v[88:91]", "v[89:92]", - "v[90:93]", "v[91:94]", "v[92:95]", "v[93:96]", "v[94:97]", - "v[95:98]", "v[96:99]", "v[97:100]", "v[98:101]", "v[99:102]", - "v[100:103]", "v[101:104]", "v[102:105]", "v[103:106]", "v[104:107]", - "v[105:108]", "v[106:109]", "v[107:110]", "v[108:111]", "v[109:112]", - "v[110:113]", "v[111:114]", "v[112:115]", "v[113:116]", "v[114:117]", - "v[115:118]", "v[116:119]", "v[117:120]", "v[118:121]", "v[119:122]", - "v[120:123]", "v[121:124]", "v[122:125]", "v[123:126]", "v[124:127]", - "v[125:128]", "v[126:129]", "v[127:130]", "v[128:131]", "v[129:132]", - "v[130:133]", "v[131:134]", "v[132:135]", "v[133:136]", "v[134:137]", - "v[135:138]", "v[136:139]", "v[137:140]", "v[138:141]", "v[139:142]", - "v[140:143]", "v[141:144]", "v[142:145]", "v[143:146]", "v[144:147]", - "v[145:148]", "v[146:149]", "v[147:150]", "v[148:151]", "v[149:152]", - "v[150:153]", "v[151:154]", "v[152:155]", "v[153:156]", "v[154:157]", - "v[155:158]", "v[156:159]", "v[157:160]", "v[158:161]", "v[159:162]", - "v[160:163]", "v[161:164]", "v[162:165]", "v[163:166]", "v[164:167]", - "v[165:168]", "v[166:169]", "v[167:170]", "v[168:171]", "v[169:172]", - "v[170:173]", "v[171:174]", "v[172:175]", "v[173:176]", "v[174:177]", - "v[175:178]", "v[176:179]", "v[177:180]", "v[178:181]", "v[179:182]", - "v[180:183]", "v[181:184]", "v[182:185]", "v[183:186]", "v[184:187]", - "v[185:188]", "v[186:189]", "v[187:190]", "v[188:191]", "v[189:192]", - "v[190:193]", "v[191:194]", "v[192:195]", "v[193:196]", "v[194:197]", - "v[195:198]", "v[196:199]", "v[197:200]", "v[198:201]", "v[199:202]", - "v[200:203]", "v[201:204]", "v[202:205]", "v[203:206]", "v[204:207]", - "v[205:208]", "v[206:209]", "v[207:210]", "v[208:211]", "v[209:212]", - "v[210:213]", "v[211:214]", "v[212:215]", "v[213:216]", "v[214:217]", - "v[215:218]", "v[216:219]", "v[217:220]", "v[218:221]", "v[219:222]", - "v[220:223]", "v[221:224]", "v[222:225]", "v[223:226]", "v[224:227]", - "v[225:228]", "v[226:229]", "v[227:230]", "v[228:231]", "v[229:232]", - "v[230:233]", "v[231:234]", "v[232:235]", "v[233:236]", "v[234:237]", - "v[235:238]", "v[236:239]", "v[237:240]", "v[238:241]", "v[239:242]", - "v[240:243]", "v[241:244]", "v[242:245]", "v[243:246]", 
"v[244:247]", - "v[245:248]", "v[246:249]", "v[247:250]", "v[248:251]", "v[249:252]", - "v[250:253]", "v[251:254]", "v[252:255]" -}; - -static const char *const VGPR256RegNames[] = { - "v[0:7]", "v[1:8]", "v[2:9]", "v[3:10]", "v[4:11]", - "v[5:12]", "v[6:13]", "v[7:14]", "v[8:15]", "v[9:16]", - "v[10:17]", "v[11:18]", "v[12:19]", "v[13:20]", "v[14:21]", - "v[15:22]", "v[16:23]", "v[17:24]", "v[18:25]", "v[19:26]", - "v[20:27]", "v[21:28]", "v[22:29]", "v[23:30]", "v[24:31]", - "v[25:32]", "v[26:33]", "v[27:34]", "v[28:35]", "v[29:36]", - "v[30:37]", "v[31:38]", "v[32:39]", "v[33:40]", "v[34:41]", - "v[35:42]", "v[36:43]", "v[37:44]", "v[38:45]", "v[39:46]", - "v[40:47]", "v[41:48]", "v[42:49]", "v[43:50]", "v[44:51]", - "v[45:52]", "v[46:53]", "v[47:54]", "v[48:55]", "v[49:56]", - "v[50:57]", "v[51:58]", "v[52:59]", "v[53:60]", "v[54:61]", - "v[55:62]", "v[56:63]", "v[57:64]", "v[58:65]", "v[59:66]", - "v[60:67]", "v[61:68]", "v[62:69]", "v[63:70]", "v[64:71]", - "v[65:72]", "v[66:73]", "v[67:74]", "v[68:75]", "v[69:76]", - "v[70:77]", "v[71:78]", "v[72:79]", "v[73:80]", "v[74:81]", - "v[75:82]", "v[76:83]", "v[77:84]", "v[78:85]", "v[79:86]", - "v[80:87]", "v[81:88]", "v[82:89]", "v[83:90]", "v[84:91]", - "v[85:92]", "v[86:93]", "v[87:94]", "v[88:95]", "v[89:96]", - "v[90:97]", "v[91:98]", "v[92:99]", "v[93:100]", "v[94:101]", - "v[95:102]", "v[96:103]", "v[97:104]", "v[98:105]", "v[99:106]", - "v[100:107]", "v[101:108]", "v[102:109]", "v[103:110]", "v[104:111]", - "v[105:112]", "v[106:113]", "v[107:114]", "v[108:115]", "v[109:116]", - "v[110:117]", "v[111:118]", "v[112:119]", "v[113:120]", "v[114:121]", - "v[115:122]", "v[116:123]", "v[117:124]", "v[118:125]", "v[119:126]", - "v[120:127]", "v[121:128]", "v[122:129]", "v[123:130]", "v[124:131]", - "v[125:132]", "v[126:133]", "v[127:134]", "v[128:135]", "v[129:136]", - "v[130:137]", "v[131:138]", "v[132:139]", "v[133:140]", "v[134:141]", - "v[135:142]", "v[136:143]", "v[137:144]", "v[138:145]", "v[139:146]", - "v[140:147]", "v[141:148]", "v[142:149]", "v[143:150]", "v[144:151]", - "v[145:152]", "v[146:153]", "v[147:154]", "v[148:155]", "v[149:156]", - "v[150:157]", "v[151:158]", "v[152:159]", "v[153:160]", "v[154:161]", - "v[155:162]", "v[156:163]", "v[157:164]", "v[158:165]", "v[159:166]", - "v[160:167]", "v[161:168]", "v[162:169]", "v[163:170]", "v[164:171]", - "v[165:172]", "v[166:173]", "v[167:174]", "v[168:175]", "v[169:176]", - "v[170:177]", "v[171:178]", "v[172:179]", "v[173:180]", "v[174:181]", - "v[175:182]", "v[176:183]", "v[177:184]", "v[178:185]", "v[179:186]", - "v[180:187]", "v[181:188]", "v[182:189]", "v[183:190]", "v[184:191]", - "v[185:192]", "v[186:193]", "v[187:194]", "v[188:195]", "v[189:196]", - "v[190:197]", "v[191:198]", "v[192:199]", "v[193:200]", "v[194:201]", - "v[195:202]", "v[196:203]", "v[197:204]", "v[198:205]", "v[199:206]", - "v[200:207]", "v[201:208]", "v[202:209]", "v[203:210]", "v[204:211]", - "v[205:212]", "v[206:213]", "v[207:214]", "v[208:215]", "v[209:216]", - "v[210:217]", "v[211:218]", "v[212:219]", "v[213:220]", "v[214:221]", - "v[215:222]", "v[216:223]", "v[217:224]", "v[218:225]", "v[219:226]", - "v[220:227]", "v[221:228]", "v[222:229]", "v[223:230]", "v[224:231]", - "v[225:232]", "v[226:233]", "v[227:234]", "v[228:235]", "v[229:236]", - "v[230:237]", "v[231:238]", "v[232:239]", "v[233:240]", "v[234:241]", - "v[235:242]", "v[236:243]", "v[237:244]", "v[238:245]", "v[239:246]", - "v[240:247]", "v[241:248]", "v[242:249]", "v[243:250]", "v[244:251]", - "v[245:252]", "v[246:253]", "v[247:254]", 
"v[248:255]" -}; - -static const char *const VGPR512RegNames[] = { - "v[0:15]", "v[1:16]", "v[2:17]", "v[3:18]", "v[4:19]", - "v[5:20]", "v[6:21]", "v[7:22]", "v[8:23]", "v[9:24]", - "v[10:25]", "v[11:26]", "v[12:27]", "v[13:28]", "v[14:29]", - "v[15:30]", "v[16:31]", "v[17:32]", "v[18:33]", "v[19:34]", - "v[20:35]", "v[21:36]", "v[22:37]", "v[23:38]", "v[24:39]", - "v[25:40]", "v[26:41]", "v[27:42]", "v[28:43]", "v[29:44]", - "v[30:45]", "v[31:46]", "v[32:47]", "v[33:48]", "v[34:49]", - "v[35:50]", "v[36:51]", "v[37:52]", "v[38:53]", "v[39:54]", - "v[40:55]", "v[41:56]", "v[42:57]", "v[43:58]", "v[44:59]", - "v[45:60]", "v[46:61]", "v[47:62]", "v[48:63]", "v[49:64]", - "v[50:65]", "v[51:66]", "v[52:67]", "v[53:68]", "v[54:69]", - "v[55:70]", "v[56:71]", "v[57:72]", "v[58:73]", "v[59:74]", - "v[60:75]", "v[61:76]", "v[62:77]", "v[63:78]", "v[64:79]", - "v[65:80]", "v[66:81]", "v[67:82]", "v[68:83]", "v[69:84]", - "v[70:85]", "v[71:86]", "v[72:87]", "v[73:88]", "v[74:89]", - "v[75:90]", "v[76:91]", "v[77:92]", "v[78:93]", "v[79:94]", - "v[80:95]", "v[81:96]", "v[82:97]", "v[83:98]", "v[84:99]", - "v[85:100]", "v[86:101]", "v[87:102]", "v[88:103]", "v[89:104]", - "v[90:105]", "v[91:106]", "v[92:107]", "v[93:108]", "v[94:109]", - "v[95:110]", "v[96:111]", "v[97:112]", "v[98:113]", "v[99:114]", - "v[100:115]", "v[101:116]", "v[102:117]", "v[103:118]", "v[104:119]", - "v[105:120]", "v[106:121]", "v[107:122]", "v[108:123]", "v[109:124]", - "v[110:125]", "v[111:126]", "v[112:127]", "v[113:128]", "v[114:129]", - "v[115:130]", "v[116:131]", "v[117:132]", "v[118:133]", "v[119:134]", - "v[120:135]", "v[121:136]", "v[122:137]", "v[123:138]", "v[124:139]", - "v[125:140]", "v[126:141]", "v[127:142]", "v[128:143]", "v[129:144]", - "v[130:145]", "v[131:146]", "v[132:147]", "v[133:148]", "v[134:149]", - "v[135:150]", "v[136:151]", "v[137:152]", "v[138:153]", "v[139:154]", - "v[140:155]", "v[141:156]", "v[142:157]", "v[143:158]", "v[144:159]", - "v[145:160]", "v[146:161]", "v[147:162]", "v[148:163]", "v[149:164]", - "v[150:165]", "v[151:166]", "v[152:167]", "v[153:168]", "v[154:169]", - "v[155:170]", "v[156:171]", "v[157:172]", "v[158:173]", "v[159:174]", - "v[160:175]", "v[161:176]", "v[162:177]", "v[163:178]", "v[164:179]", - "v[165:180]", "v[166:181]", "v[167:182]", "v[168:183]", "v[169:184]", - "v[170:185]", "v[171:186]", "v[172:187]", "v[173:188]", "v[174:189]", - "v[175:190]", "v[176:191]", "v[177:192]", "v[178:193]", "v[179:194]", - "v[180:195]", "v[181:196]", "v[182:197]", "v[183:198]", "v[184:199]", - "v[185:200]", "v[186:201]", "v[187:202]", "v[188:203]", "v[189:204]", - "v[190:205]", "v[191:206]", "v[192:207]", "v[193:208]", "v[194:209]", - "v[195:210]", "v[196:211]", "v[197:212]", "v[198:213]", "v[199:214]", - "v[200:215]", "v[201:216]", "v[202:217]", "v[203:218]", "v[204:219]", - "v[205:220]", "v[206:221]", "v[207:222]", "v[208:223]", "v[209:224]", - "v[210:225]", "v[211:226]", "v[212:227]", "v[213:228]", "v[214:229]", - "v[215:230]", "v[216:231]", "v[217:232]", "v[218:233]", "v[219:234]", - "v[220:235]", "v[221:236]", "v[222:237]", "v[223:238]", "v[224:239]", - "v[225:240]", "v[226:241]", "v[227:242]", "v[228:243]", "v[229:244]", - "v[230:245]", "v[231:246]", "v[232:247]", "v[233:248]", "v[234:249]", - "v[235:250]", "v[236:251]", "v[237:252]", "v[238:253]", "v[239:254]", - "v[240:255]" -}; - -static const char *const SGPR64RegNames[] = { - "s[0:1]", "s[2:3]", "s[4:5]", "s[6:7]", "s[8:9]", "s[10:11]", - "s[12:13]", "s[14:15]", "s[16:17]", "s[18:19]", "s[20:21]", "s[22:23]", - "s[24:25]", 
"s[26:27]", "s[28:29]", "s[30:31]", "s[32:33]", "s[34:35]", - "s[36:37]", "s[38:39]", "s[40:41]", "s[42:43]", "s[44:45]", "s[46:47]", - "s[48:49]", "s[50:51]", "s[52:53]", "s[54:55]", "s[56:57]", "s[58:59]", - "s[60:61]", "s[62:63]", "s[64:65]", "s[66:67]", "s[68:69]", "s[70:71]", - "s[72:73]", "s[74:75]", "s[76:77]", "s[78:79]", "s[80:81]", "s[82:83]", - "s[84:85]", "s[86:87]", "s[88:89]", "s[90:91]", "s[92:93]", "s[94:95]", - "s[96:97]", "s[98:99]", "s[100:101]", "s[102:103]" -}; - -static const char *const SGPR128RegNames[] = { - "s[0:3]", "s[4:7]", "s[8:11]", "s[12:15]", "s[16:19]", "s[20:23]", - "s[24:27]", "s[28:31]", "s[32:35]", "s[36:39]", "s[40:43]", "s[44:47]", - "s[48:51]", "s[52:55]", "s[56:59]", "s[60:63]", "s[64:67]", "s[68:71]", - "s[72:75]", "s[76:79]", "s[80:83]", "s[84:87]", "s[88:91]", "s[92:95]", - "s[96:99]", "s[100:103]" -}; - -static const char *const SGPR256RegNames[] = { - "s[0:7]", "s[4:11]", "s[8:15]", "s[12:19]", "s[16:23]", - "s[20:27]", "s[24:31]", "s[28:35]", "s[32:39]", "s[36:43]", - "s[40:47]", "s[44:51]", "s[48:55]", "s[52:59]", "s[56:63]", - "s[60:67]", "s[64:71]", "s[68:75]", "s[72:79]", "s[76:83]", - "s[80:87]", "s[84:91]", "s[88:95]", "s[92:99]", "s[96:103]" -}; - -static const char *const SGPR512RegNames[] = { - "s[0:15]", "s[4:19]", "s[8:23]", "s[12:27]", "s[16:31]", "s[20:35]", - "s[24:39]", "s[28:43]", "s[32:47]", "s[36:51]", "s[40:55]", "s[44:59]", - "s[48:63]", "s[52:67]", "s[56:71]", "s[60:75]", "s[64:79]", "s[68:83]", - "s[72:87]", "s[76:91]", "s[80:95]", "s[84:99]", "s[88:103]" -}; - -#endif diff --git a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 7a760dcf7a90..815cbc5e26ee 100644 --- a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -1,9 +1,8 @@ //===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file @@ -14,9 +13,13 @@ #include "AMDGPURegisterBankInfo.h" #include "AMDGPUInstrInfo.h" +#include "AMDGPUSubtarget.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" -#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" +#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/GlobalISel/RegisterBank.h" #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" #include "llvm/CodeGen/TargetRegisterInfo.h" @@ -31,6 +34,56 @@ using namespace llvm; +namespace { + +// Observer to apply a register bank to new registers created by LegalizerHelper. +class ApplyRegBankMapping final : public GISelChangeObserver { +private: + MachineRegisterInfo &MRI; + const RegisterBank *NewBank; + SmallVector<MachineInstr *, 4> NewInsts; + +public: + ApplyRegBankMapping(MachineRegisterInfo &MRI_, const RegisterBank *RB) + : MRI(MRI_), NewBank(RB) {} + + ~ApplyRegBankMapping() { + for (MachineInstr *MI : NewInsts) + applyBank(*MI); + } + + /// Set any registers that don't have a set register class or bank to SALU. 
+ void applyBank(MachineInstr &MI) { + for (MachineOperand &Op : MI.operands()) { + if (!Op.isReg()) + continue; + + Register Reg = Op.getReg(); + if (MRI.getRegClassOrRegBank(Reg)) + continue; + + const RegisterBank *RB = NewBank; + // FIXME: This might not be enough to detect when SCC should be used. + if (MRI.getType(Reg) == LLT::scalar(1)) + RB = (NewBank == &AMDGPU::SGPRRegBank ? + &AMDGPU::SCCRegBank : &AMDGPU::VCCRegBank); + + MRI.setRegBank(Reg, *RB); + } + } + + void erasingInstr(MachineInstr &MI) override {} + + void createdInstr(MachineInstr &MI) override { + // At this point, the instruction was just inserted and has no operands. + NewInsts.push_back(&MI); + } + + void changingInstr(MachineInstr &MI) override {} + void changedInstr(MachineInstr &MI) override {} +}; + +} AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const TargetRegisterInfo &TRI) : AMDGPUGenRegisterBankInfo(), TRI(static_cast<const SIRegisterInfo*>(&TRI)) { @@ -52,43 +105,62 @@ AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const TargetRegisterInfo &TRI) } -static bool isConstant(const MachineOperand &MO, int64_t &C) { - const MachineFunction *MF = MO.getParent()->getParent()->getParent(); - const MachineRegisterInfo &MRI = MF->getRegInfo(); - const MachineInstr *Def = MRI.getVRegDef(MO.getReg()); - if (!Def) - return false; - - if (Def->getOpcode() == AMDGPU::G_CONSTANT) { - C = Def->getOperand(1).getCImm()->getSExtValue(); - return true; - } - - if (Def->getOpcode() == AMDGPU::COPY) - return isConstant(Def->getOperand(1), C); - - return false; -} - unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst, const RegisterBank &Src, unsigned Size) const { + // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane? if (Dst.getID() == AMDGPU::SGPRRegBankID && Src.getID() == AMDGPU::VGPRRegBankID) { return std::numeric_limits<unsigned>::max(); } - // SGPRRegBank with size 1 is actually vcc or another 64-bit sgpr written by - // the valu. - if (Size == 1 && Dst.getID() == AMDGPU::SCCRegBankID && + // Bool values are tricky, because the meaning is based on context. The SCC + // and VCC banks are for the natural scalar and vector conditions produced by + // a compare. + // + // Legalization doesn't know about the necessary context, so an s1 use may + // have been a truncate from an arbitrary value, in which case a copy (lowered + // as a compare with 0) needs to be inserted. + if (Size == 1 && + (Dst.getID() == AMDGPU::SCCRegBankID || + Dst.getID() == AMDGPU::SGPRRegBankID) && (Src.getID() == AMDGPU::SGPRRegBankID || Src.getID() == AMDGPU::VGPRRegBankID || Src.getID() == AMDGPU::VCCRegBankID)) return std::numeric_limits<unsigned>::max(); + if (Dst.getID() == AMDGPU::SCCRegBankID && + Src.getID() == AMDGPU::VCCRegBankID) + return std::numeric_limits<unsigned>::max(); + return RegisterBankInfo::copyCost(Dst, Src, Size); } +unsigned AMDGPURegisterBankInfo::getBreakDownCost( + const ValueMapping &ValMapping, + const RegisterBank *CurBank) const { + // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to + // VGPR. + // FIXME: Is there a better way to do this? + if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64) + return 10; // This is expensive. 
+ + assert(ValMapping.NumBreakDowns == 2 && + ValMapping.BreakDown[0].Length == 32 && + ValMapping.BreakDown[0].StartIdx == 0 && + ValMapping.BreakDown[1].Length == 32 && + ValMapping.BreakDown[1].StartIdx == 32 && + ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank); + + // 32-bit extract of a 64-bit value is just access of a subregister, so free. + // TODO: Cost of 0 hits assert, though it's not clear it's what we really + // want. + + // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR + // alignment restrictions, but this probably isn't important. + return 1; +} + const RegisterBank &AMDGPURegisterBankInfo::getRegBankFromRegClass( const TargetRegisterClass &RC) const { @@ -98,6 +170,163 @@ const RegisterBank &AMDGPURegisterBankInfo::getRegBankFromRegClass( return getRegBank(AMDGPU::VGPRRegBankID); } +template <unsigned NumOps> +RegisterBankInfo::InstructionMappings +AMDGPURegisterBankInfo::addMappingFromTable( + const MachineInstr &MI, const MachineRegisterInfo &MRI, + const std::array<unsigned, NumOps> RegSrcOpIdx, + ArrayRef<OpRegBankEntry<NumOps>> Table) const { + + InstructionMappings AltMappings; + + SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands()); + + unsigned Sizes[NumOps]; + for (unsigned I = 0; I < NumOps; ++I) { + Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg(); + Sizes[I] = getSizeInBits(Reg, MRI, *TRI); + } + + for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) { + unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI); + Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI); + } + + unsigned MappingID = 0; + for (const auto &Entry : Table) { + for (unsigned I = 0; I < NumOps; ++I) { + int OpIdx = RegSrcOpIdx[I]; + Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]); + } + + AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost, + getOperandsMapping(Operands), + Operands.size())); + } + + return AltMappings; +} + +RegisterBankInfo::InstructionMappings +AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic( + const MachineInstr &MI, const MachineRegisterInfo &MRI) const { + switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) { + case Intrinsic::amdgcn_readlane: { + static const OpRegBankEntry<3> Table[2] = { + // Perfectly legal. + { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 }, + + // Need a readfirstlane for the index. + { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 } + }; + + const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } }; + return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table)); + } + case Intrinsic::amdgcn_writelane: { + static const OpRegBankEntry<4> Table[4] = { + // Perfectly legal. 
+ { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 }, + + // Need readfirstlane of first op + { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }, + + // Need readfirstlane of second op + { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }, + + // Need readfirstlane of both ops + { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 } + }; + + // dst, value, lane, old value + const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } }; + return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table)); + } + default: + return RegisterBankInfo::getInstrAlternativeMappings(MI); + } +} + +RegisterBankInfo::InstructionMappings +AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects( + const MachineInstr &MI, const MachineRegisterInfo &MRI) const { + + switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) { + case Intrinsic::amdgcn_buffer_load: { + static const OpRegBankEntry<3> Table[4] = { + // Perfectly legal. + { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 }, + { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 }, + + // Waterfall loop needed for rsrc. In the worst case this will execute + // approximately an extra 10 * wavesize + 2 instructions. + { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 }, + { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1000 } + }; + + // rsrc, voffset, offset + const std::array<unsigned, 3> RegSrcOpIdx = { { 2, 3, 4 } }; + return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table)); + } + case Intrinsic::amdgcn_s_buffer_load: { + static const OpRegBankEntry<2> Table[4] = { + // Perfectly legal. + { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 }, + + // Only need 1 register in loop + { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 }, + + // Have to waterfall the resource. + { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 }, + + // Have to waterfall the resource, and the offset. + { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 } + }; + + // rsrc, offset + const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } }; + return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table)); + } + case Intrinsic::amdgcn_ds_ordered_add: + case Intrinsic::amdgcn_ds_ordered_swap: { + // VGPR = M0, VGPR + static const OpRegBankEntry<3> Table[2] = { + // Perfectly legal. + { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 }, + + // Need a readfirstlane for m0 + { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 } + }; + + const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } }; + return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table)); + } + case Intrinsic::amdgcn_s_sendmsg: + case Intrinsic::amdgcn_s_sendmsghalt: { + static const OpRegBankEntry<1> Table[2] = { + // Perfectly legal.
+      { { AMDGPU::SGPRRegBankID }, 1 },
+
+      // Need readfirstlane
+      { { AMDGPU::VGPRRegBankID }, 3 }
+    };
+
+    const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
+    return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
+  }
+  default:
+    return RegisterBankInfo::getInstrAlternativeMappings(MI);
+  }
+}
+
+static bool isInstrUniform(const MachineInstr &MI) {
+  if (!MI.hasOneMemOperand())
+    return false;
+
+  const MachineMemOperand *MMO = *MI.memoperands_begin();
+  return AMDGPUInstrInfo::isUniformMMO(MMO);
+}
+
 RegisterBankInfo::InstructionMappings
 AMDGPURegisterBankInfo::getInstrAlternativeMappings(
     const MachineInstr &MI) const {
@@ -108,31 +337,102 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings(
   InstructionMappings AltMappings;
   switch (MI.getOpcode()) {
-  case TargetOpcode::G_LOAD: {
+  case TargetOpcode::G_AND:
+  case TargetOpcode::G_OR:
+  case TargetOpcode::G_XOR: {
     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
-    // FIXME: Should we be hard coding the size for these mappings?
-    const InstructionMapping &SSMapping = getInstructionMapping(
+
+    if (Size == 1) {
+      // s_{and|or|xor}_b32 sets scc when the result of the 32-bit op is not 0.
+      const InstructionMapping &SCCMapping = getInstructionMapping(
         1, 1, getOperandsMapping(
-          {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
-           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64)}),
-        2); // Num Operands
+          {AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, Size),
+           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
+           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
+        3); // Num Operands
+      AltMappings.push_back(&SCCMapping);
+
+      const InstructionMapping &SGPRMapping = getInstructionMapping(
+        1, 1, getOperandsMapping(
+          {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
+           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
+           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
+        3); // Num Operands
+      AltMappings.push_back(&SGPRMapping);
+
+      const InstructionMapping &VCCMapping0 = getInstructionMapping(
+        2, 10, getOperandsMapping(
+          {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
+           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
+           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
+        3); // Num Operands
+      AltMappings.push_back(&VCCMapping0);
+      return AltMappings;
+    }
+
+    if (Size != 64)
+      break;
+
+    const InstructionMapping &SSMapping = getInstructionMapping(
+      1, 1, getOperandsMapping(
+        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
+         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
+         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
+      3); // Num Operands
     AltMappings.push_back(&SSMapping);
 
     const InstructionMapping &VVMapping = getInstructionMapping(
+      2, 2, getOperandsMapping(
+        {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
+         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
+         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
+      3); // Num Operands
+    AltMappings.push_back(&VVMapping);
+
+    const InstructionMapping &SVMapping = getInstructionMapping(
+      3, 3, getOperandsMapping(
+        {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
+         AMDGPU::getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size),
+         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
+      3); // Num Operands
+    AltMappings.push_back(&SVMapping);
+
+    // SGPR in LHS is slightly preferable, so make VS more expensive than
+    // SV.
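+    // As an illustrative example (hypothetical MIR, not from this patch): for
+    //   %dst:vgpr(s64) = G_AND %lhs:sgpr(s64), %rhs:vgpr(s64)
+    // the SV mapping above matches the operands directly at cost 3, while the
+    // VS mapping defined next would also need a cross-bank repair of an
+    // operand on top of its cost of 4, so RegBankSelect keeps the SGPR
+    // operand on the LHS.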
+ const InstructionMapping &VSMapping = getInstructionMapping( + 3, 4, getOperandsMapping( + {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size), + AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size), + AMDGPU::getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size)}), + 3); // Num Operands + AltMappings.push_back(&VSMapping); + break; + } + case TargetOpcode::G_LOAD: { + unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); + LLT LoadTy = MRI.getType(MI.getOperand(0).getReg()); + // FIXME: Should we be hard coding the size for these mappings? + if (isInstrUniform(MI)) { + const InstructionMapping &SSMapping = getInstructionMapping( + 1, 1, getOperandsMapping( + {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), + AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64)}), + 2); // Num Operands + AltMappings.push_back(&SSMapping); + } + + const InstructionMapping &VVMapping = getInstructionMapping( 2, 1, getOperandsMapping( - {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), + {AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy), AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64)}), 2); // Num Operands AltMappings.push_back(&VVMapping); - // FIXME: Should this be the pointer-size (64-bits) or the size of the - // register that will hold the bufffer resourc (128-bits). - const InstructionMapping &VSMapping = getInstructionMapping( - 3, 1, getOperandsMapping( - {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), - AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64)}), - 2); // Num Operands - AltMappings.push_back(&VSMapping); + // It may be possible to have a vgpr = load sgpr mapping here, because + // the mubuf instructions support this kind of load, but probably for only + // gfx7 and older. However, the addressing mode matching in the instruction + // selector should be able to do a better job of detecting and selecting + // these kinds of loads from the vgpr = load vgpr mapping. return AltMappings; @@ -184,15 +484,32 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings( AltMappings.push_back(&SSMapping); const InstructionMapping &VVMapping = getInstructionMapping(2, 1, - getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), + getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size), AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), - AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), - AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size)}), + AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size), + AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}), 4); // Num Operands AltMappings.push_back(&VVMapping); return AltMappings; } + case TargetOpcode::G_SMIN: + case TargetOpcode::G_SMAX: + case TargetOpcode::G_UMIN: + case TargetOpcode::G_UMAX: { + static const OpRegBankEntry<3> Table[4] = { + { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 }, + { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 }, + { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 }, + + // Scalar requires cmp+select, and extends if 16-bit. 
+      // FIXME: Should there be separate costs for 32 and 16-bit?
+      { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 3 }
+    };
+
+    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 1, 2 } };
+    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
+  }
   case TargetOpcode::G_UADDE:
   case TargetOpcode::G_USUBE:
   case TargetOpcode::G_SADDE:
@@ -234,23 +551,816 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings(
     AltMappings.push_back(&VMapping);
     return AltMappings;
   }
+  case AMDGPU::G_INTRINSIC:
+    return getInstrAlternativeMappingsIntrinsic(MI, MRI);
+  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
+    return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
   default:
     break;
   }
   return RegisterBankInfo::getInstrAlternativeMappings(MI);
 }
 
-void AMDGPURegisterBankInfo::applyMappingImpl(
-    const OperandsMapper &OpdMapper) const {
-  return applyDefaultMapping(OpdMapper);
+void AMDGPURegisterBankInfo::split64BitValueForMapping(
+  MachineIRBuilder &B,
+  SmallVector<Register, 2> &Regs,
+  LLT HalfTy,
+  Register Reg) const {
+  assert(HalfTy.getSizeInBits() == 32);
+  MachineRegisterInfo *MRI = B.getMRI();
+  Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
+  Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
+  const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
+  MRI->setRegBank(LoLHS, *Bank);
+  MRI->setRegBank(HiLHS, *Bank);
+
+  Regs.push_back(LoLHS);
+  Regs.push_back(HiLHS);
+
+  B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
+    .addDef(LoLHS)
+    .addDef(HiLHS)
+    .addUse(Reg);
 }
 
-static bool isInstrUniform(const MachineInstr &MI) {
-  if (!MI.hasOneMemOperand())
+/// Replace the current type each register in \p Regs has with \p NewTy
+static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
+                          LLT NewTy) {
+  for (Register Reg : Regs) {
+    assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
+    MRI.setType(Reg, NewTy);
+  }
+}
+
+static LLT getHalfSizedType(LLT Ty) {
+  if (Ty.isVector()) {
+    assert(Ty.getNumElements() % 2 == 0);
+    return LLT::scalarOrVector(Ty.getNumElements() / 2, Ty.getElementType());
+  }
+
+  assert(Ty.getSizeInBits() % 2 == 0);
+  return LLT::scalar(Ty.getSizeInBits() / 2);
+}
+
+/// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
+/// any of the required SGPR operands are VGPRs, perform a waterfall loop to
+/// execute the instruction for each unique combination of values in all lanes
+/// in the wave. The block will be split such that the rest of the instructions
+/// are moved to a new block.
+///
+/// Essentially performs this loop:
+///
+/// Save Execution Mask
+/// For (Lane : Wavefront) {
+///   Enable Lane, Disable all other lanes
+///   SGPR = read SGPR value for current lane from VGPR
+///   VGPRResult[Lane] = use_op SGPR
+/// }
+/// Restore Execution Mask
+///
+/// There is additional complexity in comparing the operand values in order to
+/// identify the unique values actually used, so each unique value is only
+/// processed once.
+void AMDGPURegisterBankInfo::executeInWaterfallLoop(
+  MachineInstr &MI, MachineRegisterInfo &MRI,
+  ArrayRef<unsigned> OpIndices) const {
+  MachineFunction *MF = MI.getParent()->getParent();
+  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  MachineBasicBlock::iterator I(MI);
+
+  MachineBasicBlock &MBB = *MI.getParent();
+  const DebugLoc &DL = MI.getDebugLoc();
+
+  // Use a set to avoid extra readfirstlanes in the case where multiple operands
+  // are the same register.
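+  // Sketch of the structure built below for a single 32-bit VGPR operand
+  // (assumed shape with invented register names, not verbatim output):
+  //
+  //   entry:                                  ; original block
+  //     %initsave = IMPLICIT_DEF
+  //     %saveexec = S_MOV_B64_term $exec
+  //   loop:
+  //     %phiexec = PHI %initsave, entry, %newexec, loop
+  //     %slane = V_READFIRSTLANE_B32 %vop
+  //     %cond = V_CMP_EQ_U32_e64 %slane, %vop
+  //     ... MI, rewritten to use %slane ...
+  //     %newexec = S_AND_SAVEEXEC_B64 %cond
+  //     $exec = S_XOR_B64_term $exec, %newexec
+  //     S_CBRANCH_EXECNZ loop
+  //   restore:
+  //     $exec = S_MOV_B64_term %saveexec
+  //   remainder:
+  //     ... rest of the original block ...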
+  SmallSet<Register, 4> SGPROperandRegs;
+  for (unsigned Op : OpIndices) {
+    assert(MI.getOperand(Op).isUse());
+    Register Reg = MI.getOperand(Op).getReg();
+    const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
+    if (OpBank->getID() == AMDGPU::VGPRRegBankID)
+      SGPROperandRegs.insert(Reg);
+  }
+
+  // No operands need to be replaced, so no need to loop.
+  if (SGPROperandRegs.empty())
+    return;
+
+  MachineIRBuilder B(MI);
+  SmallVector<Register, 4> ResultRegs;
+  SmallVector<Register, 4> InitResultRegs;
+  SmallVector<Register, 4> PhiRegs;
+  for (MachineOperand &Def : MI.defs()) {
+    LLT ResTy = MRI.getType(Def.getReg());
+    const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI);
+    ResultRegs.push_back(Def.getReg());
+    Register InitReg = B.buildUndef(ResTy).getReg(0);
+    Register PhiReg = MRI.createGenericVirtualRegister(ResTy);
+    InitResultRegs.push_back(InitReg);
+    PhiRegs.push_back(PhiReg);
+    MRI.setRegBank(PhiReg, *DefBank);
+    MRI.setRegBank(InitReg, *DefBank);
+  }
+
+  Register SaveExecReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+  Register InitSaveExecReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+
+  // Don't bother using generic instructions/registers for the exec mask.
+  B.buildInstr(TargetOpcode::IMPLICIT_DEF)
+    .addDef(InitSaveExecReg);
+
+  Register PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+  Register NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+
+  // To insert the loop we need to split the block. Move everything from this
+  // point onward into a new (remainder) block, and insert the loop blocks
+  // between the original block and the remainder.
+  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
+  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
+  MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
+  MachineFunction::iterator MBBI(MBB);
+  ++MBBI;
+  MF->insert(MBBI, LoopBB);
+  MF->insert(MBBI, RestoreExecBB);
+  MF->insert(MBBI, RemainderBB);
+
+  LoopBB->addSuccessor(RestoreExecBB);
+  LoopBB->addSuccessor(LoopBB);
+
+  // Move the rest of the block into a new block.
+  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
+  RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
+
+  MBB.addSuccessor(LoopBB);
+  RestoreExecBB->addSuccessor(RemainderBB);
+
+  B.setInsertPt(*LoopBB, LoopBB->end());
+
+  B.buildInstr(TargetOpcode::PHI)
+    .addDef(PhiExec)
+    .addReg(InitSaveExecReg)
+    .addMBB(&MBB)
+    .addReg(NewExec)
+    .addMBB(LoopBB);
+
+  for (auto Result : zip(InitResultRegs, ResultRegs, PhiRegs)) {
+    B.buildInstr(TargetOpcode::G_PHI)
+      .addDef(std::get<2>(Result))
+      .addReg(std::get<0>(Result)) // Initial value / implicit_def
+      .addMBB(&MBB)
+      .addReg(std::get<1>(Result)) // Mid-loop value.
+      .addMBB(LoopBB);
+  }
+
+  // Move the instruction into the loop.
+  LoopBB->splice(LoopBB->end(), &MBB, I);
+  I = std::prev(LoopBB->end());
+
+  B.setInstr(*I);
+
+  Register CondReg;
+
+  for (MachineOperand &Op : MI.uses()) {
+    if (!Op.isReg())
+      continue;
+
+    assert(!Op.isDef());
+    if (SGPROperandRegs.count(Op.getReg())) {
+      LLT OpTy = MRI.getType(Op.getReg());
+      unsigned OpSize = OpTy.getSizeInBits();
+
+      // Can only do a readlane of 32-bit pieces.
+      if (OpSize == 32) {
+        // Avoid extra copies in the simple case of one 32-bit register.
+        Register CurrentLaneOpReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+        MRI.setType(CurrentLaneOpReg, OpTy);
+
+        constrainGenericRegister(Op.getReg(), AMDGPU::VGPR_32RegClass, MRI);
+        // Read the next variant <- also loop target.
+        BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentLaneOpReg)
+          .addReg(Op.getReg());
+
+        Register NewCondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+        bool First = CondReg == AMDGPU::NoRegister;
+        if (First)
+          CondReg = NewCondReg;
+
+        // Compare the just-read value to all possible Idx values.
+        B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64)
+          .addDef(NewCondReg)
+          .addReg(CurrentLaneOpReg)
+          .addReg(Op.getReg());
+        Op.setReg(CurrentLaneOpReg);
+
+        if (!First) {
+          Register AndReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+
+          // If there are multiple operands to consider, AND the conditions together.
+          B.buildInstr(AMDGPU::S_AND_B64)
+            .addDef(AndReg)
+            .addReg(NewCondReg)
+            .addReg(CondReg);
+          CondReg = AndReg;
+        }
+      } else {
+        LLT S32 = LLT::scalar(32);
+        SmallVector<Register, 8> ReadlanePieces;
+
+        // The compares can be done as 64-bit, but the extract needs to be done
+        // in 32-bit pieces.
+
+        bool Is64 = OpSize % 64 == 0;
+
+        LLT UnmergeTy = OpSize % 64 == 0 ? LLT::scalar(64) : LLT::scalar(32);
+        unsigned CmpOp = OpSize % 64 == 0 ? AMDGPU::V_CMP_EQ_U64_e64
+          : AMDGPU::V_CMP_EQ_U32_e64;
+
+        // Insert the unmerge before the loop.
+
+        B.setMBB(MBB);
+        auto Unmerge = B.buildUnmerge(UnmergeTy, Op.getReg());
+        B.setInstr(*I);
+
+        unsigned NumPieces = Unmerge->getNumOperands() - 1;
+        for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) {
+          unsigned UnmergePiece = Unmerge.getReg(PieceIdx);
+
+          Register CurrentLaneOpReg;
+          if (Is64) {
+            Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32);
+            Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32);
+
+            MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass);
+            MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass);
+            MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass);
+
+            // Read the next variant <- also loop target.
+            BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
+                    CurrentLaneOpRegLo)
+              .addReg(UnmergePiece, 0, AMDGPU::sub0);
+
+            // Read the next variant <- also loop target.
+            BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
+                    CurrentLaneOpRegHi)
+              .addReg(UnmergePiece, 0, AMDGPU::sub1);
+
+            CurrentLaneOpReg =
+              B.buildMerge(LLT::scalar(64),
+                           {CurrentLaneOpRegLo, CurrentLaneOpRegHi})
+              .getReg(0);
+
+            MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass);
+
+            if (OpTy.getScalarSizeInBits() == 64) {
+              // If we need to produce a 64-bit element vector, use the
+              // merged pieces.
+              ReadlanePieces.push_back(CurrentLaneOpReg);
+            } else {
+              // 32-bit element type.
+              ReadlanePieces.push_back(CurrentLaneOpRegLo);
+              ReadlanePieces.push_back(CurrentLaneOpRegHi);
+            }
+          } else {
+            CurrentLaneOpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
+            MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass);
+            MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass);
+
+            // Read the next variant <- also loop target.
+            BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
+                    CurrentLaneOpReg)
+              .addReg(UnmergePiece);
+            ReadlanePieces.push_back(CurrentLaneOpReg);
+          }
+
+          Register NewCondReg
+            = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+          bool First = CondReg == AMDGPU::NoRegister;
+          if (First)
+            CondReg = NewCondReg;
+
+          B.buildInstr(CmpOp)
+            .addDef(NewCondReg)
+            .addReg(CurrentLaneOpReg)
+            .addReg(UnmergePiece);
+
+          if (!First) {
+            Register AndReg
+              = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+
+            // If there are multiple operands to consider, AND the conditions together.
+            B.buildInstr(AMDGPU::S_AND_B64)
+              .addDef(AndReg)
+              .addReg(NewCondReg)
+              .addReg(CondReg);
+            CondReg = AndReg;
+          }
+        }
+
+        // FIXME: Build merge seems to switch to CONCAT_VECTORS but not
+        // BUILD_VECTOR
+        if (OpTy.isVector()) {
+          auto Merge = B.buildBuildVector(OpTy, ReadlanePieces);
+          Op.setReg(Merge.getReg(0));
+        } else {
+          auto Merge = B.buildMerge(OpTy, ReadlanePieces);
+          Op.setReg(Merge.getReg(0));
+        }
+
+        MRI.setRegBank(Op.getReg(), getRegBank(AMDGPU::SGPRRegBankID));
+      }
+    }
+  }
+
+  B.setInsertPt(*LoopBB, LoopBB->end());
+
+  // Update EXEC, save the original EXEC value to VCC.
+  B.buildInstr(AMDGPU::S_AND_SAVEEXEC_B64)
+    .addDef(NewExec)
+    .addReg(CondReg, RegState::Kill);
+
+  MRI.setSimpleHint(NewExec, CondReg);
+
+  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
+  B.buildInstr(AMDGPU::S_XOR_B64_term)
+    .addDef(AMDGPU::EXEC)
+    .addReg(AMDGPU::EXEC)
+    .addReg(NewExec);
+
+  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
+  // s_cbranch_scc0?
+
+  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
+  B.buildInstr(AMDGPU::S_CBRANCH_EXECNZ)
+    .addMBB(LoopBB);
+
+  // Save the EXEC mask before the loop.
+  BuildMI(MBB, MBB.end(), DL, TII->get(AMDGPU::S_MOV_B64_term), SaveExecReg)
+    .addReg(AMDGPU::EXEC);
+
+  // Restore the EXEC mask after the loop.
+  B.setMBB(*RestoreExecBB);
+  B.buildInstr(AMDGPU::S_MOV_B64_term)
+    .addDef(AMDGPU::EXEC)
+    .addReg(SaveExecReg);
+}
+
+// Legalize an operand that must be an SGPR by inserting a readfirstlane.
+void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
+  MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const {
+  Register Reg = MI.getOperand(OpIdx).getReg();
+  const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
+  if (Bank != &AMDGPU::VGPRRegBank)
+    return;
+
+  MachineIRBuilder B(MI);
+  Register SGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+  B.buildInstr(AMDGPU::V_READFIRSTLANE_B32)
+    .addDef(SGPR)
+    .addReg(Reg);
+
+  const TargetRegisterClass *Constrained =
+      constrainGenericRegister(Reg, AMDGPU::VGPR_32RegClass, MRI);
+  (void)Constrained;
+  assert(Constrained && "Failed to constrain readfirstlane src reg");
+
+  MI.getOperand(OpIdx).setReg(SGPR);
+}
+
+// When regbankselect repairs registers, it will insert a repair instruction
+// which defines the repaired register. Then it calls applyMapping and expects
+// that the targets will either delete or rewrite the instruction that
+// originally wrote to the repaired registers. Because of this, we end up in a
+// situation where we have 2 instructions defining the same registers.
+static MachineInstr *getOtherVRegDef(const MachineRegisterInfo &MRI,
+                                     Register Reg,
+                                     const MachineInstr &MI) {
+  // Is there some way we can assert that there are exactly 2 def instructions?
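+  // Hypothetical illustration (invented vregs): after RegBankSelect repairs a
+  // wide load, %dst transiently has two defs,
+  //   %dst:vgpr(<8 x s32>) = G_LOAD %ptr     ; original instruction
+  //   %dst:vgpr(<8 x s32>) = ...             ; repair instruction inserted by
+  //                                          ; RegBankSelect
+  // and this helper, given %dst and one of the two, returns the other.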
+ for (MachineInstr &Other : MRI.def_instructions(Reg)) { + if (&Other != &MI) + return &Other; + } + + return nullptr; +} + +bool AMDGPURegisterBankInfo::applyMappingWideLoad(MachineInstr &MI, + const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, + MachineRegisterInfo &MRI) const { + Register DstReg = MI.getOperand(0).getReg(); + const LLT LoadTy = MRI.getType(DstReg); + unsigned LoadSize = LoadTy.getSizeInBits(); + const unsigned MaxNonSmrdLoadSize = 128; + // 128-bit loads are supported for all instruction types. + if (LoadSize <= MaxNonSmrdLoadSize) return false; - const MachineMemOperand *MMO = *MI.memoperands_begin(); - return AMDGPUInstrInfo::isUniformMMO(MMO); + SmallVector<unsigned, 16> DefRegs(OpdMapper.getVRegs(0)); + SmallVector<unsigned, 1> SrcRegs(OpdMapper.getVRegs(1)); + + // If the pointer is an SGPR, we have nothing to do. + if (SrcRegs.empty()) + return false; + + assert(LoadSize % MaxNonSmrdLoadSize == 0); + + // We want to get the repair instruction now, because it will help us + // determine which instruction the legalizer inserts that will also + // write to DstReg. + MachineInstr *RepairInst = getOtherVRegDef(MRI, DstReg, MI); + + // RegBankSelect only emits scalar types, so we need to reset the pointer + // operand to a pointer type. + Register BasePtrReg = SrcRegs[0]; + LLT PtrTy = MRI.getType(MI.getOperand(1).getReg()); + MRI.setType(BasePtrReg, PtrTy); + + MachineIRBuilder B(MI); + + unsigned SplitElts = + MaxNonSmrdLoadSize / LoadTy.getScalarType().getSizeInBits(); + const LLT LoadSplitTy = LLT::vector(SplitElts, LoadTy.getScalarType()); + ApplyRegBankMapping O(MRI, &AMDGPU::VGPRRegBank); + GISelObserverWrapper Observer(&O); + B.setChangeObserver(Observer); + LegalizerHelper Helper(B.getMF(), Observer, B); + if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized) + return false; + + // At this point, the legalizer has split the original load into smaller + // loads. At the end of lowering, it inserts an instruction (LegalizedInst) + // that combines the outputs of the lower loads and writes it to DstReg. + // The register bank selector has also added the RepairInst which writes to + // DstReg as well. + + MachineInstr *LegalizedInst = getOtherVRegDef(MRI, DstReg, *RepairInst); + + // Replace the output of the LegalizedInst with a temporary register, since + // RepairInst already defines DstReg. + Register TmpReg = MRI.createGenericVirtualRegister(MRI.getType(DstReg)); + LegalizedInst->getOperand(0).setReg(TmpReg); + B.setInsertPt(*RepairInst->getParent(), RepairInst); + + for (unsigned DefIdx = 0, e = DefRegs.size(); DefIdx != e; ++DefIdx) { + Register IdxReg = MRI.createGenericVirtualRegister(LLT::scalar(32)); + B.buildConstant(IdxReg, DefIdx); + MRI.setRegBank(IdxReg, getRegBank(AMDGPU::VGPRRegBankID)); + B.buildExtractVectorElement(DefRegs[DefIdx], TmpReg, IdxReg); + } + + MRI.setRegBank(DstReg, getRegBank(AMDGPU::VGPRRegBankID)); + return true; +} + +// For cases where only a single copy is inserted for matching register banks. 
+// Replace the register in the instruction operand +static void substituteSimpleCopyRegs( + const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) { + SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx)); + if (!SrcReg.empty()) { + assert(SrcReg.size() == 1); + OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]); + } +} + +void AMDGPURegisterBankInfo::applyMappingImpl( + const OperandsMapper &OpdMapper) const { + MachineInstr &MI = OpdMapper.getMI(); + unsigned Opc = MI.getOpcode(); + MachineRegisterInfo &MRI = OpdMapper.getMRI(); + switch (Opc) { + case AMDGPU::G_SELECT: { + Register DstReg = MI.getOperand(0).getReg(); + LLT DstTy = MRI.getType(DstReg); + if (DstTy.getSizeInBits() != 64) + break; + + LLT HalfTy = getHalfSizedType(DstTy); + + SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); + SmallVector<Register, 1> Src0Regs(OpdMapper.getVRegs(1)); + SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2)); + SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3)); + + // All inputs are SGPRs, nothing special to do. + if (DefRegs.empty()) { + assert(Src1Regs.empty() && Src2Regs.empty()); + break; + } + + MachineIRBuilder B(MI); + if (Src0Regs.empty()) + Src0Regs.push_back(MI.getOperand(1).getReg()); + else { + assert(Src0Regs.size() == 1); + } + + if (Src1Regs.empty()) + split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg()); + else { + setRegsToType(MRI, Src1Regs, HalfTy); + } + + if (Src2Regs.empty()) + split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg()); + else + setRegsToType(MRI, Src2Regs, HalfTy); + + setRegsToType(MRI, DefRegs, HalfTy); + + B.buildSelect(DefRegs[0], Src0Regs[0], Src1Regs[0], Src2Regs[0]); + B.buildSelect(DefRegs[1], Src0Regs[0], Src1Regs[1], Src2Regs[1]); + + MRI.setRegBank(DstReg, getRegBank(AMDGPU::VGPRRegBankID)); + MI.eraseFromParent(); + return; + } + case AMDGPU::G_AND: + case AMDGPU::G_OR: + case AMDGPU::G_XOR: { + // 64-bit and is only available on the SALU, so split into 2 32-bit ops if + // there is a VGPR input. + Register DstReg = MI.getOperand(0).getReg(); + LLT DstTy = MRI.getType(DstReg); + if (DstTy.getSizeInBits() != 64) + break; + + LLT HalfTy = getHalfSizedType(DstTy); + SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); + SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1)); + SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2)); + + // All inputs are SGPRs, nothing special to do. + if (DefRegs.empty()) { + assert(Src0Regs.empty() && Src1Regs.empty()); + break; + } + + assert(DefRegs.size() == 2); + assert(Src0Regs.size() == Src1Regs.size() && + (Src0Regs.empty() || Src0Regs.size() == 2)); + + // Depending on where the source registers came from, the generic code may + // have decided to split the inputs already or not. If not, we still need to + // extract the values. 
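+    // Sketch of the result for a 64-bit G_AND with a VGPR input (invented
+    // vregs; the unmerges are only built for inputs that were not already
+    // split by the generic code):
+    //   %lo0, %hi0 = G_UNMERGE_VALUES %src0
+    //   %lo1, %hi1 = G_UNMERGE_VALUES %src1
+    //   %dstlo = G_AND %lo0, %lo1
+    //   %dsthi = G_AND %hi0, %hi1
+    // with the original 64-bit G_AND erased afterwards.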
+ MachineIRBuilder B(MI); + + if (Src0Regs.empty()) + split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg()); + else + setRegsToType(MRI, Src0Regs, HalfTy); + + if (Src1Regs.empty()) + split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg()); + else + setRegsToType(MRI, Src1Regs, HalfTy); + + setRegsToType(MRI, DefRegs, HalfTy); + + B.buildInstr(Opc) + .addDef(DefRegs[0]) + .addUse(Src0Regs[0]) + .addUse(Src1Regs[0]); + + B.buildInstr(Opc) + .addDef(DefRegs[1]) + .addUse(Src0Regs[1]) + .addUse(Src1Regs[1]); + + MRI.setRegBank(DstReg, getRegBank(AMDGPU::VGPRRegBankID)); + MI.eraseFromParent(); + return; + } + case AMDGPU::G_ADD: + case AMDGPU::G_SUB: + case AMDGPU::G_MUL: { + Register DstReg = MI.getOperand(0).getReg(); + LLT DstTy = MRI.getType(DstReg); + if (DstTy != LLT::scalar(16)) + break; + + const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI); + if (DstBank == &AMDGPU::VGPRRegBank) + break; + + // 16-bit operations are VALU only, but can be promoted to 32-bit SALU. + MachineFunction *MF = MI.getParent()->getParent(); + MachineIRBuilder B(MI); + ApplyRegBankMapping ApplySALU(MRI, &AMDGPU::SGPRRegBank); + GISelObserverWrapper Observer(&ApplySALU); + LegalizerHelper Helper(*MF, Observer, B); + + if (Helper.widenScalar(MI, 0, LLT::scalar(32)) != + LegalizerHelper::Legalized) + llvm_unreachable("widen scalar should have succeeded"); + return; + } + case AMDGPU::G_SMIN: + case AMDGPU::G_SMAX: + case AMDGPU::G_UMIN: + case AMDGPU::G_UMAX: { + Register DstReg = MI.getOperand(0).getReg(); + const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI); + if (DstBank == &AMDGPU::VGPRRegBank) + break; + + MachineFunction *MF = MI.getParent()->getParent(); + MachineIRBuilder B(MI); + ApplyRegBankMapping ApplySALU(MRI, &AMDGPU::SGPRRegBank); + GISelObserverWrapper Observer(&ApplySALU); + LegalizerHelper Helper(*MF, Observer, B); + + // Turn scalar min/max into a compare and select. + LLT Ty = MRI.getType(DstReg); + LLT S32 = LLT::scalar(32); + LLT S16 = LLT::scalar(16); + + if (Ty == S16) { + // Need to widen to s32, and expand as cmp + select. + if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized) + llvm_unreachable("widenScalar should have succeeded"); + + // FIXME: This is relying on widenScalar leaving MI in place. + if (Helper.lower(MI, 0, S32) != LegalizerHelper::Legalized) + llvm_unreachable("lower should have succeeded"); + } else { + if (Helper.lower(MI, 0, Ty) != LegalizerHelper::Legalized) + llvm_unreachable("lower should have succeeded"); + } + + return; + } + case AMDGPU::G_SEXT: + case AMDGPU::G_ZEXT: { + Register SrcReg = MI.getOperand(1).getReg(); + LLT SrcTy = MRI.getType(SrcReg); + bool Signed = Opc == AMDGPU::G_SEXT; + + MachineIRBuilder B(MI); + const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI); + + Register DstReg = MI.getOperand(0).getReg(); + LLT DstTy = MRI.getType(DstReg); + if (DstTy.isScalar() && + SrcBank != &AMDGPU::SGPRRegBank && + SrcBank != &AMDGPU::SCCRegBank && + SrcBank != &AMDGPU::VCCRegBank && + // FIXME: Should handle any type that round to s64 when irregular + // breakdowns supported. + DstTy.getSizeInBits() == 64 && + SrcTy.getSizeInBits() <= 32) { + const LLT S32 = LLT::scalar(32); + SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); + + // Extend to 32-bit, and then extend the low half. + if (Signed) { + // TODO: Should really be buildSExtOrCopy + B.buildSExtOrTrunc(DefRegs[0], SrcReg); + + // Replicate sign bit from 32-bit extended part. 
+ auto ShiftAmt = B.buildConstant(S32, 31); + MRI.setRegBank(ShiftAmt.getReg(0), *SrcBank); + B.buildAShr(DefRegs[1], DefRegs[0], ShiftAmt); + } else { + B.buildZExtOrTrunc(DefRegs[0], SrcReg); + B.buildConstant(DefRegs[1], 0); + } + + MRI.setRegBank(DstReg, *SrcBank); + MI.eraseFromParent(); + return; + } + + if (SrcTy != LLT::scalar(1)) + return; + + if (SrcBank == &AMDGPU::SCCRegBank || SrcBank == &AMDGPU::VCCRegBank) { + SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); + + const RegisterBank *DstBank = SrcBank == &AMDGPU::SCCRegBank ? + &AMDGPU::SGPRRegBank : &AMDGPU::VGPRRegBank; + + unsigned DstSize = DstTy.getSizeInBits(); + // 64-bit select is SGPR only + const bool UseSel64 = DstSize > 32 && + SrcBank->getID() == AMDGPU::SCCRegBankID; + + // TODO: Should s16 select be legal? + LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32); + auto True = B.buildConstant(SelType, Signed ? -1 : 1); + auto False = B.buildConstant(SelType, 0); + + MRI.setRegBank(True.getReg(0), *DstBank); + MRI.setRegBank(False.getReg(0), *DstBank); + MRI.setRegBank(DstReg, *DstBank); + + if (DstSize > 32 && SrcBank->getID() != AMDGPU::SCCRegBankID) { + B.buildSelect(DefRegs[0], SrcReg, True, False); + B.buildCopy(DefRegs[1], DefRegs[0]); + } else if (DstSize < 32) { + auto Sel = B.buildSelect(SelType, SrcReg, True, False); + MRI.setRegBank(Sel.getReg(0), *DstBank); + B.buildTrunc(DstReg, Sel); + } else { + B.buildSelect(DstReg, SrcReg, True, False); + } + + MI.eraseFromParent(); + return; + } + + // Fixup the case with an s1 src that isn't a condition register. Use shifts + // instead of introducing a compare to avoid an unnecessary condition + // register (and since there's no scalar 16-bit compares). + auto Ext = B.buildAnyExt(DstTy, SrcReg); + auto ShiftAmt = B.buildConstant(LLT::scalar(32), DstTy.getSizeInBits() - 1); + auto Shl = B.buildShl(DstTy, Ext, ShiftAmt); + + if (MI.getOpcode() == AMDGPU::G_SEXT) + B.buildAShr(DstReg, Shl, ShiftAmt); + else + B.buildLShr(DstReg, Shl, ShiftAmt); + + MRI.setRegBank(DstReg, *SrcBank); + MRI.setRegBank(Ext.getReg(0), *SrcBank); + MRI.setRegBank(ShiftAmt.getReg(0), *SrcBank); + MRI.setRegBank(Shl.getReg(0), *SrcBank); + MI.eraseFromParent(); + return; + } + case AMDGPU::G_EXTRACT_VECTOR_ELT: + applyDefaultMapping(OpdMapper); + executeInWaterfallLoop(MI, MRI, { 2 }); + return; + case AMDGPU::G_INTRINSIC: { + switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) { + case Intrinsic::amdgcn_s_buffer_load: { + // FIXME: Move to G_INTRINSIC_W_SIDE_EFFECTS + executeInWaterfallLoop(MI, MRI, { 2, 3 }); + return; + } + case Intrinsic::amdgcn_readlane: { + substituteSimpleCopyRegs(OpdMapper, 2); + + assert(empty(OpdMapper.getVRegs(0))); + assert(empty(OpdMapper.getVRegs(3))); + + // Make sure the index is an SGPR. It doesn't make sense to run this in a + // waterfall loop, so assume it's a uniform value. 
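+      // Sketch of the repair (invented vregs): if the index arrived in a
+      // VGPR, this becomes
+      //   %sidx = V_READFIRSTLANE_B32 %vidx
+      //   ... readlane used with %sidx ...
+      // which is only equivalent when %vidx is wave-uniform, per the
+      // assumption above.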
+ constrainOpWithReadfirstlane(MI, MRI, 3); // Index + return; + } + case Intrinsic::amdgcn_writelane: { + assert(empty(OpdMapper.getVRegs(0))); + assert(empty(OpdMapper.getVRegs(2))); + assert(empty(OpdMapper.getVRegs(3))); + + substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val + constrainOpWithReadfirstlane(MI, MRI, 2); // Source value + constrainOpWithReadfirstlane(MI, MRI, 3); // Index + return; + } + default: + break; + } + break; + } + case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: { + switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) { + case Intrinsic::amdgcn_buffer_load: { + executeInWaterfallLoop(MI, MRI, { 2 }); + return; + } + case Intrinsic::amdgcn_ds_ordered_add: + case Intrinsic::amdgcn_ds_ordered_swap: { + // This is only allowed to execute with 1 lane, so readfirstlane is safe. + assert(empty(OpdMapper.getVRegs(0))); + substituteSimpleCopyRegs(OpdMapper, 3); + constrainOpWithReadfirstlane(MI, MRI, 2); // M0 + return; + } + case Intrinsic::amdgcn_s_sendmsg: + case Intrinsic::amdgcn_s_sendmsghalt: { + // FIXME: Should this use a waterfall loop? + constrainOpWithReadfirstlane(MI, MRI, 2); // M0 + return; + } + default: + break; + } + break; + } + case AMDGPU::G_LOAD: { + if (applyMappingWideLoad(MI, OpdMapper, MRI)) + return; + break; + } + default: + break; + } + + return applyDefaultMapping(OpdMapper); } bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const { @@ -259,7 +1369,7 @@ bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const { for (unsigned i = 0, e = MI.getNumOperands();i != e; ++i) { if (!MI.getOperand(i).isReg()) continue; - unsigned Reg = MI.getOperand(i).getReg(); + Register Reg = MI.getOperand(i).getReg(); if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) { if (Bank->getID() == AMDGPU::VGPRRegBankID) return false; @@ -299,7 +1409,7 @@ AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const { if (MI.getOperand(OpdIdx).isIntrinsicID()) OpdsMapping[OpdIdx++] = nullptr; - unsigned Reg1 = MI.getOperand(OpdIdx).getReg(); + Register Reg1 = MI.getOperand(OpdIdx).getReg(); unsigned Size1 = getSizeInBits(Reg1, MRI, *TRI); unsigned DefaultBankID = Size1 == 1 ? @@ -309,7 +1419,11 @@ AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const { OpdsMapping[OpdIdx++] = AMDGPU::getValueMapping(Bank1, Size1); for (unsigned e = MI.getNumOperands(); OpdIdx != e; ++OpdIdx) { - unsigned Size = getSizeInBits(MI.getOperand(OpdIdx).getReg(), MRI, *TRI); + const MachineOperand &MO = MI.getOperand(OpdIdx); + if (!MO.isReg()) + continue; + + unsigned Size = getSizeInBits(MO.getReg(), MRI, *TRI); unsigned BankID = Size == 1 ? 
AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID; OpdsMapping[OpdIdx] = AMDGPU::getValueMapping(BankID, Size); } @@ -325,7 +1439,11 @@ AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const { SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { - unsigned Size = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI); + const MachineOperand &Op = MI.getOperand(I); + if (!Op.isReg()) + continue; + + unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI); OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); } @@ -340,6 +1458,7 @@ AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const { const MachineRegisterInfo &MRI = MF.getRegInfo(); SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); + LLT LoadTy = MRI.getType(MI.getOperand(0).getReg()); unsigned PtrSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); const ValueMapping *ValMapping; @@ -350,7 +1469,7 @@ AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const { ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize); } else { - ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); + ValMapping = AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy); // FIXME: What would happen if we used SGPRRegBankID here? PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize); } @@ -366,7 +1485,7 @@ AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const { } unsigned -AMDGPURegisterBankInfo::getRegBankID(unsigned Reg, +AMDGPURegisterBankInfo::getRegBankID(Register Reg, const MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, unsigned Default) const { @@ -383,13 +1502,81 @@ AMDGPURegisterBankInfo::getRegBankID(unsigned Reg, /// const RegisterBankInfo::InstructionMapping & AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { - const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI); + const MachineFunction &MF = *MI.getParent()->getParent(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + if (MI.isRegSequence()) { + // If any input is a VGPR, the result must be a VGPR. The default handling + // assumes any copy between banks is legal. + unsigned BankID = AMDGPU::SGPRRegBankID; + + for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { + auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI, *TRI); + // It doesn't make sense to use vcc or scc banks here, so just ignore + // them. + if (OpBank != AMDGPU::SGPRRegBankID) { + BankID = AMDGPU::VGPRRegBankID; + break; + } + } + unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); + + const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID)); + return getInstructionMapping( + 1, /*Cost*/ 1, + /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1); + } + + // The default handling is broken and doesn't handle illegal SGPR->VGPR copies + // properly. + // + // TODO: There are additional exec masking dependencies to analyze. + if (MI.getOpcode() == TargetOpcode::G_PHI) { + // TODO: Generate proper invalid bank enum. 
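+    // Summary of the resolution implemented below (restating the cases):
+    //   sgpr, sgpr          -> sgpr
+    //   scc,  scc           -> sgpr  (only one physical SCC, so copy to SGPR)
+    //   vcc,  vcc           -> vcc
+    //   vcc,  sgpr          -> vgpr
+    //   any vgpr or unknown -> vgpr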
+    int ResultBank = -1;
+
+    for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
+      unsigned Reg = MI.getOperand(I).getReg();
+      const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
+
+      // FIXME: Assuming VGPR for any undetermined inputs.
+      if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) {
+        ResultBank = AMDGPU::VGPRRegBankID;
+        break;
+      }
+
+      unsigned OpBank = Bank->getID();
+      // scc, scc -> sgpr
+      if (OpBank == AMDGPU::SCCRegBankID) {
+        // There's only one SCC register, so a phi requires copying to SGPR.
+        OpBank = AMDGPU::SGPRRegBankID;
+      } else if (OpBank == AMDGPU::VCCRegBankID) {
+        // vcc, vcc -> vcc
+        // vcc, sgpr -> vgpr
+        if (ResultBank != -1 && ResultBank != AMDGPU::VCCRegBankID) {
+          ResultBank = AMDGPU::VGPRRegBankID;
+          break;
+        }
+      }
+
+      ResultBank = OpBank;
+    }
+
+    assert(ResultBank != -1);
+
+    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+
+    const ValueMapping &ValMap =
+      getValueMapping(0, Size, getRegBank(ResultBank));
+    return getInstructionMapping(
+      1, /*Cost*/ 1,
+      /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
+  }
+
+  const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
   if (Mapping.isValid())
     return Mapping;
 
-  const MachineFunction &MF = *MI.getParent()->getParent();
-  const MachineRegisterInfo &MRI = MF.getRegInfo();
   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
 
   switch (MI.getOpcode()) {
@@ -401,18 +1588,86 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
   case AMDGPU::G_XOR: {
     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
     if (Size == 1) {
-      OpdsMapping[0] = OpdsMapping[1] =
-        OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
+      const RegisterBank *DstBank
+        = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI);
+
+      unsigned TargetBankID = -1;
+      unsigned BankLHS = -1;
+      unsigned BankRHS = -1;
+      if (DstBank) {
+        TargetBankID = DstBank->getID();
+        if (DstBank == &AMDGPU::VCCRegBank) {
+          TargetBankID = AMDGPU::VCCRegBankID;
+          BankLHS = AMDGPU::VCCRegBankID;
+          BankRHS = AMDGPU::VCCRegBankID;
+        } else if (DstBank == &AMDGPU::SCCRegBank) {
+          TargetBankID = AMDGPU::SCCRegBankID;
+          BankLHS = AMDGPU::SGPRRegBankID;
+          BankRHS = AMDGPU::SGPRRegBankID;
+        } else {
+          BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
+                                 AMDGPU::SGPRRegBankID);
+          BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
+                                 AMDGPU::SGPRRegBankID);
+        }
+      } else {
+        BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
+                               AMDGPU::VCCRegBankID);
+        BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
+                               AMDGPU::VCCRegBankID);
+
+        // Both inputs should be true booleans to produce a boolean result.
+        if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) {
+          TargetBankID = AMDGPU::VGPRRegBankID;
+        } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) {
+          TargetBankID = AMDGPU::VCCRegBankID;
+          BankLHS = AMDGPU::VCCRegBankID;
+          BankRHS = AMDGPU::VCCRegBankID;
+        } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) {
+          TargetBankID = AMDGPU::SGPRRegBankID;
+        } else if (BankLHS == AMDGPU::SCCRegBankID || BankRHS == AMDGPU::SCCRegBankID) {
+          // The operation must be done on a 32-bit register, but it will set
+          // scc. The result type could interchangeably be SCC or SGPR, since
+          // both values will be produced.
+ TargetBankID = AMDGPU::SCCRegBankID; + BankLHS = AMDGPU::SGPRRegBankID; + BankRHS = AMDGPU::SGPRRegBankID; + } + } + + OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size); + OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size); + OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size); + break; + } + + if (Size == 64) { + + if (isSALUMapping(MI)) { + OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size); + OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0]; + } else { + OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size); + unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI/*, DefaultBankID*/); + OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size); + + unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI/*, DefaultBankID*/); + OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size); + } + break; } LLVM_FALLTHROUGH; } + case AMDGPU::G_GEP: case AMDGPU::G_ADD: case AMDGPU::G_SUB: case AMDGPU::G_MUL: case AMDGPU::G_SHL: + case AMDGPU::G_LSHR: + case AMDGPU::G_ASHR: case AMDGPU::G_UADDO: case AMDGPU::G_SADDO: case AMDGPU::G_USUBO: @@ -421,6 +1676,12 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_SADDE: case AMDGPU::G_USUBE: case AMDGPU::G_SSUBE: + case AMDGPU::G_UMULH: + case AMDGPU::G_SMULH: + case AMDGPU::G_SMIN: + case AMDGPU::G_SMAX: + case AMDGPU::G_UMIN: + case AMDGPU::G_UMAX: if (isSALUMapping(MI)) return getDefaultMappingSOP(MI); LLVM_FALLTHROUGH; @@ -431,11 +1692,14 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_FPTOUI: case AMDGPU::G_FMUL: case AMDGPU::G_FMA: + case AMDGPU::G_FSQRT: case AMDGPU::G_SITOFP: case AMDGPU::G_UITOFP: case AMDGPU::G_FPTRUNC: + case AMDGPU::G_FPEXT: case AMDGPU::G_FEXP2: case AMDGPU::G_FLOG2: + case AMDGPU::G_FCANONICALIZE: case AMDGPU::G_INTRINSIC_TRUNC: case AMDGPU::G_INTRINSIC_ROUND: return getDefaultMappingVOP(MI); @@ -473,7 +1737,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[2] = nullptr; break; } - case AMDGPU::G_MERGE_VALUES: { + case AMDGPU::G_MERGE_VALUES: + case AMDGPU::G_BUILD_VECTOR: + case AMDGPU::G_CONCAT_VECTORS: { unsigned Bank = isSALUMapping(MI) ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); @@ -502,8 +1768,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { break; } case AMDGPU::G_TRUNC: { - unsigned Dst = MI.getOperand(0).getReg(); - unsigned Src = MI.getOperand(1).getReg(); + Register Dst = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(1).getReg(); unsigned Bank = getRegBankID(Src, MRI, *TRI); unsigned DstSize = getSizeInBits(Dst, MRI, *TRI); unsigned SrcSize = getSizeInBits(Src, MRI, *TRI); @@ -514,23 +1780,35 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_ZEXT: case AMDGPU::G_SEXT: case AMDGPU::G_ANYEXT: { - unsigned Dst = MI.getOperand(0).getReg(); - unsigned Src = MI.getOperand(1).getReg(); + Register Dst = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(1).getReg(); unsigned DstSize = getSizeInBits(Dst, MRI, *TRI); unsigned SrcSize = getSizeInBits(Src, MRI, *TRI); - unsigned SrcBank = getRegBankID(Src, MRI, *TRI, - SrcSize == 1 ? 
AMDGPU::SGPRRegBankID : - AMDGPU::VGPRRegBankID); - unsigned DstBank = SrcBank; - if (SrcSize == 1) { - if (SrcBank == AMDGPU::SGPRRegBankID) - DstBank = AMDGPU::VGPRRegBankID; - else - DstBank = AMDGPU::SGPRRegBankID; - } - - OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, DstSize); - OpdsMapping[1] = AMDGPU::getValueMapping(SrcBank, SrcSize); + + unsigned DstBank; + const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI); + assert(SrcBank); + switch (SrcBank->getID()) { + case AMDGPU::SCCRegBankID: + case AMDGPU::SGPRRegBankID: + DstBank = AMDGPU::SGPRRegBankID; + break; + default: + DstBank = AMDGPU::VGPRRegBankID; + break; + } + + // TODO: Should anyext be split into 32-bit part as well? + if (MI.getOpcode() == AMDGPU::G_ANYEXT) { + OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, DstSize); + OpdsMapping[1] = AMDGPU::getValueMapping(SrcBank->getID(), SrcSize); + } else { + // Scalar extend can use 64-bit BFE, but VGPRs require extending to + // 32-bits, and then to 64. + OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize); + OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(), + SrcSize); + } break; } case AMDGPU::G_FCMP: { @@ -542,16 +1820,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); break; } - case AMDGPU::G_GEP: { - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - if (!MI.getOperand(i).isReg()) - continue; - - unsigned Size = MRI.getType(MI.getOperand(i).getReg()).getSizeInBits(); - OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); - } - break; - } case AMDGPU::G_STORE: { assert(MI.getOperand(0).isReg()); unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); @@ -571,57 +1839,55 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { } case AMDGPU::G_ICMP: { + auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate()); unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI); unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI); - unsigned Op0Bank = Op2Bank == AMDGPU::SGPRRegBankID && - Op3Bank == AMDGPU::SGPRRegBankID ? - AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID; + + bool CanUseSCC = Op2Bank == AMDGPU::SGPRRegBankID && + Op3Bank == AMDGPU::SGPRRegBankID && + (Size == 32 || (Size == 64 && + (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) && + MF.getSubtarget<GCNSubtarget>().hasScalarCompareEq64())); + + unsigned Op0Bank = CanUseSCC ? AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID; + OpdsMapping[0] = AMDGPU::getValueMapping(Op0Bank, 1); OpdsMapping[1] = nullptr; // Predicate Operand. OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size); OpdsMapping[3] = AMDGPU::getValueMapping(Op3Bank, Size); break; } - - case AMDGPU::G_EXTRACT_VECTOR_ELT: { - unsigned IdxOp = 2; - int64_t Imm; - // XXX - Do we really need to fully handle these? The constant case should - // be legalized away before RegBankSelect? - - unsigned OutputBankID = isSALUMapping(MI) && isConstant(MI.getOperand(IdxOp), Imm) ? + unsigned OutputBankID = isSALUMapping(MI) ? 
AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; - + unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); + unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI); - OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, MRI.getType(MI.getOperand(0).getReg()).getSizeInBits()); - OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, MRI.getType(MI.getOperand(1).getReg()).getSizeInBits()); + + OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, SrcSize); + OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, SrcSize); // The index can be either if the source vector is VGPR. - OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, MRI.getType(MI.getOperand(2).getReg()).getSizeInBits()); + OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize); break; } case AMDGPU::G_INSERT_VECTOR_ELT: { - // XXX - Do we really need to fully handle these? The constant case should - // be legalized away before RegBankSelect? - - int64_t Imm; - - unsigned IdxOp = MI.getOpcode() == AMDGPU::G_EXTRACT_VECTOR_ELT ? 2 : 3; - unsigned BankID = isSALUMapping(MI) && isConstant(MI.getOperand(IdxOp), Imm) ? - AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; - - + unsigned OutputBankID = isSALUMapping(MI) ? + AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; - // TODO: Can do SGPR indexing, which would obviate the need for the - // isConstant check. - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI); - OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size); - } + unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); + unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits(); + unsigned InsertEltBank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI); + unsigned IdxBank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI); + OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize); + OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize); + OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBank, InsertSize); + // The index can be either if the source vector is VGPR. 
+ OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize); break; } case AMDGPU::G_UNMERGE_VALUES: { @@ -637,14 +1903,70 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { break; } case AMDGPU::G_INTRINSIC: { - switch (MI.getOperand(1).getIntrinsicID()) { + switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) { default: return getInvalidInstructionMapping(); case Intrinsic::maxnum: case Intrinsic::minnum: + case Intrinsic::amdgcn_div_fmas: + case Intrinsic::amdgcn_trig_preop: + case Intrinsic::amdgcn_sin: + case Intrinsic::amdgcn_cos: + case Intrinsic::amdgcn_log_clamp: + case Intrinsic::amdgcn_rcp: + case Intrinsic::amdgcn_rcp_legacy: + case Intrinsic::amdgcn_rsq: + case Intrinsic::amdgcn_rsq_legacy: + case Intrinsic::amdgcn_rsq_clamp: + case Intrinsic::amdgcn_ldexp: + case Intrinsic::amdgcn_frexp_mant: + case Intrinsic::amdgcn_frexp_exp: + case Intrinsic::amdgcn_fract: case Intrinsic::amdgcn_cvt_pkrtz: + case Intrinsic::amdgcn_cvt_pknorm_i16: + case Intrinsic::amdgcn_cvt_pknorm_u16: + case Intrinsic::amdgcn_cvt_pk_i16: + case Intrinsic::amdgcn_cvt_pk_u16: + case Intrinsic::amdgcn_fmed3: + case Intrinsic::amdgcn_cubeid: + case Intrinsic::amdgcn_cubema: + case Intrinsic::amdgcn_cubesc: + case Intrinsic::amdgcn_cubetc: + case Intrinsic::amdgcn_sffbh: + case Intrinsic::amdgcn_fmad_ftz: + case Intrinsic::amdgcn_mbcnt_lo: + case Intrinsic::amdgcn_mbcnt_hi: + case Intrinsic::amdgcn_ubfe: + case Intrinsic::amdgcn_sbfe: + case Intrinsic::amdgcn_lerp: + case Intrinsic::amdgcn_sad_u8: + case Intrinsic::amdgcn_msad_u8: + case Intrinsic::amdgcn_sad_hi_u8: + case Intrinsic::amdgcn_sad_u16: + case Intrinsic::amdgcn_qsad_pk_u16_u8: + case Intrinsic::amdgcn_mqsad_pk_u16_u8: + case Intrinsic::amdgcn_mqsad_u32_u8: + case Intrinsic::amdgcn_cvt_pk_u8_f32: + case Intrinsic::amdgcn_alignbit: + case Intrinsic::amdgcn_alignbyte: + case Intrinsic::amdgcn_fdot2: + case Intrinsic::amdgcn_sdot2: + case Intrinsic::amdgcn_udot2: + case Intrinsic::amdgcn_sdot4: + case Intrinsic::amdgcn_udot4: + case Intrinsic::amdgcn_sdot8: + case Intrinsic::amdgcn_udot8: + case Intrinsic::amdgcn_fdiv_fast: + case Intrinsic::amdgcn_wwm: + case Intrinsic::amdgcn_wqm: return getDefaultMappingVOP(MI); - case Intrinsic::amdgcn_kernarg_segment_ptr: { + case Intrinsic::amdgcn_ds_permute: + case Intrinsic::amdgcn_ds_bpermute: + case Intrinsic::amdgcn_update_dpp: + return getDefaultMappingAllVGPR(MI); + case Intrinsic::amdgcn_kernarg_segment_ptr: + case Intrinsic::amdgcn_s_getpc: + case Intrinsic::amdgcn_groupstaticsize: { unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); break; @@ -652,16 +1974,142 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_wqm_vote: { unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); OpdsMapping[0] = OpdsMapping[2] - = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); + = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size); + break; + } + case Intrinsic::amdgcn_s_buffer_load: { + // FIXME: This should be moved to G_INTRINSIC_W_SIDE_EFFECTS + Register RSrc = MI.getOperand(2).getReg(); // SGPR + Register Offset = MI.getOperand(3).getReg(); // SGPR/imm + + unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + unsigned Size2 = MRI.getType(RSrc).getSizeInBits(); + unsigned Size3 = MRI.getType(Offset).getSizeInBits(); + + unsigned RSrcBank = getRegBankID(RSrc, MRI, *TRI); + unsigned OffsetBank = 
getRegBankID(Offset, MRI, *TRI); + + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size0); + OpdsMapping[1] = nullptr; // intrinsic id + + // Lie and claim everything is legal, even though some need to be + // SGPRs. applyMapping will have to deal with it as a waterfall loop. + OpdsMapping[2] = AMDGPU::getValueMapping(RSrcBank, Size2); // rsrc + OpdsMapping[3] = AMDGPU::getValueMapping(OffsetBank, Size3); + OpdsMapping[4] = nullptr; + break; + } + case Intrinsic::amdgcn_div_scale: { + unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size); + OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size); + + unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits(); + OpdsMapping[3] = AMDGPU::getValueMapping( + getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI), SrcSize); + OpdsMapping[4] = AMDGPU::getValueMapping( + getRegBankID(MI.getOperand(4).getReg(), MRI, *TRI), SrcSize); + + break; + } + case Intrinsic::amdgcn_class: { + Register Src0Reg = MI.getOperand(2).getReg(); + Register Src1Reg = MI.getOperand(3).getReg(); + unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits(); + unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits(); + unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize); + OpdsMapping[2] = AMDGPU::getValueMapping(getRegBankID(Src0Reg, MRI, *TRI), + Src0Size); + OpdsMapping[3] = AMDGPU::getValueMapping(getRegBankID(Src1Reg, MRI, *TRI), + Src1Size); + break; + } + case Intrinsic::amdgcn_icmp: + case Intrinsic::amdgcn_fcmp: { + unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + // This is not VCCRegBank because this is not used in boolean contexts. + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize); + unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); + unsigned Op1Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI); + unsigned Op2Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI); + OpdsMapping[2] = AMDGPU::getValueMapping(Op1Bank, OpSize); + OpdsMapping[3] = AMDGPU::getValueMapping(Op2Bank, OpSize); + break; + } + case Intrinsic::amdgcn_readlane: { + // This must be an SGPR, but accept a VGPR. 
+ unsigned IdxReg = MI.getOperand(3).getReg(); + unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits(); + unsigned IdxBank = getRegBankID(IdxReg, MRI, *TRI, AMDGPU::SGPRRegBankID); + OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize); + LLVM_FALLTHROUGH; + } + case Intrinsic::amdgcn_readfirstlane: { + unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize); + OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); + break; + } + case Intrinsic::amdgcn_writelane: { + unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + unsigned SrcReg = MI.getOperand(2).getReg(); + unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits(); + unsigned SrcBank = getRegBankID(SrcReg, MRI, *TRI, AMDGPU::SGPRRegBankID); + unsigned IdxReg = MI.getOperand(3).getReg(); + unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits(); + unsigned IdxBank = getRegBankID(IdxReg, MRI, *TRI, AMDGPU::SGPRRegBankID); + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); + + // These 2 must be SGPRs, but accept VGPRs. Readfirstlane will be inserted + // to legalize. + OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize); + OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize); + OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); + break; + } + case Intrinsic::amdgcn_if_break: { + unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); + OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); + OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); break; } } break; } case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: { - switch (MI.getOperand(0).getIntrinsicID()) { + switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) { default: return getInvalidInstructionMapping(); + case Intrinsic::amdgcn_s_getreg: + case Intrinsic::amdgcn_s_memtime: + case Intrinsic::amdgcn_s_memrealtime: + case Intrinsic::amdgcn_s_get_waveid_in_workgroup: { + unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); + break; + } + case Intrinsic::amdgcn_ds_append: + case Intrinsic::amdgcn_ds_consume: + case Intrinsic::amdgcn_ds_fadd: + case Intrinsic::amdgcn_ds_fmin: + case Intrinsic::amdgcn_ds_fmax: + case Intrinsic::amdgcn_atomic_inc: + case Intrinsic::amdgcn_atomic_dec: + return getDefaultMappingAllVGPR(MI); + case Intrinsic::amdgcn_ds_ordered_add: + case Intrinsic::amdgcn_ds_ordered_swap: { + unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); + unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI, + AMDGPU::SGPRRegBankID); + OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32); + OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); + break; + } case Intrinsic::amdgcn_exp_compr: OpdsMapping[0] = nullptr; // IntrinsicID // FIXME: These are immediate values which can't be read from registers. 
@@ -688,24 +2136,82 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[7] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); OpdsMapping[8] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); break; + case Intrinsic::amdgcn_buffer_load: { + Register RSrc = MI.getOperand(2).getReg(); // SGPR + Register VIndex = MI.getOperand(3).getReg(); // VGPR + Register Offset = MI.getOperand(4).getReg(); // SGPR/VGPR/imm + + unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + unsigned Size2 = MRI.getType(RSrc).getSizeInBits(); + unsigned Size3 = MRI.getType(VIndex).getSizeInBits(); + unsigned Size4 = MRI.getType(Offset).getSizeInBits(); + + unsigned RSrcBank = getRegBankID(RSrc, MRI, *TRI); + unsigned OffsetBank = getRegBankID(Offset, MRI, *TRI); + + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size0); + OpdsMapping[1] = nullptr; // intrinsic id + + // Lie and claim everything is legal, even though some need to be + // SGPRs. applyMapping will have to deal with it as a waterfall loop. + OpdsMapping[2] = AMDGPU::getValueMapping(RSrcBank, Size2); // rsrc + OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size3); + OpdsMapping[4] = AMDGPU::getValueMapping(OffsetBank, Size4); + OpdsMapping[5] = nullptr; + OpdsMapping[6] = nullptr; + break; + } + case Intrinsic::amdgcn_s_sendmsg: + case Intrinsic::amdgcn_s_sendmsghalt: { + // This must be an SGPR, but accept a VGPR. + unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI, + AMDGPU::SGPRRegBankID); + OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32); + break; + } + case Intrinsic::amdgcn_end_cf: { + unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); + OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); + break; + } } break; } case AMDGPU::G_SELECT: { unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); - unsigned Op1Bank = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI, + unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI, AMDGPU::SGPRRegBankID); - unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI); - unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI); - bool SGPRSrcs = Op1Bank == AMDGPU::SCCRegBankID && - Op2Bank == AMDGPU::SGPRRegBankID && + unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI, + AMDGPU::SGPRRegBankID); + bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID && Op3Bank == AMDGPU::SGPRRegBankID; - unsigned Bank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; - Op1Bank = SGPRSrcs ? AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID; - OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size); - OpdsMapping[1] = AMDGPU::getValueMapping(Op1Bank, 1); - OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size); - OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size); + + unsigned CondBankDefault = SGPRSrcs ? + AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID; + unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI, + CondBankDefault); + if (CondBank == AMDGPU::SGPRRegBankID) + CondBank = SGPRSrcs ? AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID; + else if (CondBank == AMDGPU::VGPRRegBankID) + CondBank = AMDGPU::VCCRegBankID; + + unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SCCRegBankID ? 
+ AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; + + assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SCCRegBankID); + + if (Size == 64) { + OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size); + OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1); + OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size); + OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size); + } else { + OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size); + OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1); + OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size); + OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size); + } + break; } @@ -737,6 +2243,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { } } - return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), + return getInstructionMapping(/*ID*/1, /*Cost*/1, + getOperandsMapping(OpdsMapping), MI.getNumOperands()); } + diff --git a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h index d29f4bc79a51..f3a96e2a6128 100644 --- a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h +++ b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h @@ -1,9 +1,8 @@ //===- AMDGPURegisterBankInfo -----------------------------------*- C++ -*-==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file @@ -14,6 +13,7 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERBANKINFO_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERBANKINFO_H +#include "llvm/CodeGen/Register.h" #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" #define GET_REGBANK_DECLARATIONS @@ -22,6 +22,8 @@ namespace llvm { +class LLT; +class MachineIRBuilder; class SIRegisterInfo; class TargetRegisterInfo; @@ -36,16 +38,53 @@ protected: class AMDGPURegisterBankInfo : public AMDGPUGenRegisterBankInfo { const SIRegisterInfo *TRI; + void executeInWaterfallLoop(MachineInstr &MI, + MachineRegisterInfo &MRI, + ArrayRef<unsigned> OpIndices) const; + + void constrainOpWithReadfirstlane(MachineInstr &MI, MachineRegisterInfo &MRI, + unsigned OpIdx) const; + bool applyMappingWideLoad(MachineInstr &MI, + const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, + MachineRegisterInfo &MRI) const; + /// See RegisterBankInfo::applyMapping. void applyMappingImpl(const OperandsMapper &OpdMapper) const override; const RegisterBankInfo::InstructionMapping & getInstrMappingForLoad(const MachineInstr &MI) const; - unsigned getRegBankID(unsigned Reg, const MachineRegisterInfo &MRI, + unsigned getRegBankID(Register Reg, const MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, unsigned Default = AMDGPU::VGPRRegBankID) const; + /// Split 64-bit value \p Reg into two 32-bit halves and populate them into \p + /// Regs. This appropriately sets the regbank of the new registers. 
+ void split64BitValueForMapping(MachineIRBuilder &B, + SmallVector<Register, 2> &Regs, + LLT HalfTy, + Register Reg) const; + + template <unsigned NumOps> + struct OpRegBankEntry { + int8_t RegBanks[NumOps]; + int16_t Cost; + }; + + template <unsigned NumOps> + InstructionMappings + addMappingFromTable(const MachineInstr &MI, const MachineRegisterInfo &MRI, + const std::array<unsigned, NumOps> RegSrcOpIdx, + ArrayRef<OpRegBankEntry<NumOps>> Table) const; + + RegisterBankInfo::InstructionMappings + getInstrAlternativeMappingsIntrinsic( + const MachineInstr &MI, const MachineRegisterInfo &MRI) const; + + RegisterBankInfo::InstructionMappings + getInstrAlternativeMappingsIntrinsicWSideEffects( + const MachineInstr &MI, const MachineRegisterInfo &MRI) const; + bool isSALUMapping(const MachineInstr &MI) const; const InstructionMapping &getDefaultMappingSOP(const MachineInstr &MI) const; const InstructionMapping &getDefaultMappingVOP(const MachineInstr &MI) const; @@ -57,6 +96,9 @@ public: unsigned copyCost(const RegisterBank &A, const RegisterBank &B, unsigned Size) const override; + unsigned getBreakDownCost(const ValueMapping &ValMapping, + const RegisterBank *CurBank = nullptr) const override; + const RegisterBank & getRegBankFromRegClass(const TargetRegisterClass &RC) const override; diff --git a/lib/Target/AMDGPU/AMDGPURegisterBanks.td b/lib/Target/AMDGPU/AMDGPURegisterBanks.td index 570379a820e1..9555694fb106 100644 --- a/lib/Target/AMDGPU/AMDGPURegisterBanks.td +++ b/lib/Target/AMDGPU/AMDGPURegisterBanks.td @@ -1,9 +1,8 @@ //=- AMDGPURegisterBank.td - Describe the AMDGPU Banks -------*- tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -15,7 +14,7 @@ def VGPRRegBank : RegisterBank<"VGPR", [VGPR_32, VReg_64, VReg_96, VReg_128, VReg_256, VReg_512] >; -def SCCRegBank : RegisterBank <"SCC", [SCC_CLASS]>; +def SCCRegBank : RegisterBank <"SCC", [SReg_32, SCC_CLASS]>; // It is helpful to distinguish conditions from ordinary SGPRs. def VCCRegBank : RegisterBank <"VCC", [SReg_64]>; diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp b/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp index 50f859addc2b..7cffdf1a4dcf 100644 --- a/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp @@ -1,9 +1,8 @@ //===-- AMDGPURegisterInfo.cpp - AMDGPU Register Information -------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -32,7 +31,10 @@ unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) { AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11, AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, - AMDGPU::sub15 + AMDGPU::sub15, AMDGPU::sub16, AMDGPU::sub17, AMDGPU::sub18, AMDGPU::sub19, + AMDGPU::sub20, AMDGPU::sub21, AMDGPU::sub22, AMDGPU::sub23, AMDGPU::sub24, + AMDGPU::sub25, AMDGPU::sub26, AMDGPU::sub27, AMDGPU::sub28, AMDGPU::sub29, + AMDGPU::sub30, AMDGPU::sub31 }; assert(Channel < array_lengthof(SubRegs)); @@ -83,7 +85,18 @@ const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF, } } -unsigned SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const { +Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const { + const SIFrameLowering *TFI = + MF.getSubtarget<GCNSubtarget>().getFrameLowering(); const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); - return FuncInfo->getFrameOffsetReg(); + return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg() + : FuncInfo->getStackPtrOffsetReg(); +} + +const uint32_t *SIRegisterInfo::getAllVGPRRegMask() const { + return CSR_AMDGPU_AllVGPRs_RegMask; +} + +const uint32_t *SIRegisterInfo::getAllAllocatableSRegMask() const { + return CSR_AMDGPU_AllAllocatableSRegs_RegMask; } diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.h b/lib/Target/AMDGPU/AMDGPURegisterInfo.h index 922d974f2ebd..3453a8c1b0b3 100644 --- a/lib/Target/AMDGPU/AMDGPURegisterInfo.h +++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.h @@ -1,9 +1,8 @@ //===-- AMDGPURegisterInfo.h - AMDGPURegisterInfo Interface -*- C++ -*-----===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.td b/lib/Target/AMDGPU/AMDGPURegisterInfo.td index ceabae524414..ab71b7aa8a57 100644 --- a/lib/Target/AMDGPU/AMDGPURegisterInfo.td +++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.td @@ -1,9 +1,8 @@ //===-- AMDGPURegisterInfo.td - AMDGPU register info -------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -13,7 +12,7 @@ let Namespace = "AMDGPU" in { -foreach Index = 0-15 in { +foreach Index = 0-31 in { def sub#Index : SubRegIndex<32, !shl(Index, 5)>; } diff --git a/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp b/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp index efe501cb73c2..4f095087a57f 100644 --- a/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp +++ b/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp @@ -1,9 +1,8 @@ //===- AMDGPURewriteOutArgumentsPass.cpp - Create struct returns ----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/lib/Target/AMDGPU/AMDGPUSearchableTables.td index 9dbd7751b4d8..f8703c36127a 100644 --- a/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -1,9 +1,8 @@ //===-- AMDGPUSearchableTables.td - ------------------------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -49,6 +48,8 @@ def : SourceOfDivergence<int_amdgcn_workitem_id_z>; def : SourceOfDivergence<int_amdgcn_interp_mov>; def : SourceOfDivergence<int_amdgcn_interp_p1>; def : SourceOfDivergence<int_amdgcn_interp_p2>; +def : SourceOfDivergence<int_amdgcn_interp_p1_f16>; +def : SourceOfDivergence<int_amdgcn_interp_p2_f16>; def : SourceOfDivergence<int_amdgcn_mbcnt_hi>; def : SourceOfDivergence<int_amdgcn_mbcnt_lo>; def : SourceOfDivergence<int_r600_read_tidig_x>; @@ -70,8 +71,59 @@ def : SourceOfDivergence<int_amdgcn_buffer_atomic_and>; def : SourceOfDivergence<int_amdgcn_buffer_atomic_or>; def : SourceOfDivergence<int_amdgcn_buffer_atomic_xor>; def : SourceOfDivergence<int_amdgcn_buffer_atomic_cmpswap>; +def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_swap>; +def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_add>; +def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_sub>; +def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_smin>; +def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_umin>; +def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_smax>; +def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_umax>; +def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_and>; +def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_or>; +def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_xor>; +def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_cmpswap>; +def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_swap>; +def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_add>; +def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_sub>; +def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_smin>; +def : 
SourceOfDivergence<int_amdgcn_struct_buffer_atomic_umin>; +def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_smax>; +def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_umax>; +def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_and>; +def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_or>; +def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_xor>; +def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_cmpswap>; def : SourceOfDivergence<int_amdgcn_ps_live>; def : SourceOfDivergence<int_amdgcn_ds_swizzle>; +def : SourceOfDivergence<int_amdgcn_ds_ordered_add>; +def : SourceOfDivergence<int_amdgcn_ds_ordered_swap>; +def : SourceOfDivergence<int_amdgcn_permlane16>; +def : SourceOfDivergence<int_amdgcn_permlanex16>; +def : SourceOfDivergence<int_amdgcn_mov_dpp>; +def : SourceOfDivergence<int_amdgcn_mov_dpp8>; +def : SourceOfDivergence<int_amdgcn_update_dpp>; + +def : SourceOfDivergence<int_amdgcn_mfma_f32_4x4x1f32>; +def : SourceOfDivergence<int_amdgcn_mfma_f32_4x4x1f32>; +def : SourceOfDivergence<int_amdgcn_mfma_f32_4x4x4f16>; +def : SourceOfDivergence<int_amdgcn_mfma_i32_4x4x4i8>; +def : SourceOfDivergence<int_amdgcn_mfma_f32_4x4x2bf16>; +def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x1f32>; +def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x4f32>; +def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x4f16>; +def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x16f16>; +def : SourceOfDivergence<int_amdgcn_mfma_i32_16x16x4i8>; +def : SourceOfDivergence<int_amdgcn_mfma_i32_16x16x16i8>; +def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x2bf16>; +def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x8bf16>; +def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x1f32>; +def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x2f32>; +def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x4f16>; +def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x8f16>; +def : SourceOfDivergence<int_amdgcn_mfma_i32_32x32x4i8>; +def : SourceOfDivergence<int_amdgcn_mfma_i32_32x32x8i8>; +def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x2bf16>; +def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x4bf16>; foreach intr = AMDGPUImageDimAtomicIntrinsics in def : SourceOfDivergence<intr>; diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index ed0cc70c3d9a..1eb9b83456c5 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -1,9 +1,8 @@ //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -41,12 +40,17 @@ using namespace llvm; #undef AMDGPUSubtarget #include "R600GenSubtargetInfo.inc" +static cl::opt<bool> DisablePowerSched( + "amdgpu-disable-power-sched", + cl::desc("Disable scheduling to minimize mAI power bursts"), + cl::init(false)); + GCNSubtarget::~GCNSubtarget() = default; R600Subtarget & R600Subtarget::initializeSubtargetDependencies(const Triple &TT, StringRef GPU, StringRef FS) { - SmallString<256> FullFS("+promote-alloca,+dx10-clamp,"); + SmallString<256> FullFS("+promote-alloca,"); FullFS += FS; ParseSubtargetFeatures(GPU, FullFS); @@ -65,7 +69,7 @@ R600Subtarget::initializeSubtargetDependencies(const Triple &TT, GCNSubtarget & GCNSubtarget::initializeSubtargetDependencies(const Triple &TT, - StringRef GPU, StringRef FS) { + StringRef GPU, StringRef FS) { // Determine default and user-specified characteristics // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be // enabled, but some instructions do not respect them and they run at the @@ -78,10 +82,11 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT, // Similarly we want enable-prt-strict-null to be on by default and not to // unset everything else if it is disabled - SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,"); + // Assuming ECC is enabled is the conservative default. + SmallString<256> FullFS("+promote-alloca,+load-store-opt,+sram-ecc,+xnack,"); if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA. - FullFS += "+flat-address-space,+flat-for-global,+unaligned-buffer-access,+trap-handler,"; + FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,"; // FIXME: I don't think Evergreen has any useful support for // denormals, but should be checked. Should we issue a warning somewhere @@ -94,6 +99,16 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT, FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS + // Disable mutually exclusive bits. + if (FS.find_lower("+wavefrontsize") != StringRef::npos) { + if (FS.find_lower("wavefrontsize16") == StringRef::npos) + FullFS += "-wavefrontsize16,"; + if (FS.find_lower("wavefrontsize32") == StringRef::npos) + FullFS += "-wavefrontsize32,"; + if (FS.find_lower("wavefrontsize64") == StringRef::npos) + FullFS += "-wavefrontsize64,"; + } + FullFS += FS; ParseSubtargetFeatures(GPU, FullFS); @@ -124,8 +139,25 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT, HasMovrel = true; } + // Don't crash on invalid devices. + if (WavefrontSize == 0) + WavefrontSize = 64; + HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS; + if (DoesNotSupportXNACK && EnableXNACK) { + ToggleFeature(AMDGPU::FeatureXNACK); + EnableXNACK = false; + } + + // ECC is on by default, but turn it off if the hardware doesn't support it + // anyway. This matters for the gfx9 targets that have d16 loads but don't + // support ECC. + if (DoesNotSupportSRAMECC && EnableSRAMECC) { + ToggleFeature(AMDGPU::FeatureSRAMECC); + EnableSRAMECC = false; + } + return *this; } @@ -152,8 +184,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, AMDGPUGenSubtargetInfo(TT, GPU, FS), AMDGPUSubtarget(TT), TargetTriple(TT), - Gen(SOUTHERN_ISLANDS), - IsaVersion(ISAVersion0_0_0), + Gen(TT.getOS() == Triple::AMDHSA ?
SEA_ISLANDS : SOUTHERN_ISLANDS), InstrItins(getInstrItineraryForCPU(GPU)), LDSBankCount(0), MaxPrivateElementSize(0), @@ -162,7 +193,6 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, HalfRate64Ops(false), FP64FP16Denormals(false), - DX10Clamp(false), FlatForGlobal(false), AutoWaitcntBeforeBarrier(false), CodeObjectV3(false), @@ -171,11 +201,10 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, HasApertureRegs(false), EnableXNACK(false), + DoesNotSupportXNACK(false), + EnableCuMode(false), TrapHandler(false), - DebuggerInsertNops(false), - DebuggerEmitPrologue(false), - EnableHugePrivateBuffer(false), EnableLoadStoreOpt(false), EnableUnsafeDSOffsetFolding(false), EnableSIScheduler(false), @@ -186,8 +215,10 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, FP64(false), GCN3Encoding(false), CIInsts(false), - VIInsts(false), + GFX8Insts(false), GFX9Insts(false), + GFX10Insts(false), + GFX7GFX8GFX9Insts(false), SGPRInitBug(false), HasSMemRealTime(false), HasIntClamp(false), @@ -202,19 +233,47 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, HasSDWAMac(false), HasSDWAOutModsVOPC(false), HasDPP(false), + HasDPP8(false), HasR128A16(false), + HasNSAEncoding(false), HasDLInsts(false), - HasDotInsts(false), + HasDot1Insts(false), + HasDot2Insts(false), + HasDot3Insts(false), + HasDot4Insts(false), + HasDot5Insts(false), + HasDot6Insts(false), + HasMAIInsts(false), + HasPkFmacF16Inst(false), + HasAtomicFaddInsts(false), EnableSRAMECC(false), + DoesNotSupportSRAMECC(false), + HasNoSdstCMPX(false), + HasVscnt(false), + HasRegisterBanking(false), + HasVOP3Literal(false), + HasNoDataDepHazard(false), FlatAddressSpace(false), FlatInstOffsets(false), FlatGlobalInsts(false), FlatScratchInsts(false), + ScalarFlatScratchInsts(false), AddNoCarryInsts(false), HasUnpackedD16VMem(false), + LDSMisalignedBug(false), ScalarizeGlobal(false), + HasVcmpxPermlaneHazard(false), + HasVMEMtoScalarWriteHazard(false), + HasSMEMtoVectorWriteHazard(false), + HasInstFwdPrefetchBug(false), + HasVcmpxExecWARHazard(false), + HasLdsBranchVmemWARHazard(false), + HasNSAtoVMEMBug(false), + HasOffset3fBug(false), + HasFlatSegmentOffsetBug(false), + FeatureDisable(false), InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)), TLInfo(TM, *this), @@ -226,12 +285,34 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM)); } +unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const { + if (getGeneration() < GFX10) + return 1; + + switch (Opcode) { + case AMDGPU::V_LSHLREV_B64: + case AMDGPU::V_LSHLREV_B64_gfx10: + case AMDGPU::V_LSHL_B64: + case AMDGPU::V_LSHRREV_B64: + case AMDGPU::V_LSHRREV_B64_gfx10: + case AMDGPU::V_LSHR_B64: + case AMDGPU::V_ASHRREV_I64: + case AMDGPU::V_ASHRREV_I64_gfx10: + case AMDGPU::V_ASHR_I64: + return 1; + } + + return 2; +} + unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves, const Function &F) const { if (NWaves == 1) return getLocalMemorySize(); unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second; unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize); + if (!WorkGroupsPerCu) + return 0; unsigned MaxWaves = getMaxWavesPerEU(); return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves; } @@ -240,6 +321,8 @@ unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &F) const { unsigned WorkGroupSize = 
getFlatWorkGroupSizes(F).second; unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize); + if (!WorkGroupsPerCu) + return 0; unsigned MaxWaves = getMaxWavesPerEU(); unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu; unsigned NumWaves = Limit / (Bytes ? Bytes : 1u); @@ -260,7 +343,8 @@ AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const { case CallingConv::AMDGPU_CS: case CallingConv::AMDGPU_KERNEL: case CallingConv::SPIR_KERNEL: - return std::make_pair(getWavefrontSize() * 2, getWavefrontSize() * 4); + return std::make_pair(getWavefrontSize() * 2, + std::max(getWavefrontSize() * 4, 256u)); case CallingConv::AMDGPU_VS: case CallingConv::AMDGPU_LS: case CallingConv::AMDGPU_HS: @@ -280,12 +364,6 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes( std::pair<unsigned, unsigned> Default = getDefaultFlatWorkGroupSize(F.getCallingConv()); - // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa - // starts using "amdgpu-flat-work-group-size" attribute. - Default.second = AMDGPU::getIntegerAttribute( - F, "amdgpu-max-work-group-size", Default.second); - Default.first = std::min(Default.first, Default.second); - // Requested minimum/maximum flat work group sizes. std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute( F, "amdgpu-flat-work-group-size", Default); @@ -319,10 +397,7 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU( getMaxWavesPerEU(FlatWorkGroupSizes.second); bool RequestedFlatWorkGroupSize = false; - // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa - // starts using "amdgpu-flat-work-group-size" attribute. - if (F.hasFnAttribute("amdgpu-max-work-group-size") || - F.hasFnAttribute("amdgpu-flat-work-group-size")) { + if (F.hasFnAttribute("amdgpu-flat-work-group-size")) { Default.first = MinImpliedByFlatWorkGroupSize; RequestedFlatWorkGroupSize = true; } @@ -460,7 +535,6 @@ R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS, FMA(false), CaymanISA(false), CFALUBug(false), - DX10Clamp(false), HasVertexCache(false), R600ALUInst(false), FP64(false), @@ -486,7 +560,14 @@ void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, Policy.ShouldTrackLaneMasks = true; } +bool GCNSubtarget::hasMadF16() const { + return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1; +} + unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const { + if (getGeneration() >= AMDGPUSubtarget::GFX10) + return 10; + if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { if (SGPRs <= 80) return 10; @@ -533,6 +614,9 @@ unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const { unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const { const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); + if (getGeneration() >= AMDGPUSubtarget::GFX10) + return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs. + if (MFI.hasFlatScratchInit()) { if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) return 6; // FLAT_SCRATCH, XNACK, VCC (in that order). @@ -631,9 +715,7 @@ struct MemOpClusterMutation : ScheduleDAGMutation { MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {} - void apply(ScheduleDAGInstrs *DAGInstrs) override { - ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs); - + void apply(ScheduleDAGInstrs *DAG) override { SUnit *SUa = nullptr; // Search for two consecutive memory operations and link them // to prevent scheduler from moving them apart.
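The divide-by-zero guards added above to getMaxLocalMemSizeWithWaveCount and getOccupancyWithLocalMemSize are easy to sanity-check in isolation. A standalone sketch of the same arithmetic; the 64 KiB LDS size and 10 waves per EU are illustrative stand-ins, not values read from a real subtarget:

#include <cstdio>

// Standalone sketch of the LDS-budget arithmetic in
// getMaxLocalMemSizeWithWaveCount, including the new zero guard.
static unsigned maxLocalMemSize(unsigned NWaves, unsigned WorkGroupsPerCu,
                                unsigned LocalMemorySize = 65536,
                                unsigned MaxWaves = 10) {
  if (NWaves == 1)
    return LocalMemorySize;
  if (!WorkGroupsPerCu) // guard added by this patch: avoid dividing by zero
    return 0;
  return LocalMemorySize * MaxWaves / WorkGroupsPerCu / NWaves;
}

int main() {
  printf("%u\n", maxLocalMemSize(4, 5)); // 65536 * 10 / 5 / 4 = 32768
  printf("%u\n", maxLocalMemSize(4, 0)); // 0; previously a division by zero
}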
@@ -674,11 +756,130 @@ struct MemOpClusterMutation : ScheduleDAGMutation { } } }; + +struct FillMFMAShadowMutation : ScheduleDAGMutation { + const SIInstrInfo *TII; + + ScheduleDAGMI *DAG; + + FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {} + + bool isSALU(const SUnit *SU) const { + const MachineInstr *MI = SU->getInstr(); + return MI && TII->isSALU(*MI) && !MI->isTerminator(); + } + + bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const { + if (Pred->NodeNum < Succ->NodeNum) + return true; + + SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred}); + + for (unsigned I = 0; I < Succs.size(); ++I) { + for (const SDep &SI : Succs[I]->Succs) { + const SUnit *SU = SI.getSUnit(); + if (SU != Succs[I] && llvm::find(Succs, SU) == Succs.end()) + Succs.push_back(SU); + } + } + + SmallPtrSet<const SUnit*, 32> Visited; + while (!Preds.empty()) { + const SUnit *SU = Preds.pop_back_val(); + if (llvm::find(Succs, SU) != Succs.end()) + return false; + Visited.insert(SU); + for (const SDep &SI : SU->Preds) + if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit())) + Preds.push_back(SI.getSUnit()); + } + + return true; + } + + // Link as many SALU instructions in a chain as possible. Return the size + // of the chain. Links up to MaxChain instructions. + unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain, + SmallPtrSetImpl<SUnit *> &Visited) const { + SmallVector<SUnit *, 8> Worklist({To}); + unsigned Linked = 0; + + while (!Worklist.empty() && MaxChain-- > 0) { + SUnit *SU = Worklist.pop_back_val(); + if (!Visited.insert(SU).second) + continue; + + LLVM_DEBUG(dbgs() << "Inserting edge from\n" ; DAG->dumpNode(*From); + dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n'); + + if (SU->addPred(SDep(From, SDep::Artificial), false)) + ++Linked; + + for (SDep &SI : From->Succs) { + SUnit *SUv = SI.getSUnit(); + if (SUv != From && TII->isVALU(*SUv->getInstr()) && canAddEdge(SUv, SU)) + SUv->addPred(SDep(SU, SDep::Artificial), false); + } + + for (SDep &SI : SU->Succs) { + SUnit *Succ = SI.getSUnit(); + if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ)) + Worklist.push_back(Succ); + } + } + + return Linked; + } + + void apply(ScheduleDAGInstrs *DAGInstrs) override { + const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>(); + if (!ST.hasMAIInsts() || DisablePowerSched) + return; + DAG = static_cast<ScheduleDAGMI*>(DAGInstrs); + const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel(); + if (!TSchedModel || DAG->SUnits.empty()) + return; + + // Scan for MFMA long latency instructions and try to add a dependency + // of available SALU instructions to give them a chance to fill MFMA + // shadow. That is desirable to fill MFMA shadow with SALU instructions + // rather than VALU to prevent power consumption bursts and throttling. + auto LastSALU = DAG->SUnits.begin(); + auto E = DAG->SUnits.end(); + SmallPtrSet<SUnit*, 32> Visited; + for (SUnit &SU : DAG->SUnits) { + MachineInstr &MAI = *SU.getInstr(); + if (!TII->isMAI(MAI) || + MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32 || + MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32) + continue; + + unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1; + + LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU); + dbgs() << "Need " << Lat + << " instructions to cover latency.\n"); + + // Find up to Lat independent scalar instructions as early as + // possible such that they can be scheduled after this MFMA.
+ for ( ; Lat && LastSALU != E; ++LastSALU) { + if (Visited.count(&*LastSALU)) + continue; + + if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU)) + continue; + + Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited); + } + } + } +}; } // namespace void GCNSubtarget::getPostRAMutations( std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const { Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo)); + Mutations.push_back(llvm::make_unique<FillMFMAShadowMutation>(&InstrInfo)); } const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) { diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h index 5584759e5580..78c3b823946d 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -1,9 +1,8 @@ //=====-- AMDGPUSubtarget.h - Define Subtarget for AMDGPU ------*- C++ -*-====// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //==-----------------------------------------------------------------------===// // @@ -56,7 +55,8 @@ public: SOUTHERN_ISLANDS = 4, SEA_ISLANDS = 5, VOLCANIC_ISLANDS = 6, - GFX9 = 7 + GFX9 = 7, + GFX10 = 8 }; private: @@ -246,26 +246,6 @@ public: class GCNSubtarget : public AMDGPUGenSubtargetInfo, public AMDGPUSubtarget { public: - enum { - ISAVersion0_0_0, - ISAVersion6_0_0, - ISAVersion6_0_1, - ISAVersion7_0_0, - ISAVersion7_0_1, - ISAVersion7_0_2, - ISAVersion7_0_3, - ISAVersion7_0_4, - ISAVersion8_0_1, - ISAVersion8_0_2, - ISAVersion8_0_3, - ISAVersion8_1_0, - ISAVersion9_0_0, - ISAVersion9_0_2, - ISAVersion9_0_4, - ISAVersion9_0_6, - ISAVersion9_0_9, - }; - enum TrapHandlerAbi { TrapHandlerAbiNone = 0, TrapHandlerAbiHsa = 1 @@ -297,7 +277,6 @@ protected: // Basic subtarget description. Triple TargetTriple; unsigned Gen; - unsigned IsaVersion; InstrItineraryData InstrItins; int LDSBankCount; unsigned MaxPrivateElementSize; @@ -308,7 +287,6 @@ protected: // Dynamically set bits that enable features. bool FP64FP16Denormals; - bool DX10Clamp; bool FlatForGlobal; bool AutoWaitcntBeforeBarrier; bool CodeObjectV3; @@ -316,12 +294,11 @@ protected: bool UnalignedBufferAccess; bool HasApertureRegs; bool EnableXNACK; + bool DoesNotSupportXNACK; + bool EnableCuMode; bool TrapHandler; - bool DebuggerInsertNops; - bool DebuggerEmitPrologue; // Used as options.
- bool EnableHugePrivateBuffer; bool EnableLoadStoreOpt; bool EnableUnsafeDSOffsetFolding; bool EnableSIScheduler; @@ -336,8 +313,10 @@ protected: bool IsGCN; bool GCN3Encoding; bool CIInsts; - bool VIInsts; + bool GFX8Insts; bool GFX9Insts; + bool GFX10Insts; + bool GFX7GFX8GFX9Insts; bool SGPRInitBug; bool HasSMemRealTime; bool HasIntClamp; @@ -352,23 +331,51 @@ protected: bool HasSDWAMac; bool HasSDWAOutModsVOPC; bool HasDPP; + bool HasDPP8; bool HasR128A16; + bool HasNSAEncoding; bool HasDLInsts; - bool HasDotInsts; + bool HasDot1Insts; + bool HasDot2Insts; + bool HasDot3Insts; + bool HasDot4Insts; + bool HasDot5Insts; + bool HasDot6Insts; + bool HasMAIInsts; + bool HasPkFmacF16Inst; + bool HasAtomicFaddInsts; bool EnableSRAMECC; + bool DoesNotSupportSRAMECC; + bool HasNoSdstCMPX; + bool HasVscnt; + bool HasRegisterBanking; + bool HasVOP3Literal; + bool HasNoDataDepHazard; bool FlatAddressSpace; bool FlatInstOffsets; bool FlatGlobalInsts; bool FlatScratchInsts; + bool ScalarFlatScratchInsts; bool AddNoCarryInsts; bool HasUnpackedD16VMem; bool R600ALUInst; bool CaymanISA; bool CFALUBug; + bool LDSMisalignedBug; bool HasVertexCache; short TexVTXClauseSize; bool ScalarizeGlobal; + bool HasVcmpxPermlaneHazard; + bool HasVMEMtoScalarWriteHazard; + bool HasSMEMtoVectorWriteHazard; + bool HasInstFwdPrefetchBug; + bool HasVcmpxExecWARHazard; + bool HasLdsBranchVmemWARHazard; + bool HasNSAtoVMEMBug; + bool HasOffset3fBug; + bool HasFlatSegmentOffsetBug; + // Dummy feature to use for assembler in tablegen. bool FeatureDisable; @@ -378,6 +385,9 @@ private: SITargetLowering TLInfo; SIFrameLowering FrameLowering; + // See COMPUTE_TMPRING_SIZE.WAVESIZE, 13-bit field in units of 256-dword. + static const unsigned MaxWaveScratchSize = (256 * 4) * ((1 << 13) - 1); + public: GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, const GCNTargetMachine &TM); @@ -437,6 +447,11 @@ public: return Log2_32(WavefrontSize); } + /// Return the number of high bits known to be zero for a frame index. + unsigned getKnownHighZeroBitsForFrameIndex() const { + return countLeadingZeros(MaxWaveScratchSize) + getWavefrontSizeLog2(); + } + int getLDSBankCount() const { return LDSBankCount; } @@ -445,6 +460,8 @@ public: return MaxPrivateElementSize; } + unsigned getConstantBusLimit(unsigned Opcode) const; + bool hasIntClamp() const { return HasIntClamp; } @@ -473,6 +490,12 @@ public: return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS); } + // Return true if the target only has the reverse operand versions of VALU + // shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32). + bool hasOnlyRevVALUShifts() const { + return getGeneration() >= VOLCANIC_ISLANDS; + } + bool hasBFE() const { return true; } @@ -525,14 +548,48 @@ public: return isAmdHsaOS() ? TrapHandlerAbiHsa : TrapHandlerAbiNone; } - bool enableHugePrivateBuffer() const { - return EnableHugePrivateBuffer; + /// True if the offset field of DS instructions works as expected. On SI, the + /// offset uses a 16-bit adder and does not always wrap properly. + bool hasUsableDSOffset() const { + return getGeneration() >= SEA_ISLANDS; } bool unsafeDSOffsetFoldingEnabled() const { return EnableUnsafeDSOffsetFolding; } + /// Condition output from div_scale is usable. + bool hasUsableDivScaleConditionOutput() const { + return getGeneration() != SOUTHERN_ISLANDS; + } + + /// Extra wait hazard is needed in some cases before + /// s_cbranch_vccnz/s_cbranch_vccz.
+ bool hasReadVCCZBug() const { + return getGeneration() <= SEA_ISLANDS; + } + + /// A read of an SGPR by an SMRD instruction requires 4 wait states when the + /// SGPR was written by a VALU instruction. + bool hasSMRDReadVALUDefHazard() const { + return getGeneration() == SOUTHERN_ISLANDS; + } + + /// A read of an SGPR by a VMEM instruction requires 5 wait states when the + /// SGPR was written by a VALU instruction. + bool hasVMEMReadSGPRVALUDefHazard() const { + return getGeneration() >= VOLCANIC_ISLANDS; + } + + bool hasRFEHazards() const { + return getGeneration() >= VOLCANIC_ISLANDS; + } + + /// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32. + unsigned getSetRegWaitStates() const { + return getGeneration() <= SEA_ISLANDS ? 1 : 2; + } + bool dumpCode() const { return DumpCode; } @@ -554,14 +611,6 @@ public: return getGeneration() >= AMDGPUSubtarget::GFX9; } - bool enableDX10Clamp() const { - return DX10Clamp; - } - - bool enableIEEEBit(const MachineFunction &MF) const { - return AMDGPU::isCompute(MF.getFunction().getCallingConv()); - } - bool useFlatForGlobal() const { return FlatForGlobal; } @@ -572,6 +621,11 @@ public: return CIInsts && EnableDS128; } + /// Have v_trunc_f64, v_ceil_f64, v_rndne_f64 + bool haveRoundOpsF64() const { + return CIInsts; + } + /// \returns If MUBUF instructions always perform range checking, even for /// buffer resources used for private memory access. bool privateMemoryResourceIsRangeChecked() const { @@ -613,10 +667,18 @@ public: return EnableXNACK; } + bool isCuModeEnabled() const { + return EnableCuMode; + } + bool hasFlatAddressSpace() const { return FlatAddressSpace; } + bool hasFlatScrRegister() const { + return hasFlatAddressSpace(); + } + bool hasFlatInstOffsets() const { return FlatInstOffsets; } @@ -629,6 +691,14 @@ public: return FlatScratchInsts; } + bool hasScalarFlatScratchInsts() const { + return ScalarFlatScratchInsts; + } + + bool hasFlatSegmentOffsetBug() const { + return HasFlatSegmentOffsetBug; + } + bool hasFlatLgkmVMemCountInOrder() const { return getGeneration() > GFX9; } @@ -637,12 +707,34 @@ public: return getGeneration() >= GFX9; } + bool d16PreservesUnusedBits() const { + return hasD16LoadStore() && !isSRAMECCEnabled(); + } + + bool hasD16Images() const { + return getGeneration() >= VOLCANIC_ISLANDS; + } + /// Return if most LDS instructions have an m0 use that requires m0 to be /// initialized. bool ldsRequiresM0Init() const { return getGeneration() < GFX9; } + // True if the hardware rewinds and replays GWS operations if a wave is + // preempted. + // + // If this is false, a GWS operation requires testing if a nack set the + // MEM_VIOL bit, and repeating if so. + bool hasGWSAutoReplay() const { + return getGeneration() >= GFX9; + } + + /// \returns if target has ds_gws_sema_release_all instruction.
+ bool hasGWSSemaReleaseAll() const { + return CIInsts; + } + bool hasAddNoCarry() const { return AddNoCarryInsts; } @@ -680,22 +772,74 @@ public: return HasSDWAOutModsVOPC; } - bool vmemWriteNeedsExpWaitcnt() const { - return getGeneration() < SEA_ISLANDS; - } - bool hasDLInsts() const { return HasDLInsts; } - bool hasDotInsts() const { - return HasDotInsts; + bool hasDot1Insts() const { + return HasDot1Insts; + } + + bool hasDot2Insts() const { + return HasDot2Insts; + } + + bool hasDot3Insts() const { + return HasDot3Insts; + } + + bool hasDot4Insts() const { + return HasDot4Insts; + } + + bool hasDot5Insts() const { + return HasDot5Insts; + } + + bool hasDot6Insts() const { + return HasDot6Insts; + } + + bool hasMAIInsts() const { + return HasMAIInsts; + } + + bool hasPkFmacF16Inst() const { + return HasPkFmacF16Inst; + } + + bool hasAtomicFaddInsts() const { + return HasAtomicFaddInsts; } bool isSRAMECCEnabled() const { return EnableSRAMECC; } + bool hasNoSdstCMPX() const { + return HasNoSdstCMPX; + } + + bool hasVscnt() const { + return HasVscnt; + } + + bool hasRegisterBanking() const { + return HasRegisterBanking; + } + + bool hasVOP3Literal() const { + return HasVOP3Literal; + } + + bool hasNoDataDepHazard() const { + return HasNoDataDepHazard; + } + + bool vmemWriteNeedsExpWaitcnt() const { + return getGeneration() < SEA_ISLANDS; + } + // Scratch is allocated in 256 dword per wave blocks for the entire // wavefront. When viewed from the perspective of an arbitrary workitem, this // is 4-byte aligned. @@ -792,29 +936,34 @@ public: return HasScalarAtomics; } + bool hasLDSFPAtomics() const { + return GFX8Insts; + } bool hasDPP() const { return HasDPP; } + bool hasDPP8() const { + return HasDPP8; + } + bool hasR128A16() const { return HasR128A16; } - bool enableSIScheduler() const { - return EnableSIScheduler; + bool hasOffset3fBug() const { + return HasOffset3fBug; } - bool debuggerSupported() const { - return debuggerInsertNops() && debuggerEmitPrologue(); + bool hasNSAEncoding() const { + return HasNSAEncoding; } - bool debuggerInsertNops() const { - return DebuggerInsertNops; - } + bool hasMadF16() const; - bool debuggerEmitPrologue() const { - return DebuggerEmitPrologue; + bool enableSIScheduler() const { + return EnableSIScheduler; } bool loadStoreOptEnabled() const { @@ -835,15 +984,48 @@ public: } bool hasSMovFedHazard() const { - return getGeneration() >= AMDGPUSubtarget::GFX9; + return getGeneration() == AMDGPUSubtarget::GFX9; } bool hasReadM0MovRelInterpHazard() const { - return getGeneration() >= AMDGPUSubtarget::GFX9; + return getGeneration() == AMDGPUSubtarget::GFX9; } bool hasReadM0SendMsgHazard() const { - return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS; + return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && + getGeneration() <= AMDGPUSubtarget::GFX9; + } + + bool hasVcmpxPermlaneHazard() const { + return HasVcmpxPermlaneHazard; + } + + bool hasVMEMtoScalarWriteHazard() const { + return HasVMEMtoScalarWriteHazard; + } + + bool hasSMEMtoVectorWriteHazard() const { + return HasSMEMtoVectorWriteHazard; + } + + bool hasLDSMisalignedBug() const { + return LDSMisalignedBug && !EnableCuMode; + } + + bool hasInstFwdPrefetchBug() const { + return HasInstFwdPrefetchBug; + } + + bool hasVcmpxExecWARHazard() const { + return HasVcmpxExecWARHazard; + } + + bool hasLdsBranchVmemWARHazard() const { + return HasLdsBranchVmemWARHazard; + } + + bool hasNSAtoVMEMBug() const { + return HasNSAtoVMEMBug; } /// Return the maximum number of waves per SIMD for kernels
using \p SGPRs @@ -957,6 +1139,14 @@ public: std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const override; + bool isWave32() const { + return WavefrontSize == 32; + } + + const TargetRegisterClass *getBoolRC() const { + return getRegisterInfo()->getBoolRC(); + } + /// \returns Maximum number of work groups per compute unit supported by the /// subtarget and limited by given \p FlatWorkGroupSize. unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override { @@ -994,7 +1184,6 @@ private: bool FMA; bool CaymanISA; bool CFALUBug; - bool DX10Clamp; bool HasVertexCache; bool R600ALUInst; bool FP64; diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index e8cefdbf74b9..0ea8db04c298 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -1,9 +1,8 @@ //===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -25,11 +24,14 @@ #include "GCNIterativeScheduler.h" #include "GCNSchedStrategy.h" #include "R600MachineScheduler.h" +#include "SIMachineFunctionInfo.h" #include "SIMachineScheduler.h" +#include "TargetInfo/AMDGPUTargetInfo.h" #include "llvm/CodeGen/GlobalISel/IRTranslator.h" #include "llvm/CodeGen/GlobalISel/InstructionSelect.h" #include "llvm/CodeGen/GlobalISel/Legalizer.h" #include "llvm/CodeGen/GlobalISel/RegBankSelect.h" +#include "llvm/CodeGen/MIRParser/MIParser.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Attributes.h" @@ -67,6 +69,11 @@ EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden, cl::desc("Run early if-conversion"), cl::init(false)); +static cl::opt<bool> +OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden, + cl::desc("Run pre-RA exec mask optimizations"), + cl::init(true)); + static cl::opt<bool> EnableR600IfConvert( "r600-if-convert", cl::desc("Use if conversion pass"), @@ -109,7 +116,7 @@ static cl::opt<bool> EnableSDWAPeephole( static cl::opt<bool> EnableDPPCombine( "amdgpu-dpp-combine", cl::desc("Enable DPP combiner"), - cl::init(false)); + cl::init(true)); // Enable address space based alias analysis static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden, @@ -123,11 +130,11 @@ static cl::opt<bool, true> LateCFGStructurize( cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG), cl::Hidden); -static cl::opt<bool, true> EnableAMDGPUFunctionCalls( +static cl::opt<bool, true> EnableAMDGPUFunctionCallsOpt( "amdgpu-function-calls", cl::desc("Enable AMDGPU function call support"), cl::location(AMDGPUTargetMachine::EnableFunctionCalls), - cl::init(false), + cl::init(true), cl::Hidden); // Enable lib calls simplifications @@ -143,6 +150,12 @@ static cl::opt<bool> EnableLowerKernelArguments( cl::init(true), cl::Hidden); +static cl::opt<bool> EnableRegReassign( + "amdgpu-reassign-regs", + cl::desc("Enable register reassign optimizations on gfx10+"), + cl::init(true), + cl::Hidden); + // Enable atomic optimization static cl::opt<bool> EnableAtomicOptimizations( "amdgpu-atomic-optimizations", @@ 
-157,6 +170,18 @@ static cl::opt<bool> EnableSIModeRegisterPass( cl::init(true), cl::Hidden); +// Option is used in lit tests to prevent deadcoding of patterns inspected. +static cl::opt<bool> +EnableDCEInRA("amdgpu-dce-in-ra", + cl::init(true), cl::Hidden, + cl::desc("Enable machine DCE inside regalloc")); + +static cl::opt<bool> EnableScalarIRPasses( + "amdgpu-scalar-ir-passes", + cl::desc("Enable scalar IR passes"), + cl::init(true), + cl::Hidden); + extern "C" void LLVMInitializeAMDGPUTarget() { // Register the target RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget()); @@ -172,6 +197,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() { initializeAMDGPUDAGToDAGISelPass(*PR); initializeGCNDPPCombinePass(*PR); initializeSILowerI1CopiesPass(*PR); + initializeSILowerSGPRSpillsPass(*PR); initializeSIFixSGPRCopiesPass(*PR); initializeSIFixVGPRCopiesPass(*PR); initializeSIFixupVectorISelPass(*PR); @@ -192,6 +218,8 @@ extern "C" void LLVMInitializeAMDGPUTarget() { initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR); initializeAMDGPUPromoteAllocaPass(*PR); initializeAMDGPUCodeGenPreparePass(*PR); + initializeAMDGPUPropagateAttributesEarlyPass(*PR); + initializeAMDGPUPropagateAttributesLatePass(*PR); initializeAMDGPURewriteOutArgumentsPass(*PR); initializeAMDGPUUnifyMetadataPass(*PR); initializeSIAnnotateControlFlowPass(*PR); @@ -201,9 +229,8 @@ extern "C" void LLVMInitializeAMDGPUTarget() { initializeSILowerControlFlowPass(*PR); initializeSIInsertSkipsPass(*PR); initializeSIMemoryLegalizerPass(*PR); - initializeSIDebuggerInsertNopsPass(*PR); initializeSIOptimizeExecMaskingPass(*PR); - initializeSIFixWWMLivenessPass(*PR); + initializeSIPreAllocateWWMRegsPass(*PR); initializeSIFormMemoryClausesPass(*PR); initializeAMDGPUUnifyDivergentExitNodesPass(*PR); initializeAMDGPUAAWrapperPassPass(*PR); @@ -211,6 +238,8 @@ extern "C" void LLVMInitializeAMDGPUTarget() { initializeAMDGPUUseNativeCallsPass(*PR); initializeAMDGPUSimplifyLibCallsPass(*PR); initializeAMDGPUInlinerPass(*PR); + initializeGCNRegBankReassignPass(*PR); + initializeGCNNSAReassignPass(*PR); } static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { @@ -295,10 +324,11 @@ static StringRef computeDataLayout(const Triple &TT) { } // 32-bit private, local, and region pointers. 64-bit global, constant and - // flat. + // flat, non-integral buffer fat pointers. return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32" "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" - "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"; + "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" + "-ni:7"; } LLVM_READNONE @@ -306,8 +336,9 @@ static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) { if (!GPU.empty()) return GPU; + // Need to default to a target with flat support for HSA. if (TT.getArch() == Triple::amdgcn) - return "generic"; + return TT.getOS() == Triple::AMDHSA ? 
"generic-hsa" : "generic"; return "r600"; } @@ -363,24 +394,25 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) { bool EnableOpt = getOptLevel() > CodeGenOpt::None; bool Internalize = InternalizeSymbols; - bool EarlyInline = EarlyInlineAll && EnableOpt && !EnableAMDGPUFunctionCalls; + bool EarlyInline = EarlyInlineAll && EnableOpt && !EnableFunctionCalls; bool AMDGPUAA = EnableAMDGPUAliasAnalysis && EnableOpt; bool LibCallSimplify = EnableLibCallSimplify && EnableOpt; - if (EnableAMDGPUFunctionCalls) { + if (EnableFunctionCalls) { delete Builder.Inliner; Builder.Inliner = createAMDGPUFunctionInliningPass(); } Builder.addExtension( PassManagerBuilder::EP_ModuleOptimizerEarly, - [Internalize, EarlyInline, AMDGPUAA](const PassManagerBuilder &, - legacy::PassManagerBase &PM) { + [Internalize, EarlyInline, AMDGPUAA, this](const PassManagerBuilder &, + legacy::PassManagerBase &PM) { if (AMDGPUAA) { PM.add(createAMDGPUAAWrapperPass()); PM.add(createAMDGPUExternalAAWrapperPass()); } PM.add(createAMDGPUUnifyMetadataPass()); + PM.add(createAMDGPUPropagateAttributesLatePass(this)); if (Internalize) { PM.add(createInternalizePass(mustPreserveGV)); PM.add(createGlobalDCEPass()); @@ -392,15 +424,16 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) { const auto &Opt = Options; Builder.addExtension( PassManagerBuilder::EP_EarlyAsPossible, - [AMDGPUAA, LibCallSimplify, &Opt](const PassManagerBuilder &, - legacy::PassManagerBase &PM) { + [AMDGPUAA, LibCallSimplify, &Opt, this](const PassManagerBuilder &, + legacy::PassManagerBase &PM) { if (AMDGPUAA) { PM.add(createAMDGPUAAWrapperPass()); PM.add(createAMDGPUExternalAAWrapperPass()); } + PM.add(llvm::createAMDGPUPropagateAttributesEarlyPass(this)); PM.add(llvm::createAMDGPUUseNativeCallsPass()); if (LibCallSimplify) - PM.add(llvm::createAMDGPUSimplifyLibCallsPass(Opt)); + PM.add(llvm::createAMDGPUSimplifyLibCallsPass(Opt, this)); }); Builder.addExtension( @@ -428,6 +461,11 @@ R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT, CodeGenOpt::Level OL, bool JIT) : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) { setRequiresStructuredCFG(true); + + // Override the default since calls aren't supported for r600. 
+ if (EnableFunctionCalls && + EnableAMDGPUFunctionCallsOpt.getNumOccurrences() == 0) + EnableFunctionCalls = false; } const R600Subtarget *R600TargetMachine::getSubtargetImpl( @@ -528,8 +566,14 @@ public: bool addPreISel() override; bool addInstSelector() override; bool addGCPasses() override; + + std::unique_ptr<CSEConfigBase> getCSEConfig() const override; }; +std::unique_ptr<CSEConfigBase> AMDGPUPassConfig::getCSEConfig() const { + return getStandardCSEConfigForOpt(TM->getOptLevel()); +} + class R600PassConfig final : public AMDGPUPassConfig { public: R600PassConfig(LLVMTargetMachine &TM, PassManagerBase &PM) @@ -572,9 +616,10 @@ public: bool addLegalizeMachineIR() override; bool addRegBankSelect() override; bool addGlobalInstructionSelect() override; - void addFastRegAlloc(FunctionPass *RegAllocPass) override; - void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override; + void addFastRegAlloc() override; + void addOptimizedRegAlloc() override; void addPreRegAlloc() override; + bool addPreRewrite() override; void addPostRegAlloc() override; void addPreSched2() override; void addPreEmitPass() override; @@ -614,12 +659,16 @@ void AMDGPUPassConfig::addIRPasses() { disablePass(&FuncletLayoutID); disablePass(&PatchableFunctionID); - addPass(createAtomicExpandPass()); - // This must occur before inlining, as the inliner will not look through // bitcast calls. addPass(createAMDGPUFixFunctionBitcastsPass()); + // A call to propagate attributes pass in the backend in case opt was not run. + addPass(createAMDGPUPropagateAttributesEarlyPass(&TM)); + + addPass(createAtomicExpandPass()); + + addPass(createAMDGPULowerIntrinsicsPass()); // Function calls are not supported, so make sure we inline everything. @@ -652,7 +701,8 @@ void AMDGPUPassConfig::addIRPasses() { if (EnableSROA) addPass(createSROAPass()); - addStraightLineScalarOptimizationPasses(); + if (EnableScalarIRPasses) + addStraightLineScalarOptimizationPasses(); if (EnableAMDGPUAliasAnalysis) { addPass(createAMDGPUAAWrapperPass()); @@ -678,15 +728,20 @@ void AMDGPUPassConfig::addIRPasses() { // %1 = shl %a, 2 // // but EarlyCSE can do neither of them. - if (getOptLevel() != CodeGenOpt::None) + if (getOptLevel() != CodeGenOpt::None && EnableScalarIRPasses) addEarlyCSEOrGVNPass(); } void AMDGPUPassConfig::addCodeGenPrepare() { + if (TM->getTargetTriple().getArch() == Triple::amdgcn) + addPass(createAMDGPUAnnotateKernelFeaturesPass()); + if (TM->getTargetTriple().getArch() == Triple::amdgcn && EnableLowerKernelArguments) addPass(createAMDGPULowerKernelArgumentsPass()); + addPass(&AMDGPUPerfHintAnalysisID); + TargetPassConfig::addCodeGenPrepare(); if (EnableLoadStoreVectorizer) @@ -700,7 +755,8 @@ bool AMDGPUPassConfig::addPreISel() { } bool AMDGPUPassConfig::addInstSelector() { - addPass(createAMDGPUISelDag(&getAMDGPUTargetMachine(), getOptLevel())); + // Defer the verifier until FinalizeISel. + addPass(createAMDGPUISelDag(&getAMDGPUTargetMachine(), getOptLevel()), false); return false; } @@ -770,7 +826,6 @@ bool GCNPassConfig::addPreISel() { // FIXME: We need to run a pass to propagate the attributes when calls are // supported. - addPass(createAMDGPUAnnotateKernelFeaturesPass()); // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit // regions formed by them. 
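Several changes in this file share one shape: a hidden, default-on cl::opt (EnableDCEInRA, EnableScalarIRPasses, OptExecMaskPreRA) that gates a pipeline tweak so lit tests can switch it off. A minimal sketch of that pattern against LLVM's CommandLine library follows; the flag name and description are ones this patch really adds, while the main() harness is only for illustration:

#include "llvm/Support/CommandLine.h"
using namespace llvm;

// Hidden, default-on escape hatch, as used for the regalloc DCE change.
static cl::opt<bool> EnableDCEInRA(
    "amdgpu-dce-in-ra", cl::init(true), cl::Hidden,
    cl::desc("Enable machine DCE inside regalloc"));

int main(int argc, char **argv) {
  cl::ParseCommandLineOptions(argc, argv);
  // Passing -amdgpu-dce-in-ra=0 flips the default, keeping otherwise dead
  // patterns alive so a test can inspect them.
  return EnableDCEInRA ? 0 : 1;
}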
@@ -783,6 +838,7 @@ bool GCNPassConfig::addPreISel() { if (!LateCFGStructurize) { addPass(createSIAnnotateControlFlowPass()); } + addPass(createLCSSAPass()); return false; } @@ -856,7 +912,7 @@ void GCNPassConfig::addPreRegAlloc() { addPass(createSIWholeQuadModePass()); } -void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) { +void GCNPassConfig::addFastRegAlloc() { // FIXME: We have to disable the verifier here because of PHIElimination + // TwoAddressInstructions disabling it. @@ -865,28 +921,40 @@ void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) { // SI_ELSE will introduce a copy of the tied operand source after the else. insertPass(&PHIEliminationID, &SILowerControlFlowID, false); - // This must be run after SILowerControlFlow, since it needs to use the - // machine-level CFG, but before register allocation. - insertPass(&SILowerControlFlowID, &SIFixWWMLivenessID, false); + // This must be run just after RegisterCoalescing. + insertPass(&RegisterCoalescerID, &SIPreAllocateWWMRegsID, false); - TargetPassConfig::addFastRegAlloc(RegAllocPass); + TargetPassConfig::addFastRegAlloc(); } -void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) { - insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID); - - insertPass(&SIOptimizeExecMaskingPreRAID, &SIFormMemoryClausesID); +void GCNPassConfig::addOptimizedRegAlloc() { + if (OptExecMaskPreRA) { + insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID); + insertPass(&SIOptimizeExecMaskingPreRAID, &SIFormMemoryClausesID); + } else { + insertPass(&MachineSchedulerID, &SIFormMemoryClausesID); + } // This must be run immediately after phi elimination and before // TwoAddressInstructions, otherwise the processing of the tied operand of // SI_ELSE will introduce a copy of the tied operand source after the else. insertPass(&PHIEliminationID, &SILowerControlFlowID, false); - // This must be run after SILowerControlFlow, since it needs to use the - // machine-level CFG, but before register allocation. - insertPass(&SILowerControlFlowID, &SIFixWWMLivenessID, false); + // This must be run just after RegisterCoalescing. + insertPass(&RegisterCoalescerID, &SIPreAllocateWWMRegsID, false); + + if (EnableDCEInRA) + insertPass(&RenameIndependentSubregsID, &DeadMachineInstructionElimID); - TargetPassConfig::addOptimizedRegAlloc(RegAllocPass); + TargetPassConfig::addOptimizedRegAlloc(); +} + +bool GCNPassConfig::addPreRewrite() { + if (EnableRegReassign) { + addPass(&GCNNSAReassignID); + addPass(&GCNRegBankReassignID); + } + return true; } void GCNPassConfig::addPostRegAlloc() { @@ -894,6 +962,9 @@ void GCNPassConfig::addPostRegAlloc() { if (getOptLevel() > CodeGenOpt::None) addPass(&SIOptimizeExecMaskingID); TargetPassConfig::addPostRegAlloc(); + + // Equivalent of PEI for SGPRs. 
+ addPass(&SILowerSGPRSpillsID); } void GCNPassConfig::addPreSched2() { } void GCNPassConfig::addPreEmitPass() { @@ -919,10 +990,164 @@ void GCNPassConfig::addPreEmitPass() { addPass(&PostRAHazardRecognizerID); addPass(&SIInsertSkipsPassID); - addPass(createSIDebuggerInsertNopsPass()); addPass(&BranchRelaxationPassID); } TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) { return new GCNPassConfig(*this, PM); } + +yaml::MachineFunctionInfo *GCNTargetMachine::createDefaultFuncInfoYAML() const { + return new yaml::SIMachineFunctionInfo(); +} + +yaml::MachineFunctionInfo * +GCNTargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const { + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + return new yaml::SIMachineFunctionInfo(*MFI, + *MF.getSubtarget().getRegisterInfo()); +} + +bool GCNTargetMachine::parseMachineFunctionInfo( + const yaml::MachineFunctionInfo &MFI_, PerFunctionMIParsingState &PFS, + SMDiagnostic &Error, SMRange &SourceRange) const { + const yaml::SIMachineFunctionInfo &YamlMFI = + reinterpret_cast<const yaml::SIMachineFunctionInfo &>(MFI_); + MachineFunction &MF = PFS.MF; + SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + + MFI->initializeBaseYamlFields(YamlMFI); + + auto parseRegister = [&](const yaml::StringValue &RegName, unsigned &RegVal) { + if (parseNamedRegisterReference(PFS, RegVal, RegName.Value, Error)) { + SourceRange = RegName.SourceRange; + return true; + } + + return false; + }; + + auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) { + // Create a diagnostic for the register string literal. + const MemoryBuffer &Buffer = + *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID()); + Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1, + RegName.Value.size(), SourceMgr::DK_Error, + "incorrect register class for field", RegName.Value, + None, None); + SourceRange = RegName.SourceRange; + return true; + }; + + if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) || + parseRegister(YamlMFI.ScratchWaveOffsetReg, MFI->ScratchWaveOffsetReg) || + parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) || + parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg)) + return true; + + if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG && + !AMDGPU::SReg_128RegClass.contains(MFI->ScratchRSrcReg)) { + return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg); + } + + if (MFI->ScratchWaveOffsetReg != AMDGPU::SCRATCH_WAVE_OFFSET_REG && + !AMDGPU::SGPR_32RegClass.contains(MFI->ScratchWaveOffsetReg)) { + return diagnoseRegisterClass(YamlMFI.ScratchWaveOffsetReg); + } + + if (MFI->FrameOffsetReg != AMDGPU::FP_REG && + !AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) { + return diagnoseRegisterClass(YamlMFI.FrameOffsetReg); + } + + if (MFI->StackPtrOffsetReg != AMDGPU::SP_REG && + !AMDGPU::SGPR_32RegClass.contains(MFI->StackPtrOffsetReg)) { + return diagnoseRegisterClass(YamlMFI.StackPtrOffsetReg); + } + + auto parseAndCheckArgument = [&](const Optional<yaml::SIArgument> &A, + const TargetRegisterClass &RC, + ArgDescriptor &Arg, unsigned UserSGPRs, + unsigned SystemSGPRs) { + // Skip parsing if it's not present.
+ if (!A) + return false; + + if (A->IsRegister) { + unsigned Reg; + if (parseNamedRegisterReference(PFS, Reg, A->RegisterName.Value, Error)) { + SourceRange = A->RegisterName.SourceRange; + return true; + } + if (!RC.contains(Reg)) + return diagnoseRegisterClass(A->RegisterName); + Arg = ArgDescriptor::createRegister(Reg); + } else + Arg = ArgDescriptor::createStack(A->StackOffset); + // Check and apply the optional mask. + if (A->Mask) + Arg = ArgDescriptor::createArg(Arg, A->Mask.getValue()); + + MFI->NumUserSGPRs += UserSGPRs; + MFI->NumSystemSGPRs += SystemSGPRs; + return false; + }; + + if (YamlMFI.ArgInfo && + (parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer, + AMDGPU::SReg_128RegClass, + MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) || + parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr, + AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchPtr, + 2, 0) || + parseAndCheckArgument(YamlMFI.ArgInfo->QueuePtr, AMDGPU::SReg_64RegClass, + MFI->ArgInfo.QueuePtr, 2, 0) || + parseAndCheckArgument(YamlMFI.ArgInfo->KernargSegmentPtr, + AMDGPU::SReg_64RegClass, + MFI->ArgInfo.KernargSegmentPtr, 2, 0) || + parseAndCheckArgument(YamlMFI.ArgInfo->DispatchID, + AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchID, + 2, 0) || + parseAndCheckArgument(YamlMFI.ArgInfo->FlatScratchInit, + AMDGPU::SReg_64RegClass, + MFI->ArgInfo.FlatScratchInit, 2, 0) || + parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentSize, + AMDGPU::SGPR_32RegClass, + MFI->ArgInfo.PrivateSegmentSize, 0, 0) || + parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX, + AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX, + 0, 1) || + parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDY, + AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDY, + 0, 1) || + parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDZ, + AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDZ, + 0, 1) || + parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupInfo, + AMDGPU::SGPR_32RegClass, + MFI->ArgInfo.WorkGroupInfo, 0, 1) || + parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentWaveByteOffset, + AMDGPU::SGPR_32RegClass, + MFI->ArgInfo.PrivateSegmentWaveByteOffset, 0, 1) || + parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitArgPtr, + AMDGPU::SReg_64RegClass, + MFI->ArgInfo.ImplicitArgPtr, 0, 0) || + parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitBufferPtr, + AMDGPU::SReg_64RegClass, + MFI->ArgInfo.ImplicitBufferPtr, 2, 0) || + parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDX, + AMDGPU::VGPR_32RegClass, + MFI->ArgInfo.WorkItemIDX, 0, 0) || + parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDY, + AMDGPU::VGPR_32RegClass, + MFI->ArgInfo.WorkItemIDY, 0, 0) || + parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDZ, + AMDGPU::VGPR_32RegClass, + MFI->ArgInfo.WorkItemIDZ, 0, 0))) + return true; + + MFI->Mode.IEEE = YamlMFI.Mode.IEEE; + MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp; + + return false; +} diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/lib/Target/AMDGPU/AMDGPUTargetMachine.h index 62fbe71d1902..70fa3961236f 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -1,9 +1,8 @@ //===-- AMDGPUTargetMachine.h - AMDGPU TargetMachine Interface --*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -15,7 +14,6 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETMACHINE_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETMACHINE_H -#include "AMDGPUIntrinsicInfo.h" #include "AMDGPUSubtarget.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/StringMap.h" @@ -95,7 +93,6 @@ public: class GCNTargetMachine final : public AMDGPUTargetMachine { private: - AMDGPUIntrinsicInfo IntrinsicInfo; mutable StringMap<std::unique_ptr<GCNSubtarget>> SubtargetMap; public: @@ -110,13 +107,17 @@ public: TargetTransformInfo getTargetTransformInfo(const Function &F) override; - const AMDGPUIntrinsicInfo *getIntrinsicInfo() const override { - return &IntrinsicInfo; - } - bool useIPRA() const override { return true; } + + yaml::MachineFunctionInfo *createDefaultFuncInfoYAML() const override; + yaml::MachineFunctionInfo * + convertFuncInfoToYAML(const MachineFunction &MF) const override; + bool parseMachineFunctionInfo(const yaml::MachineFunctionInfo &, + PerFunctionMIParsingState &PFS, + SMDiagnostic &Error, + SMRange &SourceRange) const override; }; } // end namespace llvm diff --git a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp index c4e1efde130b..6569980d2c75 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp @@ -1,9 +1,8 @@ //===-- AMDGPUHSATargetObjectFile.cpp - AMDGPU Object Files ---------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h index a4ae1a2c18c2..819bebb7932d 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h +++ b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h @@ -1,9 +1,8 @@ //===-- AMDGPUTargetObjectFile.h - AMDGPU Object Info ----*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 11e4ba4b5010..aaed280a1270 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -1,9 +1,8 @@ //===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -118,8 +117,10 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, // Add a small bonus for each of such "if" statements. if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) { if (UP.Threshold < MaxBoost && Br->isConditional()) { - if (L->isLoopExiting(Br->getSuccessor(0)) || - L->isLoopExiting(Br->getSuccessor(1))) + BasicBlock *Succ0 = Br->getSuccessor(0); + BasicBlock *Succ1 = Br->getSuccessor(1); + if ((L->contains(Succ0) && L->isLoopExiting(Succ0)) || + (L->contains(Succ1) && L->isLoopExiting(Succ1))) continue; if (dependsOnLocalPhi(L, Br->getCondition())) { UP.Threshold += UnrollThresholdIf; @@ -141,7 +142,7 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, unsigned Threshold = 0; if (AS == AMDGPUAS::PRIVATE_ADDRESS) Threshold = ThresholdPrivate; - else if (AS == AMDGPUAS::LOCAL_ADDRESS) + else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) Threshold = ThresholdLocal; else continue; @@ -159,7 +160,8 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0; if (AllocaSize > MaxAlloca) continue; - } else if (AS == AMDGPUAS::LOCAL_ADDRESS) { + } else if (AS == AMDGPUAS::LOCAL_ADDRESS || + AS == AMDGPUAS::REGION_ADDRESS) { LocalGEPsSeen++; // Inhibit unroll for local memory if we have seen addressing not to // a variable, most likely we will be unable to combine it. @@ -254,7 +256,8 @@ unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize, unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const { if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS || AddrSpace == AMDGPUAS::CONSTANT_ADDRESS || - AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { + AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT || + AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER) { return 512; } @@ -308,6 +311,8 @@ bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, switch (Inst->getIntrinsicID()) { case Intrinsic::amdgcn_atomic_inc: case Intrinsic::amdgcn_atomic_dec: + case Intrinsic::amdgcn_ds_ordered_add: + case Intrinsic::amdgcn_ds_ordered_swap: case Intrinsic::amdgcn_ds_fadd: case Intrinsic::amdgcn_ds_fmin: case Intrinsic::amdgcn_ds_fmax: { @@ -399,7 +404,7 @@ int GCNTTIImpl::getArithmeticInstrCost( if (SLT == MVT::f64) { int Cost = 4 * get64BitInstrCost() + 7 * getQuarterRateInstrCost(); // Add cost of workaround. 
- if (ST->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) + if (!ST->hasUsableDivScaleConditionOutput()) Cost += 3 * getFullRateInstrCost(); return LT.first * Cost * NElts; @@ -577,6 +582,8 @@ bool GCNTTIImpl::isAlwaysUniform(const Value *V) const { return false; case Intrinsic::amdgcn_readfirstlane: case Intrinsic::amdgcn_readlane: + case Intrinsic::amdgcn_icmp: + case Intrinsic::amdgcn_fcmp: return true; } } @@ -607,7 +614,7 @@ unsigned GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, } bool GCNTTIImpl::areInlineCompatible(const Function *Caller, - const Function *Callee) const { + const Function *Callee) const { const TargetMachine &TM = getTLI()->getTargetMachine(); const FeatureBitset &CallerBits = TM.getSubtargetImpl(*Caller)->getFeatureBits(); @@ -616,7 +623,14 @@ bool GCNTTIImpl::areInlineCompatible(const Function *Caller, FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList; FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList; - return ((RealCallerBits & RealCalleeBits) == RealCalleeBits); + if ((RealCallerBits & RealCalleeBits) != RealCalleeBits) + return false; + + // FIXME: dx10_clamp can just take the caller setting, but there seems to be + // no way to support merge for backend defined attributes. + AMDGPU::SIModeRegisterDefaults CallerMode(*Caller); + AMDGPU::SIModeRegisterDefaults CalleeMode(*Callee); + return CallerMode.isInlineCompatible(CalleeMode); } void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index 397c5c6fa6fb..6f1bf5a26f0d 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -1,9 +1,8 @@ //===- AMDGPUTargetTransformInfo.h - AMDGPU specific TTI --------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -78,13 +77,16 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> { AMDGPU::FeatureUnalignedScratchAccess, AMDGPU::FeatureAutoWaitcntBeforeBarrier, - AMDGPU::FeatureDebuggerEmitPrologue, - AMDGPU::FeatureDebuggerInsertNops, // Property of the kernel/environment which can't actually differ. AMDGPU::FeatureSGPRInitBug, AMDGPU::FeatureXNACK, AMDGPU::FeatureTrapHandler, + AMDGPU::FeatureCodeObjectV3, + + // The default assumption needs to be ecc is enabled, but no directly + // exposed operations depend on it, so it can be safely inlined. + AMDGPU::FeatureSRAMECC, // Perf-tuning features AMDGPU::FeatureFastFMAF32, @@ -178,8 +180,7 @@ public: // don't use flat addressing. if (IsGraphicsShader) return -1; - return ST->hasFlatAddressSpace() ? 
- AMDGPUAS::FLAT_ADDRESS : AMDGPUAS::UNKNOWN_ADDRESS_SPACE; + return AMDGPUAS::FLAT_ADDRESS; } unsigned getVectorSplitCost() { return 0; } @@ -190,7 +191,9 @@ public: bool areInlineCompatible(const Function *Caller, const Function *Callee) const; - unsigned getInliningThresholdMultiplier() { return 9; } + unsigned getInliningThresholdMultiplier() { return 7; } + + int getInlinerVectorBonusPercent() { return 0; } int getArithmeticReductionCost(unsigned Opcode, Type *Ty, diff --git a/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp index ced3f6f567e2..396e0ed2e76c 100644 --- a/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp +++ b/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp @@ -1,9 +1,8 @@ //===- AMDGPUUnifyDivergentExitNodes.cpp ----------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -199,14 +198,11 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) { BranchInst::Create(LoopHeaderBB, DummyReturnBB, BoolTrue, BB); } else { // Conditional branch. // Create a new transition block to hold the conditional branch. - BasicBlock *TransitionBB = BasicBlock::Create(F.getContext(), - "TransitionBlock", &F); - - // Move BI from BB to the new transition block. - BI->removeFromParent(); - TransitionBB->getInstList().push_back(BI); + BasicBlock *TransitionBB = BB->splitBasicBlock(BI, "TransitionBlock"); - // Create a branch that will always branch to the transition block. + // Create a branch that will always branch to the transition block and + // references DummyReturnBB. + BB->getTerminator()->eraseFromParent(); BranchInst::Create(TransitionBB, DummyReturnBB, BoolTrue, BB); } } diff --git a/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp b/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp index 1f6d9234c1ed..d4401a22a1ad 100644 --- a/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp +++ b/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp @@ -1,9 +1,8 @@ //===- AMDGPUUnifyMetadata.cpp - Unify OpenCL metadata --------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp index 11cd49e5b3dc..12f2e9519c9e 100644 --- a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp +++ b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp @@ -1,9 +1,8 @@ //===- AMDILCFGStructurizer.cpp - CFG Structurizer ------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //==-----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/AMDKernelCodeT.h b/lib/Target/AMDGPU/AMDKernelCodeT.h index 289642aaa2d0..3e658a144c1f 100644 --- a/lib/Target/AMDGPU/AMDKernelCodeT.h +++ b/lib/Target/AMDGPU/AMDKernelCodeT.h @@ -1,9 +1,8 @@ //===-- AMDGPUKernelCodeT.h - Print AMDGPU assembly code ---------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file AMDKernelCodeT.h @@ -127,8 +126,12 @@ enum amd_code_property_mask_t { AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH = 1, AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT, - AMD_CODE_PROPERTY_RESERVED1_SHIFT = 10, - AMD_CODE_PROPERTY_RESERVED1_WIDTH = 6, + AMD_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32_SHIFT = 10, + AMD_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32 = ((1 << AMD_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32_SHIFT, + + AMD_CODE_PROPERTY_RESERVED1_SHIFT = 11, + AMD_CODE_PROPERTY_RESERVED1_WIDTH = 5, AMD_CODE_PROPERTY_RESERVED1 = ((1 << AMD_CODE_PROPERTY_RESERVED1_WIDTH) - 1) << AMD_CODE_PROPERTY_RESERVED1_SHIFT, /// Control wave ID base counter for GDS ordered-append. Used to set diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 3f9af27a2e5e..6d678966c98e 100644 --- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -1,9 +1,8 @@ //===- AMDGPUAsmParser.cpp - Parse SI asm to MCInst instructions ----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -13,6 +12,7 @@ #include "MCTargetDesc/AMDGPUTargetStreamer.h" #include "SIDefines.h" #include "SIInstrInfo.h" +#include "TargetInfo/AMDGPUTargetInfo.h" #include "Utils/AMDGPUAsmUtils.h" #include "Utils/AMDGPUBaseInfo.h" #include "Utils/AMDKernelCodeTUtils.h" @@ -69,7 +69,7 @@ namespace { class AMDGPUAsmParser; -enum RegisterKind { IS_UNKNOWN, IS_VGPR, IS_SGPR, IS_TTMP, IS_SPECIAL }; +enum RegisterKind { IS_UNKNOWN, IS_VGPR, IS_SGPR, IS_AGPR, IS_TTMP, IS_SPECIAL }; //===----------------------------------------------------------------------===// // Operand @@ -103,14 +103,14 @@ public: int64_t getFPModifiersOperand() const { int64_t Operand = 0; - Operand |= Abs ? SISrcMods::ABS : 0; - Operand |= Neg ? SISrcMods::NEG : 0; + Operand |= Abs ? SISrcMods::ABS : 0u; + Operand |= Neg ? 
SISrcMods::NEG : 0u; return Operand; } int64_t getIntModifiersOperand() const { int64_t Operand = 0; - Operand |= Sext ? SISrcMods::SEXT : 0; + Operand |= Sext ? SISrcMods::SEXT : 0u; return Operand; } @@ -140,21 +140,25 @@ public: ImmTyInstOffset, ImmTyOffset0, ImmTyOffset1, + ImmTyDLC, ImmTyGLC, ImmTySLC, ImmTyTFE, ImmTyD16, ImmTyClampSI, ImmTyOModSI, + ImmTyDPP8, ImmTyDppCtrl, ImmTyDppRowMask, ImmTyDppBankMask, ImmTyDppBoundCtrl, + ImmTyDppFi, ImmTySdwaDstSel, ImmTySdwaSrc0Sel, ImmTySdwaSrc1Sel, ImmTySdwaDstUnused, ImmTyDMask, + ImmTyDim, ImmTyUNorm, ImmTyDA, ImmTyR128A16, @@ -174,9 +178,15 @@ public: ImmTyNegLo, ImmTyNegHi, ImmTySwizzle, - ImmTyHigh + ImmTyGprIdxMode, + ImmTyHigh, + ImmTyBLGP, + ImmTyCBSZ, + ImmTyABID, + ImmTyEndpgm, }; +private: struct TokOp { const char *Data; unsigned Length; @@ -191,7 +201,6 @@ public: struct RegOp { unsigned RegNo; - bool IsForcedVOP3; Modifiers Mods; }; @@ -202,6 +211,7 @@ public: const MCExpr *Expr; }; +public: bool isToken() const override { if (Kind == Token) return true; @@ -231,32 +241,32 @@ public: return isRegKind() && !hasModifiers(); } - bool isRegOrImmWithInputMods(MVT type) const { - return isRegKind() || isInlinableImm(type); + bool isRegOrImmWithInputMods(unsigned RCID, MVT type) const { + return isRegClass(RCID) || isInlinableImm(type) || isLiteralImm(type); } bool isRegOrImmWithInt16InputMods() const { - return isRegOrImmWithInputMods(MVT::i16); + return isRegOrImmWithInputMods(AMDGPU::VS_32RegClassID, MVT::i16); } bool isRegOrImmWithInt32InputMods() const { - return isRegOrImmWithInputMods(MVT::i32); + return isRegOrImmWithInputMods(AMDGPU::VS_32RegClassID, MVT::i32); } bool isRegOrImmWithInt64InputMods() const { - return isRegOrImmWithInputMods(MVT::i64); + return isRegOrImmWithInputMods(AMDGPU::VS_64RegClassID, MVT::i64); } bool isRegOrImmWithFP16InputMods() const { - return isRegOrImmWithInputMods(MVT::f16); + return isRegOrImmWithInputMods(AMDGPU::VS_32RegClassID, MVT::f16); } bool isRegOrImmWithFP32InputMods() const { - return isRegOrImmWithInputMods(MVT::f32); + return isRegOrImmWithInputMods(AMDGPU::VS_32RegClassID, MVT::f32); } bool isRegOrImmWithFP64InputMods() const { - return isRegOrImmWithInputMods(MVT::f64); + return isRegOrImmWithInputMods(AMDGPU::VS_64RegClassID, MVT::f64); } bool isVReg() const { @@ -268,8 +278,12 @@ public: isRegClass(AMDGPU::VReg_512RegClassID); } + bool isVReg32() const { + return isRegClass(AMDGPU::VGPR_32RegClassID); + } + bool isVReg32OrOff() const { - return isOff() || isRegClass(AMDGPU::VGPR_32RegClassID); + return isOff() || isVReg32(); } bool isSDWAOperand(MVT type) const; @@ -289,6 +303,7 @@ public: bool isClampSI() const { return isImmTy(ImmTyClampSI); } bool isOModSI() const { return isImmTy(ImmTyOModSI); } bool isDMask() const { return isImmTy(ImmTyDMask); } + bool isDim() const { return isImmTy(ImmTyDim); } bool isUNorm() const { return isImmTy(ImmTyUNorm); } bool isDA() const { return isImmTy(ImmTyDA); } bool isR128A16() const { return isImmTy(ImmTyR128A16); } @@ -301,13 +316,13 @@ public: bool isIdxen() const { return isImmTy(ImmTyIdxen); } bool isAddr64() const { return isImmTy(ImmTyAddr64); } bool isOffset() const { return isImmTy(ImmTyOffset) && isUInt<16>(getImm()); } - bool isOffset0() const { return isImmTy(ImmTyOffset0) && isUInt<16>(getImm()); } + bool isOffset0() const { return isImmTy(ImmTyOffset0) && isUInt<8>(getImm()); } bool isOffset1() const { return isImmTy(ImmTyOffset1) && isUInt<8>(getImm()); } - bool isOffsetU12() const { return (isImmTy(ImmTyOffset) || 
isImmTy(ImmTyInstOffset)) && isUInt<12>(getImm()); } - bool isOffsetS13() const { return (isImmTy(ImmTyOffset) || isImmTy(ImmTyInstOffset)) && isInt<13>(getImm()); } + bool isFlatOffset() const { return isImmTy(ImmTyOffset) || isImmTy(ImmTyInstOffset); } bool isGDS() const { return isImmTy(ImmTyGDS); } bool isLDS() const { return isImmTy(ImmTyLDS); } + bool isDLC() const { return isImmTy(ImmTyDLC); } bool isGLC() const { return isImmTy(ImmTyGLC); } bool isSLC() const { return isImmTy(ImmTySLC); } bool isTFE() const { return isImmTy(ImmTyTFE); } @@ -316,6 +331,7 @@ public: bool isBankMask() const { return isImmTy(ImmTyDppBankMask); } bool isRowMask() const { return isImmTy(ImmTyDppRowMask); } bool isBoundCtrl() const { return isImmTy(ImmTyDppBoundCtrl); } + bool isFI() const { return isImmTy(ImmTyDppFi); } bool isSDWADstSel() const { return isImmTy(ImmTySdwaDstSel); } bool isSDWASrc0Sel() const { return isImmTy(ImmTySdwaSrc0Sel); } bool isSDWASrc1Sel() const { return isImmTy(ImmTySdwaSrc1Sel); } @@ -339,6 +355,8 @@ public: bool isRegClass(unsigned RCID) const; + bool isInlineValue() const; + bool isRegOrInlineNoMods(unsigned RCID, MVT type) const { return (isRegClass(RCID) || isInlinableImm(type)) && !hasModifiers(); } @@ -359,6 +377,8 @@ public: return isRegOrInlineNoMods(AMDGPU::SReg_64RegClassID, MVT::i64); } + bool isBoolReg() const; + bool isSCSrcF16() const { return isRegOrInlineNoMods(AMDGPU::SReg_32RegClassID, MVT::f16); } @@ -411,6 +431,11 @@ public: return isSSrcF16(); } + bool isSSrcOrLdsB32() const { + return isRegOrInlineNoMods(AMDGPU::SRegOrLds_32RegClassID, MVT::i32) || + isLiteralImm(MVT::i32) || isExpr(); + } + bool isVCSrcB32() const { return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::i32); } @@ -456,8 +481,7 @@ public: } bool isVSrcV2B16() const { - llvm_unreachable("cannot happen"); - return isVSrcB16(); + return isVSrcB16() || isLiteralImm(MVT::v2i16); } bool isVSrcF32() const { @@ -473,8 +497,127 @@ public: } bool isVSrcV2F16() const { - llvm_unreachable("cannot happen"); - return isVSrcF16(); + return isVSrcF16() || isLiteralImm(MVT::v2f16); + } + + bool isVISrcB32() const { + return isRegOrInlineNoMods(AMDGPU::VGPR_32RegClassID, MVT::i32); + } + + bool isVISrcB16() const { + return isRegOrInlineNoMods(AMDGPU::VGPR_32RegClassID, MVT::i16); + } + + bool isVISrcV2B16() const { + return isVISrcB16(); + } + + bool isVISrcF32() const { + return isRegOrInlineNoMods(AMDGPU::VGPR_32RegClassID, MVT::f32); + } + + bool isVISrcF16() const { + return isRegOrInlineNoMods(AMDGPU::VGPR_32RegClassID, MVT::f16); + } + + bool isVISrcV2F16() const { + return isVISrcF16() || isVISrcB32(); + } + + bool isAISrcB32() const { + return isRegOrInlineNoMods(AMDGPU::AGPR_32RegClassID, MVT::i32); + } + + bool isAISrcB16() const { + return isRegOrInlineNoMods(AMDGPU::AGPR_32RegClassID, MVT::i16); + } + + bool isAISrcV2B16() const { + return isAISrcB16(); + } + + bool isAISrcF32() const { + return isRegOrInlineNoMods(AMDGPU::AGPR_32RegClassID, MVT::f32); + } + + bool isAISrcF16() const { + return isRegOrInlineNoMods(AMDGPU::AGPR_32RegClassID, MVT::f16); + } + + bool isAISrcV2F16() const { + return isAISrcF16() || isAISrcB32(); + } + + bool isAISrc_128B32() const { + return isRegOrInlineNoMods(AMDGPU::AReg_128RegClassID, MVT::i32); + } + + bool isAISrc_128B16() const { + return isRegOrInlineNoMods(AMDGPU::AReg_128RegClassID, MVT::i16); + } + + bool isAISrc_128V2B16() const { + return isAISrc_128B16(); + } + + bool isAISrc_128F32() const { + return 
isRegOrInlineNoMods(AMDGPU::AReg_128RegClassID, MVT::f32); + } + + bool isAISrc_128F16() const { + return isRegOrInlineNoMods(AMDGPU::AReg_128RegClassID, MVT::f16); + } + + bool isAISrc_128V2F16() const { + return isAISrc_128F16() || isAISrc_128B32(); + } + + bool isAISrc_512B32() const { + return isRegOrInlineNoMods(AMDGPU::AReg_512RegClassID, MVT::i32); + } + + bool isAISrc_512B16() const { + return isRegOrInlineNoMods(AMDGPU::AReg_512RegClassID, MVT::i16); + } + + bool isAISrc_512V2B16() const { + return isAISrc_512B16(); + } + + bool isAISrc_512F32() const { + return isRegOrInlineNoMods(AMDGPU::AReg_512RegClassID, MVT::f32); + } + + bool isAISrc_512F16() const { + return isRegOrInlineNoMods(AMDGPU::AReg_512RegClassID, MVT::f16); + } + + bool isAISrc_512V2F16() const { + return isAISrc_512F16() || isAISrc_512B32(); + } + + bool isAISrc_1024B32() const { + return isRegOrInlineNoMods(AMDGPU::AReg_1024RegClassID, MVT::i32); + } + + bool isAISrc_1024B16() const { + return isRegOrInlineNoMods(AMDGPU::AReg_1024RegClassID, MVT::i16); + } + + bool isAISrc_1024V2B16() const { + return isAISrc_1024B16(); + } + + bool isAISrc_1024F32() const { + return isRegOrInlineNoMods(AMDGPU::AReg_1024RegClassID, MVT::f32); + } + + bool isAISrc_1024F16() const { + return isRegOrInlineNoMods(AMDGPU::AReg_1024RegClassID, MVT::f16); + } + + bool isAISrc_1024V2F16() const { + return isAISrc_1024F16() || isAISrc_1024B32(); } bool isKImmFP32() const { @@ -504,10 +647,15 @@ public: bool isSMRDOffset8() const; bool isSMRDOffset20() const; bool isSMRDLiteralOffset() const; + bool isDPP8() const; bool isDPPCtrl() const; + bool isBLGP() const; + bool isCBSZ() const; + bool isABID() const; bool isGPRIdxMode() const; bool isS16Imm() const; bool isU16Imm() const; + bool isEndpgm() const; StringRef getExpressionAsToken() const { assert(isExpr()); @@ -535,6 +683,7 @@ public: } unsigned getReg() const override { + assert(isRegKind()); return Reg.RegNo; } @@ -594,6 +743,10 @@ public: void addRegOperands(MCInst &Inst, unsigned N) const; + void addBoolRegOperands(MCInst &Inst, unsigned N) const { + addRegOperands(Inst, N); + } + void addRegOrImmOperands(MCInst &Inst, unsigned N) const { if (isRegKind()) addRegOperands(Inst, N); @@ -661,6 +814,7 @@ public: case ImmTyInstOffset: OS << "InstOffset"; break; case ImmTyOffset0: OS << "Offset0"; break; case ImmTyOffset1: OS << "Offset1"; break; + case ImmTyDLC: OS << "DLC"; break; case ImmTyGLC: OS << "GLC"; break; case ImmTySLC: OS << "SLC"; break; case ImmTyTFE: OS << "TFE"; break; @@ -668,15 +822,18 @@ public: case ImmTyFORMAT: OS << "FORMAT"; break; case ImmTyClampSI: OS << "ClampSI"; break; case ImmTyOModSI: OS << "OModSI"; break; + case ImmTyDPP8: OS << "DPP8"; break; case ImmTyDppCtrl: OS << "DppCtrl"; break; case ImmTyDppRowMask: OS << "DppRowMask"; break; case ImmTyDppBankMask: OS << "DppBankMask"; break; case ImmTyDppBoundCtrl: OS << "DppBoundCtrl"; break; + case ImmTyDppFi: OS << "FI"; break; case ImmTySdwaDstSel: OS << "SdwaDstSel"; break; case ImmTySdwaSrc0Sel: OS << "SdwaSrc0Sel"; break; case ImmTySdwaSrc1Sel: OS << "SdwaSrc1Sel"; break; case ImmTySdwaDstUnused: OS << "SdwaDstUnused"; break; case ImmTyDMask: OS << "DMask"; break; + case ImmTyDim: OS << "Dim"; break; case ImmTyUNorm: OS << "UNorm"; break; case ImmTyDA: OS << "DA"; break; case ImmTyR128A16: OS << "R128A16"; break; @@ -695,7 +852,12 @@ public: case ImmTyNegLo: OS << "NegLo"; break; case ImmTyNegHi: OS << "NegHi"; break; case ImmTySwizzle: OS << "Swizzle"; break; + case ImmTyGprIdxMode: OS << "GprIdxMode"; 
break; case ImmTyHigh: OS << "High"; break; + case ImmTyBLGP: OS << "BLGP"; break; + case ImmTyCBSZ: OS << "CBSZ"; break; + case ImmTyABID: OS << "ABID"; break; + case ImmTyEndpgm: OS << "Endpgm"; break; } } @@ -747,12 +909,10 @@ public: static AMDGPUOperand::Ptr CreateReg(const AMDGPUAsmParser *AsmParser, unsigned RegNo, SMLoc S, - SMLoc E, - bool ForceVOP3) { + SMLoc E) { auto Op = llvm::make_unique<AMDGPUOperand>(Register, AsmParser); Op->Reg.RegNo = RegNo; Op->Reg.Mods = Modifiers(); - Op->Reg.IsForcedVOP3 = ForceVOP3; Op->StartLoc = S; Op->EndLoc = E; return Op; @@ -817,6 +977,7 @@ public: void usesRegister(RegisterKind RegKind, unsigned DwordRegIndex, unsigned RegWidth) { switch (RegKind) { case IS_SGPR: usesSgprAt(DwordRegIndex + RegWidth - 1); break; + case IS_AGPR: // fall through case IS_VGPR: usesVgprAt(DwordRegIndex + RegWidth - 1); break; default: break; } @@ -853,6 +1014,8 @@ private: /// \param VCCUsed [in] Whether VCC special SGPR is reserved. /// \param FlatScrUsed [in] Whether FLAT_SCRATCH special SGPR is reserved. /// \param XNACKUsed [in] Whether XNACK_MASK special SGPR is reserved. + /// \param EnableWavefrontSize32 [in] Value of ENABLE_WAVEFRONT_SIZE32 kernel + /// descriptor field, if valid. /// \param NextFreeVGPR [in] Max VGPR number referenced, plus one. /// \param VGPRRange [in] Token range, used for VGPR diagnostics. /// \param NextFreeSGPR [in] Max SGPR number referenced, plus one. @@ -861,9 +1024,10 @@ private: /// \param SGPRBlocks [out] Result SGPR block count. bool calculateGPRBlocks(const FeatureBitset &Features, bool VCCUsed, bool FlatScrUsed, bool XNACKUsed, - unsigned NextFreeVGPR, SMRange VGPRRange, - unsigned NextFreeSGPR, SMRange SGPRRange, - unsigned &VGPRBlocks, unsigned &SGPRBlocks); + Optional<bool> EnableWavefrontSize32, unsigned NextFreeVGPR, + SMRange VGPRRange, unsigned NextFreeSGPR, + SMRange SGPRRange, unsigned &VGPRBlocks, + unsigned &SGPRBlocks); bool ParseDirectiveAMDGCNTarget(); bool ParseDirectiveAMDHSAKernel(); bool ParseDirectiveMajorMinor(uint32_t &Major, uint32_t &Minor); @@ -876,7 +1040,15 @@ private: bool ParseDirectiveISAVersion(); bool ParseDirectiveHSAMetadata(); + bool ParseDirectivePALMetadataBegin(); bool ParseDirectivePALMetadata(); + bool ParseDirectiveAMDGPULDS(); + + /// Common code to parse out a block of text (typically YAML) between start and + /// end directives. + bool ParseToEndDirective(const char *AssemblerDirectiveBegin, + const char *AssemblerDirectiveEnd, + std::string &CollectString); bool AddNextRegisterToList(unsigned& Reg, unsigned& RegWidth, RegisterKind RegKind, unsigned Reg1, @@ -884,6 +1056,8 @@ private: bool ParseAMDGPURegister(RegisterKind& RegKind, unsigned& Reg, unsigned& RegNum, unsigned& RegWidth, unsigned *DwordRegIndex); + bool isRegister(); + bool isRegister(const AsmToken &Token, const AsmToken &NextToken) const; Optional<StringRef> getGprCountSymbolName(RegisterKind RegKind); void initializeGprCountSymbol(RegisterKind RegKind); bool updateGprCountSymbols(RegisterKind RegKind, unsigned DwordRegIndex, @@ -897,6 +1071,10 @@ public: enum AMDGPUMatchResultTy { Match_PreferE32 = FIRST_TARGET_MATCH_RESULT_TY }; + enum OperandMode { + OperandMode_Default, + OperandMode_NSA, + }; using OptionalImmIndexMap = std::map<AMDGPUOperand::ImmTy, unsigned>; @@ -908,7 +1086,7 @@ public: if (getFeatureBits().none()) { // Set default features. 
- copySTI().ToggleFeature("SOUTHERN_ISLANDS"); + copySTI().ToggleFeature("southern-islands"); } setAvailableFeatures(ComputeAvailableFeatures(getFeatureBits())); @@ -924,6 +1102,10 @@ public: MCSymbol *Sym = Ctx.getOrCreateSymbol(Twine(".amdgcn.gfx_generation_number")); Sym->setVariableValue(MCConstantExpr::create(ISA.Major, Ctx)); + Sym = Ctx.getOrCreateSymbol(Twine(".amdgcn.gfx_generation_minor")); + Sym->setVariableValue(MCConstantExpr::create(ISA.Minor, Ctx)); + Sym = Ctx.getOrCreateSymbol(Twine(".amdgcn.gfx_generation_stepping")); + Sym->setVariableValue(MCConstantExpr::create(ISA.Stepping, Ctx)); } else { MCSymbol *Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_major")); @@ -969,6 +1151,10 @@ public: return AMDGPU::isGFX9(getSTI()); } + bool isGFX10() const { + return AMDGPU::isGFX10(getSTI()); + } + bool hasInv2PiInlineImm() const { return getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm]; } @@ -978,7 +1164,11 @@ public: } bool hasSGPR102_SGPR103() const { - return !isVI(); + return !isVI() && !isGFX9(); + } + + bool hasSGPR104_SGPR105() const { + return isGFX10(); } bool hasIntClamp() const { @@ -1024,7 +1214,8 @@ public: uint64_t &ErrorInfo, bool MatchingInlineAsm) override; bool ParseDirective(AsmToken DirectiveID) override; - OperandMatchResultTy parseOperand(OperandVector &Operands, StringRef Mnemonic); + OperandMatchResultTy parseOperand(OperandVector &Operands, StringRef Mnemonic, + OperandMode Mode = OperandMode_Default); StringRef parseMnemonicSuffix(StringRef Name); bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, OperandVector &Operands) override; @@ -1037,11 +1228,11 @@ public: AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone, bool (*ConvertResult)(int64_t &) = nullptr); - OperandMatchResultTy parseOperandArrayWithPrefix( - const char *Prefix, - OperandVector &Operands, - AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone, - bool (*ConvertResult)(int64_t&) = nullptr); + OperandMatchResultTy + parseOperandArrayWithPrefix(const char *Prefix, + OperandVector &Operands, + AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone, + bool (*ConvertResult)(int64_t&) = nullptr); OperandMatchResultTy parseNamedBit(const char *Name, OperandVector &Operands, @@ -1049,10 +1240,15 @@ public: OperandMatchResultTy parseStringWithPrefix(StringRef Prefix, StringRef &Value); - bool parseAbsoluteExpr(int64_t &Val, bool AbsMod = false); - OperandMatchResultTy parseImm(OperandVector &Operands, bool AbsMod = false); + bool isModifier(); + bool isOperandModifier(const AsmToken &Token, const AsmToken &NextToken) const; + bool isRegOrOperandModifier(const AsmToken &Token, const AsmToken &NextToken) const; + bool isNamedOperandModifier(const AsmToken &Token, const AsmToken &NextToken) const; + bool isOpcodeModifierWithVal(const AsmToken &Token, const AsmToken &NextToken) const; + bool parseSP3NegModifier(); + OperandMatchResultTy parseImm(OperandVector &Operands, bool HasSP3AbsModifier = false); OperandMatchResultTy parseReg(OperandVector &Operands); - OperandMatchResultTy parseRegOrImm(OperandVector &Operands, bool AbsMod = false); + OperandMatchResultTy parseRegOrImm(OperandVector &Operands, bool HasSP3AbsMod = false); OperandMatchResultTy parseRegOrImmWithFPInputMods(OperandVector &Operands, bool AllowImm = true); OperandMatchResultTy parseRegOrImmWithIntInputMods(OperandVector &Operands, bool AllowImm = true); OperandMatchResultTy parseRegWithFPInputMods(OperandVector &Operands); @@ -1073,33 +1269,63 @@ private: struct OperandInfoTy { int64_t Id; 
bool IsSymbolic = false; + bool IsDefined = false; OperandInfoTy(int64_t Id_) : Id(Id_) {} }; - bool parseSendMsgConstruct(OperandInfoTy &Msg, OperandInfoTy &Operation, int64_t &StreamId); - bool parseHwregConstruct(OperandInfoTy &HwReg, int64_t &Offset, int64_t &Width); + bool parseSendMsgBody(OperandInfoTy &Msg, OperandInfoTy &Op, OperandInfoTy &Stream); + bool validateSendMsg(const OperandInfoTy &Msg, + const OperandInfoTy &Op, + const OperandInfoTy &Stream, + const SMLoc Loc); + + bool parseHwregBody(OperandInfoTy &HwReg, int64_t &Offset, int64_t &Width); + bool validateHwreg(const OperandInfoTy &HwReg, + const int64_t Offset, + const int64_t Width, + const SMLoc Loc); void errorExpTgt(); OperandMatchResultTy parseExpTgtImpl(StringRef Str, uint8_t &Val); + SMLoc getFlatOffsetLoc(const OperandVector &Operands) const; - bool validateInstruction(const MCInst &Inst, const SMLoc &IDLoc); + bool validateInstruction(const MCInst &Inst, const SMLoc &IDLoc, const OperandVector &Operands); + bool validateFlatOffset(const MCInst &Inst, const OperandVector &Operands); + bool validateSOPLiteral(const MCInst &Inst) const; bool validateConstantBusLimitations(const MCInst &Inst); bool validateEarlyClobberLimitations(const MCInst &Inst); bool validateIntClampSupported(const MCInst &Inst); bool validateMIMGAtomicDMask(const MCInst &Inst); bool validateMIMGGatherDMask(const MCInst &Inst); bool validateMIMGDataSize(const MCInst &Inst); + bool validateMIMGAddrSize(const MCInst &Inst); bool validateMIMGD16(const MCInst &Inst); + bool validateMIMGDim(const MCInst &Inst); + bool validateLdsDirect(const MCInst &Inst); + bool validateOpSel(const MCInst &Inst); + bool validateVccOperand(unsigned Reg) const; + bool validateVOP3Literal(const MCInst &Inst) const; bool usesConstantBus(const MCInst &Inst, unsigned OpIdx); bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const; unsigned findImplicitSGPRReadInVOP(const MCInst &Inst) const; + bool isId(const StringRef Id) const; + bool isId(const AsmToken &Token, const StringRef Id) const; + bool isToken(const AsmToken::TokenKind Kind) const; bool trySkipId(const StringRef Id); + bool trySkipId(const StringRef Id, const AsmToken::TokenKind Kind); bool trySkipToken(const AsmToken::TokenKind Kind); bool skipToken(const AsmToken::TokenKind Kind, const StringRef ErrMsg); bool parseString(StringRef &Val, const StringRef ErrMsg = "expected a string"); + void peekTokens(MutableArrayRef<AsmToken> Tokens); + AsmToken::TokenKind getTokenKind() const; bool parseExpr(int64_t &Imm); + StringRef getTokenStr() const; + AsmToken peekToken(); + AsmToken getToken() const; + SMLoc getLoc() const; + void lex(); public: OperandMatchResultTy parseOptionalOperand(OperandVector &Operands); @@ -1110,6 +1336,7 @@ public: OperandMatchResultTy parseInterpSlot(OperandVector &Operands); OperandMatchResultTy parseInterpAttr(OperandVector &Operands); OperandMatchResultTy parseSOppBrTarget(OperandVector &Operands); + OperandMatchResultTy parseBoolReg(OperandVector &Operands); bool parseSwizzleOperands(const unsigned OpNum, int64_t* Op, const unsigned MinVal, @@ -1124,20 +1351,23 @@ public: bool parseSwizzleSwap(int64_t &Imm); bool parseSwizzleReverse(int64_t &Imm); + OperandMatchResultTy parseGPRIdxMode(OperandVector &Operands); + int64_t parseGPRIdxMacro(); + void cvtMubuf(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, false, false); } void cvtMubufAtomic(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, false); } void 
cvtMubufAtomicReturn(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, true); } void cvtMubufLds(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, false, false, true); } void cvtMtbuf(MCInst &Inst, const OperandVector &Operands); + AMDGPUOperand::Ptr defaultDLC() const; AMDGPUOperand::Ptr defaultGLC() const; AMDGPUOperand::Ptr defaultSLC() const; AMDGPUOperand::Ptr defaultSMRDOffset8() const; AMDGPUOperand::Ptr defaultSMRDOffset20() const; AMDGPUOperand::Ptr defaultSMRDLiteralOffset() const; - AMDGPUOperand::Ptr defaultOffsetU12() const; - AMDGPUOperand::Ptr defaultOffsetS13() const; + AMDGPUOperand::Ptr defaultFlatOffset() const; OperandMatchResultTy parseOModOperand(OperandVector &Operands); @@ -1153,11 +1383,15 @@ public: bool IsAtomic = false); void cvtMIMGAtomic(MCInst &Inst, const OperandVector &Operands); + OperandMatchResultTy parseDim(OperandVector &Operands); + OperandMatchResultTy parseDPP8(OperandVector &Operands); OperandMatchResultTy parseDPPCtrl(OperandVector &Operands); AMDGPUOperand::Ptr defaultRowMask() const; AMDGPUOperand::Ptr defaultBankMask() const; AMDGPUOperand::Ptr defaultBoundCtrl() const; - void cvtDPP(MCInst &Inst, const OperandVector &Operands); + AMDGPUOperand::Ptr defaultFI() const; + void cvtDPP(MCInst &Inst, const OperandVector &Operands, bool IsDPP8 = false); + void cvtDPP8(MCInst &Inst, const OperandVector &Operands) { cvtDPP(Inst, Operands, true); } OperandMatchResultTy parseSDWASel(OperandVector &Operands, StringRef Prefix, AMDGPUOperand::ImmTy Type); @@ -1168,6 +1402,13 @@ public: void cvtSdwaVOPC(MCInst &Inst, const OperandVector &Operands); void cvtSDWA(MCInst &Inst, const OperandVector &Operands, uint64_t BasicInstType, bool skipVcc = false); + + AMDGPUOperand::Ptr defaultBLGP() const; + AMDGPUOperand::Ptr defaultCBSZ() const; + AMDGPUOperand::Ptr defaultABID() const; + + OperandMatchResultTy parseEndpgmOp(OperandVector &Operands); + AMDGPUOperand::Ptr defaultEndpgmImmOperands() const; }; struct OptionalOperand { @@ -1203,6 +1444,8 @@ static const fltSemantics *getOpFltSemantics(uint8_t OperandType) { case AMDGPU::OPERAND_REG_IMM_FP32: case AMDGPU::OPERAND_REG_INLINE_C_INT32: case AMDGPU::OPERAND_REG_INLINE_C_FP32: + case AMDGPU::OPERAND_REG_INLINE_AC_INT32: + case AMDGPU::OPERAND_REG_INLINE_AC_FP32: return &APFloat::IEEEsingle(); case AMDGPU::OPERAND_REG_IMM_INT64: case AMDGPU::OPERAND_REG_IMM_FP64: @@ -1215,6 +1458,12 @@ static const fltSemantics *getOpFltSemantics(uint8_t OperandType) { case AMDGPU::OPERAND_REG_INLINE_C_FP16: case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: + case AMDGPU::OPERAND_REG_INLINE_AC_INT16: + case AMDGPU::OPERAND_REG_INLINE_AC_FP16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: + case AMDGPU::OPERAND_REG_IMM_V2INT16: + case AMDGPU::OPERAND_REG_IMM_V2FP16: return &APFloat::IEEEhalf(); default: llvm_unreachable("unsupported fp type"); @@ -1243,7 +1492,20 @@ static bool canLosslesslyConvertToFPType(APFloat &FPLiteral, MVT VT) { return true; } +static bool isSafeTruncation(int64_t Val, unsigned Size) { + return isUIntN(Size, Val) || isIntN(Size, Val); +} + bool AMDGPUOperand::isInlinableImm(MVT type) const { + + // This is a hack to enable named inline values like + // shared_base with both 32-bit and 64-bit operands. + // Note that these values are defined as + // 32-bit operands only. 
+ if (isInlineValue()) { + return true; + } + if (!isImmTy(ImmTyNone)) { // Only plain immediates are inlinable (e.g. "clamp" attribute is not) return false; } @@ -1282,6 +1544,10 @@ bool AMDGPUOperand::isInlinableImm(MVT type) const { AsmParser->hasInv2PiInlineImm()); } + if (!isSafeTruncation(Imm.Val, type.getScalarSizeInBits())) { + return false; + } + if (type.getScalarSizeInBits() == 16) { return AMDGPU::isInlinableLiteral16( static_cast<int16_t>(Literal.getLoBits(16).getSExtValue()), @@ -1315,7 +1581,7 @@ bool AMDGPUOperand::isLiteralImm(MVT type) const { // FIXME: 64-bit operands can zero extend, sign extend, or pad zeroes for FP // types. - return isUIntN(Size, Imm.Val) || isIntN(Size, Imm.Val); + return isSafeTruncation(Imm.Val, Size); } // We got fp literal token @@ -1330,8 +1596,14 @@ bool AMDGPUOperand::isLiteralImm(MVT type) const { return false; } + // We allow fp literals with f16x2 operands assuming that the specified + // literal goes into the lower half and the upper half is zero. We also + // require that the literal may be losslessly converted to f16. + MVT ExpectedType = (type == MVT::v2f16)? MVT::f16 : + (type == MVT::v2i16)? MVT::i16 : type; + APFloat FPLiteral(APFloat::IEEEdouble(), APInt(64, Imm.Val)); - return canLosslesslyConvertToFPType(FPLiteral, type); + return canLosslesslyConvertToFPType(FPLiteral, ExpectedType); } bool AMDGPUOperand::isRegClass(unsigned RCID) const { @@ -1340,9 +1612,9 @@ bool AMDGPUOperand::isRegClass(unsigned RCID) const { bool AMDGPUOperand::isSDWAOperand(MVT type) const { if (AsmParser->isVI()) - return isVReg(); - else if (AsmParser->isGFX9()) - return isRegKind() || isInlinableImm(type); + return isVReg32(); + else if (AsmParser->isGFX9() || AsmParser->isGFX10()) + return isRegClass(AMDGPU::VS_32RegClassID) || isInlinableImm(type); else return false; } @@ -1363,6 +1635,11 @@ bool AMDGPUOperand::isSDWAInt32Operand() const { return isSDWAOperand(MVT::i32); } +bool AMDGPUOperand::isBoolReg() const { + return AsmParser->getFeatureBits()[AMDGPU::FeatureWavefrontSize64] ?
+ isSCSrcB64() : isSCSrcB32(); +} + uint64_t AMDGPUOperand::applyInputFPModifiers(uint64_t Val, unsigned Size) const { assert(isImmTy(ImmTyNone) && Imm.Mods.hasFPModifiers()); @@ -1441,12 +1718,20 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo case AMDGPU::OPERAND_REG_IMM_FP32: case AMDGPU::OPERAND_REG_INLINE_C_INT32: case AMDGPU::OPERAND_REG_INLINE_C_FP32: + case AMDGPU::OPERAND_REG_INLINE_AC_INT32: + case AMDGPU::OPERAND_REG_INLINE_AC_FP32: case AMDGPU::OPERAND_REG_IMM_INT16: case AMDGPU::OPERAND_REG_IMM_FP16: case AMDGPU::OPERAND_REG_INLINE_C_INT16: case AMDGPU::OPERAND_REG_INLINE_C_FP16: case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: - case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: { + case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: + case AMDGPU::OPERAND_REG_INLINE_AC_INT16: + case AMDGPU::OPERAND_REG_INLINE_AC_FP16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: + case AMDGPU::OPERAND_REG_IMM_V2INT16: + case AMDGPU::OPERAND_REG_IMM_V2FP16: { bool lost; APFloat FPLiteral(APFloat::IEEEdouble(), Literal); // Convert literal to single precision @@ -1456,11 +1741,6 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo // checked earlier in isLiteralImm() uint64_t ImmVal = FPLiteral.bitcastToAPInt().getZExtValue(); - if (OpTy == AMDGPU::OPERAND_REG_INLINE_C_V2INT16 || - OpTy == AMDGPU::OPERAND_REG_INLINE_C_V2FP16) { - ImmVal |= (ImmVal << 16); - } - Inst.addOperand(MCOperand::createImm(ImmVal)); return; } @@ -1471,15 +1751,18 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo return; } - // We got int literal token. + // We got int literal token. // Only sign extend inline immediates. - // FIXME: No errors on truncation switch (OpTy) { case AMDGPU::OPERAND_REG_IMM_INT32: case AMDGPU::OPERAND_REG_IMM_FP32: case AMDGPU::OPERAND_REG_INLINE_C_INT32: case AMDGPU::OPERAND_REG_INLINE_C_FP32: - if (isInt<32>(Val) && + case AMDGPU::OPERAND_REG_INLINE_AC_INT32: + case AMDGPU::OPERAND_REG_INLINE_AC_FP32: + case AMDGPU::OPERAND_REG_IMM_V2INT16: + case AMDGPU::OPERAND_REG_IMM_V2FP16: + if (isSafeTruncation(Val, 32) && AMDGPU::isInlinableLiteral32(static_cast<int32_t>(Val), AsmParser->hasInv2PiInlineImm())) { Inst.addOperand(MCOperand::createImm(Val)); @@ -1505,7 +1788,9 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo case AMDGPU::OPERAND_REG_IMM_FP16: case AMDGPU::OPERAND_REG_INLINE_C_INT16: case AMDGPU::OPERAND_REG_INLINE_C_FP16: - if (isInt<16>(Val) && + case AMDGPU::OPERAND_REG_INLINE_AC_INT16: + case AMDGPU::OPERAND_REG_INLINE_AC_FP16: + if (isSafeTruncation(Val, 16) && AMDGPU::isInlinableLiteral16(static_cast<int16_t>(Val), AsmParser->hasInv2PiInlineImm())) { Inst.addOperand(MCOperand::createImm(Val)); @@ -1516,14 +1801,14 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo return; case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: - case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: { - auto LiteralVal = static_cast<uint16_t>(Literal.getLoBits(16).getZExtValue()); - assert(AMDGPU::isInlinableLiteral16(LiteralVal, + case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: { + assert(isSafeTruncation(Val, 16)); + assert(AMDGPU::isInlinableLiteral16(static_cast<int16_t>(Val), AsmParser->hasInv2PiInlineImm())); - uint32_t ImmVal = static_cast<uint32_t>(LiteralVal) << 16 | - static_cast<uint32_t>(LiteralVal); - 
Inst.addOperand(MCOperand::createImm(ImmVal)); + Inst.addOperand(MCOperand::createImm(Val)); return; } default: @@ -1552,6 +1837,27 @@ void AMDGPUOperand::addRegOperands(MCInst &Inst, unsigned N) const { Inst.addOperand(MCOperand::createReg(AMDGPU::getMCReg(getReg(), AsmParser->getSTI()))); } +static bool isInlineValue(unsigned Reg) { + switch (Reg) { + case AMDGPU::SRC_SHARED_BASE: + case AMDGPU::SRC_SHARED_LIMIT: + case AMDGPU::SRC_PRIVATE_BASE: + case AMDGPU::SRC_PRIVATE_LIMIT: + case AMDGPU::SRC_POPS_EXITING_WAVE_ID: + return true; + case AMDGPU::SRC_VCCZ: + case AMDGPU::SRC_EXECZ: + case AMDGPU::SRC_SCC: + return true; + default: + return false; + } +} + +bool AMDGPUOperand::isInlineValue() const { + return isRegKind() && ::isInlineValue(getReg()); +} + //===----------------------------------------------------------------------===// // AsmParser //===----------------------------------------------------------------------===// @@ -1585,6 +1891,15 @@ static int getRegClass(RegisterKind Is, unsigned RegWidth) { case 8: return AMDGPU::SGPR_256RegClassID; case 16: return AMDGPU::SGPR_512RegClassID; } + } else if (Is == IS_AGPR) { + switch (RegWidth) { + default: return -1; + case 1: return AMDGPU::AGPR_32RegClassID; + case 2: return AMDGPU::AReg_64RegClassID; + case 4: return AMDGPU::AReg_128RegClassID; + case 16: return AMDGPU::AReg_512RegClassID; + case 32: return AMDGPU::AReg_1024RegClassID; + } } return -1; } @@ -1595,8 +1910,25 @@ static unsigned getSpecialRegForName(StringRef RegName) { .Case("vcc", AMDGPU::VCC) .Case("flat_scratch", AMDGPU::FLAT_SCR) .Case("xnack_mask", AMDGPU::XNACK_MASK) + .Case("shared_base", AMDGPU::SRC_SHARED_BASE) + .Case("src_shared_base", AMDGPU::SRC_SHARED_BASE) + .Case("shared_limit", AMDGPU::SRC_SHARED_LIMIT) + .Case("src_shared_limit", AMDGPU::SRC_SHARED_LIMIT) + .Case("private_base", AMDGPU::SRC_PRIVATE_BASE) + .Case("src_private_base", AMDGPU::SRC_PRIVATE_BASE) + .Case("private_limit", AMDGPU::SRC_PRIVATE_LIMIT) + .Case("src_private_limit", AMDGPU::SRC_PRIVATE_LIMIT) + .Case("pops_exiting_wave_id", AMDGPU::SRC_POPS_EXITING_WAVE_ID) + .Case("src_pops_exiting_wave_id", AMDGPU::SRC_POPS_EXITING_WAVE_ID) + .Case("lds_direct", AMDGPU::LDS_DIRECT) + .Case("src_lds_direct", AMDGPU::LDS_DIRECT) .Case("m0", AMDGPU::M0) - .Case("scc", AMDGPU::SCC) + .Case("vccz", AMDGPU::SRC_VCCZ) + .Case("src_vccz", AMDGPU::SRC_VCCZ) + .Case("execz", AMDGPU::SRC_EXECZ) + .Case("src_execz", AMDGPU::SRC_EXECZ) + .Case("scc", AMDGPU::SRC_SCC) + .Case("src_scc", AMDGPU::SRC_SCC) .Case("tba", AMDGPU::TBA) .Case("tma", AMDGPU::TMA) .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO) @@ -1611,6 +1943,7 @@ static unsigned getSpecialRegForName(StringRef RegName) { .Case("tma_hi", AMDGPU::TMA_HI) .Case("tba_lo", AMDGPU::TBA_LO) .Case("tba_hi", AMDGPU::TBA_HI) + .Case("null", AMDGPU::SGPR_NULL) .Default(0); } @@ -1663,6 +1996,7 @@ bool AMDGPUAsmParser::AddNextRegisterToList(unsigned &Reg, unsigned &RegWidth, return false; case IS_VGPR: case IS_SGPR: + case IS_AGPR: case IS_TTMP: if (Reg1 != Reg + RegWidth) { return false; @@ -1674,6 +2008,53 @@ bool AMDGPUAsmParser::AddNextRegisterToList(unsigned &Reg, unsigned &RegWidth, } } +static const StringRef Registers[] = { + { "v" }, + { "s" }, + { "ttmp" }, + { "acc" }, + { "a" }, +}; + +bool +AMDGPUAsmParser::isRegister(const AsmToken &Token, + const AsmToken &NextToken) const { + + // A list of consecutive registers: [s0,s1,s2,s3] + if (Token.is(AsmToken::LBrac)) + return true; + + if (!Token.is(AsmToken::Identifier)) + return false; + + // A single 
register like s0 or a range of registers like s[0:1] + + StringRef RegName = Token.getString(); + + for (StringRef Reg : Registers) { + if (RegName.startswith(Reg)) { + if (Reg.size() < RegName.size()) { + unsigned RegNum; + // A single register with an index: rXX + if (!RegName.substr(Reg.size()).getAsInteger(10, RegNum)) + return true; + } else { + // A range of registers: r[XX:YY]. + if (NextToken.is(AsmToken::LBrac)) + return true; + } + } + } + + return getSpecialRegForName(RegName); +} + +bool +AMDGPUAsmParser::isRegister() +{ + return isRegister(getToken(), peekToken()); +} + bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind &RegKind, unsigned &Reg, unsigned &RegNum, unsigned &RegWidth, unsigned *DwordRegIndex) { @@ -1692,6 +2073,9 @@ bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind &RegKind, unsigned &Reg, } else if (RegName[0] == 's') { RegNumIndex = 1; RegKind = IS_SGPR; + } else if (RegName[0] == 'a') { + RegNumIndex = RegName.startswith("acc") ? 3 : 1; + RegKind = IS_AGPR; } else if (RegName.startswith("ttmp")) { RegNumIndex = strlen("ttmp"); RegKind = IS_TTMP; @@ -1773,6 +2157,7 @@ bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind &RegKind, unsigned &Reg, break; case IS_VGPR: case IS_SGPR: + case IS_AGPR: case IS_TTMP: { unsigned Size = 1; @@ -1859,6 +2244,8 @@ std::unique_ptr<AMDGPUOperand> AMDGPUAsmParser::parseRegister() { unsigned Reg, RegNum, RegWidth, DwordRegIndex; if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth, &DwordRegIndex)) { + //FIXME: improve error messages (bug 41303). + Error(StartLoc, "not a valid operand."); return nullptr; } if (AMDGPU::IsaInfo::hasCodeObjectV3(&getSTI())) { @@ -1866,201 +2253,260 @@ std::unique_ptr<AMDGPUOperand> AMDGPUAsmParser::parseRegister() { return nullptr; } else KernelScope.usesRegister(RegKind, DwordRegIndex, RegWidth); - return AMDGPUOperand::CreateReg(this, Reg, StartLoc, EndLoc, false); + return AMDGPUOperand::CreateReg(this, Reg, StartLoc, EndLoc); } -bool -AMDGPUAsmParser::parseAbsoluteExpr(int64_t &Val, bool AbsMod) { - if (AbsMod && getLexer().peekTok().is(AsmToken::Pipe) && - (getLexer().getKind() == AsmToken::Integer || - getLexer().getKind() == AsmToken::Real)) { - // This is a workaround for handling operands like these: - // |1.0| - // |-1| - // This syntax is not compatible with syntax of standard - // MC expressions (due to the trailing '|'). - - SMLoc EndLoc; - const MCExpr *Expr; +OperandMatchResultTy +AMDGPUAsmParser::parseImm(OperandVector &Operands, bool HasSP3AbsModifier) { + // TODO: add syntactic sugar for 1/(2*PI) - if (getParser().parsePrimaryExpr(Expr, EndLoc)) { - return true; - } + assert(!isRegister()); + assert(!isModifier()); - return !Expr->evaluateAsAbsolute(Val); + const auto& Tok = getToken(); + const auto& NextTok = peekToken(); + bool IsReal = Tok.is(AsmToken::Real); + SMLoc S = getLoc(); + bool Negate = false; + + if (!IsReal && Tok.is(AsmToken::Minus) && NextTok.is(AsmToken::Real)) { + lex(); + IsReal = true; + Negate = true; } - return getParser().parseAbsoluteExpression(Val); -} + if (IsReal) { + // Floating-point expressions are not supported. + // Can only allow floating-point literals with an + // optional sign. 
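A minimal standalone sketch of the sign-folding step above, assuming only LLVM's APFloat API; the helper name foldSignedFPLiteral and its std::optional return type are illustrative, not part of this patch:

  #include "llvm/ADT/APFloat.h"
  #include "llvm/ADT/StringRef.h"
  #include <cstdint>
  #include <optional>

  // Fold an optional leading '-' into a floating-point literal and
  // return the 64-bit bit pattern that would be stored in the operand.
  static std::optional<uint64_t> foldSignedFPLiteral(llvm::StringRef Num,
                                                     bool Negate) {
    llvm::APFloat RealVal(llvm::APFloat::IEEEdouble());
    // Reject tokens that do not form a valid FP literal.
    if (RealVal.convertFromString(Num, llvm::APFloat::rmNearestTiesToEven) ==
        llvm::APFloat::opInvalidOp)
      return std::nullopt;
    if (Negate)
      RealVal.changeSign(); // the sign is part of the literal, not an expression
    return RealVal.bitcastToAPInt().getZExtValue();
  }

For example, foldSignedFPLiteral("1.0", /*Negate=*/true) yields the bit pattern of -1.0, i.e. 0xBFF0000000000000.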
-OperandMatchResultTy -AMDGPUAsmParser::parseImm(OperandVector &Operands, bool AbsMod) { - // TODO: add syntactic sugar for 1/(2*PI) - bool Minus = false; - if (getLexer().getKind() == AsmToken::Minus) { - const AsmToken NextToken = getLexer().peekTok(); - if (!NextToken.is(AsmToken::Integer) && - !NextToken.is(AsmToken::Real)) { - return MatchOperand_NoMatch; - } - Minus = true; - Parser.Lex(); - } + StringRef Num = getTokenStr(); + lex(); - SMLoc S = Parser.getTok().getLoc(); - switch(getLexer().getKind()) { - case AsmToken::Integer: { - int64_t IntVal; - if (parseAbsoluteExpr(IntVal, AbsMod)) + APFloat RealVal(APFloat::IEEEdouble()); + auto roundMode = APFloat::rmNearestTiesToEven; + if (RealVal.convertFromString(Num, roundMode) == APFloat::opInvalidOp) { return MatchOperand_ParseFail; - if (Minus) - IntVal *= -1; - Operands.push_back(AMDGPUOperand::CreateImm(this, IntVal, S)); + } + if (Negate) + RealVal.changeSign(); + + Operands.push_back( + AMDGPUOperand::CreateImm(this, RealVal.bitcastToAPInt().getZExtValue(), S, + AMDGPUOperand::ImmTyNone, true)); + return MatchOperand_Success; - } - case AsmToken::Real: { + + } else { int64_t IntVal; - if (parseAbsoluteExpr(IntVal, AbsMod)) - return MatchOperand_ParseFail; + const MCExpr *Expr; + SMLoc S = getLoc(); + + if (HasSP3AbsModifier) { + // This is a workaround for handling expressions + // as arguments of SP3 'abs' modifier, for example: + // |1.0| + // |-1| + // |1+x| + // This syntax is not compatible with syntax of standard + // MC expressions (due to the trailing '|'). + SMLoc EndLoc; + if (getParser().parsePrimaryExpr(Expr, EndLoc)) + return MatchOperand_ParseFail; + } else { + if (Parser.parseExpression(Expr)) + return MatchOperand_ParseFail; + } + + if (Expr->evaluateAsAbsolute(IntVal)) { + Operands.push_back(AMDGPUOperand::CreateImm(this, IntVal, S)); + } else { + Operands.push_back(AMDGPUOperand::CreateExpr(this, Expr, S)); + } - APFloat F(BitsToDouble(IntVal)); - if (Minus) - F.changeSign(); - Operands.push_back( - AMDGPUOperand::CreateImm(this, F.bitcastToAPInt().getZExtValue(), S, - AMDGPUOperand::ImmTyNone, true)); return MatchOperand_Success; } - default: - return MatchOperand_NoMatch; - } + + return MatchOperand_NoMatch; } OperandMatchResultTy AMDGPUAsmParser::parseReg(OperandVector &Operands) { + if (!isRegister()) + return MatchOperand_NoMatch; + if (auto R = parseRegister()) { assert(R->isReg()); - R->Reg.IsForcedVOP3 = isForcedVOP3(); Operands.push_back(std::move(R)); return MatchOperand_Success; } - return MatchOperand_NoMatch; + return MatchOperand_ParseFail; } OperandMatchResultTy -AMDGPUAsmParser::parseRegOrImm(OperandVector &Operands, bool AbsMod) { - auto res = parseImm(Operands, AbsMod); +AMDGPUAsmParser::parseRegOrImm(OperandVector &Operands, bool HasSP3AbsMod) { + auto res = parseReg(Operands); if (res != MatchOperand_NoMatch) { return res; + } else if (isModifier()) { + return MatchOperand_NoMatch; + } else { + return parseImm(Operands, HasSP3AbsMod); } +} - return parseReg(Operands); +bool +AMDGPUAsmParser::isNamedOperandModifier(const AsmToken &Token, const AsmToken &NextToken) const { + if (Token.is(AsmToken::Identifier) && NextToken.is(AsmToken::LParen)) { + const auto &str = Token.getString(); + return str == "abs" || str == "neg" || str == "sext"; + } + return false; } -OperandMatchResultTy -AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands, - bool AllowImm) { - bool Negate = false, Negate2 = false, Abs = false, Abs2 = false; +bool +AMDGPUAsmParser::isOpcodeModifierWithVal(const 
AsmToken &Token, const AsmToken &NextToken) const { + return Token.is(AsmToken::Identifier) && NextToken.is(AsmToken::Colon); +} - if (getLexer().getKind()== AsmToken::Minus) { - const AsmToken NextToken = getLexer().peekTok(); +bool +AMDGPUAsmParser::isOperandModifier(const AsmToken &Token, const AsmToken &NextToken) const { + return isNamedOperandModifier(Token, NextToken) || Token.is(AsmToken::Pipe); +} - // Disable ambiguous constructs like '--1' etc. Should use neg(-1) instead. - if (NextToken.is(AsmToken::Minus)) { - Error(Parser.getTok().getLoc(), "invalid syntax, expected 'neg' modifier"); - return MatchOperand_ParseFail; - } +bool +AMDGPUAsmParser::isRegOrOperandModifier(const AsmToken &Token, const AsmToken &NextToken) const { + return isRegister(Token, NextToken) || isOperandModifier(Token, NextToken); +} + +// Check if this is an operand modifier or an opcode modifier +// which may look like an expression but it is not. We should +// avoid parsing these modifiers as expressions. Currently +// recognized sequences are: +// |...| +// abs(...) +// neg(...) +// sext(...) +// -reg +// -|...| +// -abs(...) +// name:... +// Note that simple opcode modifiers like 'gds' may be parsed as +// expressions; this is a special case. See getExpressionAsToken. +// +bool +AMDGPUAsmParser::isModifier() { - // '-' followed by an integer literal N should be interpreted as integer - // negation rather than a floating-point NEG modifier applied to N. - // Beside being contr-intuitive, such use of floating-point NEG modifier - // results in different meaning of integer literals used with VOP1/2/C - // and VOP3, for example: - // v_exp_f32_e32 v5, -1 // VOP1: src0 = 0xFFFFFFFF - // v_exp_f32_e64 v5, -1 // VOP3: src0 = 0x80000001 - // Negative fp literals should be handled likewise for unifomtity - if (!NextToken.is(AsmToken::Integer) && !NextToken.is(AsmToken::Real)) { - Parser.Lex(); - Negate = true; - } + AsmToken Tok = getToken(); + AsmToken NextToken[2]; + peekTokens(NextToken); + + return isOperandModifier(Tok, NextToken[0]) || + (Tok.is(AsmToken::Minus) && isRegOrOperandModifier(NextToken[0], NextToken[1])) || + isOpcodeModifierWithVal(Tok, NextToken[0]); +} + +// Check if the current token is an SP3 'neg' modifier. +// Currently this modifier is allowed in the following context: +// +// 1. Before a register, e.g. "-v0", "-v[...]" or "-[v0,v1]". +// 2. Before an 'abs' modifier: -abs(...) +// 3. Before an SP3 'abs' modifier: -|...| +// +// In all other cases "-" is handled as a part +// of an expression that follows the sign. +// +// Note: When "-" is followed by an integer literal N, +// this is interpreted as integer negation rather +// than a floating-point NEG modifier applied to N. 
+// Beside being counter-intuitive, such use of floating-point +// NEG modifier would have resulted in a different meaning +// of integer literals used with VOP1/2/C and VOP3, +// for example: +// v_exp_f32_e32 v5, -1 // VOP1: src0 = 0xFFFFFFFF +// v_exp_f32_e64 v5, -1 // VOP3: src0 = 0x80000001 +// Negative fp literals with preceding "-" are +// handled likewise for uniformity +// +bool +AMDGPUAsmParser::parseSP3NegModifier() { + + AsmToken NextToken[2]; + peekTokens(NextToken); + + if (isToken(AsmToken::Minus) && + (isRegister(NextToken[0], NextToken[1]) || + NextToken[0].is(AsmToken::Pipe) || + isId(NextToken[0], "abs"))) { + lex(); + return true; }
- if (getLexer().getKind() == AsmToken::Identifier && - Parser.getTok().getString() == "neg") { - if (Negate) { - Error(Parser.getTok().getLoc(), "expected register or immediate"); - return MatchOperand_ParseFail; - } - Parser.Lex(); - Negate2 = true; - if (getLexer().isNot(AsmToken::LParen)) { - Error(Parser.getTok().getLoc(), "expected left paren after neg"); - return MatchOperand_ParseFail; - } - Parser.Lex(); + return false; +} + +OperandMatchResultTy +AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands, + bool AllowImm) { + bool Neg, SP3Neg; + bool Abs, SP3Abs; + SMLoc Loc; + + // Disable ambiguous constructs like '--1' etc. Should use neg(-1) instead. + if (isToken(AsmToken::Minus) && peekToken().is(AsmToken::Minus)) { + Error(getLoc(), "invalid syntax, expected 'neg' modifier"); + return MatchOperand_ParseFail; } - if (getLexer().getKind() == AsmToken::Identifier && - Parser.getTok().getString() == "abs") { - Parser.Lex(); - Abs2 = true; - if (getLexer().isNot(AsmToken::LParen)) { - Error(Parser.getTok().getLoc(), "expected left paren after abs"); - return MatchOperand_ParseFail; - } - Parser.Lex(); + SP3Neg = parseSP3NegModifier(); + + Loc = getLoc(); + Neg = trySkipId("neg"); + if (Neg && SP3Neg) { + Error(Loc, "expected register or immediate"); + return MatchOperand_ParseFail; } + if (Neg && !skipToken(AsmToken::LParen, "expected left paren after neg")) + return MatchOperand_ParseFail; - if (getLexer().getKind() == AsmToken::Pipe) { - if (Abs2) { - Error(Parser.getTok().getLoc(), "expected register or immediate"); - return MatchOperand_ParseFail; - } - Parser.Lex(); - Abs = true; + Abs = trySkipId("abs"); + if (Abs && !skipToken(AsmToken::LParen, "expected left paren after abs")) + return MatchOperand_ParseFail; + + Loc = getLoc(); + SP3Abs = trySkipToken(AsmToken::Pipe); + if (Abs && SP3Abs) { + Error(Loc, "expected register or immediate"); + return MatchOperand_ParseFail; } OperandMatchResultTy Res; if (AllowImm) { - Res = parseRegOrImm(Operands, Abs); + Res = parseRegOrImm(Operands, SP3Abs); } else { Res = parseReg(Operands); } if (Res != MatchOperand_Success) { - return Res; + return (SP3Neg || Neg || SP3Abs || Abs)? 
MatchOperand_ParseFail : Res; } - AMDGPUOperand::Modifiers Mods; - if (Abs) { - if (getLexer().getKind() != AsmToken::Pipe) { - Error(Parser.getTok().getLoc(), "expected vertical bar"); - return MatchOperand_ParseFail; - } - Parser.Lex(); - Mods.Abs = true; - } - if (Abs2) { - if (getLexer().isNot(AsmToken::RParen)) { - Error(Parser.getTok().getLoc(), "expected closing parentheses"); - return MatchOperand_ParseFail; - } - Parser.Lex(); - Mods.Abs = true; - } + if (SP3Abs && !skipToken(AsmToken::Pipe, "expected vertical bar")) + return MatchOperand_ParseFail; + if (Abs && !skipToken(AsmToken::RParen, "expected closing parentheses")) + return MatchOperand_ParseFail; + if (Neg && !skipToken(AsmToken::RParen, "expected closing parentheses")) + return MatchOperand_ParseFail; - if (Negate) { - Mods.Neg = true; - } else if (Negate2) { - if (getLexer().isNot(AsmToken::RParen)) { - Error(Parser.getTok().getLoc(), "expected closing parentheses"); - return MatchOperand_ParseFail; - } - Parser.Lex(); - Mods.Neg = true; - } + AMDGPUOperand::Modifiers Mods; + Mods.Abs = Abs || SP3Abs; + Mods.Neg = Neg || SP3Neg; if (Mods.hasFPModifiers()) { AMDGPUOperand &Op = static_cast<AMDGPUOperand &>(*Operands.back()); + if (Op.isExpr()) { + Error(Op.getStartLoc(), "expected an absolute expression"); + return MatchOperand_ParseFail; + } Op.setModifiers(Mods); } return MatchOperand_Success; @@ -2069,18 +2515,9 @@ AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands, OperandMatchResultTy AMDGPUAsmParser::parseRegOrImmWithIntInputMods(OperandVector &Operands, bool AllowImm) { - bool Sext = false; - - if (getLexer().getKind() == AsmToken::Identifier && - Parser.getTok().getString() == "sext") { - Parser.Lex(); - Sext = true; - if (getLexer().isNot(AsmToken::LParen)) { - Error(Parser.getTok().getLoc(), "expected left paren after sext"); - return MatchOperand_ParseFail; - } - Parser.Lex(); - } + bool Sext = trySkipId("sext"); + if (Sext && !skipToken(AsmToken::LParen, "expected left paren after sext")) + return MatchOperand_ParseFail; OperandMatchResultTy Res; if (AllowImm) { @@ -2089,21 +2526,21 @@ AMDGPUAsmParser::parseRegOrImmWithIntInputMods(OperandVector &Operands, Res = parseReg(Operands); } if (Res != MatchOperand_Success) { - return Res; + return Sext? 
MatchOperand_ParseFail : Res; } + if (Sext && !skipToken(AsmToken::RParen, "expected closing parentheses")) + return MatchOperand_ParseFail; + AMDGPUOperand::Modifiers Mods; - if (Sext) { - if (getLexer().isNot(AsmToken::RParen)) { - Error(Parser.getTok().getLoc(), "expected closing parentheses"); - return MatchOperand_ParseFail; - } - Parser.Lex(); - Mods.Sext = true; - } + Mods.Sext = Sext; if (Mods.hasIntModifiers()) { AMDGPUOperand &Op = static_cast<AMDGPUOperand &>(*Operands.back()); + if (Op.isExpr()) { + Error(Op.getStartLoc(), "expected an absolute expression"); + return MatchOperand_ParseFail; + } Op.setModifiers(Mods); } @@ -2121,21 +2558,24 @@ AMDGPUAsmParser::parseRegWithIntInputMods(OperandVector &Operands) { } OperandMatchResultTy AMDGPUAsmParser::parseVReg32OrOff(OperandVector &Operands) { + auto Loc = getLoc(); + if (trySkipId("off")) { + Operands.push_back(AMDGPUOperand::CreateImm(this, 0, Loc, + AMDGPUOperand::ImmTyOff, false)); + return MatchOperand_Success; + } + + if (!isRegister()) + return MatchOperand_NoMatch; + std::unique_ptr<AMDGPUOperand> Reg = parseRegister(); if (Reg) { Operands.push_back(std::move(Reg)); return MatchOperand_Success; } - const AsmToken &Tok = Parser.getTok(); - if (Tok.getString() == "off") { - Operands.push_back(AMDGPUOperand::CreateImm(this, 0, Tok.getLoc(), - AMDGPUOperand::ImmTyOff, false)); - Parser.Lex(); - return MatchOperand_Success; - } + return MatchOperand_ParseFail; - return MatchOperand_NoMatch; } unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) { @@ -2163,15 +2603,6 @@ unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) { } } - if ((TSFlags & SIInstrFlags::FLAT) && !hasFlatOffsets()) { - // FIXME: Produces error without correct column reported. - auto OpNum = - AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::offset); - const auto &Op = Inst.getOperand(OpNum); - if (Op.getImm() != 0) - return Match_InvalidOperand; - } - return Match_Success; } @@ -2214,7 +2645,10 @@ unsigned AMDGPUAsmParser::findImplicitSGPRReadInVOP(const MCInst &Inst) const { switch (Reg) { case AMDGPU::FLAT_SCR: case AMDGPU::VCC: + case AMDGPU::VCC_LO: + case AMDGPU::VCC_HI: case AMDGPU::M0: + case AMDGPU::SGPR_NULL: return Reg; default: break; @@ -2248,7 +2682,11 @@ bool AMDGPUAsmParser::isInlineConstant(const MCInst &Inst, case 2: { const unsigned OperandType = Desc.OpInfo[OpIdx].OperandType; if (OperandType == AMDGPU::OPERAND_REG_INLINE_C_V2INT16 || - OperandType == AMDGPU::OPERAND_REG_INLINE_C_V2FP16) { + OperandType == AMDGPU::OPERAND_REG_INLINE_C_V2FP16 || + OperandType == AMDGPU::OPERAND_REG_INLINE_AC_V2INT16 || + OperandType == AMDGPU::OPERAND_REG_INLINE_AC_V2FP16 || + OperandType == AMDGPU::OPERAND_REG_IMM_V2INT16 || + OperandType == AMDGPU::OPERAND_REG_IMM_V2FP16) { return AMDGPU::isInlinableLiteralV216(Val, hasInv2PiInlineImm()); } else { return AMDGPU::isInlinableLiteral16(Val, hasInv2PiInlineImm()); @@ -2272,6 +2710,8 @@ bool AMDGPUAsmParser::validateConstantBusLimitations(const MCInst &Inst) { const unsigned Opcode = Inst.getOpcode(); const MCInstrDesc &Desc = MII.get(Opcode); unsigned ConstantBusUseCount = 0; + unsigned NumLiterals = 0; + unsigned LiteralSize; if (Desc.TSFlags & (SIInstrFlags::VOPC | @@ -2283,8 +2723,10 @@ bool AMDGPUAsmParser::validateConstantBusLimitations(const MCInst &Inst) { ++ConstantBusUseCount; } + SmallDenseSet<unsigned> SGPRsUsed; unsigned SGPRUsed = findImplicitSGPRReadInVOP(Inst); if (SGPRUsed != AMDGPU::NoRegister) { + SGPRsUsed.insert(SGPRUsed); ++ConstantBusUseCount; } @@ 
-2307,16 +2749,41 @@ bool AMDGPUAsmParser::validateConstantBusLimitations(const MCInst &Inst) { // flat_scratch_lo, flat_scratch_hi // are theoretically valid but they are disabled anyway. // Note that this code mimics SIInstrInfo::verifyInstruction - if (Reg != SGPRUsed) { + if (!SGPRsUsed.count(Reg)) { + SGPRsUsed.insert(Reg); ++ConstantBusUseCount; } - SGPRUsed = Reg; } else { // Expression or a literal - ++ConstantBusUseCount; + + if (Desc.OpInfo[OpIdx].OperandType == MCOI::OPERAND_IMMEDIATE) + continue; // special operand like VINTERP attr_chan + + // An instruction may use only one literal. + // This has been validated on the previous step. + // See validateVOP3Literal. + // This literal may be used as more than one operand. + // If all these operands are of the same size, + // this literal counts as one scalar value. + // Otherwise it counts as 2 scalar values. + // See "GFX10 Shader Programming", section 3.6.2.3. + + unsigned Size = AMDGPU::getOperandSize(Desc, OpIdx); + if (Size < 4) Size = 4; + + if (NumLiterals == 0) { + NumLiterals = 1; + LiteralSize = Size; + } else if (LiteralSize != Size) { + NumLiterals = 2; + } } } } } + ConstantBusUseCount += NumLiterals; + + if (isGFX10()) + return ConstantBusUseCount <= 2; return ConstantBusUseCount <= 1; } @@ -2405,6 +2872,46 @@ bool AMDGPUAsmParser::validateMIMGDataSize(const MCInst &Inst) { return (VDataSize / 4) == DataSize + TFESize; } +bool AMDGPUAsmParser::validateMIMGAddrSize(const MCInst &Inst) { + const unsigned Opc = Inst.getOpcode(); + const MCInstrDesc &Desc = MII.get(Opc); + + if ((Desc.TSFlags & SIInstrFlags::MIMG) == 0 || !isGFX10()) + return true; + + const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc); + const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = + AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode); + int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0); + int SrsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc); + int DimIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::dim); + + assert(VAddr0Idx != -1); + assert(SrsrcIdx != -1); + assert(DimIdx != -1); + assert(SrsrcIdx > VAddr0Idx); + + unsigned Dim = Inst.getOperand(DimIdx).getImm(); + const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfoByEncoding(Dim); + bool IsNSA = SrsrcIdx - VAddr0Idx > 1; + unsigned VAddrSize = + IsNSA ? SrsrcIdx - VAddr0Idx + : AMDGPU::getRegOperandSize(getMRI(), Desc, VAddr0Idx) / 4; + + unsigned AddrSize = BaseOpcode->NumExtraArgs + + (BaseOpcode->Gradients ? DimInfo->NumGradients : 0) + + (BaseOpcode->Coordinates ? DimInfo->NumCoords : 0) + + (BaseOpcode->LodOrClampOrMip ? 
1 : 0); + if (!IsNSA) { + if (AddrSize > 8) + AddrSize = 16; + else if (AddrSize > 4) + AddrSize = 8; + } + + return VAddrSize == AddrSize; +} + bool AMDGPUAsmParser::validateMIMGAtomicDMask(const MCInst &Inst) { const unsigned Opc = Inst.getOpcode(); @@ -2461,8 +2968,346 @@ bool AMDGPUAsmParser::validateMIMGD16(const MCInst &Inst) { return true; } +bool AMDGPUAsmParser::validateMIMGDim(const MCInst &Inst) { + const unsigned Opc = Inst.getOpcode(); + const MCInstrDesc &Desc = MII.get(Opc); + + if ((Desc.TSFlags & SIInstrFlags::MIMG) == 0) + return true; + + int DimIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::dim); + if (DimIdx < 0) + return true; + + long Imm = Inst.getOperand(DimIdx).getImm(); + if (Imm < 0 || Imm >= 8) + return false; + + return true; +} + +static bool IsRevOpcode(const unsigned Opcode) +{ + switch (Opcode) { + case AMDGPU::V_SUBREV_F32_e32: + case AMDGPU::V_SUBREV_F32_e64: + case AMDGPU::V_SUBREV_F32_e32_gfx10: + case AMDGPU::V_SUBREV_F32_e32_gfx6_gfx7: + case AMDGPU::V_SUBREV_F32_e32_vi: + case AMDGPU::V_SUBREV_F32_e64_gfx10: + case AMDGPU::V_SUBREV_F32_e64_gfx6_gfx7: + case AMDGPU::V_SUBREV_F32_e64_vi: + + case AMDGPU::V_SUBREV_I32_e32: + case AMDGPU::V_SUBREV_I32_e64: + case AMDGPU::V_SUBREV_I32_e32_gfx6_gfx7: + case AMDGPU::V_SUBREV_I32_e64_gfx6_gfx7: + + case AMDGPU::V_SUBBREV_U32_e32: + case AMDGPU::V_SUBBREV_U32_e64: + case AMDGPU::V_SUBBREV_U32_e32_gfx6_gfx7: + case AMDGPU::V_SUBBREV_U32_e32_vi: + case AMDGPU::V_SUBBREV_U32_e64_gfx6_gfx7: + case AMDGPU::V_SUBBREV_U32_e64_vi: + + case AMDGPU::V_SUBREV_U32_e32: + case AMDGPU::V_SUBREV_U32_e64: + case AMDGPU::V_SUBREV_U32_e32_gfx9: + case AMDGPU::V_SUBREV_U32_e32_vi: + case AMDGPU::V_SUBREV_U32_e64_gfx9: + case AMDGPU::V_SUBREV_U32_e64_vi: + + case AMDGPU::V_SUBREV_F16_e32: + case AMDGPU::V_SUBREV_F16_e64: + case AMDGPU::V_SUBREV_F16_e32_gfx10: + case AMDGPU::V_SUBREV_F16_e32_vi: + case AMDGPU::V_SUBREV_F16_e64_gfx10: + case AMDGPU::V_SUBREV_F16_e64_vi: + + case AMDGPU::V_SUBREV_U16_e32: + case AMDGPU::V_SUBREV_U16_e64: + case AMDGPU::V_SUBREV_U16_e32_vi: + case AMDGPU::V_SUBREV_U16_e64_vi: + + case AMDGPU::V_SUBREV_CO_U32_e32_gfx9: + case AMDGPU::V_SUBREV_CO_U32_e64_gfx10: + case AMDGPU::V_SUBREV_CO_U32_e64_gfx9: + + case AMDGPU::V_SUBBREV_CO_U32_e32_gfx9: + case AMDGPU::V_SUBBREV_CO_U32_e64_gfx9: + + case AMDGPU::V_SUBREV_NC_U32_e32_gfx10: + case AMDGPU::V_SUBREV_NC_U32_e64_gfx10: + + case AMDGPU::V_SUBREV_CO_CI_U32_e32_gfx10: + case AMDGPU::V_SUBREV_CO_CI_U32_e64_gfx10: + + case AMDGPU::V_LSHRREV_B32_e32: + case AMDGPU::V_LSHRREV_B32_e64: + case AMDGPU::V_LSHRREV_B32_e32_gfx6_gfx7: + case AMDGPU::V_LSHRREV_B32_e64_gfx6_gfx7: + case AMDGPU::V_LSHRREV_B32_e32_vi: + case AMDGPU::V_LSHRREV_B32_e64_vi: + case AMDGPU::V_LSHRREV_B32_e32_gfx10: + case AMDGPU::V_LSHRREV_B32_e64_gfx10: + + case AMDGPU::V_ASHRREV_I32_e32: + case AMDGPU::V_ASHRREV_I32_e64: + case AMDGPU::V_ASHRREV_I32_e32_gfx10: + case AMDGPU::V_ASHRREV_I32_e32_gfx6_gfx7: + case AMDGPU::V_ASHRREV_I32_e32_vi: + case AMDGPU::V_ASHRREV_I32_e64_gfx10: + case AMDGPU::V_ASHRREV_I32_e64_gfx6_gfx7: + case AMDGPU::V_ASHRREV_I32_e64_vi: + + case AMDGPU::V_LSHLREV_B32_e32: + case AMDGPU::V_LSHLREV_B32_e64: + case AMDGPU::V_LSHLREV_B32_e32_gfx10: + case AMDGPU::V_LSHLREV_B32_e32_gfx6_gfx7: + case AMDGPU::V_LSHLREV_B32_e32_vi: + case AMDGPU::V_LSHLREV_B32_e64_gfx10: + case AMDGPU::V_LSHLREV_B32_e64_gfx6_gfx7: + case AMDGPU::V_LSHLREV_B32_e64_vi: + + case AMDGPU::V_LSHLREV_B16_e32: + case AMDGPU::V_LSHLREV_B16_e64: + case AMDGPU::V_LSHLREV_B16_e32_vi: + case 
AMDGPU::V_LSHLREV_B16_e64_vi: + case AMDGPU::V_LSHLREV_B16_gfx10: + + case AMDGPU::V_LSHRREV_B16_e32: + case AMDGPU::V_LSHRREV_B16_e64: + case AMDGPU::V_LSHRREV_B16_e32_vi: + case AMDGPU::V_LSHRREV_B16_e64_vi: + case AMDGPU::V_LSHRREV_B16_gfx10: + + case AMDGPU::V_ASHRREV_I16_e32: + case AMDGPU::V_ASHRREV_I16_e64: + case AMDGPU::V_ASHRREV_I16_e32_vi: + case AMDGPU::V_ASHRREV_I16_e64_vi: + case AMDGPU::V_ASHRREV_I16_gfx10: + + case AMDGPU::V_LSHLREV_B64: + case AMDGPU::V_LSHLREV_B64_gfx10: + case AMDGPU::V_LSHLREV_B64_vi: + + case AMDGPU::V_LSHRREV_B64: + case AMDGPU::V_LSHRREV_B64_gfx10: + case AMDGPU::V_LSHRREV_B64_vi: + + case AMDGPU::V_ASHRREV_I64: + case AMDGPU::V_ASHRREV_I64_gfx10: + case AMDGPU::V_ASHRREV_I64_vi: + + case AMDGPU::V_PK_LSHLREV_B16: + case AMDGPU::V_PK_LSHLREV_B16_gfx10: + case AMDGPU::V_PK_LSHLREV_B16_vi: + + case AMDGPU::V_PK_LSHRREV_B16: + case AMDGPU::V_PK_LSHRREV_B16_gfx10: + case AMDGPU::V_PK_LSHRREV_B16_vi: + case AMDGPU::V_PK_ASHRREV_I16: + case AMDGPU::V_PK_ASHRREV_I16_gfx10: + case AMDGPU::V_PK_ASHRREV_I16_vi: + return true; + default: + return false; + } +} + +bool AMDGPUAsmParser::validateLdsDirect(const MCInst &Inst) { + + using namespace SIInstrFlags; + const unsigned Opcode = Inst.getOpcode(); + const MCInstrDesc &Desc = MII.get(Opcode); + + // lds_direct register is defined so that it can be used + // with 9-bit operands only. Ignore encodings which do not accept these. + if ((Desc.TSFlags & (VOP1 | VOP2 | VOP3 | VOPC | VOP3P | SIInstrFlags::SDWA)) == 0) + return true; + + const int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); + const int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1); + const int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2); + + const int SrcIndices[] = { Src1Idx, Src2Idx }; + + // lds_direct cannot be specified as either src1 or src2. + for (int SrcIdx : SrcIndices) { + if (SrcIdx == -1) break; + const MCOperand &Src = Inst.getOperand(SrcIdx); + if (Src.isReg() && Src.getReg() == LDS_DIRECT) { + return false; + } + } + + if (Src0Idx == -1) + return true; + + const MCOperand &Src = Inst.getOperand(Src0Idx); + if (!Src.isReg() || Src.getReg() != LDS_DIRECT) + return true; + + // lds_direct is specified as src0. Check additional limitations. + return (Desc.TSFlags & SIInstrFlags::SDWA) == 0 && !IsRevOpcode(Opcode); +} + +SMLoc AMDGPUAsmParser::getFlatOffsetLoc(const OperandVector &Operands) const { + for (unsigned i = 1, e = Operands.size(); i != e; ++i) { + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); + if (Op.isFlatOffset()) + return Op.getStartLoc(); + } + return getLoc(); +} + +bool AMDGPUAsmParser::validateFlatOffset(const MCInst &Inst, + const OperandVector &Operands) { + uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags; + if ((TSFlags & SIInstrFlags::FLAT) == 0) + return true; + + auto Opcode = Inst.getOpcode(); + auto OpNum = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::offset); + assert(OpNum != -1); + + const auto &Op = Inst.getOperand(OpNum); + if (!hasFlatOffsets() && Op.getImm() != 0) { + Error(getFlatOffsetLoc(Operands), + "flat offset modifier is not supported on this GPU"); + return false; + } + + // Address offset is 12-bit signed for GFX10, 13-bit for GFX9. + // For FLAT segment the offset must be positive; + // MSB is ignored and forced to zero. + unsigned OffsetSize = isGFX9() ? 13 : 12; + if (TSFlags & SIInstrFlags::IsNonFlatSeg) { + if (!isIntN(OffsetSize, Op.getImm())) { + Error(getFlatOffsetLoc(Operands), + isGFX9() ? 
"expected a 13-bit signed offset" : + "expected a 12-bit signed offset"); + return false; + } + } else { + if (!isUIntN(OffsetSize - 1, Op.getImm())) { + Error(getFlatOffsetLoc(Operands), + isGFX9() ? "expected a 12-bit unsigned offset" : + "expected an 11-bit unsigned offset"); + return false; + } + } + + return true; +} + +bool AMDGPUAsmParser::validateSOPLiteral(const MCInst &Inst) const { + unsigned Opcode = Inst.getOpcode(); + const MCInstrDesc &Desc = MII.get(Opcode); + if (!(Desc.TSFlags & (SIInstrFlags::SOP2 | SIInstrFlags::SOPC))) + return true; + + const int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); + const int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1); + + const int OpIndices[] = { Src0Idx, Src1Idx }; + + unsigned NumLiterals = 0; + uint32_t LiteralValue; + + for (int OpIdx : OpIndices) { + if (OpIdx == -1) break; + + const MCOperand &MO = Inst.getOperand(OpIdx); + if (MO.isImm() && + // Exclude special imm operands (like that used by s_set_gpr_idx_on) + AMDGPU::isSISrcOperand(Desc, OpIdx) && + !isInlineConstant(Inst, OpIdx)) { + uint32_t Value = static_cast<uint32_t>(MO.getImm()); + if (NumLiterals == 0 || LiteralValue != Value) { + LiteralValue = Value; + ++NumLiterals; + } + } + } + + return NumLiterals <= 1; +} + +bool AMDGPUAsmParser::validateOpSel(const MCInst &Inst) { + const unsigned Opc = Inst.getOpcode(); + if (Opc == AMDGPU::V_PERMLANE16_B32_gfx10 || + Opc == AMDGPU::V_PERMLANEX16_B32_gfx10) { + int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel); + unsigned OpSel = Inst.getOperand(OpSelIdx).getImm(); + + if (OpSel & ~3) + return false; + } + return true; +} + +// Check if VCC register matches wavefront size +bool AMDGPUAsmParser::validateVccOperand(unsigned Reg) const { + auto FB = getFeatureBits(); + return (FB[AMDGPU::FeatureWavefrontSize64] && Reg == AMDGPU::VCC) || + (FB[AMDGPU::FeatureWavefrontSize32] && Reg == AMDGPU::VCC_LO); +} + +// VOP3 literal is only allowed in GFX10+ and only one can be used +bool AMDGPUAsmParser::validateVOP3Literal(const MCInst &Inst) const { + unsigned Opcode = Inst.getOpcode(); + const MCInstrDesc &Desc = MII.get(Opcode); + if (!(Desc.TSFlags & (SIInstrFlags::VOP3 | SIInstrFlags::VOP3P))) + return true; + + const int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); + const int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1); + const int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2); + + const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx }; + + unsigned NumLiterals = 0; + uint32_t LiteralValue; + + for (int OpIdx : OpIndices) { + if (OpIdx == -1) break; + + const MCOperand &MO = Inst.getOperand(OpIdx); + if (!MO.isImm() || !AMDGPU::isSISrcOperand(Desc, OpIdx)) + continue; + + if (!isInlineConstant(Inst, OpIdx)) { + uint32_t Value = static_cast<uint32_t>(MO.getImm()); + if (NumLiterals == 0 || LiteralValue != Value) { + LiteralValue = Value; + ++NumLiterals; + } + } + } + + return !NumLiterals || + (NumLiterals == 1 && getFeatureBits()[AMDGPU::FeatureVOP3Literal]); +} + bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst, - const SMLoc &IDLoc) { + const SMLoc &IDLoc, + const OperandVector &Operands) { + if (!validateLdsDirect(Inst)) { + Error(IDLoc, + "invalid use of lds_direct"); + return false; + } + if (!validateSOPLiteral(Inst)) { + Error(IDLoc, + "only one literal operand is allowed"); + return false; + } + if (!validateVOP3Literal(Inst)) { + Error(IDLoc, + "invalid literal operand"); + return 
false; + } if (!validateConstantBusLimitations(Inst)) { Error(IDLoc, "invalid operand (violates constant bus restrictions)"); @@ -2478,17 +3323,31 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst, "integer clamping is not supported on this GPU"); return false; } + if (!validateOpSel(Inst)) { + Error(IDLoc, + "invalid op_sel operand"); + return false; + } // For MUBUF/MTBUF d16 is a part of opcode, so there is nothing to validate. if (!validateMIMGD16(Inst)) { Error(IDLoc, "d16 modifier is not supported on this GPU"); return false; } + if (!validateMIMGDim(Inst)) { + Error(IDLoc, "dim modifier is required on this GPU"); + return false; + } if (!validateMIMGDataSize(Inst)) { Error(IDLoc, "image data size does not match dmask and tfe"); return false; } + if (!validateMIMGAddrSize(Inst)) { + Error(IDLoc, + "image address size does not match dim and a16"); + return false; + } if (!validateMIMGAtomicDMask(Inst)) { Error(IDLoc, "invalid atomic image dmask"); @@ -2499,11 +3358,15 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst, "invalid image_gather dmask: only one bit must be set"); return false; } + if (!validateFlatOffset(Inst, Operands)) { + return false; + } return true; } -static std::string AMDGPUMnemonicSpellCheck(StringRef S, uint64_t FBS, +static std::string AMDGPUMnemonicSpellCheck(StringRef S, + const FeatureBitset &FBS, unsigned VariantID = 0); bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, @@ -2538,7 +3401,7 @@ bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, switch (Result) { default: break; case Match_Success: - if (!validateInstruction(Inst, IDLoc)) { + if (!validateInstruction(Inst, IDLoc, Operands)) { return true; } Inst.setLoc(IDLoc); @@ -2549,7 +3412,7 @@ bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, return Error(IDLoc, "instruction not supported on this GPU"); case Match_MnemonicFail: { - uint64_t FBS = ComputeAvailableFeatures(getSTI().getFeatureBits()); + FeatureBitset FBS = ComputeAvailableFeatures(getSTI().getFeatureBits()); std::string Suggestion = AMDGPUMnemonicSpellCheck( ((AMDGPUOperand &)*Operands[0]).getToken(), FBS); return Error(IDLoc, "invalid instruction" + Suggestion, @@ -2632,32 +3495,39 @@ bool AMDGPUAsmParser::OutOfRangeError(SMRange Range) { bool AMDGPUAsmParser::calculateGPRBlocks( const FeatureBitset &Features, bool VCCUsed, bool FlatScrUsed, - bool XNACKUsed, unsigned NextFreeVGPR, SMRange VGPRRange, - unsigned NextFreeSGPR, SMRange SGPRRange, unsigned &VGPRBlocks, - unsigned &SGPRBlocks) { + bool XNACKUsed, Optional<bool> EnableWavefrontSize32, unsigned NextFreeVGPR, + SMRange VGPRRange, unsigned NextFreeSGPR, SMRange SGPRRange, + unsigned &VGPRBlocks, unsigned &SGPRBlocks) { // TODO(scott.linder): These calculations are duplicated from // AMDGPUAsmPrinter::getSIProgramInfo and could be unified. 
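The granulated register-block encoding that calculateGPRBlocks feeds into follows a simple round-up-and-bias pattern; below is a sketch under assumed granule values (the real computation lives in IsaInfo::getNumVGPRBlocks and related helpers):

  #include <algorithm>

  // Illustrative only: encode a register count as the "blocks" value
  // stored in COMPUTE_PGM_RSRC1: round the count up to the allocation
  // granule, then store (blocks - 1).
  static unsigned encodeGranulatedCount(unsigned NumRegs, unsigned Granule) {
    NumRegs = std::max(1u, NumRegs);                     // zero still takes one block
    unsigned Blocks = (NumRegs + Granule - 1) / Granule; // round up
    return Blocks - 1;                                   // field is biased by one
  }

With the granule values assumed here, 17 VGPRs would encode as encodeGranulatedCount(17, 4) == 4 under wave64 but encodeGranulatedCount(17, 8) == 2 under wave32, which is why EnableWavefrontSize32 has to be threaded through the calculation above.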
IsaVersion Version = getIsaVersion(getSTI().getCPU()); unsigned NumVGPRs = NextFreeVGPR; unsigned NumSGPRs = NextFreeSGPR; - unsigned MaxAddressableNumSGPRs = IsaInfo::getAddressableNumSGPRs(&getSTI()); - if (Version.Major >= 8 && !Features.test(FeatureSGPRInitBug) && - NumSGPRs > MaxAddressableNumSGPRs) - return OutOfRangeError(SGPRRange); + if (Version.Major >= 10) + NumSGPRs = 0; + else { + unsigned MaxAddressableNumSGPRs = + IsaInfo::getAddressableNumSGPRs(&getSTI()); - NumSGPRs += - IsaInfo::getNumExtraSGPRs(&getSTI(), VCCUsed, FlatScrUsed, XNACKUsed); + if (Version.Major >= 8 && !Features.test(FeatureSGPRInitBug) && + NumSGPRs > MaxAddressableNumSGPRs) + return OutOfRangeError(SGPRRange); - if ((Version.Major <= 7 || Features.test(FeatureSGPRInitBug)) && - NumSGPRs > MaxAddressableNumSGPRs) - return OutOfRangeError(SGPRRange); + NumSGPRs += + IsaInfo::getNumExtraSGPRs(&getSTI(), VCCUsed, FlatScrUsed, XNACKUsed); - if (Features.test(FeatureSGPRInitBug)) - NumSGPRs = IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG; + if ((Version.Major <= 7 || Features.test(FeatureSGPRInitBug)) && + NumSGPRs > MaxAddressableNumSGPRs) + return OutOfRangeError(SGPRRange); - VGPRBlocks = IsaInfo::getNumVGPRBlocks(&getSTI(), NumVGPRs); + if (Features.test(FeatureSGPRInitBug)) + NumSGPRs = IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG; + } + + VGPRBlocks = + IsaInfo::getNumVGPRBlocks(&getSTI(), NumVGPRs, EnableWavefrontSize32); SGPRBlocks = IsaInfo::getNumSGPRBlocks(&getSTI(), NumSGPRs); return false; @@ -2674,7 +3544,7 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { if (getParser().parseIdentifier(KernelName)) return true; - kernel_descriptor_t KD = getDefaultAmdhsaKernelDescriptor(); + kernel_descriptor_t KD = getDefaultAmdhsaKernelDescriptor(&getSTI()); StringSet<> Seen; @@ -2688,6 +3558,7 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { bool ReserveVCC = true; bool ReserveFlatScr = true; bool ReserveXNACK = hasXNACK(); + Optional<bool> EnableWavefrontSize32; while (true) { while (getLexer().is(AsmToken::EndOfStatement)) @@ -2736,37 +3607,45 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { PARSE_BITS_ENTRY(KD.kernel_code_properties, KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER, Val, ValRange); - UserSGPRCount++; + UserSGPRCount += 4; } else if (ID == ".amdhsa_user_sgpr_dispatch_ptr") { PARSE_BITS_ENTRY(KD.kernel_code_properties, KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR, Val, ValRange); - UserSGPRCount++; + UserSGPRCount += 2; } else if (ID == ".amdhsa_user_sgpr_queue_ptr") { PARSE_BITS_ENTRY(KD.kernel_code_properties, KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR, Val, ValRange); - UserSGPRCount++; + UserSGPRCount += 2; } else if (ID == ".amdhsa_user_sgpr_kernarg_segment_ptr") { PARSE_BITS_ENTRY(KD.kernel_code_properties, KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR, Val, ValRange); - UserSGPRCount++; + UserSGPRCount += 2; } else if (ID == ".amdhsa_user_sgpr_dispatch_id") { PARSE_BITS_ENTRY(KD.kernel_code_properties, KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID, Val, ValRange); - UserSGPRCount++; + UserSGPRCount += 2; } else if (ID == ".amdhsa_user_sgpr_flat_scratch_init") { PARSE_BITS_ENTRY(KD.kernel_code_properties, KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT, Val, ValRange); - UserSGPRCount++; + UserSGPRCount += 2; } else if (ID == ".amdhsa_user_sgpr_private_segment_size") { PARSE_BITS_ENTRY(KD.kernel_code_properties, KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE, Val, ValRange); - UserSGPRCount++; + UserSGPRCount += 1; + } else if (ID == 
".amdhsa_wavefront_size32") { + if (IVersion.Major < 10) + return getParser().Error(IDRange.Start, "directive requires gfx10+", + IDRange); + EnableWavefrontSize32 = Val; + PARSE_BITS_ENTRY(KD.kernel_code_properties, + KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32, + Val, ValRange); } else if (ID == ".amdhsa_system_sgpr_private_segment_wavefront_offset") { PARSE_BITS_ENTRY( KD.compute_pgm_rsrc2, @@ -2841,6 +3720,24 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { IDRange); PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_FP16_OVFL, Val, ValRange); + } else if (ID == ".amdhsa_workgroup_processor_mode") { + if (IVersion.Major < 10) + return getParser().Error(IDRange.Start, "directive requires gfx10+", + IDRange); + PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_WGP_MODE, Val, + ValRange); + } else if (ID == ".amdhsa_memory_ordered") { + if (IVersion.Major < 10) + return getParser().Error(IDRange.Start, "directive requires gfx10+", + IDRange); + PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_MEM_ORDERED, Val, + ValRange); + } else if (ID == ".amdhsa_forward_progress") { + if (IVersion.Major < 10) + return getParser().Error(IDRange.Start, "directive requires gfx10+", + IDRange); + PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_FWD_PROGRESS, Val, + ValRange); } else if (ID == ".amdhsa_exception_fp_ieee_invalid_op") { PARSE_BITS_ENTRY( KD.compute_pgm_rsrc2, @@ -2888,8 +3785,9 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { unsigned VGPRBlocks; unsigned SGPRBlocks; if (calculateGPRBlocks(getFeatureBits(), ReserveVCC, ReserveFlatScr, - ReserveXNACK, NextFreeVGPR, VGPRRange, NextFreeSGPR, - SGPRRange, VGPRBlocks, SGPRBlocks)) + ReserveXNACK, EnableWavefrontSize32, NextFreeVGPR, + VGPRRange, NextFreeSGPR, SGPRRange, VGPRBlocks, + SGPRBlocks)) return true; if (!isUInt<COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT_WIDTH>( @@ -2994,6 +3892,46 @@ bool AMDGPUAsmParser::ParseAMDKernelCodeTValue(StringRef ID, return TokError(Err.str()); } Lex(); + + if (ID == "enable_wavefront_size32") { + if (Header.code_properties & AMD_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32) { + if (!isGFX10()) + return TokError("enable_wavefront_size32=1 is only allowed on GFX10+"); + if (!getFeatureBits()[AMDGPU::FeatureWavefrontSize32]) + return TokError("enable_wavefront_size32=1 requires +WavefrontSize32"); + } else { + if (!getFeatureBits()[AMDGPU::FeatureWavefrontSize64]) + return TokError("enable_wavefront_size32=0 requires +WavefrontSize64"); + } + } + + if (ID == "wavefront_size") { + if (Header.wavefront_size == 5) { + if (!isGFX10()) + return TokError("wavefront_size=5 is only allowed on GFX10+"); + if (!getFeatureBits()[AMDGPU::FeatureWavefrontSize32]) + return TokError("wavefront_size=5 requires +WavefrontSize32"); + } else if (Header.wavefront_size == 6) { + if (!getFeatureBits()[AMDGPU::FeatureWavefrontSize64]) + return TokError("wavefront_size=6 requires +WavefrontSize64"); + } + } + + if (ID == "enable_wgp_mode") { + if (G_00B848_WGP_MODE(Header.compute_pgm_resource_registers) && !isGFX10()) + return TokError("enable_wgp_mode=1 is only allowed on GFX10+"); + } + + if (ID == "enable_mem_ordered") { + if (G_00B848_MEM_ORDERED(Header.compute_pgm_resource_registers) && !isGFX10()) + return TokError("enable_mem_ordered=1 is only allowed on GFX10+"); + } + + if (ID == "enable_fwd_progress") { + if (G_00B848_FWD_PROGRESS(Header.compute_pgm_resource_registers) && !isGFX10()) + return TokError("enable_fwd_progress=1 is only allowed on GFX10+"); + } + return false; } @@ 
-3081,14 +4019,35 @@ bool AMDGPUAsmParser::ParseDirectiveHSAMetadata() { } std::string HSAMetadataString; - raw_string_ostream YamlStream(HSAMetadataString); + if (ParseToEndDirective(AssemblerDirectiveBegin, AssemblerDirectiveEnd, + HSAMetadataString)) + return true; + + if (IsaInfo::hasCodeObjectV3(&getSTI())) { + if (!getTargetStreamer().EmitHSAMetadataV3(HSAMetadataString)) + return Error(getParser().getTok().getLoc(), "invalid HSA metadata"); + } else { + if (!getTargetStreamer().EmitHSAMetadataV2(HSAMetadataString)) + return Error(getParser().getTok().getLoc(), "invalid HSA metadata"); + } + + return false; +} + +/// Common code to parse out a block of text (typically YAML) between start and +/// end directives. +bool AMDGPUAsmParser::ParseToEndDirective(const char *AssemblerDirectiveBegin, + const char *AssemblerDirectiveEnd, + std::string &CollectString) { + + raw_string_ostream CollectStream(CollectString); getLexer().setSkipSpace(false); bool FoundEnd = false; while (!getLexer().is(AsmToken::Eof)) { while (getLexer().is(AsmToken::Space)) { - YamlStream << getLexer().getTok().getString(); + CollectStream << getLexer().getTok().getString(); Lex(); } @@ -3101,8 +4060,8 @@ bool AMDGPUAsmParser::ParseDirectiveHSAMetadata() { } } - YamlStream << Parser.parseStringToEndOfStatement() - << getContext().getAsmInfo()->getSeparatorString(); + CollectStream << Parser.parseStringToEndOfStatement() + << getContext().getAsmInfo()->getSeparatorString(); Parser.eatToEndOfStatement(); } @@ -3111,22 +4070,27 @@ bool AMDGPUAsmParser::ParseDirectiveHSAMetadata() { if (getLexer().is(AsmToken::Eof) && !FoundEnd) { return TokError(Twine("expected directive ") + - Twine(HSAMD::AssemblerDirectiveEnd) + Twine(" not found")); + Twine(AssemblerDirectiveEnd) + Twine(" not found")); } - YamlStream.flush(); + CollectStream.flush(); + return false; +} - if (IsaInfo::hasCodeObjectV3(&getSTI())) { - if (!getTargetStreamer().EmitHSAMetadataV3(HSAMetadataString)) - return Error(getParser().getTok().getLoc(), "invalid HSA metadata"); - } else { - if (!getTargetStreamer().EmitHSAMetadataV2(HSAMetadataString)) - return Error(getParser().getTok().getLoc(), "invalid HSA metadata"); - } +/// Parse the assembler directive for new MsgPack-format PAL metadata. +bool AMDGPUAsmParser::ParseDirectivePALMetadataBegin() { + std::string String; + if (ParseToEndDirective(AMDGPU::PALMD::AssemblerDirectiveBegin, + AMDGPU::PALMD::AssemblerDirectiveEnd, String)) + return true; + auto PALMetadata = getTargetStreamer().getPALMetadata(); + if (!PALMetadata->setFromString(String)) + return Error(getParser().getTok().getLoc(), "invalid PAL metadata"); return false; } +/// Parse the assembler directive for old linear-format PAL metadata. 
bool AMDGPUAsmParser::ParseDirectivePALMetadata() { if (getSTI().getTargetTriple().getOS() != Triple::AMDPAL) { return Error(getParser().getTok().getLoc(), @@ -3134,19 +4098,82 @@ "not available on non-amdpal OSes")).str()); } - PALMD::Metadata PALMetadata; + auto PALMetadata = getTargetStreamer().getPALMetadata(); + PALMetadata->setLegacy(); for (;;) { - uint32_t Value; + uint32_t Key, Value; + if (ParseAsAbsoluteExpression(Key)) { + return TokError(Twine("invalid value in ") + + Twine(PALMD::AssemblerDirective)); + } + if (getLexer().isNot(AsmToken::Comma)) { + return TokError(Twine("expected an even number of values in ") + + Twine(PALMD::AssemblerDirective)); + } + Lex(); if (ParseAsAbsoluteExpression(Value)) { return TokError(Twine("invalid value in ") + Twine(PALMD::AssemblerDirective)); } - PALMetadata.push_back(Value); + PALMetadata->setRegister(Key, Value); if (getLexer().isNot(AsmToken::Comma)) break; Lex(); } - getTargetStreamer().EmitPALMetadata(PALMetadata); + return false; +} + +/// ParseDirectiveAMDGPULDS +/// ::= .amdgpu_lds identifier ',' size_expression [',' align_expression] +bool AMDGPUAsmParser::ParseDirectiveAMDGPULDS() { + if (getParser().checkForValidSection()) + return true; + + StringRef Name; + SMLoc NameLoc = getLexer().getLoc(); + if (getParser().parseIdentifier(Name)) + return TokError("expected identifier in directive"); + + MCSymbol *Symbol = getContext().getOrCreateSymbol(Name); + if (parseToken(AsmToken::Comma, "expected ','")) + return true; + + unsigned LocalMemorySize = AMDGPU::IsaInfo::getLocalMemorySize(&getSTI()); + + int64_t Size; + SMLoc SizeLoc = getLexer().getLoc(); + if (getParser().parseAbsoluteExpression(Size)) + return true; + if (Size < 0) + return Error(SizeLoc, "size must be non-negative"); + if (Size > LocalMemorySize) + return Error(SizeLoc, "size is too large"); + + int64_t Align = 4; + if (getLexer().is(AsmToken::Comma)) { + Lex(); + SMLoc AlignLoc = getLexer().getLoc(); + if (getParser().parseAbsoluteExpression(Align)) + return true; + if (Align < 0 || !isPowerOf2_64(Align)) + return Error(AlignLoc, "alignment must be a power of two"); + + // Alignment larger than the size of LDS is possible in theory, as long + // as the linker manages to place the symbol at address 0, but we do want + // to make sure the alignment fits nicely into a 32-bit integer. + if (Align >= 1u << 31) + return Error(AlignLoc, "alignment is too large"); + } + + if (parseToken(AsmToken::EndOfStatement, + "unexpected token in '.amdgpu_lds' directive")) + return true; + + Symbol->redefineIfPossible(); + if (!Symbol->isUndefined()) + return Error(NameLoc, "invalid symbol redefinition"); + + getTargetStreamer().emitAMDGPULDS(Symbol, Size, Align); return false; } @@ -3183,6 +4210,12 @@ bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) { return ParseDirectiveHSAMetadata(); } + if (IDVal == ".amdgpu_lds") + return ParseDirectiveAMDGPULDS(); + + if (IDVal == PALMD::AssemblerDirectiveBegin) + return ParseDirectivePALMetadataBegin(); + if (IDVal == PALMD::AssemblerDirective) return ParseDirectivePALMetadata(); @@ -3195,21 +4228,36 @@ bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI, for (MCRegAliasIterator R(AMDGPU::TTMP12_TTMP13_TTMP14_TTMP15, &MRI, true); R.isValid(); ++R) { if (*R == RegNo) - return isGFX9(); + return isGFX9() || isGFX10(); + } + + // GFX10 has 2 more SGPRs 104 and 105. 
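To restate the '.amdgpu_lds' operand checks above in isolation, a hedged sketch follows; the function name is illustrative, and LocalMemorySize stands in for IsaInfo::getLocalMemorySize(&getSTI()):

  #include "llvm/Support/MathExtras.h"
  #include <cstdint>

  // Mirror of the directive's validation: the size must fit in LDS, and
  // the alignment must be a power of two that still fits in 32 bits.
  static bool isValidAMDGPULDSOperands(int64_t Size, int64_t Align,
                                       unsigned LocalMemorySize) {
    if (Size < 0 || Size > LocalMemorySize)
      return false;                               // size out of range
    if (Align < 0 || !llvm::isPowerOf2_64(Align))
      return false;                               // not a power of two
    if (Align >= (int64_t(1) << 31))
      return false;                               // alignment too large
    return true;
  }

Under these rules, ".amdgpu_lds lds_sym, 512, 16" would be accepted, while ".amdgpu_lds lds_sym, 512, 24" would be rejected for its non-power-of-two alignment (the symbol name here is made up).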
+ for (MCRegAliasIterator R(AMDGPU::SGPR104_SGPR105, &MRI, true); + R.isValid(); ++R) { + if (*R == RegNo) + return hasSGPR104_SGPR105(); } switch (RegNo) { + case AMDGPU::SRC_SHARED_BASE: + case AMDGPU::SRC_SHARED_LIMIT: + case AMDGPU::SRC_PRIVATE_BASE: + case AMDGPU::SRC_PRIVATE_LIMIT: + case AMDGPU::SRC_POPS_EXITING_WAVE_ID: + return !isCI() && !isSI() && !isVI(); case AMDGPU::TBA: case AMDGPU::TBA_LO: case AMDGPU::TBA_HI: case AMDGPU::TMA: case AMDGPU::TMA_LO: case AMDGPU::TMA_HI: - return !isGFX9(); + return !isGFX9() && !isGFX10(); case AMDGPU::XNACK_MASK: case AMDGPU::XNACK_MASK_LO: case AMDGPU::XNACK_MASK_HI: - return !isCI() && !isSI() && hasXNACK(); + return !isCI() && !isSI() && !isGFX10() && hasXNACK(); + case AMDGPU::SGPR_NULL: + return isGFX10(); default: break; } @@ -3217,8 +4265,10 @@ bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI, if (isCI()) return true; - if (isSI()) { - // No flat_scr + if (isSI() || isGFX10()) { + // No flat_scr on SI. + // On GFX10 flat scratch is not a valid register operand and can only be + // accessed with s_setreg/s_getreg. switch (RegNo) { case AMDGPU::FLAT_SCR: case AMDGPU::FLAT_SCR_LO: @@ -3234,14 +4284,15 @@ bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI, for (MCRegAliasIterator R(AMDGPU::SGPR102_SGPR103, &MRI, true); R.isValid(); ++R) { if (*R == RegNo) - return false; + return hasSGPR102_SGPR103(); } return true; } OperandMatchResultTy -AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) { +AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic, + OperandMode Mode) { // Try to parse with a custom parser OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic); @@ -3255,28 +4306,36 @@ AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) { getLexer().is(AsmToken::EndOfStatement)) return ResTy; - ResTy = parseRegOrImm(Operands); + if (Mode == OperandMode_NSA && getLexer().is(AsmToken::LBrac)) { + unsigned Prefix = Operands.size(); + SMLoc LBraceLoc = getTok().getLoc(); + Parser.Lex(); // eat the '[' - if (ResTy == MatchOperand_Success) - return ResTy; + for (;;) { + ResTy = parseReg(Operands); + if (ResTy != MatchOperand_Success) + return ResTy; - const auto &Tok = Parser.getTok(); - SMLoc S = Tok.getLoc(); + if (getLexer().is(AsmToken::RBrac)) + break; - const MCExpr *Expr = nullptr; - if (!Parser.parseExpression(Expr)) { - Operands.push_back(AMDGPUOperand::CreateExpr(this, Expr, S)); - return MatchOperand_Success; - } + if (getLexer().isNot(AsmToken::Comma)) + return MatchOperand_ParseFail; + Parser.Lex(); + } - // Possibly this is an instruction flag like 'gds'. 
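For context on the bracketed operand lists handled above: on GFX10, MIMG instructions may supply their address VGPRs in NSA (non-sequential address) form. A sketch of the two spellings the parser now accepts; register numbers, dmask and dim values are made up for the example, with the dim:SQ_RSRC_IMG_* spelling following the parser's convention:

  // Illustrative only; both lines should parse to equivalent operand lists.
  static const char *const MIMGAddressForms[] = {
      // classic form: one contiguous VGPR range
      "image_sample v[16:19], v[8:10], s[4:11], s[12:15]"
      " dmask:0xf dim:SQ_RSRC_IMG_3D",
      // NSA form: an explicit list of individual VGPRs
      "image_sample v[16:19], [v8, v20, v31], s[4:11], s[12:15]"
      " dmask:0xf dim:SQ_RSRC_IMG_3D",
  };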
- if (Tok.getKind() == AsmToken::Identifier) { - Operands.push_back(AMDGPUOperand::CreateToken(this, Tok.getString(), S)); - Parser.Lex(); + if (Operands.size() - Prefix > 1) { + Operands.insert(Operands.begin() + Prefix, + AMDGPUOperand::CreateToken(this, "[", LBraceLoc)); + Operands.push_back(AMDGPUOperand::CreateToken(this, "]", + getTok().getLoc())); + } + + Parser.Lex(); // eat the ']' return MatchOperand_Success; } - return MatchOperand_NoMatch; + return parseRegOrImm(Operands); } StringRef AMDGPUAsmParser::parseMnemonicSuffix(StringRef Name) { @@ -3308,8 +4367,13 @@ bool AMDGPUAsmParser::ParseInstruction(ParseInstructionInfo &Info, Name = parseMnemonicSuffix(Name); Operands.push_back(AMDGPUOperand::CreateToken(this, Name, NameLoc)); + bool IsMIMG = Name.startswith("image_"); + while (!getLexer().is(AsmToken::EndOfStatement)) { - OperandMatchResultTy Res = parseOperand(Operands, Name); + OperandMode Mode = OperandMode_Default; + if (IsMIMG && isGFX10() && Operands.size() == 2) + Mode = OperandMode_NSA; + OperandMatchResultTy Res = parseOperand(Operands, Name, Mode); // Eat the comma or space if there is one. if (getLexer().is(AsmToken::Comma)) @@ -3318,12 +4382,14 @@ bool AMDGPUAsmParser::ParseInstruction(ParseInstructionInfo &Info, switch (Res) { case MatchOperand_Success: break; case MatchOperand_ParseFail: + // FIXME: use real operand location rather than the current location. Error(getLexer().getLoc(), "failed parsing operand."); while (!getLexer().is(AsmToken::EndOfStatement)) { Parser.Lex(); } return true; case MatchOperand_NoMatch: + // FIXME: use real operand location rather than the current location. Error(getLexer().getLoc(), "not a valid operand."); while (!getLexer().is(AsmToken::EndOfStatement)) { Parser.Lex(); @@ -3340,46 +4406,19 @@ bool AMDGPUAsmParser::ParseInstruction(ParseInstructionInfo &Info, //===----------------------------------------------------------------------===// OperandMatchResultTy -AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, int64_t &Int) { - switch(getLexer().getKind()) { - default: return MatchOperand_NoMatch; - case AsmToken::Identifier: { - StringRef Name = Parser.getTok().getString(); - if (!Name.equals(Prefix)) { - return MatchOperand_NoMatch; - } - - Parser.Lex(); - if (getLexer().isNot(AsmToken::Colon)) - return MatchOperand_ParseFail; - - Parser.Lex(); - - bool IsMinus = false; - if (getLexer().getKind() == AsmToken::Minus) { - Parser.Lex(); - IsMinus = true; - } +AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, int64_t &IntVal) { - if (getLexer().isNot(AsmToken::Integer)) - return MatchOperand_ParseFail; - - if (getParser().parseAbsoluteExpression(Int)) - return MatchOperand_ParseFail; + if (!trySkipId(Prefix, AsmToken::Colon)) + return MatchOperand_NoMatch; - if (IsMinus) - Int = -Int; - break; - } - } - return MatchOperand_Success; + return parseExpr(IntVal) ? 
MatchOperand_Success : MatchOperand_ParseFail; } OperandMatchResultTy AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, OperandVector &Operands, AMDGPUOperand::ImmTy ImmTy, bool (*ConvertResult)(int64_t&)) { - SMLoc S = Parser.getTok().getLoc(); + SMLoc S = getLoc(); int64_t Value = 0; OperandMatchResultTy Res = parseIntWithPrefix(Prefix, Value); @@ -3387,59 +4426,55 @@ AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, OperandVector &Operands, return Res; if (ConvertResult && !ConvertResult(Value)) { - return MatchOperand_ParseFail; + Error(S, "invalid " + StringRef(Prefix) + " value."); } Operands.push_back(AMDGPUOperand::CreateImm(this, Value, S, ImmTy)); return MatchOperand_Success; } -OperandMatchResultTy AMDGPUAsmParser::parseOperandArrayWithPrefix( - const char *Prefix, - OperandVector &Operands, - AMDGPUOperand::ImmTy ImmTy, - bool (*ConvertResult)(int64_t&)) { - StringRef Name = Parser.getTok().getString(); - if (!Name.equals(Prefix)) +OperandMatchResultTy +AMDGPUAsmParser::parseOperandArrayWithPrefix(const char *Prefix, + OperandVector &Operands, + AMDGPUOperand::ImmTy ImmTy, + bool (*ConvertResult)(int64_t&)) { + SMLoc S = getLoc(); + if (!trySkipId(Prefix, AsmToken::Colon)) return MatchOperand_NoMatch; - Parser.Lex(); - if (getLexer().isNot(AsmToken::Colon)) - return MatchOperand_ParseFail; - - Parser.Lex(); - if (getLexer().isNot(AsmToken::LBrac)) + if (!skipToken(AsmToken::LBrac, "expected a left square bracket")) return MatchOperand_ParseFail; - Parser.Lex(); unsigned Val = 0; - SMLoc S = Parser.getTok().getLoc(); + const unsigned MaxSize = 4; // FIXME: How to verify the number of elements matches the number of src // operands? - for (int I = 0; I < 4; ++I) { - if (I != 0) { - if (getLexer().is(AsmToken::RBrac)) - break; + for (int I = 0; ; ++I) { + int64_t Op; + SMLoc Loc = getLoc(); + if (!parseExpr(Op)) + return MatchOperand_ParseFail; - if (getLexer().isNot(AsmToken::Comma)) - return MatchOperand_ParseFail; - Parser.Lex(); + if (Op != 0 && Op != 1) { + Error(Loc, "invalid " + StringRef(Prefix) + " value."); + return MatchOperand_ParseFail; } - if (getLexer().isNot(AsmToken::Integer)) - return MatchOperand_ParseFail; + Val |= (Op << I); - int64_t Op; - if (getParser().parseAbsoluteExpression(Op)) + if (trySkipToken(AsmToken::RBrac)) + break; + + if (I + 1 == MaxSize) { + Error(getLoc(), "expected a closing square bracket"); return MatchOperand_ParseFail; + } - if (Op != 0 && Op != 1) + if (!skipToken(AsmToken::Comma, "expected a comma")) return MatchOperand_ParseFail; - Val |= (Op << I); } - Parser.Lex(); Operands.push_back(AMDGPUOperand::CreateImm(this, Val, S, ImmTy)); return MatchOperand_Success; } @@ -3459,7 +4494,7 @@ AMDGPUAsmParser::parseNamedBit(const char *Name, OperandVector &Operands, if (Tok == Name) { if (Tok == "r128" && isGFX9()) Error(S, "r128 modifier is not supported on this GPU"); - if (Tok == "a16" && !isGFX9()) + if (Tok == "a16" && !isGFX9() && !isGFX10()) Error(S, "a16 modifier is not supported on this GPU"); Bit = 1; Parser.Lex(); @@ -3476,6 +4511,9 @@ AMDGPUAsmParser::parseNamedBit(const char *Name, OperandVector &Operands, } } + if (!isGFX10() && ImmTy == AMDGPUOperand::ImmTyDLC) + return MatchOperand_ParseFail; + Operands.push_back(AMDGPUOperand::CreateImm(this, Bit, S, ImmTy)); return MatchOperand_Success; } @@ -3616,7 +4654,8 @@ void AMDGPUAsmParser::cvtDSImpl(MCInst &Inst, const OperandVector &Operands, } AMDGPUOperand::ImmTy OffsetType = - (Inst.getOpcode() == AMDGPU::DS_SWIZZLE_B32_si || + (Inst.getOpcode() == 
AMDGPU::DS_SWIZZLE_B32_gfx10 || + Inst.getOpcode() == AMDGPU::DS_SWIZZLE_B32_gfx6_gfx7 || Inst.getOpcode() == AMDGPU::DS_SWIZZLE_B32_vi) ? AMDGPUOperand::ImmTySwizzle : AMDGPUOperand::ImmTyOffset; @@ -3716,20 +4755,18 @@ encodeCnt( } bool AMDGPUAsmParser::parseCnt(int64_t &IntVal) { - StringRef CntName = Parser.getTok().getString(); - int64_t CntVal; - Parser.Lex(); - if (getLexer().isNot(AsmToken::LParen)) - return true; + SMLoc CntLoc = getLoc(); + StringRef CntName = getTokenStr(); - Parser.Lex(); - if (getLexer().isNot(AsmToken::Integer)) - return true; + if (!skipToken(AsmToken::Identifier, "expected a counter name") || + !skipToken(AsmToken::LParen, "expected a left parenthesis")) + return false; - SMLoc ValLoc = Parser.getTok().getLoc(); - if (getParser().parseAbsoluteExpression(CntVal)) - return true; + int64_t CntVal; + SMLoc ValLoc = getLoc(); + if (!parseExpr(CntVal)) + return false; AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(getSTI().getCPU()); @@ -3742,265 +4779,240 @@ bool AMDGPUAsmParser::parseCnt(int64_t &IntVal) { Failed = encodeCnt(ISA, IntVal, CntVal, Sat, encodeExpcnt, decodeExpcnt); } else if (CntName == "lgkmcnt" || CntName == "lgkmcnt_sat") { Failed = encodeCnt(ISA, IntVal, CntVal, Sat, encodeLgkmcnt, decodeLgkmcnt); + } else { + Error(CntLoc, "invalid counter name " + CntName); + return false; } if (Failed) { Error(ValLoc, "too large value for " + CntName); - return true; + return false; } - if (getLexer().isNot(AsmToken::RParen)) { - return true; - } + if (!skipToken(AsmToken::RParen, "expected a closing parenthesis")) + return false; - Parser.Lex(); - if (getLexer().is(AsmToken::Amp) || getLexer().is(AsmToken::Comma)) { - const AsmToken NextToken = getLexer().peekTok(); - if (NextToken.is(AsmToken::Identifier)) { - Parser.Lex(); + if (trySkipToken(AsmToken::Amp) || trySkipToken(AsmToken::Comma)) { + if (isToken(AsmToken::EndOfStatement)) { + Error(getLoc(), "expected a counter name"); + return false; } } - return false; + return true; } OperandMatchResultTy AMDGPUAsmParser::parseSWaitCntOps(OperandVector &Operands) { AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(getSTI().getCPU()); int64_t Waitcnt = getWaitcntBitMask(ISA); - SMLoc S = Parser.getTok().getLoc(); - - switch(getLexer().getKind()) { - default: return MatchOperand_ParseFail; - case AsmToken::Integer: - // The operand can be an integer value. - if (getParser().parseAbsoluteExpression(Waitcnt)) - return MatchOperand_ParseFail; - break; + SMLoc S = getLoc(); - case AsmToken::Identifier: - do { - if (parseCnt(Waitcnt)) - return MatchOperand_ParseFail; - } while(getLexer().isNot(AsmToken::EndOfStatement)); - break; + // If parse failed, do not return error code + // to avoid excessive error messages. 
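As background for parseCnt above: the named counters are packed into a single 16-bit immediate. A sketch assuming the SI/CI/VI field layout (vmcnt in bits [3:0], expcnt in [6:4], lgkmcnt in [11:8]); the real, per-ISA encoding is encodeWaitcnt() in AMDGPUBaseInfo, and GFX9/GFX10 widen some fields:

  #include <cstdint>

  // Illustrative only: pack s_waitcnt counters for the SI/CI/VI layout.
  static uint16_t encodeWaitcntPreGFX10(unsigned VmCnt, unsigned ExpCnt,
                                        unsigned LgkmCnt) {
    return uint16_t((VmCnt & 0xf) | ((ExpCnt & 0x7) << 4) |
                    ((LgkmCnt & 0xf) << 8));
  }

For example, "s_waitcnt vmcnt(0) lgkmcnt(0)" with expcnt left saturated encodes as encodeWaitcntPreGFX10(0, 7, 0) == 0x0070.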
+ if (isToken(AsmToken::Identifier) && peekToken().is(AsmToken::LParen)) { + while (parseCnt(Waitcnt) && !isToken(AsmToken::EndOfStatement)); + } else { + parseExpr(Waitcnt); } + Operands.push_back(AMDGPUOperand::CreateImm(this, Waitcnt, S)); return MatchOperand_Success; } -bool AMDGPUAsmParser::parseHwregConstruct(OperandInfoTy &HwReg, int64_t &Offset, - int64_t &Width) { - using namespace llvm::AMDGPU::Hwreg; +bool +AMDGPUOperand::isSWaitCnt() const { + return isImm(); +} - if (Parser.getTok().getString() != "hwreg") - return true; - Parser.Lex(); +//===----------------------------------------------------------------------===// +// hwreg +//===----------------------------------------------------------------------===// - if (getLexer().isNot(AsmToken::LParen)) - return true; - Parser.Lex(); +bool +AMDGPUAsmParser::parseHwregBody(OperandInfoTy &HwReg, + int64_t &Offset, + int64_t &Width) { + using namespace llvm::AMDGPU::Hwreg; - if (getLexer().is(AsmToken::Identifier)) { + // The register may be specified by name or using a numeric code + if (isToken(AsmToken::Identifier) && + (HwReg.Id = getHwregId(getTokenStr())) >= 0) { HwReg.IsSymbolic = true; - HwReg.Id = ID_UNKNOWN_; - const StringRef tok = Parser.getTok().getString(); - int Last = ID_SYMBOLIC_LAST_; - if (isSI() || isCI() || isVI()) - Last = ID_SYMBOLIC_FIRST_GFX9_; - for (int i = ID_SYMBOLIC_FIRST_; i < Last; ++i) { - if (tok == IdSymbolic[i]) { - HwReg.Id = i; - break; - } - } - Parser.Lex(); - } else { - HwReg.IsSymbolic = false; - if (getLexer().isNot(AsmToken::Integer)) - return true; - if (getParser().parseAbsoluteExpression(HwReg.Id)) - return true; - } - - if (getLexer().is(AsmToken::RParen)) { - Parser.Lex(); + lex(); // skip message name + } else if (!parseExpr(HwReg.Id)) { return false; } - // optional params - if (getLexer().isNot(AsmToken::Comma)) - return true; - Parser.Lex(); - - if (getLexer().isNot(AsmToken::Integer)) - return true; - if (getParser().parseAbsoluteExpression(Offset)) + if (trySkipToken(AsmToken::RParen)) return true; - if (getLexer().isNot(AsmToken::Comma)) - return true; - Parser.Lex(); + // parse optional params + return + skipToken(AsmToken::Comma, "expected a comma or a closing parenthesis") && + parseExpr(Offset) && + skipToken(AsmToken::Comma, "expected a comma") && + parseExpr(Width) && + skipToken(AsmToken::RParen, "expected a closing parenthesis"); +} - if (getLexer().isNot(AsmToken::Integer)) - return true; - if (getParser().parseAbsoluteExpression(Width)) - return true; +bool +AMDGPUAsmParser::validateHwreg(const OperandInfoTy &HwReg, + const int64_t Offset, + const int64_t Width, + const SMLoc Loc) { - if (getLexer().isNot(AsmToken::RParen)) - return true; - Parser.Lex(); + using namespace llvm::AMDGPU::Hwreg; - return false; + if (HwReg.IsSymbolic && !isValidHwreg(HwReg.Id, getSTI())) { + Error(Loc, "specified hardware register is not supported on this GPU"); + return false; + } else if (!isValidHwreg(HwReg.Id)) { + Error(Loc, "invalid code of hardware register: only 6-bit values are legal"); + return false; + } else if (!isValidHwregOffset(Offset)) { + Error(Loc, "invalid bit offset: only 5-bit values are legal"); + return false; + } else if (!isValidHwregWidth(Width)) { + Error(Loc, "invalid bitfield width: only values from 1 to 32 are legal"); + return false; + } + return true; } -OperandMatchResultTy AMDGPUAsmParser::parseHwreg(OperandVector &Operands) { +OperandMatchResultTy +AMDGPUAsmParser::parseHwreg(OperandVector &Operands) { using namespace llvm::AMDGPU::Hwreg; - int64_t 
Imm16Val = 0; - SMLoc S = Parser.getTok().getLoc(); - - switch(getLexer().getKind()) { - default: return MatchOperand_NoMatch; - case AsmToken::Integer: - // The operand can be an integer value. - if (getParser().parseAbsoluteExpression(Imm16Val)) - return MatchOperand_NoMatch; - if (Imm16Val < 0 || !isUInt<16>(Imm16Val)) { - Error(S, "invalid immediate: only 16-bit values are legal"); - // Do not return error code, but create an imm operand anyway and proceed - // to the next operand, if any. That avoids unneccessary error messages. - } - break; - - case AsmToken::Identifier: { - OperandInfoTy HwReg(ID_UNKNOWN_); - int64_t Offset = OFFSET_DEFAULT_; - int64_t Width = WIDTH_M1_DEFAULT_ + 1; - if (parseHwregConstruct(HwReg, Offset, Width)) - return MatchOperand_ParseFail; - if (HwReg.Id < 0 || !isUInt<ID_WIDTH_>(HwReg.Id)) { - if (HwReg.IsSymbolic) - Error(S, "invalid symbolic name of hardware register"); - else - Error(S, "invalid code of hardware register: only 6-bit values are legal"); - } - if (Offset < 0 || !isUInt<OFFSET_WIDTH_>(Offset)) - Error(S, "invalid bit offset: only 5-bit values are legal"); - if ((Width-1) < 0 || !isUInt<WIDTH_M1_WIDTH_>(Width-1)) - Error(S, "invalid bitfield width: only values from 1 to 32 are legal"); - Imm16Val = (HwReg.Id << ID_SHIFT_) | (Offset << OFFSET_SHIFT_) | ((Width-1) << WIDTH_M1_SHIFT_); - } - break; + int64_t ImmVal = 0; + SMLoc Loc = getLoc(); + + // If parse failed, do not return error code + // to avoid excessive error messages. + if (trySkipId("hwreg", AsmToken::LParen)) { + OperandInfoTy HwReg(ID_UNKNOWN_); + int64_t Offset = OFFSET_DEFAULT_; + int64_t Width = WIDTH_DEFAULT_; + if (parseHwregBody(HwReg, Offset, Width) && + validateHwreg(HwReg, Offset, Width, Loc)) { + ImmVal = encodeHwreg(HwReg.Id, Offset, Width); + } + } else if (parseExpr(ImmVal)) { + if (ImmVal < 0 || !isUInt<16>(ImmVal)) + Error(Loc, "invalid immediate: only 16-bit values are legal"); } - Operands.push_back(AMDGPUOperand::CreateImm(this, Imm16Val, S, AMDGPUOperand::ImmTyHwreg)); - return MatchOperand_Success; -} -bool AMDGPUOperand::isSWaitCnt() const { - return isImm(); + Operands.push_back(AMDGPUOperand::CreateImm(this, ImmVal, Loc, AMDGPUOperand::ImmTyHwreg)); + return MatchOperand_Success; } bool AMDGPUOperand::isHwreg() const { return isImmTy(ImmTyHwreg); } -bool AMDGPUAsmParser::parseSendMsgConstruct(OperandInfoTy &Msg, OperandInfoTy &Operation, int64_t &StreamId) { +//===----------------------------------------------------------------------===// +// sendmsg +//===----------------------------------------------------------------------===// + +bool +AMDGPUAsmParser::parseSendMsgBody(OperandInfoTy &Msg, + OperandInfoTy &Op, + OperandInfoTy &Stream) { using namespace llvm::AMDGPU::SendMsg; - if (Parser.getTok().getString() != "sendmsg") - return true; - Parser.Lex(); + if (isToken(AsmToken::Identifier) && (Msg.Id = getMsgId(getTokenStr())) >= 0) { + Msg.IsSymbolic = true; + lex(); // skip message name + } else if (!parseExpr(Msg.Id)) { + return false; + } - if (getLexer().isNot(AsmToken::LParen)) - return true; - Parser.Lex(); + if (trySkipToken(AsmToken::Comma)) { + Op.IsDefined = true; + if (isToken(AsmToken::Identifier) && + (Op.Id = getMsgOpId(Msg.Id, getTokenStr())) >= 0) { + lex(); // skip operation name + } else if (!parseExpr(Op.Id)) { + return false; + } - if (getLexer().is(AsmToken::Identifier)) { - Msg.IsSymbolic = true; - Msg.Id = ID_UNKNOWN_; - const std::string tok = Parser.getTok().getString(); - for (int i = ID_GAPS_FIRST_; i < ID_GAPS_LAST_; ++i) { - 
switch(i) { - default: continue; // Omit gaps. - case ID_INTERRUPT: case ID_GS: case ID_GS_DONE: case ID_SYSMSG: break; - } - if (tok == IdSymbolic[i]) { - Msg.Id = i; - break; - } + if (trySkipToken(AsmToken::Comma)) { + Stream.IsDefined = true; + if (!parseExpr(Stream.Id)) + return false; } - Parser.Lex(); - } else { - Msg.IsSymbolic = false; - if (getLexer().isNot(AsmToken::Integer)) - return true; - if (getParser().parseAbsoluteExpression(Msg.Id)) - return true; - if (getLexer().is(AsmToken::Integer)) - if (getParser().parseAbsoluteExpression(Msg.Id)) - Msg.Id = ID_UNKNOWN_; } - if (Msg.Id == ID_UNKNOWN_) // Don't know how to parse the rest. - return false; - if (!(Msg.Id == ID_GS || Msg.Id == ID_GS_DONE || Msg.Id == ID_SYSMSG)) { - if (getLexer().isNot(AsmToken::RParen)) - return true; - Parser.Lex(); + return skipToken(AsmToken::RParen, "expected a closing parenthesis"); +} + +bool +AMDGPUAsmParser::validateSendMsg(const OperandInfoTy &Msg, + const OperandInfoTy &Op, + const OperandInfoTy &Stream, + const SMLoc S) { + using namespace llvm::AMDGPU::SendMsg; + + // Validation strictness depends on whether message is specified + // in a symbolic or in a numeric form. In the latter case + // only encoding possibility is checked. + bool Strict = Msg.IsSymbolic; + + if (!isValidMsgId(Msg.Id, getSTI(), Strict)) { + Error(S, "invalid message id"); + return false; + } else if (Strict && (msgRequiresOp(Msg.Id) != Op.IsDefined)) { + Error(S, Op.IsDefined ? + "message does not support operations" : + "missing message operation"); + return false; + } else if (!isValidMsgOp(Msg.Id, Op.Id, Strict)) { + Error(S, "invalid operation id"); + return false; + } else if (Strict && !msgSupportsStream(Msg.Id, Op.Id) && Stream.IsDefined) { + Error(S, "message operation does not support streams"); + return false; + } else if (!isValidMsgStream(Msg.Id, Op.Id, Stream.Id, Strict)) { + Error(S, "invalid message stream id"); + return false; } + return true; +} - if (getLexer().isNot(AsmToken::Comma)) - return true; - Parser.Lex(); +OperandMatchResultTy +AMDGPUAsmParser::parseSendMsgOp(OperandVector &Operands) { + using namespace llvm::AMDGPU::SendMsg; - assert(Msg.Id == ID_GS || Msg.Id == ID_GS_DONE || Msg.Id == ID_SYSMSG); - Operation.Id = ID_UNKNOWN_; - if (getLexer().is(AsmToken::Identifier)) { - Operation.IsSymbolic = true; - const char* const *S = (Msg.Id == ID_SYSMSG) ? OpSysSymbolic : OpGsSymbolic; - const int F = (Msg.Id == ID_SYSMSG) ? OP_SYS_FIRST_ : OP_GS_FIRST_; - const int L = (Msg.Id == ID_SYSMSG) ? OP_SYS_LAST_ : OP_GS_LAST_; - const StringRef Tok = Parser.getTok().getString(); - for (int i = F; i < L; ++i) { - if (Tok == S[i]) { - Operation.Id = i; - break; - } + int64_t ImmVal = 0; + SMLoc Loc = getLoc(); + + // If parse failed, do not return error code + // to avoid excessive error messages. 
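// A standalone C++ sketch of the immediate that encodeMsg builds below,
// assuming the illustrative sendmsg field layout id [3:0], op [6:4],
// stream [9:8]; the authoritative shifts and masks live in
// llvm::AMDGPU::SendMsg.
#include <cstdint>

static uint64_t encodeMsgSketch(uint64_t MsgId, uint64_t OpId,
                                uint64_t StreamId) {
  return (MsgId & 0xF) | ((OpId & 0x7) << 4) | ((StreamId & 0x3) << 8);
}

// For example, sendmsg(MSG_GS, GS_OP_EMIT, 1) encodes each component and
// ORs the three fields into a single 16-bit immediate.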
+ if (trySkipId("sendmsg", AsmToken::LParen)) { + OperandInfoTy Msg(ID_UNKNOWN_); + OperandInfoTy Op(OP_NONE_); + OperandInfoTy Stream(STREAM_ID_NONE_); + if (parseSendMsgBody(Msg, Op, Stream) && + validateSendMsg(Msg, Op, Stream, Loc)) { + ImmVal = encodeMsg(Msg.Id, Op.Id, Stream.Id); } - Parser.Lex(); - } else { - Operation.IsSymbolic = false; - if (getLexer().isNot(AsmToken::Integer)) - return true; - if (getParser().parseAbsoluteExpression(Operation.Id)) - return true; + } else if (parseExpr(ImmVal)) { + if (ImmVal < 0 || !isUInt<16>(ImmVal)) + Error(Loc, "invalid immediate: only 16-bit values are legal"); } - if ((Msg.Id == ID_GS || Msg.Id == ID_GS_DONE) && Operation.Id != OP_GS_NOP) { - // Stream id is optional. - if (getLexer().is(AsmToken::RParen)) { - Parser.Lex(); - return false; - } - - if (getLexer().isNot(AsmToken::Comma)) - return true; - Parser.Lex(); - - if (getLexer().isNot(AsmToken::Integer)) - return true; - if (getParser().parseAbsoluteExpression(StreamId)) - return true; - } + Operands.push_back(AMDGPUOperand::CreateImm(this, ImmVal, Loc, AMDGPUOperand::ImmTySendMsg)); + return MatchOperand_Success; +} - if (getLexer().isNot(AsmToken::RParen)) - return true; - Parser.Lex(); - return false; +bool AMDGPUOperand::isSendMsg() const { + return isImmTy(ImmTySendMsg); } +//===----------------------------------------------------------------------===// +// v_interp +//===----------------------------------------------------------------------===// + OperandMatchResultTy AMDGPUAsmParser::parseInterpSlot(OperandVector &Operands) { if (getLexer().getKind() != AsmToken::Identifier) return MatchOperand_NoMatch; @@ -4062,6 +5074,10 @@ OperandMatchResultTy AMDGPUAsmParser::parseInterpAttr(OperandVector &Operands) { return MatchOperand_Success; } +//===----------------------------------------------------------------------===// +// exp +//===----------------------------------------------------------------------===// + void AMDGPUAsmParser::errorExpTgt() { Error(Parser.getTok().getLoc(), "invalid exp target"); } @@ -4094,13 +5110,18 @@ OperandMatchResultTy AMDGPUAsmParser::parseExpTgtImpl(StringRef Str, if (Str.getAsInteger(10, Val)) return MatchOperand_ParseFail; - if (Val > 3) + if (Val > 4 || (Val == 4 && !isGFX10())) errorExpTgt(); Val += 12; return MatchOperand_Success; } + if (isGFX10() && Str == "prim") { + Val = 20; + return MatchOperand_Success; + } + if (Str.startswith("param")) { Str = Str.drop_front(5); if (Str.getAsInteger(10, Val)) @@ -4141,98 +5162,39 @@ OperandMatchResultTy AMDGPUAsmParser::parseExpTgt(OperandVector &Operands) { return MatchOperand_Success; } -OperandMatchResultTy -AMDGPUAsmParser::parseSendMsgOp(OperandVector &Operands) { - using namespace llvm::AMDGPU::SendMsg; - - int64_t Imm16Val = 0; - SMLoc S = Parser.getTok().getLoc(); +//===----------------------------------------------------------------------===// +// parser helpers +//===----------------------------------------------------------------------===// - switch(getLexer().getKind()) { - default: - return MatchOperand_NoMatch; - case AsmToken::Integer: - // The operand can be an integer value. - if (getParser().parseAbsoluteExpression(Imm16Val)) - return MatchOperand_NoMatch; - if (Imm16Val < 0 || !isUInt<16>(Imm16Val)) { - Error(S, "invalid immediate: only 16-bit values are legal"); - // Do not return error code, but create an imm operand anyway and proceed - // to the next operand, if any. That avoids unneccessary error messages. 
- } - break; - case AsmToken::Identifier: { - OperandInfoTy Msg(ID_UNKNOWN_); - OperandInfoTy Operation(OP_UNKNOWN_); - int64_t StreamId = STREAM_ID_DEFAULT_; - if (parseSendMsgConstruct(Msg, Operation, StreamId)) - return MatchOperand_ParseFail; - do { - // Validate and encode message ID. - if (! ((ID_INTERRUPT <= Msg.Id && Msg.Id <= ID_GS_DONE) - || Msg.Id == ID_SYSMSG)) { - if (Msg.IsSymbolic) - Error(S, "invalid/unsupported symbolic name of message"); - else - Error(S, "invalid/unsupported code of message"); - break; - } - Imm16Val = (Msg.Id << ID_SHIFT_); - // Validate and encode operation ID. - if (Msg.Id == ID_GS || Msg.Id == ID_GS_DONE) { - if (! (OP_GS_FIRST_ <= Operation.Id && Operation.Id < OP_GS_LAST_)) { - if (Operation.IsSymbolic) - Error(S, "invalid symbolic name of GS_OP"); - else - Error(S, "invalid code of GS_OP: only 2-bit values are legal"); - break; - } - if (Operation.Id == OP_GS_NOP - && Msg.Id != ID_GS_DONE) { - Error(S, "invalid GS_OP: NOP is for GS_DONE only"); - break; - } - Imm16Val |= (Operation.Id << OP_SHIFT_); - } - if (Msg.Id == ID_SYSMSG) { - if (! (OP_SYS_FIRST_ <= Operation.Id && Operation.Id < OP_SYS_LAST_)) { - if (Operation.IsSymbolic) - Error(S, "invalid/unsupported symbolic name of SYSMSG_OP"); - else - Error(S, "invalid/unsupported code of SYSMSG_OP"); - break; - } - Imm16Val |= (Operation.Id << OP_SHIFT_); - } - // Validate and encode stream ID. - if ((Msg.Id == ID_GS || Msg.Id == ID_GS_DONE) && Operation.Id != OP_GS_NOP) { - if (! (STREAM_ID_FIRST_ <= StreamId && StreamId < STREAM_ID_LAST_)) { - Error(S, "invalid stream id: only 2-bit values are legal"); - break; - } - Imm16Val |= (StreamId << STREAM_ID_SHIFT_); - } - } while (false); - } - break; - } - Operands.push_back(AMDGPUOperand::CreateImm(this, Imm16Val, S, AMDGPUOperand::ImmTySendMsg)); - return MatchOperand_Success; +bool +AMDGPUAsmParser::isId(const AsmToken &Token, const StringRef Id) const { + return Token.is(AsmToken::Identifier) && Token.getString() == Id; } -bool AMDGPUOperand::isSendMsg() const { - return isImmTy(ImmTySendMsg); +bool +AMDGPUAsmParser::isId(const StringRef Id) const { + return isId(getToken(), Id); } -//===----------------------------------------------------------------------===// -// parser helpers -//===----------------------------------------------------------------------===// +bool +AMDGPUAsmParser::isToken(const AsmToken::TokenKind Kind) const { + return getTokenKind() == Kind; +} bool AMDGPUAsmParser::trySkipId(const StringRef Id) { - if (getLexer().getKind() == AsmToken::Identifier && - Parser.getTok().getString() == Id) { - Parser.Lex(); + if (isId(Id)) { + lex(); + return true; + } + return false; +} + +bool +AMDGPUAsmParser::trySkipId(const StringRef Id, const AsmToken::TokenKind Kind) { + if (isId(Id) && peekToken().is(Kind)) { + lex(); + lex(); return true; } return false; @@ -4240,8 +5202,8 @@ AMDGPUAsmParser::trySkipId(const StringRef Id) { bool AMDGPUAsmParser::trySkipToken(const AsmToken::TokenKind Kind) { - if (getLexer().getKind() == Kind) { - Parser.Lex(); + if (isToken(Kind)) { + lex(); return true; } return false; @@ -4251,7 +5213,7 @@ bool AMDGPUAsmParser::skipToken(const AsmToken::TokenKind Kind, const StringRef ErrMsg) { if (!trySkipToken(Kind)) { - Error(Parser.getTok().getLoc(), ErrMsg); + Error(getLoc(), ErrMsg); return false; } return true; @@ -4264,17 +5226,54 @@ AMDGPUAsmParser::parseExpr(int64_t &Imm) { bool AMDGPUAsmParser::parseString(StringRef &Val, const StringRef ErrMsg) { - SMLoc S = Parser.getTok().getLoc(); - if 
(getLexer().getKind() == AsmToken::String) { - Val = Parser.getTok().getStringContents(); - Parser.Lex(); + if (isToken(AsmToken::String)) { + Val = getToken().getStringContents(); + lex(); return true; } else { - Error(S, ErrMsg); + Error(getLoc(), ErrMsg); return false; } } +AsmToken +AMDGPUAsmParser::getToken() const { + return Parser.getTok(); +} + +AsmToken +AMDGPUAsmParser::peekToken() { + return getLexer().peekTok(); +} + +void +AMDGPUAsmParser::peekTokens(MutableArrayRef<AsmToken> Tokens) { + auto TokCount = getLexer().peekTokens(Tokens); + + for (auto Idx = TokCount; Idx < Tokens.size(); ++Idx) + Tokens[Idx] = AsmToken(AsmToken::Error, ""); +} + +AsmToken::TokenKind +AMDGPUAsmParser::getTokenKind() const { + return getLexer().getKind(); +} + +SMLoc +AMDGPUAsmParser::getLoc() const { + return getToken().getLoc(); +} + +StringRef +AMDGPUAsmParser::getTokenStr() const { + return getToken().getString(); +} + +void +AMDGPUAsmParser::lex() { + Parser.Lex(); +} + //===----------------------------------------------------------------------===// // swizzle //===----------------------------------------------------------------------===// @@ -4322,8 +5321,8 @@ AMDGPUAsmParser::parseSwizzleQuadPerm(int64_t &Imm) { if (parseSwizzleOperands(LANE_NUM, Lane, 0, LANE_MAX, "expected a 2-bit lane id")) { Imm = QUAD_PERM_ENC; - for (auto i = 0; i < LANE_NUM; ++i) { - Imm |= Lane[i] << (LANE_SHIFT * i); + for (unsigned I = 0; I < LANE_NUM; ++I) { + Imm |= Lane[I] << (LANE_SHIFT * I); } return true; } @@ -4519,6 +5518,88 @@ AMDGPUOperand::isSwizzle() const { } //===----------------------------------------------------------------------===// +// VGPR Index Mode +//===----------------------------------------------------------------------===// + +int64_t AMDGPUAsmParser::parseGPRIdxMacro() { + + using namespace llvm::AMDGPU::VGPRIndexMode; + + if (trySkipToken(AsmToken::RParen)) { + return OFF; + } + + int64_t Imm = 0; + + while (true) { + unsigned Mode = 0; + SMLoc S = Parser.getTok().getLoc(); + + for (unsigned ModeId = ID_MIN; ModeId <= ID_MAX; ++ModeId) { + if (trySkipId(IdSymbolic[ModeId])) { + Mode = 1 << ModeId; + break; + } + } + + if (Mode == 0) { + Error(S, (Imm == 0)? + "expected a VGPR index mode or a closing parenthesis" : + "expected a VGPR index mode"); + break; + } + + if (Imm & Mode) { + Error(S, "duplicate VGPR index mode"); + break; + } + Imm |= Mode; + + if (trySkipToken(AsmToken::RParen)) + break; + if (!skipToken(AsmToken::Comma, + "expected a comma or a closing parenthesis")) + break; + } + + return Imm; +} + +OperandMatchResultTy +AMDGPUAsmParser::parseGPRIdxMode(OperandVector &Operands) { + + int64_t Imm = 0; + SMLoc S = Parser.getTok().getLoc(); + + if (getLexer().getKind() == AsmToken::Identifier && + Parser.getTok().getString() == "gpr_idx" && + getLexer().peekTok().is(AsmToken::LParen)) { + + Parser.Lex(); + Parser.Lex(); + + // If parse failed, trigger an error but do not return error code + // to avoid excessive error messages. 
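// A small C++ sketch of the bitmask building that parseGPRIdxMacro performs
// just below: each parsed mode name contributes one bit, and repeating a
// name is rejected. Mode ids 0..3 are stand-ins for the VGPRIndexMode
// enumerators (SRC0/SRC1/SRC2/DST).
static bool addGprIdxMode(unsigned &Mask, unsigned ModeId) {
  const unsigned Bit = 1u << ModeId;
  if (Mask & Bit)
    return false; // duplicate mode, e.g. gpr_idx(SRC0,SRC0)
  Mask |= Bit;
  return true;
}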
+ Imm = parseGPRIdxMacro(); + + } else { + if (getParser().parseAbsoluteExpression(Imm)) + return MatchOperand_NoMatch; + if (Imm < 0 || !isUInt<4>(Imm)) { + Error(S, "invalid immediate: only 4-bit values are legal"); + } + } + + Operands.push_back( + AMDGPUOperand::CreateImm(this, Imm, S, AMDGPUOperand::ImmTyGprIdxMode)); + return MatchOperand_Success; +} + +bool AMDGPUOperand::isGPRIdxMode() const { + return isImmTy(ImmTyGprIdxMode); +} + +//===----------------------------------------------------------------------===// // sopp branch targets //===----------------------------------------------------------------------===// @@ -4546,9 +5627,22 @@ AMDGPUAsmParser::parseSOppBrTarget(OperandVector &Operands) { } //===----------------------------------------------------------------------===// +// Boolean holding registers +//===----------------------------------------------------------------------===// + +OperandMatchResultTy +AMDGPUAsmParser::parseBoolReg(OperandVector &Operands) { + return parseReg(Operands); +} + +//===----------------------------------------------------------------------===// // mubuf //===----------------------------------------------------------------------===// +AMDGPUOperand::Ptr AMDGPUAsmParser::defaultDLC() const { + return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyDLC); +} + AMDGPUOperand::Ptr AMDGPUAsmParser::defaultGLC() const { return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyGLC); } @@ -4566,13 +5660,19 @@ void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst, bool HasLdsModifier = false; OptionalImmIndexMap OptionalIdx; assert(IsAtomicReturn ? IsAtomic : true); + unsigned FirstOperandIdx = 1; - for (unsigned i = 1, e = Operands.size(); i != e; ++i) { + for (unsigned i = FirstOperandIdx, e = Operands.size(); i != e; ++i) { AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); // Add the register arguments if (Op.isReg()) { Op.addRegOperands(Inst, 1); + // Insert a tied src for atomic return dst. + // This cannot be postponed as subsequent calls to + // addImmOperands rely on correct number of MC operands. + if (IsAtomicReturn && i == FirstOperandIdx) + Op.addRegOperands(Inst, 1); continue; } @@ -4582,7 +5682,7 @@ void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst, continue; } - HasLdsModifier = Op.isLDS(); + HasLdsModifier |= Op.isLDS(); // Handle tokens like 'offen' which are sometimes hard-coded into the // asm string. There are no MCInst operands for these. @@ -4610,12 +5710,6 @@ void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst, } } - // Copy $vdata_in operand and insert as $vdata for MUBUF_Atomic RTN insns. - if (IsAtomicReturn) { - MCInst::iterator I = Inst.begin(); // $vdata_in is always at the beginning. - Inst.insert(I, *I); - } - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset); if (!IsAtomic) { // glc is hard-coded. 
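// A simplified C++ sketch of the addOptionalImmOperand pattern used by these
// cvt* converters: optional modifiers seen while parsing are recorded in an
// index map, and conversion emits either the parsed value or a default.
// (Here the map holds values directly; the in-tree OptionalImmIndexMap holds
// operand indices into the parsed operand vector.)
#include <cstdint>
#include <map>
#include <vector>

enum class ImmKind { Offset, GLC, SLC, TFE, DLC };

static void addOptionalImm(std::vector<int64_t> &MCOperands,
                           const std::map<ImmKind, int64_t> &Parsed,
                           ImmKind Kind, int64_t Default = 0) {
  auto It = Parsed.find(Kind);
  MCOperands.push_back(It == Parsed.end() ? Default : It->second);
}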
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC); @@ -4625,6 +5719,9 @@ void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst, if (!IsLdsOpcode) { // tfe is not legal with lds opcodes addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE); } + + if (isGFX10()) + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDLC); } void AMDGPUAsmParser::cvtMtbuf(MCInst &Inst, const OperandVector &Operands) { @@ -4662,6 +5759,9 @@ void AMDGPUAsmParser::cvtMtbuf(MCInst &Inst, const OperandVector &Operands) { addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE); + + if (isGFX10()) + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDLC); } //===----------------------------------------------------------------------===// @@ -4692,19 +5792,26 @@ void AMDGPUAsmParser::cvtMIMG(MCInst &Inst, const OperandVector &Operands, Op.addRegOperands(Inst, 1); } else if (Op.isImmModifier()) { OptionalIdx[Op.getImmTy()] = I; - } else { + } else if (!Op.isToken()) { llvm_unreachable("unexpected operand type"); } } + bool IsGFX10 = isGFX10(); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDMask); + if (IsGFX10) + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDim, -1); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyUNorm); + if (IsGFX10) + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDLC); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyR128A16); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyLWE); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDA); + if (!IsGFX10) + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDA); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyD16); } @@ -4742,11 +5849,7 @@ AMDGPUOperand::Ptr AMDGPUAsmParser::defaultSMRDLiteralOffset() const { return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyOffset); } -AMDGPUOperand::Ptr AMDGPUAsmParser::defaultOffsetU12() const { - return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyOffset); -} - -AMDGPUOperand::Ptr AMDGPUAsmParser::defaultOffsetS13() const { +AMDGPUOperand::Ptr AMDGPUAsmParser::defaultFlatOffset() const { return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyOffset); } @@ -4801,7 +5904,8 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = { {"lds", AMDGPUOperand::ImmTyLDS, true, nullptr}, {"offset", AMDGPUOperand::ImmTyOffset, false, nullptr}, {"inst_offset", AMDGPUOperand::ImmTyInstOffset, false, nullptr}, - {"dfmt", AMDGPUOperand::ImmTyFORMAT, false, nullptr}, + {"dlc", AMDGPUOperand::ImmTyDLC, true, nullptr}, + {"format", AMDGPUOperand::ImmTyFORMAT, false, nullptr}, {"glc", AMDGPUOperand::ImmTyGLC, true, nullptr}, {"slc", AMDGPUOperand::ImmTySLC, true, nullptr}, {"tfe", AMDGPUOperand::ImmTyTFE, true, nullptr}, @@ -4816,9 +5920,11 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = { {"lwe", AMDGPUOperand::ImmTyLWE, true, nullptr}, {"d16", AMDGPUOperand::ImmTyD16, true, 
nullptr}, {"dmask", AMDGPUOperand::ImmTyDMask, false, nullptr}, + {"dim", AMDGPUOperand::ImmTyDim, false, nullptr}, {"row_mask", AMDGPUOperand::ImmTyDppRowMask, false, nullptr}, {"bank_mask", AMDGPUOperand::ImmTyDppBankMask, false, nullptr}, {"bound_ctrl", AMDGPUOperand::ImmTyDppBoundCtrl, false, ConvertBoundCtrl}, + {"fi", AMDGPUOperand::ImmTyDppFi, false, nullptr}, {"dst_sel", AMDGPUOperand::ImmTySdwaDstSel, false, nullptr}, {"src0_sel", AMDGPUOperand::ImmTySdwaSrc0Sel, false, nullptr}, {"src1_sel", AMDGPUOperand::ImmTySdwaSrc1Sel, false, nullptr}, @@ -4828,7 +5934,10 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = { {"op_sel", AMDGPUOperand::ImmTyOpSel, false, nullptr}, {"op_sel_hi", AMDGPUOperand::ImmTyOpSelHi, false, nullptr}, {"neg_lo", AMDGPUOperand::ImmTyNegLo, false, nullptr}, - {"neg_hi", AMDGPUOperand::ImmTyNegHi, false, nullptr} + {"neg_hi", AMDGPUOperand::ImmTyNegHi, false, nullptr}, + {"blgp", AMDGPUOperand::ImmTyBLGP, false, nullptr}, + {"cbsz", AMDGPUOperand::ImmTyCBSZ, false, nullptr}, + {"abid", AMDGPUOperand::ImmTyABID, false, nullptr} }; OperandMatchResultTy AMDGPUAsmParser::parseOptionalOperand(OperandVector &Operands) { @@ -4884,7 +5993,9 @@ OperandMatchResultTy AMDGPUAsmParser::parseOptionalOpr(OperandVector &Operands) Op.Type == AMDGPUOperand::ImmTyNegHi) { res = parseOperandArrayWithPrefix(Op.Name, Operands, Op.Type, Op.ConvertResult); - } else if (Op.Type == AMDGPUOperand::ImmTyFORMAT) { + } else if (Op.Type == AMDGPUOperand::ImmTyDim) { + res = parseDim(Operands); + } else if (Op.Type == AMDGPUOperand::ImmTyFORMAT && !isGFX10()) { res = parseDfmtNfmt(Operands); } else { res = parseIntWithPrefix(Op.Name, Operands, Op.Type, Op.ConvertResult); @@ -4964,7 +6075,7 @@ void AMDGPUAsmParser::cvtVOP3Interp(MCInst &Inst, const OperandVector &Operands) } else if (Op.isInterpSlot() || Op.isInterpAttr() || Op.isAttrChan()) { - Inst.addOperand(MCOperand::createImm(Op.Imm.Val)); + Inst.addOperand(MCOperand::createImm(Op.getImm())); } else if (Op.isImmModifier()) { OptionalIdx[Op.getImmTy()] = I; } else { @@ -5029,14 +6140,17 @@ void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands, addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI); } - // Special case v_mac_{f16, f32} and v_fmac_f32 (gfx906): + // Special case v_mac_{f16, f32} and v_fmac_{f16, f32} (gfx906/gfx10+): // it has src2 register operand that is tied to dst operand // we don't allow modifiers for this operand in assembler so src2_modifiers // should be 0. 
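// Roughly what the special case below does, as a standalone C++ sketch:
// splice a zero "src2_modifiers" immediate into the operand list so the tied
// src2 register lands at the index the opcode expects (the in-tree code finds
// that index with AMDGPU::getNamedOperandIdx).
#include <cstdint>
#include <vector>

static void insertZeroSrc2Mods(std::vector<int64_t> &Ops, size_t ModIdx) {
  Ops.insert(Ops.begin() + ModIdx, 0); // modifiers are not allowed on src2
}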
- if (Opc == AMDGPU::V_MAC_F32_e64_si || + if (Opc == AMDGPU::V_MAC_F32_e64_gfx6_gfx7 || + Opc == AMDGPU::V_MAC_F32_e64_gfx10 || Opc == AMDGPU::V_MAC_F32_e64_vi || Opc == AMDGPU::V_MAC_F16_e64_vi || - Opc == AMDGPU::V_FMAC_F32_e64_vi) { + Opc == AMDGPU::V_FMAC_F32_e64_gfx10 || + Opc == AMDGPU::V_FMAC_F32_e64_vi || + Opc == AMDGPU::V_FMAC_F16_e64_gfx10) { auto it = Inst.begin(); std::advance(it, AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2_modifiers)); it = Inst.insert(it, MCOperand::createImm(0)); // no modifiers for src2 @@ -5137,6 +6251,10 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, // dpp //===----------------------------------------------------------------------===// +bool AMDGPUOperand::isDPP8() const { + return isImmTy(ImmTyDPP8); +} + bool AMDGPUOperand::isDPPCtrl() const { using namespace AMDGPU::DPP; @@ -5154,13 +6272,27 @@ bool AMDGPUOperand::isDPPCtrl() const { (Imm == DppCtrl::ROW_MIRROR) || (Imm == DppCtrl::ROW_HALF_MIRROR) || (Imm == DppCtrl::BCAST15) || - (Imm == DppCtrl::BCAST31); + (Imm == DppCtrl::BCAST31) || + (Imm >= DppCtrl::ROW_SHARE_FIRST && Imm <= DppCtrl::ROW_SHARE_LAST) || + (Imm >= DppCtrl::ROW_XMASK_FIRST && Imm <= DppCtrl::ROW_XMASK_LAST); } return false; } -bool AMDGPUOperand::isGPRIdxMode() const { - return isImm() && isUInt<4>(getImm()); +//===----------------------------------------------------------------------===// +// mAI +//===----------------------------------------------------------------------===// + +bool AMDGPUOperand::isBLGP() const { + return isImm() && getImmTy() == ImmTyBLGP && isUInt<3>(getImm()); +} + +bool AMDGPUOperand::isCBSZ() const { + return isImm() && getImmTy() == ImmTyCBSZ && isUInt<3>(getImm()); +} + +bool AMDGPUOperand::isABID() const { + return isImm() && getImmTy() == ImmTyABID && isUInt<4>(getImm()); } bool AMDGPUOperand::isS16Imm() const { @@ -5171,6 +6303,108 @@ bool AMDGPUOperand::isU16Imm() const { return isImm() && isUInt<16>(getImm()); } +OperandMatchResultTy AMDGPUAsmParser::parseDim(OperandVector &Operands) { + if (!isGFX10()) + return MatchOperand_NoMatch; + + SMLoc S = Parser.getTok().getLoc(); + + if (getLexer().isNot(AsmToken::Identifier)) + return MatchOperand_NoMatch; + if (getLexer().getTok().getString() != "dim") + return MatchOperand_NoMatch; + + Parser.Lex(); + if (getLexer().isNot(AsmToken::Colon)) + return MatchOperand_ParseFail; + + Parser.Lex(); + + // We want to allow "dim:1D" etc., but the initial 1 is tokenized as an + // integer. 
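// A C++ sketch of the token re-joining trick used here, assuming a lexer
// that reports each token's source range: "1D" arrives as Integer("1")
// followed by Identifier("D"), and the two are glued back together only when
// nothing (such as whitespace) separates them.
struct TokenRange { const char *Begin; const char *End; };

static bool areAdjacent(const TokenRange &A, const TokenRange &B) {
  return A.End == B.Begin; // equal pointers => no gap between the tokens
}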
+ std::string Token; + if (getLexer().is(AsmToken::Integer)) { + SMLoc Loc = getLexer().getTok().getEndLoc(); + Token = getLexer().getTok().getString(); + Parser.Lex(); + if (getLexer().getTok().getLoc() != Loc) + return MatchOperand_ParseFail; + } + if (getLexer().isNot(AsmToken::Identifier)) + return MatchOperand_ParseFail; + Token += getLexer().getTok().getString(); + + StringRef DimId = Token; + if (DimId.startswith("SQ_RSRC_IMG_")) + DimId = DimId.substr(12); + + const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfoByAsmSuffix(DimId); + if (!DimInfo) + return MatchOperand_ParseFail; + + Parser.Lex(); + + Operands.push_back(AMDGPUOperand::CreateImm(this, DimInfo->Encoding, S, + AMDGPUOperand::ImmTyDim)); + return MatchOperand_Success; +} + +OperandMatchResultTy AMDGPUAsmParser::parseDPP8(OperandVector &Operands) { + SMLoc S = Parser.getTok().getLoc(); + StringRef Prefix; + + if (getLexer().getKind() == AsmToken::Identifier) { + Prefix = Parser.getTok().getString(); + } else { + return MatchOperand_NoMatch; + } + + if (Prefix != "dpp8") + return parseDPPCtrl(Operands); + if (!isGFX10()) + return MatchOperand_NoMatch; + + // dpp8:[%d,%d,%d,%d,%d,%d,%d,%d] + + int64_t Sels[8]; + + Parser.Lex(); + if (getLexer().isNot(AsmToken::Colon)) + return MatchOperand_ParseFail; + + Parser.Lex(); + if (getLexer().isNot(AsmToken::LBrac)) + return MatchOperand_ParseFail; + + Parser.Lex(); + if (getParser().parseAbsoluteExpression(Sels[0])) + return MatchOperand_ParseFail; + if (0 > Sels[0] || 7 < Sels[0]) + return MatchOperand_ParseFail; + + for (size_t i = 1; i < 8; ++i) { + if (getLexer().isNot(AsmToken::Comma)) + return MatchOperand_ParseFail; + + Parser.Lex(); + if (getParser().parseAbsoluteExpression(Sels[i])) + return MatchOperand_ParseFail; + if (0 > Sels[i] || 7 < Sels[i]) + return MatchOperand_ParseFail; + } + + if (getLexer().isNot(AsmToken::RBrac)) + return MatchOperand_ParseFail; + Parser.Lex(); + + unsigned DPP8 = 0; + for (size_t i = 0; i < 8; ++i) + DPP8 |= (Sels[i] << (i * 3)); + + Operands.push_back(AMDGPUOperand::CreateImm(this, DPP8, S, AMDGPUOperand::ImmTyDPP8)); + return MatchOperand_Success; +} + OperandMatchResultTy AMDGPUAsmParser::parseDPPCtrl(OperandVector &Operands) { using namespace AMDGPU::DPP; @@ -5201,10 +6435,21 @@ AMDGPUAsmParser::parseDPPCtrl(OperandVector &Operands) { && Prefix != "wave_rol" && Prefix != "wave_shr" && Prefix != "wave_ror" - && Prefix != "row_bcast") { + && Prefix != "row_bcast" + && Prefix != "row_share" + && Prefix != "row_xmask") { return MatchOperand_NoMatch; } + if (!isGFX10() && (Prefix == "row_share" || Prefix == "row_xmask")) + return MatchOperand_NoMatch; + + if (!isVI() && !isGFX9() && + (Prefix == "wave_shl" || Prefix == "wave_shr" || + Prefix == "wave_rol" || Prefix == "wave_ror" || + Prefix == "row_bcast")) + return MatchOperand_NoMatch; + Parser.Lex(); if (getLexer().isNot(AsmToken::Colon)) return MatchOperand_ParseFail; @@ -5262,6 +6507,10 @@ AMDGPUAsmParser::parseDPPCtrl(OperandVector &Operands) { } else { return MatchOperand_ParseFail; } + } else if (Prefix == "row_share" && 0 <= Int && Int <= 15) { + Int |= DppCtrl::ROW_SHARE_FIRST; + } else if (Prefix == "row_xmask" && 0 <= Int && Int <= 15) { + Int |= DppCtrl::ROW_XMASK_FIRST; } else { return MatchOperand_ParseFail; } @@ -5276,6 +6525,10 @@ AMDGPUOperand::Ptr AMDGPUAsmParser::defaultRowMask() const { return AMDGPUOperand::CreateImm(this, 0xf, SMLoc(), AMDGPUOperand::ImmTyDppRowMask); } +AMDGPUOperand::Ptr AMDGPUAsmParser::defaultEndpgmImmOperands() const { + return 
AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyEndpgm); +} + AMDGPUOperand::Ptr AMDGPUAsmParser::defaultBankMask() const { return AMDGPUOperand::CreateImm(this, 0xf, SMLoc(), AMDGPUOperand::ImmTyDppBankMask); } @@ -5284,7 +6537,11 @@ AMDGPUOperand::Ptr AMDGPUAsmParser::defaultBoundCtrl() const { return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyDppBoundCtrl); } -void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands) { +AMDGPUOperand::Ptr AMDGPUAsmParser::defaultFI() const { + return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyDppFi); +} + +void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands, bool IsDPP8) { OptionalImmIndexMap OptionalIdx; unsigned I = 1; @@ -5293,6 +6550,7 @@ void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands) { ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1); } + int Fi = 0; for (unsigned E = Operands.size(); I != E; ++I) { auto TiedTo = Desc.getOperandConstraint(Inst.getNumOperands(), MCOI::TIED_TO); @@ -5303,25 +6561,49 @@ void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands) { } AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]); // Add the register arguments - if (Op.isReg() && Op.Reg.RegNo == AMDGPU::VCC) { + if (Op.isReg() && validateVccOperand(Op.getReg())) { // VOP2b (v_add_u32, v_sub_u32 ...) dpp use "vcc" token. // Skip it. continue; - } if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) { - Op.addRegWithFPInputModsOperands(Inst, 2); - } else if (Op.isDPPCtrl()) { - Op.addImmOperands(Inst, 1); - } else if (Op.isImm()) { - // Handle optional arguments - OptionalIdx[Op.getImmTy()] = I; + } + + if (IsDPP8) { + if (Op.isDPP8()) { + Op.addImmOperands(Inst, 1); + } else if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) { + Op.addRegWithFPInputModsOperands(Inst, 2); + } else if (Op.isFI()) { + Fi = Op.getImm(); + } else if (Op.isReg()) { + Op.addRegOperands(Inst, 1); + } else { + llvm_unreachable("Invalid operand type"); + } } else { - llvm_unreachable("Invalid operand type"); + if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) { + Op.addRegWithFPInputModsOperands(Inst, 2); + } else if (Op.isDPPCtrl()) { + Op.addImmOperands(Inst, 1); + } else if (Op.isImm()) { + // Handle optional arguments + OptionalIdx[Op.getImmTy()] = I; + } else { + llvm_unreachable("Invalid operand type"); + } } } - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppRowMask, 0xf); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppBankMask, 0xf); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppBoundCtrl); + if (IsDPP8) { + using namespace llvm::AMDGPU::DPP; + Inst.addOperand(MCOperand::createImm(Fi? 
DPP8_FI_1 : DPP8_FI_0)); + } else { + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppRowMask, 0xf); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppBankMask, 0xf); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppBoundCtrl); + if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::fi) != -1) { + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppFi); + } + } } //===----------------------------------------------------------------------===// @@ -5422,7 +6704,8 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, for (unsigned E = Operands.size(); I != E; ++I) { AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]); - if (skipVcc && !skippedVcc && Op.isReg() && Op.Reg.RegNo == AMDGPU::VCC) { + if (skipVcc && !skippedVcc && Op.isReg() && + (Op.getReg() == AMDGPU::VCC || Op.getReg() == AMDGPU::VCC_LO)) { // VOP2b (v_add_u32, v_sub_u32 ...) sdwa use "vcc" token as dst. // Skip it if it's 2nd (e.g. v_add_i32_sdwa v1, vcc, v2, v3) // or 4th (v_addc_u32_sdwa v1, vcc, v2, v3, vcc) operand. @@ -5448,7 +6731,8 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, skippedVcc = false; } - if (Inst.getOpcode() != AMDGPU::V_NOP_sdwa_gfx9 && + if (Inst.getOpcode() != AMDGPU::V_NOP_sdwa_gfx10 && + Inst.getOpcode() != AMDGPU::V_NOP_sdwa_gfx9 && Inst.getOpcode() != AMDGPU::V_NOP_sdwa_vi) { // v_nop_sdwa_sdwa_vi/gfx9 has no optional sdwa arguments switch (BasicInstType) { @@ -5474,7 +6758,8 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, break; case SIInstrFlags::VOPC: - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0); + if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::clamp) != -1) + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, SdwaSel::DWORD); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc1Sel, SdwaSel::DWORD); break; @@ -5495,6 +6780,22 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, } } +//===----------------------------------------------------------------------===// +// mAI +//===----------------------------------------------------------------------===// + +AMDGPUOperand::Ptr AMDGPUAsmParser::defaultBLGP() const { + return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyBLGP); +} + +AMDGPUOperand::Ptr AMDGPUAsmParser::defaultCBSZ() const { + return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyCBSZ); +} + +AMDGPUOperand::Ptr AMDGPUAsmParser::defaultABID() const { + return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyABID); +} + /// Force static initialization. 
extern "C" void LLVMInitializeAMDGPUAsmParser() { RegisterMCAsmParser<AMDGPUAsmParser> A(getTheAMDGPUTarget()); @@ -5552,3 +6853,28 @@ unsigned AMDGPUAsmParser::validateTargetOperandClass(MCParsedAsmOperand &Op, return Match_InvalidOperand; } } + +//===----------------------------------------------------------------------===// +// endpgm +//===----------------------------------------------------------------------===// + +OperandMatchResultTy AMDGPUAsmParser::parseEndpgmOp(OperandVector &Operands) { + SMLoc S = Parser.getTok().getLoc(); + int64_t Imm = 0; + + if (!parseExpr(Imm)) { + // The operand is optional, if not present default to 0 + Imm = 0; + } + + if (!isUInt<16>(Imm)) { + Error(S, "expected a 16-bit value"); + return MatchOperand_ParseFail; + } + + Operands.push_back( + AMDGPUOperand::CreateImm(this, Imm, S, AMDGPUOperand::ImmTyEndpgm)); + return MatchOperand_Success; +} + +bool AMDGPUOperand::isEndpgm() const { return isImmTy(ImmTyEndpgm); } diff --git a/lib/Target/AMDGPU/BUFInstructions.td b/lib/Target/AMDGPU/BUFInstructions.td index 51c2abeac2ff..62a19d848af2 100644 --- a/lib/Target/AMDGPU/BUFInstructions.td +++ b/lib/Target/AMDGPU/BUFInstructions.td @@ -1,37 +1,22 @@ //===-- BUFInstructions.td - Buffer Instruction Defintions ----------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// def MUBUFAddr32 : ComplexPattern<i64, 9, "SelectMUBUFAddr32">; -def MUBUFAddr64 : ComplexPattern<i64, 7, "SelectMUBUFAddr64">; +def MUBUFAddr64 : ComplexPattern<i64, 8, "SelectMUBUFAddr64">; def MUBUFAddr64Atomic : ComplexPattern<i64, 5, "SelectMUBUFAddr64">; def MUBUFScratchOffen : ComplexPattern<i64, 4, "SelectMUBUFScratchOffen", [], [SDNPWantParent]>; def MUBUFScratchOffset : ComplexPattern<i64, 3, "SelectMUBUFScratchOffset", [], [SDNPWantParent], 20>; -def MUBUFOffset : ComplexPattern<i64, 6, "SelectMUBUFOffset">; +def MUBUFOffset : ComplexPattern<i64, 7, "SelectMUBUFOffset">; def MUBUFOffsetNoGLC : ComplexPattern<i64, 3, "SelectMUBUFOffset">; def MUBUFOffsetAtomic : ComplexPattern<i64, 4, "SelectMUBUFOffset">; -class MubufLoad <SDPatternOperator op> : PatFrag < - (ops node:$ptr), (op node:$ptr), [{ - auto const AS = cast<MemSDNode>(N)->getAddressSpace(); - return AS == AMDGPUAS::GLOBAL_ADDRESS || - AS == AMDGPUAS::CONSTANT_ADDRESS; -}]>; - -def mubuf_load : MubufLoad <load>; -def mubuf_az_extloadi8 : MubufLoad <az_extloadi8>; -def mubuf_sextloadi8 : MubufLoad <sextloadi8>; -def mubuf_az_extloadi16 : MubufLoad <az_extloadi16>; -def mubuf_sextloadi16 : MubufLoad <sextloadi16>; -def mubuf_load_atomic : MubufLoad <atomic_load>; - def BUFAddrKind { int Offset = 0; int OffEn = 1; @@ -97,7 +82,9 @@ class MTBUF_Pseudo <string opName, dag outs, dag ins, bits<1> has_vdata = 1; bits<1> has_vaddr = 1; bits<1> has_glc = 1; + bits<1> has_dlc = 1; bits<1> glc_value = 0; // the value for glc if no such operand + bits<1> dlc_value = 0; // the value for dlc if no such operand bits<1> has_srsrc = 1; bits<1> has_soffset = 1; bits<1> has_offset = 1; @@ -120,6 +107,7 @@ class MTBUF_Real <MTBUF_Pseudo ps> : bits<12> offset; bits<1> glc; + bits<1> dlc; bits<7> format; bits<8> vaddr; bits<8> vdata; @@ -138,17 
+126,17 @@ class getMTBUFInsDA<list<RegisterClass> vdataList, RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList)); dag InsNoData = !if(!empty(vaddrList), (ins SReg_128:$srsrc, SCSrc_b32:$soffset, - offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe), + offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe, DLC:$dlc), (ins vaddrClass:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset, - offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe) + offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe, DLC:$dlc) ); dag InsData = !if(!empty(vaddrList), (ins vdataClass:$vdata, SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, FORMAT:$format, GLC:$glc, - SLC:$slc, TFE:$tfe), + SLC:$slc, TFE:$tfe, DLC:$dlc), (ins vdataClass:$vdata, vaddrClass:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, FORMAT:$format, GLC:$glc, - SLC:$slc, TFE:$tfe) + SLC:$slc, TFE:$tfe, DLC:$dlc) ); dag ret = !if(!empty(vdataList), InsNoData, InsData); } @@ -199,7 +187,7 @@ class MTBUF_Load_Pseudo <string opName, : MTBUF_Pseudo<opName, (outs vdataClass:$vdata), getMTBUFIns<addrKindCopy>.ret, - " $vdata, " # getMTBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe", + " $vdata, " # getMTBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe$dlc", pattern>, MTBUF_SetupAddr<addrKindCopy> { let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret; @@ -214,13 +202,13 @@ multiclass MTBUF_Pseudo_Loads<string opName, RegisterClass vdataClass, def _OFFSET : MTBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass, [(set load_vt:$vdata, (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i8:$format, - i1:$glc, i1:$slc, i1:$tfe)))]>, + i1:$glc, i1:$slc, i1:$tfe, i1:$dlc)))]>, MTBUFAddr64Table<0, NAME>; def _ADDR64 : MTBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, [(set load_vt:$vdata, (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, - i8:$format, i1:$glc, i1:$slc, i1:$tfe)))]>, + i8:$format, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc)))]>, MTBUFAddr64Table<1, NAME>; def _OFFEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>; @@ -245,7 +233,7 @@ class MTBUF_Store_Pseudo <string opName, : MTBUF_Pseudo<opName, (outs), getMTBUFIns<addrKindCopy, [vdataClassCopy]>.ret, - " $vdata, " # getMTBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe", + " $vdata, " # getMTBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe$dlc", pattern>, MTBUF_SetupAddr<addrKindCopy> { let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret; @@ -260,13 +248,13 @@ multiclass MTBUF_Pseudo_Stores<string opName, RegisterClass vdataClass, def _OFFSET : MTBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass, [(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i8:$format, i1:$glc, - i1:$slc, i1:$tfe))]>, + i1:$slc, i1:$tfe, i1:$dlc))]>, MTBUFAddr64Table<0, NAME>; def _ADDR64 : MTBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, [(st store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i8:$format, i1:$glc, - i1:$slc, i1:$tfe))]>, + i1:$slc, i1:$tfe, i1:$dlc))]>, MTBUFAddr64Table<1, NAME>; def _OFFEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>; @@ -324,7 +312,9 @@ class MUBUF_Pseudo <string opName, dag outs, dag ins, bits<1> has_vdata = 1; bits<1> has_vaddr = 1; bits<1> has_glc = 1; + bits<1> has_dlc = 1; bits<1> glc_value = 0; // the value for glc if no such operand + bits<1> dlc_value = 0; // the value for dlc if no such operand bits<1> has_srsrc = 1; bits<1> has_soffset = 1; bits<1> has_offset = 1; @@ -333,7 +323,7 
@@ class MUBUF_Pseudo <string opName, dag outs, dag ins, bits<4> dwords = 0; } -class MUBUF_Real <bits<7> op, MUBUF_Pseudo ps> : +class MUBUF_Real <MUBUF_Pseudo ps> : InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []> { let isPseudo = 0; @@ -348,6 +338,7 @@ class MUBUF_Real <bits<7> op, MUBUF_Pseudo ps> : bits<12> offset; bits<1> glc; + bits<1> dlc; bits<8> vaddr; bits<8> vdata; bits<7> srsrc; @@ -358,7 +349,7 @@ class MUBUF_Real <bits<7> op, MUBUF_Pseudo ps> : // For cache invalidation instructions. -class MUBUF_Invalidate <string opName, SDPatternOperator node> : +class MUBUF_Invalidate <string opName, SDPatternOperator node = null_frag> : MUBUF_Pseudo<opName, (outs), (ins), "", [(node)]> { let AsmMatchConverter = ""; @@ -373,7 +364,9 @@ class MUBUF_Invalidate <string opName, SDPatternOperator node> : let has_vdata = 0; let has_vaddr = 0; let has_glc = 0; + let has_dlc = 0; let glc_value = 0; + let dlc_value = 0; let has_srsrc = 0; let has_soffset = 0; let has_offset = 0; @@ -400,7 +393,7 @@ class getMUBUFInsDA<list<RegisterClass> vdataList, ); dag ret = !con( !if(!empty(vdataList), InsNoData, InsData), - !if(isLds, (ins), (ins TFE:$tfe)) + !if(isLds, (ins DLC:$dlc), (ins TFE:$tfe, DLC:$dlc)) ); } @@ -460,7 +453,7 @@ class MUBUF_Load_Pseudo <string opName, !con(getMUBUFIns<addrKindCopy, [], isLds>.ret, !if(HasTiedDest, (ins vdataClass:$vdata_in), (ins))), " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$glc$slc" # - !if(isLds, " lds", "$tfe"), + !if(isLds, " lds", "$tfe") # "$dlc", pattern>, MUBUF_SetupAddr<addrKindCopy> { let PseudoInstr = opName # !if(isLds, "_lds", "") # @@ -477,6 +470,24 @@ class MUBUF_Load_Pseudo <string opName, let dwords = getMUBUFDwords<vdataClass>.ret; } +class MUBUF_Offset_Load_Pat <Instruction inst, ValueType load_vt = i32, SDPatternOperator ld = null_frag> : Pat < + (load_vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))), + (load_vt (inst v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc)) +>; + +class MUBUF_Addr64_Load_Pat <Instruction inst, + ValueType load_vt = i32, + SDPatternOperator ld = null_frag> : Pat < + (load_vt (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))), + (load_vt (inst i64:$vaddr, v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc)) +>; + +multiclass MUBUF_Pseudo_Load_Pats<string BaseInst, ValueType load_vt = i32, SDPatternOperator ld = null_frag> { + def : MUBUF_Offset_Load_Pat<!cast<Instruction>(BaseInst#"_OFFSET"), load_vt, ld>; + def : MUBUF_Addr64_Load_Pat<!cast<Instruction>(BaseInst#"_ADDR64"), load_vt, ld>; +} + + // FIXME: tfe can't be an operand because it requires a separate // opcode because it needs an N+1 register class dest register. 
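// The _OFFSET/_OFFEN/_IDXEN/_BOTHEN variants instantiated below differ only
// in which VGPR address sources are present. A C++ sketch of the usual MUBUF
// selection rule, mirroring the intrinsic patterns later in this file where
// a vindex selects IdxEn and a voffset selects OffEn:
enum class BufAddrKind { Offset, OffEn, IdxEn, BothEn };

static BufAddrKind selectBufAddrKind(bool HasVIndex, bool HasVOffset) {
  if (HasVIndex && HasVOffset) return BufAddrKind::BothEn;
  if (HasVIndex)               return BufAddrKind::IdxEn;
  if (HasVOffset)              return BufAddrKind::OffEn;
  return BufAddrKind::Offset;
}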
multiclass MUBUF_Pseudo_Loads<string opName, RegisterClass vdataClass, @@ -485,20 +496,10 @@ multiclass MUBUF_Pseudo_Loads<string opName, RegisterClass vdataClass, bit TiedDest = 0, bit isLds = 0> { - def _OFFSET : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass, - TiedDest, isLds, - !if(isLds, - [], - [(set load_vt:$vdata, - (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe)))])>, + def _OFFSET : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass, TiedDest, isLds>, MUBUFAddr64Table<0, NAME # !if(isLds, "_LDS", "")>; - def _ADDR64 : MUBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, - TiedDest, isLds, - !if(isLds, - [], - [(set load_vt:$vdata, - (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe)))])>, + def _ADDR64 : MUBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, TiedDest, isLds>, MUBUFAddr64Table<1, NAME # !if(isLds, "_LDS", "")>; def _OFFEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, TiedDest, isLds>; @@ -531,7 +532,7 @@ class MUBUF_Store_Pseudo <string opName, : MUBUF_Pseudo<opName, (outs), getMUBUFIns<addrKindCopy, [vdataClassCopy]>.ret, - " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe", + " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe$dlc", pattern>, MUBUF_SetupAddr<addrKindCopy> { let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret; @@ -547,12 +548,12 @@ multiclass MUBUF_Pseudo_Stores<string opName, RegisterClass vdataClass, def _OFFSET : MUBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass, [(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset, - i16:$offset, i1:$glc, i1:$slc, i1:$tfe))]>, + i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))]>, MUBUFAddr64Table<0, NAME>; def _ADDR64 : MUBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, [(st store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, - i16:$offset, i1:$glc, i1:$slc, i1:$tfe))]>, + i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))]>, MUBUFAddr64Table<1, NAME>; def _OFFEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>; @@ -638,6 +639,7 @@ class MUBUF_Atomic_Pseudo<string opName, let hasSideEffects = 1; let DisableWQM = 1; let has_glc = 0; + let has_dlc = 0; let has_tfe = 0; let maybeAtomic = 1; } @@ -656,6 +658,7 @@ class MUBUF_AtomicNoRet_Pseudo<string opName, int addrKind, AtomicNoRet<opName # "_" # getAddrName<addrKindCopy>.ret, 0> { let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret; let glc_value = 0; + let dlc_value = 0; let AsmMatchConverter = "cvtMubufAtomic"; } @@ -673,6 +676,7 @@ class MUBUF_AtomicRet_Pseudo<string opName, int addrKind, AtomicNoRet<opName # "_" # getAddrName<addrKindCopy>.ret, 1> { let PseudoInstr = opName # "_rtn_" # getAddrName<addrKindCopy>.ret; let glc_value = 1; + let dlc_value = 0; let Constraints = "$vdata = $vdata_in"; let DisableEncoding = "$vdata_in"; let AsmMatchConverter = "cvtMubufAtomicReturn"; @@ -681,34 +685,53 @@ class MUBUF_AtomicRet_Pseudo<string opName, int addrKind, multiclass MUBUF_Pseudo_Atomics_NO_RTN <string opName, RegisterClass vdataClass, ValueType vdataType, - SDPatternOperator atomic> { + SDPatternOperator atomic, + bit isFP = getIsFP<vdataType>.ret> { + let FPAtomic = isFP in def _OFFSET : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass>, MUBUFAddr64Table <0, NAME>; + + let FPAtomic = isFP in def _ADDR64 : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass>, MUBUFAddr64Table <1, NAME>; + + let FPAtomic 
= isFP in def _OFFEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>; + + let FPAtomic = isFP in + def _IDXEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>; + + let FPAtomic = isFP in def _BOTHEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>; } multiclass MUBUF_Pseudo_Atomics_RTN <string opName, RegisterClass vdataClass, ValueType vdataType, - SDPatternOperator atomic> { + SDPatternOperator atomic, + bit isFP = getIsFP<vdataType>.ret> { + let FPAtomic = isFP in def _OFFSET_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass, [(set vdataType:$vdata, (atomic (MUBUFOffsetAtomic v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$slc), vdataType:$vdata_in))]>, MUBUFAddr64Table <0, NAME # "_RTN">; + let FPAtomic = isFP in def _ADDR64_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, [(set vdataType:$vdata, (atomic (MUBUFAddr64Atomic v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$slc), vdataType:$vdata_in))]>, MUBUFAddr64Table <1, NAME # "_RTN">; + let FPAtomic = isFP in def _OFFEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>; + + let FPAtomic = isFP in def _IDXEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>; + + let FPAtomic = isFP in def _BOTHEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>; } @@ -804,34 +827,45 @@ let SubtargetPredicate = HasPackedD16VMem, D16Buf = 1 in { } // End HasPackedD16VMem. defm BUFFER_LOAD_UBYTE : MUBUF_Pseudo_Loads_Lds < - "buffer_load_ubyte", VGPR_32, i32, mubuf_az_extloadi8 + "buffer_load_ubyte", VGPR_32, i32 >; defm BUFFER_LOAD_SBYTE : MUBUF_Pseudo_Loads_Lds < - "buffer_load_sbyte", VGPR_32, i32, mubuf_sextloadi8 + "buffer_load_sbyte", VGPR_32, i32 >; defm BUFFER_LOAD_USHORT : MUBUF_Pseudo_Loads_Lds < - "buffer_load_ushort", VGPR_32, i32, mubuf_az_extloadi16 + "buffer_load_ushort", VGPR_32, i32 >; defm BUFFER_LOAD_SSHORT : MUBUF_Pseudo_Loads_Lds < - "buffer_load_sshort", VGPR_32, i32, mubuf_sextloadi16 + "buffer_load_sshort", VGPR_32, i32 >; defm BUFFER_LOAD_DWORD : MUBUF_Pseudo_Loads_Lds < - "buffer_load_dword", VGPR_32, i32, mubuf_load + "buffer_load_dword", VGPR_32, i32 >; defm BUFFER_LOAD_DWORDX2 : MUBUF_Pseudo_Loads < - "buffer_load_dwordx2", VReg_64, v2i32, mubuf_load + "buffer_load_dwordx2", VReg_64, v2i32 >; defm BUFFER_LOAD_DWORDX3 : MUBUF_Pseudo_Loads < - "buffer_load_dwordx3", VReg_96, untyped, mubuf_load + "buffer_load_dwordx3", VReg_96, v3i32 >; defm BUFFER_LOAD_DWORDX4 : MUBUF_Pseudo_Loads < - "buffer_load_dwordx4", VReg_128, v4i32, mubuf_load + "buffer_load_dwordx4", VReg_128, v4i32 >; +defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i32, extloadi8_global>; +defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i32, zextloadi8_global>; +defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_SBYTE", i32, sextloadi8_global>; +defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_USHORT", i32, extloadi16_global>; +defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_USHORT", i32, zextloadi16_global>; +defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_SSHORT", i32, sextloadi16_global>; +defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORD", i32, load_global>; +defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX2", v2i32, load_global>; +defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX3", v3i32, load_global>; +defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX4", v4i32, load_global>; + // This is not described in AMD documentation, // but 'lds' versions of these opcodes are available // in at least GFX8+ chips. See Bug 37653. 
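// For orientation, a plain C++ restatement of the cachepolicy bit layout
// decoded by the extract_glc/extract_slc/extract_dlc SDNodeXForms in this
// file (per this patch: glc = bit 0, slc = bit 1, dlc = bit 2):
#include <cstdint>

struct CachePolicy { bool Glc, Slc, Dlc; };

static CachePolicy decodeCachePolicy(uint64_t Imm) {
  return {(Imm & 1) != 0, ((Imm >> 1) & 1) != 0, ((Imm >> 2) & 1) != 0};
}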
-let SubtargetPredicate = isVI in { +let SubtargetPredicate = isGFX8GFX9 in { defm BUFFER_LOAD_DWORDX2_LDS : MUBUF_Pseudo_Loads < "buffer_load_dwordx2", VReg_64, v2i32, null_frag, 0, 1 >; @@ -856,7 +890,7 @@ defm BUFFER_STORE_DWORDX2 : MUBUF_Pseudo_Stores < "buffer_store_dwordx2", VReg_64, v2i32, store_global >; defm BUFFER_STORE_DWORDX3 : MUBUF_Pseudo_Stores < - "buffer_store_dwordx3", VReg_96, untyped, store_global + "buffer_store_dwordx3", VReg_96, v3i32, store_global >; defm BUFFER_STORE_DWORDX4 : MUBUF_Pseudo_Stores < "buffer_store_dwordx4", VReg_128, v4i32, store_global @@ -940,11 +974,11 @@ defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Pseudo_Atomics < "buffer_atomic_dec_x2", VReg_64, i64, atomic_dec_global >; -let SubtargetPredicate = isVI in { +let SubtargetPredicate = isGFX8GFX9 in { def BUFFER_STORE_LDS_DWORD : MUBUF_Pseudo_Store_Lds <"buffer_store_lds_dword">; } -let SubtargetPredicate = isSI in { // isn't on CI & VI +let SubtargetPredicate = isGFX6 in { // isn't on CI & VI /* defm BUFFER_ATOMIC_RSUB : MUBUF_Pseudo_Atomics <"buffer_atomic_rsub">; defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Pseudo_Atomics <"buffer_atomic_fcmpswap">; @@ -1006,17 +1040,28 @@ defm BUFFER_STORE_FORMAT_D16_HI_X : MUBUF_Pseudo_Stores < def BUFFER_WBINVL1 : MUBUF_Invalidate <"buffer_wbinvl1", int_amdgcn_buffer_wbinvl1>; +let SubtargetPredicate = HasAtomicFaddInsts in { + +defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_NO_RTN < + "buffer_atomic_add_f32", VGPR_32, f32, atomic_add_global +>; +defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_NO_RTN < + "buffer_atomic_pk_add_f16", VGPR_32, v2f16, atomic_add_global +>; + +} // End SubtargetPredicate = HasAtomicFaddInsts + //===----------------------------------------------------------------------===// // MTBUF Instructions //===----------------------------------------------------------------------===// defm TBUFFER_LOAD_FORMAT_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_x", VGPR_32>; defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_xy", VReg_64>; -defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyz", VReg_128>; +defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyz", VReg_96>; defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyzw", VReg_128>; defm TBUFFER_STORE_FORMAT_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_x", VGPR_32>; defm TBUFFER_STORE_FORMAT_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_xy", VReg_64>; -defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyz", VReg_128>; +defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyz", VReg_96>; defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyzw", VReg_128>; let SubtargetPredicate = HasUnpackedD16VMem, D16Buf = 1 in { @@ -1041,19 +1086,21 @@ let SubtargetPredicate = HasPackedD16VMem, D16Buf = 1 in { defm TBUFFER_STORE_FORMAT_D16_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyzw", VReg_64>; } // End HasPackedD16VMem. -let SubtargetPredicate = isCIVI in { +let SubtargetPredicate = isGFX7Plus in { //===----------------------------------------------------------------------===// // Instruction definitions for CI and newer. 
//===----------------------------------------------------------------------===// -// Remaining instructions: -// BUFFER_LOAD_DWORDX3 -// BUFFER_STORE_DWORDX3 def BUFFER_WBINVL1_VOL : MUBUF_Invalidate <"buffer_wbinvl1_vol", int_amdgcn_buffer_wbinvl1_vol>; -} // End let SubtargetPredicate = isCIVI +} // End let SubtargetPredicate = isGFX7Plus + +let SubtargetPredicate = isGFX10Plus in { + def BUFFER_GL0_INV : MUBUF_Invalidate<"buffer_gl0_inv">; + def BUFFER_GL1_INV : MUBUF_Invalidate<"buffer_gl1_inv">; +} // End SubtargetPredicate = isGFX10Plus //===----------------------------------------------------------------------===// // MUBUF Patterns @@ -1067,6 +1114,10 @@ def extract_slc : SDNodeXForm<imm, [{ return CurDAG->getTargetConstant((N->getZExtValue() >> 1) & 1, SDLoc(N), MVT::i8); }]>; +def extract_dlc : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant((N->getZExtValue() >> 2) & 1, SDLoc(N), MVT::i8); +}]>; + //===----------------------------------------------------------------------===// // buffer_load/store_format patterns //===----------------------------------------------------------------------===// @@ -1077,21 +1128,21 @@ multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt, (vt (name v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset, imm:$cachepolicy, 0)), (!cast<MUBUF_Pseudo>(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0) + (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) >; def : GCNPat< (vt (name v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset, imm:$cachepolicy, 0)), (!cast<MUBUF_Pseudo>(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0) + (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) >; def : GCNPat< (vt (name v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset, imm:$cachepolicy, imm)), (!cast<MUBUF_Pseudo>(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0) + (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) >; def : GCNPat< @@ -1100,7 +1151,7 @@ multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt, (!cast<MUBUF_Pseudo>(opcode # _BOTHEN) (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), $rsrc, $soffset, (as_i16imm $offset), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0) + (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) >; } @@ -1108,6 +1159,8 @@ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, f32, "BUFFER_LOAD_FORMAT_X"> defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, i32, "BUFFER_LOAD_FORMAT_X">; defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v2f32, "BUFFER_LOAD_FORMAT_XY">; defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v2i32, "BUFFER_LOAD_FORMAT_XY">; +defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v3f32, "BUFFER_LOAD_FORMAT_XYZ">; +defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v3i32, "BUFFER_LOAD_FORMAT_XYZ">; defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v4f32, "BUFFER_LOAD_FORMAT_XYZW">; defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v4i32, "BUFFER_LOAD_FORMAT_XYZW">; @@ -1131,8 +1184,14 @@ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, f32, "BUFFER_LOAD_DWORD">; defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, i32, "BUFFER_LOAD_DWORD">; defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2f32, 
"BUFFER_LOAD_DWORDX2">; defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2i32, "BUFFER_LOAD_DWORDX2">; +defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v3f32, "BUFFER_LOAD_DWORDX3">; +defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v3i32, "BUFFER_LOAD_DWORDX3">; defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4f32, "BUFFER_LOAD_DWORDX4">; defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4i32, "BUFFER_LOAD_DWORDX4">; +defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_byte, i32, "BUFFER_LOAD_SBYTE">; +defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_short, i32, "BUFFER_LOAD_SSHORT">; +defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_ubyte, i32, "BUFFER_LOAD_UBYTE">; +defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_ushort, i32, "BUFFER_LOAD_USHORT">; multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, string opcode> { @@ -1140,21 +1199,23 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, (name vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset, imm:$cachepolicy, 0), (!cast<MUBUF_Pseudo>(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset, (as_i16imm $offset), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0) + (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) >; def : GCNPat< (name vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset, imm:$cachepolicy, 0), (!cast<MUBUF_Pseudo>(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset, - (as_i16imm $offset), (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0) + (as_i16imm $offset), (extract_glc $cachepolicy), + (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) >; def : GCNPat< (name vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset, imm:$cachepolicy, imm), (!cast<MUBUF_Pseudo>(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset, - (as_i16imm $offset), (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0) + (as_i16imm $offset), (extract_glc $cachepolicy), + (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) >; def : GCNPat< @@ -1163,8 +1224,8 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, (!cast<MUBUF_Pseudo>(opcode # _BOTHEN_exact) $vdata, (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), - $rsrc, $soffset, (as_i16imm $offset), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0) + $rsrc, $soffset, (as_i16imm $offset), (extract_glc $cachepolicy), + (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) >; } @@ -1172,6 +1233,8 @@ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, f32, "BUFFER_STORE_FORMAT_ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, i32, "BUFFER_STORE_FORMAT_X">; defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v2f32, "BUFFER_STORE_FORMAT_XY">; defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v2i32, "BUFFER_STORE_FORMAT_XY">; +defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v3f32, "BUFFER_STORE_FORMAT_XYZ">; +defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v3i32, "BUFFER_STORE_FORMAT_XYZ">; defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v4f32, "BUFFER_STORE_FORMAT_XYZW">; defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v4i32, "BUFFER_STORE_FORMAT_XYZW">; @@ -1195,42 +1258,47 @@ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, f32, "BUFFER_STORE_DWORD">; defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, i32, "BUFFER_STORE_DWORD">; defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2f32, "BUFFER_STORE_DWORDX2">; defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2i32, "BUFFER_STORE_DWORDX2">; +defm : 
MUBUF_StoreIntrinsicPat<SIbuffer_store, v3f32, "BUFFER_STORE_DWORDX3">; +defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v3i32, "BUFFER_STORE_DWORDX3">; defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4f32, "BUFFER_STORE_DWORDX4">; defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4i32, "BUFFER_STORE_DWORDX4">; +defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_byte, i32, "BUFFER_STORE_BYTE">; +defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_short, i32, "BUFFER_STORE_SHORT">; //===----------------------------------------------------------------------===// // buffer_atomic patterns //===----------------------------------------------------------------------===// -multiclass BufferAtomicPatterns<SDPatternOperator name, string opcode> { +multiclass BufferAtomicPatterns<SDPatternOperator name, ValueType vt, + string opcode> { def : GCNPat< - (name i32:$vdata_in, v4i32:$rsrc, 0, + (vt (name vt:$vdata_in, v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset, - imm:$cachepolicy, 0), + imm:$cachepolicy, 0)), (!cast<MUBUF_Pseudo>(opcode # _OFFSET_RTN) $vdata_in, $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy)) >; def : GCNPat< - (name i32:$vdata_in, v4i32:$rsrc, i32:$vindex, + (vt (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset, - imm:$cachepolicy, imm), + imm:$cachepolicy, imm)), (!cast<MUBUF_Pseudo>(opcode # _IDXEN_RTN) $vdata_in, $vindex, $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy)) >; def : GCNPat< - (name i32:$vdata_in, v4i32:$rsrc, 0, + (vt (name vt:$vdata_in, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset, - imm:$cachepolicy, 0), + imm:$cachepolicy, 0)), (!cast<MUBUF_Pseudo>(opcode # _OFFEN_RTN) $vdata_in, $voffset, $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy)) >; def : GCNPat< - (name i32:$vdata_in, v4i32:$rsrc, i32:$vindex, + (vt (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, imm:$offset, - imm:$cachepolicy, imm), + imm:$cachepolicy, imm)), (!cast<MUBUF_Pseudo>(opcode # _BOTHEN_RTN) $vdata_in, (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), @@ -1238,16 +1306,66 @@ multiclass BufferAtomicPatterns<SDPatternOperator name, string opcode> { >; } -defm : BufferAtomicPatterns<SIbuffer_atomic_swap, "BUFFER_ATOMIC_SWAP">; -defm : BufferAtomicPatterns<SIbuffer_atomic_add, "BUFFER_ATOMIC_ADD">; -defm : BufferAtomicPatterns<SIbuffer_atomic_sub, "BUFFER_ATOMIC_SUB">; -defm : BufferAtomicPatterns<SIbuffer_atomic_smin, "BUFFER_ATOMIC_SMIN">; -defm : BufferAtomicPatterns<SIbuffer_atomic_umin, "BUFFER_ATOMIC_UMIN">; -defm : BufferAtomicPatterns<SIbuffer_atomic_smax, "BUFFER_ATOMIC_SMAX">; -defm : BufferAtomicPatterns<SIbuffer_atomic_umax, "BUFFER_ATOMIC_UMAX">; -defm : BufferAtomicPatterns<SIbuffer_atomic_and, "BUFFER_ATOMIC_AND">; -defm : BufferAtomicPatterns<SIbuffer_atomic_or, "BUFFER_ATOMIC_OR">; -defm : BufferAtomicPatterns<SIbuffer_atomic_xor, "BUFFER_ATOMIC_XOR">; +defm : BufferAtomicPatterns<SIbuffer_atomic_swap, i32, "BUFFER_ATOMIC_SWAP">; +defm : BufferAtomicPatterns<SIbuffer_atomic_add, i32, "BUFFER_ATOMIC_ADD">; +defm : BufferAtomicPatterns<SIbuffer_atomic_sub, i32, "BUFFER_ATOMIC_SUB">; +defm : BufferAtomicPatterns<SIbuffer_atomic_smin, i32, "BUFFER_ATOMIC_SMIN">; +defm : BufferAtomicPatterns<SIbuffer_atomic_umin, i32, "BUFFER_ATOMIC_UMIN">; +defm : BufferAtomicPatterns<SIbuffer_atomic_smax, i32, "BUFFER_ATOMIC_SMAX">; +defm : BufferAtomicPatterns<SIbuffer_atomic_umax, i32, "BUFFER_ATOMIC_UMAX">; +defm : BufferAtomicPatterns<SIbuffer_atomic_and, i32, "BUFFER_ATOMIC_AND">; +defm : 
BufferAtomicPatterns<SIbuffer_atomic_or, i32, "BUFFER_ATOMIC_OR">; +defm : BufferAtomicPatterns<SIbuffer_atomic_xor, i32, "BUFFER_ATOMIC_XOR">; +defm : BufferAtomicPatterns<SIbuffer_atomic_swap, i64, "BUFFER_ATOMIC_SWAP_X2">; +defm : BufferAtomicPatterns<SIbuffer_atomic_add, i64, "BUFFER_ATOMIC_ADD_X2">; +defm : BufferAtomicPatterns<SIbuffer_atomic_sub, i64, "BUFFER_ATOMIC_SUB_X2">; +defm : BufferAtomicPatterns<SIbuffer_atomic_smin, i64, "BUFFER_ATOMIC_SMIN_X2">; +defm : BufferAtomicPatterns<SIbuffer_atomic_umin, i64, "BUFFER_ATOMIC_UMIN_X2">; +defm : BufferAtomicPatterns<SIbuffer_atomic_smax, i64, "BUFFER_ATOMIC_SMAX_X2">; +defm : BufferAtomicPatterns<SIbuffer_atomic_umax, i64, "BUFFER_ATOMIC_UMAX_X2">; +defm : BufferAtomicPatterns<SIbuffer_atomic_and, i64, "BUFFER_ATOMIC_AND_X2">; +defm : BufferAtomicPatterns<SIbuffer_atomic_or, i64, "BUFFER_ATOMIC_OR_X2">; +defm : BufferAtomicPatterns<SIbuffer_atomic_xor, i64, "BUFFER_ATOMIC_XOR_X2">; + +multiclass BufferAtomicPatterns_NO_RTN<SDPatternOperator name, ValueType vt, + string opcode> { + def : GCNPat< + (name vt:$vdata_in, v4i32:$rsrc, 0, + 0, i32:$soffset, imm:$offset, + imm:$cachepolicy, 0), + (!cast<MUBUF_Pseudo>(opcode # _OFFSET) $vdata_in, $rsrc, $soffset, + (as_i16imm $offset), (extract_slc $cachepolicy)) + >; + + def : GCNPat< + (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex, + 0, i32:$soffset, imm:$offset, + imm:$cachepolicy, imm), + (!cast<MUBUF_Pseudo>(opcode # _IDXEN) $vdata_in, $vindex, $rsrc, $soffset, + (as_i16imm $offset), (extract_slc $cachepolicy)) + >; + + def : GCNPat< + (name vt:$vdata_in, v4i32:$rsrc, 0, + i32:$voffset, i32:$soffset, imm:$offset, + imm:$cachepolicy, 0), + (!cast<MUBUF_Pseudo>(opcode # _OFFEN) $vdata_in, $voffset, $rsrc, $soffset, + (as_i16imm $offset), (extract_slc $cachepolicy)) + >; + + def : GCNPat< + (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex, + i32:$voffset, i32:$soffset, imm:$offset, + imm:$cachepolicy, imm), + (!cast<MUBUF_Pseudo>(opcode # _BOTHEN) + $vdata_in, + (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), + $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy)) + >; +} + +defm : BufferAtomicPatterns_NO_RTN<SIbuffer_atomic_fadd, f32, "BUFFER_ATOMIC_ADD_F32">; +defm : BufferAtomicPatterns_NO_RTN<SIbuffer_atomic_pk_fadd, v2f16, "BUFFER_ATOMIC_PK_ADD_F16">; def : GCNPat< (SIbuffer_atomic_cmpswap @@ -1298,12 +1416,11 @@ def : GCNPat< sub0) >; - class MUBUFLoad_PatternADDR64 <MUBUF_Pseudo Instr_ADDR64, ValueType vt, PatFrag constant_ld> : GCNPat < (vt (constant_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, - i16:$offset, i1:$glc, i1:$slc, i1:$tfe))), - (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, $glc, $slc, $tfe) + i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))), + (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, $glc, $slc, $tfe, $dlc) >; multiclass MUBUFLoad_Atomic_Pattern <MUBUF_Pseudo Instr_ADDR64, MUBUF_Pseudo Instr_OFFSET, @@ -1311,43 +1428,47 @@ multiclass MUBUFLoad_Atomic_Pattern <MUBUF_Pseudo Instr_ADDR64, MUBUF_Pseudo Ins def : GCNPat < (vt (atomic_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$slc))), - (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, 0, $slc, 0) + (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, 0, $slc, 0, 0) >; def : GCNPat < (vt (atomic_ld (MUBUFOffsetNoGLC v4i32:$rsrc, i32:$soffset, i16:$offset))), - (Instr_OFFSET $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0) + (Instr_OFFSET $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0, 0) >; } -let SubtargetPredicate = isSICI in { +let SubtargetPredicate = isGFX6GFX7 
in { def : MUBUFLoad_PatternADDR64 <BUFFER_LOAD_SBYTE_ADDR64, i32, sextloadi8_constant>; -def : MUBUFLoad_PatternADDR64 <BUFFER_LOAD_UBYTE_ADDR64, i32, az_extloadi8_constant>; +def : MUBUFLoad_PatternADDR64 <BUFFER_LOAD_UBYTE_ADDR64, i32, extloadi8_constant>; +def : MUBUFLoad_PatternADDR64 <BUFFER_LOAD_UBYTE_ADDR64, i32, zextloadi8_constant>; def : MUBUFLoad_PatternADDR64 <BUFFER_LOAD_SSHORT_ADDR64, i32, sextloadi16_constant>; -def : MUBUFLoad_PatternADDR64 <BUFFER_LOAD_USHORT_ADDR64, i32, az_extloadi16_constant>; +def : MUBUFLoad_PatternADDR64 <BUFFER_LOAD_USHORT_ADDR64, i32, extloadi16_constant>; +def : MUBUFLoad_PatternADDR64 <BUFFER_LOAD_USHORT_ADDR64, i32, zextloadi16_constant>; -defm : MUBUFLoad_Atomic_Pattern <BUFFER_LOAD_DWORD_ADDR64, BUFFER_LOAD_DWORD_OFFSET, i32, mubuf_load_atomic>; -defm : MUBUFLoad_Atomic_Pattern <BUFFER_LOAD_DWORDX2_ADDR64, BUFFER_LOAD_DWORDX2_OFFSET, i64, mubuf_load_atomic>; -} // End SubtargetPredicate = isSICI +defm : MUBUFLoad_Atomic_Pattern <BUFFER_LOAD_DWORD_ADDR64, BUFFER_LOAD_DWORD_OFFSET, i32, atomic_load_32_global>; +defm : MUBUFLoad_Atomic_Pattern <BUFFER_LOAD_DWORDX2_ADDR64, BUFFER_LOAD_DWORDX2_OFFSET, i64, atomic_load_64_global>; +} // End SubtargetPredicate = isGFX6GFX7 multiclass MUBUFLoad_Pattern <MUBUF_Pseudo Instr_OFFSET, ValueType vt, PatFrag ld> { def : GCNPat < (vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, - i16:$offset, i1:$glc, i1:$slc, i1:$tfe))), - (Instr_OFFSET $srsrc, $soffset, $offset, $glc, $slc, $tfe) + i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))), + (Instr_OFFSET $srsrc, $soffset, $offset, $glc, $slc, $tfe, $dlc) >; } let OtherPredicates = [Has16BitInsts] in { defm : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_OFFSET, i16, sextloadi8_constant>; -defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_OFFSET, i16, az_extloadi8_constant>; -defm : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_OFFSET, i16, mubuf_sextloadi8>; -defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_OFFSET, i16, mubuf_az_extloadi8>; +defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_OFFSET, i16, extloadi8_constant>; +defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_OFFSET, i16, zextloadi8_constant>; +defm : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_OFFSET, i16, sextloadi8_global>; +defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_OFFSET, i16, extloadi8_global>; +defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_OFFSET, i16, zextloadi8_global>; -defm : MUBUFLoad_Pattern <BUFFER_LOAD_USHORT_OFFSET, i16, mubuf_load>; +defm : MUBUFLoad_Pattern <BUFFER_LOAD_USHORT_OFFSET, i16, load_global>; } // End OtherPredicates = [Has16BitInsts] @@ -1357,111 +1478,79 @@ multiclass MUBUFScratchLoadPat <MUBUF_Pseudo InstrOffen, def : GCNPat < (vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, i32:$soffset, u16imm:$offset))), - (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0) + (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, 0) >; def : GCNPat < (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset))), - (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0) + (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, 0) >; } // XXX - Is it possible to have a complex pattern in a PatFrag? 
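A note on the dlc plumbing visible in the intrinsic patterns above: the buffer intrinsics carry a single i32 cachepolicy immediate, and the extract_glc/extract_slc/extract_dlc SDNodeXForms peel individual cache-control bits out of it at selection time. A minimal C++ sketch of that decomposition, assuming glc sits in bit 0 (only the slc and dlc extractions are spelled out in this hunk; extract_glc is defined outside it):

  #include <cstdint>

  // Sketch only: mirrors the extract_* SDNodeXForms in this patch.
  struct BufferCachePolicy {
    bool GLC; // bit 0 (assumed, by analogy with the transforms below)
    bool SLC; // bit 1, per extract_slc: (value >> 1) & 1
    bool DLC; // bit 2, per extract_dlc: (value >> 2) & 1 (GFX10 only)
  };

  static inline BufferCachePolicy decodeCachePolicy(uint32_t Value) {
    return {(Value & 1) != 0, ((Value >> 1) & 1) != 0, ((Value >> 2) & 1) != 0};
  }

This is why every pattern result above grows a trailing (extract_dlc $cachepolicy) operand while the intrinsic signatures themselves stay unchanged.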
-multiclass MUBUFScratchLoadPat_Hi16 <MUBUF_Pseudo InstrOffen, +multiclass MUBUFScratchLoadPat_D16 <MUBUF_Pseudo InstrOffen, MUBUF_Pseudo InstrOffset, - ValueType vt, PatFrag ld> { - def : GCNPat < - (build_vector vt:$lo, (vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, - i32:$soffset, u16imm:$offset)))), - (v2i16 (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, $lo)) - >; - - def : GCNPat < - (build_vector f16:$lo, (f16 (bitconvert (vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, - i32:$soffset, u16imm:$offset)))))), - (v2f16 (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, $lo)) - >; - - - def : GCNPat < - (build_vector vt:$lo, (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset)))), - (v2i16 (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, $lo)) - >; - - def : GCNPat < - (build_vector f16:$lo, (f16 (bitconvert (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset)))))), - (v2f16 (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, $lo)) - >; -} - -multiclass MUBUFScratchLoadPat_Lo16 <MUBUF_Pseudo InstrOffen, - MUBUF_Pseudo InstrOffset, - ValueType vt, PatFrag ld> { - def : GCNPat < - (build_vector (vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, - i32:$soffset, u16imm:$offset))), - (vt (Hi16Elt vt:$hi))), - (v2i16 (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, $hi)) - >; - + ValueType vt, PatFrag ld_frag> { def : GCNPat < - (build_vector (f16 (bitconvert (vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, - i32:$soffset, u16imm:$offset))))), - (f16 (Hi16Elt f16:$hi))), - (v2f16 (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, $hi)) + (ld_frag (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, i32:$soffset, u16imm:$offset), vt:$in), + (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, 0, $in) >; def : GCNPat < - (build_vector (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset))), - (vt (Hi16Elt vt:$hi))), - (v2i16 (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, $hi)) - >; - - def : GCNPat < - (build_vector (f16 (bitconvert (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset))))), - (f16 (Hi16Elt f16:$hi))), - (v2f16 (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, $hi)) + (ld_frag (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset), vt:$in), + (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, 0, $in) >; } defm : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, BUFFER_LOAD_SBYTE_OFFSET, i32, sextloadi8_private>; -defm : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, BUFFER_LOAD_UBYTE_OFFSET, i32, az_extloadi8_private>; +defm : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, BUFFER_LOAD_UBYTE_OFFSET, i32, extloadi8_private>; +defm : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, BUFFER_LOAD_UBYTE_OFFSET, i32, zextloadi8_private>; defm : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, BUFFER_LOAD_SBYTE_OFFSET, i16, sextloadi8_private>; -defm : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, BUFFER_LOAD_UBYTE_OFFSET, i16, az_extloadi8_private>; +defm : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, BUFFER_LOAD_UBYTE_OFFSET, i16, extloadi8_private>; +defm : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, BUFFER_LOAD_UBYTE_OFFSET, i16, zextloadi8_private>; defm : MUBUFScratchLoadPat <BUFFER_LOAD_SSHORT_OFFEN, BUFFER_LOAD_SSHORT_OFFSET, i32, sextloadi16_private>; -defm : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, BUFFER_LOAD_USHORT_OFFSET, i32, az_extloadi16_private>; +defm : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, BUFFER_LOAD_USHORT_OFFSET, i32, extloadi16_private>; +defm : 
MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, BUFFER_LOAD_USHORT_OFFSET, i32, zextloadi16_private>;
defm : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, BUFFER_LOAD_USHORT_OFFSET, i16, load_private>;

defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORD_OFFEN, BUFFER_LOAD_DWORD_OFFSET, i32, load_private>;
defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX2_OFFEN, BUFFER_LOAD_DWORDX2_OFFSET, v2i32, load_private>;
+defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX3_OFFEN, BUFFER_LOAD_DWORDX3_OFFSET, v3i32, load_private>;
defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX4_OFFEN, BUFFER_LOAD_DWORDX4_OFFSET, v4i32, load_private>;

let OtherPredicates = [D16PreservesUnusedBits] in {
-defm : MUBUFScratchLoadPat_Hi16<BUFFER_LOAD_SHORT_D16_HI_OFFEN, BUFFER_LOAD_SHORT_D16_HI_OFFSET, i16, load_private>;
-defm : MUBUFScratchLoadPat_Hi16<BUFFER_LOAD_UBYTE_D16_HI_OFFEN, BUFFER_LOAD_UBYTE_D16_HI_OFFSET, i16, az_extloadi8_private>;
-defm : MUBUFScratchLoadPat_Hi16<BUFFER_LOAD_SBYTE_D16_HI_OFFEN, BUFFER_LOAD_SBYTE_D16_HI_OFFSET, i16, sextloadi8_private>;
+defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SHORT_D16_HI_OFFEN, BUFFER_LOAD_SHORT_D16_HI_OFFSET, v2i16, load_d16_hi_private>;
+defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_UBYTE_D16_HI_OFFEN, BUFFER_LOAD_UBYTE_D16_HI_OFFSET, v2i16, az_extloadi8_d16_hi_private>;
+defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SBYTE_D16_HI_OFFEN, BUFFER_LOAD_SBYTE_D16_HI_OFFSET, v2i16, sextloadi8_d16_hi_private>;
+defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SHORT_D16_HI_OFFEN, BUFFER_LOAD_SHORT_D16_HI_OFFSET, v2f16, load_d16_hi_private>;
+defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_UBYTE_D16_HI_OFFEN, BUFFER_LOAD_UBYTE_D16_HI_OFFSET, v2f16, az_extloadi8_d16_hi_private>;
+defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SBYTE_D16_HI_OFFEN, BUFFER_LOAD_SBYTE_D16_HI_OFFSET, v2f16, sextloadi8_d16_hi_private>;

-defm : MUBUFScratchLoadPat_Lo16<BUFFER_LOAD_SHORT_D16_OFFEN, BUFFER_LOAD_SHORT_D16_OFFSET, i16, load_private>;
-defm : MUBUFScratchLoadPat_Lo16<BUFFER_LOAD_UBYTE_D16_OFFEN, BUFFER_LOAD_UBYTE_D16_OFFSET, i16, az_extloadi8_private>;
-defm : MUBUFScratchLoadPat_Lo16<BUFFER_LOAD_SBYTE_D16_OFFEN, BUFFER_LOAD_SBYTE_D16_OFFSET, i16, sextloadi8_private>;
+defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SHORT_D16_OFFEN, BUFFER_LOAD_SHORT_D16_OFFSET, v2i16, load_d16_lo_private>;
+defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_UBYTE_D16_OFFEN, BUFFER_LOAD_UBYTE_D16_OFFSET, v2i16, az_extloadi8_d16_lo_private>;
+defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SBYTE_D16_OFFEN, BUFFER_LOAD_SBYTE_D16_OFFSET, v2i16, sextloadi8_d16_lo_private>;
+defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SHORT_D16_OFFEN, BUFFER_LOAD_SHORT_D16_OFFSET, v2f16, load_d16_lo_private>;
+defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_UBYTE_D16_OFFEN, BUFFER_LOAD_UBYTE_D16_OFFSET, v2f16, az_extloadi8_d16_lo_private>;
+defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SBYTE_D16_OFFEN, BUFFER_LOAD_SBYTE_D16_OFFSET, v2f16, sextloadi8_d16_lo_private>;
}

+
multiclass MUBUFStore_Atomic_Pattern <MUBUF_Pseudo Instr_ADDR64, MUBUF_Pseudo Instr_OFFSET,
                                      ValueType vt, PatFrag atomic_st> {
  // Store follows atomic op convention so address is first
  def : GCNPat <
    (atomic_st (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
                            i16:$offset, i1:$slc), vt:$val),
-    (Instr_ADDR64 $val, $vaddr, $srsrc, $soffset, $offset, 0, $slc, 0)
+    (Instr_ADDR64 $val, $vaddr, $srsrc, $soffset, $offset, 0, $slc, 0, 0)
  >;

  def : GCNPat <
    (atomic_st (MUBUFOffsetNoGLC v4i32:$rsrc, i32:$soffset, i16:$offset), vt:$val),
-    (Instr_OFFSET $val,
$rsrc, $soffset, (as_i16imm $offset), 0, 0, 0, 0) >; } -let SubtargetPredicate = isSICI in { +let SubtargetPredicate = isGFX6GFX7 in { defm : MUBUFStore_Atomic_Pattern <BUFFER_STORE_DWORD_ADDR64, BUFFER_STORE_DWORD_OFFSET, i32, store_atomic_global>; defm : MUBUFStore_Atomic_Pattern <BUFFER_STORE_DWORDX2_ADDR64, BUFFER_STORE_DWORDX2_OFFSET, i64, store_atomic_global>; -} // End Predicates = isSICI +} // End Predicates = isGFX6GFX7 multiclass MUBUFStore_Pattern <MUBUF_Pseudo Instr_OFFSET, ValueType vt, @@ -1469,8 +1558,8 @@ multiclass MUBUFStore_Pattern <MUBUF_Pseudo Instr_OFFSET, ValueType vt, def : GCNPat < (st vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset, - i16:$offset, i1:$glc, i1:$slc, i1:$tfe)), - (Instr_OFFSET $vdata, $srsrc, $soffset, $offset, $glc, $slc, $tfe) + i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc)), + (Instr_OFFSET $vdata, $srsrc, $soffset, $offset, $glc, $slc, $tfe, $dlc) >; } @@ -1479,17 +1568,18 @@ defm : MUBUFStore_Pattern <BUFFER_STORE_SHORT_OFFSET, i16, store_global>; multiclass MUBUFScratchStorePat <MUBUF_Pseudo InstrOffen, MUBUF_Pseudo InstrOffset, - ValueType vt, PatFrag st> { + ValueType vt, PatFrag st, + RegisterClass rc = VGPR_32> { def : GCNPat < (st vt:$value, (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, i32:$soffset, u16imm:$offset)), - (InstrOffen $value, $vaddr, $srsrc, $soffset, $offset, 0, 0, 0) + (InstrOffen rc:$value, $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, 0) >; def : GCNPat < (st vt:$value, (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset)), - (InstrOffset $value, $srsrc, $soffset, $offset, 0, 0, 0) + (InstrOffset rc:$value, $srsrc, $soffset, $offset, 0, 0, 0, 0) >; } @@ -1498,8 +1588,9 @@ defm : MUBUFScratchStorePat <BUFFER_STORE_SHORT_OFFEN, BUFFER_STORE_SHORT_OFFSET defm : MUBUFScratchStorePat <BUFFER_STORE_BYTE_OFFEN, BUFFER_STORE_BYTE_OFFSET, i16, truncstorei8_private>; defm : MUBUFScratchStorePat <BUFFER_STORE_SHORT_OFFEN, BUFFER_STORE_SHORT_OFFSET, i16, store_private>; defm : MUBUFScratchStorePat <BUFFER_STORE_DWORD_OFFEN, BUFFER_STORE_DWORD_OFFSET, i32, store_private>; -defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX2_OFFEN, BUFFER_STORE_DWORDX2_OFFSET, v2i32, store_private>; -defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, BUFFER_STORE_DWORDX4_OFFSET, v4i32, store_private>; +defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX2_OFFEN, BUFFER_STORE_DWORDX2_OFFSET, v2i32, store_private, VReg_64>; +defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX3_OFFEN, BUFFER_STORE_DWORDX3_OFFSET, v3i32, store_private, VReg_96>; +defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, BUFFER_STORE_DWORDX4_OFFSET, v4i32, store_private, VReg_128>; let OtherPredicates = [D16PreservesUnusedBits] in { @@ -1526,7 +1617,7 @@ multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt, imm:$format, imm:$cachepolicy, 0)), (!cast<MTBUF_Pseudo>(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset), (as_i8imm $format), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0) + (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) >; def : GCNPat< @@ -1534,7 +1625,7 @@ multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt, imm:$format, imm:$cachepolicy, imm)), (!cast<MTBUF_Pseudo>(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset), (as_i8imm $format), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0) + (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) >; def : GCNPat< @@ -1542,7 +1633,7 @@ multiclass 
MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt, imm:$format, imm:$cachepolicy, 0)), (!cast<MTBUF_Pseudo>(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset), (as_i8imm $format), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0) + (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) >; def : GCNPat< @@ -1552,15 +1643,17 @@ multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt, (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), $rsrc, $soffset, (as_i16imm $offset), (as_i8imm $format), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0) + (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) >; } defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, i32, "TBUFFER_LOAD_FORMAT_X">; defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v2i32, "TBUFFER_LOAD_FORMAT_XY">; +defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v3i32, "TBUFFER_LOAD_FORMAT_XYZ">; defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v4i32, "TBUFFER_LOAD_FORMAT_XYZW">; defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, f32, "TBUFFER_LOAD_FORMAT_X">; defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v2f32, "TBUFFER_LOAD_FORMAT_XY">; +defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v3f32, "TBUFFER_LOAD_FORMAT_XYZ">; defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v4f32, "TBUFFER_LOAD_FORMAT_XYZW">; let SubtargetPredicate = HasUnpackedD16VMem in { @@ -1582,7 +1675,7 @@ multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, imm:$format, imm:$cachepolicy, 0), (!cast<MTBUF_Pseudo>(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset, (as_i16imm $offset), (as_i8imm $format), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0) + (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) >; def : GCNPat< @@ -1590,7 +1683,7 @@ multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, imm:$format, imm:$cachepolicy, imm), (!cast<MTBUF_Pseudo>(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset, (as_i16imm $offset), (as_i8imm $format), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0) + (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) >; def : GCNPat< @@ -1598,7 +1691,7 @@ multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, imm:$format, imm:$cachepolicy, 0), (!cast<MTBUF_Pseudo>(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset, (as_i16imm $offset), (as_i8imm $format), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0) + (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) >; def : GCNPat< @@ -1608,17 +1701,17 @@ multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, $vdata, (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), $rsrc, $soffset, (as_i16imm $offset), (as_i8imm $format), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0) + (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) >; } defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, i32, "TBUFFER_STORE_FORMAT_X">; defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v2i32, "TBUFFER_STORE_FORMAT_XY">; -defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_x3, v4i32, "TBUFFER_STORE_FORMAT_XYZ">; +defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v3i32, "TBUFFER_STORE_FORMAT_XYZ">; defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v4i32, "TBUFFER_STORE_FORMAT_XYZW">; defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, f32, "TBUFFER_STORE_FORMAT_X">; defm : 
MTBUF_StoreIntrinsicPat<SItbuffer_store, v2f32, "TBUFFER_STORE_FORMAT_XY">; -defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_x3, v4f32, "TBUFFER_STORE_FORMAT_XYZ">; +defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v3f32, "TBUFFER_STORE_FORMAT_XYZ">; defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v4f32, "TBUFFER_STORE_FORMAT_XYZW">; let SubtargetPredicate = HasUnpackedD16VMem in { @@ -1634,28 +1727,22 @@ let SubtargetPredicate = HasPackedD16VMem in { } // End HasPackedD16VMem. //===----------------------------------------------------------------------===// -// Target instructions, move to the appropriate target TD file +// Target-specific instruction encodings. //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// -// SI +// Base ENC_MUBUF for GFX6, GFX7, GFX10. //===----------------------------------------------------------------------===// -class MUBUF_Real_si <bits<7> op, MUBUF_Pseudo ps> : - MUBUF_Real<op, ps>, - Enc64, - SIMCInstr<ps.PseudoInstr, SIEncodingFamily.SI> { - let AssemblerPredicate=isSICI; - let DecoderNamespace="SICI"; - +class Base_MUBUF_Real_gfx6_gfx7_gfx10<bits<7> op, MUBUF_Pseudo ps, int ef> : + MUBUF_Real<ps>, Enc64, SIMCInstr<ps.PseudoInstr, ef> { let Inst{11-0} = !if(ps.has_offset, offset, ?); let Inst{12} = ps.offen; let Inst{13} = ps.idxen; let Inst{14} = !if(ps.has_glc, glc, ps.glc_value); - let Inst{15} = ps.addr64; let Inst{16} = !if(ps.lds, 1, 0); let Inst{24-18} = op; - let Inst{31-26} = 0x38; //encoding + let Inst{31-26} = 0x38; let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?); let Inst{47-40} = !if(ps.has_vdata, vdata, ?); let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?); @@ -1664,125 +1751,250 @@ class MUBUF_Real_si <bits<7> op, MUBUF_Pseudo ps> : let Inst{63-56} = !if(ps.has_soffset, soffset, ?); } -multiclass MUBUF_Real_AllAddr_si<bits<7> op> { - def _OFFSET_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>; - def _ADDR64_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_ADDR64")>; - def _OFFEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>; - def _IDXEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>; - def _BOTHEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>; -} - -multiclass MUBUF_Real_AllAddr_Lds_si<bits<7> op> { - - def _OFFSET_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>, - MUBUFLdsTable<0, NAME # "_OFFSET_si">; - def _ADDR64_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_ADDR64")>, - MUBUFLdsTable<0, NAME # "_ADDR64_si">; - def _OFFEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>, - MUBUFLdsTable<0, NAME # "_OFFEN_si">; - def _IDXEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>, - MUBUFLdsTable<0, NAME # "_IDXEN_si">; - def _BOTHEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>, - MUBUFLdsTable<0, NAME # "_BOTHEN_si">; - - def _LDS_OFFSET_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFSET")>, - MUBUFLdsTable<1, NAME # "_OFFSET_si">; - def _LDS_ADDR64_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_ADDR64")>, - MUBUFLdsTable<1, NAME # "_ADDR64_si">; - def _LDS_OFFEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFEN")>, - MUBUFLdsTable<1, NAME # "_OFFEN_si">; - def _LDS_IDXEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_IDXEN")>, - MUBUFLdsTable<1, NAME # "_IDXEN_si">; - def _LDS_BOTHEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_BOTHEN")>, - MUBUFLdsTable<1, NAME # "_BOTHEN_si">; 
-} - -multiclass MUBUF_Real_Atomic_si<bits<7> op> : MUBUF_Real_AllAddr_si<op> { - def _OFFSET_RTN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET_RTN")>; - def _ADDR64_RTN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_ADDR64_RTN")>; - def _OFFEN_RTN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN_RTN")>; - def _IDXEN_RTN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN_RTN")>; - def _BOTHEN_RTN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN_RTN")>; -} - -defm BUFFER_LOAD_FORMAT_X : MUBUF_Real_AllAddr_Lds_si <0x00>; -defm BUFFER_LOAD_FORMAT_XY : MUBUF_Real_AllAddr_si <0x01>; -defm BUFFER_LOAD_FORMAT_XYZ : MUBUF_Real_AllAddr_si <0x02>; -defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Real_AllAddr_si <0x03>; -defm BUFFER_STORE_FORMAT_X : MUBUF_Real_AllAddr_si <0x04>; -defm BUFFER_STORE_FORMAT_XY : MUBUF_Real_AllAddr_si <0x05>; -defm BUFFER_STORE_FORMAT_XYZ : MUBUF_Real_AllAddr_si <0x06>; -defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Real_AllAddr_si <0x07>; -defm BUFFER_LOAD_UBYTE : MUBUF_Real_AllAddr_Lds_si <0x08>; -defm BUFFER_LOAD_SBYTE : MUBUF_Real_AllAddr_Lds_si <0x09>; -defm BUFFER_LOAD_USHORT : MUBUF_Real_AllAddr_Lds_si <0x0a>; -defm BUFFER_LOAD_SSHORT : MUBUF_Real_AllAddr_Lds_si <0x0b>; -defm BUFFER_LOAD_DWORD : MUBUF_Real_AllAddr_Lds_si <0x0c>; -defm BUFFER_LOAD_DWORDX2 : MUBUF_Real_AllAddr_si <0x0d>; -defm BUFFER_LOAD_DWORDX4 : MUBUF_Real_AllAddr_si <0x0e>; -defm BUFFER_LOAD_DWORDX3 : MUBUF_Real_AllAddr_si <0x0f>; -defm BUFFER_STORE_BYTE : MUBUF_Real_AllAddr_si <0x18>; -defm BUFFER_STORE_SHORT : MUBUF_Real_AllAddr_si <0x1a>; -defm BUFFER_STORE_DWORD : MUBUF_Real_AllAddr_si <0x1c>; -defm BUFFER_STORE_DWORDX2 : MUBUF_Real_AllAddr_si <0x1d>; -defm BUFFER_STORE_DWORDX4 : MUBUF_Real_AllAddr_si <0x1e>; -defm BUFFER_STORE_DWORDX3 : MUBUF_Real_AllAddr_si <0x1f>; - -defm BUFFER_ATOMIC_SWAP : MUBUF_Real_Atomic_si <0x30>; -defm BUFFER_ATOMIC_CMPSWAP : MUBUF_Real_Atomic_si <0x31>; -defm BUFFER_ATOMIC_ADD : MUBUF_Real_Atomic_si <0x32>; -defm BUFFER_ATOMIC_SUB : MUBUF_Real_Atomic_si <0x33>; -//defm BUFFER_ATOMIC_RSUB : MUBUF_Real_Atomic_si <0x34>; // isn't on CI & VI -defm BUFFER_ATOMIC_SMIN : MUBUF_Real_Atomic_si <0x35>; -defm BUFFER_ATOMIC_UMIN : MUBUF_Real_Atomic_si <0x36>; -defm BUFFER_ATOMIC_SMAX : MUBUF_Real_Atomic_si <0x37>; -defm BUFFER_ATOMIC_UMAX : MUBUF_Real_Atomic_si <0x38>; -defm BUFFER_ATOMIC_AND : MUBUF_Real_Atomic_si <0x39>; -defm BUFFER_ATOMIC_OR : MUBUF_Real_Atomic_si <0x3a>; -defm BUFFER_ATOMIC_XOR : MUBUF_Real_Atomic_si <0x3b>; -defm BUFFER_ATOMIC_INC : MUBUF_Real_Atomic_si <0x3c>; -defm BUFFER_ATOMIC_DEC : MUBUF_Real_Atomic_si <0x3d>; - -//defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Real_Atomic_si <0x3e>; // isn't on VI -//defm BUFFER_ATOMIC_FMIN : MUBUF_Real_Atomic_si <0x3f>; // isn't on VI -//defm BUFFER_ATOMIC_FMAX : MUBUF_Real_Atomic_si <0x40>; // isn't on VI -defm BUFFER_ATOMIC_SWAP_X2 : MUBUF_Real_Atomic_si <0x50>; -defm BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_Real_Atomic_si <0x51>; -defm BUFFER_ATOMIC_ADD_X2 : MUBUF_Real_Atomic_si <0x52>; -defm BUFFER_ATOMIC_SUB_X2 : MUBUF_Real_Atomic_si <0x53>; -//defm BUFFER_ATOMIC_RSUB_X2 : MUBUF_Real_Atomic_si <0x54>; // isn't on CI & VI -defm BUFFER_ATOMIC_SMIN_X2 : MUBUF_Real_Atomic_si <0x55>; -defm BUFFER_ATOMIC_UMIN_X2 : MUBUF_Real_Atomic_si <0x56>; -defm BUFFER_ATOMIC_SMAX_X2 : MUBUF_Real_Atomic_si <0x57>; -defm BUFFER_ATOMIC_UMAX_X2 : MUBUF_Real_Atomic_si <0x58>; -defm BUFFER_ATOMIC_AND_X2 : MUBUF_Real_Atomic_si <0x59>; -defm BUFFER_ATOMIC_OR_X2 : MUBUF_Real_Atomic_si <0x5a>; -defm BUFFER_ATOMIC_XOR_X2 : 
MUBUF_Real_Atomic_si <0x5b>; -defm BUFFER_ATOMIC_INC_X2 : MUBUF_Real_Atomic_si <0x5c>; -defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Real_Atomic_si <0x5d>; -// FIXME: Need to handle hazard for BUFFER_ATOMIC_FCMPSWAP_X2 on CI. -//defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Real_Atomic_si <0x5e">; // isn't on VI -//defm BUFFER_ATOMIC_FMIN_X2 : MUBUF_Real_Atomic_si <0x5f>; // isn't on VI -//defm BUFFER_ATOMIC_FMAX_X2 : MUBUF_Real_Atomic_si <0x60>; // isn't on VI - -def BUFFER_WBINVL1_SC_si : MUBUF_Real_si <0x70, BUFFER_WBINVL1_SC>; -def BUFFER_WBINVL1_si : MUBUF_Real_si <0x71, BUFFER_WBINVL1>; - -class MTBUF_Real_si <bits<3> op, MTBUF_Pseudo ps> : - MTBUF_Real<ps>, - Enc64, - SIMCInstr<ps.PseudoInstr, SIEncodingFamily.SI> { - let AssemblerPredicate=isSICI; - let DecoderNamespace="SICI"; +class MUBUF_Real_gfx10<bits<8> op, MUBUF_Pseudo ps> : + Base_MUBUF_Real_gfx6_gfx7_gfx10<op{6-0}, ps, SIEncodingFamily.GFX10> { + let Inst{15} = !if(ps.has_dlc, dlc, ps.dlc_value); + let Inst{25} = op{7}; +} + +class MUBUF_Real_gfx6_gfx7<bits<8> op, MUBUF_Pseudo ps> : + Base_MUBUF_Real_gfx6_gfx7_gfx10<op{6-0}, ps, SIEncodingFamily.SI> { + let Inst{15} = ps.addr64; +} +//===----------------------------------------------------------------------===// +// MUBUF - GFX10. +//===----------------------------------------------------------------------===// + +let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { + multiclass MUBUF_Real_gfx10_with_name<bits<8> op, string opName, + string asmName> { + def _gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(opName)> { + MUBUF_Pseudo ps = !cast<MUBUF_Pseudo>(opName); + let AsmString = asmName # ps.AsmOperands; + } + } + multiclass MUBUF_Real_AllAddr_gfx10<bits<8> op> { + def _BOTHEN_gfx10 : + MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>; + def _IDXEN_gfx10 : + MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>; + def _OFFEN_gfx10 : + MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>; + def _OFFSET_gfx10 : + MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>; + } + multiclass MUBUF_Real_AllAddr_Lds_gfx10<bits<8> op> { + def _OFFSET_gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>, + MUBUFLdsTable<0, NAME # "_OFFSET_gfx10">; + def _OFFEN_gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>, + MUBUFLdsTable<0, NAME # "_OFFEN_gfx10">; + def _IDXEN_gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>, + MUBUFLdsTable<0, NAME # "_IDXEN_gfx10">; + def _BOTHEN_gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>, + MUBUFLdsTable<0, NAME # "_BOTHEN_gfx10">; + + def _LDS_OFFSET_gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFSET")>, + MUBUFLdsTable<1, NAME # "_OFFSET_gfx10">; + def _LDS_OFFEN_gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFEN")>, + MUBUFLdsTable<1, NAME # "_OFFEN_gfx10">; + def _LDS_IDXEN_gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_IDXEN")>, + MUBUFLdsTable<1, NAME # "_IDXEN_gfx10">; + def _LDS_BOTHEN_gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_BOTHEN")>, + MUBUFLdsTable<1, NAME # "_BOTHEN_gfx10">; + } + multiclass MUBUF_Real_Atomics_gfx10<bits<8> op> : + MUBUF_Real_AllAddr_gfx10<op> { + def _BOTHEN_RTN_gfx10 : + MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN_RTN")>; + def _IDXEN_RTN_gfx10 : + MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN_RTN")>; + def _OFFEN_RTN_gfx10 : + MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN_RTN")>; + def _OFFSET_RTN_gfx10 : + MUBUF_Real_gfx10<op, 
!cast<MUBUF_Pseudo>(NAME#"_OFFSET_RTN")>; + } +} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" + +defm BUFFER_STORE_BYTE_D16_HI : MUBUF_Real_AllAddr_gfx10<0x019>; +defm BUFFER_STORE_SHORT_D16_HI : MUBUF_Real_AllAddr_gfx10<0x01b>; +defm BUFFER_LOAD_UBYTE_D16 : MUBUF_Real_AllAddr_gfx10<0x020>; +defm BUFFER_LOAD_UBYTE_D16_HI : MUBUF_Real_AllAddr_gfx10<0x021>; +defm BUFFER_LOAD_SBYTE_D16 : MUBUF_Real_AllAddr_gfx10<0x022>; +defm BUFFER_LOAD_SBYTE_D16_HI : MUBUF_Real_AllAddr_gfx10<0x023>; +defm BUFFER_LOAD_SHORT_D16 : MUBUF_Real_AllAddr_gfx10<0x024>; +defm BUFFER_LOAD_SHORT_D16_HI : MUBUF_Real_AllAddr_gfx10<0x025>; +// FIXME-GFX10: Add following instructions: +//defm BUFFER_LOAD_FORMAT_D16_HI_X : MUBUF_Real_AllAddr_gfx10<0x026>; +//defm BUFFER_STORE_FORMAT_D16_HI_X : MUBUF_Real_AllAddr_gfx10<0x027>; +defm BUFFER_LOAD_FORMAT_D16_X : MUBUF_Real_AllAddr_gfx10<0x080>; +defm BUFFER_LOAD_FORMAT_D16_XY : MUBUF_Real_AllAddr_gfx10<0x081>; +defm BUFFER_LOAD_FORMAT_D16_XYZ : MUBUF_Real_AllAddr_gfx10<0x082>; +defm BUFFER_LOAD_FORMAT_D16_XYZW : MUBUF_Real_AllAddr_gfx10<0x083>; +defm BUFFER_STORE_FORMAT_D16_X : MUBUF_Real_AllAddr_gfx10<0x084>; +defm BUFFER_STORE_FORMAT_D16_XY : MUBUF_Real_AllAddr_gfx10<0x085>; +defm BUFFER_STORE_FORMAT_D16_XYZ : MUBUF_Real_AllAddr_gfx10<0x086>; +defm BUFFER_STORE_FORMAT_D16_XYZW : MUBUF_Real_AllAddr_gfx10<0x087>; + +def BUFFER_GL0_INV_gfx10 : + MUBUF_Real_gfx10<0x071, BUFFER_GL0_INV>; +def BUFFER_GL1_INV_gfx10 : + MUBUF_Real_gfx10<0x072, BUFFER_GL1_INV>; + +//===----------------------------------------------------------------------===// +// MUBUF - GFX6, GFX7, GFX10. +//===----------------------------------------------------------------------===// + +let AssemblerPredicate = isGFX6, DecoderNamespace = "GFX6" in { + multiclass MUBUF_Real_gfx6<bits<8> op> { + def _gfx6 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME)>; + } +} // End AssemblerPredicate = isGFX6, DecoderNamespace = "GFX6" + +let AssemblerPredicate = isGFX7Only, DecoderNamespace = "GFX7" in { + multiclass MUBUF_Real_gfx7<bits<8> op> { + def _gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME)>; + } +} // End AssemblerPredicate = isGFX7Only, DecoderNamespace = "GFX7" + +let AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" in { + multiclass MUBUF_Real_AllAddr_gfx6_gfx7<bits<8> op> { + def _ADDR64_gfx6_gfx7 : + MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_ADDR64")>; + def _BOTHEN_gfx6_gfx7 : + MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>; + def _IDXEN_gfx6_gfx7 : + MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>; + def _OFFEN_gfx6_gfx7 : + MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>; + def _OFFSET_gfx6_gfx7 : + MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>; + } + multiclass MUBUF_Real_AllAddr_Lds_gfx6_gfx7<bits<8> op> { + def _OFFSET_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>, + MUBUFLdsTable<0, NAME # "_OFFSET_gfx6_gfx7">; + def _ADDR64_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_ADDR64")>, + MUBUFLdsTable<0, NAME # "_ADDR64_gfx6_gfx7">; + def _OFFEN_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>, + MUBUFLdsTable<0, NAME # "_OFFEN_gfx6_gfx7">; + def _IDXEN_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>, + MUBUFLdsTable<0, NAME # "_IDXEN_gfx6_gfx7">; + def _BOTHEN_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>, + MUBUFLdsTable<0, NAME # "_BOTHEN_gfx6_gfx7">; + + def 
_LDS_OFFSET_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFSET")>, + MUBUFLdsTable<1, NAME # "_OFFSET_gfx6_gfx7">; + def _LDS_ADDR64_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_ADDR64")>, + MUBUFLdsTable<1, NAME # "_ADDR64_gfx6_gfx7">; + def _LDS_OFFEN_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFEN")>, + MUBUFLdsTable<1, NAME # "_OFFEN_gfx6_gfx7">; + def _LDS_IDXEN_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_IDXEN")>, + MUBUFLdsTable<1, NAME # "_IDXEN_gfx6_gfx7">; + def _LDS_BOTHEN_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_BOTHEN")>, + MUBUFLdsTable<1, NAME # "_BOTHEN_gfx6_gfx7">; + } + multiclass MUBUF_Real_Atomics_gfx6_gfx7<bits<8> op> : + MUBUF_Real_AllAddr_gfx6_gfx7<op> { + def _ADDR64_RTN_gfx6_gfx7 : + MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_ADDR64_RTN")>; + def _BOTHEN_RTN_gfx6_gfx7 : + MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN_RTN")>; + def _IDXEN_RTN_gfx6_gfx7 : + MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN_RTN")>; + def _OFFEN_RTN_gfx6_gfx7 : + MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN_RTN")>; + def _OFFSET_RTN_gfx6_gfx7 : + MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET_RTN")>; + } +} // End AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" + +multiclass MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<bits<8> op> : + MUBUF_Real_AllAddr_gfx6_gfx7<op>, MUBUF_Real_AllAddr_gfx10<op>; + +multiclass MUBUF_Real_AllAddr_Lds_gfx6_gfx7_gfx10<bits<8> op> : + MUBUF_Real_AllAddr_Lds_gfx6_gfx7<op>, MUBUF_Real_AllAddr_Lds_gfx10<op>; + +multiclass MUBUF_Real_Atomics_gfx6_gfx7_gfx10<bits<8> op> : + MUBUF_Real_Atomics_gfx6_gfx7<op>, MUBUF_Real_Atomics_gfx10<op>; + +// FIXME-GFX6: Following instructions are available only on GFX6. 
+//defm BUFFER_ATOMIC_RSUB : MUBUF_Real_Atomics_gfx6 <0x034>; +//defm BUFFER_ATOMIC_RSUB_X2 : MUBUF_Real_Atomics_gfx6 <0x054>; + +defm BUFFER_LOAD_FORMAT_X : MUBUF_Real_AllAddr_Lds_gfx6_gfx7_gfx10<0x000>; +defm BUFFER_LOAD_FORMAT_XY : MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x001>; +defm BUFFER_LOAD_FORMAT_XYZ : MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x002>; +defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x003>; +defm BUFFER_STORE_FORMAT_X : MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x004>; +defm BUFFER_STORE_FORMAT_XY : MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x005>; +defm BUFFER_STORE_FORMAT_XYZ : MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x006>; +defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x007>; +defm BUFFER_LOAD_UBYTE : MUBUF_Real_AllAddr_Lds_gfx6_gfx7_gfx10<0x008>; +defm BUFFER_LOAD_SBYTE : MUBUF_Real_AllAddr_Lds_gfx6_gfx7_gfx10<0x009>; +defm BUFFER_LOAD_USHORT : MUBUF_Real_AllAddr_Lds_gfx6_gfx7_gfx10<0x00a>; +defm BUFFER_LOAD_SSHORT : MUBUF_Real_AllAddr_Lds_gfx6_gfx7_gfx10<0x00b>; +defm BUFFER_LOAD_DWORD : MUBUF_Real_AllAddr_Lds_gfx6_gfx7_gfx10<0x00c>; +defm BUFFER_LOAD_DWORDX2 : MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x00d>; +defm BUFFER_LOAD_DWORDX4 : MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x00e>; +defm BUFFER_LOAD_DWORDX3 : MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x00f>; +defm BUFFER_STORE_BYTE : MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x018>; +defm BUFFER_STORE_SHORT : MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x01a>; +defm BUFFER_STORE_DWORD : MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x01c>; +defm BUFFER_STORE_DWORDX2 : MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x01d>; +defm BUFFER_STORE_DWORDX4 : MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x01e>; +defm BUFFER_STORE_DWORDX3 : MUBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x01f>; + +defm BUFFER_ATOMIC_SWAP : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x030>; +defm BUFFER_ATOMIC_CMPSWAP : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x031>; +defm BUFFER_ATOMIC_ADD : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x032>; +defm BUFFER_ATOMIC_SUB : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x033>; +defm BUFFER_ATOMIC_SMIN : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x035>; +defm BUFFER_ATOMIC_UMIN : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x036>; +defm BUFFER_ATOMIC_SMAX : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x037>; +defm BUFFER_ATOMIC_UMAX : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x038>; +defm BUFFER_ATOMIC_AND : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x039>; +defm BUFFER_ATOMIC_OR : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x03a>; +defm BUFFER_ATOMIC_XOR : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x03b>; +defm BUFFER_ATOMIC_INC : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x03c>; +defm BUFFER_ATOMIC_DEC : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x03d>; +// FIXME-GFX6-GFX7-GFX10: Add following instructions: +//defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x03e>; +//defm BUFFER_ATOMIC_FMIN : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x03f>; +//defm BUFFER_ATOMIC_FMAX : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x040>; +defm BUFFER_ATOMIC_SWAP_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x050>; +defm BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x051>; +defm BUFFER_ATOMIC_ADD_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x052>; +defm BUFFER_ATOMIC_SUB_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x053>; +defm BUFFER_ATOMIC_SMIN_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x055>; +defm BUFFER_ATOMIC_UMIN_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x056>; +defm BUFFER_ATOMIC_SMAX_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x057>; +defm BUFFER_ATOMIC_UMAX_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x058>; +defm BUFFER_ATOMIC_AND_X2 : 
MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x059>; +defm BUFFER_ATOMIC_OR_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05a>; +defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05b>; +defm BUFFER_ATOMIC_INC_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05c>; +defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05d>; +// FIXME-GFX7: Need to handle hazard for BUFFER_ATOMIC_FCMPSWAP_X2 on GFX7. +// FIXME-GFX6-GFX7-GFX10: Add following instructions: +//defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05e>; +//defm BUFFER_ATOMIC_FMIN_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05f>; +//defm BUFFER_ATOMIC_FMAX_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x060>; + +defm BUFFER_WBINVL1_SC : MUBUF_Real_gfx6<0x070>; +defm BUFFER_WBINVL1_VOL : MUBUF_Real_gfx7<0x070>; +def BUFFER_WBINVL1_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<0x071, BUFFER_WBINVL1>; + +//===----------------------------------------------------------------------===// +// Base ENC_MTBUF for GFX6, GFX7, GFX10. +//===----------------------------------------------------------------------===// + +class Base_MTBUF_Real_gfx6_gfx7_gfx10<bits<3> op, MTBUF_Pseudo ps, int ef> : + MTBUF_Real<ps>, Enc64, SIMCInstr<ps.PseudoInstr, ef> { let Inst{11-0} = !if(ps.has_offset, offset, ?); let Inst{12} = ps.offen; let Inst{13} = ps.idxen; let Inst{14} = !if(ps.has_glc, glc, ps.glc_value); - let Inst{15} = ps.addr64; let Inst{18-16} = op; - let Inst{22-19} = dfmt; - let Inst{25-23} = nfmt; let Inst{31-26} = 0x3a; //encoding let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?); let Inst{47-40} = !if(ps.has_vdata, vdata, ?); @@ -1792,47 +2004,87 @@ class MTBUF_Real_si <bits<3> op, MTBUF_Pseudo ps> : let Inst{63-56} = !if(ps.has_soffset, soffset, ?); } -multiclass MTBUF_Real_AllAddr_si<bits<3> op> { - def _OFFSET_si : MTBUF_Real_si <op, !cast<MTBUF_Pseudo>(NAME#"_OFFSET")>; - def _ADDR64_si : MTBUF_Real_si <op, !cast<MTBUF_Pseudo>(NAME#"_ADDR64")>; - def _OFFEN_si : MTBUF_Real_si <op, !cast<MTBUF_Pseudo>(NAME#"_OFFEN")>; - def _IDXEN_si : MTBUF_Real_si <op, !cast<MTBUF_Pseudo>(NAME#"_IDXEN")>; - def _BOTHEN_si : MTBUF_Real_si <op, !cast<MTBUF_Pseudo>(NAME#"_BOTHEN")>; -} +//===----------------------------------------------------------------------===// +// MTBUF - GFX10. 
+//===----------------------------------------------------------------------===// + +class MTBUF_Real_gfx10<bits<4> op, MTBUF_Pseudo ps> : + Base_MTBUF_Real_gfx6_gfx7_gfx10<op{2-0}, ps, SIEncodingFamily.GFX10> { + let Inst{15} = !if(ps.has_dlc, dlc, ps.dlc_value); + let Inst{25-19} = format; + let Inst{53} = op{3}; +} + +let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { + multiclass MTBUF_Real_AllAddr_gfx10<bits<4> op> { + def _BOTHEN_gfx10 : + MTBUF_Real_gfx10<op, !cast<MTBUF_Pseudo>(NAME#"_BOTHEN")>; + def _IDXEN_gfx10 : + MTBUF_Real_gfx10<op, !cast<MTBUF_Pseudo>(NAME#"_IDXEN")>; + def _OFFEN_gfx10 : + MTBUF_Real_gfx10<op, !cast<MTBUF_Pseudo>(NAME#"_OFFEN")>; + def _OFFSET_gfx10 : + MTBUF_Real_gfx10<op, !cast<MTBUF_Pseudo>(NAME#"_OFFSET")>; + } +} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" -defm TBUFFER_LOAD_FORMAT_X : MTBUF_Real_AllAddr_si <0>; -defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Real_AllAddr_si <1>; -defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Real_AllAddr_si <2>; -defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Real_AllAddr_si <3>; -defm TBUFFER_STORE_FORMAT_X : MTBUF_Real_AllAddr_si <4>; -defm TBUFFER_STORE_FORMAT_XY : MTBUF_Real_AllAddr_si <5>; -defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Real_AllAddr_si <6>; -defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Real_AllAddr_si <7>; +defm TBUFFER_LOAD_FORMAT_D16_X : MTBUF_Real_AllAddr_gfx10<0x008>; +defm TBUFFER_LOAD_FORMAT_D16_XY : MTBUF_Real_AllAddr_gfx10<0x009>; +defm TBUFFER_LOAD_FORMAT_D16_XYZ : MTBUF_Real_AllAddr_gfx10<0x00a>; +defm TBUFFER_LOAD_FORMAT_D16_XYZW : MTBUF_Real_AllAddr_gfx10<0x00b>; +defm TBUFFER_STORE_FORMAT_D16_X : MTBUF_Real_AllAddr_gfx10<0x00c>; +defm TBUFFER_STORE_FORMAT_D16_XY : MTBUF_Real_AllAddr_gfx10<0x00d>; +defm TBUFFER_STORE_FORMAT_D16_XYZ : MTBUF_Real_AllAddr_gfx10<0x00e>; +defm TBUFFER_STORE_FORMAT_D16_XYZW : MTBUF_Real_AllAddr_gfx10<0x00f>; //===----------------------------------------------------------------------===// -// CI -// MTBUF - GFX6, GFX7. +// MTBUF - GFX6, GFX7, GFX10. 
//===----------------------------------------------------------------------===// -class MUBUF_Real_ci <bits<7> op, MUBUF_Pseudo ps> : - MUBUF_Real_si<op, ps> { - let AssemblerPredicate=isCIOnly; - let DecoderNamespace="CI"; +class MTBUF_Real_gfx6_gfx7<bits<4> op, MTBUF_Pseudo ps> : + Base_MTBUF_Real_gfx6_gfx7_gfx10<op{2-0}, ps, SIEncodingFamily.SI> { + let Inst{15} = ps.addr64; + let Inst{22-19} = dfmt; + let Inst{25-23} = nfmt; } -def BUFFER_WBINVL1_VOL_ci : MUBUF_Real_ci <0x70, BUFFER_WBINVL1_VOL>; +let AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" in { + multiclass MTBUF_Real_AllAddr_gfx6_gfx7<bits<4> op> { + def _ADDR64_gfx6_gfx7 : + MTBUF_Real_gfx6_gfx7<op, !cast<MTBUF_Pseudo>(NAME#"_ADDR64")>; + def _BOTHEN_gfx6_gfx7 : + MTBUF_Real_gfx6_gfx7<op, !cast<MTBUF_Pseudo>(NAME#"_BOTHEN")>; + def _IDXEN_gfx6_gfx7 : + MTBUF_Real_gfx6_gfx7<op, !cast<MTBUF_Pseudo>(NAME#"_IDXEN")>; + def _OFFEN_gfx6_gfx7 : + MTBUF_Real_gfx6_gfx7<op, !cast<MTBUF_Pseudo>(NAME#"_OFFEN")>; + def _OFFSET_gfx6_gfx7 : + MTBUF_Real_gfx6_gfx7<op, !cast<MTBUF_Pseudo>(NAME#"_OFFSET")>; + } +} // End AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" + +multiclass MTBUF_Real_AllAddr_gfx6_gfx7_gfx10<bits<4> op> : + MTBUF_Real_AllAddr_gfx6_gfx7<op>, MTBUF_Real_AllAddr_gfx10<op>; +defm TBUFFER_LOAD_FORMAT_X : MTBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x000>; +defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x001>; +defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x002>; +defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x003>; +defm TBUFFER_STORE_FORMAT_X : MTBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x004>; +defm TBUFFER_STORE_FORMAT_XY : MTBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x005>; +defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x006>; +defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x007>; //===----------------------------------------------------------------------===// -// VI +// GFX8, GFX9 (VI). 
//===----------------------------------------------------------------------===//

class MUBUF_Real_vi <bits<7> op, MUBUF_Pseudo ps> :
-  MUBUF_Real<op, ps>,
+  MUBUF_Real<ps>,
  Enc64,
  SIMCInstr<ps.PseudoInstr, SIEncodingFamily.VI> {
-  let AssemblerPredicate=isVI;
-  let DecoderNamespace="VI";
+  let AssemblerPredicate = isGFX8GFX9;
+  let DecoderNamespace = "GFX8";

  let Inst{11-0} = !if(ps.has_offset, offset, ?);
  let Inst{12} = ps.offen;
@@ -1878,7 +2130,7 @@ multiclass MUBUF_Real_AllAddr_Lds_vi<bits<7> op> {
}

class MUBUF_Real_gfx80 <bits<7> op, MUBUF_Pseudo ps> :
-  MUBUF_Real<op, ps>,
+  MUBUF_Real<ps>,
  Enc64,
  SIMCInstr<ps.PseudoInstr, SIEncodingFamily.GFX80> {
  let AssemblerPredicate=HasUnpackedD16VMem;
@@ -2002,12 +2254,19 @@ def BUFFER_STORE_LDS_DWORD_vi : MUBUF_Real_vi <0x3d, BUFFER_STORE_LDS_DWORD>;
def BUFFER_WBINVL1_vi : MUBUF_Real_vi <0x3e, BUFFER_WBINVL1>;
def BUFFER_WBINVL1_VOL_vi : MUBUF_Real_vi <0x3f, BUFFER_WBINVL1_VOL>;

+let SubtargetPredicate = HasAtomicFaddInsts in {
+
+defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Real_AllAddr_vi <0x4d>;
+defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Real_AllAddr_vi <0x4e>;
+
+} // End SubtargetPredicate = HasAtomicFaddInsts
+
class MTBUF_Real_vi <bits<4> op, MTBUF_Pseudo ps> :
  MTBUF_Real<ps>,
  Enc64,
  SIMCInstr<ps.PseudoInstr, SIEncodingFamily.VI> {
-  let AssemblerPredicate=isVI;
-  let DecoderNamespace="VI";
+  let AssemblerPredicate = isGFX8GFX9;
+  let DecoderNamespace = "GFX8";

  let Inst{11-0} = !if(ps.has_offset, offset, ?);
  let Inst{12} = ps.offen;
diff --git a/lib/Target/AMDGPU/CaymanInstructions.td b/lib/Target/AMDGPU/CaymanInstructions.td
index ae40c6387982..1a526675164a 100644
--- a/lib/Target/AMDGPU/CaymanInstructions.td
+++ b/lib/Target/AMDGPU/CaymanInstructions.td
@@ -1,9 +1,8 @@
//===-- CaymanInstructions.td - CM Instruction defs -------*- tablegen -*-===//
//
-// The LLVM Compiler Infrastructure
-
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
diff --git a/lib/Target/AMDGPU/DSInstructions.td b/lib/Target/AMDGPU/DSInstructions.td
index 31d2ebef481d..c52eaaa3fdc5 100644
--- a/lib/Target/AMDGPU/DSInstructions.td
+++ b/lib/Target/AMDGPU/DSInstructions.td
@@ -1,9 +1,8 @@
//===-- DSInstructions.td - DS Instruction Definitions --------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -11,8 +10,6 @@ class DS_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> patt
  InstSI <outs, ins, "", pattern>,
  SIMCInstr <opName, SIEncodingFamily.NONE> {
-  let SubtargetPredicate = isGCN;
-
  let LGKM_CNT = 1;
  let DS = 1;
  let Size = 8;
@@ -21,6 +18,7 @@ class DS_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> patt
  // Most instructions load and store data, so set this as the default.
let mayLoad = 1; let mayStore = 1; + let maybeAtomic = 1; let hasSideEffects = 0; let SchedRW = [WriteLDS]; @@ -40,6 +38,8 @@ class DS_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> patt bits<1> has_data0 = 1; bits<1> has_data1 = 1; + bits<1> has_gws_data0 = 0; // data0 is encoded as addr + bits<1> has_offset = 1; // has "offset" that should be split to offset0,1 bits<1> has_offset0 = 1; bits<1> has_offset1 = 1; @@ -61,6 +61,7 @@ class DS_Real <DS_Pseudo ds> : // copy relevant pseudo op flags let SubtargetPredicate = ds.SubtargetPredicate; + let OtherPredicates = ds.OtherPredicates; let AsmMatchConverter = ds.AsmMatchConverter; // encoding fields @@ -322,7 +323,7 @@ class DS_GWS_1D <string opName> : DS_GWS<opName, (ins VGPR_32:$data0, offset:$offset, gds:$gds), "$data0$offset gds"> { - let has_data0 = 1; + let has_gws_data0 = 1; } class DS_VOID <string opName> : DS_Pseudo<opName, @@ -469,11 +470,15 @@ defm DS_WRXCHG_RTN_B64 : DS_1A1D_RET_mc<"ds_wrxchg_rtn_b64", VReg_64>; defm DS_WRXCHG2_RTN_B64 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2_rtn_b64", VReg_128, VReg_64>; defm DS_WRXCHG2ST64_RTN_B64 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2st64_rtn_b64", VReg_128, VReg_64>; -def DS_GWS_INIT : DS_GWS_1D<"ds_gws_init">; +let isConvergent = 1, usesCustomInserter = 1 in { +def DS_GWS_INIT : DS_GWS_1D<"ds_gws_init"> { + let mayLoad = 0; +} def DS_GWS_SEMA_V : DS_GWS_0D<"ds_gws_sema_v">; def DS_GWS_SEMA_BR : DS_GWS_1D<"ds_gws_sema_br">; def DS_GWS_SEMA_P : DS_GWS_0D<"ds_gws_sema_p">; def DS_GWS_BARRIER : DS_GWS_1D<"ds_gws_barrier">; +} def DS_ADD_SRC2_U32 : DS_1A<"ds_add_src2_u32">; def DS_SUB_SRC2_U32 : DS_1A<"ds_sub_src2_u32">; @@ -550,12 +555,14 @@ def DS_ORDERED_COUNT : DS_1A_RET_GDS<"ds_ordered_count">; // Instruction definitions for CI and newer. //===----------------------------------------------------------------------===// -let SubtargetPredicate = isCIVI in { +let SubtargetPredicate = isGFX7Plus in { defm DS_WRAP_RTN_B32 : DS_1A2D_RET_mc<"ds_wrap_rtn_b32", VGPR_32>; defm DS_CONDXCHG32_RTN_B64 : DS_1A1D_RET_mc<"ds_condxchg32_rtn_b64", VReg_64>; +let isConvergent = 1, usesCustomInserter = 1 in { def DS_GWS_SEMA_RELEASE_ALL : DS_GWS_0D<"ds_gws_sema_release_all">; +} let mayStore = 0 in { defm DS_READ_B96 : DS_1A_RET_mc<"ds_read_b96", VReg_96>; @@ -569,13 +576,13 @@ defm DS_WRITE_B128 : DS_1A1D_NORET_mc<"ds_write_b128", VReg_128>; def DS_NOP : DS_VOID<"ds_nop">; -} // let SubtargetPredicate = isCIVI +} // let SubtargetPredicate = isGFX7Plus //===----------------------------------------------------------------------===// // Instruction definitions for VI and newer. 
//===----------------------------------------------------------------------===// -let SubtargetPredicate = isVI in { +let SubtargetPredicate = isGFX8Plus in { let Uses = [EXEC] in { def DS_PERMUTE_B32 : DS_1A1D_PERMUTE <"ds_permute_b32", @@ -586,7 +593,7 @@ def DS_BPERMUTE_B32 : DS_1A1D_PERMUTE <"ds_bpermute_b32", def DS_ADD_SRC2_F32 : DS_1A<"ds_add_src2_f32">; -} // let SubtargetPredicate = isVI +} // let SubtargetPredicate = isGFX8Plus //===----------------------------------------------------------------------===// // DS Patterns @@ -597,9 +604,9 @@ def : GCNPat < (DS_SWIZZLE_B32 $src, (as_i16imm $offset16), (i1 0)) >; -class DSReadPat <DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat < +class DSReadPat <DS_Pseudo inst, ValueType vt, PatFrag frag, int gds=0> : GCNPat < (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))), - (inst $ptr, (as_i16imm $offset), (i1 0)) + (inst $ptr, (as_i16imm $offset), (i1 gds)) >; multiclass DSReadPat_mc<DS_Pseudo inst, ValueType vt, string frag> { @@ -613,38 +620,21 @@ multiclass DSReadPat_mc<DS_Pseudo inst, ValueType vt, string frag> { } } - -multiclass DSReadPat_Hi16 <DS_Pseudo inst, PatFrag frag, ValueType vt = i16> { - def : GCNPat < - (build_vector vt:$lo, (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset)))), - (v2i16 (inst $ptr, (as_i16imm $offset), (i1 0), $lo)) - >; - - def : GCNPat < - (build_vector f16:$lo, (f16 (bitconvert (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset)))))), - (v2f16 (inst $ptr, (as_i16imm $offset), (i1 0), $lo)) - >; -} - -multiclass DSReadPat_Lo16 <DS_Pseudo inst, PatFrag frag, ValueType vt = i16> { - def : GCNPat < - (build_vector (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))), (vt (Hi16Elt vt:$hi))), - (v2i16 (inst $ptr, (as_i16imm $offset), 0, $hi)) - >; - - def : GCNPat < - (build_vector (f16 (bitconvert (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))))), (f16 (Hi16Elt f16:$hi))), - (v2f16 (inst $ptr, (as_i16imm $offset), 0, $hi)) - >; -} +class DSReadPat_D16 <DS_Pseudo inst, PatFrag frag, ValueType vt> : GCNPat < + (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$in), + (inst $ptr, (as_i16imm $offset), (i1 0), $in) +>; defm : DSReadPat_mc <DS_READ_I8, i32, "sextloadi8_local">; -defm : DSReadPat_mc <DS_READ_U8, i32, "az_extloadi8_local">; defm : DSReadPat_mc <DS_READ_I8, i16, "sextloadi8_local">; -defm : DSReadPat_mc <DS_READ_U8, i16, "az_extloadi8_local">; +defm : DSReadPat_mc <DS_READ_U8, i32, "extloadi8_local">; +defm : DSReadPat_mc <DS_READ_U8, i32, "zextloadi8_local">; +defm : DSReadPat_mc <DS_READ_U8, i16, "extloadi8_local">; +defm : DSReadPat_mc <DS_READ_U8, i16, "zextloadi8_local">; defm : DSReadPat_mc <DS_READ_I16, i32, "sextloadi16_local">; defm : DSReadPat_mc <DS_READ_I16, i32, "sextloadi16_local">; -defm : DSReadPat_mc <DS_READ_U16, i32, "az_extloadi16_local">; +defm : DSReadPat_mc <DS_READ_U16, i32, "extloadi16_local">; +defm : DSReadPat_mc <DS_READ_U16, i32, "zextloadi16_local">; defm : DSReadPat_mc <DS_READ_U16, i16, "load_local">; defm : DSReadPat_mc <DS_READ_B32, i32, "load_local">; defm : DSReadPat_mc <DS_READ_B32, i32, "atomic_load_32_local">; @@ -658,21 +648,24 @@ defm : DSReadPat_mc <DS_READ_B128, v4i32, "load_align16_local">; } // End AddedComplexity = 100 let OtherPredicates = [D16PreservesUnusedBits] in { -let AddedComplexity = 100 in { -defm : DSReadPat_Hi16<DS_READ_U16_D16_HI, load_local>; -defm : DSReadPat_Hi16<DS_READ_U8_D16_HI, az_extloadi8_local>; -defm : DSReadPat_Hi16<DS_READ_I8_D16_HI, sextloadi8_local>; - -defm : DSReadPat_Lo16<DS_READ_U16_D16, load_local>; -defm : 
DSReadPat_Lo16<DS_READ_U8_D16, az_extloadi8_local>; -defm : DSReadPat_Lo16<DS_READ_I8_D16, sextloadi8_local>; - -} +def : DSReadPat_D16<DS_READ_U16_D16_HI, load_d16_hi_local, v2i16>; +def : DSReadPat_D16<DS_READ_U16_D16_HI, load_d16_hi_local, v2f16>; +def : DSReadPat_D16<DS_READ_U8_D16_HI, az_extloadi8_d16_hi_local, v2i16>; +def : DSReadPat_D16<DS_READ_U8_D16_HI, az_extloadi8_d16_hi_local, v2f16>; +def : DSReadPat_D16<DS_READ_I8_D16_HI, sextloadi8_d16_hi_local, v2i16>; +def : DSReadPat_D16<DS_READ_I8_D16_HI, sextloadi8_d16_hi_local, v2f16>; + +def : DSReadPat_D16<DS_READ_U16_D16, load_d16_lo_local, v2i16>; +def : DSReadPat_D16<DS_READ_U16_D16, load_d16_lo_local, v2f16>; +def : DSReadPat_D16<DS_READ_U8_D16, az_extloadi8_d16_lo_local, v2i16>; +def : DSReadPat_D16<DS_READ_U8_D16, az_extloadi8_d16_lo_local, v2f16>; +def : DSReadPat_D16<DS_READ_I8_D16, sextloadi8_d16_lo_local, v2i16>; +def : DSReadPat_D16<DS_READ_I8_D16, sextloadi8_d16_lo_local, v2f16>; } -class DSWritePat <DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat < +class DSWritePat <DS_Pseudo inst, ValueType vt, PatFrag frag, int gds=0> : GCNPat < (frag vt:$value, (DS1Addr1Offset i32:$ptr, i32:$offset)), - (inst $ptr, $value, (as_i16imm $offset), (i1 0)) + (inst $ptr, $value, (as_i16imm $offset), (i1 gds)) >; multiclass DSWritePat_mc <DS_Pseudo inst, ValueType vt, string frag> { @@ -730,7 +723,7 @@ class DS64Bit4ByteAlignedWritePat<DS_Pseudo inst, PatFrag frag> : GCNPat< // v2i32 loads are split into i32 loads on SI during lowering, due to a bug // related to bounds checking. -let OtherPredicates = [LDSRequiresM0Init, isCIVI] in { +let OtherPredicates = [LDSRequiresM0Init, isGFX7Plus] in { def : DS64Bit4ByteAlignedReadPat<DS_READ2_B32, load_local_m0>; def : DS64Bit4ByteAlignedWritePat<DS_WRITE2_B32, store_local_m0>; } @@ -747,260 +740,313 @@ defm : DSWritePat_mc <DS_WRITE_B64, v2i32, "store_align8_local">; defm : DSWritePat_mc <DS_WRITE_B128, v4i32, "store_align16_local">; } // End AddedComplexity = 100 -class DSAtomicRetPat<DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat < +class DSAtomicRetPat<DS_Pseudo inst, ValueType vt, PatFrag frag, bit gds=0> : GCNPat < (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value), - (inst $ptr, $value, (as_i16imm $offset), (i1 0)) + (inst $ptr, $value, (as_i16imm $offset), (i1 gds)) >; multiclass DSAtomicRetPat_mc<DS_Pseudo inst, ValueType vt, string frag> { let OtherPredicates = [LDSRequiresM0Init] in { - def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_m0")>; + def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_local_m0")>; } let OtherPredicates = [NotLDSRequiresM0Init] in { def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, - !cast<PatFrag>(frag)>; + !cast<PatFrag>(frag#"_local")>; } + + def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_region_m0"), 1>; } -class DSAtomicCmpXChg<DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat < +class DSAtomicCmpXChg<DS_Pseudo inst, ValueType vt, PatFrag frag, bit gds=0> : GCNPat < (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$cmp, vt:$swap), - (inst $ptr, $cmp, $swap, (as_i16imm $offset), (i1 0)) + (inst $ptr, $cmp, $swap, (as_i16imm $offset), (i1 gds)) >; multiclass DSAtomicCmpXChg_mc<DS_Pseudo inst, ValueType vt, string frag> { let OtherPredicates = [LDSRequiresM0Init] in { - def : DSAtomicCmpXChg<inst, vt, !cast<PatFrag>(frag#"_m0")>; + def : DSAtomicCmpXChg<inst, vt, !cast<PatFrag>(frag#"_local_m0")>; } let OtherPredicates = [NotLDSRequiresM0Init] in { def : 
DSAtomicCmpXChg<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, - !cast<PatFrag>(frag)>; + !cast<PatFrag>(frag#"_local")>; } + + def : DSAtomicCmpXChg<inst, vt, !cast<PatFrag>(frag#"_region_m0"), 1>; } // 32-bit atomics. -defm : DSAtomicRetPat_mc<DS_WRXCHG_RTN_B32, i32, "atomic_swap_local">; -defm : DSAtomicRetPat_mc<DS_ADD_RTN_U32, i32, "atomic_load_add_local">; -defm : DSAtomicRetPat_mc<DS_SUB_RTN_U32, i32, "atomic_load_sub_local">; -defm : DSAtomicRetPat_mc<DS_INC_RTN_U32, i32, "atomic_inc_local">; -defm : DSAtomicRetPat_mc<DS_DEC_RTN_U32, i32, "atomic_dec_local">; -defm : DSAtomicRetPat_mc<DS_AND_RTN_B32, i32, "atomic_load_and_local">; -defm : DSAtomicRetPat_mc<DS_OR_RTN_B32, i32, "atomic_load_or_local">; -defm : DSAtomicRetPat_mc<DS_XOR_RTN_B32, i32, "atomic_load_xor_local">; -defm : DSAtomicRetPat_mc<DS_MIN_RTN_I32, i32, "atomic_load_min_local">; -defm : DSAtomicRetPat_mc<DS_MAX_RTN_I32, i32, "atomic_load_max_local">; -defm : DSAtomicRetPat_mc<DS_MIN_RTN_U32, i32, "atomic_load_umin_local">; -defm : DSAtomicRetPat_mc<DS_MAX_RTN_U32, i32, "atomic_load_umax_local">; -defm : DSAtomicCmpXChg_mc<DS_CMPST_RTN_B32, i32, "atomic_cmp_swap_local">; -defm : DSAtomicRetPat_mc<DS_MIN_RTN_F32, f32, "atomic_load_fmin_local">; -defm : DSAtomicRetPat_mc<DS_MAX_RTN_F32, f32, "atomic_load_fmax_local">; -defm : DSAtomicRetPat_mc<DS_ADD_RTN_F32, f32, "atomic_load_fadd_local">; +defm : DSAtomicRetPat_mc<DS_WRXCHG_RTN_B32, i32, "atomic_swap">; +defm : DSAtomicRetPat_mc<DS_ADD_RTN_U32, i32, "atomic_load_add">; +defm : DSAtomicRetPat_mc<DS_SUB_RTN_U32, i32, "atomic_load_sub">; +defm : DSAtomicRetPat_mc<DS_INC_RTN_U32, i32, "atomic_inc">; +defm : DSAtomicRetPat_mc<DS_DEC_RTN_U32, i32, "atomic_dec">; +defm : DSAtomicRetPat_mc<DS_AND_RTN_B32, i32, "atomic_load_and">; +defm : DSAtomicRetPat_mc<DS_OR_RTN_B32, i32, "atomic_load_or">; +defm : DSAtomicRetPat_mc<DS_XOR_RTN_B32, i32, "atomic_load_xor">; +defm : DSAtomicRetPat_mc<DS_MIN_RTN_I32, i32, "atomic_load_min">; +defm : DSAtomicRetPat_mc<DS_MAX_RTN_I32, i32, "atomic_load_max">; +defm : DSAtomicRetPat_mc<DS_MIN_RTN_U32, i32, "atomic_load_umin">; +defm : DSAtomicRetPat_mc<DS_MAX_RTN_U32, i32, "atomic_load_umax">; +defm : DSAtomicCmpXChg_mc<DS_CMPST_RTN_B32, i32, "atomic_cmp_swap">; +defm : DSAtomicRetPat_mc<DS_MIN_RTN_F32, f32, "atomic_load_fmin">; +defm : DSAtomicRetPat_mc<DS_MAX_RTN_F32, f32, "atomic_load_fmax">; +defm : DSAtomicRetPat_mc<DS_ADD_RTN_F32, f32, "atomic_load_fadd">; // 64-bit atomics. 
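One detail worth pausing on before the 64-bit list: each DSAtomicRetPat_mc / DSAtomicCmpXChg_mc expansion now derives three patterns from a single fragment stem, with "_local_m0" and "_local" covering LDS (gds = 0, with and without the M0-init requirement) and "_region_m0" reusing the very same pseudo for region atomics with gds = 1. A minimal standalone sketch of the resulting operand tuples; the type and function names here are invented for illustration, not LLVM API, and only the operand layout and the gds = 0/1 split are taken from the patterns above:

#include <cstdint>
#include <cstdio>

// Invented illustration type -- not LLVM API.
struct DSAtomicOperands {
  uint32_t PtrVgpr;   // $ptr
  uint32_t DataVgpr;  // $value
  uint16_t Offset;    // immediate offset
  bool Gds;           // 0 = LDS ("_local*" frags), 1 = region ("_region_m0")
};

static DSAtomicOperands makeDSAtomic(uint32_t Ptr, uint32_t Data,
                                     uint16_t Offset, bool IsRegion) {
  // Both flavors select the same DS pseudo; only the trailing (i1 gds)
  // operand differs.
  return {Ptr, Data, Offset, /*Gds=*/IsRegion};
}

int main() {
  DSAtomicOperands Local = makeDSAtomic(3, 4, 16, false);
  DSAtomicOperands Region = makeDSAtomic(3, 4, 16, true);
  std::printf("local gds=%d, region gds=%d\n", Local.Gds, Region.Gds);
  return 0;
}

The 64-bit patterns below repeat exactly the same mapping for i64 operands.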
-defm : DSAtomicRetPat_mc<DS_WRXCHG_RTN_B64, i64, "atomic_swap_local">; -defm : DSAtomicRetPat_mc<DS_ADD_RTN_U64, i64, "atomic_load_add_local">; -defm : DSAtomicRetPat_mc<DS_SUB_RTN_U64, i64, "atomic_load_sub_local">; -defm : DSAtomicRetPat_mc<DS_INC_RTN_U64, i64, "atomic_inc_local">; -defm : DSAtomicRetPat_mc<DS_DEC_RTN_U64, i64, "atomic_dec_local">; -defm : DSAtomicRetPat_mc<DS_AND_RTN_B64, i64, "atomic_load_and_local">; -defm : DSAtomicRetPat_mc<DS_OR_RTN_B64, i64, "atomic_load_or_local">; -defm : DSAtomicRetPat_mc<DS_XOR_RTN_B64, i64, "atomic_load_xor_local">; -defm : DSAtomicRetPat_mc<DS_MIN_RTN_I64, i64, "atomic_load_min_local">; -defm : DSAtomicRetPat_mc<DS_MAX_RTN_I64, i64, "atomic_load_max_local">; -defm : DSAtomicRetPat_mc<DS_MIN_RTN_U64, i64, "atomic_load_umin_local">; -defm : DSAtomicRetPat_mc<DS_MAX_RTN_U64, i64, "atomic_load_umax_local">; - -defm : DSAtomicCmpXChg_mc<DS_CMPST_RTN_B64, i64, "atomic_cmp_swap_local">; +defm : DSAtomicRetPat_mc<DS_WRXCHG_RTN_B64, i64, "atomic_swap">; +defm : DSAtomicRetPat_mc<DS_ADD_RTN_U64, i64, "atomic_load_add">; +defm : DSAtomicRetPat_mc<DS_SUB_RTN_U64, i64, "atomic_load_sub">; +defm : DSAtomicRetPat_mc<DS_INC_RTN_U64, i64, "atomic_inc">; +defm : DSAtomicRetPat_mc<DS_DEC_RTN_U64, i64, "atomic_dec">; +defm : DSAtomicRetPat_mc<DS_AND_RTN_B64, i64, "atomic_load_and">; +defm : DSAtomicRetPat_mc<DS_OR_RTN_B64, i64, "atomic_load_or">; +defm : DSAtomicRetPat_mc<DS_XOR_RTN_B64, i64, "atomic_load_xor">; +defm : DSAtomicRetPat_mc<DS_MIN_RTN_I64, i64, "atomic_load_min">; +defm : DSAtomicRetPat_mc<DS_MAX_RTN_I64, i64, "atomic_load_max">; +defm : DSAtomicRetPat_mc<DS_MIN_RTN_U64, i64, "atomic_load_umin">; +defm : DSAtomicRetPat_mc<DS_MAX_RTN_U64, i64, "atomic_load_umax">; + +defm : DSAtomicCmpXChg_mc<DS_CMPST_RTN_B64, i64, "atomic_cmp_swap">; + +def : Pat < + (SIds_ordered_count i32:$value, i16:$offset), + (DS_ORDERED_COUNT $value, (as_i16imm $offset)) +>; //===----------------------------------------------------------------------===// -// Real instructions +// Target-specific instruction encodings. //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// -// SIInstructions.td +// Base ENC_DS for GFX6, GFX7, GFX10. 
//===----------------------------------------------------------------------===// -class DS_Real_si <bits<8> op, DS_Pseudo ds> : - DS_Real <ds>, - SIMCInstr <ds.Mnemonic, SIEncodingFamily.SI> { - let AssemblerPredicates=[isSICI]; - let DecoderNamespace="SICI"; +class Base_DS_Real_gfx6_gfx7_gfx10<bits<8> op, DS_Pseudo ps, int ef> : + DS_Real<ps>, SIMCInstr <ps.Mnemonic, ef> { - // encoding - let Inst{7-0} = !if(ds.has_offset0, offset0, 0); - let Inst{15-8} = !if(ds.has_offset1, offset1, 0); - let Inst{17} = !if(ds.has_gds, gds, ds.gdsValue); + let Inst{7-0} = !if(ps.has_offset0, offset0, 0); + let Inst{15-8} = !if(ps.has_offset1, offset1, 0); + let Inst{17} = !if(ps.has_gds, gds, ps.gdsValue); let Inst{25-18} = op; - let Inst{31-26} = 0x36; // ds prefix - let Inst{39-32} = !if(ds.has_addr, addr, 0); - let Inst{47-40} = !if(ds.has_data0, data0, 0); - let Inst{55-48} = !if(ds.has_data1, data1, 0); - let Inst{63-56} = !if(ds.has_vdst, vdst, 0); + let Inst{31-26} = 0x36; + let Inst{39-32} = !if(ps.has_addr, addr, !if(ps.has_gws_data0, data0, 0)); + let Inst{47-40} = !if(ps.has_data0, data0, 0); + let Inst{55-48} = !if(ps.has_data1, data1, 0); + let Inst{63-56} = !if(ps.has_vdst, vdst, 0); } -def DS_ADD_U32_si : DS_Real_si<0x0, DS_ADD_U32>; -def DS_SUB_U32_si : DS_Real_si<0x1, DS_SUB_U32>; -def DS_RSUB_U32_si : DS_Real_si<0x2, DS_RSUB_U32>; -def DS_INC_U32_si : DS_Real_si<0x3, DS_INC_U32>; -def DS_DEC_U32_si : DS_Real_si<0x4, DS_DEC_U32>; -def DS_MIN_I32_si : DS_Real_si<0x5, DS_MIN_I32>; -def DS_MAX_I32_si : DS_Real_si<0x6, DS_MAX_I32>; -def DS_MIN_U32_si : DS_Real_si<0x7, DS_MIN_U32>; -def DS_MAX_U32_si : DS_Real_si<0x8, DS_MAX_U32>; -def DS_AND_B32_si : DS_Real_si<0x9, DS_AND_B32>; -def DS_OR_B32_si : DS_Real_si<0xa, DS_OR_B32>; -def DS_XOR_B32_si : DS_Real_si<0xb, DS_XOR_B32>; -def DS_MSKOR_B32_si : DS_Real_si<0xc, DS_MSKOR_B32>; -def DS_WRITE_B32_si : DS_Real_si<0xd, DS_WRITE_B32>; -def DS_WRITE2_B32_si : DS_Real_si<0xe, DS_WRITE2_B32>; -def DS_WRITE2ST64_B32_si : DS_Real_si<0xf, DS_WRITE2ST64_B32>; -def DS_CMPST_B32_si : DS_Real_si<0x10, DS_CMPST_B32>; -def DS_CMPST_F32_si : DS_Real_si<0x11, DS_CMPST_F32>; -def DS_MIN_F32_si : DS_Real_si<0x12, DS_MIN_F32>; -def DS_MAX_F32_si : DS_Real_si<0x13, DS_MAX_F32>; -def DS_NOP_si : DS_Real_si<0x14, DS_NOP>; -def DS_GWS_INIT_si : DS_Real_si<0x19, DS_GWS_INIT>; -def DS_GWS_SEMA_V_si : DS_Real_si<0x1a, DS_GWS_SEMA_V>; -def DS_GWS_SEMA_BR_si : DS_Real_si<0x1b, DS_GWS_SEMA_BR>; -def DS_GWS_SEMA_P_si : DS_Real_si<0x1c, DS_GWS_SEMA_P>; -def DS_GWS_BARRIER_si : DS_Real_si<0x1d, DS_GWS_BARRIER>; -def DS_WRITE_B8_si : DS_Real_si<0x1e, DS_WRITE_B8>; -def DS_WRITE_B16_si : DS_Real_si<0x1f, DS_WRITE_B16>; -def DS_ADD_RTN_U32_si : DS_Real_si<0x20, DS_ADD_RTN_U32>; -def DS_SUB_RTN_U32_si : DS_Real_si<0x21, DS_SUB_RTN_U32>; -def DS_RSUB_RTN_U32_si : DS_Real_si<0x22, DS_RSUB_RTN_U32>; -def DS_INC_RTN_U32_si : DS_Real_si<0x23, DS_INC_RTN_U32>; -def DS_DEC_RTN_U32_si : DS_Real_si<0x24, DS_DEC_RTN_U32>; -def DS_MIN_RTN_I32_si : DS_Real_si<0x25, DS_MIN_RTN_I32>; -def DS_MAX_RTN_I32_si : DS_Real_si<0x26, DS_MAX_RTN_I32>; -def DS_MIN_RTN_U32_si : DS_Real_si<0x27, DS_MIN_RTN_U32>; -def DS_MAX_RTN_U32_si : DS_Real_si<0x28, DS_MAX_RTN_U32>; -def DS_AND_RTN_B32_si : DS_Real_si<0x29, DS_AND_RTN_B32>; -def DS_OR_RTN_B32_si : DS_Real_si<0x2a, DS_OR_RTN_B32>; -def DS_XOR_RTN_B32_si : DS_Real_si<0x2b, DS_XOR_RTN_B32>; -def DS_MSKOR_RTN_B32_si : DS_Real_si<0x2c, DS_MSKOR_RTN_B32>; -def DS_WRXCHG_RTN_B32_si : DS_Real_si<0x2d, DS_WRXCHG_RTN_B32>; -def DS_WRXCHG2_RTN_B32_si : 
DS_Real_si<0x2e, DS_WRXCHG2_RTN_B32>; -def DS_WRXCHG2ST64_RTN_B32_si : DS_Real_si<0x2f, DS_WRXCHG2ST64_RTN_B32>; -def DS_CMPST_RTN_B32_si : DS_Real_si<0x30, DS_CMPST_RTN_B32>; -def DS_CMPST_RTN_F32_si : DS_Real_si<0x31, DS_CMPST_RTN_F32>; -def DS_MIN_RTN_F32_si : DS_Real_si<0x32, DS_MIN_RTN_F32>; -def DS_MAX_RTN_F32_si : DS_Real_si<0x33, DS_MAX_RTN_F32>; - -// These instruction are CI/VI only -def DS_WRAP_RTN_B32_si : DS_Real_si<0x34, DS_WRAP_RTN_B32>; -def DS_CONDXCHG32_RTN_B64_si : DS_Real_si<0x7e, DS_CONDXCHG32_RTN_B64>; -def DS_GWS_SEMA_RELEASE_ALL_si : DS_Real_si<0x18, DS_GWS_SEMA_RELEASE_ALL>; - -def DS_SWIZZLE_B32_si : DS_Real_si<0x35, DS_SWIZZLE_B32>; -def DS_READ_B32_si : DS_Real_si<0x36, DS_READ_B32>; -def DS_READ2_B32_si : DS_Real_si<0x37, DS_READ2_B32>; -def DS_READ2ST64_B32_si : DS_Real_si<0x38, DS_READ2ST64_B32>; -def DS_READ_I8_si : DS_Real_si<0x39, DS_READ_I8>; -def DS_READ_U8_si : DS_Real_si<0x3a, DS_READ_U8>; -def DS_READ_I16_si : DS_Real_si<0x3b, DS_READ_I16>; -def DS_READ_U16_si : DS_Real_si<0x3c, DS_READ_U16>; -def DS_CONSUME_si : DS_Real_si<0x3d, DS_CONSUME>; -def DS_APPEND_si : DS_Real_si<0x3e, DS_APPEND>; -def DS_ORDERED_COUNT_si : DS_Real_si<0x3f, DS_ORDERED_COUNT>; -def DS_ADD_U64_si : DS_Real_si<0x40, DS_ADD_U64>; -def DS_SUB_U64_si : DS_Real_si<0x41, DS_SUB_U64>; -def DS_RSUB_U64_si : DS_Real_si<0x42, DS_RSUB_U64>; -def DS_INC_U64_si : DS_Real_si<0x43, DS_INC_U64>; -def DS_DEC_U64_si : DS_Real_si<0x44, DS_DEC_U64>; -def DS_MIN_I64_si : DS_Real_si<0x45, DS_MIN_I64>; -def DS_MAX_I64_si : DS_Real_si<0x46, DS_MAX_I64>; -def DS_MIN_U64_si : DS_Real_si<0x47, DS_MIN_U64>; -def DS_MAX_U64_si : DS_Real_si<0x48, DS_MAX_U64>; -def DS_AND_B64_si : DS_Real_si<0x49, DS_AND_B64>; -def DS_OR_B64_si : DS_Real_si<0x4a, DS_OR_B64>; -def DS_XOR_B64_si : DS_Real_si<0x4b, DS_XOR_B64>; -def DS_MSKOR_B64_si : DS_Real_si<0x4c, DS_MSKOR_B64>; -def DS_WRITE_B64_si : DS_Real_si<0x4d, DS_WRITE_B64>; -def DS_WRITE2_B64_si : DS_Real_si<0x4E, DS_WRITE2_B64>; -def DS_WRITE2ST64_B64_si : DS_Real_si<0x4f, DS_WRITE2ST64_B64>; -def DS_CMPST_B64_si : DS_Real_si<0x50, DS_CMPST_B64>; -def DS_CMPST_F64_si : DS_Real_si<0x51, DS_CMPST_F64>; -def DS_MIN_F64_si : DS_Real_si<0x52, DS_MIN_F64>; -def DS_MAX_F64_si : DS_Real_si<0x53, DS_MAX_F64>; - -def DS_ADD_RTN_U64_si : DS_Real_si<0x60, DS_ADD_RTN_U64>; -def DS_SUB_RTN_U64_si : DS_Real_si<0x61, DS_SUB_RTN_U64>; -def DS_RSUB_RTN_U64_si : DS_Real_si<0x62, DS_RSUB_RTN_U64>; -def DS_INC_RTN_U64_si : DS_Real_si<0x63, DS_INC_RTN_U64>; -def DS_DEC_RTN_U64_si : DS_Real_si<0x64, DS_DEC_RTN_U64>; -def DS_MIN_RTN_I64_si : DS_Real_si<0x65, DS_MIN_RTN_I64>; -def DS_MAX_RTN_I64_si : DS_Real_si<0x66, DS_MAX_RTN_I64>; -def DS_MIN_RTN_U64_si : DS_Real_si<0x67, DS_MIN_RTN_U64>; -def DS_MAX_RTN_U64_si : DS_Real_si<0x68, DS_MAX_RTN_U64>; -def DS_AND_RTN_B64_si : DS_Real_si<0x69, DS_AND_RTN_B64>; -def DS_OR_RTN_B64_si : DS_Real_si<0x6a, DS_OR_RTN_B64>; -def DS_XOR_RTN_B64_si : DS_Real_si<0x6b, DS_XOR_RTN_B64>; -def DS_MSKOR_RTN_B64_si : DS_Real_si<0x6c, DS_MSKOR_RTN_B64>; -def DS_WRXCHG_RTN_B64_si : DS_Real_si<0x6d, DS_WRXCHG_RTN_B64>; -def DS_WRXCHG2_RTN_B64_si : DS_Real_si<0x6e, DS_WRXCHG2_RTN_B64>; -def DS_WRXCHG2ST64_RTN_B64_si : DS_Real_si<0x6f, DS_WRXCHG2ST64_RTN_B64>; -def DS_CMPST_RTN_B64_si : DS_Real_si<0x70, DS_CMPST_RTN_B64>; -def DS_CMPST_RTN_F64_si : DS_Real_si<0x71, DS_CMPST_RTN_F64>; -def DS_MIN_RTN_F64_si : DS_Real_si<0x72, DS_MIN_RTN_F64>; -def DS_MAX_RTN_F64_si : DS_Real_si<0x73, DS_MAX_RTN_F64>; - -def DS_READ_B64_si : DS_Real_si<0x76, DS_READ_B64>; -def 
DS_READ2_B64_si : DS_Real_si<0x77, DS_READ2_B64>; -def DS_READ2ST64_B64_si : DS_Real_si<0x78, DS_READ2ST64_B64>; - -def DS_ADD_SRC2_U32_si : DS_Real_si<0x80, DS_ADD_SRC2_U32>; -def DS_SUB_SRC2_U32_si : DS_Real_si<0x81, DS_SUB_SRC2_U32>; -def DS_RSUB_SRC2_U32_si : DS_Real_si<0x82, DS_RSUB_SRC2_U32>; -def DS_INC_SRC2_U32_si : DS_Real_si<0x83, DS_INC_SRC2_U32>; -def DS_DEC_SRC2_U32_si : DS_Real_si<0x84, DS_DEC_SRC2_U32>; -def DS_MIN_SRC2_I32_si : DS_Real_si<0x85, DS_MIN_SRC2_I32>; -def DS_MAX_SRC2_I32_si : DS_Real_si<0x86, DS_MAX_SRC2_I32>; -def DS_MIN_SRC2_U32_si : DS_Real_si<0x87, DS_MIN_SRC2_U32>; -def DS_MAX_SRC2_U32_si : DS_Real_si<0x88, DS_MAX_SRC2_U32>; -def DS_AND_SRC2_B32_si : DS_Real_si<0x89, DS_AND_SRC2_B32>; -def DS_OR_SRC2_B32_si : DS_Real_si<0x8a, DS_OR_SRC2_B32>; -def DS_XOR_SRC2_B32_si : DS_Real_si<0x8b, DS_XOR_SRC2_B32>; -def DS_WRITE_SRC2_B32_si : DS_Real_si<0x8d, DS_WRITE_SRC2_B32>; - -def DS_MIN_SRC2_F32_si : DS_Real_si<0x92, DS_MIN_SRC2_F32>; -def DS_MAX_SRC2_F32_si : DS_Real_si<0x93, DS_MAX_SRC2_F32>; - -def DS_ADD_SRC2_U64_si : DS_Real_si<0xc0, DS_ADD_SRC2_U64>; -def DS_SUB_SRC2_U64_si : DS_Real_si<0xc1, DS_SUB_SRC2_U64>; -def DS_RSUB_SRC2_U64_si : DS_Real_si<0xc2, DS_RSUB_SRC2_U64>; -def DS_INC_SRC2_U64_si : DS_Real_si<0xc3, DS_INC_SRC2_U64>; -def DS_DEC_SRC2_U64_si : DS_Real_si<0xc4, DS_DEC_SRC2_U64>; -def DS_MIN_SRC2_I64_si : DS_Real_si<0xc5, DS_MIN_SRC2_I64>; -def DS_MAX_SRC2_I64_si : DS_Real_si<0xc6, DS_MAX_SRC2_I64>; -def DS_MIN_SRC2_U64_si : DS_Real_si<0xc7, DS_MIN_SRC2_U64>; -def DS_MAX_SRC2_U64_si : DS_Real_si<0xc8, DS_MAX_SRC2_U64>; -def DS_AND_SRC2_B64_si : DS_Real_si<0xc9, DS_AND_SRC2_B64>; -def DS_OR_SRC2_B64_si : DS_Real_si<0xca, DS_OR_SRC2_B64>; -def DS_XOR_SRC2_B64_si : DS_Real_si<0xcb, DS_XOR_SRC2_B64>; -def DS_WRITE_SRC2_B64_si : DS_Real_si<0xcd, DS_WRITE_SRC2_B64>; - -def DS_MIN_SRC2_F64_si : DS_Real_si<0xd2, DS_MIN_SRC2_F64>; -def DS_MAX_SRC2_F64_si : DS_Real_si<0xd3, DS_MAX_SRC2_F64>; -def DS_WRITE_B96_si : DS_Real_si<0xde, DS_WRITE_B96>; -def DS_WRITE_B128_si : DS_Real_si<0xdf, DS_WRITE_B128>; -def DS_READ_B96_si : DS_Real_si<0xfe, DS_READ_B96>; -def DS_READ_B128_si : DS_Real_si<0xff, DS_READ_B128>; +//===----------------------------------------------------------------------===// +// GFX10. +//===----------------------------------------------------------------------===// + +let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { + multiclass DS_Real_gfx10<bits<8> op> { + def _gfx10 : Base_DS_Real_gfx6_gfx7_gfx10<op, !cast<DS_Pseudo>(NAME), + SIEncodingFamily.GFX10>; + } +} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" + +defm DS_ADD_F32 : DS_Real_gfx10<0x015>; +defm DS_ADD_RTN_F32 : DS_Real_gfx10<0x055>; +defm DS_ADD_SRC2_F32 : DS_Real_gfx10<0x095>; +defm DS_WRITE_B8_D16_HI : DS_Real_gfx10<0x0a0>; +defm DS_WRITE_B16_D16_HI : DS_Real_gfx10<0x0a1>; +defm DS_READ_U8_D16 : DS_Real_gfx10<0x0a2>; +defm DS_READ_U8_D16_HI : DS_Real_gfx10<0x0a3>; +defm DS_READ_I8_D16 : DS_Real_gfx10<0x0a4>; +defm DS_READ_I8_D16_HI : DS_Real_gfx10<0x0a5>; +defm DS_READ_U16_D16 : DS_Real_gfx10<0x0a6>; +defm DS_READ_U16_D16_HI : DS_Real_gfx10<0x0a7>; +defm DS_WRITE_ADDTID_B32 : DS_Real_gfx10<0x0b0>; +defm DS_READ_ADDTID_B32 : DS_Real_gfx10<0x0b1>; +defm DS_PERMUTE_B32 : DS_Real_gfx10<0x0b2>; +defm DS_BPERMUTE_B32 : DS_Real_gfx10<0x0b3>; + +//===----------------------------------------------------------------------===// +// GFX7, GFX10. 
+//===----------------------------------------------------------------------===// + +let AssemblerPredicate = isGFX7Only, DecoderNamespace = "GFX7" in { + multiclass DS_Real_gfx7<bits<8> op> { + def _gfx7 : Base_DS_Real_gfx6_gfx7_gfx10<op, !cast<DS_Pseudo>(NAME), + SIEncodingFamily.SI>; + } +} // End AssemblerPredicate = isGFX7Only, DecoderNamespace = "GFX7" + +multiclass DS_Real_gfx7_gfx10<bits<8> op> : + DS_Real_gfx7<op>, DS_Real_gfx10<op>; + +// FIXME-GFX7: Add tests when upstreaming this part. +defm DS_GWS_SEMA_RELEASE_ALL : DS_Real_gfx7_gfx10<0x018>; +defm DS_WRAP_RTN_B32 : DS_Real_gfx7_gfx10<0x034>; +defm DS_CONDXCHG32_RTN_B64 : DS_Real_gfx7_gfx10<0x07e>; +defm DS_WRITE_B96 : DS_Real_gfx7_gfx10<0x0de>; +defm DS_WRITE_B128 : DS_Real_gfx7_gfx10<0x0df>; +defm DS_READ_B96 : DS_Real_gfx7_gfx10<0x0fe>; +defm DS_READ_B128 : DS_Real_gfx7_gfx10<0x0ff>; + +//===----------------------------------------------------------------------===// +// GFX6, GFX7, GFX10. +//===----------------------------------------------------------------------===// + +let AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" in { + multiclass DS_Real_gfx6_gfx7<bits<8> op> { + def _gfx6_gfx7 : Base_DS_Real_gfx6_gfx7_gfx10<op, !cast<DS_Pseudo>(NAME), + SIEncodingFamily.SI>; + } +} // End AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" + +multiclass DS_Real_gfx6_gfx7_gfx10<bits<8> op> : + DS_Real_gfx6_gfx7<op>, DS_Real_gfx10<op>; + +defm DS_ADD_U32 : DS_Real_gfx6_gfx7_gfx10<0x000>; +defm DS_SUB_U32 : DS_Real_gfx6_gfx7_gfx10<0x001>; +defm DS_RSUB_U32 : DS_Real_gfx6_gfx7_gfx10<0x002>; +defm DS_INC_U32 : DS_Real_gfx6_gfx7_gfx10<0x003>; +defm DS_DEC_U32 : DS_Real_gfx6_gfx7_gfx10<0x004>; +defm DS_MIN_I32 : DS_Real_gfx6_gfx7_gfx10<0x005>; +defm DS_MAX_I32 : DS_Real_gfx6_gfx7_gfx10<0x006>; +defm DS_MIN_U32 : DS_Real_gfx6_gfx7_gfx10<0x007>; +defm DS_MAX_U32 : DS_Real_gfx6_gfx7_gfx10<0x008>; +defm DS_AND_B32 : DS_Real_gfx6_gfx7_gfx10<0x009>; +defm DS_OR_B32 : DS_Real_gfx6_gfx7_gfx10<0x00a>; +defm DS_XOR_B32 : DS_Real_gfx6_gfx7_gfx10<0x00b>; +defm DS_MSKOR_B32 : DS_Real_gfx6_gfx7_gfx10<0x00c>; +defm DS_WRITE_B32 : DS_Real_gfx6_gfx7_gfx10<0x00d>; +defm DS_WRITE2_B32 : DS_Real_gfx6_gfx7_gfx10<0x00e>; +defm DS_WRITE2ST64_B32 : DS_Real_gfx6_gfx7_gfx10<0x00f>; +defm DS_CMPST_B32 : DS_Real_gfx6_gfx7_gfx10<0x010>; +defm DS_CMPST_F32 : DS_Real_gfx6_gfx7_gfx10<0x011>; +defm DS_MIN_F32 : DS_Real_gfx6_gfx7_gfx10<0x012>; +defm DS_MAX_F32 : DS_Real_gfx6_gfx7_gfx10<0x013>; +defm DS_NOP : DS_Real_gfx6_gfx7_gfx10<0x014>; +defm DS_GWS_INIT : DS_Real_gfx6_gfx7_gfx10<0x019>; +defm DS_GWS_SEMA_V : DS_Real_gfx6_gfx7_gfx10<0x01a>; +defm DS_GWS_SEMA_BR : DS_Real_gfx6_gfx7_gfx10<0x01b>; +defm DS_GWS_SEMA_P : DS_Real_gfx6_gfx7_gfx10<0x01c>; +defm DS_GWS_BARRIER : DS_Real_gfx6_gfx7_gfx10<0x01d>; +defm DS_WRITE_B8 : DS_Real_gfx6_gfx7_gfx10<0x01e>; +defm DS_WRITE_B16 : DS_Real_gfx6_gfx7_gfx10<0x01f>; +defm DS_ADD_RTN_U32 : DS_Real_gfx6_gfx7_gfx10<0x020>; +defm DS_SUB_RTN_U32 : DS_Real_gfx6_gfx7_gfx10<0x021>; +defm DS_RSUB_RTN_U32 : DS_Real_gfx6_gfx7_gfx10<0x022>; +defm DS_INC_RTN_U32 : DS_Real_gfx6_gfx7_gfx10<0x023>; +defm DS_DEC_RTN_U32 : DS_Real_gfx6_gfx7_gfx10<0x024>; +defm DS_MIN_RTN_I32 : DS_Real_gfx6_gfx7_gfx10<0x025>; +defm DS_MAX_RTN_I32 : DS_Real_gfx6_gfx7_gfx10<0x026>; +defm DS_MIN_RTN_U32 : DS_Real_gfx6_gfx7_gfx10<0x027>; +defm DS_MAX_RTN_U32 : DS_Real_gfx6_gfx7_gfx10<0x028>; +defm DS_AND_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x029>; +defm DS_OR_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x02a>; +defm DS_XOR_RTN_B32 : 
DS_Real_gfx6_gfx7_gfx10<0x02b>; +defm DS_MSKOR_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x02c>; +defm DS_WRXCHG_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x02d>; +defm DS_WRXCHG2_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x02e>; +defm DS_WRXCHG2ST64_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x02f>; +defm DS_CMPST_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x030>; +defm DS_CMPST_RTN_F32 : DS_Real_gfx6_gfx7_gfx10<0x031>; +defm DS_MIN_RTN_F32 : DS_Real_gfx6_gfx7_gfx10<0x032>; +defm DS_MAX_RTN_F32 : DS_Real_gfx6_gfx7_gfx10<0x033>; +defm DS_SWIZZLE_B32 : DS_Real_gfx6_gfx7_gfx10<0x035>; +defm DS_READ_B32 : DS_Real_gfx6_gfx7_gfx10<0x036>; +defm DS_READ2_B32 : DS_Real_gfx6_gfx7_gfx10<0x037>; +defm DS_READ2ST64_B32 : DS_Real_gfx6_gfx7_gfx10<0x038>; +defm DS_READ_I8 : DS_Real_gfx6_gfx7_gfx10<0x039>; +defm DS_READ_U8 : DS_Real_gfx6_gfx7_gfx10<0x03a>; +defm DS_READ_I16 : DS_Real_gfx6_gfx7_gfx10<0x03b>; +defm DS_READ_U16 : DS_Real_gfx6_gfx7_gfx10<0x03c>; +defm DS_CONSUME : DS_Real_gfx6_gfx7_gfx10<0x03d>; +defm DS_APPEND : DS_Real_gfx6_gfx7_gfx10<0x03e>; +defm DS_ORDERED_COUNT : DS_Real_gfx6_gfx7_gfx10<0x03f>; +defm DS_ADD_U64 : DS_Real_gfx6_gfx7_gfx10<0x040>; +defm DS_SUB_U64 : DS_Real_gfx6_gfx7_gfx10<0x041>; +defm DS_RSUB_U64 : DS_Real_gfx6_gfx7_gfx10<0x042>; +defm DS_INC_U64 : DS_Real_gfx6_gfx7_gfx10<0x043>; +defm DS_DEC_U64 : DS_Real_gfx6_gfx7_gfx10<0x044>; +defm DS_MIN_I64 : DS_Real_gfx6_gfx7_gfx10<0x045>; +defm DS_MAX_I64 : DS_Real_gfx6_gfx7_gfx10<0x046>; +defm DS_MIN_U64 : DS_Real_gfx6_gfx7_gfx10<0x047>; +defm DS_MAX_U64 : DS_Real_gfx6_gfx7_gfx10<0x048>; +defm DS_AND_B64 : DS_Real_gfx6_gfx7_gfx10<0x049>; +defm DS_OR_B64 : DS_Real_gfx6_gfx7_gfx10<0x04a>; +defm DS_XOR_B64 : DS_Real_gfx6_gfx7_gfx10<0x04b>; +defm DS_MSKOR_B64 : DS_Real_gfx6_gfx7_gfx10<0x04c>; +defm DS_WRITE_B64 : DS_Real_gfx6_gfx7_gfx10<0x04d>; +defm DS_WRITE2_B64 : DS_Real_gfx6_gfx7_gfx10<0x04e>; +defm DS_WRITE2ST64_B64 : DS_Real_gfx6_gfx7_gfx10<0x04f>; +defm DS_CMPST_B64 : DS_Real_gfx6_gfx7_gfx10<0x050>; +defm DS_CMPST_F64 : DS_Real_gfx6_gfx7_gfx10<0x051>; +defm DS_MIN_F64 : DS_Real_gfx6_gfx7_gfx10<0x052>; +defm DS_MAX_F64 : DS_Real_gfx6_gfx7_gfx10<0x053>; +defm DS_ADD_RTN_U64 : DS_Real_gfx6_gfx7_gfx10<0x060>; +defm DS_SUB_RTN_U64 : DS_Real_gfx6_gfx7_gfx10<0x061>; +defm DS_RSUB_RTN_U64 : DS_Real_gfx6_gfx7_gfx10<0x062>; +defm DS_INC_RTN_U64 : DS_Real_gfx6_gfx7_gfx10<0x063>; +defm DS_DEC_RTN_U64 : DS_Real_gfx6_gfx7_gfx10<0x064>; +defm DS_MIN_RTN_I64 : DS_Real_gfx6_gfx7_gfx10<0x065>; +defm DS_MAX_RTN_I64 : DS_Real_gfx6_gfx7_gfx10<0x066>; +defm DS_MIN_RTN_U64 : DS_Real_gfx6_gfx7_gfx10<0x067>; +defm DS_MAX_RTN_U64 : DS_Real_gfx6_gfx7_gfx10<0x068>; +defm DS_AND_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x069>; +defm DS_OR_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x06a>; +defm DS_XOR_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x06b>; +defm DS_MSKOR_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x06c>; +defm DS_WRXCHG_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x06d>; +defm DS_WRXCHG2_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x06e>; +defm DS_WRXCHG2ST64_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x06f>; +defm DS_CMPST_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x070>; +defm DS_CMPST_RTN_F64 : DS_Real_gfx6_gfx7_gfx10<0x071>; +defm DS_MIN_RTN_F64 : DS_Real_gfx6_gfx7_gfx10<0x072>; +defm DS_MAX_RTN_F64 : DS_Real_gfx6_gfx7_gfx10<0x073>; +defm DS_READ_B64 : DS_Real_gfx6_gfx7_gfx10<0x076>; +defm DS_READ2_B64 : DS_Real_gfx6_gfx7_gfx10<0x077>; +defm DS_READ2ST64_B64 : DS_Real_gfx6_gfx7_gfx10<0x078>; +defm DS_ADD_SRC2_U32 : DS_Real_gfx6_gfx7_gfx10<0x080>; +defm DS_SUB_SRC2_U32 : DS_Real_gfx6_gfx7_gfx10<0x081>; +defm DS_RSUB_SRC2_U32 : 
DS_Real_gfx6_gfx7_gfx10<0x082>; +defm DS_INC_SRC2_U32 : DS_Real_gfx6_gfx7_gfx10<0x083>; +defm DS_DEC_SRC2_U32 : DS_Real_gfx6_gfx7_gfx10<0x084>; +defm DS_MIN_SRC2_I32 : DS_Real_gfx6_gfx7_gfx10<0x085>; +defm DS_MAX_SRC2_I32 : DS_Real_gfx6_gfx7_gfx10<0x086>; +defm DS_MIN_SRC2_U32 : DS_Real_gfx6_gfx7_gfx10<0x087>; +defm DS_MAX_SRC2_U32 : DS_Real_gfx6_gfx7_gfx10<0x088>; +defm DS_AND_SRC2_B32 : DS_Real_gfx6_gfx7_gfx10<0x089>; +defm DS_OR_SRC2_B32 : DS_Real_gfx6_gfx7_gfx10<0x08a>; +defm DS_XOR_SRC2_B32 : DS_Real_gfx6_gfx7_gfx10<0x08b>; +defm DS_WRITE_SRC2_B32 : DS_Real_gfx6_gfx7_gfx10<0x08d>; +defm DS_MIN_SRC2_F32 : DS_Real_gfx6_gfx7_gfx10<0x092>; +defm DS_MAX_SRC2_F32 : DS_Real_gfx6_gfx7_gfx10<0x093>; +defm DS_ADD_SRC2_U64 : DS_Real_gfx6_gfx7_gfx10<0x0c0>; +defm DS_SUB_SRC2_U64 : DS_Real_gfx6_gfx7_gfx10<0x0c1>; +defm DS_RSUB_SRC2_U64 : DS_Real_gfx6_gfx7_gfx10<0x0c2>; +defm DS_INC_SRC2_U64 : DS_Real_gfx6_gfx7_gfx10<0x0c3>; +defm DS_DEC_SRC2_U64 : DS_Real_gfx6_gfx7_gfx10<0x0c4>; +defm DS_MIN_SRC2_I64 : DS_Real_gfx6_gfx7_gfx10<0x0c5>; +defm DS_MAX_SRC2_I64 : DS_Real_gfx6_gfx7_gfx10<0x0c6>; +defm DS_MIN_SRC2_U64 : DS_Real_gfx6_gfx7_gfx10<0x0c7>; +defm DS_MAX_SRC2_U64 : DS_Real_gfx6_gfx7_gfx10<0x0c8>; +defm DS_AND_SRC2_B64 : DS_Real_gfx6_gfx7_gfx10<0x0c9>; +defm DS_OR_SRC2_B64 : DS_Real_gfx6_gfx7_gfx10<0x0ca>; +defm DS_XOR_SRC2_B64 : DS_Real_gfx6_gfx7_gfx10<0x0cb>; +defm DS_WRITE_SRC2_B64 : DS_Real_gfx6_gfx7_gfx10<0x0cd>; +defm DS_MIN_SRC2_F64 : DS_Real_gfx6_gfx7_gfx10<0x0d2>; +defm DS_MAX_SRC2_F64 : DS_Real_gfx6_gfx7_gfx10<0x0d3>; //===----------------------------------------------------------------------===// -// VIInstructions.td +// GFX8, GFX9 (VI). //===----------------------------------------------------------------------===// class DS_Real_vi <bits<8> op, DS_Pseudo ds> : DS_Real <ds>, SIMCInstr <ds.Mnemonic, SIEncodingFamily.VI> { - let AssemblerPredicates = [isVI]; - let DecoderNamespace="VI"; + let AssemblerPredicates = [isGFX8GFX9]; + let DecoderNamespace = "GFX8"; // encoding let Inst{7-0} = !if(ds.has_offset0, offset0, 0); @@ -1008,7 +1054,7 @@ class DS_Real_vi <bits<8> op, DS_Pseudo ds> : let Inst{16} = !if(ds.has_gds, gds, ds.gdsValue); let Inst{24-17} = op; let Inst{31-26} = 0x36; // ds prefix - let Inst{39-32} = !if(ds.has_addr, addr, 0); + let Inst{39-32} = !if(ds.has_addr, addr, !if(ds.has_gws_data0, data0, 0)); let Inst{47-40} = !if(ds.has_data0, data0, 0); let Inst{55-48} = !if(ds.has_data1, data1, 0); let Inst{63-56} = !if(ds.has_vdst, vdst, 0); diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index f3de903f21b2..4ec4be9bc485 100644 --- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -1,9 +1,8 @@ //===- AMDGPUDisassembler.cpp - Disassembler for AMDGPU ISA ---------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -22,13 +21,14 @@ #include "AMDGPURegisterInfo.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIDefines.h" -#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "TargetInfo/AMDGPUTargetInfo.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm-c/Disassembler.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/Twine.h" #include "llvm/BinaryFormat/ELF.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCDisassembler/MCDisassembler.h" #include "llvm/MC/MCExpr.h" @@ -52,8 +52,22 @@ using namespace llvm; #define DEBUG_TYPE "amdgpu-disassembler" +#define SGPR_MAX (isGFX10() ? AMDGPU::EncValues::SGPR_MAX_GFX10 \ + : AMDGPU::EncValues::SGPR_MAX_SI) + using DecodeStatus = llvm::MCDisassembler::DecodeStatus; +AMDGPUDisassembler::AMDGPUDisassembler(const MCSubtargetInfo &STI, + MCContext &Ctx, + MCInstrInfo const *MCII) : + MCDisassembler(STI, Ctx), MCII(MCII), MRI(*Ctx.getRegisterInfo()), + TargetMaxInstBytes(Ctx.getAsmInfo()->getMaxInstLength(&STI)) { + + // ToDo: AMDGPUDisassembler supports only VI ISA. + if (!STI.getFeatureBits()[AMDGPU::FeatureGCN3Encoding] && !isGFX10()) + report_fatal_error("Disassembly not yet supported for subtarget"); +} + inline static MCDisassembler::DecodeStatus addOperand(MCInst &Inst, const MCOperand& Opnd) { Inst.addOperand(Opnd); @@ -77,6 +91,8 @@ static DecodeStatus decodeSoppBrTarget(MCInst &Inst, unsigned Imm, uint64_t Addr, const void *Decoder) { auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); + // Our branches take a simm16, but we need two extra bits to account for the + // factor of 4. APInt SignedOffset(18, Imm * 4, true); int64_t Offset = (SignedOffset.sext(64) + 4 + Addr).getSExtValue(); @@ -85,6 +101,12 @@ static DecodeStatus decodeSoppBrTarget(MCInst &Inst, unsigned Imm, return addOperand(Inst, MCOperand::createImm(Imm)); } +static DecodeStatus decodeBoolReg(MCInst &Inst, unsigned Val, + uint64_t Addr, const void *Decoder) { + auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); + return addOperand(Inst, DAsm->decodeBoolReg(Val)); +} + #define DECODE_OPERAND(StaticDecoderName, DecoderName) \ static DecodeStatus StaticDecoderName(MCInst &Inst, \ unsigned Imm, \ @@ -98,6 +120,7 @@ static DecodeStatus StaticDecoderName(MCInst &Inst, \ DECODE_OPERAND(Decode##RegClass##RegisterClass, decodeOperand_##RegClass) DECODE_OPERAND_REG(VGPR_32) +DECODE_OPERAND_REG(VRegOrLds_32) DECODE_OPERAND_REG(VS_32) DECODE_OPERAND_REG(VS_64) DECODE_OPERAND_REG(VS_128) @@ -109,12 +132,20 @@ DECODE_OPERAND_REG(VReg_128) DECODE_OPERAND_REG(SReg_32) DECODE_OPERAND_REG(SReg_32_XM0_XEXEC) DECODE_OPERAND_REG(SReg_32_XEXEC_HI) +DECODE_OPERAND_REG(SRegOrLds_32) DECODE_OPERAND_REG(SReg_64) DECODE_OPERAND_REG(SReg_64_XEXEC) DECODE_OPERAND_REG(SReg_128) DECODE_OPERAND_REG(SReg_256) DECODE_OPERAND_REG(SReg_512) +DECODE_OPERAND_REG(AGPR_32) +DECODE_OPERAND_REG(AReg_128) +DECODE_OPERAND_REG(AReg_512) +DECODE_OPERAND_REG(AReg_1024) +DECODE_OPERAND_REG(AV_32) +DECODE_OPERAND_REG(AV_64) + static DecodeStatus decodeOperand_VSrc16(MCInst &Inst, unsigned Imm, uint64_t Addr, @@ -131,6 +162,62 @@ static DecodeStatus decodeOperand_VSrcV216(MCInst &Inst, return addOperand(Inst, DAsm->decodeOperand_VSrcV216(Imm)); } +static DecodeStatus decodeOperand_VS_16(MCInst &Inst, + unsigned Imm, + uint64_t Addr, + const void *Decoder) { + auto DAsm = static_cast<const 
AMDGPUDisassembler*>(Decoder); + return addOperand(Inst, DAsm->decodeOperand_VSrc16(Imm)); +} + +static DecodeStatus decodeOperand_VS_32(MCInst &Inst, + unsigned Imm, + uint64_t Addr, + const void *Decoder) { + auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); + return addOperand(Inst, DAsm->decodeOperand_VS_32(Imm)); +} + +static DecodeStatus decodeOperand_AReg_128(MCInst &Inst, + unsigned Imm, + uint64_t Addr, + const void *Decoder) { + auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); + return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW128, Imm | 512)); +} + +static DecodeStatus decodeOperand_AReg_512(MCInst &Inst, + unsigned Imm, + uint64_t Addr, + const void *Decoder) { + auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); + return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW512, Imm | 512)); +} + +static DecodeStatus decodeOperand_AReg_1024(MCInst &Inst, + unsigned Imm, + uint64_t Addr, + const void *Decoder) { + auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); + return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW1024, Imm | 512)); +} + +static DecodeStatus decodeOperand_SReg_32(MCInst &Inst, + unsigned Imm, + uint64_t Addr, + const void *Decoder) { + auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); + return addOperand(Inst, DAsm->decodeOperand_SReg_32(Imm)); +} + +static DecodeStatus decodeOperand_VGPR_32(MCInst &Inst, + unsigned Imm, + uint64_t Addr, + const void *Decoder) { + auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); + return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW32, Imm)); +} + #define DECODE_SDWA(DecName) \ DECODE_OPERAND(decodeSDWA##DecName, decodeSDWA##DecName) @@ -168,6 +255,16 @@ DecodeStatus AMDGPUDisassembler::tryDecodeInst(const uint8_t* Table, return MCDisassembler::Fail; } +static bool isValidDPP8(const MCInst &MI) { + using namespace llvm::AMDGPU::DPP; + int FiIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::fi); + assert(FiIdx != -1); + if ((unsigned)FiIdx >= MI.getNumOperands()) + return false; + unsigned Fi = MI.getOperand(FiIdx).getImm(); + return Fi == DPP8_FI_0 || Fi == DPP8_FI_1; +} + DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, ArrayRef<uint8_t> Bytes_, uint64_t Address, @@ -176,11 +273,7 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, CommentStream = &CS; bool IsSDWA = false; - // ToDo: AMDGPUDisassembler supports only VI ISA. 
- if (!STI.getFeatureBits()[AMDGPU::FeatureGCN3Encoding]) - report_fatal_error("Disassembly not yet supported for subtarget"); - - const unsigned MaxInstBytesNum = (std::min)((size_t)8, Bytes_.size()); + unsigned MaxInstBytesNum = std::min((size_t)TargetMaxInstBytes, Bytes_.size()); Bytes = Bytes_.slice(0, MaxInstBytesNum); DecodeStatus Res = MCDisassembler::Fail; @@ -192,6 +285,13 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, // encodings if (Bytes.size() >= 8) { const uint64_t QW = eatBytes<uint64_t>(Bytes); + + Res = tryDecodeInst(DecoderTableDPP864, MI, QW, Address); + if (Res && convertDPP8Inst(MI) == MCDisassembler::Success) + break; + + MI = MCInst(); // clear + Res = tryDecodeInst(DecoderTableDPP64, MI, QW, Address); if (Res) break; @@ -201,6 +301,18 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, Res = tryDecodeInst(DecoderTableSDWA964, MI, QW, Address); if (Res) { IsSDWA = true; break; } + Res = tryDecodeInst(DecoderTableSDWA1064, MI, QW, Address); + if (Res) { IsSDWA = true; break; } + + // Some GFX9 subtargets repurposed the v_mad_mix_f32, v_mad_mixlo_f16 and + // v_mad_mixhi_f16 for FMA variants. Try to decode using this special + // table first so we print the correct name. + + if (STI.getFeatureBits()[AMDGPU::FeatureFmaMixInsts]) { + Res = tryDecodeInst(DecoderTableGFX9_DL64, MI, QW, Address); + if (Res) break; + } + if (STI.getFeatureBits()[AMDGPU::FeatureUnpackedD16VMem]) { Res = tryDecodeInst(DecoderTableGFX80_UNPACKED64, MI, QW, Address); if (Res) @@ -223,7 +335,7 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, // Try decode 32-bit instruction if (Bytes.size() < 4) break; const uint32_t DW = eatBytes<uint32_t>(Bytes); - Res = tryDecodeInst(DecoderTableVI32, MI, DW, Address); + Res = tryDecodeInst(DecoderTableGFX832, MI, DW, Address); if (Res) break; Res = tryDecodeInst(DecoderTableAMDGPU32, MI, DW, Address); @@ -232,33 +344,84 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, Res = tryDecodeInst(DecoderTableGFX932, MI, DW, Address); if (Res) break; + Res = tryDecodeInst(DecoderTableGFX1032, MI, DW, Address); + if (Res) break; + if (Bytes.size() < 4) break; const uint64_t QW = ((uint64_t)eatBytes<uint32_t>(Bytes) << 32) | DW; - Res = tryDecodeInst(DecoderTableVI64, MI, QW, Address); + Res = tryDecodeInst(DecoderTableGFX864, MI, QW, Address); if (Res) break; Res = tryDecodeInst(DecoderTableAMDGPU64, MI, QW, Address); if (Res) break; Res = tryDecodeInst(DecoderTableGFX964, MI, QW, Address); + if (Res) break; + + Res = tryDecodeInst(DecoderTableGFX1064, MI, QW, Address); } while (false); + if (Res && (MaxInstBytesNum - Bytes.size()) == 12 && (!HasLiteral || + !(MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3))) { + MaxInstBytesNum = 8; + Bytes = Bytes_.slice(0, MaxInstBytesNum); + eatBytes<uint64_t>(Bytes); + } + if (Res && (MI.getOpcode() == AMDGPU::V_MAC_F32_e64_vi || - MI.getOpcode() == AMDGPU::V_MAC_F32_e64_si || + MI.getOpcode() == AMDGPU::V_MAC_F32_e64_gfx6_gfx7 || + MI.getOpcode() == AMDGPU::V_MAC_F32_e64_gfx10 || MI.getOpcode() == AMDGPU::V_MAC_F16_e64_vi || - MI.getOpcode() == AMDGPU::V_FMAC_F32_e64_vi)) { + MI.getOpcode() == AMDGPU::V_FMAC_F32_e64_vi || + MI.getOpcode() == AMDGPU::V_FMAC_F32_e64_gfx10 || + MI.getOpcode() == AMDGPU::V_FMAC_F16_e64_gfx10)) { // Insert dummy unused src2_modifiers. 
insertNamedMCOperand(MI, MCOperand::createImm(0),
                         AMDGPU::OpName::src2_modifiers);
  }

  if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::MIMG)) {
-    Res = convertMIMGInst(MI);
+    int VAddr0Idx =
+        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
+    int RsrcIdx =
+        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
+    unsigned NSAArgs = RsrcIdx - VAddr0Idx - 1;
+    if (VAddr0Idx >= 0 && NSAArgs > 0) {
+      unsigned NSAWords = (NSAArgs + 3) / 4;
+      if (Bytes.size() < 4 * NSAWords) {
+        Res = MCDisassembler::Fail;
+      } else {
+        for (unsigned i = 0; i < NSAArgs; ++i) {
+          MI.insert(MI.begin() + VAddr0Idx + 1 + i,
+                    decodeOperand_VGPR_32(Bytes[i]));
+        }
+        Bytes = Bytes.slice(4 * NSAWords);
+      }
+    }
+
+    if (Res)
+      Res = convertMIMGInst(MI);
  }

  if (Res && IsSDWA)
    Res = convertSDWAInst(MI);

+  int VDstIn_Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+                                              AMDGPU::OpName::vdst_in);
+  if (VDstIn_Idx != -1) {
+    int Tied = MCII->get(MI.getOpcode()).getOperandConstraint(VDstIn_Idx,
+                             MCOI::OperandConstraint::TIED_TO);
+    if (Tied != -1 && (MI.getNumOperands() <= (unsigned)VDstIn_Idx ||
+                       !MI.getOperand(VDstIn_Idx).isReg() ||
+                       MI.getOperand(VDstIn_Idx).getReg() != MI.getOperand(Tied).getReg())) {
+      if (MI.getNumOperands() > (unsigned)VDstIn_Idx)
+        MI.erase(&MI.getOperand(VDstIn_Idx));
+      insertNamedMCOperand(MI,
+        MCOperand::createReg(MI.getOperand(Tied).getReg()),
+        AMDGPU::OpName::vdst_in);
+    }
+  }
+
  // if the opcode was not recognized we'll assume a Size of 4 bytes
  // (unless there are fewer bytes left)
  Size = Res ? (MaxInstBytesNum - Bytes.size())
@@ -267,7 +430,8 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
}

DecodeStatus AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const {
-  if (STI.getFeatureBits()[AMDGPU::FeatureGFX9]) {
+  if (STI.getFeatureBits()[AMDGPU::FeatureGFX9] ||
+      STI.getFeatureBits()[AMDGPU::FeatureGFX10]) {
    if (AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sdst) != -1)
      // VOPC - insert clamp
      insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::clamp);
@@ -285,9 +449,27 @@ DecodeStatus AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const {
  return MCDisassembler::Success;
}

-// Note that MIMG format provides no information about VADDR size.
-// Consequently, decoded instructions always show address
-// as if it has 1 dword, which could be not really so.
+DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const {
+  unsigned Opc = MI.getOpcode();
+  unsigned DescNumOps = MCII->get(Opc).getNumOperands();
+
+  // Insert dummy unused src modifiers.
+  if (MI.getNumOperands() < DescNumOps &&
+      AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers) != -1)
+    insertNamedMCOperand(MI, MCOperand::createImm(0),
+                         AMDGPU::OpName::src0_modifiers);
+
+  if (MI.getNumOperands() < DescNumOps &&
+      AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers) != -1)
+    insertNamedMCOperand(MI, MCOperand::createImm(0),
+                         AMDGPU::OpName::src1_modifiers);
+
+  return isValidDPP8(MI) ? MCDisassembler::Success : MCDisassembler::SoftFail;
+}
+
+// Note that before gfx10, the MIMG encoding provided no information about
+// VADDR size. Consequently, decoded instructions always show address as if it
+// has 1 dword, which may not actually be the case.
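Before reading the function itself, the dword arithmetic it performs can be sketched in isolation. The helper names below are invented for illustration, not LLVM API; only the GFX10 non-NSA rounding rule (more than 8 address dwords round up to 16, more than 4 round up to 8) and the dmask popcount clamp are taken from the code that follows:

#include <algorithm>
#include <bitset>
#include <cstdio>

// Invented helpers mirroring convertMIMGInst's size computations.
static unsigned roundVAddrDwords(unsigned AddrSize) {
  if (AddrSize > 8)
    return 16;      // 9..16 address dwords use the 16-dword encoding
  if (AddrSize > 4)
    return 8;       // 5..8 address dwords use the 8-dword encoding
  return AddrSize;  // 1..4 address dwords are encoded exactly
}

static unsigned vdataDwords(unsigned DMask, bool IsGather4) {
  if (IsGather4)
    return 4;  // gather4 always writes four channels
  // Otherwise one dword per enabled dmask channel, clamped to at least one.
  unsigned Channels = std::bitset<4>(DMask & 0xfu).count();
  return std::max(Channels, 1u);
}

int main() {
  std::printf("dmask=0b0101 -> %u vdata dwords\n", vdataDwords(0x5, false));
  std::printf("6 addr dwords -> encoded as %u non-NSA dwords\n",
              roundVAddrDwords(6));
  return 0;
}

When the computed sizes already match the opcode's VDataDwords and VAddrDwords, the instruction is left as decoded; otherwise the function looks up the sibling opcode with matching widths and widens the registers to match.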
DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const { int VDstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), @@ -295,7 +477,8 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const { int VDataIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata); - + int VAddr0Idx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0); int DMaskIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dmask); @@ -308,16 +491,42 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const { assert(DMaskIdx != -1); assert(TFEIdx != -1); + const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode()); bool IsAtomic = (VDstIdx != -1); bool IsGather4 = MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::Gather4; - unsigned DMask = MI.getOperand(DMaskIdx).getImm() & 0xf; - if (DMask == 0) - return MCDisassembler::Success; + bool IsNSA = false; + unsigned AddrSize = Info->VAddrDwords; + + if (STI.getFeatureBits()[AMDGPU::FeatureGFX10]) { + unsigned DimIdx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dim); + const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = + AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode); + const AMDGPU::MIMGDimInfo *Dim = + AMDGPU::getMIMGDimInfoByEncoding(MI.getOperand(DimIdx).getImm()); + + AddrSize = BaseOpcode->NumExtraArgs + + (BaseOpcode->Gradients ? Dim->NumGradients : 0) + + (BaseOpcode->Coordinates ? Dim->NumCoords : 0) + + (BaseOpcode->LodOrClampOrMip ? 1 : 0); + IsNSA = Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA; + if (!IsNSA) { + if (AddrSize > 8) + AddrSize = 16; + else if (AddrSize > 4) + AddrSize = 8; + } else { + if (AddrSize > Info->VAddrDwords) { + // The NSA encoding does not contain enough operands for the combination + // of base opcode / dimension. Should this be an error? + return MCDisassembler::Success; + } + } + } - unsigned DstSize = IsGather4 ? 4 : countPopulation(DMask); - if (DstSize == 1) - return MCDisassembler::Success; + unsigned DMask = MI.getOperand(DMaskIdx).getImm() & 0xf; + unsigned DstSize = IsGather4 ? 4 : std::max(countPopulation(DMask), 1u); bool D16 = D16Idx >= 0 && MI.getOperand(D16Idx).getImm(); if (D16 && AMDGPU::hasPackedD16(STI)) { @@ -328,44 +537,64 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const { if (MI.getOperand(TFEIdx).getImm()) return MCDisassembler::Success; - int NewOpcode = -1; + if (DstSize == Info->VDataDwords && AddrSize == Info->VAddrDwords) + return MCDisassembler::Success; + + int NewOpcode = + AMDGPU::getMIMGOpcode(Info->BaseOpcode, Info->MIMGEncoding, DstSize, AddrSize); + if (NewOpcode == -1) + return MCDisassembler::Success; - if (IsGather4) { - if (D16 && AMDGPU::hasPackedD16(STI)) - NewOpcode = AMDGPU::getMaskedMIMGOp(MI.getOpcode(), 2); - else + // Widen the register to the correct number of enabled channels. + unsigned NewVdata = AMDGPU::NoRegister; + if (DstSize != Info->VDataDwords) { + auto DataRCID = MCII->get(NewOpcode).OpInfo[VDataIdx].RegClass; + + // Get first subregister of VData + unsigned Vdata0 = MI.getOperand(VDataIdx).getReg(); + unsigned VdataSub0 = MRI.getSubReg(Vdata0, AMDGPU::sub0); + Vdata0 = (VdataSub0 != 0)? VdataSub0 : Vdata0; + + NewVdata = MRI.getMatchingSuperReg(Vdata0, AMDGPU::sub0, + &MRI.getRegClass(DataRCID)); + if (NewVdata == AMDGPU::NoRegister) { + // It's possible to encode this such that the low register + enabled + // components exceeds the register count. 
return MCDisassembler::Success; - } else { - NewOpcode = AMDGPU::getMaskedMIMGOp(MI.getOpcode(), DstSize); - if (NewOpcode == -1) + } + } + + unsigned NewVAddr0 = AMDGPU::NoRegister; + if (STI.getFeatureBits()[AMDGPU::FeatureGFX10] && !IsNSA && + AddrSize != Info->VAddrDwords) { + unsigned VAddr0 = MI.getOperand(VAddr0Idx).getReg(); + unsigned VAddrSub0 = MRI.getSubReg(VAddr0, AMDGPU::sub0); + VAddr0 = (VAddrSub0 != 0) ? VAddrSub0 : VAddr0; + + auto AddrRCID = MCII->get(NewOpcode).OpInfo[VAddr0Idx].RegClass; + NewVAddr0 = MRI.getMatchingSuperReg(VAddr0, AMDGPU::sub0, + &MRI.getRegClass(AddrRCID)); + if (NewVAddr0 == AMDGPU::NoRegister) return MCDisassembler::Success; } - auto RCID = MCII->get(NewOpcode).OpInfo[VDataIdx].RegClass; + MI.setOpcode(NewOpcode); - // Get first subregister of VData - unsigned Vdata0 = MI.getOperand(VDataIdx).getReg(); - unsigned VdataSub0 = MRI.getSubReg(Vdata0, AMDGPU::sub0); - Vdata0 = (VdataSub0 != 0)? VdataSub0 : Vdata0; + if (NewVdata != AMDGPU::NoRegister) { + MI.getOperand(VDataIdx) = MCOperand::createReg(NewVdata); - // Widen the register to the correct number of enabled channels. - auto NewVdata = MRI.getMatchingSuperReg(Vdata0, AMDGPU::sub0, - &MRI.getRegClass(RCID)); - if (NewVdata == AMDGPU::NoRegister) { - // It's possible to encode this such that the low register + enabled - // components exceeds the register count. - return MCDisassembler::Success; + if (IsAtomic) { + // Atomic operations have an additional operand (a copy of data) + MI.getOperand(VDstIdx) = MCOperand::createReg(NewVdata); + } } - MI.setOpcode(NewOpcode); - // vaddr will be always appear as a single VGPR. This will look different than - // how it is usually emitted because the number of register components is not - // in the instruction encoding. 
- MI.getOperand(VDataIdx) = MCOperand::createReg(NewVdata); - - if (IsAtomic) { - // Atomic operations have an additional operand (a copy of data) - MI.getOperand(VDstIdx) = MCOperand::createReg(NewVdata); + if (NewVAddr0 != AMDGPU::NoRegister) { + MI.getOperand(VAddr0Idx) = MCOperand::createReg(NewVAddr0); + } else if (IsNSA) { + assert(AddrSize <= Info->VAddrDwords); + MI.erase(MI.begin() + VAddr0Idx + AddrSize, + MI.begin() + VAddr0Idx + Info->VAddrDwords); } return MCDisassembler::Success; @@ -470,6 +699,34 @@ MCOperand AMDGPUDisassembler::decodeOperand_VGPR_32(unsigned Val) const { return createRegOperand(AMDGPU::VGPR_32RegClassID, Val); } +MCOperand AMDGPUDisassembler::decodeOperand_VRegOrLds_32(unsigned Val) const { + return decodeSrcOp(OPW32, Val); +} + +MCOperand AMDGPUDisassembler::decodeOperand_AGPR_32(unsigned Val) const { + return createRegOperand(AMDGPU::AGPR_32RegClassID, Val & 255); +} + +MCOperand AMDGPUDisassembler::decodeOperand_AReg_128(unsigned Val) const { + return createRegOperand(AMDGPU::AReg_128RegClassID, Val & 255); +} + +MCOperand AMDGPUDisassembler::decodeOperand_AReg_512(unsigned Val) const { + return createRegOperand(AMDGPU::AReg_512RegClassID, Val & 255); +} + +MCOperand AMDGPUDisassembler::decodeOperand_AReg_1024(unsigned Val) const { + return createRegOperand(AMDGPU::AReg_1024RegClassID, Val & 255); +} + +MCOperand AMDGPUDisassembler::decodeOperand_AV_32(unsigned Val) const { + return decodeSrcOp(OPW32, Val); +} + +MCOperand AMDGPUDisassembler::decodeOperand_AV_64(unsigned Val) const { + return decodeSrcOp(OPW64, Val); +} + MCOperand AMDGPUDisassembler::decodeOperand_VReg_64(unsigned Val) const { return createRegOperand(AMDGPU::VReg_64RegClassID, Val); } @@ -482,6 +739,14 @@ MCOperand AMDGPUDisassembler::decodeOperand_VReg_128(unsigned Val) const { return createRegOperand(AMDGPU::VReg_128RegClassID, Val); } +MCOperand AMDGPUDisassembler::decodeOperand_VReg_256(unsigned Val) const { + return createRegOperand(AMDGPU::VReg_256RegClassID, Val); +} + +MCOperand AMDGPUDisassembler::decodeOperand_VReg_512(unsigned Val) const { + return createRegOperand(AMDGPU::VReg_512RegClassID, Val); +} + MCOperand AMDGPUDisassembler::decodeOperand_SReg_32(unsigned Val) const { // table-gen generated disassembler doesn't care about operand types // leaving only register class so SSrc_32 operand turns into SReg_32 @@ -501,6 +766,13 @@ MCOperand AMDGPUDisassembler::decodeOperand_SReg_32_XEXEC_HI( return decodeOperand_SReg_32(Val); } +MCOperand AMDGPUDisassembler::decodeOperand_SRegOrLds_32(unsigned Val) const { + // table-gen generated disassembler doesn't care about operand types + // leaving only register class so SSrc_32 operand turns into SReg_32 + // and therefore we accept immediates and literals here as well + return decodeSrcOp(OPW32, Val); +} + MCOperand AMDGPUDisassembler::decodeOperand_SReg_64(unsigned Val) const { return decodeSrcOp(OPW64, Val); } @@ -628,6 +900,9 @@ MCOperand AMDGPUDisassembler::decodeFPImmed(OpWidthTy Width, unsigned Imm) { // ToDo: case 248: 1/(2*PI) - is allowed only on VI switch (Width) { case OPW32: + case OPW128: // splat constants + case OPW512: + case OPW1024: return MCOperand::createImm(getInlineImmVal32(Imm)); case OPW64: return MCOperand::createImm(getInlineImmVal64(Imm)); @@ -654,6 +929,24 @@ unsigned AMDGPUDisassembler::getVgprClassId(const OpWidthTy Width) const { } } +unsigned AMDGPUDisassembler::getAgprClassId(const OpWidthTy Width) const { + using namespace AMDGPU; + + assert(OPW_FIRST_ <= Width && Width < OPW_LAST_); + switch
(Width) { + default: // fall + case OPW32: + case OPW16: + case OPWV216: + return AGPR_32RegClassID; + case OPW64: return AReg_64RegClassID; + case OPW128: return AReg_128RegClassID; + case OPW512: return AReg_512RegClassID; + case OPW1024: return AReg_1024RegClassID; + } +} + + unsigned AMDGPUDisassembler::getSgprClassId(const OpWidthTy Width) const { using namespace AMDGPU; @@ -691,8 +984,10 @@ unsigned AMDGPUDisassembler::getTtmpClassId(const OpWidthTy Width) const { int AMDGPUDisassembler::getTTmpIdx(unsigned Val) const { using namespace AMDGPU::EncValues; - unsigned TTmpMin = isGFX9() ? TTMP_GFX9_MIN : TTMP_VI_MIN; - unsigned TTmpMax = isGFX9() ? TTMP_GFX9_MAX : TTMP_VI_MAX; + unsigned TTmpMin = + (isGFX9() || isGFX10()) ? TTMP_GFX9_GFX10_MIN : TTMP_VI_MIN; + unsigned TTmpMax = + (isGFX9() || isGFX10()) ? TTMP_GFX9_GFX10_MAX : TTMP_VI_MAX; return (TTmpMin <= Val && Val <= TTmpMax)? Val - TTmpMin : -1; } @@ -700,10 +995,14 @@ int AMDGPUDisassembler::getTTmpIdx(unsigned Val) const { MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val) const { using namespace AMDGPU::EncValues; - assert(Val < 512); // enum9 + assert(Val < 1024); // enum10 + + bool IsAGPR = Val & 512; + Val &= 511; if (VGPR_MIN <= Val && Val <= VGPR_MAX) { - return createRegOperand(getVgprClassId(Width), Val - VGPR_MIN); + return createRegOperand(IsAGPR ? getAgprClassId(Width) + : getVgprClassId(Width), Val - VGPR_MIN); } if (Val <= SGPR_MAX) { assert(SGPR_MIN == 0); // "SGPR_MIN <= Val" is always true and causes compilation warning. @@ -765,23 +1064,23 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg32(unsigned Val) const { case 105: return createRegOperand(XNACK_MASK_HI); case 106: return createRegOperand(VCC_LO); case 107: return createRegOperand(VCC_HI); - case 108: assert(!isGFX9()); return createRegOperand(TBA_LO); - case 109: assert(!isGFX9()); return createRegOperand(TBA_HI); - case 110: assert(!isGFX9()); return createRegOperand(TMA_LO); - case 111: assert(!isGFX9()); return createRegOperand(TMA_HI); + case 108: return createRegOperand(TBA_LO); + case 109: return createRegOperand(TBA_HI); + case 110: return createRegOperand(TMA_LO); + case 111: return createRegOperand(TMA_HI); case 124: return createRegOperand(M0); + case 125: return createRegOperand(SGPR_NULL); case 126: return createRegOperand(EXEC_LO); case 127: return createRegOperand(EXEC_HI); case 235: return createRegOperand(SRC_SHARED_BASE); case 236: return createRegOperand(SRC_SHARED_LIMIT); case 237: return createRegOperand(SRC_PRIVATE_BASE); case 238: return createRegOperand(SRC_PRIVATE_LIMIT); - // TODO: SRC_POPS_EXITING_WAVE_ID - // ToDo: no support for vccz register - case 251: break; - // ToDo: no support for execz register - case 252: break; - case 253: return createRegOperand(SCC); + case 239: return createRegOperand(SRC_POPS_EXITING_WAVE_ID); + case 251: return createRegOperand(SRC_VCCZ); + case 252: return createRegOperand(SRC_EXECZ); + case 253: return createRegOperand(SRC_SCC); + case 254: return createRegOperand(LDS_DIRECT); default: break; } return errOperand(Val, "unknown operand encoding " + Twine(Val)); @@ -794,9 +1093,17 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg64(unsigned Val) const { case 102: return createRegOperand(FLAT_SCR); case 104: return createRegOperand(XNACK_MASK); case 106: return createRegOperand(VCC); - case 108: assert(!isGFX9()); return createRegOperand(TBA); - case 110: assert(!isGFX9()); return createRegOperand(TMA); + case 108: return createRegOperand(TBA); + case 110: return 
createRegOperand(TMA); case 126: return createRegOperand(EXEC); + case 235: return createRegOperand(SRC_SHARED_BASE); + case 236: return createRegOperand(SRC_SHARED_LIMIT); + case 237: return createRegOperand(SRC_PRIVATE_BASE); + case 238: return createRegOperand(SRC_PRIVATE_LIMIT); + case 239: return createRegOperand(SRC_POPS_EXITING_WAVE_ID); + case 251: return createRegOperand(SRC_VCCZ); + case 252: return createRegOperand(SRC_EXECZ); + case 253: return createRegOperand(SRC_SCC); default: break; } return errOperand(Val, "unknown operand encoding " + Twine(Val)); @@ -807,16 +1114,18 @@ MCOperand AMDGPUDisassembler::decodeSDWASrc(const OpWidthTy Width, using namespace AMDGPU::SDWA; using namespace AMDGPU::EncValues; - if (STI.getFeatureBits()[AMDGPU::FeatureGFX9]) { - // XXX: static_cast<int> is needed to avoid stupid warning: + if (STI.getFeatureBits()[AMDGPU::FeatureGFX9] || + STI.getFeatureBits()[AMDGPU::FeatureGFX10]) { + // XXX: cast to int is needed to avoid stupid warning: // compare with unsigned is always true - if (SDWA9EncValues::SRC_VGPR_MIN <= static_cast<int>(Val) && + if (int(SDWA9EncValues::SRC_VGPR_MIN) <= int(Val) && Val <= SDWA9EncValues::SRC_VGPR_MAX) { return createRegOperand(getVgprClassId(Width), Val - SDWA9EncValues::SRC_VGPR_MIN); } if (SDWA9EncValues::SRC_SGPR_MIN <= Val && - Val <= SDWA9EncValues::SRC_SGPR_MAX) { + Val <= (isGFX10() ? SDWA9EncValues::SRC_SGPR_MAX_GFX10 + : SDWA9EncValues::SRC_SGPR_MAX_SI)) { return createSRegOperand(getSgprClassId(Width), Val - SDWA9EncValues::SRC_SGPR_MIN); } @@ -852,24 +1161,34 @@ MCOperand AMDGPUDisassembler::decodeSDWASrc32(unsigned Val) const { MCOperand AMDGPUDisassembler::decodeSDWAVopcDst(unsigned Val) const { using namespace AMDGPU::SDWA; - assert(STI.getFeatureBits()[AMDGPU::FeatureGFX9] && - "SDWAVopcDst should be present only on GFX9"); + assert((STI.getFeatureBits()[AMDGPU::FeatureGFX9] || + STI.getFeatureBits()[AMDGPU::FeatureGFX10]) && + "SDWAVopcDst should be present only on GFX9+"); + + bool IsWave64 = STI.getFeatureBits()[AMDGPU::FeatureWavefrontSize64]; + if (Val & SDWA9EncValues::VOPC_DST_VCC_MASK) { Val &= SDWA9EncValues::VOPC_DST_SGPR_MASK; int TTmpIdx = getTTmpIdx(Val); if (TTmpIdx >= 0) { return createSRegOperand(getTtmpClassId(OPW64), TTmpIdx); - } else if (Val > AMDGPU::EncValues::SGPR_MAX) { - return decodeSpecialReg64(Val); + } else if (Val > SGPR_MAX) { + return IsWave64 ? decodeSpecialReg64(Val) + : decodeSpecialReg32(Val); } else { - return createSRegOperand(getSgprClassId(OPW64), Val); + return createSRegOperand(getSgprClassId(IsWave64 ? OPW64 : OPW32), Val); } } else { - return createRegOperand(AMDGPU::VCC); + return createRegOperand(IsWave64 ? AMDGPU::VCC : AMDGPU::VCC_LO); } } +MCOperand AMDGPUDisassembler::decodeBoolReg(unsigned Val) const { + return STI.getFeatureBits()[AMDGPU::FeatureWavefrontSize64] ? 
+ decodeOperand_SReg_64(Val) : decodeOperand_SReg_32(Val); +} + bool AMDGPUDisassembler::isVI() const { return STI.getFeatureBits()[AMDGPU::FeatureVolcanicIslands]; } @@ -878,6 +1197,10 @@ bool AMDGPUDisassembler::isGFX9() const { return STI.getFeatureBits()[AMDGPU::FeatureGFX9]; } +bool AMDGPUDisassembler::isGFX10() const { + return STI.getFeatureBits()[AMDGPU::FeatureGFX10]; +} + //===----------------------------------------------------------------------===// // AMDGPUSymbolizer //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h index 75cfc5e11282..c5eaba615c2a 100644 --- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -1,9 +1,8 @@ //===- AMDGPUDisassembler.hpp - Disassembler for AMDGPU ISA -----*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -42,15 +41,14 @@ class AMDGPUDisassembler : public MCDisassembler { private: std::unique_ptr<MCInstrInfo const> const MCII; const MCRegisterInfo &MRI; + const unsigned TargetMaxInstBytes; mutable ArrayRef<uint8_t> Bytes; mutable uint32_t Literal; mutable bool HasLiteral; public: AMDGPUDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx, - MCInstrInfo const *MCII) : - MCDisassembler(STI, Ctx), MCII(MCII), MRI(*Ctx.getRegisterInfo()) {} - + MCInstrInfo const *MCII); ~AMDGPUDisassembler() override = default; DecodeStatus getInstruction(MCInst &MI, uint64_t &Size, @@ -69,9 +67,12 @@ public: uint64_t Address) const; DecodeStatus convertSDWAInst(MCInst &MI) const; + DecodeStatus convertDPP8Inst(MCInst &MI) const; DecodeStatus convertMIMGInst(MCInst &MI) const; MCOperand decodeOperand_VGPR_32(unsigned Val) const; + MCOperand decodeOperand_VRegOrLds_32(unsigned Val) const; + MCOperand decodeOperand_VS_32(unsigned Val) const; MCOperand decodeOperand_VS_64(unsigned Val) const; MCOperand decodeOperand_VS_128(unsigned Val) const; @@ -81,22 +82,33 @@ public: MCOperand decodeOperand_VReg_64(unsigned Val) const; MCOperand decodeOperand_VReg_96(unsigned Val) const; MCOperand decodeOperand_VReg_128(unsigned Val) const; + MCOperand decodeOperand_VReg_256(unsigned Val) const; + MCOperand decodeOperand_VReg_512(unsigned Val) const; MCOperand decodeOperand_SReg_32(unsigned Val) const; MCOperand decodeOperand_SReg_32_XM0_XEXEC(unsigned Val) const; MCOperand decodeOperand_SReg_32_XEXEC_HI(unsigned Val) const; + MCOperand decodeOperand_SRegOrLds_32(unsigned Val) const; MCOperand decodeOperand_SReg_64(unsigned Val) const; MCOperand decodeOperand_SReg_64_XEXEC(unsigned Val) const; MCOperand decodeOperand_SReg_128(unsigned Val) const; MCOperand decodeOperand_SReg_256(unsigned Val) const; MCOperand decodeOperand_SReg_512(unsigned Val) const; + MCOperand decodeOperand_AGPR_32(unsigned Val) const; + MCOperand decodeOperand_AReg_128(unsigned Val) const; + MCOperand decodeOperand_AReg_512(unsigned Val) const; + MCOperand decodeOperand_AReg_1024(unsigned Val) const; + MCOperand decodeOperand_AV_32(unsigned Val) const; + MCOperand 
decodeOperand_AV_64(unsigned Val) const; + enum OpWidthTy { OPW32, OPW64, OPW128, OPW256, OPW512, + OPW1024, OPW16, OPWV216, OPW_LAST_, @@ -104,6 +116,7 @@ public: }; unsigned getVgprClassId(const OpWidthTy Width) const; + unsigned getAgprClassId(const OpWidthTy Width) const; unsigned getSgprClassId(const OpWidthTy Width) const; unsigned getTtmpClassId(const OpWidthTy Width) const; @@ -121,11 +134,14 @@ public: MCOperand decodeSDWASrc32(unsigned Val) const; MCOperand decodeSDWAVopcDst(unsigned Val) const; + MCOperand decodeBoolReg(unsigned Val) const; + int getTTmpIdx(unsigned Val) const; bool isVI() const; bool isGFX9() const; - }; + bool isGFX10() const; +}; //===----------------------------------------------------------------------===// // AMDGPUSymbolizer diff --git a/lib/Target/AMDGPU/EvergreenInstructions.td b/lib/Target/AMDGPU/EvergreenInstructions.td index 944f4ffe598d..0550092ce1d6 100644 --- a/lib/Target/AMDGPU/EvergreenInstructions.td +++ b/lib/Target/AMDGPU/EvergreenInstructions.td @@ -1,9 +1,8 @@ //===-- EvergreenInstructions.td - EG Instruction defs ----*- tablegen -*-===// // -// The LLVM Compiler Infrastructure - -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/FLATInstructions.td b/lib/Target/AMDGPU/FLATInstructions.td index 44040d352e6a..889f60dae920 100644 --- a/lib/Target/AMDGPU/FLATInstructions.td +++ b/lib/Target/AMDGPU/FLATInstructions.td @@ -1,17 +1,16 @@ //===-- FLATInstructions.td - FLAT Instruction Definitions -----------------===// // -// The LLVM Compiler Infrastructure - -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -def FLATAtomic : ComplexPattern<i64, 3, "SelectFlatAtomic", [], [], -10>; -def FLATOffset : ComplexPattern<i64, 3, "SelectFlatOffset<false>", [], [], -10>; +def FLATAtomic : ComplexPattern<i64, 3, "SelectFlatAtomic", [], [SDNPWantRoot], -10>; +def FLATOffset : ComplexPattern<i64, 3, "SelectFlatOffset<false>", [], [SDNPWantRoot], -10>; -def FLATOffsetSigned : ComplexPattern<i64, 3, "SelectFlatOffset<true>", [], [], -10>; -def FLATSignedAtomic : ComplexPattern<i64, 3, "SelectFlatAtomicSigned", [], [], -10>; +def FLATOffsetSigned : ComplexPattern<i64, 3, "SelectFlatOffset<true>", [], [SDNPWantRoot], -10>; +def FLATSignedAtomic : ComplexPattern<i64, 3, "SelectFlatAtomicSigned", [], [SDNPWantRoot], -10>; //===----------------------------------------------------------------------===// // FLAT classes @@ -52,6 +51,8 @@ class FLAT_Pseudo<string opName, dag outs, dag ins, bits<1> has_data = 1; bits<1> has_glc = 1; bits<1> glcValue = 0; + bits<1> has_dlc = 1; + bits<1> dlcValue = 0; let SubtargetPredicate = !if(is_flat_global, HasFlatGlobalInsts, !if(is_flat_scratch, HasFlatScratchInsts, HasFlatAddressSpace)); @@ -64,6 +65,8 @@ class FLAT_Pseudo<string opName, dag outs, dag ins, // and are not considered done until both have been decremented. let VM_CNT = 1; let LGKM_CNT = !if(!or(is_flat_global, is_flat_scratch), 0, 1); + + let IsNonFlatSeg = !if(!or(is_flat_global, is_flat_scratch), 1, 0); } class FLAT_Real <bits<7> op, FLAT_Pseudo ps> : @@ -87,6 +90,7 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps> : bits<1> slc; bits<1> glc; + bits<1> dlc; // Only valid on gfx9 bits<1> lds = 0; // XXX - What does this actually do? @@ -131,18 +135,16 @@ class GlobalSaddrTable <bit is_saddr, string Name = ""> { // saddr is 32-bit (which isn't handled here yet). 
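The FLAT_Pseudo and FLAT_Real changes above add a dlc cache-policy flag next to glc and slc, following the same has_*/​*Value pattern: an instruction either exposes the bit as an operand or hard-wires it (the atomic pseudos further down pin it to 0). The GFX10 encoding class later in this hunk materializes it as Inst{12} = !if(ps.has_dlc, dlc, ps.dlcValue). A hedged C++ rendering of that selection idiom (the names here are illustrative, not the in-tree API):

#include <cstdint>

struct FlatPseudoFlags {
  bool has_dlc;  // does the pseudo take a $dlc operand?
  bool dlcValue; // hard-wired value used when it does not
};

// Mirrors TableGen's !if(ps.has_dlc, dlc, ps.dlcValue): read the operand bit
// when the instruction has one, otherwise fall back to the pseudo's default.
uint32_t encodeDlcBit(const FlatPseudoFlags &PS, bool DlcOperand) {
  return (PS.has_dlc ? DlcOperand : PS.dlcValue) ? 1u : 0u;
}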
class FLAT_Load_Pseudo <string opName, RegisterClass regClass, bit HasTiedOutput = 0, - bit HasSignedOffset = 0, bit HasSaddr = 0, bit EnableSaddr = 0> : FLAT_Pseudo< + bit HasSaddr = 0, bit EnableSaddr = 0> : FLAT_Pseudo< opName, (outs regClass:$vdst), !con( !con( - !con( - !con((ins VReg_64:$vaddr), - !if(EnableSaddr, (ins SReg_64:$saddr), (ins))), - (ins !if(HasSignedOffset,offset_s13,offset_u12):$offset)), - (ins GLC:$glc, SLC:$slc)), - !if(HasTiedOutput, (ins regClass:$vdst_in), (ins))), - " $vdst, $vaddr"#!if(HasSaddr, !if(EnableSaddr, ", $saddr", ", off"), "")#"$offset$glc$slc"> { + !con((ins VReg_64:$vaddr), + !if(EnableSaddr, (ins SReg_64:$saddr), (ins))), + (ins flat_offset:$offset, GLC:$glc, SLC:$slc, DLC:$dlc)), + !if(HasTiedOutput, (ins regClass:$vdst_in), (ins))), + " $vdst, $vaddr"#!if(HasSaddr, !if(EnableSaddr, ", $saddr", ", off"), "")#"$offset$glc$slc$dlc"> { let has_data = 0; let mayLoad = 1; let has_saddr = HasSaddr; @@ -155,16 +157,14 @@ class FLAT_Load_Pseudo <string opName, RegisterClass regClass, } class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass, - bit HasSignedOffset = 0, bit HasSaddr = 0, bit EnableSaddr = 0> : FLAT_Pseudo< + bit HasSaddr = 0, bit EnableSaddr = 0> : FLAT_Pseudo< opName, (outs), !con( - !con( - !con((ins VReg_64:$vaddr, vdataClass:$vdata), - !if(EnableSaddr, (ins SReg_64:$saddr), (ins))), - (ins !if(HasSignedOffset,offset_s13,offset_u12):$offset)), - (ins GLC:$glc, SLC:$slc)), - " $vaddr, $vdata"#!if(HasSaddr, !if(EnableSaddr, ", $saddr", ", off"), "")#"$offset$glc$slc"> { + !con((ins VReg_64:$vaddr, vdataClass:$vdata), + !if(EnableSaddr, (ins SReg_64:$saddr), (ins))), + (ins flat_offset:$offset, GLC:$glc, SLC:$slc, DLC:$dlc)), + " $vaddr, $vdata"#!if(HasSaddr, !if(EnableSaddr, ", $saddr", ", off"), "")#"$offset$glc$slc$dlc"> { let mayLoad = 0; let mayStore = 1; let has_vdst = 0; @@ -176,18 +176,18 @@ class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass, multiclass FLAT_Global_Load_Pseudo<string opName, RegisterClass regClass, bit HasTiedInput = 0> { let is_flat_global = 1 in { - def "" : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1, 1>, + def "" : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1>, GlobalSaddrTable<0, opName>; - def _SADDR : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1, 1, 1>, + def _SADDR : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1, 1>, GlobalSaddrTable<1, opName>; } } multiclass FLAT_Global_Store_Pseudo<string opName, RegisterClass regClass> { let is_flat_global = 1 in { - def "" : FLAT_Store_Pseudo<opName, regClass, 1, 1>, + def "" : FLAT_Store_Pseudo<opName, regClass, 1>, GlobalSaddrTable<0, opName>; - def _SADDR : FLAT_Store_Pseudo<opName, regClass, 1, 1, 1>, + def _SADDR : FLAT_Store_Pseudo<opName, regClass, 1, 1>, GlobalSaddrTable<1, opName>; } } @@ -197,9 +197,9 @@ class FLAT_Scratch_Load_Pseudo <string opName, RegisterClass regClass, opName, (outs regClass:$vdst), !if(EnableSaddr, - (ins SReg_32_XEXEC_HI:$saddr, offset_s13:$offset, GLC:$glc, SLC:$slc), - (ins VGPR_32:$vaddr, offset_s13:$offset, GLC:$glc, SLC:$slc)), - " $vdst, "#!if(EnableSaddr, "off", "$vaddr")#!if(EnableSaddr, ", $saddr", ", off")#"$offset$glc$slc"> { + (ins SReg_32_XEXEC_HI:$saddr, flat_offset:$offset, GLC:$glc, SLC:$slc, DLC:$dlc), + (ins VGPR_32:$vaddr, flat_offset:$offset, GLC:$glc, SLC:$slc, DLC:$dlc)), + " $vdst, "#!if(EnableSaddr, "off", "$vaddr")#!if(EnableSaddr, ", $saddr", ", off")#"$offset$glc$slc$dlc"> { let has_data = 0; let mayLoad = 1; let has_saddr = 1; @@ -213,9 +213,9 @@ class 
FLAT_Scratch_Store_Pseudo <string opName, RegisterClass vdataClass, bit En opName, (outs), !if(EnableSaddr, - (ins vdataClass:$vdata, SReg_32_XEXEC_HI:$saddr, offset_s13:$offset, GLC:$glc, SLC:$slc), - (ins vdataClass:$vdata, VGPR_32:$vaddr, offset_s13:$offset, GLC:$glc, SLC:$slc)), - " "#!if(EnableSaddr, "off", "$vaddr")#", $vdata, "#!if(EnableSaddr, "$saddr", "off")#"$offset$glc$slc"> { + (ins vdataClass:$vdata, SReg_32_XEXEC_HI:$saddr, flat_offset:$offset, GLC:$glc, SLC:$slc, DLC:$dlc), + (ins vdataClass:$vdata, VGPR_32:$vaddr, flat_offset:$offset, GLC:$glc, SLC:$slc, DLC:$dlc)), + " "#!if(EnableSaddr, "off", "$vaddr")#", $vdata, "#!if(EnableSaddr, "$saddr", "off")#"$offset$glc$slc$dlc"> { let mayLoad = 0; let mayStore = 1; let has_vdst = 0; @@ -247,6 +247,8 @@ class FLAT_AtomicNoRet_Pseudo<string opName, dag outs, dag ins, let mayStore = 1; let has_glc = 0; let glcValue = 0; + let has_dlc = 0; + let dlcValue = 0; let has_vdst = 0; let maybeAtomic = 1; } @@ -257,6 +259,7 @@ class FLAT_AtomicRet_Pseudo<string opName, dag outs, dag ins, let hasPostISelHook = 1; let has_vdst = 1; let glcValue = 1; + let dlcValue = 0; let PseudoInstr = NAME # "_RTN"; } @@ -266,24 +269,28 @@ multiclass FLAT_Atomic_Pseudo< ValueType vt, SDPatternOperator atomic = null_frag, ValueType data_vt = vt, - RegisterClass data_rc = vdst_rc> { + RegisterClass data_rc = vdst_rc, + bit isFP = getIsFP<data_vt>.ret> { def "" : FLAT_AtomicNoRet_Pseudo <opName, (outs), - (ins VReg_64:$vaddr, data_rc:$vdata, offset_u12:$offset, SLC:$slc), + (ins VReg_64:$vaddr, data_rc:$vdata, flat_offset:$offset, SLC:$slc), " $vaddr, $vdata$offset$slc">, GlobalSaddrTable<0, opName>, AtomicNoRet <opName, 0> { let PseudoInstr = NAME; + let FPAtomic = isFP; } def _RTN : FLAT_AtomicRet_Pseudo <opName, (outs vdst_rc:$vdst), - (ins VReg_64:$vaddr, data_rc:$vdata, offset_u12:$offset, SLC:$slc), + (ins VReg_64:$vaddr, data_rc:$vdata, flat_offset:$offset, SLC:$slc), " $vdst, $vaddr, $vdata$offset glc$slc", [(set vt:$vdst, (atomic (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc), data_vt:$vdata))]>, GlobalSaddrTable<0, opName#"_rtn">, - AtomicNoRet <opName, 1>; + AtomicNoRet <opName, 1>{ + let FPAtomic = isFP; + } } multiclass FLAT_Global_Atomic_Pseudo_NO_RTN< @@ -292,27 +299,30 @@ multiclass FLAT_Global_Atomic_Pseudo_NO_RTN< ValueType vt, SDPatternOperator atomic = null_frag, ValueType data_vt = vt, - RegisterClass data_rc = vdst_rc> { + RegisterClass data_rc = vdst_rc, + bit isFP = getIsFP<data_vt>.ret> { def "" : FLAT_AtomicNoRet_Pseudo <opName, (outs), - (ins VReg_64:$vaddr, data_rc:$vdata, offset_s13:$offset, SLC:$slc), + (ins VReg_64:$vaddr, data_rc:$vdata, flat_offset:$offset, SLC:$slc), " $vaddr, $vdata, off$offset$slc">, GlobalSaddrTable<0, opName>, AtomicNoRet <opName, 0> { let has_saddr = 1; let PseudoInstr = NAME; + let FPAtomic = isFP; } def _SADDR : FLAT_AtomicNoRet_Pseudo <opName, (outs), - (ins VReg_64:$vaddr, data_rc:$vdata, SReg_64:$saddr, offset_s13:$offset, SLC:$slc), + (ins VReg_64:$vaddr, data_rc:$vdata, SReg_64:$saddr, flat_offset:$offset, SLC:$slc), " $vaddr, $vdata, $saddr$offset$slc">, GlobalSaddrTable<1, opName>, AtomicNoRet <opName#"_saddr", 0> { let has_saddr = 1; let enabled_saddr = 1; let PseudoInstr = NAME#"_SADDR"; + let FPAtomic = isFP; } } @@ -322,28 +332,31 @@ multiclass FLAT_Global_Atomic_Pseudo_RTN< ValueType vt, SDPatternOperator atomic = null_frag, ValueType data_vt = vt, - RegisterClass data_rc = vdst_rc> { + RegisterClass data_rc = vdst_rc, + bit isFP = getIsFP<data_vt>.ret> { def _RTN : FLAT_AtomicRet_Pseudo 
<opName, (outs vdst_rc:$vdst), - (ins VReg_64:$vaddr, data_rc:$vdata, offset_s13:$offset, SLC:$slc), + (ins VReg_64:$vaddr, data_rc:$vdata, flat_offset:$offset, SLC:$slc), " $vdst, $vaddr, $vdata, off$offset glc$slc", [(set vt:$vdst, (atomic (FLATSignedAtomic i64:$vaddr, i16:$offset, i1:$slc), data_vt:$vdata))]>, GlobalSaddrTable<0, opName#"_rtn">, AtomicNoRet <opName, 1> { let has_saddr = 1; + let FPAtomic = isFP; } def _SADDR_RTN : FLAT_AtomicRet_Pseudo <opName, (outs vdst_rc:$vdst), - (ins VReg_64:$vaddr, data_rc:$vdata, SReg_64:$saddr, offset_s13:$offset, SLC:$slc), + (ins VReg_64:$vaddr, data_rc:$vdata, SReg_64:$saddr, flat_offset:$offset, SLC:$slc), " $vdst, $vaddr, $vdata, $saddr$offset glc$slc">, GlobalSaddrTable<1, opName#"_rtn">, AtomicNoRet <opName#"_saddr", 1> { let has_saddr = 1; let enabled_saddr = 1; let PseudoInstr = NAME#"_SADDR_RTN"; + let FPAtomic = isFP; } } @@ -491,7 +504,8 @@ defm FLAT_ATOMIC_INC_X2 : FLAT_Atomic_Pseudo <"flat_atomic_inc_x2", defm FLAT_ATOMIC_DEC_X2 : FLAT_Atomic_Pseudo <"flat_atomic_dec_x2", VReg_64, i64, atomic_dec_flat>; -let SubtargetPredicate = isCI in { // CI Only flat instructions : FIXME Only? +// GFX7-, GFX10-only flat instructions. +let SubtargetPredicate = isGFX7GFX10 in { defm FLAT_ATOMIC_FCMPSWAP : FLAT_Atomic_Pseudo <"flat_atomic_fcmpswap", VGPR_32, f32, null_frag, v2f32, VReg_64>; @@ -511,7 +525,7 @@ defm FLAT_ATOMIC_FMIN_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fmin_x2", defm FLAT_ATOMIC_FMAX_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fmax_x2", VReg_64, f64>; -} // End SubtargetPredicate = isCI +} // End SubtargetPredicate = isGFX7GFX10 let SubtargetPredicate = HasFlatGlobalInsts in { defm GLOBAL_LOAD_UBYTE : FLAT_Global_Load_Pseudo <"global_load_ubyte", VGPR_32>; @@ -654,6 +668,32 @@ defm SCRATCH_STORE_SHORT_D16_HI : FLAT_Scratch_Store_Pseudo <"scratch_store_shor } // End SubtargetPredicate = HasFlatScratchInsts +let SubtargetPredicate = isGFX10Plus, is_flat_global = 1 in { + defm GLOBAL_ATOMIC_FCMPSWAP : + FLAT_Global_Atomic_Pseudo<"global_atomic_fcmpswap", VGPR_32, f32>; + defm GLOBAL_ATOMIC_FMIN : + FLAT_Global_Atomic_Pseudo<"global_atomic_fmin", VGPR_32, f32>; + defm GLOBAL_ATOMIC_FMAX : + FLAT_Global_Atomic_Pseudo<"global_atomic_fmax", VGPR_32, f32>; + defm GLOBAL_ATOMIC_FCMPSWAP_X2 : + FLAT_Global_Atomic_Pseudo<"global_atomic_fcmpswap_x2", VReg_64, f64>; + defm GLOBAL_ATOMIC_FMIN_X2 : + FLAT_Global_Atomic_Pseudo<"global_atomic_fmin_x2", VReg_64, f64>; + defm GLOBAL_ATOMIC_FMAX_X2 : + FLAT_Global_Atomic_Pseudo<"global_atomic_fmax_x2", VReg_64, f64>; +} // End SubtargetPredicate = isGFX10Plus, is_flat_global = 1 + +let SubtargetPredicate = HasAtomicFaddInsts, is_flat_global = 1 in { + +defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Atomic_Pseudo_NO_RTN < + "global_atomic_add_f32", VGPR_32, f32, atomic_add_global +>; +defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_NO_RTN < + "global_atomic_pk_add_f16", VGPR_32, v2f16, atomic_add_global +>; + +} // End SubtargetPredicate = HasAtomicFaddInsts + //===----------------------------------------------------------------------===// // Flat Patterns //===----------------------------------------------------------------------===// @@ -661,89 +701,51 @@ defm SCRATCH_STORE_SHORT_D16_HI : FLAT_Scratch_Store_Pseudo <"scratch_store_shor // Patterns for global loads with no offset. 
class FlatLoadPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < (vt (node (FLATOffset i64:$vaddr, i16:$offset, i1:$slc))), - (inst $vaddr, $offset, 0, $slc) + (inst $vaddr, $offset, 0, 0, $slc) >; -multiclass FlatLoadPat_Hi16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt = i16> { - def : GCNPat < - (build_vector vt:$elt0, (vt (node (FLATOffset i64:$vaddr, i16:$offset, i1:$slc)))), - (v2i16 (inst $vaddr, $offset, 0, $slc, $elt0)) - >; - - def : GCNPat < - (build_vector f16:$elt0, (f16 (bitconvert (vt (node (FLATOffset i64:$vaddr, i16:$offset, i1:$slc)))))), - (v2f16 (inst $vaddr, $offset, 0, $slc, $elt0)) - >; -} - -multiclass FlatSignedLoadPat_Hi16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt = i16> { - def : GCNPat < - (build_vector vt:$elt0, (vt (node (FLATOffsetSigned i64:$vaddr, i16:$offset, i1:$slc)))), - (v2i16 (inst $vaddr, $offset, 0, $slc, $elt0)) - >; - - def : GCNPat < - (build_vector f16:$elt0, (f16 (bitconvert (vt (node (FLATOffsetSigned i64:$vaddr, i16:$offset, i1:$slc)))))), - (v2f16 (inst $vaddr, $offset, 0, $slc, $elt0)) - >; -} - -multiclass FlatLoadPat_Lo16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt = i16> { - def : GCNPat < - (build_vector (vt (node (FLATOffset i64:$vaddr, i16:$offset, i1:$slc))), (vt (Hi16Elt vt:$hi))), - (v2i16 (inst $vaddr, $offset, 0, $slc, $hi)) - >; - - def : GCNPat < - (build_vector (f16 (bitconvert (vt (node (FLATOffset i64:$vaddr, i16:$offset, i1:$slc))))), (f16 (Hi16Elt f16:$hi))), - (v2f16 (inst $vaddr, $offset, 0, $slc, $hi)) - >; -} - -multiclass FlatSignedLoadPat_Lo16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt = i16> { - def : GCNPat < - (build_vector (vt (node (FLATOffsetSigned i64:$vaddr, i16:$offset, i1:$slc))), (vt (Hi16Elt vt:$hi))), - (v2i16 (inst $vaddr, $offset, 0, $slc, $hi)) - >; +class FlatLoadPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < + (node (FLATOffset (i64 VReg_64:$vaddr), i16:$offset, i1:$slc), vt:$in), + (inst $vaddr, $offset, 0, 0, $slc, $in) +>; - def : GCNPat < - (build_vector (f16 (bitconvert (vt (node (FLATOffsetSigned i64:$vaddr, i16:$offset, i1:$slc))))), (f16 (Hi16Elt f16:$hi))), - (v2f16 (inst $vaddr, $offset, 0, $slc, $hi)) - >; -} +class FlatSignedLoadPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < + (node (FLATOffsetSigned (i64 VReg_64:$vaddr), i16:$offset, i1:$slc), vt:$in), + (inst $vaddr, $offset, 0, 0, $slc, $in) +>; class FlatLoadAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (vt (node (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc))), - (inst $vaddr, $offset, 0, $slc) + (vt (node (FLATAtomic (i64 VReg_64:$vaddr), i16:$offset, i1:$slc))), + (inst $vaddr, $offset, 0, 0, $slc) >; class FlatLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (vt (node (FLATOffsetSigned i64:$vaddr, i16:$offset, i1:$slc))), - (inst $vaddr, $offset, 0, $slc) + (vt (node (FLATOffsetSigned (i64 VReg_64:$vaddr), i16:$offset, i1:$slc))), + (inst $vaddr, $offset, 0, 0, $slc) >; -class FlatStorePat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < +class FlatStorePat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt, RegisterClass rc = VGPR_32> : GCNPat < (node vt:$data, (FLATOffset i64:$vaddr, i16:$offset, i1:$slc)), - (inst $vaddr, $data, $offset, 0, $slc) + (inst $vaddr, rc:$data, $offset, 0, 0, $slc) >; -class FlatStoreSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < +class 
FlatStoreSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt, RegisterClass rc = VGPR_32> : GCNPat < (node vt:$data, (FLATOffsetSigned i64:$vaddr, i16:$offset, i1:$slc)), - (inst $vaddr, $data, $offset, 0, $slc) + (inst $vaddr, rc:$data, $offset, 0, 0, $slc) >; -class FlatStoreAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < +class FlatStoreAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt, RegisterClass rc = VGPR_32> : GCNPat < // atomic store follows atomic binop convention so the address comes // first. (node (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc), vt:$data), - (inst $vaddr, $data, $offset, 0, $slc) + (inst $vaddr, rc:$data, $offset, 0, 0, $slc) >; -class FlatStoreSignedAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < +class FlatStoreSignedAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt, RegisterClass rc = VGPR_32> : GCNPat < // atomic store follows atomic binop convention so the address comes // first. (node (FLATSignedAtomic i64:$vaddr, i16:$offset, i1:$slc), vt:$data), - (inst $vaddr, $data, $offset, 0, $slc) + (inst $vaddr, rc:$data, $offset, 0, 0, $slc) >; class FlatAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt, @@ -752,6 +754,11 @@ class FlatAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt, (inst $vaddr, $data, $offset, $slc) >; +class FlatAtomicPatNoRtn <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < + (node (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc), vt:$data), + (inst $vaddr, $data, $offset, $slc) +>; + class FlatSignedAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt, ValueType data_vt = vt> : GCNPat < (vt (node (FLATSignedAtomic i64:$vaddr, i16:$offset, i1:$slc), data_vt:$data)), @@ -760,28 +767,33 @@ class FlatSignedAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType v let OtherPredicates = [HasFlatAddressSpace] in { -def : FlatLoadPat <FLAT_LOAD_UBYTE, az_extloadi8_flat, i32>; +def : FlatLoadPat <FLAT_LOAD_UBYTE, extloadi8_flat, i32>; +def : FlatLoadPat <FLAT_LOAD_UBYTE, zextloadi8_flat, i32>; def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_flat, i32>; -def : FlatLoadPat <FLAT_LOAD_UBYTE, az_extloadi8_flat, i16>; +def : FlatLoadPat <FLAT_LOAD_UBYTE, extloadi8_flat, i16>; +def : FlatLoadPat <FLAT_LOAD_UBYTE, zextloadi8_flat, i16>; def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_flat, i16>; -def : FlatLoadPat <FLAT_LOAD_USHORT, az_extloadi16_flat, i32>; +def : FlatLoadPat <FLAT_LOAD_USHORT, extloadi16_flat, i32>; +def : FlatLoadPat <FLAT_LOAD_USHORT, zextloadi16_flat, i32>; def : FlatLoadPat <FLAT_LOAD_USHORT, load_flat, i16>; def : FlatLoadPat <FLAT_LOAD_SSHORT, sextloadi16_flat, i32>; def : FlatLoadPat <FLAT_LOAD_DWORD, load_flat, i32>; def : FlatLoadPat <FLAT_LOAD_DWORDX2, load_flat, v2i32>; +def : FlatLoadPat <FLAT_LOAD_DWORDX3, load_flat, v3i32>; def : FlatLoadPat <FLAT_LOAD_DWORDX4, load_flat, v4i32>; -def : FlatLoadAtomicPat <FLAT_LOAD_DWORD, atomic_load_flat, i32>; -def : FlatLoadAtomicPat <FLAT_LOAD_DWORDX2, atomic_load_flat, i64>; +def : FlatLoadAtomicPat <FLAT_LOAD_DWORD, atomic_load_32_flat, i32>; +def : FlatLoadAtomicPat <FLAT_LOAD_DWORDX2, atomic_load_64_flat, i64>; def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_flat, i32>; def : FlatStorePat <FLAT_STORE_SHORT, truncstorei16_flat, i32>; def : FlatStorePat <FLAT_STORE_DWORD, store_flat, i32>; -def : FlatStorePat <FLAT_STORE_DWORDX2, store_flat, v2i32>; -def : FlatStorePat <FLAT_STORE_DWORDX4, store_flat, v4i32>; 
+def : FlatStorePat <FLAT_STORE_DWORDX2, store_flat, v2i32, VReg_64>; +def : FlatStorePat <FLAT_STORE_DWORDX3, store_flat, v3i32, VReg_96>; +def : FlatStorePat <FLAT_STORE_DWORDX4, store_flat, v4i32, VReg_128>; -def : FlatStoreAtomicPat <FLAT_STORE_DWORD, atomic_store_flat, i32>; -def : FlatStoreAtomicPat <FLAT_STORE_DWORDX2, atomic_store_flat, i64>; +def : FlatStoreAtomicPat <FLAT_STORE_DWORD, atomic_store_flat_32, i32>; +def : FlatStoreAtomicPat <FLAT_STORE_DWORDX2, atomic_store_flat_64, i64, VReg_64>; def : FlatAtomicPat <FLAT_ATOMIC_ADD_RTN, atomic_add_global, i32>; def : FlatAtomicPat <FLAT_ATOMIC_SUB_RTN, atomic_sub_global, i32>; @@ -818,62 +830,77 @@ let OtherPredicates = [D16PreservesUnusedBits] in { def : FlatStorePat <FLAT_STORE_SHORT_D16_HI, truncstorei16_hi16_flat, i32>; def : FlatStorePat <FLAT_STORE_BYTE_D16_HI, truncstorei8_hi16_flat, i32>; -let AddedComplexity = 3 in { -defm : FlatLoadPat_Hi16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_flat>; -defm : FlatLoadPat_Hi16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_flat>; -defm : FlatLoadPat_Hi16 <FLAT_LOAD_SHORT_D16_HI, load_flat>; -} - -let AddedComplexity = 9 in { -defm : FlatLoadPat_Lo16 <FLAT_LOAD_UBYTE_D16, az_extloadi8_flat>; -defm : FlatLoadPat_Lo16 <FLAT_LOAD_SBYTE_D16, sextloadi8_flat>; -defm : FlatLoadPat_Lo16 <FLAT_LOAD_SHORT_D16, load_flat>; -} +def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_flat, v2i16>; +def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_flat, v2f16>; +def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_flat, v2i16>; +def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_flat, v2f16>; +def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16_HI, load_d16_hi_flat, v2i16>; +def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16_HI, load_d16_hi_flat, v2f16>; + +def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16, az_extloadi8_d16_lo_flat, v2i16>; +def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16, az_extloadi8_d16_lo_flat, v2f16>; +def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16, sextloadi8_d16_lo_flat, v2i16>; +def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16, sextloadi8_d16_lo_flat, v2f16>; +def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2i16>; +def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2f16>; } } // End OtherPredicates = [HasFlatAddressSpace] +def atomic_fadd_global : global_binary_atomic_op_frag<SIglobal_atomic_fadd>; +def atomic_pk_fadd_global : global_binary_atomic_op_frag<SIglobal_atomic_pk_fadd>; + let OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10 in { -def : FlatLoadSignedPat <GLOBAL_LOAD_UBYTE, az_extloadi8_global, i32>; +def : FlatLoadSignedPat <GLOBAL_LOAD_UBYTE, extloadi8_global, i32>; +def : FlatLoadSignedPat <GLOBAL_LOAD_UBYTE, zextloadi8_global, i32>; def : FlatLoadSignedPat <GLOBAL_LOAD_SBYTE, sextloadi8_global, i32>; -def : FlatLoadSignedPat <GLOBAL_LOAD_UBYTE, az_extloadi8_global, i16>; +def : FlatLoadSignedPat <GLOBAL_LOAD_UBYTE, extloadi8_global, i16>; +def : FlatLoadSignedPat <GLOBAL_LOAD_UBYTE, zextloadi8_global, i16>; def : FlatLoadSignedPat <GLOBAL_LOAD_SBYTE, sextloadi8_global, i16>; -def : FlatLoadSignedPat <GLOBAL_LOAD_USHORT, az_extloadi16_global, i32>; +def : FlatLoadSignedPat <GLOBAL_LOAD_USHORT, extloadi16_global, i32>; +def : FlatLoadSignedPat <GLOBAL_LOAD_USHORT, zextloadi16_global, i32>; def : FlatLoadSignedPat <GLOBAL_LOAD_SSHORT, sextloadi16_global, i32>; def : FlatLoadSignedPat <GLOBAL_LOAD_USHORT, load_global, i16>; def : FlatLoadSignedPat <GLOBAL_LOAD_DWORD, load_global, i32>; def : FlatLoadSignedPat 
<GLOBAL_LOAD_DWORDX2, load_global, v2i32>; +def : FlatLoadSignedPat <GLOBAL_LOAD_DWORDX3, load_global, v3i32>; def : FlatLoadSignedPat <GLOBAL_LOAD_DWORDX4, load_global, v4i32>; -def : FlatLoadAtomicPat <GLOBAL_LOAD_DWORD, atomic_load_global, i32>; -def : FlatLoadAtomicPat <GLOBAL_LOAD_DWORDX2, atomic_load_global, i64>; +def : FlatLoadAtomicPat <GLOBAL_LOAD_DWORD, atomic_load_32_global, i32>; +def : FlatLoadAtomicPat <GLOBAL_LOAD_DWORDX2, atomic_load_64_global, i64>; -def : FlatStoreSignedPat <GLOBAL_STORE_BYTE, truncstorei8_global, i32>; -def : FlatStoreSignedPat <GLOBAL_STORE_BYTE, truncstorei8_global, i16>; -def : FlatStoreSignedPat <GLOBAL_STORE_SHORT, truncstorei16_global, i32>; -def : FlatStoreSignedPat <GLOBAL_STORE_SHORT, store_global, i16>; -def : FlatStoreSignedPat <GLOBAL_STORE_DWORD, store_global, i32>; -def : FlatStoreSignedPat <GLOBAL_STORE_DWORDX2, store_global, v2i32>; -def : FlatStoreSignedPat <GLOBAL_STORE_DWORDX4, store_global, v4i32>; +def : FlatStoreSignedPat <GLOBAL_STORE_BYTE, truncstorei8_global, i32, VGPR_32>; +def : FlatStoreSignedPat <GLOBAL_STORE_BYTE, truncstorei8_global, i16, VGPR_32>; +def : FlatStoreSignedPat <GLOBAL_STORE_SHORT, truncstorei16_global, i32, VGPR_32>; +def : FlatStoreSignedPat <GLOBAL_STORE_SHORT, store_global, i16, VGPR_32>; +def : FlatStoreSignedPat <GLOBAL_STORE_DWORD, store_global, i32, VGPR_32>; +def : FlatStoreSignedPat <GLOBAL_STORE_DWORDX2, store_global, v2i32, VReg_64>; +def : FlatStoreSignedPat <GLOBAL_STORE_DWORDX3, store_global, v3i32, VReg_96>; +def : FlatStoreSignedPat <GLOBAL_STORE_DWORDX4, store_global, v4i32, VReg_128>; let OtherPredicates = [D16PreservesUnusedBits] in { def : FlatStoreSignedPat <GLOBAL_STORE_SHORT_D16_HI, truncstorei16_hi16_global, i32>; def : FlatStoreSignedPat <GLOBAL_STORE_BYTE_D16_HI, truncstorei8_hi16_global, i32>; -defm : FlatSignedLoadPat_Hi16 <GLOBAL_LOAD_UBYTE_D16_HI, az_extloadi8_global>; -defm : FlatSignedLoadPat_Hi16 <GLOBAL_LOAD_SBYTE_D16_HI, sextloadi8_global>; -defm : FlatSignedLoadPat_Hi16 <GLOBAL_LOAD_SHORT_D16_HI, load_global>; - -defm : FlatSignedLoadPat_Lo16 <GLOBAL_LOAD_UBYTE_D16, az_extloadi8_global>; -defm : FlatSignedLoadPat_Lo16 <GLOBAL_LOAD_SBYTE_D16, sextloadi8_global>; -defm : FlatSignedLoadPat_Lo16 <GLOBAL_LOAD_SHORT_D16, load_global>; - +def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_global, v2i16>; +def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_global, v2f16>; +def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_global, v2i16>; +def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_global, v2f16>; +def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SHORT_D16_HI, load_d16_hi_global, v2i16>; +def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SHORT_D16_HI, load_d16_hi_global, v2f16>; + +def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_UBYTE_D16, az_extloadi8_d16_lo_global, v2i16>; +def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_UBYTE_D16, az_extloadi8_d16_lo_global, v2f16>; +def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SBYTE_D16, sextloadi8_d16_lo_global, v2i16>; +def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SBYTE_D16, sextloadi8_d16_lo_global, v2f16>; +def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SHORT_D16, load_d16_lo_global, v2i16>; +def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SHORT_D16, load_d16_lo_global, v2f16>; } def : FlatStoreSignedAtomicPat <GLOBAL_STORE_DWORD, store_atomic_global, i32>; -def : FlatStoreSignedAtomicPat <GLOBAL_STORE_DWORDX2, store_atomic_global, i64>; +def : FlatStoreSignedAtomicPat <GLOBAL_STORE_DWORDX2, 
store_atomic_global, i64, VReg_64>; def : FlatSignedAtomicPat <GLOBAL_ATOMIC_ADD_RTN, atomic_add_global, i32>; def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SUB_RTN, atomic_sub_global, i32>; @@ -903,7 +930,10 @@ def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SWAP_X2_RTN, atomic_swap_global, i64>; def : FlatSignedAtomicPat <GLOBAL_ATOMIC_CMPSWAP_X2_RTN, AMDGPUatomic_cmp_swap_global, i64, v2i64>; def : FlatSignedAtomicPat <GLOBAL_ATOMIC_XOR_X2_RTN, atomic_xor_global, i64>; -} // End OtherPredicates = [HasFlatGlobalInsts] +def : FlatAtomicPatNoRtn <GLOBAL_ATOMIC_ADD_F32, atomic_fadd_global, f32>; +def : FlatAtomicPatNoRtn <GLOBAL_ATOMIC_PK_ADD_F16, atomic_pk_fadd_global, v2f16>; + +} // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10 //===----------------------------------------------------------------------===// @@ -917,8 +947,8 @@ def : FlatSignedAtomicPat <GLOBAL_ATOMIC_XOR_X2_RTN, atomic_xor_global, i64>; class FLAT_Real_ci <bits<7> op, FLAT_Pseudo ps> : FLAT_Real <op, ps>, SIMCInstr <ps.PseudoInstr, SIEncodingFamily.SI> { - let AssemblerPredicate = isCIOnly; - let DecoderNamespace="CI"; + let AssemblerPredicate = isGFX7Only; + let DecoderNamespace="GFX7"; } def FLAT_LOAD_UBYTE_ci : FLAT_Real_ci <0x8, FLAT_LOAD_UBYTE>; @@ -985,8 +1015,8 @@ defm FLAT_ATOMIC_FMAX_X2 : FLAT_Real_Atomics_ci <0x60, FLAT_ATOMIC_FMAX_X2 class FLAT_Real_vi <bits<7> op, FLAT_Pseudo ps> : FLAT_Real <op, ps>, SIMCInstr <ps.PseudoInstr, SIEncodingFamily.VI> { - let AssemblerPredicate = isVI; - let DecoderNamespace="VI"; + let AssemblerPredicate = isGFX8GFX9; + let DecoderNamespace = "GFX8"; } multiclass FLAT_Real_AllAddr_vi<bits<7> op> { @@ -1133,3 +1163,200 @@ defm SCRATCH_STORE_DWORD : FLAT_Real_AllAddr_vi <0x1c>; defm SCRATCH_STORE_DWORDX2 : FLAT_Real_AllAddr_vi <0x1d>; defm SCRATCH_STORE_DWORDX3 : FLAT_Real_AllAddr_vi <0x1e>; defm SCRATCH_STORE_DWORDX4 : FLAT_Real_AllAddr_vi <0x1f>; + + +//===----------------------------------------------------------------------===// +// GFX10. +//===----------------------------------------------------------------------===// + +class FLAT_Real_gfx10<bits<7> op, FLAT_Pseudo ps> : + FLAT_Real<op, ps>, SIMCInstr<ps.PseudoInstr, SIEncodingFamily.GFX10> { + let AssemblerPredicate = isGFX10Plus; + let DecoderNamespace = "GFX10"; + + let Inst{11-0} = {offset{12}, offset{10-0}}; + let Inst{12} = !if(ps.has_dlc, dlc, ps.dlcValue); + let Inst{54-48} = !if(ps.has_saddr, !if(ps.enabled_saddr, saddr, 0x7d), 0x7d); + let Inst{55} = 0; +} + + +multiclass FLAT_Real_Base_gfx10<bits<7> op> { + def _gfx10 : + FLAT_Real_gfx10<op, !cast<FLAT_Pseudo>(NAME)>; +} + +multiclass FLAT_Real_RTN_gfx10<bits<7> op> { + def _RTN_gfx10 : + FLAT_Real_gfx10<op, !cast<FLAT_Pseudo>(NAME#"_RTN")>; +} + +multiclass FLAT_Real_SADDR_gfx10<bits<7> op> { + def _SADDR_gfx10 : + FLAT_Real_gfx10<op, !cast<FLAT_Pseudo>(NAME#"_SADDR")>; +} + +multiclass FLAT_Real_SADDR_RTN_gfx10<bits<7> op> { + def _SADDR_RTN_gfx10 : + FLAT_Real_gfx10<op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN")>; +} + + +multiclass FLAT_Real_AllAddr_gfx10<bits<7> op> : + FLAT_Real_Base_gfx10<op>, + FLAT_Real_SADDR_gfx10<op>; + +multiclass FLAT_Real_Atomics_gfx10<bits<7> op> : + FLAT_Real_Base_gfx10<op>, + FLAT_Real_RTN_gfx10<op>; + +multiclass FLAT_Real_GlblAtomics_gfx10<bits<7> op> : + FLAT_Real_AllAddr_gfx10<op>, + FLAT_Real_RTN_gfx10<op>, + FLAT_Real_SADDR_RTN_gfx10<op>; + + +// ENC_FLAT. 
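The per-opcode tables that follow all funnel through the FLAT_Real_gfx10 class defined just above, and two of its field packings are worth spelling out: the pseudo's 13-bit offset operand is squeezed into a 12-bit field as {offset{12}, offset{10-0}} (the top bit plus the low 11 bits, dropping bit 11), and a missing saddr is encoded as 0x7d, which is the SGPR_NULL value (125) that decodeSpecialReg32 learned to decode earlier in this commit. A hedged stand-alone sketch (only the bit positions come from the TableGen above; the helper itself is illustrative):

#include <cstdint>

// Packs the two fields the way FLAT_Real_gfx10 describes them:
//   Inst{11-0}  = {offset{12}, offset{10-0}};
//   Inst{54-48} = !if(ps.has_saddr, !if(ps.enabled_saddr, saddr, 0x7d), 0x7d);
uint64_t packGfx10FlatFields(uint32_t Offset13, bool HasSaddr,
                             bool EnabledSaddr, uint32_t Saddr) {
  uint64_t OffsetField = ((uint64_t)((Offset13 >> 12) & 1) << 11) |
                         (Offset13 & 0x7ff);
  uint64_t SaddrField =
      (HasSaddr && EnabledSaddr) ? (Saddr & 0x7f) : 0x7d; // 0x7d == SGPR_NULL
  return OffsetField | (SaddrField << 48);
}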
+defm FLAT_LOAD_UBYTE : FLAT_Real_Base_gfx10<0x008>; +defm FLAT_LOAD_SBYTE : FLAT_Real_Base_gfx10<0x009>; +defm FLAT_LOAD_USHORT : FLAT_Real_Base_gfx10<0x00a>; +defm FLAT_LOAD_SSHORT : FLAT_Real_Base_gfx10<0x00b>; +defm FLAT_LOAD_DWORD : FLAT_Real_Base_gfx10<0x00c>; +defm FLAT_LOAD_DWORDX2 : FLAT_Real_Base_gfx10<0x00d>; +defm FLAT_LOAD_DWORDX4 : FLAT_Real_Base_gfx10<0x00e>; +defm FLAT_LOAD_DWORDX3 : FLAT_Real_Base_gfx10<0x00f>; +defm FLAT_STORE_BYTE : FLAT_Real_Base_gfx10<0x018>; +defm FLAT_STORE_BYTE_D16_HI : FLAT_Real_Base_gfx10<0x019>; +defm FLAT_STORE_SHORT : FLAT_Real_Base_gfx10<0x01a>; +defm FLAT_STORE_SHORT_D16_HI : FLAT_Real_Base_gfx10<0x01b>; +defm FLAT_STORE_DWORD : FLAT_Real_Base_gfx10<0x01c>; +defm FLAT_STORE_DWORDX2 : FLAT_Real_Base_gfx10<0x01d>; +defm FLAT_STORE_DWORDX4 : FLAT_Real_Base_gfx10<0x01e>; +defm FLAT_STORE_DWORDX3 : FLAT_Real_Base_gfx10<0x01f>; +defm FLAT_LOAD_UBYTE_D16 : FLAT_Real_Base_gfx10<0x020>; +defm FLAT_LOAD_UBYTE_D16_HI : FLAT_Real_Base_gfx10<0x021>; +defm FLAT_LOAD_SBYTE_D16 : FLAT_Real_Base_gfx10<0x022>; +defm FLAT_LOAD_SBYTE_D16_HI : FLAT_Real_Base_gfx10<0x023>; +defm FLAT_LOAD_SHORT_D16 : FLAT_Real_Base_gfx10<0x024>; +defm FLAT_LOAD_SHORT_D16_HI : FLAT_Real_Base_gfx10<0x025>; +defm FLAT_ATOMIC_SWAP : FLAT_Real_Atomics_gfx10<0x030>; +defm FLAT_ATOMIC_CMPSWAP : FLAT_Real_Atomics_gfx10<0x031>; +defm FLAT_ATOMIC_ADD : FLAT_Real_Atomics_gfx10<0x032>; +defm FLAT_ATOMIC_SUB : FLAT_Real_Atomics_gfx10<0x033>; +defm FLAT_ATOMIC_SMIN : FLAT_Real_Atomics_gfx10<0x035>; +defm FLAT_ATOMIC_UMIN : FLAT_Real_Atomics_gfx10<0x036>; +defm FLAT_ATOMIC_SMAX : FLAT_Real_Atomics_gfx10<0x037>; +defm FLAT_ATOMIC_UMAX : FLAT_Real_Atomics_gfx10<0x038>; +defm FLAT_ATOMIC_AND : FLAT_Real_Atomics_gfx10<0x039>; +defm FLAT_ATOMIC_OR : FLAT_Real_Atomics_gfx10<0x03a>; +defm FLAT_ATOMIC_XOR : FLAT_Real_Atomics_gfx10<0x03b>; +defm FLAT_ATOMIC_INC : FLAT_Real_Atomics_gfx10<0x03c>; +defm FLAT_ATOMIC_DEC : FLAT_Real_Atomics_gfx10<0x03d>; +defm FLAT_ATOMIC_FCMPSWAP : FLAT_Real_Atomics_gfx10<0x03e>; +defm FLAT_ATOMIC_FMIN : FLAT_Real_Atomics_gfx10<0x03f>; +defm FLAT_ATOMIC_FMAX : FLAT_Real_Atomics_gfx10<0x040>; +defm FLAT_ATOMIC_SWAP_X2 : FLAT_Real_Atomics_gfx10<0x050>; +defm FLAT_ATOMIC_CMPSWAP_X2 : FLAT_Real_Atomics_gfx10<0x051>; +defm FLAT_ATOMIC_ADD_X2 : FLAT_Real_Atomics_gfx10<0x052>; +defm FLAT_ATOMIC_SUB_X2 : FLAT_Real_Atomics_gfx10<0x053>; +defm FLAT_ATOMIC_SMIN_X2 : FLAT_Real_Atomics_gfx10<0x055>; +defm FLAT_ATOMIC_UMIN_X2 : FLAT_Real_Atomics_gfx10<0x056>; +defm FLAT_ATOMIC_SMAX_X2 : FLAT_Real_Atomics_gfx10<0x057>; +defm FLAT_ATOMIC_UMAX_X2 : FLAT_Real_Atomics_gfx10<0x058>; +defm FLAT_ATOMIC_AND_X2 : FLAT_Real_Atomics_gfx10<0x059>; +defm FLAT_ATOMIC_OR_X2 : FLAT_Real_Atomics_gfx10<0x05a>; +defm FLAT_ATOMIC_XOR_X2 : FLAT_Real_Atomics_gfx10<0x05b>; +defm FLAT_ATOMIC_INC_X2 : FLAT_Real_Atomics_gfx10<0x05c>; +defm FLAT_ATOMIC_DEC_X2 : FLAT_Real_Atomics_gfx10<0x05d>; +defm FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_Real_Atomics_gfx10<0x05e>; +defm FLAT_ATOMIC_FMIN_X2 : FLAT_Real_Atomics_gfx10<0x05f>; +defm FLAT_ATOMIC_FMAX_X2 : FLAT_Real_Atomics_gfx10<0x060>; + + +// ENC_FLAT_GLBL. 
+defm GLOBAL_LOAD_UBYTE : FLAT_Real_AllAddr_gfx10<0x008>; +defm GLOBAL_LOAD_SBYTE : FLAT_Real_AllAddr_gfx10<0x009>; +defm GLOBAL_LOAD_USHORT : FLAT_Real_AllAddr_gfx10<0x00a>; +defm GLOBAL_LOAD_SSHORT : FLAT_Real_AllAddr_gfx10<0x00b>; +defm GLOBAL_LOAD_DWORD : FLAT_Real_AllAddr_gfx10<0x00c>; +defm GLOBAL_LOAD_DWORDX2 : FLAT_Real_AllAddr_gfx10<0x00d>; +defm GLOBAL_LOAD_DWORDX4 : FLAT_Real_AllAddr_gfx10<0x00e>; +defm GLOBAL_LOAD_DWORDX3 : FLAT_Real_AllAddr_gfx10<0x00f>; +defm GLOBAL_STORE_BYTE : FLAT_Real_AllAddr_gfx10<0x018>; +defm GLOBAL_STORE_BYTE_D16_HI : FLAT_Real_AllAddr_gfx10<0x019>; +defm GLOBAL_STORE_SHORT : FLAT_Real_AllAddr_gfx10<0x01a>; +defm GLOBAL_STORE_SHORT_D16_HI : FLAT_Real_AllAddr_gfx10<0x01b>; +defm GLOBAL_STORE_DWORD : FLAT_Real_AllAddr_gfx10<0x01c>; +defm GLOBAL_STORE_DWORDX2 : FLAT_Real_AllAddr_gfx10<0x01d>; +defm GLOBAL_STORE_DWORDX4 : FLAT_Real_AllAddr_gfx10<0x01e>; +defm GLOBAL_STORE_DWORDX3 : FLAT_Real_AllAddr_gfx10<0x01f>; +defm GLOBAL_LOAD_UBYTE_D16 : FLAT_Real_AllAddr_gfx10<0x020>; +defm GLOBAL_LOAD_UBYTE_D16_HI : FLAT_Real_AllAddr_gfx10<0x021>; +defm GLOBAL_LOAD_SBYTE_D16 : FLAT_Real_AllAddr_gfx10<0x022>; +defm GLOBAL_LOAD_SBYTE_D16_HI : FLAT_Real_AllAddr_gfx10<0x023>; +defm GLOBAL_LOAD_SHORT_D16 : FLAT_Real_AllAddr_gfx10<0x024>; +defm GLOBAL_LOAD_SHORT_D16_HI : FLAT_Real_AllAddr_gfx10<0x025>; +defm GLOBAL_ATOMIC_SWAP : FLAT_Real_GlblAtomics_gfx10<0x030>; +defm GLOBAL_ATOMIC_CMPSWAP : FLAT_Real_GlblAtomics_gfx10<0x031>; +defm GLOBAL_ATOMIC_ADD : FLAT_Real_GlblAtomics_gfx10<0x032>; +defm GLOBAL_ATOMIC_SUB : FLAT_Real_GlblAtomics_gfx10<0x033>; +defm GLOBAL_ATOMIC_SMIN : FLAT_Real_GlblAtomics_gfx10<0x035>; +defm GLOBAL_ATOMIC_UMIN : FLAT_Real_GlblAtomics_gfx10<0x036>; +defm GLOBAL_ATOMIC_SMAX : FLAT_Real_GlblAtomics_gfx10<0x037>; +defm GLOBAL_ATOMIC_UMAX : FLAT_Real_GlblAtomics_gfx10<0x038>; +defm GLOBAL_ATOMIC_AND : FLAT_Real_GlblAtomics_gfx10<0x039>; +defm GLOBAL_ATOMIC_OR : FLAT_Real_GlblAtomics_gfx10<0x03a>; +defm GLOBAL_ATOMIC_XOR : FLAT_Real_GlblAtomics_gfx10<0x03b>; +defm GLOBAL_ATOMIC_INC : FLAT_Real_GlblAtomics_gfx10<0x03c>; +defm GLOBAL_ATOMIC_DEC : FLAT_Real_GlblAtomics_gfx10<0x03d>; +defm GLOBAL_ATOMIC_FCMPSWAP : FLAT_Real_GlblAtomics_gfx10<0x03e>; +defm GLOBAL_ATOMIC_FMIN : FLAT_Real_GlblAtomics_gfx10<0x03f>; +defm GLOBAL_ATOMIC_FMAX : FLAT_Real_GlblAtomics_gfx10<0x040>; +defm GLOBAL_ATOMIC_SWAP_X2 : FLAT_Real_GlblAtomics_gfx10<0x050>; +defm GLOBAL_ATOMIC_CMPSWAP_X2 : FLAT_Real_GlblAtomics_gfx10<0x051>; +defm GLOBAL_ATOMIC_ADD_X2 : FLAT_Real_GlblAtomics_gfx10<0x052>; +defm GLOBAL_ATOMIC_SUB_X2 : FLAT_Real_GlblAtomics_gfx10<0x053>; +defm GLOBAL_ATOMIC_SMIN_X2 : FLAT_Real_GlblAtomics_gfx10<0x055>; +defm GLOBAL_ATOMIC_UMIN_X2 : FLAT_Real_GlblAtomics_gfx10<0x056>; +defm GLOBAL_ATOMIC_SMAX_X2 : FLAT_Real_GlblAtomics_gfx10<0x057>; +defm GLOBAL_ATOMIC_UMAX_X2 : FLAT_Real_GlblAtomics_gfx10<0x058>; +defm GLOBAL_ATOMIC_AND_X2 : FLAT_Real_GlblAtomics_gfx10<0x059>; +defm GLOBAL_ATOMIC_OR_X2 : FLAT_Real_GlblAtomics_gfx10<0x05a>; +defm GLOBAL_ATOMIC_XOR_X2 : FLAT_Real_GlblAtomics_gfx10<0x05b>; +defm GLOBAL_ATOMIC_INC_X2 : FLAT_Real_GlblAtomics_gfx10<0x05c>; +defm GLOBAL_ATOMIC_DEC_X2 : FLAT_Real_GlblAtomics_gfx10<0x05d>; +defm GLOBAL_ATOMIC_FCMPSWAP_X2 : FLAT_Real_GlblAtomics_gfx10<0x05e>; +defm GLOBAL_ATOMIC_FMIN_X2 : FLAT_Real_GlblAtomics_gfx10<0x05f>; +defm GLOBAL_ATOMIC_FMAX_X2 : FLAT_Real_GlblAtomics_gfx10<0x060>; + + +// ENC_FLAT_SCRATCH. 
+defm SCRATCH_LOAD_UBYTE : FLAT_Real_AllAddr_gfx10<0x008>; +defm SCRATCH_LOAD_SBYTE : FLAT_Real_AllAddr_gfx10<0x009>; +defm SCRATCH_LOAD_USHORT : FLAT_Real_AllAddr_gfx10<0x00a>; +defm SCRATCH_LOAD_SSHORT : FLAT_Real_AllAddr_gfx10<0x00b>; +defm SCRATCH_LOAD_DWORD : FLAT_Real_AllAddr_gfx10<0x00c>; +defm SCRATCH_LOAD_DWORDX2 : FLAT_Real_AllAddr_gfx10<0x00d>; +defm SCRATCH_LOAD_DWORDX4 : FLAT_Real_AllAddr_gfx10<0x00e>; +defm SCRATCH_LOAD_DWORDX3 : FLAT_Real_AllAddr_gfx10<0x00f>; +defm SCRATCH_STORE_BYTE : FLAT_Real_AllAddr_gfx10<0x018>; +defm SCRATCH_STORE_BYTE_D16_HI : FLAT_Real_AllAddr_gfx10<0x019>; +defm SCRATCH_STORE_SHORT : FLAT_Real_AllAddr_gfx10<0x01a>; +defm SCRATCH_STORE_SHORT_D16_HI : FLAT_Real_AllAddr_gfx10<0x01b>; +defm SCRATCH_STORE_DWORD : FLAT_Real_AllAddr_gfx10<0x01c>; +defm SCRATCH_STORE_DWORDX2 : FLAT_Real_AllAddr_gfx10<0x01d>; +defm SCRATCH_STORE_DWORDX4 : FLAT_Real_AllAddr_gfx10<0x01e>; +defm SCRATCH_STORE_DWORDX3 : FLAT_Real_AllAddr_gfx10<0x01f>; +defm SCRATCH_LOAD_UBYTE_D16 : FLAT_Real_AllAddr_gfx10<0x020>; +defm SCRATCH_LOAD_UBYTE_D16_HI : FLAT_Real_AllAddr_gfx10<0x021>; +defm SCRATCH_LOAD_SBYTE_D16 : FLAT_Real_AllAddr_gfx10<0x022>; +defm SCRATCH_LOAD_SBYTE_D16_HI : FLAT_Real_AllAddr_gfx10<0x023>; +defm SCRATCH_LOAD_SHORT_D16 : FLAT_Real_AllAddr_gfx10<0x024>; +defm SCRATCH_LOAD_SHORT_D16_HI : FLAT_Real_AllAddr_gfx10<0x025>; + +let SubtargetPredicate = HasAtomicFaddInsts in { + +defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Real_AllAddr_vi <0x04d>; +defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Real_AllAddr_vi <0x04e>; + +} // End SubtargetPredicate = HasAtomicFaddInsts diff --git a/lib/Target/AMDGPU/GCNDPPCombine.cpp b/lib/Target/AMDGPU/GCNDPPCombine.cpp index 56071d0d2374..e1845e2e8e87 100644 --- a/lib/Target/AMDGPU/GCNDPPCombine.cpp +++ b/lib/Target/AMDGPU/GCNDPPCombine.cpp @@ -1,37 +1,40 @@ //=======- GCNDPPCombine.cpp - optimization for DPP instructions ---==========// // -// The LLVM Compiler Infrastructure - -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // The pass combines V_MOV_B32_dpp instruction with its VALU uses as a DPP src0 -// operand.If any of the use instruction cannot be combined with the mov the +// operand. If any of the use instructions cannot be combined with the mov, the // whole sequence is reverted. // // $old = ... // $dpp_value = V_MOV_B32_dpp $old, $vgpr_to_be_read_from_other_lane, -// dpp_controls..., $bound_ctrl -// $res = VALU $dpp_value, ...
+// dpp_controls..., $row_mask, $bank_mask, $bound_ctrl +// $res = VALU $dpp_value [, src1] // // to // -// $res = VALU_DPP $folded_old, $vgpr_to_be_read_from_other_lane, ..., -// dpp_controls..., $folded_bound_ctrl +// $res = VALU_DPP $combined_old, $vgpr_to_be_read_from_other_lane, [src1,] +// dpp_controls..., $row_mask, $bank_mask, $combined_bound_ctrl // // Combining rules : // -// $bound_ctrl is DPP_BOUND_ZERO, $old is any -// $bound_ctrl is DPP_BOUND_OFF, $old is 0 +// if $row_mask and $bank_mask are fully enabled (0xF) and +// $bound_ctrl==DPP_BOUND_ZERO or $old==0 +// -> $combined_old = undef, +// $combined_bound_ctrl = DPP_BOUND_ZERO // -// ->$folded_old = undef, $folded_bound_ctrl = DPP_BOUND_ZERO -// $bound_ctrl is DPP_BOUND_OFF, $old is undef +// if the VALU op is binary and +// $bound_ctrl==DPP_BOUND_OFF and +// $old==identity value (immediate) for the VALU op +// -> $combined_old = src1, +// $combined_bound_ctrl = DPP_BOUND_OFF // -// ->$folded_old = undef, $folded_bound_ctrl = DPP_BOUND_OFF -// $bound_ctrl is DPP_BOUND_OFF, $old is foldable +// Otherwise cancel. // -// ->$folded_old = folded value, $folded_bound_ctrl = DPP_BOUND_OFF +// The mov_dpp instruction should reside in the same BB as all its uses //===----------------------------------------------------------------------===// #include "AMDGPU.h" @@ -67,20 +70,16 @@ class GCNDPPCombine : public MachineFunctionPass { MachineOperand *getOldOpndValue(MachineOperand &OldOpnd) const; - RegSubRegPair foldOldOpnd(MachineInstr &OrigMI, - RegSubRegPair OldOpndVGPR, - MachineOperand &OldOpndValue) const; - MachineInstr *createDPPInst(MachineInstr &OrigMI, MachineInstr &MovMI, - RegSubRegPair OldOpndVGPR, + RegSubRegPair CombOldVGPR, MachineOperand *OldOpnd, - bool BoundCtrlZero) const; + bool CombBCZ) const; MachineInstr *createDPPInst(MachineInstr &OrigMI, MachineInstr &MovMI, - RegSubRegPair OldOpndVGPR, - bool BoundCtrlZero) const; + RegSubRegPair CombOldVGPR, + bool CombBCZ) const; bool hasNoImmOrEqual(MachineInstr &MI, unsigned OpndName, @@ -153,8 +152,8 @@ MachineOperand *GCNDPPCombine::getOldOpndValue(MachineOperand &OldOpnd) const { MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, MachineInstr &MovMI, - RegSubRegPair OldOpndVGPR, - bool BoundCtrlZero) const { + RegSubRegPair CombOldVGPR, + bool CombBCZ) const { assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp); assert(TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst)->getReg() == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)->getReg()); @@ -178,9 +177,15 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, const int OldIdx = AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::old); if (OldIdx != -1) { assert(OldIdx == NumOperands); - assert(isOfRegClass(OldOpndVGPR, AMDGPU::VGPR_32RegClass, *MRI)); - DPPInst.addReg(OldOpndVGPR.Reg, 0, OldOpndVGPR.SubReg); + assert(isOfRegClass(CombOldVGPR, AMDGPU::VGPR_32RegClass, *MRI)); + DPPInst.addReg(CombOldVGPR.Reg, 0, CombOldVGPR.SubReg); ++NumOperands; + } else { + // TODO: this discards MAC/FMA instructions for now, let's add it later + LLVM_DEBUG(dbgs() << " failed: no old operand in DPP instruction," + " TBD\n"); + Fail = true; + break; } if (auto *Mod0 = TII->getNamedOperand(OrigMI, @@ -199,6 +204,7 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, break; } DPPInst.add(*Src0); + DPPInst->getOperand(NumOperands).setIsKill(false); ++NumOperands; if (auto *Mod1 = TII->getNamedOperand(OrigMI, @@ -231,7 +237,7 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, 
DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl)); DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask)); DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask)); - DPPInst.addImm(BoundCtrlZero ? 1 : 0); + DPPInst.addImm(CombBCZ ? 1 : 0); } while (false); if (Fail) { @@ -242,64 +248,81 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, return DPPInst.getInstr(); } -GCNDPPCombine::RegSubRegPair -GCNDPPCombine::foldOldOpnd(MachineInstr &OrigMI, - RegSubRegPair OldOpndVGPR, - MachineOperand &OldOpndValue) const { - assert(OldOpndValue.isImm()); - switch (OrigMI.getOpcode()) { +static bool isIdentityValue(unsigned OrigMIOp, MachineOperand *OldOpnd) { + assert(OldOpnd->isImm()); + switch (OrigMIOp) { default: break; + case AMDGPU::V_ADD_U32_e32: + case AMDGPU::V_ADD_U32_e64: + case AMDGPU::V_ADD_I32_e32: + case AMDGPU::V_ADD_I32_e64: + case AMDGPU::V_OR_B32_e32: + case AMDGPU::V_OR_B32_e64: + case AMDGPU::V_SUBREV_U32_e32: + case AMDGPU::V_SUBREV_U32_e64: + case AMDGPU::V_SUBREV_I32_e32: + case AMDGPU::V_SUBREV_I32_e64: case AMDGPU::V_MAX_U32_e32: - if (OldOpndValue.getImm() == std::numeric_limits<uint32_t>::max()) - return OldOpndVGPR; + case AMDGPU::V_MAX_U32_e64: + case AMDGPU::V_XOR_B32_e32: + case AMDGPU::V_XOR_B32_e64: + if (OldOpnd->getImm() == 0) + return true; break; - case AMDGPU::V_MAX_I32_e32: - if (OldOpndValue.getImm() == std::numeric_limits<int32_t>::max()) - return OldOpndVGPR; + case AMDGPU::V_AND_B32_e32: + case AMDGPU::V_AND_B32_e64: + case AMDGPU::V_MIN_U32_e32: + case AMDGPU::V_MIN_U32_e64: + if (static_cast<uint32_t>(OldOpnd->getImm()) == + std::numeric_limits<uint32_t>::max()) + return true; break; case AMDGPU::V_MIN_I32_e32: - if (OldOpndValue.getImm() == std::numeric_limits<int32_t>::min()) - return OldOpndVGPR; + case AMDGPU::V_MIN_I32_e64: + if (static_cast<int32_t>(OldOpnd->getImm()) == + std::numeric_limits<int32_t>::max()) + return true; + break; + case AMDGPU::V_MAX_I32_e32: + case AMDGPU::V_MAX_I32_e64: + if (static_cast<int32_t>(OldOpnd->getImm()) == + std::numeric_limits<int32_t>::min()) + return true; break; - case AMDGPU::V_MUL_I32_I24_e32: + case AMDGPU::V_MUL_I32_I24_e64: case AMDGPU::V_MUL_U32_U24_e32: - if (OldOpndValue.getImm() == 1) { - auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1); - assert(Src1 && Src1->isReg()); - return getRegSubRegPair(*Src1); - } + case AMDGPU::V_MUL_U32_U24_e64: + if (OldOpnd->getImm() == 1) + return true; break; } - return RegSubRegPair(); + return false; } -// Cases to combine: -// $bound_ctrl is DPP_BOUND_ZERO, $old is any -// $bound_ctrl is DPP_BOUND_OFF, $old is 0 -// -> $old = undef, $bound_ctrl = DPP_BOUND_ZERO - -// $bound_ctrl is DPP_BOUND_OFF, $old is undef -// -> $old = undef, $bound_ctrl = DPP_BOUND_OFF - -// $bound_ctrl is DPP_BOUND_OFF, $old is foldable -// -> $old = folded value, $bound_ctrl = DPP_BOUND_OFF - MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, MachineInstr &MovMI, - RegSubRegPair OldOpndVGPR, + RegSubRegPair CombOldVGPR, MachineOperand *OldOpndValue, - bool BoundCtrlZero) const { - assert(OldOpndVGPR.Reg); - if (!BoundCtrlZero && OldOpndValue) { - assert(OldOpndValue->isImm()); - OldOpndVGPR = foldOldOpnd(OrigMI, OldOpndVGPR, *OldOpndValue); - if (!OldOpndVGPR.Reg) { - LLVM_DEBUG(dbgs() << " failed: old immediate cannot be folded\n"); + bool CombBCZ) const { + assert(CombOldVGPR.Reg); + if (!CombBCZ && OldOpndValue && OldOpndValue->isImm()) { + auto *Src1 = TII->getNamedOperand(OrigMI, 
AMDGPU::OpName::src1); + if (!Src1 || !Src1->isReg()) { + LLVM_DEBUG(dbgs() << " failed: no src1 or it isn't a register\n"); + return nullptr; + } + if (!isIdentityValue(OrigMI.getOpcode(), OldOpndValue)) { + LLVM_DEBUG(dbgs() << " failed: old immediate isn't an identity\n"); + return nullptr; + } + CombOldVGPR = getRegSubRegPair(*Src1); + if (!isOfRegClass(CombOldVGPR, AMDGPU::VGPR_32RegClass, *MRI)) { + LLVM_DEBUG(dbgs() << " failed: src1 isn't a VGPR32 register\n"); return nullptr; } } - return createDPPInst(OrigMI, MovMI, OldOpndVGPR, BoundCtrlZero); + return createDPPInst(OrigMI, MovMI, CombOldVGPR, CombBCZ); } // returns true if MI doesn't have OpndName immediate operand or the @@ -316,31 +339,64 @@ bool GCNDPPCombine::hasNoImmOrEqual(MachineInstr &MI, unsigned OpndName, bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const { assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp); + LLVM_DEBUG(dbgs() << "\nDPP combine: " << MovMI); + + auto *DstOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst); + assert(DstOpnd && DstOpnd->isReg()); + auto DPPMovReg = DstOpnd->getReg(); + if (execMayBeModifiedBeforeAnyUse(*MRI, DPPMovReg, MovMI)) { + LLVM_DEBUG(dbgs() << " failed: EXEC mask should remain the same" + " for all uses\n"); + return false; + } + + auto *RowMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask); + assert(RowMaskOpnd && RowMaskOpnd->isImm()); + auto *BankMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask); + assert(BankMaskOpnd && BankMaskOpnd->isImm()); + const bool MaskAllLanes = RowMaskOpnd->getImm() == 0xF && + BankMaskOpnd->getImm() == 0xF; + auto *BCZOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bound_ctrl); assert(BCZOpnd && BCZOpnd->isImm()); - bool BoundCtrlZero = 0 != BCZOpnd->getImm(); - - LLVM_DEBUG(dbgs() << "\nDPP combine: " << MovMI); + bool BoundCtrlZero = BCZOpnd->getImm(); auto *OldOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::old); assert(OldOpnd && OldOpnd->isReg()); - auto OldOpndVGPR = getRegSubRegPair(*OldOpnd); - auto *OldOpndValue = getOldOpndValue(*OldOpnd); + + auto * const OldOpndValue = getOldOpndValue(*OldOpnd); + // OldOpndValue is either undef (IMPLICIT_DEF) or immediate or something else + // We could use: assert(!OldOpndValue || OldOpndValue->isImm()) + // but the third option is used to distinguish undef from non-immediate + // to reuse IMPLICIT_DEF instruction later assert(!OldOpndValue || OldOpndValue->isImm() || OldOpndValue == OldOpnd); - if (OldOpndValue) { - if (BoundCtrlZero) { - OldOpndVGPR.Reg = AMDGPU::NoRegister; // should be undef, ignore old opnd - OldOpndValue = nullptr; - } else { - if (!OldOpndValue->isImm()) { - LLVM_DEBUG(dbgs() << " failed: old operand isn't an imm or undef\n"); - return false; - } - if (OldOpndValue->getImm() == 0) { - OldOpndVGPR.Reg = AMDGPU::NoRegister; // should be undef - OldOpndValue = nullptr; - BoundCtrlZero = true; + + bool CombBCZ = false; + + if (MaskAllLanes && BoundCtrlZero) { // [1] + CombBCZ = true; + } else { + if (!OldOpndValue || !OldOpndValue->isImm()) { + LLVM_DEBUG(dbgs() << " failed: the DPP mov isn't combinable\n"); + return false; + } + + if (OldOpndValue->getParent()->getParent() != MovMI.getParent()) { + LLVM_DEBUG(dbgs() << + " failed: old reg def and mov should be in the same BB\n"); + return false; + } + + if (OldOpndValue->getImm() == 0) { + if (MaskAllLanes) { + assert(!BoundCtrlZero); // by check [1] + CombBCZ = true; } + } else if (BoundCtrlZero) { + assert(!MaskAllLanes); // by check [1] + LLVM_DEBUG(dbgs() << + " failed: 
old!=0 and bctrl:0 and not all lanes isn't combinable\n"); + return false; } } @@ -348,25 +404,28 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const { if (!OldOpndValue) dbgs() << "undef"; else - dbgs() << OldOpndValue->getImm(); - dbgs() << ", bound_ctrl=" << BoundCtrlZero << '\n'); - - std::vector<MachineInstr*> OrigMIs, DPPMIs; - if (!OldOpndVGPR.Reg) { // OldOpndVGPR = undef - OldOpndVGPR = RegSubRegPair( + dbgs() << *OldOpndValue; + dbgs() << ", bound_ctrl=" << CombBCZ << '\n'); + + SmallVector<MachineInstr*, 4> OrigMIs, DPPMIs; + auto CombOldVGPR = getRegSubRegPair(*OldOpnd); + // try to reuse previous old reg if its undefined (IMPLICIT_DEF) + if (CombBCZ && OldOpndValue) { // CombOldVGPR should be undef + CombOldVGPR = RegSubRegPair( MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass)); auto UndefInst = BuildMI(*MovMI.getParent(), MovMI, MovMI.getDebugLoc(), - TII->get(AMDGPU::IMPLICIT_DEF), OldOpndVGPR.Reg); + TII->get(AMDGPU::IMPLICIT_DEF), CombOldVGPR.Reg); DPPMIs.push_back(UndefInst.getInstr()); } OrigMIs.push_back(&MovMI); bool Rollback = true; - for (auto &Use : MRI->use_nodbg_operands( - TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst)->getReg())) { + for (auto &Use : MRI->use_nodbg_operands(DPPMovReg)) { Rollback = true; auto &OrigMI = *Use.getParent(); + LLVM_DEBUG(dbgs() << " try: " << OrigMI); + auto OrigOp = OrigMI.getOpcode(); if (TII->isVOP3(OrigOp)) { if (!TII->hasVALU32BitEncoding(OrigOp)) { @@ -389,8 +448,8 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const { LLVM_DEBUG(dbgs() << " combining: " << OrigMI); if (&Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)) { - if (auto *DPPInst = createDPPInst(OrigMI, MovMI, OldOpndVGPR, - OldOpndValue, BoundCtrlZero)) { + if (auto *DPPInst = createDPPInst(OrigMI, MovMI, CombOldVGPR, + OldOpndValue, CombBCZ)) { DPPMIs.push_back(DPPInst); Rollback = false; } @@ -401,8 +460,8 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const { BB->insert(OrigMI, NewMI); if (TII->commuteInstruction(*NewMI)) { LLVM_DEBUG(dbgs() << " commuted: " << *NewMI); - if (auto *DPPInst = createDPPInst(*NewMI, MovMI, OldOpndVGPR, - OldOpndValue, BoundCtrlZero)) { + if (auto *DPPInst = createDPPInst(*NewMI, MovMI, CombOldVGPR, + OldOpndValue, CombBCZ)) { DPPMIs.push_back(DPPInst); Rollback = false; } diff --git a/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index c6396de89c4f..885239e2faed 100644 --- a/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -1,9 +1,8 @@ //===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -21,6 +20,7 @@ #include "llvm/ADT/iterator_range.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/ScheduleDAG.h" #include "llvm/MC/MCInstrDesc.h" @@ -38,6 +38,7 @@ using namespace llvm; //===----------------------------------------------------------------------===// GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) : + IsHazardRecognizerMode(false), CurrCycleInstr(nullptr), MF(MF), ST(MF.getSubtarget<GCNSubtarget>()), @@ -45,7 +46,8 @@ GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) : TRI(TII.getRegisterInfo()), ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) { - MaxLookAhead = 5; + MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 18 : 5; + TSchedModel.init(&ST); } void GCNHazardRecognizer::EmitInstruction(SUnit *SU) { @@ -88,18 +90,38 @@ static bool isSMovRel(unsigned Opcode) { } } -static bool isSendMsgTraceDataOrGDS(const MachineInstr &MI) { +static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII, + const MachineInstr &MI) { + if (TII.isAlwaysGDS(MI.getOpcode())) + return true; + switch (MI.getOpcode()) { case AMDGPU::S_SENDMSG: case AMDGPU::S_SENDMSGHALT: case AMDGPU::S_TTRACEDATA: return true; + // These DS opcodes don't support GDS. + case AMDGPU::DS_NOP: + case AMDGPU::DS_PERMUTE_B32: + case AMDGPU::DS_BPERMUTE_B32: + return false; default: - // TODO: GDS + if (TII.isDS(MI.getOpcode())) { + int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(), + AMDGPU::OpName::gds); + if (MI.getOperand(GDS).getImm()) + return true; + } return false; } } +static bool isPermlane(const MachineInstr &MI) { + unsigned Opcode = MI.getOpcode(); + return Opcode == AMDGPU::V_PERMLANE16_B32 || + Opcode == AMDGPU::V_PERMLANEX16_B32; +} + static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) { const MachineOperand *RegOp = TII->getNamedOperand(RegInstr, AMDGPU::OpName::simm16); @@ -109,6 +131,8 @@ static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) { ScheduleHazardRecognizer::HazardType GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) { MachineInstr *MI = SU->getInstr(); + if (MI->isBundle()) + return NoHazard; if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0) return NoopHazard; @@ -119,6 +143,15 @@ GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) { && checkVMEMHazards(MI) > 0) return NoopHazard; + if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0) + return NoopHazard; + + if (checkFPAtomicToDenormModeHazard(MI) > 0) + return NoopHazard; + + if (ST.hasNoDataDepHazard()) + return NoHazard; + if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0) return NoopHazard; @@ -145,10 +178,16 @@ GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) { checkReadM0Hazards(MI) > 0) return NoopHazard; - if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(*MI) && + if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI) && checkReadM0Hazards(MI) > 0) return NoopHazard; + if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0) + return NoopHazard; + + if ((MI->mayLoad() || MI->mayStore()) && checkMAILdStHazards(MI) > 0) + return NoopHazard; + if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0) return NoopHazard; @@ -158,22 +197,74 @@ 
GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) { return NoHazard; } +static void insertNoopInBundle(MachineInstr *MI, const SIInstrInfo &TII) { + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP)) + .addImm(0); +} + +void GCNHazardRecognizer::processBundle() { + MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator()); + MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end(); + // Check bundled MachineInstr's for hazards. + for (; MI != E && MI->isInsideBundle(); ++MI) { + CurrCycleInstr = &*MI; + unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr); + + if (IsHazardRecognizerMode) + fixHazards(CurrCycleInstr); + + for (unsigned i = 0; i < WaitStates; ++i) + insertNoopInBundle(CurrCycleInstr, TII); + + // It’s unnecessary to track more than MaxLookAhead instructions. Since we + // include the bundled MI directly after, only add a maximum of + // (MaxLookAhead - 1) noops to EmittedInstrs. + for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i) + EmittedInstrs.push_front(nullptr); + + EmittedInstrs.push_front(CurrCycleInstr); + EmittedInstrs.resize(MaxLookAhead); + } + CurrCycleInstr = nullptr; +} + unsigned GCNHazardRecognizer::PreEmitNoops(SUnit *SU) { - return PreEmitNoops(SU->getInstr()); + IsHazardRecognizerMode = false; + return PreEmitNoopsCommon(SU->getInstr()); } unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) { + IsHazardRecognizerMode = true; + CurrCycleInstr = MI; + unsigned W = PreEmitNoopsCommon(MI); + fixHazards(MI); + CurrCycleInstr = nullptr; + return W; +} + +unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) { + if (MI->isBundle()) + return 0; + int WaitStates = std::max(0, checkAnyInstHazards(MI)); if (SIInstrInfo::isSMRD(*MI)) return std::max(WaitStates, checkSMRDHazards(MI)); - if (SIInstrInfo::isVALU(*MI)) - WaitStates = std::max(WaitStates, checkVALUHazards(MI)); - if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI)) WaitStates = std::max(WaitStates, checkVMEMHazards(MI)); + if (ST.hasNSAtoVMEMBug()) + WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI)); + + WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI)); + + if (ST.hasNoDataDepHazard()) + return WaitStates; + + if (SIInstrInfo::isVALU(*MI)) + WaitStates = std::max(WaitStates, checkVALUHazards(MI)); + if (SIInstrInfo::isDPP(*MI)) WaitStates = std::max(WaitStates, checkDPPHazards(MI)); @@ -199,9 +290,15 @@ unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) { isSMovRel(MI->getOpcode()))) return std::max(WaitStates, checkReadM0Hazards(MI)); - if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(*MI)) + if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) return std::max(WaitStates, checkReadM0Hazards(MI)); + if (SIInstrInfo::isMAI(*MI)) + return std::max(WaitStates, checkMAIHazards(MI)); + + if (MI->mayLoad() || MI->mayStore()) + return std::max(WaitStates, checkMAILdStHazards(MI)); + return WaitStates; } @@ -218,10 +315,14 @@ void GCNHazardRecognizer::AdvanceCycle() { // Do not track non-instructions which do not affect the wait states. // If included, these instructions can lead to buffer overflow such that // detectable hazards are missed. 
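// ---------------------------------------------------------------------------
// [Editor's note] A minimal, self-contained sketch of the control flow that
// the new IsHazardRecognizerMode flag above selects; RecognizerSketch,
// HistoryQuery, StreamWalkQuery and applyFixups are invented stand-ins, not
// LLVM APIs. In scheduler mode the recognizer may only consult its virtual
// history of emitted instructions; in hazard-recognizer mode it walks the
// real MIR (across block boundaries) and may rewrite it via fixHazards().
#include <functional>

struct RecognizerSketch {
  bool IsHazardRecognizerMode = false;
  std::function<unsigned()> HistoryQuery;    // bounded scan of EmittedInstrs
  std::function<unsigned()> StreamWalkQuery; // recursive walk over the MIR
  std::function<void()> applyFixups;         // e.g. insert s_nop / s_waitcnt

  unsigned preEmitNoops(bool CalledFromScheduler) {
    IsHazardRecognizerMode = !CalledFromScheduler;
    unsigned WaitStates =
        IsHazardRecognizerMode ? StreamWalkQuery() : HistoryQuery();
    if (IsHazardRecognizerMode)
      applyFixups(); // only this mode may mutate the instruction stream
    return WaitStates; // the caller still emits this many wait states
  }
};
// ---------------------------------------------------------------------------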
- if (CurrCycleInstr->getOpcode() == AMDGPU::IMPLICIT_DEF) + if (CurrCycleInstr->isImplicitDef() || CurrCycleInstr->isDebugInstr() || + CurrCycleInstr->isKill()) return; - else if (CurrCycleInstr->isDebugInstr()) + + if (CurrCycleInstr->isBundle()) { + processBundle(); return; + } unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr); @@ -252,41 +353,112 @@ void GCNHazardRecognizer::RecedeCycle() { // Helper Functions //===----------------------------------------------------------------------===// -int GCNHazardRecognizer::getWaitStatesSince( - function_ref<bool(MachineInstr *)> IsHazard) { +typedef function_ref<bool(MachineInstr *, int WaitStates)> IsExpiredFn; + +// Returns a minimum wait states since \p I walking all predecessors. +// Only scans until \p IsExpired does not return true. +// Can only be run in a hazard recognizer mode. +static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard, + MachineBasicBlock *MBB, + MachineBasicBlock::reverse_instr_iterator I, + int WaitStates, + IsExpiredFn IsExpired, + DenseSet<const MachineBasicBlock *> &Visited) { + for (auto E = MBB->instr_rend(); I != E; ++I) { + // Don't add WaitStates for parent BUNDLE instructions. + if (I->isBundle()) + continue; + + if (IsHazard(&*I)) + return WaitStates; + + if (I->isInlineAsm() || I->isImplicitDef() || I->isDebugInstr()) + continue; + + WaitStates += SIInstrInfo::getNumWaitStates(*I); + + if (IsExpired(&*I, WaitStates)) + return std::numeric_limits<int>::max(); + } + + int MinWaitStates = WaitStates; + bool Found = false; + for (MachineBasicBlock *Pred : MBB->predecessors()) { + if (!Visited.insert(Pred).second) + continue; + + int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(), + WaitStates, IsExpired, Visited); + + if (W == std::numeric_limits<int>::max()) + continue; + + MinWaitStates = Found ? 
std::min(MinWaitStates, W) : W;
+    if (IsExpired(nullptr, MinWaitStates))
+      return MinWaitStates;
+
+    Found = true;
+  }
+
+  if (Found)
+    return MinWaitStates;
+
+  return std::numeric_limits<int>::max();
+}
+
+static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
+                              MachineInstr *MI,
+                              IsExpiredFn IsExpired) {
+  DenseSet<const MachineBasicBlock *> Visited;
+  return getWaitStatesSince(IsHazard, MI->getParent(),
+                            std::next(MI->getReverseIterator()),
+                            0, IsExpired, Visited);
+}
+
+int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
+  if (IsHazardRecognizerMode) {
+    auto IsExpiredFn = [Limit] (MachineInstr *, int WaitStates) {
+      return WaitStates >= Limit;
+    };
+    return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
+  }
+
   int WaitStates = 0;
   for (MachineInstr *MI : EmittedInstrs) {
     if (MI) {
       if (IsHazard(MI))
         return WaitStates;
 
-      unsigned Opcode = MI->getOpcode();
-      if (Opcode == AMDGPU::INLINEASM)
+      if (MI->isInlineAsm())
         continue;
     }
     ++WaitStates;
+
+    if (WaitStates >= Limit)
+      break;
   }
   return std::numeric_limits<int>::max();
 }
 
-int GCNHazardRecognizer::getWaitStatesSinceDef(
-    unsigned Reg, function_ref<bool(MachineInstr *)> IsHazardDef) {
+int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
+                                               IsHazardFn IsHazardDef,
+                                               int Limit) {
   const SIRegisterInfo *TRI = ST.getRegisterInfo();
 
   auto IsHazardFn = [IsHazardDef, TRI, Reg] (MachineInstr *MI) {
     return IsHazardDef(MI) && MI->modifiesRegister(Reg, TRI);
   };
 
-  return getWaitStatesSince(IsHazardFn);
+  return getWaitStatesSince(IsHazardFn, Limit);
 }
 
-int GCNHazardRecognizer::getWaitStatesSinceSetReg(
-    function_ref<bool(MachineInstr *)> IsHazard) {
+int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
+                                                  int Limit) {
   auto IsHazardFn = [IsHazard] (MachineInstr *MI) {
     return isSSetReg(MI->getOpcode()) && IsHazard(MI);
   };
 
-  return getWaitStatesSince(IsHazardFn);
+  return getWaitStatesSince(IsHazardFn, Limit);
 }
 
 //===----------------------------------------------------------------------===//
@@ -328,9 +500,9 @@ int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
   // instructions in this group may return out of order and/or may be
   // replayed (i.e. the same instruction issued more than once).
   //
-  // In order to handle these situations correctly we need to make sure
-  // that when a clause has more than one instruction, no instruction in the
-  // clause writes to a register that is read another instruction in the clause
+  // In order to handle these situations correctly we need to make sure that
+  // when a clause has more than one instruction, no instruction in the clause
+  // writes to a register that is read by another instruction in the clause
   // (including itself). If we encounter this situation, we need to break the
   // clause by inserting a non-SMEM instruction.
@@ -363,13 +535,12 @@ int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
 }
 
 int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
-  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   int WaitStatesNeeded = 0;
 
   WaitStatesNeeded = checkSoftClauseHazards(SMRD);
 
   // This SMRD hazard only affects SI.
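// ---------------------------------------------------------------------------
// [Editor's note] An illustrative sketch of the arithmetic convention shared
// by all of the check*Hazards helpers below; neededNoops is an invented name,
// not part of the pass. Each helper computes "required distance minus
// observed distance"; a positive result is the number of wait states still
// missing. When no hazardous instruction is found within the search limit,
// the getWaitStatesSince* queries return INT_MAX, which drives the
// difference negative so the caller's std::max() discards it.
#include <algorithm>
#include <limits>

static int neededNoops(int RequiredWaitStates, int ObservedWaitStates) {
  // ObservedWaitStates is INT_MAX when no hazard was found within the limit.
  return std::max(0, RequiredWaitStates - ObservedWaitStates);
}
// Example: an SMRD reading an SGPR written by a VALU one wait state earlier
// needs neededNoops(4, 1) == 3 more wait states; with no such def in range,
// neededNoops(4, std::numeric_limits<int>::max()) == 0.
// ---------------------------------------------------------------------------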
- if (ST.getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS) + if (!ST.hasSMRDReadVALUDefHazard()) return WaitStatesNeeded; // A read of an SGPR by SMRD instruction requires 4 wait states when the @@ -384,7 +555,8 @@ int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) { if (!Use.isReg()) continue; int WaitStatesNeededForUse = - SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn); + SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn, + SmrdSgprWaitStates); WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); // This fixes what appears to be undocumented hardware behavior in SI where @@ -397,7 +569,8 @@ int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) { if (IsBufferSMRD) { int WaitStatesNeededForUse = SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), - IsBufferHazardDefFn); + IsBufferHazardDefFn, + SmrdSgprWaitStates); WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); } } @@ -406,7 +579,7 @@ int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) { } int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) { - if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) + if (!ST.hasVMEMReadSGPRVALUDefHazard()) return 0; int WaitStatesNeeded = checkSoftClauseHazards(VMEM); @@ -415,13 +588,13 @@ int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) { // SGPR was written by a VALU Instruction. const int VmemSgprWaitStates = 5; auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); }; - for (const MachineOperand &Use : VMEM->uses()) { if (!Use.isReg() || TRI.isVGPR(MF.getRegInfo(), Use.getReg())) continue; int WaitStatesNeededForUse = - VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn); + VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn, + VmemSgprWaitStates); WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); } return WaitStatesNeeded; @@ -441,13 +614,16 @@ int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) { if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg())) continue; int WaitStatesNeededForUse = - DppVgprWaitStates - getWaitStatesSinceDef(Use.getReg()); + DppVgprWaitStates - getWaitStatesSinceDef(Use.getReg(), + [](MachineInstr *) { return true; }, + DppVgprWaitStates); WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); } WaitStatesNeeded = std::max( WaitStatesNeeded, - DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn)); + DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn, + DppExecWaitStates)); return WaitStatesNeeded; } @@ -459,7 +635,8 @@ int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) { // instruction. 
const int DivFMasWaitStates = 4; auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); }; - int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn); + int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn, + DivFMasWaitStates); return DivFMasWaitStates - WaitStatesNeeded; } @@ -472,7 +649,7 @@ int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) { auto IsHazardFn = [TII, GetRegHWReg] (MachineInstr *MI) { return GetRegHWReg == getHWReg(TII, *MI); }; - int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn); + int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates); return GetRegWaitStates - WaitStatesNeeded; } @@ -481,12 +658,11 @@ int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) { const SIInstrInfo *TII = ST.getInstrInfo(); unsigned HWReg = getHWReg(TII, *SetRegInstr); - const int SetRegWaitStates = - ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ? 1 : 2; + const int SetRegWaitStates = ST.getSetRegWaitStates(); auto IsHazardFn = [TII, HWReg] (MachineInstr *MI) { return HWReg == getHWReg(TII, *MI); }; - int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn); + int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates); return SetRegWaitStates - WaitStatesNeeded; } @@ -557,7 +733,7 @@ int GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def, TRI->regsOverlap(MI->getOperand(DataIdx).getReg(), Reg); }; int WaitStatesNeededForDef = - VALUWaitStates - getWaitStatesSince(IsHazardFn); + VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates); WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef); return WaitStatesNeeded; @@ -622,12 +798,13 @@ int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) { }; const int RWLaneWaitStates = 4; - int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn); + int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn, + RWLaneWaitStates); return RWLaneWaitStates - WaitStatesSince; } int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) { - if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) + if (!ST.hasRFEHazards()) return 0; const SIInstrInfo *TII = ST.getInstrInfo(); @@ -637,7 +814,7 @@ int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) { auto IsHazardFn = [TII] (MachineInstr *MI) { return getHWReg(TII, *MI) == AMDGPU::Hwreg::ID_TRAPSTS; }; - int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn); + int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates); return RFEWaitStates - WaitStatesNeeded; } @@ -661,7 +838,8 @@ int GCNHazardRecognizer::checkAnyInstHazards(MachineInstr *MI) { return MI->getOpcode() == AMDGPU::S_MOV_FED_B32; }; int WaitStatesNeededForUse = - MovFedWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardFn); + MovFedWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardFn, + MovFedWaitStates); WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); } @@ -674,5 +852,557 @@ int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) { auto IsHazardFn = [TII] (MachineInstr *MI) { return TII->isSALU(*MI); }; - return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn); + return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, + SMovRelWaitStates); +} + +void GCNHazardRecognizer::fixHazards(MachineInstr *MI) { + fixVMEMtoScalarWriteHazards(MI); + fixVcmpxPermlaneHazards(MI); + fixSMEMtoVectorWriteHazards(MI); + 
fixVcmpxExecWARHazard(MI);
+  fixLdsBranchVmemWARHazard(MI);
+}
+
+bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
+  if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
+    return false;
+
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  auto IsHazardFn = [TII] (MachineInstr *MI) {
+    return TII->isVOPC(*MI);
+  };
+
+  auto IsExpiredFn = [] (MachineInstr *MI, int) {
+    if (!MI)
+      return false;
+    unsigned Opc = MI->getOpcode();
+    return SIInstrInfo::isVALU(*MI) &&
+           Opc != AMDGPU::V_NOP_e32 &&
+           Opc != AMDGPU::V_NOP_e64 &&
+           Opc != AMDGPU::V_NOP_sdwa;
+  };
+
+  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
+      std::numeric_limits<int>::max())
+    return false;
+
+  // V_NOP will be discarded by SQ.
+  // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
+  // which is always a VGPR and available.
+  auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
+  unsigned Reg = Src0->getReg();
+  bool IsUndef = Src0->isUndef();
+  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+          TII->get(AMDGPU::V_MOV_B32_e32))
+    .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
+    .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);
+
+  return true;
+}
+
+bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
+  if (!ST.hasVMEMtoScalarWriteHazard())
+    return false;
+
+  if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
+    return false;
+
+  if (MI->getNumDefs() == 0)
+    return false;
+
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+
+  auto IsHazardFn = [TRI, MI] (MachineInstr *I) {
+    if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isDS(*I) &&
+        !SIInstrInfo::isFLAT(*I))
+      return false;
+
+    for (const MachineOperand &Def : MI->defs()) {
+      MachineOperand *Op = I->findRegisterUseOperand(Def.getReg(), false, TRI);
+      if (!Op)
+        continue;
+      return true;
+    }
+    return false;
+  };
+
+  auto IsExpiredFn = [] (MachineInstr *MI, int) {
+    return MI && (SIInstrInfo::isVALU(*MI) ||
+                  (MI->getOpcode() == AMDGPU::S_WAITCNT &&
+                   !MI->getOperand(0).getImm()));
+  };
+
+  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
+      std::numeric_limits<int>::max())
+    return false;
+
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
+  return true;
+}
+
+bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
+  if (!ST.hasSMEMtoVectorWriteHazard())
+    return false;
+
+  if (!SIInstrInfo::isVALU(*MI))
+    return false;
+
+  unsigned SDSTName;
+  switch (MI->getOpcode()) {
+  case AMDGPU::V_READLANE_B32:
+  case AMDGPU::V_READFIRSTLANE_B32:
+    SDSTName = AMDGPU::OpName::vdst;
+    break;
+  default:
+    SDSTName = AMDGPU::OpName::sdst;
+    break;
+  }
+
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+  const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
+  const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
+  if (!SDST) {
+    for (const auto &MO : MI->implicit_operands()) {
+      if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg()))) {
+        SDST = &MO;
+        break;
+      }
+    }
+  }
+
+  if (!SDST)
+    return false;
+
+  const unsigned SDSTReg = SDST->getReg();
+  auto IsHazardFn = [SDSTReg, TRI] (MachineInstr *I) {
+    return SIInstrInfo::isSMRD(*I) && I->readsRegister(SDSTReg, TRI);
+  };
+
+  auto IsExpiredFn = [TII, IV] (MachineInstr *MI, int) {
+    if (MI) {
+      if (TII->isSALU(*MI)) {
+        switch (MI->getOpcode()) {
+        case AMDGPU::S_SETVSKIP:
+        case AMDGPU::S_VERSION:
+        case AMDGPU::S_WAITCNT_VSCNT:
+        case AMDGPU::S_WAITCNT_VMCNT:
+        case AMDGPU::S_WAITCNT_EXPCNT:
+          // These instructions cannot mitigate the hazard.
+          return false;
+        case AMDGPU::S_WAITCNT_LGKMCNT:
+          // Reducing lgkmcnt count to 0 always mitigates the hazard.
+          return (MI->getOperand(1).getImm() == 0) &&
+                 (MI->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
+        case AMDGPU::S_WAITCNT: {
+          const int64_t Imm = MI->getOperand(0).getImm();
+          AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
+          return (Decoded.LgkmCnt == 0);
+        }
+        default:
+          // SOPP instructions cannot mitigate the hazard.
+          if (TII->isSOPP(*MI))
+            return false;
+          // At this point the SALU can be assumed to mitigate the hazard
+          // because either:
+          // (a) it is independent of the at-risk SMEM (breaking chain),
+          // or
+          // (b) it is dependent on the SMEM, in which case an appropriate
+          //     s_waitcnt lgkmcnt _must_ exist between it and the at-risk
+          //     SMEM instruction.
+          return true;
+        }
+      }
+    }
+    return false;
+  };
+
+  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
+      std::numeric_limits<int>::max())
+    return false;
+
+  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+          TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
+    .addImm(0);
+  return true;
+}
+
+bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
+  if (!ST.hasVcmpxExecWARHazard() || !SIInstrInfo::isVALU(*MI))
+    return false;
+
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+  if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
+    return false;
+
+  auto IsHazardFn = [TRI] (MachineInstr *I) {
+    if (SIInstrInfo::isVALU(*I))
+      return false;
+    return I->readsRegister(AMDGPU::EXEC, TRI);
+  };
+
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  auto IsExpiredFn = [TII, TRI] (MachineInstr *MI, int) {
+    if (!MI)
+      return false;
+    if (SIInstrInfo::isVALU(*MI)) {
+      if (TII->getNamedOperand(*MI, AMDGPU::OpName::sdst))
+        return true;
+      for (auto MO : MI->implicit_operands())
+        if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg())))
+          return true;
+    }
+    if (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
+        (MI->getOperand(0).getImm() & 0xfffe) == 0xfffe)
+      return true;
+    return false;
+  };
+
+  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
+      std::numeric_limits<int>::max())
+    return false;
+
+  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
+    .addImm(0xfffe);
+  return true;
+}
+
+bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
+  if (!ST.hasLdsBranchVmemWARHazard())
+    return false;
+
+  auto IsHazardInst = [] (const MachineInstr *MI) {
+    if (SIInstrInfo::isDS(*MI))
+      return 1;
+    if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isSegmentSpecificFLAT(*MI))
+      return 2;
+    return 0;
+  };
+
+  auto InstType = IsHazardInst(MI);
+  if (!InstType)
+    return false;
+
+  auto IsExpiredFn = [&IsHazardInst] (MachineInstr *I, int) {
+    return I && (IsHazardInst(I) ||
+                 (I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
+                  I->getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
+                  !I->getOperand(1).getImm()));
+  };
+
+  auto IsHazardFn = [InstType, &IsHazardInst] (MachineInstr *I) {
+    if (!I->isBranch())
+      return false;
+
+    auto IsHazardFn = [InstType, IsHazardInst] (MachineInstr *I) {
+      auto InstType2 = IsHazardInst(I);
+      return InstType2 && InstType != InstType2;
+    };
+
+    auto IsExpiredFn = [InstType, &IsHazardInst] (MachineInstr *I, int) {
+      if (!I)
+        return false;
+
+      auto InstType2 = IsHazardInst(I);
+      if (InstType == InstType2)
+        return true;
+
+      return I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
+             I->getOperand(0).getReg() ==
AMDGPU::SGPR_NULL && + !I->getOperand(1).getImm(); + }; + + return ::getWaitStatesSince(IsHazardFn, I, IsExpiredFn) != + std::numeric_limits<int>::max(); + }; + + if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == + std::numeric_limits<int>::max()) + return false; + + const SIInstrInfo *TII = ST.getInstrInfo(); + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), + TII->get(AMDGPU::S_WAITCNT_VSCNT)) + .addReg(AMDGPU::SGPR_NULL, RegState::Undef) + .addImm(0); + + return true; +} + +int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) { + int NSAtoVMEMWaitStates = 1; + + if (!ST.hasNSAtoVMEMBug()) + return 0; + + if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI)) + return 0; + + const SIInstrInfo *TII = ST.getInstrInfo(); + const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset); + if (!Offset || (Offset->getImm() & 6) == 0) + return 0; + + auto IsHazardFn = [TII] (MachineInstr *I) { + if (!SIInstrInfo::isMIMG(*I)) + return false; + const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I->getOpcode()); + return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA && + TII->getInstSizeInBytes(*I) >= 16; + }; + + return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1); +} + +int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) { + int FPAtomicToDenormModeWaitStates = 3; + + if (MI->getOpcode() != AMDGPU::S_DENORM_MODE) + return 0; + + auto IsHazardFn = [] (MachineInstr *I) { + if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isFLAT(*I)) + return false; + return SIInstrInfo::isFPAtomic(*I); + }; + + auto IsExpiredFn = [] (MachineInstr *MI, int WaitStates) { + if (WaitStates >= 3 || SIInstrInfo::isVALU(*MI)) + return true; + + switch (MI->getOpcode()) { + case AMDGPU::S_WAITCNT: + case AMDGPU::S_WAITCNT_VSCNT: + case AMDGPU::S_WAITCNT_VMCNT: + case AMDGPU::S_WAITCNT_EXPCNT: + case AMDGPU::S_WAITCNT_LGKMCNT: + case AMDGPU::S_WAITCNT_IDLE: + return true; + default: + break; + } + + return false; + }; + + + return FPAtomicToDenormModeWaitStates - + ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn); +} + +int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) { + assert(SIInstrInfo::isMAI(*MI)); + + int WaitStatesNeeded = 0; + unsigned Opc = MI->getOpcode(); + + auto IsVALUFn = [] (MachineInstr *MI) { + return SIInstrInfo::isVALU(*MI); + }; + + if (Opc != AMDGPU::V_ACCVGPR_READ_B32) { // MFMA or v_accvgpr_write + const int LegacyVALUWritesVGPRWaitStates = 2; + const int VALUWritesExecWaitStates = 4; + const int MaxWaitStates = 4; + + int WaitStatesNeededForUse = VALUWritesExecWaitStates - + getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates); + WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); + + if (WaitStatesNeeded < MaxWaitStates) { + for (const MachineOperand &Use : MI->explicit_uses()) { + const int MaxWaitStates = 2; + + if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg())) + continue; + + int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates - + getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates); + WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); + + if (WaitStatesNeeded == MaxWaitStates) + break; + } + } + } + + auto IsMFMAFn = [] (MachineInstr *MI) { + return SIInstrInfo::isMAI(*MI) && + MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32 && + MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32; + }; + + for (const MachineOperand &Op : MI->explicit_operands()) { + if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg())) + continue; + + if (Op.isDef() && Opc 
!= AMDGPU::V_ACCVGPR_WRITE_B32) + continue; + + const int MFMAWritesAGPROverlappedSrcABWaitStates = 4; + const int MFMAWritesAGPROverlappedSrcCWaitStates = 2; + const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4; + const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10; + const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18; + const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1; + const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7; + const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15; + const int MaxWaitStates = 18; + unsigned Reg = Op.getReg(); + unsigned HazardDefLatency = 0; + + auto IsOverlappedMFMAFn = [Reg, &IsMFMAFn, &HazardDefLatency, this] + (MachineInstr *MI) { + if (!IsMFMAFn(MI)) + return false; + unsigned DstReg = MI->getOperand(0).getReg(); + if (DstReg == Reg) + return false; + HazardDefLatency = std::max(HazardDefLatency, + TSchedModel.computeInstrLatency(MI)); + return TRI.regsOverlap(DstReg, Reg); + }; + + int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, + MaxWaitStates); + int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates; + int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); + int OpNo = MI->getOperandNo(&Op); + if (OpNo == SrcCIdx) { + NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates; + } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32) { + switch (HazardDefLatency) { + case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates; + break; + case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates; + break; + case 16: LLVM_FALLTHROUGH; + default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates; + break; + } + } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32) { + switch (HazardDefLatency) { + case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates; + break; + case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates; + break; + case 16: LLVM_FALLTHROUGH; + default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates; + break; + } + } + + int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef; + WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); + + if (WaitStatesNeeded == MaxWaitStates) + return WaitStatesNeeded; // Early exit. + + auto IsAccVgprWriteFn = [Reg, this] (MachineInstr *MI) { + if (MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32) + return false; + unsigned DstReg = MI->getOperand(0).getReg(); + return TRI.regsOverlap(Reg, DstReg); + }; + + const int AccVGPRWriteMFMAReadSrcCWaitStates = 1; + const int AccVGPRWriteMFMAReadSrcABWaitStates = 3; + const int AccVGPRWriteAccVgprReadWaitStates = 3; + NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates; + if (OpNo == SrcCIdx) + NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates; + else if (Opc == AMDGPU::V_ACCVGPR_READ_B32) + NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates; + + WaitStatesNeededForUse = NeedWaitStates - + getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates); + WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); + + if (WaitStatesNeeded == MaxWaitStates) + return WaitStatesNeeded; // Early exit. 
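+      // [Editor's note, illustrative] The HazardDefLatency values switched
+      // on above come from TSchedModel.computeInstrLatency() for the MFMA
+      // that produced the AGPR: roughly 2 cycles for the 4x4 variants, 8 for
+      // 16x16, and 16 for 32x32. The constant tables then translate that
+      // latency into the wait states required before this consumer, e.g. an
+      // MFMA 32x32 result feeding v_accvgpr_read needs 18 wait states, while
+      // a 4x4 result needs only 4.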
+ } + + if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32) { + const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0; + const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5; + const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13; + const int MaxWaitStates = 13; + unsigned DstReg = MI->getOperand(0).getReg(); + unsigned HazardDefLatency = 0; + + auto IsSrcCMFMAFn = [DstReg, &IsMFMAFn, &HazardDefLatency, this] + (MachineInstr *MI) { + if (!IsMFMAFn(MI)) + return false; + unsigned Reg = TII.getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg(); + HazardDefLatency = std::max(HazardDefLatency, + TSchedModel.computeInstrLatency(MI)); + return TRI.regsOverlap(Reg, DstReg); + }; + + int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates); + int NeedWaitStates; + switch (HazardDefLatency) { + case 2: NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates; + break; + case 8: NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates; + break; + case 16: LLVM_FALLTHROUGH; + default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates; + break; + } + + int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince; + WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); + } + + return WaitStatesNeeded; +} + +int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) { + if (!ST.hasMAIInsts()) + return 0; + + int WaitStatesNeeded = 0; + + auto IsAccVgprReadFn = [] (MachineInstr *MI) { + return MI->getOpcode() == AMDGPU::V_ACCVGPR_READ_B32; + }; + + for (const MachineOperand &Op : MI->explicit_uses()) { + if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg())) + continue; + + unsigned Reg = Op.getReg(); + + const int AccVgprReadLdStWaitStates = 2; + const int VALUWriteAccVgprReadLdStDepVALUWaitStates = 1; + const int MaxWaitStates = 2; + + int WaitStatesNeededForUse = AccVgprReadLdStWaitStates - + getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates); + WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); + + if (WaitStatesNeeded == MaxWaitStates) + return WaitStatesNeeded; // Early exit. + + auto IsVALUAccVgprReadCheckFn = [Reg, this] (MachineInstr *MI) { + if (MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32) + return false; + auto IsVALUFn = [] (MachineInstr *MI) { + return SIInstrInfo::isVALU(*MI) && !SIInstrInfo::isMAI(*MI); + }; + return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) < + std::numeric_limits<int>::max(); + }; + + WaitStatesNeededForUse = VALUWriteAccVgprReadLdStDepVALUWaitStates - + getWaitStatesSince(IsVALUAccVgprReadCheckFn, MaxWaitStates); + WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); + } + + return WaitStatesNeeded; } diff --git a/lib/Target/AMDGPU/GCNHazardRecognizer.h b/lib/Target/AMDGPU/GCNHazardRecognizer.h index ca17e7cb6018..6aa2e70dfbfb 100644 --- a/lib/Target/AMDGPU/GCNHazardRecognizer.h +++ b/lib/Target/AMDGPU/GCNHazardRecognizer.h @@ -1,9 +1,8 @@ //===-- GCNHazardRecognizers.h - GCN Hazard Recognizers ---------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -17,6 +16,7 @@ #include "llvm/ADT/BitVector.h" #include "llvm/ADT/STLExtras.h" #include "llvm/CodeGen/ScheduleHazardRecognizer.h" +#include "llvm/CodeGen/TargetSchedule.h" #include <list> namespace llvm { @@ -31,6 +31,13 @@ class SIRegisterInfo; class GCNSubtarget; class GCNHazardRecognizer final : public ScheduleHazardRecognizer { +public: + typedef function_ref<bool(MachineInstr *)> IsHazardFn; + +private: + // Distinguish if we are called from scheduler or hazard recognizer + bool IsHazardRecognizerMode; + // This variable stores the instruction that has been emitted this cycle. It // will be added to EmittedInstrs, when AdvanceCycle() or RecedeCycle() is // called. @@ -40,6 +47,7 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer { const GCNSubtarget &ST; const SIInstrInfo &TII; const SIRegisterInfo &TRI; + TargetSchedModel TSchedModel; /// RegUnits of uses in the current soft memory clause. BitVector ClauseUses; @@ -54,11 +62,13 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer { void addClauseInst(const MachineInstr &MI); - int getWaitStatesSince(function_ref<bool(MachineInstr *)> IsHazard); - int getWaitStatesSinceDef(unsigned Reg, - function_ref<bool(MachineInstr *)> IsHazardDef = - [](MachineInstr *) { return true; }); - int getWaitStatesSinceSetReg(function_ref<bool(MachineInstr *)> IsHazard); + // Advance over a MachineInstr bundle. Look for hazards in the bundled + // instructions. + void processBundle(); + + int getWaitStatesSince(IsHazardFn IsHazard, int Limit); + int getWaitStatesSinceDef(unsigned Reg, IsHazardFn IsHazardDef, int Limit); + int getWaitStatesSinceSetReg(IsHazardFn IsHazard, int Limit); int checkSoftClauseHazards(MachineInstr *SMEM); int checkSMRDHazards(MachineInstr *SMRD); @@ -75,6 +85,18 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer { int checkInlineAsmHazards(MachineInstr *IA); int checkAnyInstHazards(MachineInstr *MI); int checkReadM0Hazards(MachineInstr *SMovRel); + int checkNSAtoVMEMHazard(MachineInstr *MI); + int checkFPAtomicToDenormModeHazard(MachineInstr *MI); + void fixHazards(MachineInstr *MI); + bool fixVcmpxPermlaneHazards(MachineInstr *MI); + bool fixVMEMtoScalarWriteHazards(MachineInstr *MI); + bool fixSMEMtoVectorWriteHazards(MachineInstr *MI); + bool fixVcmpxExecWARHazard(MachineInstr *MI); + bool fixLdsBranchVmemWARHazard(MachineInstr *MI); + + int checkMAIHazards(MachineInstr *MI); + int checkMAILdStHazards(MachineInstr *MI); + public: GCNHazardRecognizer(const MachineFunction &MF); // We can only issue one instruction per cycle. @@ -85,6 +107,7 @@ public: void EmitNoop() override; unsigned PreEmitNoops(SUnit *SU) override; unsigned PreEmitNoops(MachineInstr *) override; + unsigned PreEmitNoopsCommon(MachineInstr *); void AdvanceCycle() override; void RecedeCycle() override; }; diff --git a/lib/Target/AMDGPU/GCNILPSched.cpp b/lib/Target/AMDGPU/GCNILPSched.cpp index d62dc8d86781..1eb617640c32 100644 --- a/lib/Target/AMDGPU/GCNILPSched.cpp +++ b/lib/Target/AMDGPU/GCNILPSched.cpp @@ -1,9 +1,8 @@ //===---------------------------- GCNILPSched.cpp - -----------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
diff --git a/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
index 8e4cc391dc21..3525174223bd 100644
--- a/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
+++ b/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
@@ -1,9 +1,8 @@
 //===- GCNIterativeScheduler.cpp ------------------------------------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/GCNIterativeScheduler.h b/lib/Target/AMDGPU/GCNIterativeScheduler.h
index 14ef5147f32a..e6f83914af5b 100644
--- a/lib/Target/AMDGPU/GCNIterativeScheduler.h
+++ b/lib/Target/AMDGPU/GCNIterativeScheduler.h
@@ -1,9 +1,8 @@
 //===- GCNIterativeScheduler.h - GCN Scheduler ------------------*- C++ -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/GCNMinRegStrategy.cpp b/lib/Target/AMDGPU/GCNMinRegStrategy.cpp
index ec6bcae33555..c469cf290e26 100644
--- a/lib/Target/AMDGPU/GCNMinRegStrategy.cpp
+++ b/lib/Target/AMDGPU/GCNMinRegStrategy.cpp
@@ -1,9 +1,8 @@
 //===- GCNMinRegStrategy.cpp ----------------------------------------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/GCNNSAReassign.cpp b/lib/Target/AMDGPU/GCNNSAReassign.cpp
new file mode 100644
index 000000000000..51c4c99cfb18
--- /dev/null
+++ b/lib/Target/AMDGPU/GCNNSAReassign.cpp
@@ -0,0 +1,343 @@
+//===-- GCNNSAReassign.cpp - Reassign registers in NSA instructions -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Try to reassign registers on GFX10+ from non-sequential to sequential
+/// in NSA image instructions. A later SIShrinkInstructions pass will replace
+/// NSA with sequential versions where possible.
+/// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/LiveInterval.h" +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/LiveRegMatrix.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/VirtRegMap.h" +#include "llvm/Support/MathExtras.h" +#include <algorithm> + +using namespace llvm; + +#define DEBUG_TYPE "amdgpu-nsa-reassign" + +STATISTIC(NumNSAInstructions, + "Number of NSA instructions with non-sequential address found"); +STATISTIC(NumNSAConverted, + "Number of NSA instructions changed to sequential"); + +namespace { + +class GCNNSAReassign : public MachineFunctionPass { +public: + static char ID; + + GCNNSAReassign() : MachineFunctionPass(ID) { + initializeGCNNSAReassignPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + StringRef getPassName() const override { return "GCN NSA Reassign"; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<LiveIntervals>(); + AU.addRequired<VirtRegMap>(); + AU.addRequired<LiveRegMatrix>(); + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + } + +private: + typedef enum { + NOT_NSA, // Not an NSA instruction + FIXED, // NSA which we cannot modify + NON_CONTIGUOUS, // NSA with non-sequential address which we can try + // to optimize. + CONTIGUOUS // NSA with all sequential address registers + } NSA_Status; + + const GCNSubtarget *ST; + + const MachineRegisterInfo *MRI; + + const SIRegisterInfo *TRI; + + VirtRegMap *VRM; + + LiveRegMatrix *LRM; + + LiveIntervals *LIS; + + unsigned MaxNumVGPRs; + + const MCPhysReg *CSRegs; + + NSA_Status CheckNSA(const MachineInstr &MI, bool Fast = false) const; + + bool tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals, + unsigned StartReg) const; + + bool canAssign(unsigned StartReg, unsigned NumRegs) const; + + bool scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const; +}; + +} // End anonymous namespace. 
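// ---------------------------------------------------------------------------
// [Editor's note] A self-contained sketch of the property CheckNSA() below
// ultimately tests; isContiguous is an invented helper, not part of the pass.
// An NSA (non-sequential address) MIMG instruction names one VGPR per address
// dword. If, after allocation, those physical VGPRs happen to form a run
// base, base+1, ..., the instruction no longer needs the longer NSA encoding
// and SIShrinkInstructions can rewrite it to the sequential form.
#include <vector>

static bool isContiguous(const std::vector<unsigned> &PhysVgprs) {
  // PhysVgprs[I] is the physical VGPR assigned to address dword I.
  for (unsigned I = 1; I < PhysVgprs.size(); ++I)
    if (PhysVgprs[I] != PhysVgprs[0] + I)
      return false; // still requires the NSA encoding
  return true;      // eligible for the sequential encoding, e.g. v[4:7]
}
// Example: {4,5,6,7} is contiguous; {4,9,5,7} is the kind of assignment this
// pass tries to turn into a contiguous run by reassigning the live intervals.
// ---------------------------------------------------------------------------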
+
+INITIALIZE_PASS_BEGIN(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
+INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix)
+INITIALIZE_PASS_END(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign",
+                    false, false)
+
+
+char GCNNSAReassign::ID = 0;
+
+char &llvm::GCNNSAReassignID = GCNNSAReassign::ID;
+
+bool
+GCNNSAReassign::tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals,
+                                   unsigned StartReg) const {
+  unsigned NumRegs = Intervals.size();
+
+  for (unsigned N = 0; N < NumRegs; ++N)
+    if (VRM->hasPhys(Intervals[N]->reg))
+      LRM->unassign(*Intervals[N]);
+
+  for (unsigned N = 0; N < NumRegs; ++N)
+    if (LRM->checkInterference(*Intervals[N], StartReg + N))
+      return false;
+
+  for (unsigned N = 0; N < NumRegs; ++N)
+    LRM->assign(*Intervals[N], StartReg + N);
+
+  return true;
+}
+
+bool GCNNSAReassign::canAssign(unsigned StartReg, unsigned NumRegs) const {
+  for (unsigned N = 0; N < NumRegs; ++N) {
+    unsigned Reg = StartReg + N;
+    if (!MRI->isAllocatable(Reg))
+      return false;
+
+    for (unsigned I = 0; CSRegs[I]; ++I)
+      if (TRI->isSubRegisterEq(Reg, CSRegs[I]) &&
+          !LRM->isPhysRegUsed(CSRegs[I]))
+        return false;
+  }
+
+  return true;
+}
+
+bool
+GCNNSAReassign::scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const {
+  unsigned NumRegs = Intervals.size();
+
+  if (NumRegs > MaxNumVGPRs)
+    return false;
+  unsigned MaxReg = MaxNumVGPRs - NumRegs + AMDGPU::VGPR0;
+
+  for (unsigned Reg = AMDGPU::VGPR0; Reg <= MaxReg; ++Reg) {
+    if (!canAssign(Reg, NumRegs))
+      continue;
+
+    if (tryAssignRegisters(Intervals, Reg))
+      return true;
+  }
+
+  return false;
+}
+
+GCNNSAReassign::NSA_Status
+GCNNSAReassign::CheckNSA(const MachineInstr &MI, bool Fast) const {
+  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
+  if (!Info || Info->MIMGEncoding != AMDGPU::MIMGEncGfx10NSA)
+    return NSA_Status::NOT_NSA;
+
+  int VAddr0Idx =
+    AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
+
+  unsigned VgprBase = 0;
+  bool NSA = false;
+  for (unsigned I = 0; I < Info->VAddrDwords; ++I) {
+    const MachineOperand &Op = MI.getOperand(VAddr0Idx + I);
+    unsigned Reg = Op.getReg();
+    if (TargetRegisterInfo::isPhysicalRegister(Reg) || !VRM->isAssignedReg(Reg))
+      return NSA_Status::FIXED;
+
+    unsigned PhysReg = VRM->getPhys(Reg);
+
+    if (!Fast) {
+      if (!PhysReg)
+        return NSA_Status::FIXED;
+
+      // Bail if the address is not a VGPR32. It should be possible to extend
+      // the optimization to work with subregs of wider register tuples, but
+      // the logic to find free registers would be much more complicated, with
+      // much less chance of success. It seems reasonable to assume that in
+      // most cases a tuple is used because a vector variable contains
+      // different parts of an address and it is either already consecutive or
+      // cannot be reassigned if not. If needed it is better to rely on the
+      // register coalescer to process such address tuples.
+      if (MRI->getRegClass(Reg) != &AMDGPU::VGPR_32RegClass || Op.getSubReg())
+        return NSA_Status::FIXED;
+
+      const MachineInstr *Def = MRI->getUniqueVRegDef(Reg);
+
+      if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg)
+        return NSA_Status::FIXED;
+
+      for (auto U : MRI->use_nodbg_operands(Reg)) {
+        if (U.isImplicit())
+          return NSA_Status::FIXED;
+        const MachineInstr *UseInst = U.getParent();
+        if (UseInst->isCopy() && UseInst->getOperand(0).getReg() == PhysReg)
+          return NSA_Status::FIXED;
+      }
+
+      if (!LIS->hasInterval(Reg))
+        return NSA_Status::FIXED;
+    }
+
+    if (I == 0)
+      VgprBase = PhysReg;
+    else if (VgprBase + I != PhysReg)
+      NSA = true;
+  }
+
+  return NSA ? NSA_Status::NON_CONTIGUOUS : NSA_Status::CONTIGUOUS;
+}
+
+bool GCNNSAReassign::runOnMachineFunction(MachineFunction &MF) {
+  ST = &MF.getSubtarget<GCNSubtarget>();
+  if (ST->getGeneration() < GCNSubtarget::GFX10)
+    return false;
+
+  MRI = &MF.getRegInfo();
+  TRI = ST->getRegisterInfo();
+  VRM = &getAnalysis<VirtRegMap>();
+  LRM = &getAnalysis<LiveRegMatrix>();
+  LIS = &getAnalysis<LiveIntervals>();
+
+  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  MaxNumVGPRs = ST->getMaxNumVGPRs(MF);
+  MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(MFI->getOccupancy()), MaxNumVGPRs);
+  CSRegs = MRI->getCalleeSavedRegs();
+
+  using Candidate = std::pair<const MachineInstr*, bool>;
+  SmallVector<Candidate, 32> Candidates;
+  for (const MachineBasicBlock &MBB : MF) {
+    for (const MachineInstr &MI : MBB) {
+      switch (CheckNSA(MI)) {
+      default:
+        continue;
+      case NSA_Status::CONTIGUOUS:
+        Candidates.push_back(std::make_pair(&MI, true));
+        break;
+      case NSA_Status::NON_CONTIGUOUS:
+        Candidates.push_back(std::make_pair(&MI, false));
+        ++NumNSAInstructions;
+        break;
+      }
+    }
+  }
+
+  bool Changed = false;
+  for (auto &C : Candidates) {
+    if (C.second)
+      continue;
+
+    const MachineInstr *MI = C.first;
+    if (CheckNSA(*MI, true) == NSA_Status::CONTIGUOUS) {
+      // Already happens to be fixed.
+      C.second = true;
+      ++NumNSAConverted;
+      continue;
+    }
+
+    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI->getOpcode());
+    int VAddr0Idx =
+      AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::vaddr0);
+
+    SmallVector<LiveInterval *, 16> Intervals;
+    SmallVector<unsigned, 16> OrigRegs;
+    SlotIndex MinInd, MaxInd;
+    for (unsigned I = 0; I < Info->VAddrDwords; ++I) {
+      const MachineOperand &Op = MI->getOperand(VAddr0Idx + I);
+      unsigned Reg = Op.getReg();
+      LiveInterval *LI = &LIS->getInterval(Reg);
+      if (llvm::find(Intervals, LI) != Intervals.end()) {
+        // Same register used, unable to make sequential.
+        Intervals.clear();
+        break;
+      }
+      Intervals.push_back(LI);
+      OrigRegs.push_back(VRM->getPhys(Reg));
+      MinInd = I ? std::min(MinInd, LI->beginIndex()) : LI->beginIndex();
+      MaxInd = I ? std::max(MaxInd, LI->endIndex()) : LI->endIndex();
+    }
+
+    if (Intervals.empty())
+      continue;
+
+    LLVM_DEBUG(dbgs() << "Attempting to reassign NSA: " << *MI
+                      << "\tOriginal allocation:\t";
+               for(auto *LI : Intervals)
+                 dbgs() << " " << llvm::printReg((VRM->getPhys(LI->reg)), TRI);
+               dbgs() << '\n');
+
+    bool Success = scavengeRegs(Intervals);
+    if (!Success) {
+      LLVM_DEBUG(dbgs() << "\tCannot reallocate.\n");
+      if (VRM->hasPhys(Intervals.back()->reg)) // Did not change allocation.
+        continue;
+    } else {
+      // Check that we did not make it worse for other instructions.
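+      // Candidates were collected in instruction order, so a binary search
+      // locates the first candidate at or after MinInd; every candidate that
+      // was already contiguous inside [MinInd, MaxInd) is then rechecked
+      // against the new assignment.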
+ auto I = std::lower_bound(Candidates.begin(), &C, MinInd, + [this](const Candidate &C, SlotIndex I) { + return LIS->getInstructionIndex(*C.first) < I; + }); + for (auto E = Candidates.end(); Success && I != E && + LIS->getInstructionIndex(*I->first) < MaxInd; ++I) { + if (I->second && CheckNSA(*I->first, true) < NSA_Status::CONTIGUOUS) { + Success = false; + LLVM_DEBUG(dbgs() << "\tNSA conversion conflict with " << *I->first); + } + } + } + + if (!Success) { + for (unsigned I = 0; I < Info->VAddrDwords; ++I) + if (VRM->hasPhys(Intervals[I]->reg)) + LRM->unassign(*Intervals[I]); + + for (unsigned I = 0; I < Info->VAddrDwords; ++I) + LRM->assign(*Intervals[I], OrigRegs[I]); + + continue; + } + + C.second = true; + ++NumNSAConverted; + LLVM_DEBUG(dbgs() << "\tNew allocation:\t\t [" + << llvm::printReg((VRM->getPhys(Intervals.front()->reg)), TRI) + << " : " + << llvm::printReg((VRM->getPhys(Intervals.back()->reg)), TRI) + << "]\n"); + Changed = true; + } + + return Changed; +} diff --git a/lib/Target/AMDGPU/GCNProcessors.td b/lib/Target/AMDGPU/GCNProcessors.td index b8142a4e4ff8..b926041afb2f 100644 --- a/lib/Target/AMDGPU/GCNProcessors.td +++ b/lib/Target/AMDGPU/GCNProcessors.td @@ -1,163 +1,185 @@ //===-- GCNProcessors.td - GCN Processor definitions ----------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // The code produced for "generic" is only useful for tests and cannot // reasonably be expected to execute on any particular target. def : ProcessorModel<"generic", NoSchedModel, - [FeatureGCN, FeatureWavefrontSize64] + [FeatureWavefrontSize64] >; -//===----------------------------------------------------------------------===// +def : ProcessorModel<"generic-hsa", NoSchedModel, + [FeatureWavefrontSize64, FeatureFlatAddressSpace] +>; + +//===------------------------------------------------------------===// // GCN GFX6 (Southern Islands (SI)). -//===----------------------------------------------------------------------===// +//===------------------------------------------------------------===// def : ProcessorModel<"gfx600", SIFullSpeedModel, - [FeatureISAVersion6_0_0] + FeatureISAVersion6_0_0.Features >; def : ProcessorModel<"tahiti", SIFullSpeedModel, - [FeatureISAVersion6_0_0] + FeatureISAVersion6_0_0.Features >; def : ProcessorModel<"gfx601", SIQuarterSpeedModel, - [FeatureISAVersion6_0_1] + FeatureISAVersion6_0_1.Features >; def : ProcessorModel<"hainan", SIQuarterSpeedModel, - [FeatureISAVersion6_0_1] + FeatureISAVersion6_0_1.Features >; def : ProcessorModel<"oland", SIQuarterSpeedModel, - [FeatureISAVersion6_0_1] + FeatureISAVersion6_0_1.Features >; def : ProcessorModel<"pitcairn", SIQuarterSpeedModel, - [FeatureISAVersion6_0_1] + FeatureISAVersion6_0_1.Features >; def : ProcessorModel<"verde", SIQuarterSpeedModel, - [FeatureISAVersion6_0_1] + FeatureISAVersion6_0_1.Features >; -//===----------------------------------------------------------------------===// +//===------------------------------------------------------------===// // GCN GFX7 (Sea Islands (CI)). 
-//===----------------------------------------------------------------------===// +//===------------------------------------------------------------===// def : ProcessorModel<"gfx700", SIQuarterSpeedModel, - [FeatureISAVersion7_0_0] + FeatureISAVersion7_0_0.Features >; def : ProcessorModel<"kaveri", SIQuarterSpeedModel, - [FeatureISAVersion7_0_0] + FeatureISAVersion7_0_0.Features >; def : ProcessorModel<"gfx701", SIFullSpeedModel, - [FeatureISAVersion7_0_1] + FeatureISAVersion7_0_1.Features >; def : ProcessorModel<"hawaii", SIFullSpeedModel, - [FeatureISAVersion7_0_1] + FeatureISAVersion7_0_1.Features >; def : ProcessorModel<"gfx702", SIQuarterSpeedModel, - [FeatureISAVersion7_0_2] + FeatureISAVersion7_0_2.Features >; def : ProcessorModel<"gfx703", SIQuarterSpeedModel, - [FeatureISAVersion7_0_3] + FeatureISAVersion7_0_3.Features >; def : ProcessorModel<"kabini", SIQuarterSpeedModel, - [FeatureISAVersion7_0_3] + FeatureISAVersion7_0_3.Features >; def : ProcessorModel<"mullins", SIQuarterSpeedModel, - [FeatureISAVersion7_0_3] + FeatureISAVersion7_0_3.Features >; def : ProcessorModel<"gfx704", SIQuarterSpeedModel, - [FeatureISAVersion7_0_4] + FeatureISAVersion7_0_4.Features >; def : ProcessorModel<"bonaire", SIQuarterSpeedModel, - [FeatureISAVersion7_0_4] + FeatureISAVersion7_0_4.Features >; -//===----------------------------------------------------------------------===// +//===------------------------------------------------------------===// // GCN GFX8 (Volcanic Islands (VI)). -//===----------------------------------------------------------------------===// +//===------------------------------------------------------------===// def : ProcessorModel<"gfx801", SIQuarterSpeedModel, - [FeatureISAVersion8_0_1] + FeatureISAVersion8_0_1.Features >; def : ProcessorModel<"carrizo", SIQuarterSpeedModel, - [FeatureISAVersion8_0_1] + FeatureISAVersion8_0_1.Features >; def : ProcessorModel<"gfx802", SIQuarterSpeedModel, - [FeatureISAVersion8_0_2] + FeatureISAVersion8_0_2.Features >; def : ProcessorModel<"iceland", SIQuarterSpeedModel, - [FeatureISAVersion8_0_2] + FeatureISAVersion8_0_2.Features >; def : ProcessorModel<"tonga", SIQuarterSpeedModel, - [FeatureISAVersion8_0_2] + FeatureISAVersion8_0_2.Features >; def : ProcessorModel<"gfx803", SIQuarterSpeedModel, - [FeatureISAVersion8_0_3] + FeatureISAVersion8_0_3.Features >; def : ProcessorModel<"fiji", SIQuarterSpeedModel, - [FeatureISAVersion8_0_3] + FeatureISAVersion8_0_3.Features >; def : ProcessorModel<"polaris10", SIQuarterSpeedModel, - [FeatureISAVersion8_0_3] + FeatureISAVersion8_0_3.Features >; def : ProcessorModel<"polaris11", SIQuarterSpeedModel, - [FeatureISAVersion8_0_3] + FeatureISAVersion8_0_3.Features >; def : ProcessorModel<"gfx810", SIQuarterSpeedModel, - [FeatureISAVersion8_1_0] + FeatureISAVersion8_1_0.Features >; def : ProcessorModel<"stoney", SIQuarterSpeedModel, - [FeatureISAVersion8_1_0] + FeatureISAVersion8_1_0.Features >; -//===----------------------------------------------------------------------===// +//===------------------------------------------------------------===// // GCN GFX9. 
-//===----------------------------------------------------------------------===//
+//===------------------------------------------------------------===//
 
 def : ProcessorModel<"gfx900", SIQuarterSpeedModel,
-  [FeatureISAVersion9_0_0]
+  FeatureISAVersion9_0_0.Features
 >;
 
 def : ProcessorModel<"gfx902", SIQuarterSpeedModel,
-  [FeatureISAVersion9_0_2]
+  FeatureISAVersion9_0_2.Features
 >;
 
 def : ProcessorModel<"gfx904", SIQuarterSpeedModel,
-  [FeatureISAVersion9_0_4]
+  FeatureISAVersion9_0_4.Features
 >;
 
 def : ProcessorModel<"gfx906", SIQuarterSpeedModel,
-  [FeatureISAVersion9_0_6]
+  FeatureISAVersion9_0_6.Features
+>;
+
+def : ProcessorModel<"gfx908", SIQuarterSpeedModel,
+  FeatureISAVersion9_0_8.Features
 >;
 
 def : ProcessorModel<"gfx909", SIQuarterSpeedModel,
-  [FeatureISAVersion9_0_9]
+  FeatureISAVersion9_0_9.Features
+>;
+
+//===----------------------------------------------------------------------===//
+// GCN GFX10.
+//===----------------------------------------------------------------------===//
+
+def : ProcessorModel<"gfx1010", GFX10SpeedModel,
+  FeatureISAVersion10_1_0.Features
 >;
 
+def : ProcessorModel<"gfx1011", GFX10SpeedModel,
+  FeatureISAVersion10_1_1.Features
+>;
+
+def : ProcessorModel<"gfx1012", GFX10SpeedModel,
+  FeatureISAVersion10_1_2.Features
+>;
diff --git a/lib/Target/AMDGPU/GCNRegBankReassign.cpp b/lib/Target/AMDGPU/GCNRegBankReassign.cpp
new file mode 100644
index 000000000000..f0d47eaa4ed1
--- /dev/null
+++ b/lib/Target/AMDGPU/GCNRegBankReassign.cpp
@@ -0,0 +1,800 @@
+//===-- GCNRegBankReassign.cpp - Reassign registers after regalloc --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Try to reassign registers on GFX10+ to reduce register bank
+/// conflicts.
+///
+/// On GFX10 registers are organized in banks. VGPRs have 4 banks assigned in
+/// a round-robin fashion: v0, v4, v8... belong to bank 0. v1, v5, v9... to
+/// bank 1, etc. SGPRs have 8 banks and are allocated in pairs, so that s0:s1,
+/// s16:s17, s32:s33 are at bank 0. s2:s3, s18:s19, s34:s35 are at bank 1 etc.
+///
+/// The shader can read one dword from each of these banks once per cycle.
+/// If an instruction has to read more than one register operand from the
+/// same bank, an additional cycle is needed. HW attempts to pre-load
+/// registers through input operand gathering, but a stall cycle may occur
+/// if that fails. For example V_FMA_F32 V111 = V0 + V4 * V8 will need
+/// 3 cycles to read operands, potentially incurring 2 stall cycles.
+///
+/// The pass tries to reassign registers to reduce bank conflicts.
+///
+/// In this pass bank numbers 0-3 are VGPR banks and 4-11 are SGPR banks, so
+/// that 4 has to be subtracted from an SGPR bank number to get the real value.
+/// This also corresponds to bit numbers in bank masks used in the pass.
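+///
+/// For example (numbers are illustrative): v9 belongs to VGPR bank 1
+/// (9 % 4 == 1), while the pair s18:s19 belongs to SGPR bank 1
+/// (18 / 2 == 9, 9 % 8 == 1), which this pass numbers as bank 5 and tracks
+/// as bit 5 in its bank masks.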
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/LiveRegMatrix.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/Support/MathExtras.h"
+
+using namespace llvm;
+
+static cl::opt<unsigned> VerifyStallCycles("amdgpu-verify-regbanks-reassign",
+  cl::desc("Verify stall cycles in the regbanks reassign pass"),
+  cl::value_desc("0|1|2"),
+  cl::init(0), cl::Hidden);
+
+#define DEBUG_TYPE "amdgpu-regbanks-reassign"
+
+#define NUM_VGPR_BANKS 4
+#define NUM_SGPR_BANKS 8
+#define NUM_BANKS (NUM_VGPR_BANKS + NUM_SGPR_BANKS)
+#define SGPR_BANK_OFFSET NUM_VGPR_BANKS
+#define VGPR_BANK_MASK 0xf
+#define SGPR_BANK_MASK 0xff0
+#define SGPR_BANK_SHIFTED_MASK (SGPR_BANK_MASK >> SGPR_BANK_OFFSET)
+
+STATISTIC(NumStallsDetected,
+          "Number of operand read stalls detected");
+STATISTIC(NumStallsRecovered,
+          "Number of operand read stalls recovered");
+
+namespace {
+
+class GCNRegBankReassign : public MachineFunctionPass {
+
+  class OperandMask {
+  public:
+    OperandMask(unsigned r, unsigned s, unsigned m)
+      : Reg(r), SubReg(s), Mask(m) {}
+    unsigned Reg;
+    unsigned SubReg;
+    unsigned Mask;
+  };
+
+  class Candidate {
+  public:
+    Candidate(MachineInstr *mi, unsigned reg, unsigned freebanks,
+              unsigned weight)
+      : MI(mi), Reg(reg), FreeBanks(freebanks), Weight(weight) {}
+
+    bool operator< (const Candidate& RHS) const { return Weight < RHS.Weight; }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+    void dump(const GCNRegBankReassign *P) const {
+      MI->dump();
+      dbgs() << P->printReg(Reg) << " to banks ";
+      dumpFreeBanks(FreeBanks);
+      dbgs() << " weight " << Weight << '\n';
+    }
+#endif
+
+    MachineInstr *MI;
+    unsigned Reg;
+    unsigned FreeBanks;
+    unsigned Weight;
+  };
+
+  class CandidateList : public std::list<Candidate> {
+  public:
+    // Speed up the subsequent sort.
+    void push(const Candidate&& C) {
+      if (C.Weight) push_back(C);
+      else push_front(C);
+    }
+  };
+
+public:
+  static char ID;
+
+public:
+  GCNRegBankReassign() : MachineFunctionPass(ID) {
+    initializeGCNRegBankReassignPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override { return "GCN RegBank Reassign"; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<MachineLoopInfo>();
+    AU.addRequired<LiveIntervals>();
+    AU.addRequired<VirtRegMap>();
+    AU.addRequired<LiveRegMatrix>();
+    AU.setPreservesAll();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+private:
+  const GCNSubtarget *ST;
+
+  const MachineRegisterInfo *MRI;
+
+  const SIRegisterInfo *TRI;
+
+  MachineLoopInfo *MLI;
+
+  VirtRegMap *VRM;
+
+  LiveRegMatrix *LRM;
+
+  LiveIntervals *LIS;
+
+  unsigned MaxNumVGPRs;
+
+  unsigned MaxNumSGPRs;
+
+  BitVector RegsUsed;
+
+  SmallVector<OperandMask, 8> OperandMasks;
+
+  CandidateList Candidates;
+
+  const MCPhysReg *CSRegs;
+
+  // Returns the bank for a phys reg.
+  unsigned getPhysRegBank(unsigned Reg) const;
+
+  // Return a bit set for each register bank used. 4 banks for VGPRs and
+  // 8 banks for SGPRs.
+  // Registers already processed and recorded in RegsUsed are excluded.
+  // If Bank is not -1, assume Reg:SubReg belongs to that Bank.
+  unsigned getRegBankMask(unsigned Reg, unsigned SubReg, int Bank);
+
+  // Return the number of stalls in the instruction.
+  // UsedBanks has bits set for the banks used by all operands.
+  // If Reg and Bank are provided, substitute the Reg with the Bank.
+  unsigned analyzeInst(const MachineInstr& MI, unsigned& UsedBanks,
+                       unsigned Reg = AMDGPU::NoRegister, int Bank = -1);
+
+  // Return true if register is a regular VGPR or SGPR or their tuples.
+  // Returns false for special registers like m0, vcc etc.
+  bool isReassignable(unsigned Reg) const;
+
+  // Check if registers' defs are old and may be pre-loaded.
+  // Returns 0 if both registers are old enough, 1 or 2 if one or both
+  // registers will not likely be pre-loaded.
+  unsigned getOperandGatherWeight(const MachineInstr& MI,
+                                  unsigned Reg1,
+                                  unsigned Reg2,
+                                  unsigned StallCycles) const;
+
+
+  // Find all bank bits in UsedBanks where Mask can be relocated to.
+  unsigned getFreeBanks(unsigned Mask, unsigned UsedBanks) const;
+
+  // Find all bank bits in UsedBanks where Mask can be relocated to.
+  // Bank is relative to the register and not its subregister component.
+  // Returns 0 if a register is not reassignable.
+  unsigned getFreeBanks(unsigned Reg, unsigned SubReg, unsigned Mask,
+                        unsigned UsedBanks) const;
+
+  // Add candidate instruction to the work list.
+  void collectCandidates(MachineInstr& MI, unsigned UsedBanks,
+                         unsigned StallCycles);
+
+  // Collect candidate instructions across the function. Returns the number
+  // of stall cycles detected. Only counts stalls if Collect is false.
+  unsigned collectCandidates(MachineFunction &MF, bool Collect = true);
+
+  // Remove all candidates that read the specified register.
+  void removeCandidates(unsigned Reg);
+
+  // Compute stalls within the uses of SrcReg replaced by a register from
+  // Bank. If Bank is -1, does not perform substitution. If Collect is set,
+  // candidates are collected and added to the work list.
+  unsigned computeStallCycles(unsigned SrcReg,
+                              unsigned Reg = AMDGPU::NoRegister,
+                              int Bank = -1, bool Collect = false);
+
+  // Search for a register in Bank unused within LI.
+  // Returns a phys reg or NoRegister.
+  unsigned scavengeReg(LiveInterval& LI, unsigned Bank) const;
+
+  // Try to reassign a candidate. Returns the number of stall cycles saved.
+  unsigned tryReassign(Candidate &C);
+
+  bool verifyCycles(MachineFunction &MF,
+                    unsigned OriginalCycles, unsigned CyclesSaved);
+
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+public:
+  Printable printReg(unsigned Reg, unsigned SubReg = 0) const {
+    return Printable([Reg, SubReg, this](raw_ostream &OS) {
+      if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+        OS << llvm::printReg(Reg, TRI);
+        return;
+      }
+      if (!VRM->isAssignedReg(Reg))
+        OS << "<unassigned> " << llvm::printReg(Reg, TRI);
+      else
+        OS << llvm::printReg(Reg, TRI) << '('
+           << llvm::printReg(VRM->getPhys(Reg), TRI) << ')';
+      if (SubReg)
+        OS << ':' << TRI->getSubRegIndexName(SubReg);
+    });
+  }
+
+  static Printable printBank(unsigned Bank) {
+    return Printable([Bank](raw_ostream &OS) {
+      OS << ((Bank >= SGPR_BANK_OFFSET) ? Bank - SGPR_BANK_OFFSET : Bank);
+    });
+  }
+
+  static void dumpFreeBanks(unsigned FreeBanks) {
+    for (unsigned L = 0; L < NUM_BANKS; ++L)
+      if (FreeBanks & (1 << L))
+        dbgs() << printBank(L) << ' ';
+  }
+#endif
+};
+
+} // End anonymous namespace.
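+// A minimal sketch of the conflict model implemented by analyzeInst() and
+// getRegBankMask() below (simplified pseudo-code, not the actual API): each
+// operand contributes a mask of the banks it reads, and every bank bit that
+// a previous operand already used costs one stall cycle.
+//
+//   unsigned Stalls = 0, Used = 0;
+//   for (unsigned Mask : OperandBankMasks) {
+//     Stalls += countPopulation(Used & Mask);
+//     Used |= Mask;
+//   }
+//
+// For V_FMA_F32 V111 = V0 + V4 * V8 all three source masks contain bank 0,
+// so the model reports two stall cycles, matching the file header comment.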
+
+INITIALIZE_PASS_BEGIN(GCNRegBankReassign, DEBUG_TYPE, "GCN RegBank Reassign",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
+INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix)
+INITIALIZE_PASS_END(GCNRegBankReassign, DEBUG_TYPE, "GCN RegBank Reassign",
+                    false, false)
+
+
+char GCNRegBankReassign::ID = 0;
+
+char &llvm::GCNRegBankReassignID = GCNRegBankReassign::ID;
+
+unsigned GCNRegBankReassign::getPhysRegBank(unsigned Reg) const {
+  assert (TargetRegisterInfo::isPhysicalRegister(Reg));
+
+  const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+  unsigned Size = TRI->getRegSizeInBits(*RC);
+  if (Size > 32)
+    Reg = TRI->getSubReg(Reg, AMDGPU::sub0);
+
+  if (TRI->hasVGPRs(RC)) {
+    Reg -= AMDGPU::VGPR0;
+    return Reg % NUM_VGPR_BANKS;
+  }
+
+  Reg = TRI->getEncodingValue(Reg) / 2;
+  return Reg % NUM_SGPR_BANKS + SGPR_BANK_OFFSET;
+}
+
+unsigned GCNRegBankReassign::getRegBankMask(unsigned Reg, unsigned SubReg,
+                                            int Bank) {
+  if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+    if (!VRM->isAssignedReg(Reg))
+      return 0;
+
+    Reg = VRM->getPhys(Reg);
+    if (!Reg)
+      return 0;
+    if (SubReg)
+      Reg = TRI->getSubReg(Reg, SubReg);
+  }
+
+  const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+  unsigned Size = TRI->getRegSizeInBits(*RC) / 32;
+  if (Size > 1)
+    Reg = TRI->getSubReg(Reg, AMDGPU::sub0);
+
+  if (TRI->hasVGPRs(RC)) {
+    // VGPRs have 4 banks assigned in a round-robin fashion.
+    Reg -= AMDGPU::VGPR0;
+    unsigned Mask = (1 << Size) - 1;
+    unsigned Used = 0;
+    // Bitmask lacks an extract method
+    for (unsigned I = 0; I < Size; ++I)
+      if (RegsUsed.test(Reg + I))
+        Used |= 1 << I;
+    RegsUsed.set(Reg, Reg + Size);
+    Mask &= ~Used;
+    Mask <<= (Bank == -1) ? Reg % NUM_VGPR_BANKS : unsigned(Bank);
+    return (Mask | (Mask >> NUM_VGPR_BANKS)) & VGPR_BANK_MASK;
+  }
+
+  // SGPRs have 8 banks holding 2 consecutive registers each.
+  Reg = TRI->getEncodingValue(Reg) / 2;
+  unsigned StartBit = AMDGPU::VGPR_32RegClass.getNumRegs();
+  if (Reg + StartBit >= RegsUsed.size())
+    return 0;
+
+  if (Size > 1)
+    Size /= 2;
+  unsigned Mask = (1 << Size) - 1;
+  unsigned Used = 0;
+  for (unsigned I = 0; I < Size; ++I)
+    if (RegsUsed.test(StartBit + Reg + I))
+      Used |= 1 << I;
+  RegsUsed.set(StartBit + Reg, StartBit + Reg + Size);
+  Mask &= ~Used;
+  Mask <<= (Bank == -1) ? Reg % NUM_SGPR_BANKS
+                        : unsigned(Bank - SGPR_BANK_OFFSET);
+  Mask = (Mask | (Mask >> NUM_SGPR_BANKS)) & SGPR_BANK_SHIFTED_MASK;
+  // Reserve 4 bank ids for VGPRs.
+  return Mask << SGPR_BANK_OFFSET;
+}
+
+unsigned GCNRegBankReassign::analyzeInst(const MachineInstr& MI,
+                                         unsigned& UsedBanks,
+                                         unsigned Reg,
+                                         int Bank) {
+  unsigned StallCycles = 0;
+  UsedBanks = 0;
+
+  if (MI.isDebugValue())
+    return 0;
+
+  RegsUsed.reset();
+  OperandMasks.clear();
+  for (const auto& Op : MI.explicit_uses()) {
+    // Undef can be assigned to any register, so two vregs can be assigned
+    // the same phys reg within the same instruction.
+    if (!Op.isReg() || Op.isUndef())
+      continue;
+
+    unsigned R = Op.getReg();
+    if (TRI->hasAGPRs(TRI->getRegClassForReg(*MRI, R)))
+      continue;
+
+    unsigned ShiftedBank = Bank;
+
+    if (Bank != -1 && R == Reg && Op.getSubReg()) {
+      unsigned LM = TRI->getSubRegIndexLaneMask(Op.getSubReg()).getAsInteger();
+      if (!(LM & 1) && (Bank < NUM_VGPR_BANKS)) {
+        // If a register spans all banks we cannot shift it to avoid conflict.
+ if (countPopulation(LM) >= NUM_VGPR_BANKS) + continue; + ShiftedBank = (Bank + countTrailingZeros(LM)) % NUM_VGPR_BANKS; + } else if (!(LM & 3) && (Bank >= SGPR_BANK_OFFSET)) { + // If a register spans all banks we cannot shift it to avoid conflict. + if (countPopulation(LM) / 2 >= NUM_SGPR_BANKS) + continue; + ShiftedBank = SGPR_BANK_OFFSET + (Bank - SGPR_BANK_OFFSET + + (countTrailingZeros(LM) >> 1)) % + NUM_SGPR_BANKS; + } + } + + unsigned Mask = getRegBankMask(R, Op.getSubReg(), + (Reg == R) ? ShiftedBank : -1); + StallCycles += countPopulation(UsedBanks & Mask); + UsedBanks |= Mask; + OperandMasks.push_back(OperandMask(Op.getReg(), Op.getSubReg(), Mask)); + } + + return StallCycles; +} + +unsigned GCNRegBankReassign::getOperandGatherWeight(const MachineInstr& MI, + unsigned Reg1, + unsigned Reg2, + unsigned StallCycles) const +{ + unsigned Defs = 0; + MachineBasicBlock::const_instr_iterator Def(MI.getIterator()); + MachineBasicBlock::const_instr_iterator B(MI.getParent()->instr_begin()); + for (unsigned S = StallCycles; S && Def != B && Defs != 3; --S) { + if (MI.isDebugInstr()) + continue; + --Def; + if (Def->getOpcode() == TargetOpcode::IMPLICIT_DEF) + continue; + if (Def->modifiesRegister(Reg1, TRI)) + Defs |= 1; + if (Def->modifiesRegister(Reg2, TRI)) + Defs |= 2; + } + return countPopulation(Defs); +} + +bool GCNRegBankReassign::isReassignable(unsigned Reg) const { + if (TargetRegisterInfo::isPhysicalRegister(Reg) || !VRM->isAssignedReg(Reg)) + return false; + + const MachineInstr *Def = MRI->getUniqueVRegDef(Reg); + + unsigned PhysReg = VRM->getPhys(Reg); + + if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg) + return false; + + for (auto U : MRI->use_nodbg_operands(Reg)) { + if (U.isImplicit()) + return false; + const MachineInstr *UseInst = U.getParent(); + if (UseInst->isCopy() && UseInst->getOperand(0).getReg() == PhysReg) + return false; + } + + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(PhysReg); + if (TRI->hasVGPRs(RC)) + return true; + + unsigned Size = TRI->getRegSizeInBits(*RC); + if (Size > 32) + PhysReg = TRI->getSubReg(PhysReg, AMDGPU::sub0); + + return AMDGPU::SGPR_32RegClass.contains(PhysReg); +} + +unsigned GCNRegBankReassign::getFreeBanks(unsigned Mask, + unsigned UsedBanks) const { + unsigned Size = countPopulation(Mask); + unsigned FreeBanks = 0; + unsigned Bank = findFirstSet(Mask); + + UsedBanks &= ~Mask; + + // Find free VGPR banks + if ((Mask & VGPR_BANK_MASK) && (Size < NUM_VGPR_BANKS)) { + for (unsigned I = 0; I < NUM_VGPR_BANKS; ++I) { + if (Bank == I) + continue; + unsigned NewMask = ((1 << Size) - 1) << I; + NewMask = (NewMask | (NewMask >> NUM_VGPR_BANKS)) & VGPR_BANK_MASK; + if (!(UsedBanks & NewMask)) + FreeBanks |= 1 << I; + } + return FreeBanks; + } + + // Find free SGPR banks + // SGPR tuples must be aligned, so step is size in banks it + // crosses. 
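+  // For example, a mask covering four SGPR banks may only be relocated to
+  // start at bank 0 or bank 4, and a two-bank mask at banks 0, 2, 4 or 6.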
+ Bank -= SGPR_BANK_OFFSET; + for (unsigned I = 0; I < NUM_SGPR_BANKS; I += Size) { + if (Bank == I) + continue; + unsigned NewMask = ((1 << Size) - 1) << I; + NewMask = (NewMask | (NewMask >> NUM_SGPR_BANKS)) & SGPR_BANK_SHIFTED_MASK; + if (!(UsedBanks & (NewMask << SGPR_BANK_OFFSET))) + FreeBanks |= (1 << SGPR_BANK_OFFSET) << I; + } + + return FreeBanks; +} + +unsigned GCNRegBankReassign::getFreeBanks(unsigned Reg, + unsigned SubReg, + unsigned Mask, + unsigned UsedBanks) const { + if (!isReassignable(Reg)) + return 0; + + unsigned FreeBanks = getFreeBanks(Mask, UsedBanks); + + unsigned LM = TRI->getSubRegIndexLaneMask(SubReg).getAsInteger(); + if (!(LM & 1) && (Mask & VGPR_BANK_MASK)) { + unsigned Shift = countTrailingZeros(LM); + if (Shift >= NUM_VGPR_BANKS) + return 0; + unsigned VB = FreeBanks & VGPR_BANK_MASK; + FreeBanks = ((VB >> Shift) | (VB << (NUM_VGPR_BANKS - Shift))) & + VGPR_BANK_MASK; + } else if (!(LM & 3) && (Mask & SGPR_BANK_MASK)) { + unsigned Shift = countTrailingZeros(LM) >> 1; + if (Shift >= NUM_SGPR_BANKS) + return 0; + unsigned SB = FreeBanks >> SGPR_BANK_OFFSET; + FreeBanks = ((SB >> Shift) | (SB << (NUM_SGPR_BANKS - Shift))) & + SGPR_BANK_SHIFTED_MASK; + FreeBanks <<= SGPR_BANK_OFFSET; + } + + LLVM_DEBUG(if (FreeBanks) { + dbgs() << "Potential reassignments of " << printReg(Reg, SubReg) + << " to banks: "; dumpFreeBanks(FreeBanks); + dbgs() << '\n'; }); + + return FreeBanks; +} + +void GCNRegBankReassign::collectCandidates(MachineInstr& MI, + unsigned UsedBanks, + unsigned StallCycles) { + LLVM_DEBUG(MI.dump()); + + if (!StallCycles) + return; + + LLVM_DEBUG(dbgs() << "Stall cycles = " << StallCycles << '\n'); + + for (unsigned I = 0, E = OperandMasks.size(); I + 1 < E; ++I) { + for (unsigned J = I + 1; J != E; ++J) { + if (!(OperandMasks[I].Mask & OperandMasks[J].Mask)) + continue; + + unsigned Reg1 = OperandMasks[I].Reg; + unsigned Reg2 = OperandMasks[J].Reg; + unsigned SubReg1 = OperandMasks[I].SubReg; + unsigned SubReg2 = OperandMasks[J].SubReg; + unsigned Mask1 = OperandMasks[I].Mask; + unsigned Mask2 = OperandMasks[J].Mask; + unsigned Size1 = countPopulation(Mask1); + unsigned Size2 = countPopulation(Mask2); + + LLVM_DEBUG(dbgs() << "Conflicting operands: " << printReg(Reg1, SubReg1) << + " and " << printReg(Reg2, SubReg2) << '\n'); + + unsigned Weight = getOperandGatherWeight(MI, Reg1, Reg2, StallCycles); + Weight += MLI->getLoopDepth(MI.getParent()) * 10; + + LLVM_DEBUG(dbgs() << "Stall weight = " << Weight << '\n'); + + unsigned FreeBanks1 = getFreeBanks(Reg1, SubReg1, Mask1, UsedBanks); + unsigned FreeBanks2 = getFreeBanks(Reg2, SubReg2, Mask2, UsedBanks); + if (FreeBanks1) + Candidates.push(Candidate(&MI, Reg1, FreeBanks1, Weight + + ((Size2 > Size1) ? 1 : 0))); + if (FreeBanks2) + Candidates.push(Candidate(&MI, Reg2, FreeBanks2, Weight + + ((Size1 > Size2) ? 
1 : 0))); + } + } +} + +unsigned GCNRegBankReassign::computeStallCycles(unsigned SrcReg, + unsigned Reg, int Bank, + bool Collect) { + unsigned TotalStallCycles = 0; + unsigned UsedBanks = 0; + SmallSet<const MachineInstr *, 16> Visited; + + for (auto &MI : MRI->use_nodbg_instructions(SrcReg)) { + if (MI.isBundle()) + continue; + if (!Visited.insert(&MI).second) + continue; + unsigned StallCycles = analyzeInst(MI, UsedBanks, Reg, Bank); + TotalStallCycles += StallCycles; + if (Collect) + collectCandidates(MI, UsedBanks, StallCycles); + } + + return TotalStallCycles; +} + +unsigned GCNRegBankReassign::scavengeReg(LiveInterval& LI, + unsigned Bank) const { + const TargetRegisterClass *RC = MRI->getRegClass(LI.reg); + unsigned MaxNumRegs = (Bank < NUM_VGPR_BANKS) ? MaxNumVGPRs + : MaxNumSGPRs; + unsigned MaxReg = MaxNumRegs + (Bank < NUM_VGPR_BANKS ? AMDGPU::VGPR0 + : AMDGPU::SGPR0); + + for (unsigned Reg : RC->getRegisters()) { + // Check occupancy limit. + if (TRI->isSubRegisterEq(Reg, MaxReg)) + break; + + if (!MRI->isAllocatable(Reg) || getPhysRegBank(Reg) != Bank) + continue; + + for (unsigned I = 0; CSRegs[I]; ++I) + if (TRI->isSubRegisterEq(Reg, CSRegs[I]) && + !LRM->isPhysRegUsed(CSRegs[I])) + return AMDGPU::NoRegister; + + LLVM_DEBUG(dbgs() << "Trying register " << printReg(Reg) << '\n'); + + if (!LRM->checkInterference(LI, Reg)) + return Reg; + } + + return AMDGPU::NoRegister; +} + +unsigned GCNRegBankReassign::tryReassign(Candidate &C) { + if (!LIS->hasInterval(C.Reg)) + return 0; + + LiveInterval &LI = LIS->getInterval(C.Reg); + LLVM_DEBUG(dbgs() << "Try reassign " << printReg(C.Reg) << " in "; C.MI->dump(); + LI.dump()); + + // For each candidate bank walk all instructions in the range of live + // interval and check if replacing the register with one belonging to + // the candidate bank reduces conflicts. + + unsigned OrigStalls = computeStallCycles(C.Reg); + LLVM_DEBUG(dbgs() << "--- Stall cycles in range = " << OrigStalls << '\n'); + if (!OrigStalls) + return 0; + + struct BankStall { + BankStall(unsigned b, unsigned s) : Bank(b), Stalls(s) {}; + bool operator< (const BankStall &RHS) const { return Stalls > RHS.Stalls; } + unsigned Bank; + unsigned Stalls; + }; + SmallVector<BankStall, 8> BankStalls; + + for (int Bank = 0; Bank < NUM_BANKS; ++Bank) { + if (C.FreeBanks & (1 << Bank)) { + LLVM_DEBUG(dbgs() << "Trying bank " << printBank(Bank) << '\n'); + unsigned Stalls = computeStallCycles(C.Reg, C.Reg, Bank); + if (Stalls < OrigStalls) { + LLVM_DEBUG(dbgs() << "With bank " << printBank(Bank) << " -> " + << Stalls << '\n'); + BankStalls.push_back(BankStall((unsigned)Bank, Stalls)); + } + } + } + std::sort(BankStalls.begin(), BankStalls.end()); + + unsigned OrigReg = VRM->getPhys(C.Reg); + LRM->unassign(LI); + while (!BankStalls.empty()) { + BankStall BS = BankStalls.pop_back_val(); + unsigned Reg = scavengeReg(LI, BS.Bank); + if (Reg == AMDGPU::NoRegister) { + LLVM_DEBUG(dbgs() << "No free registers in bank " << printBank(BS.Bank) + << '\n'); + continue; + } + LLVM_DEBUG(dbgs() << "Found free register " << printReg(Reg) + << (LRM->isPhysRegUsed(Reg) ? 
"" : " (new)") + << " in bank " << printBank(BS.Bank) << '\n'); + + LRM->assign(LI, Reg); + + LLVM_DEBUG(dbgs() << "--- Cycles saved: " << OrigStalls - BS.Stalls << '\n'); + + return OrigStalls - BS.Stalls; + } + LRM->assign(LI, OrigReg); + + return 0; +} + +unsigned GCNRegBankReassign::collectCandidates(MachineFunction &MF, + bool Collect) { + unsigned TotalStallCycles = 0; + + for (MachineBasicBlock &MBB : MF) { + + LLVM_DEBUG(if (Collect) { + if (MBB.getName().empty()) dbgs() << "bb." << MBB.getNumber(); + else dbgs() << MBB.getName(); dbgs() << ":\n"; + }); + + for (MachineInstr &MI : MBB.instrs()) { + if (MI.isBundle()) + continue; // we analyze the instructions inside the bundle individually + + unsigned UsedBanks = 0; + unsigned StallCycles = analyzeInst(MI, UsedBanks); + + if (Collect) + collectCandidates(MI, UsedBanks, StallCycles); + + TotalStallCycles += StallCycles; + } + + LLVM_DEBUG(if (Collect) { dbgs() << '\n'; }); + } + + return TotalStallCycles; +} + +void GCNRegBankReassign::removeCandidates(unsigned Reg) { + Candidates.remove_if([Reg, this](const Candidate& C) { + return C.MI->readsRegister(Reg, TRI); + }); +} + +bool GCNRegBankReassign::verifyCycles(MachineFunction &MF, + unsigned OriginalCycles, + unsigned CyclesSaved) { + unsigned StallCycles = collectCandidates(MF, false); + LLVM_DEBUG(dbgs() << "=== After the pass " << StallCycles + << " stall cycles left\n"); + return StallCycles + CyclesSaved == OriginalCycles; +} + +bool GCNRegBankReassign::runOnMachineFunction(MachineFunction &MF) { + ST = &MF.getSubtarget<GCNSubtarget>(); + if (!ST->hasRegisterBanking() || skipFunction(MF.getFunction())) + return false; + + MRI = &MF.getRegInfo(); + TRI = ST->getRegisterInfo(); + MLI = &getAnalysis<MachineLoopInfo>(); + VRM = &getAnalysis<VirtRegMap>(); + LRM = &getAnalysis<LiveRegMatrix>(); + LIS = &getAnalysis<LiveIntervals>(); + + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + unsigned Occupancy = MFI->getOccupancy(); + MaxNumVGPRs = ST->getMaxNumVGPRs(MF); + MaxNumSGPRs = ST->getMaxNumSGPRs(MF); + MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(Occupancy), MaxNumVGPRs); + MaxNumSGPRs = std::min(ST->getMaxNumSGPRs(Occupancy, true), MaxNumSGPRs); + + CSRegs = MRI->getCalleeSavedRegs(); + + RegsUsed.resize(AMDGPU::VGPR_32RegClass.getNumRegs() + + TRI->getEncodingValue(AMDGPU::SGPR_NULL) / 2 + 1); + + LLVM_DEBUG(dbgs() << "=== RegBanks reassign analysis on function " << MF.getName() + << '\n'); + + unsigned StallCycles = collectCandidates(MF); + NumStallsDetected += StallCycles; + + LLVM_DEBUG(dbgs() << "=== " << StallCycles << " stall cycles detected in " + "function " << MF.getName() << '\n'); + + Candidates.sort(); + + LLVM_DEBUG(dbgs() << "\nCandidates:\n\n"; + for (auto C : Candidates) C.dump(this); + dbgs() << "\n\n"); + + unsigned CyclesSaved = 0; + while (!Candidates.empty()) { + Candidate C = Candidates.back(); + unsigned LocalCyclesSaved = tryReassign(C); + CyclesSaved += LocalCyclesSaved; + + if (VerifyStallCycles > 1 && !verifyCycles(MF, StallCycles, CyclesSaved)) + report_fatal_error("RegBank reassign stall cycles verification failed."); + + Candidates.pop_back(); + if (LocalCyclesSaved) { + removeCandidates(C.Reg); + computeStallCycles(C.Reg, AMDGPU::NoRegister, -1, true); + Candidates.sort(); + + LLVM_DEBUG(dbgs() << "\nCandidates:\n\n"; + for (auto C : Candidates) + C.dump(this); + dbgs() << "\n\n"); + } + } + NumStallsRecovered += CyclesSaved; + + LLVM_DEBUG(dbgs() << "=== After the pass " << CyclesSaved + << " cycles saved in function " 
<< MF.getName() << '\n'); + + Candidates.clear(); + + if (VerifyStallCycles == 1 && !verifyCycles(MF, StallCycles, CyclesSaved)) + report_fatal_error("RegBank reassign stall cycles verification failed."); + + RegsUsed.clear(); + + return CyclesSaved > 0; +} diff --git a/lib/Target/AMDGPU/GCNRegPressure.cpp b/lib/Target/AMDGPU/GCNRegPressure.cpp index 3d8cacc4f02c..39460fbd8a84 100644 --- a/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -1,9 +1,8 @@ //===- GCNRegPressure.cpp -------------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -64,9 +63,10 @@ void llvm::printLivesAt(SlotIndex SI, } if (!Num) dbgs() << " <none>\n"; } +#endif -static bool isEqual(const GCNRPTracker::LiveRegSet &S1, - const GCNRPTracker::LiveRegSet &S2) { +bool llvm::isEqual(const GCNRPTracker::LiveRegSet &S1, + const GCNRPTracker::LiveRegSet &S2) { if (S1.size() != S2.size()) return false; @@ -77,7 +77,7 @@ static bool isEqual(const GCNRPTracker::LiveRegSet &S1, } return true; } -#endif + /////////////////////////////////////////////////////////////////////////////// // GCNRegPressure @@ -89,7 +89,9 @@ unsigned GCNRegPressure::getRegKind(unsigned Reg, auto STI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo()); return STI->isSGPRClass(RC) ? (STI->getRegSizeInBits(*RC) == 32 ? SGPR32 : SGPR_TUPLE) : - (STI->getRegSizeInBits(*RC) == 32 ? VGPR32 : VGPR_TUPLE); + STI->hasAGPRs(RC) ? + (STI->getRegSizeInBits(*RC) == 32 ? AGPR32 : AGPR_TUPLE) : + (STI->getRegSizeInBits(*RC) == 32 ? VGPR32 : VGPR_TUPLE); } void GCNRegPressure::inc(unsigned Reg, @@ -110,16 +112,18 @@ void GCNRegPressure::inc(unsigned Reg, switch (auto Kind = getRegKind(Reg, MRI)) { case SGPR32: case VGPR32: + case AGPR32: assert(PrevMask.none() && NewMask == MaxMask); Value[Kind] += Sign; break; case SGPR_TUPLE: case VGPR_TUPLE: + case AGPR_TUPLE: assert(NewMask < MaxMask || NewMask == MaxMask); assert(PrevMask < NewMask); - Value[Kind == SGPR_TUPLE ? SGPR32 : VGPR32] += + Value[Kind == SGPR_TUPLE ? SGPR32 : Kind == AGPR_TUPLE ? AGPR32 : VGPR32] += Sign * (~PrevMask & NewMask).getNumLanes(); if (PrevMask.none()) { diff --git a/lib/Target/AMDGPU/GCNRegPressure.h b/lib/Target/AMDGPU/GCNRegPressure.h index 357d3b7b2334..e4894418b943 100644 --- a/lib/Target/AMDGPU/GCNRegPressure.h +++ b/lib/Target/AMDGPU/GCNRegPressure.h @@ -1,9 +1,8 @@ //===- GCNRegPressure.h -----------------------------------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -32,6 +31,8 @@ struct GCNRegPressure { SGPR_TUPLE, VGPR32, VGPR_TUPLE, + AGPR32, + AGPR_TUPLE, TOTAL_KINDS }; @@ -44,9 +45,10 @@ struct GCNRegPressure { void clear() { std::fill(&Value[0], &Value[TOTAL_KINDS], 0); } unsigned getSGPRNum() const { return Value[SGPR32]; } - unsigned getVGPRNum() const { return Value[VGPR32]; } + unsigned getVGPRNum() const { return std::max(Value[VGPR32], Value[AGPR32]); } - unsigned getVGPRTuplesWeight() const { return Value[VGPR_TUPLE]; } + unsigned getVGPRTuplesWeight() const { return std::max(Value[VGPR_TUPLE], + Value[AGPR_TUPLE]); } unsigned getSGPRTuplesWeight() const { return Value[SGPR_TUPLE]; } unsigned getOccupancy(const GCNSubtarget &ST) const { @@ -191,6 +193,50 @@ GCNRPTracker::LiveRegSet getLiveRegs(SlotIndex SI, const LiveIntervals &LIS, const MachineRegisterInfo &MRI); +/// creates a map MachineInstr -> LiveRegSet +/// R - range of iterators on instructions +/// After - upon entry or exit of every instruction +/// Note: there is no entry in the map for instructions with empty live reg set +/// Complexity = O(NumVirtRegs * averageLiveRangeSegmentsPerReg * lg(R)) +template <typename Range> +DenseMap<MachineInstr*, GCNRPTracker::LiveRegSet> +getLiveRegMap(Range &&R, bool After, LiveIntervals &LIS) { + std::vector<SlotIndex> Indexes; + Indexes.reserve(std::distance(R.begin(), R.end())); + auto &SII = *LIS.getSlotIndexes(); + for (MachineInstr *I : R) { + auto SI = SII.getInstructionIndex(*I); + Indexes.push_back(After ? SI.getDeadSlot() : SI.getBaseIndex()); + } + std::sort(Indexes.begin(), Indexes.end()); + + auto &MRI = (*R.begin())->getParent()->getParent()->getRegInfo(); + DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> LiveRegMap; + SmallVector<SlotIndex, 32> LiveIdxs, SRLiveIdxs; + for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { + auto Reg = TargetRegisterInfo::index2VirtReg(I); + if (!LIS.hasInterval(Reg)) + continue; + auto &LI = LIS.getInterval(Reg); + LiveIdxs.clear(); + if (!LI.findIndexesLiveAt(Indexes, std::back_inserter(LiveIdxs))) + continue; + if (!LI.hasSubRanges()) { + for (auto SI : LiveIdxs) + LiveRegMap[SII.getInstructionFromIndex(SI)][Reg] = + MRI.getMaxLaneMaskForVReg(Reg); + } else + for (const auto &S : LI.subranges()) { + // constrain search for subranges by indexes live at main range + SRLiveIdxs.clear(); + S.findIndexesLiveAt(LiveIdxs, std::back_inserter(SRLiveIdxs)); + for (auto SI : SRLiveIdxs) + LiveRegMap[SII.getInstructionFromIndex(SI)][Reg] |= S.LaneMask; + } + } + return LiveRegMap; +} + inline GCNRPTracker::LiveRegSet getLiveRegsAfter(const MachineInstr &MI, const LiveIntervals &LIS) { return getLiveRegs(LIS.getInstructionIndex(MI).getDeadSlot(), LIS, @@ -212,6 +258,9 @@ GCNRegPressure getRegPressure(const MachineRegisterInfo &MRI, return Res; } +bool isEqual(const GCNRPTracker::LiveRegSet &S1, + const GCNRPTracker::LiveRegSet &S2); + void printLivesAt(SlotIndex SI, const LiveIntervals &LIS, const MachineRegisterInfo &MRI); diff --git a/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/lib/Target/AMDGPU/GCNSchedStrategy.cpp index f09b7f6cff22..4ea990ae490e 100644 --- a/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -1,9 +1,8 @@ //===-- GCNSchedStrategy.cpp - GCN Scheduler Strategy ---------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open 
Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -446,8 +445,12 @@ void GCNScheduleDAGMILive::computeBlockPressure(const MachineBasicBlock *MBB) { RPTracker.reset(*MBB->begin(), &LiveIn); MBBLiveIns.erase(LiveInIt); } else { - I = Regions[CurRegion].first; - RPTracker.reset(*I); + auto &Rgn = Regions[CurRegion]; + I = Rgn.first; + auto *NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second); + auto LRS = BBLiveInMap.lookup(NonDbgMI); + assert(isEqual(getLiveRegsBefore(*NonDbgMI, *LIS), LRS)); + RPTracker.reset(*I, &LRS); } for ( ; ; ) { @@ -478,6 +481,23 @@ void GCNScheduleDAGMILive::computeBlockPressure(const MachineBasicBlock *MBB) { } } +DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> +GCNScheduleDAGMILive::getBBLiveInMap() const { + assert(!Regions.empty()); + std::vector<MachineInstr *> BBStarters; + BBStarters.reserve(Regions.size()); + auto I = Regions.rbegin(), E = Regions.rend(); + auto *BB = I->first->getParent(); + do { + auto *MI = &*skipDebugInstructionsForward(I->first, I->second); + BBStarters.push_back(MI); + do { + ++I; + } while (I != E && I->first->getParent() == BB); + } while (I != E); + return getLiveRegMap(BBStarters, false /*After*/, *LIS); +} + void GCNScheduleDAGMILive::finalizeSchedule() { GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl; LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n"); @@ -485,6 +505,9 @@ void GCNScheduleDAGMILive::finalizeSchedule() { LiveIns.resize(Regions.size()); Pressure.resize(Regions.size()); + if (!Regions.empty()) + BBLiveInMap = getBBLiveInMap(); + do { Stage++; RegionIdx = 0; diff --git a/lib/Target/AMDGPU/GCNSchedStrategy.h b/lib/Target/AMDGPU/GCNSchedStrategy.h index 3ac6af89cb9b..eaf3dee9ba5d 100644 --- a/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -1,9 +1,8 @@ //===-- GCNSchedStrategy.h - GCN Scheduler Strategy -*- C++ -*-------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -27,7 +26,7 @@ class GCNSubtarget; /// and the GenericScheduler is that GCNSchedStrategy uses different /// heuristics to determine excess/critical pressure sets. Its goal is to /// maximize kernel occupancy (i.e. maximum number of waves per simd). -class GCNMaxOccupancySchedStrategy : public GenericScheduler { +class GCNMaxOccupancySchedStrategy final : public GenericScheduler { friend class GCNScheduleDAGMILive; SUnit *pickNodeBidirectional(bool &IsTopNode); @@ -60,7 +59,7 @@ public: void setTargetOccupancy(unsigned Occ) { TargetOccupancy = Occ; } }; -class GCNScheduleDAGMILive : public ScheduleDAGMILive { +class GCNScheduleDAGMILive final : public ScheduleDAGMILive { const GCNSubtarget &ST; @@ -78,7 +77,7 @@ class GCNScheduleDAGMILive : public ScheduleDAGMILive { // Current region index. 
   size_t RegionIdx;
 
-  // Vecor of regions recorder for later rescheduling
+  // Vector of regions recorded for later rescheduling
   SmallVector<std::pair<MachineBasicBlock::iterator,
                         MachineBasicBlock::iterator>, 32> Regions;
 
@@ -91,6 +90,9 @@ class GCNScheduleDAGMILive : public ScheduleDAGMILive {
   // Temporary basic block live-in cache.
   DenseMap<const MachineBasicBlock*, GCNRPTracker::LiveRegSet> MBBLiveIns;
 
+  DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> BBLiveInMap;
+  DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> getBBLiveInMap() const;
+
   // Return current region pressure.
   GCNRegPressure getRealRegPressure() const;
 
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
index abc88c02adca..57c0ba26cc3a 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
@@ -1,9 +1,8 @@
 //===-- AMDGPUAsmBackend.cpp - AMDGPU Assembler Backend -------------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 /// \file
 //===----------------------------------------------------------------------===//
@@ -19,8 +18,10 @@
 #include "llvm/MC/MCObjectWriter.h"
 #include "llvm/MC/MCValue.h"
 #include "llvm/Support/TargetRegistry.h"
+#include "Utils/AMDGPUBaseInfo.h"
 
 using namespace llvm;
+using namespace llvm::AMDGPU;
 
 namespace {
 
@@ -36,17 +37,13 @@ public:
                   const MCSubtargetInfo *STI) const override;
   bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
                             const MCRelaxableFragment *DF,
-                            const MCAsmLayout &Layout) const override {
-    return false;
-  }
+                            const MCAsmLayout &Layout) const override;
+
   void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
-                        MCInst &Res) const override {
-    llvm_unreachable("Not implemented");
-  }
+                        MCInst &Res) const override;
+
   bool mayNeedRelaxation(const MCInst &Inst,
-                         const MCSubtargetInfo &STI) const override {
-    return false;
-  }
+                         const MCSubtargetInfo &STI) const override;
 
   unsigned getMinimumNopSize() const override;
   bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
@@ -56,6 +53,36 @@ public:
 
 } //End anonymous namespace
 
+void AMDGPUAsmBackend::relaxInstruction(const MCInst &Inst,
+                                        const MCSubtargetInfo &STI,
+                                        MCInst &Res) const {
+  unsigned RelaxedOpcode = AMDGPU::getSOPPWithRelaxation(Inst.getOpcode());
+  Res.setOpcode(RelaxedOpcode);
+  Res.addOperand(Inst.getOperand(0));
+  return;
+}
+
+bool AMDGPUAsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup,
+                                            uint64_t Value,
+                                            const MCRelaxableFragment *DF,
+                                            const MCAsmLayout &Layout) const {
+  // If the branch target has an offset of 0x3f, it needs to be relaxed to
+  // add an s_nop 0 immediately after the branch to effectively increment
+  // the offset, as a hardware workaround for gfx1010.
+  return (((int64_t(Value)/4)-1) == 0x3f);
+}
+
+bool AMDGPUAsmBackend::mayNeedRelaxation(const MCInst &Inst,
+                                         const MCSubtargetInfo &STI) const {
+  if (!STI.getFeatureBits()[AMDGPU::FeatureOffset3fBug])
+    return false;
+
+  if (AMDGPU::getSOPPWithRelaxation(Inst.getOpcode()) >= 0)
+    return true;
+
+  return false;
+}
+
 static unsigned getFixupKindNumBytes(unsigned Kind) {
   switch (Kind) {
   case AMDGPU::fixup_si_sopp_br:
@@ -173,11 +200,13 @@ class
ELFAMDGPUAsmBackend : public AMDGPUAsmBackend { bool Is64Bit; bool HasRelocationAddend; uint8_t OSABI = ELF::ELFOSABI_NONE; + uint8_t ABIVersion = 0; public: - ELFAMDGPUAsmBackend(const Target &T, const Triple &TT) : + ELFAMDGPUAsmBackend(const Target &T, const Triple &TT, uint8_t ABIVersion) : AMDGPUAsmBackend(T), Is64Bit(TT.getArch() == Triple::amdgcn), - HasRelocationAddend(TT.getOS() == Triple::AMDHSA) { + HasRelocationAddend(TT.getOS() == Triple::AMDHSA), + ABIVersion(ABIVersion) { switch (TT.getOS()) { case Triple::AMDHSA: OSABI = ELF::ELFOSABI_AMDGPU_HSA; @@ -195,7 +224,8 @@ public: std::unique_ptr<MCObjectTargetWriter> createObjectTargetWriter() const override { - return createAMDGPUELFObjectWriter(Is64Bit, OSABI, HasRelocationAddend); + return createAMDGPUELFObjectWriter(Is64Bit, OSABI, HasRelocationAddend, + ABIVersion); } }; @@ -206,5 +236,6 @@ MCAsmBackend *llvm::createAMDGPUAsmBackend(const Target &T, const MCRegisterInfo &MRI, const MCTargetOptions &Options) { // Use 64-bit ELF for amdgcn - return new ELFAMDGPUAsmBackend(T, STI.getTargetTriple()); + return new ELFAMDGPUAsmBackend(T, STI.getTargetTriple(), + IsaInfo::hasCodeObjectV3(&STI) ? 1 : 0); } diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp index c85a1ea5b054..6549a8d7d592 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp @@ -1,9 +1,8 @@ //===- AMDGPUELFObjectWriter.cpp - AMDGPU ELF Writer ----------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -23,7 +22,8 @@ namespace { class AMDGPUELFObjectWriter : public MCELFObjectTargetWriter { public: - AMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI, bool HasRelocationAddend); + AMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI, bool HasRelocationAddend, + uint8_t ABIVersion); protected: unsigned getRelocType(MCContext &Ctx, const MCValue &Target, @@ -35,9 +35,10 @@ protected: AMDGPUELFObjectWriter::AMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI, - bool HasRelocationAddend) + bool HasRelocationAddend, + uint8_t ABIVersion) : MCELFObjectTargetWriter(Is64Bit, OSABI, ELF::EM_AMDGPU, - HasRelocationAddend) {} + HasRelocationAddend, ABIVersion) {} unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, @@ -84,7 +85,9 @@ unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx, std::unique_ptr<MCObjectTargetWriter> llvm::createAMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI, - bool HasRelocationAddend) { + bool HasRelocationAddend, + uint8_t ABIVersion) { return llvm::make_unique<AMDGPUELFObjectWriter>(Is64Bit, OSABI, - HasRelocationAddend); + HasRelocationAddend, + ABIVersion); } diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp index c627a08e7463..40437d8fa1a4 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp @@ -1,9 +1,8 @@ //===-------- AMDGPUELFStreamer.cpp - ELF Object Output -------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h index 41e9063a759e..9fbf53c944ef 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h @@ -1,9 +1,8 @@ //===-------- AMDGPUELFStreamer.h - ELF Object Output -----------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h index 20c1adfbc6b9..d49bb196ab3a 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h @@ -1,9 +1,8 @@ //===-- AMDGPUFixupKinds.h - AMDGPU Specific Fixup Entries ------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index fab0f87dfcbe..01b53432cbb7 100644 --- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -1,9 +1,8 @@ //===-- AMDGPUInstPrinter.cpp - AMDGPU MC Inst -> ASM ---------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // // \file //===----------------------------------------------------------------------===// @@ -72,11 +71,6 @@ void AMDGPUInstPrinter::printU16ImmDecOperand(const MCInst *MI, unsigned OpNo, O << formatDec(MI->getOperand(OpNo).getImm() & 0xffff); } -void AMDGPUInstPrinter::printS13ImmDecOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - O << formatDec(SignExtend32<13>(MI->getOperand(OpNo).getImm())); -} - void AMDGPUInstPrinter::printU32ImmOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { @@ -123,13 +117,25 @@ void AMDGPUInstPrinter::printOffset(const MCInst *MI, unsigned OpNo, } } -void AMDGPUInstPrinter::printOffsetS13(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { +void AMDGPUInstPrinter::printFlatOffset(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { uint16_t Imm = MI->getOperand(OpNo).getImm(); if (Imm != 0) { O << ((OpNo == 0)? 
"offset:" : " offset:"); - printS13ImmDecOperand(MI, OpNo, O); + + const MCInstrDesc &Desc = MII.get(MI->getOpcode()); + bool IsFlatSeg = !(Desc.TSFlags & SIInstrFlags::IsNonFlatSeg); + + if (IsFlatSeg) { // Unsigned offset + printU16ImmDecOperand(MI, OpNo, O); + } else { // Signed offset + if (AMDGPU::isGFX10(STI)) { + O << formatDec(SignExtend32<12>(MI->getOperand(OpNo).getImm())); + } else { + O << formatDec(SignExtend32<13>(MI->getOperand(OpNo).getImm())); + } + } } } @@ -174,6 +180,12 @@ void AMDGPUInstPrinter::printGDS(const MCInst *MI, unsigned OpNo, printNamedBit(MI, OpNo, O, "gds"); } +void AMDGPUInstPrinter::printDLC(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { + if (AMDGPU::isGFX10(STI)) + printNamedBit(MI, OpNo, O, "dlc"); +} + void AMDGPUInstPrinter::printGLC(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { printNamedBit(MI, OpNo, O, "glc"); @@ -197,6 +209,18 @@ void AMDGPUInstPrinter::printDMask(const MCInst *MI, unsigned OpNo, } } +void AMDGPUInstPrinter::printDim(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { + unsigned Dim = MI->getOperand(OpNo).getImm(); + O << " dim:SQ_RSRC_IMG_"; + + const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfoByEncoding(Dim); + if (DimInfo) + O << DimInfo->AsmSuffix; + else + O << Dim; +} + void AMDGPUInstPrinter::printUNorm(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { printNamedBit(MI, OpNo, O, "unorm"); @@ -243,140 +267,96 @@ void AMDGPUInstPrinter::printFORMAT(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { if (unsigned Val = MI->getOperand(OpNo).getImm()) { - O << " dfmt:" << (Val & 15); - O << ", nfmt:" << (Val >> 4); + if (AMDGPU::isGFX10(STI)) + O << " format:" << Val; + else { + O << " dfmt:" << (Val & 15); + O << ", nfmt:" << (Val >> 4); + } } } void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O, const MCRegisterInfo &MRI) { +#if !defined(NDEBUG) switch (RegNo) { - case AMDGPU::VCC: - O << "vcc"; - return; - case AMDGPU::SCC: - O << "scc"; - return; - case AMDGPU::EXEC: - O << "exec"; - return; - case AMDGPU::M0: - O << "m0"; - return; - case AMDGPU::FLAT_SCR: - O << "flat_scratch"; - return; - case AMDGPU::XNACK_MASK: - O << "xnack_mask"; - return; - case AMDGPU::VCC_LO: - O << "vcc_lo"; - return; - case AMDGPU::VCC_HI: - O << "vcc_hi"; - return; - case AMDGPU::TBA_LO: - O << "tba_lo"; - return; - case AMDGPU::TBA_HI: - O << "tba_hi"; - return; - case AMDGPU::TMA_LO: - O << "tma_lo"; - return; - case AMDGPU::TMA_HI: - O << "tma_hi"; - return; - case AMDGPU::EXEC_LO: - O << "exec_lo"; - return; - case AMDGPU::EXEC_HI: - O << "exec_hi"; - return; - case AMDGPU::FLAT_SCR_LO: - O << "flat_scratch_lo"; - return; - case AMDGPU::FLAT_SCR_HI: - O << "flat_scratch_hi"; - return; - case AMDGPU::XNACK_MASK_LO: - O << "xnack_mask_lo"; - return; - case AMDGPU::XNACK_MASK_HI: - O << "xnack_mask_hi"; - return; case AMDGPU::FP_REG: case AMDGPU::SP_REG: case AMDGPU::SCRATCH_WAVE_OFFSET_REG: case AMDGPU::PRIVATE_RSRC_REG: llvm_unreachable("pseudo-register should not ever be emitted"); + case AMDGPU::SCC: + llvm_unreachable("pseudo scc should not ever be emitted"); default: break; } - - // The low 8 bits of the encoding value is the register index, for both VGPRs - // and SGPRs. 
- unsigned RegIdx = MRI.getEncodingValue(RegNo) & ((1 << 8) - 1); - - unsigned NumRegs; - if (MRI.getRegClass(AMDGPU::VGPR_32RegClassID).contains(RegNo)) { - O << 'v'; - NumRegs = 1; - } else if (MRI.getRegClass(AMDGPU::SGPR_32RegClassID).contains(RegNo)) { - O << 's'; - NumRegs = 1; - } else if (MRI.getRegClass(AMDGPU::VReg_64RegClassID).contains(RegNo)) { - O <<'v'; - NumRegs = 2; - } else if (MRI.getRegClass(AMDGPU::SGPR_64RegClassID).contains(RegNo)) { - O << 's'; - NumRegs = 2; - } else if (MRI.getRegClass(AMDGPU::VReg_128RegClassID).contains(RegNo)) { - O << 'v'; - NumRegs = 4; - } else if (MRI.getRegClass(AMDGPU::SGPR_128RegClassID).contains(RegNo)) { - O << 's'; - NumRegs = 4; - } else if (MRI.getRegClass(AMDGPU::VReg_96RegClassID).contains(RegNo)) { - O << 'v'; - NumRegs = 3; - } else if (MRI.getRegClass(AMDGPU::VReg_256RegClassID).contains(RegNo)) { - O << 'v'; - NumRegs = 8; - } else if (MRI.getRegClass(AMDGPU::SGPR_256RegClassID).contains(RegNo)) { - O << 's'; - NumRegs = 8; - } else if (MRI.getRegClass(AMDGPU::VReg_512RegClassID).contains(RegNo)) { - O << 'v'; - NumRegs = 16; - } else if (MRI.getRegClass(AMDGPU::SGPR_512RegClassID).contains(RegNo)) { - O << 's'; - NumRegs = 16; - } else { - O << getRegisterName(RegNo); - return; - } - - if (NumRegs == 1) { - O << RegIdx; - return; - } - - O << '[' << RegIdx << ':' << (RegIdx + NumRegs - 1) << ']'; +#endif + + unsigned AltName = AMDGPU::Reg32; + + if (MRI.getRegClass(AMDGPU::VReg_64RegClassID).contains(RegNo) || + MRI.getRegClass(AMDGPU::SGPR_64RegClassID).contains(RegNo) || + MRI.getRegClass(AMDGPU::AReg_64RegClassID).contains(RegNo)) + AltName = AMDGPU::Reg64; + else if (MRI.getRegClass(AMDGPU::VReg_128RegClassID).contains(RegNo) || + MRI.getRegClass(AMDGPU::SGPR_128RegClassID).contains(RegNo) || + MRI.getRegClass(AMDGPU::AReg_128RegClassID).contains(RegNo)) + AltName = AMDGPU::Reg128; + else if (MRI.getRegClass(AMDGPU::VReg_96RegClassID).contains(RegNo) || + MRI.getRegClass(AMDGPU::SReg_96RegClassID).contains(RegNo)) + AltName = AMDGPU::Reg96; + else if (MRI.getRegClass(AMDGPU::VReg_160RegClassID).contains(RegNo) || + MRI.getRegClass(AMDGPU::SReg_160RegClassID).contains(RegNo)) + AltName = AMDGPU::Reg160; + else if (MRI.getRegClass(AMDGPU::VReg_256RegClassID).contains(RegNo) || + MRI.getRegClass(AMDGPU::SGPR_256RegClassID).contains(RegNo)) + AltName = AMDGPU::Reg256; + else if (MRI.getRegClass(AMDGPU::VReg_512RegClassID).contains(RegNo) || + MRI.getRegClass(AMDGPU::SGPR_512RegClassID).contains(RegNo) || + MRI.getRegClass(AMDGPU::AReg_512RegClassID).contains(RegNo)) + AltName = AMDGPU::Reg512; + else if (MRI.getRegClass(AMDGPU::VReg_1024RegClassID).contains(RegNo) || + MRI.getRegClass(AMDGPU::SReg_1024RegClassID).contains(RegNo) || + MRI.getRegClass(AMDGPU::AReg_1024RegClassID).contains(RegNo)) + AltName = AMDGPU::Reg1024; + + O << getRegisterName(RegNo, AltName); } void AMDGPUInstPrinter::printVOPDst(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { - if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::VOP3) - O << "_e64 "; - else if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::DPP) - O << "_dpp "; - else if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::SDWA) - O << "_sdwa "; - else - O << "_e32 "; + if (OpNo == 0) { + if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::VOP3) + O << "_e64 "; + else if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::DPP) + O << "_dpp "; + else if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::SDWA) + O << "_sdwa "; + else + O << "_e32 "; + } 
printOperand(MI, OpNo, STI, O); + + // Print default vcc/vcc_lo operand. + switch (MI->getOpcode()) { + default: break; + + case AMDGPU::V_ADD_CO_CI_U32_e32_gfx10: + case AMDGPU::V_SUB_CO_CI_U32_e32_gfx10: + case AMDGPU::V_SUBREV_CO_CI_U32_e32_gfx10: + case AMDGPU::V_ADD_CO_CI_U32_sdwa_gfx10: + case AMDGPU::V_SUB_CO_CI_U32_sdwa_gfx10: + case AMDGPU::V_SUBREV_CO_CI_U32_sdwa_gfx10: + case AMDGPU::V_ADD_CO_CI_U32_dpp_gfx10: + case AMDGPU::V_SUB_CO_CI_U32_dpp_gfx10: + case AMDGPU::V_SUBREV_CO_CI_U32_dpp_gfx10: + case AMDGPU::V_ADD_CO_CI_U32_dpp8_gfx10: + case AMDGPU::V_SUB_CO_CI_U32_dpp8_gfx10: + case AMDGPU::V_SUBREV_CO_CI_U32_dpp8_gfx10: + printDefaultVccOperand(1, STI, O); + break; + } } void AMDGPUInstPrinter::printVINTRPDst(const MCInst *MI, unsigned OpNo, @@ -491,7 +471,7 @@ void AMDGPUInstPrinter::printImmediate64(uint64_t Imm, O << "-4.0"; else if (Imm == 0x3fc45f306dc9c882 && STI.getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm]) - O << "0.15915494"; + O << "0.15915494309189532"; else { assert(isUInt<32>(Imm) || Imm == 0x3fc45f306dc9c882); @@ -501,9 +481,57 @@ void AMDGPUInstPrinter::printImmediate64(uint64_t Imm, } } +void AMDGPUInstPrinter::printBLGP(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + unsigned Imm = MI->getOperand(OpNo).getImm(); + if (!Imm) + return; + + O << " blgp:" << Imm; +} + +void AMDGPUInstPrinter::printCBSZ(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + unsigned Imm = MI->getOperand(OpNo).getImm(); + if (!Imm) + return; + + O << " cbsz:" << Imm; +} + +void AMDGPUInstPrinter::printABID(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + unsigned Imm = MI->getOperand(OpNo).getImm(); + if (!Imm) + return; + + O << " abid:" << Imm; +} + +void AMDGPUInstPrinter::printDefaultVccOperand(unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + if (OpNo > 0) + O << ", "; + printRegOperand(STI.getFeatureBits()[AMDGPU::FeatureWavefrontSize64] ? + AMDGPU::VCC : AMDGPU::VCC_LO, O, MRI); + if (OpNo == 0) + O << ", "; +} + void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { + // Print default vcc/vcc_lo operand of VOPC. 
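+  // A VOPC e32/SDWA compare carries no explicit destination in the MCInst:
+  // the result implicitly defines vcc (wave64) or vcc_lo (wave32). The
+  // printer therefore synthesizes that leading operand before src0, so the
+  // output reads "v_cmp_* vcc, src0, src1".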
+ const MCInstrDesc &Desc = MII.get(MI->getOpcode()); + if (OpNo == 0 && (Desc.TSFlags & SIInstrFlags::VOPC) && + (Desc.hasImplicitDefOfPhysReg(AMDGPU::VCC) || + Desc.hasImplicitDefOfPhysReg(AMDGPU::VCC_LO))) + printDefaultVccOperand(OpNo, STI, O); + if (OpNo >= MI->getNumOperands()) { O << "/*Missing OP" << OpNo << "*/"; return; @@ -513,12 +541,13 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, if (Op.isReg()) { printRegOperand(Op.getReg(), O, MRI); } else if (Op.isImm()) { - const MCInstrDesc &Desc = MII.get(MI->getOpcode()); switch (Desc.OpInfo[OpNo].OperandType) { case AMDGPU::OPERAND_REG_IMM_INT32: case AMDGPU::OPERAND_REG_IMM_FP32: case AMDGPU::OPERAND_REG_INLINE_C_INT32: case AMDGPU::OPERAND_REG_INLINE_C_FP32: + case AMDGPU::OPERAND_REG_INLINE_AC_INT32: + case AMDGPU::OPERAND_REG_INLINE_AC_FP32: case MCOI::OPERAND_IMMEDIATE: printImmediate32(Op.getImm(), STI, O); break; @@ -530,12 +559,24 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, break; case AMDGPU::OPERAND_REG_INLINE_C_INT16: case AMDGPU::OPERAND_REG_INLINE_C_FP16: + case AMDGPU::OPERAND_REG_INLINE_AC_INT16: + case AMDGPU::OPERAND_REG_INLINE_AC_FP16: case AMDGPU::OPERAND_REG_IMM_INT16: case AMDGPU::OPERAND_REG_IMM_FP16: printImmediate16(Op.getImm(), STI, O); break; + case AMDGPU::OPERAND_REG_IMM_V2INT16: + case AMDGPU::OPERAND_REG_IMM_V2FP16: + if (!isUInt<16>(Op.getImm()) && + STI.getFeatureBits()[AMDGPU::FeatureVOP3Literal]) { + printImmediate32(Op.getImm(), STI, O); + break; + } + LLVM_FALLTHROUGH; case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: printImmediateV216(Op.getImm(), STI, O); break; case MCOI::OPERAND_UNKNOWN: @@ -573,6 +614,29 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, } else { O << "/*INV_OP*/"; } + + // Print default vcc/vcc_lo operand of v_cndmask_b32_e32. + switch (MI->getOpcode()) { + default: break; + + case AMDGPU::V_CNDMASK_B32_e32_gfx10: + case AMDGPU::V_ADD_CO_CI_U32_e32_gfx10: + case AMDGPU::V_SUB_CO_CI_U32_e32_gfx10: + case AMDGPU::V_SUBREV_CO_CI_U32_e32_gfx10: + case AMDGPU::V_ADD_CO_CI_U32_dpp_gfx10: + case AMDGPU::V_SUB_CO_CI_U32_dpp_gfx10: + case AMDGPU::V_SUBREV_CO_CI_U32_dpp_gfx10: + case AMDGPU::V_ADD_CO_CI_U32_dpp8_gfx10: + case AMDGPU::V_SUB_CO_CI_U32_dpp8_gfx10: + case AMDGPU::V_SUBREV_CO_CI_U32_dpp8_gfx10: + + case AMDGPU::V_CNDMASK_B32_e32_gfx6_gfx7: + case AMDGPU::V_CNDMASK_B32_e32_vi: + if ((int)OpNo == AMDGPU::getNamedOperandIdx(MI->getOpcode(), + AMDGPU::OpName::src1)) + printDefaultVccOperand(OpNo, STI, O); + break; + } } void AMDGPUInstPrinter::printOperandAndFPInputMods(const MCInst *MI, @@ -620,6 +684,33 @@ void AMDGPUInstPrinter::printOperandAndIntInputMods(const MCInst *MI, printOperand(MI, OpNo + 1, STI, O); if (InputModifiers & SISrcMods::SEXT) O << ')'; + + // Print default vcc/vcc_lo operand of VOP2b. 
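+  // The implicit carry-in of the SDWA carry ops trails src1 in the assembly
+  // string, so it is appended once the operand printed here (OpNo + 1) turns
+  // out to be src1.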
+ switch (MI->getOpcode()) { + default: break; + + case AMDGPU::V_ADD_CO_CI_U32_sdwa_gfx10: + case AMDGPU::V_SUB_CO_CI_U32_sdwa_gfx10: + case AMDGPU::V_SUBREV_CO_CI_U32_sdwa_gfx10: + if ((int)OpNo + 1 == AMDGPU::getNamedOperandIdx(MI->getOpcode(), + AMDGPU::OpName::src1)) + printDefaultVccOperand(OpNo, STI, O); + break; + } +} + +void AMDGPUInstPrinter::printDPP8(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + if (!AMDGPU::isGFX10(STI)) + llvm_unreachable("dpp8 is not supported on ASICs earlier than GFX10"); + + unsigned Imm = MI->getOperand(OpNo).getImm(); + O << " dpp8:[" << formatDec(Imm & 0x7); + for (size_t i = 1; i < 8; ++i) { + O << ',' << formatDec((Imm >> (3 * i)) & 0x7); + } + O << ']'; } void AMDGPUInstPrinter::printDPPCtrl(const MCInst *MI, unsigned OpNo, @@ -647,21 +738,61 @@ void AMDGPUInstPrinter::printDPPCtrl(const MCInst *MI, unsigned OpNo, O << " row_ror:"; printU4ImmDecOperand(MI, OpNo, O); } else if (Imm == DppCtrl::WAVE_SHL1) { + if (!AMDGPU::isVI(STI) && !AMDGPU::isGFX9(STI)) { + O << " /* wave_shl is not supported starting from GFX10 */"; + return; + } O << " wave_shl:1"; } else if (Imm == DppCtrl::WAVE_ROL1) { + if (!AMDGPU::isVI(STI) && !AMDGPU::isGFX9(STI)) { + O << " /* wave_rol is not supported starting from GFX10 */"; + return; + } O << " wave_rol:1"; } else if (Imm == DppCtrl::WAVE_SHR1) { + if (!AMDGPU::isVI(STI) && !AMDGPU::isGFX9(STI)) { + O << " /* wave_shr is not supported starting from GFX10 */"; + return; + } O << " wave_shr:1"; } else if (Imm == DppCtrl::WAVE_ROR1) { + if (!AMDGPU::isVI(STI) && !AMDGPU::isGFX9(STI)) { + O << " /* wave_ror is not supported starting from GFX10 */"; + return; + } O << " wave_ror:1"; } else if (Imm == DppCtrl::ROW_MIRROR) { O << " row_mirror"; } else if (Imm == DppCtrl::ROW_HALF_MIRROR) { O << " row_half_mirror"; } else if (Imm == DppCtrl::BCAST15) { + if (!AMDGPU::isVI(STI) && !AMDGPU::isGFX9(STI)) { + O << " /* row_bcast is not supported starting from GFX10 */"; + return; + } O << " row_bcast:15"; } else if (Imm == DppCtrl::BCAST31) { + if (!AMDGPU::isVI(STI) && !AMDGPU::isGFX9(STI)) { + O << " /* row_bcast is not supported starting from GFX10 */"; + return; + } O << " row_bcast:31"; + } else if ((Imm >= DppCtrl::ROW_SHARE_FIRST) && + (Imm <= DppCtrl::ROW_SHARE_LAST)) { + if (!AMDGPU::isGFX10(STI)) { + O << " /* row_share is not supported on ASICs earlier than GFX10 */"; + return; + } + O << " row_share:"; + printU4ImmDecOperand(MI, OpNo, O); + } else if ((Imm >= DppCtrl::ROW_XMASK_FIRST) && + (Imm <= DppCtrl::ROW_XMASK_LAST)) { + if (!AMDGPU::isGFX10(STI)) { + O << " /* row_xmask is not supported on ASICs earlier than GFX10 */"; + return; + } + O << " row_xmask:"; + printU4ImmDecOperand(MI, OpNo, O); } else { O << " /* Invalid dpp_ctrl value */"; } @@ -690,6 +821,16 @@ void AMDGPUInstPrinter::printBoundCtrl(const MCInst *MI, unsigned OpNo, } } +void AMDGPUInstPrinter::printFI(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + using namespace llvm::AMDGPU::DPP; + unsigned Imm = MI->getOperand(OpNo).getImm(); + if (Imm == DPP_FI_1 || Imm == DPP8_FI_1) { + O << " fi:1"; + } +} + void AMDGPUInstPrinter::printSDWASel(const MCInst *MI, unsigned OpNo, raw_ostream &O) { using namespace llvm::AMDGPU::SDWA; @@ -803,8 +944,10 @@ void AMDGPUInstPrinter::printExpTgt(const MCInst *MI, unsigned OpNo, O << " mrtz"; else if (Tgt == 9) O << " null"; - else if (Tgt >= 12 && Tgt <= 15) + else if ((Tgt >= 12 && Tgt <= 15) || (Tgt == 16 && AMDGPU::isGFX10(STI))) O << "
pos" << Tgt - 12; + else if (AMDGPU::isGFX10(STI) && Tgt == 20) + O << " prim"; else if (Tgt >= 32 && Tgt <= 63) O << " param" << Tgt - 32; else { @@ -875,6 +1018,18 @@ void AMDGPUInstPrinter::printPackedModifier(const MCInst *MI, void AMDGPUInstPrinter::printOpSel(const MCInst *MI, unsigned, const MCSubtargetInfo &STI, raw_ostream &O) { + unsigned Opc = MI->getOpcode(); + if (Opc == AMDGPU::V_PERMLANE16_B32_gfx10 || + Opc == AMDGPU::V_PERMLANEX16_B32_gfx10) { + auto FIN = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers); + auto BCN = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers); + unsigned FI = !!(MI->getOperand(FIN).getImm() & SISrcMods::OP_SEL_0); + unsigned BC = !!(MI->getOperand(BCN).getImm() & SISrcMods::OP_SEL_0); + if (FI || BC) + O << " op_sel:[" << FI << ',' << BC << ']'; + return; + } + printPackedModifier(MI, " op_sel:[", SISrcMods::OP_SEL_0, O); } @@ -932,23 +1087,24 @@ void AMDGPUInstPrinter::printInterpAttrChan(const MCInst *MI, unsigned OpNum, void AMDGPUInstPrinter::printVGPRIndexMode(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { + using namespace llvm::AMDGPU::VGPRIndexMode; unsigned Val = MI->getOperand(OpNo).getImm(); - if (Val == 0) { - O << " 0"; - return; - } - - if (Val & VGPRIndexMode::DST_ENABLE) - O << " dst"; - - if (Val & VGPRIndexMode::SRC0_ENABLE) - O << " src0"; - if (Val & VGPRIndexMode::SRC1_ENABLE) - O << " src1"; - - if (Val & VGPRIndexMode::SRC2_ENABLE) - O << " src2"; + if ((Val & ~ENABLE_MASK) != 0) { + O << " " << formatHex(static_cast<uint64_t>(Val)); + } else { + O << " gpr_idx("; + bool NeedComma = false; + for (unsigned ModeId = ID_MIN; ModeId <= ID_MAX; ++ModeId) { + if (Val & (1 << ModeId)) { + if (NeedComma) + O << ','; + O << IdSymbolic[ModeId]; + NeedComma = true; + } + } + O << ')'; + } } void AMDGPUInstPrinter::printMemOperand(const MCInst *MI, unsigned OpNo, @@ -1010,40 +1166,29 @@ void AMDGPUInstPrinter::printSendMsg(const MCInst *MI, unsigned OpNo, raw_ostream &O) { using namespace llvm::AMDGPU::SendMsg; - const unsigned SImm16 = MI->getOperand(OpNo).getImm(); - const unsigned Id = SImm16 & ID_MASK_; - do { - if (Id == ID_INTERRUPT) { - if ((SImm16 & ~ID_MASK_) != 0) // Unused/unknown bits must be 0. - break; - O << "sendmsg(" << IdSymbolic[Id] << ')'; - return; - } - if (Id == ID_GS || Id == ID_GS_DONE) { - if ((SImm16 & ~(ID_MASK_|OP_GS_MASK_|STREAM_ID_MASK_)) != 0) // Unused/unknown bits must be 0. - break; - const unsigned OpGs = (SImm16 & OP_GS_MASK_) >> OP_SHIFT_; - const unsigned StreamId = (SImm16 & STREAM_ID_MASK_) >> STREAM_ID_SHIFT_; - if (OpGs == OP_GS_NOP && Id != ID_GS_DONE) // NOP to be used for GS_DONE only. - break; - if (OpGs == OP_GS_NOP && StreamId != 0) // NOP does not use/define stream id bits. - break; - O << "sendmsg(" << IdSymbolic[Id] << ", " << OpGsSymbolic[OpGs]; - if (OpGs != OP_GS_NOP) { O << ", " << StreamId; } - O << ')'; - return; - } - if (Id == ID_SYSMSG) { - if ((SImm16 & ~(ID_MASK_|OP_SYS_MASK_)) != 0) // Unused/unknown bits must be 0. - break; - const unsigned OpSys = (SImm16 & OP_SYS_MASK_) >> OP_SHIFT_; - if (! (OP_SYS_FIRST_ <= OpSys && OpSys < OP_SYS_LAST_)) // Unused/unknown. 
- break; - O << "sendmsg(" << IdSymbolic[Id] << ", " << OpSysSymbolic[OpSys] << ')'; - return; + const unsigned Imm16 = MI->getOperand(OpNo).getImm(); + + uint16_t MsgId; + uint16_t OpId; + uint16_t StreamId; + decodeMsg(Imm16, MsgId, OpId, StreamId); + + if (isValidMsgId(MsgId, STI) && + isValidMsgOp(MsgId, OpId) && + isValidMsgStream(MsgId, OpId, StreamId)) { + O << "sendmsg(" << getMsgName(MsgId); + if (msgRequiresOp(MsgId)) { + O << ", " << getMsgOpName(MsgId, OpId); + if (msgSupportsStream(MsgId, OpId)) { + O << ", " << StreamId; + } } - } while (false); - O << SImm16; // Unknown simm16 code. + O << ')'; + } else if (encodeMsg(MsgId, OpId, StreamId) == Imm16) { + O << "sendmsg(" << MsgId << ", " << OpId << ", " << StreamId << ')'; + } else { + O << Imm16; // Unknown imm16 code. + } } static void printSwizzleBitmask(const uint16_t AndMask, @@ -1094,7 +1239,7 @@ void AMDGPUInstPrinter::printSwizzle(const MCInst *MI, unsigned OpNo, if ((Imm & QUAD_PERM_ENC_MASK) == QUAD_PERM_ENC) { O << "swizzle(" << IdSymbolic[ID_QUAD_PERM]; - for (auto i = 0; i < LANE_NUM; ++i) { + for (unsigned I = 0; I < LANE_NUM; ++I) { O << ","; O << formatDec(Imm & LANE_MASK); Imm >>= LANE_SHIFT; @@ -1184,32 +1329,42 @@ void AMDGPUInstPrinter::printWaitFlag(const MCInst *MI, unsigned OpNo, void AMDGPUInstPrinter::printHwreg(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { - using namespace llvm::AMDGPU::Hwreg; + unsigned Id; + unsigned Offset; + unsigned Width; - unsigned SImm16 = MI->getOperand(OpNo).getImm(); - const unsigned Id = (SImm16 & ID_MASK_) >> ID_SHIFT_; - const unsigned Offset = (SImm16 & OFFSET_MASK_) >> OFFSET_SHIFT_; - const unsigned Width = ((SImm16 & WIDTH_M1_MASK_) >> WIDTH_M1_SHIFT_) + 1; + using namespace llvm::AMDGPU::Hwreg; + unsigned Val = MI->getOperand(OpNo).getImm(); + decodeHwreg(Val, Id, Offset, Width); + StringRef HwRegName = getHwreg(Id, STI); O << "hwreg("; - unsigned Last = ID_SYMBOLIC_LAST_; - if (AMDGPU::isSI(STI) || AMDGPU::isCI(STI) || AMDGPU::isVI(STI)) - Last = ID_SYMBOLIC_FIRST_GFX9_; - if (ID_SYMBOLIC_FIRST_ <= Id && Id < Last && IdSymbolic[Id]) { - O << IdSymbolic[Id]; + if (!HwRegName.empty()) { + O << HwRegName; } else { O << Id; } - if (Width != WIDTH_M1_DEFAULT_ + 1 || Offset != OFFSET_DEFAULT_) { + if (Width != WIDTH_DEFAULT_ || Offset != OFFSET_DEFAULT_) { O << ", " << Offset << ", " << Width; } O << ')'; } +void AMDGPUInstPrinter::printEndpgm(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + uint16_t Imm = MI->getOperand(OpNo).getImm(); + if (Imm == 0) { + return; + } + + O << ' ' << formatDec(Imm); +} + #include "AMDGPUGenAsmWriter.inc" void R600InstPrinter::printInst(const MCInst *MI, raw_ostream &O, - StringRef Annot, const MCSubtargetInfo &STI) { + StringRef Annot, const MCSubtargetInfo &STI) { O.flush(); printInstruction(MI, O); printAnnotation(O, Annot); diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h index 0ba74ca0f3e1..b544d1ef3605 100644 --- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h @@ -1,18 +1,18 @@ //===-- AMDGPUInstPrinter.h - AMDGPU MC Inst -> ASM interface ---*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // /// \file //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_AMDGPU_INSTPRINTER_AMDGPUINSTPRINTER_H -#define LLVM_LIB_TARGET_AMDGPU_INSTPRINTER_AMDGPUINSTPRINTER_H +#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUINSTPRINTER_H +#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUINSTPRINTER_H +#include "AMDGPUMCTargetDesc.h" #include "llvm/MC/MCInstPrinter.h" namespace llvm { @@ -26,7 +26,8 @@ public: //Autogenerated by tblgen void printInstruction(const MCInst *MI, const MCSubtargetInfo &STI, raw_ostream &O); - static const char *getRegisterName(unsigned RegNo); + static const char *getRegisterName(unsigned RegNo, + unsigned AltIdx = AMDGPU::NoRegAltName); void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot, const MCSubtargetInfo &STI) override; @@ -42,7 +43,6 @@ private: void printU4ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printU8ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printU16ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printS13ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printU32ImmOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printNamedBit(const MCInst *MI, unsigned OpNo, raw_ostream &O, @@ -53,8 +53,8 @@ private: void printMBUFOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printOffset(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); - void printOffsetS13(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, - raw_ostream &O); + void printFlatOffset(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); void printOffset0(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); @@ -68,6 +68,8 @@ private: const MCSubtargetInfo &STI, raw_ostream &O); void printGDS(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printDLC(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); void printGLC(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printSLC(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, @@ -76,6 +78,8 @@ private: raw_ostream &O); void printDMask(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printDim(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); void printUNorm(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printDA(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, @@ -112,6 +116,8 @@ private: const MCSubtargetInfo &STI, raw_ostream &O); void printOperandAndIntInputMods(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printDPP8(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); void printDPPCtrl(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printRowMask(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, @@ -120,6 +126,8 @@ private: const MCSubtargetInfo &STI, raw_ostream &O); void printBoundCtrl(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printFI(const MCInst *MI, unsigned OpNo, + const 
MCSubtargetInfo &STI, raw_ostream &O); void printSDWASel(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printSDWADstSel(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); @@ -150,6 +158,14 @@ private: const MCSubtargetInfo &STI, raw_ostream &O); void printMemOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printBLGP(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); + void printCBSZ(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); + void printABID(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); + void printDefaultVccOperand(unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); template <unsigned N> @@ -214,6 +230,8 @@ protected: const MCSubtargetInfo &STI, raw_ostream &O); void printHwreg(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printEndpgm(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); }; class R600InstPrinter : public MCInstPrinter { diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp index 2364e7b7b5fb..9e04ab9bae93 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp @@ -1,15 +1,16 @@ //===-- MCTargetDesc/AMDGPUMCAsmInfo.cpp - Assembly Info ------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // /// \file //===----------------------------------------------------------------------===// #include "AMDGPUMCAsmInfo.h" #include "llvm/ADT/Triple.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" using namespace llvm; @@ -19,7 +20,10 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT) : MCAsmInfoELF() { HasSingleParameterDotFile = false; //===------------------------------------------------------------------===// MinInstAlignment = 4; - MaxInstLength = (TT.getArch() == Triple::amdgcn) ? 8 : 16; + + // This is the maximum instruction encoded size for gfx10. With a known + // subtarget, it can be reduced to 8 bytes. + MaxInstLength = (TT.getArch() == Triple::amdgcn) ? 20 : 16; SeparatorString = "\n"; CommentString = ";"; PrivateLabelPrefix = ""; @@ -45,3 +49,18 @@ bool AMDGPUMCAsmInfo::shouldOmitSectionDirective(StringRef SectionName) const { SectionName == ".hsarodata_readonly_agent" || MCAsmInfo::shouldOmitSectionDirective(SectionName); } + +unsigned AMDGPUMCAsmInfo::getMaxInstLength(const MCSubtargetInfo *STI) const { + if (!STI || STI->getTargetTriple().getArch() == Triple::r600) + return MaxInstLength; + + // Maximum for NSA encoded images + if (STI->getFeatureBits()[AMDGPU::FeatureNSAEncoding]) + return 20; + + // 64-bit instruction with 32-bit literal. 
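+  // That is 8 bytes for the 64-bit encoding plus a 4-byte literal dword,
+  // 12 bytes total.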
+ if (STI->getFeatureBits()[AMDGPU::FeatureVOP3Literal]) + return 12; + + return 8; +} diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h index 8cb33a3179cd..71e63ec27a8f 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h @@ -1,9 +1,8 @@ //===-- MCTargetDesc/AMDGPUMCAsmInfo.h - AMDGPU MCAsm Interface -*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -28,6 +27,7 @@ class AMDGPUMCAsmInfo : public MCAsmInfoELF { public: explicit AMDGPUMCAsmInfo(const Triple &TT); bool shouldOmitSectionDirective(StringRef SectionName) const override; + unsigned getMaxInstLength(const MCSubtargetInfo *STI) const override; }; } // namespace llvm #endif diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp index cae7a7a6c7e7..f3d945cc0764 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp @@ -1,9 +1,8 @@ //===-- AMDGPUCodeEmitter.cpp - AMDGPU Code Emitter interface -------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h index dcc10a032afe..62757a707890 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h @@ -1,9 +1,8 @@ //===-- AMDGPUCodeEmitter.h - AMDGPU Code Emitter interface -----*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -64,10 +63,17 @@ public: return 0; } + virtual unsigned getAVOperandEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + return 0; + } + protected: - uint64_t computeAvailableFeatures(const FeatureBitset &FB) const; - void verifyInstructionPredicates(const MCInst &MI, - uint64_t AvailableFeatures) const; + FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const; + void + verifyInstructionPredicates(const MCInst &MI, + const FeatureBitset &AvailableFeatures) const; }; } // End namespace llvm diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp index c579c7d60e16..88df64d18cc5 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp @@ -1,9 +1,8 @@ //===-- AMDGPUMCTargetDesc.cpp - AMDGPU Target Descriptions ---------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -14,13 +13,15 @@ #include "AMDGPUMCTargetDesc.h" #include "AMDGPUELFStreamer.h" +#include "AMDGPUInstPrinter.h" #include "AMDGPUMCAsmInfo.h" #include "AMDGPUTargetStreamer.h" -#include "InstPrinter/AMDGPUInstPrinter.h" #include "SIDefines.h" +#include "TargetInfo/AMDGPUTargetInfo.h" #include "llvm/MC/MCAsmBackend.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCContext.h" +#include "llvm/MC/MCInstrAnalysis.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCRegisterInfo.h" @@ -104,6 +105,35 @@ static MCStreamer *createMCStreamer(const Triple &T, MCContext &Context, std::move(Emitter), RelaxAll); } +namespace { + +class AMDGPUMCInstrAnalysis : public MCInstrAnalysis { +public: + explicit AMDGPUMCInstrAnalysis(const MCInstrInfo *Info) + : MCInstrAnalysis(Info) {} + + bool evaluateBranch(const MCInst &Inst, uint64_t Addr, uint64_t Size, + uint64_t &Target) const override { + if (Inst.getNumOperands() == 0 || !Inst.getOperand(0).isImm() || + Info->get(Inst.getOpcode()).OpInfo[0].OperandType != + MCOI::OPERAND_PCREL) + return false; + + int64_t Imm = Inst.getOperand(0).getImm(); + // Our branches take a simm16, but we need two extra bits to account for + // the factor of 4. 
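+    // E.g. a 4-byte s_branch with a simm16 of -1 resolves to
+    // Addr + Size + (-1 * 4) == Addr, i.e. a branch to itself. The 18-bit
+    // APInt below keeps Imm * 4 from overflowing 16 bits before the sext.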
+ APInt SignedOffset(18, Imm * 4, true); + Target = (SignedOffset.sext(64) + Addr + Size).getZExtValue(); + return true; + } +}; + +} // end anonymous namespace + +static MCInstrAnalysis *createAMDGPUMCInstrAnalysis(const MCInstrInfo *Info) { + return new AMDGPUMCInstrAnalysis(Info); +} + extern "C" void LLVMInitializeAMDGPUTargetMC() { TargetRegistry::RegisterMCInstrInfo(getTheGCNTarget(), createAMDGPUMCInstrInfo); @@ -114,6 +144,7 @@ extern "C" void LLVMInitializeAMDGPUTargetMC() { TargetRegistry::RegisterMCRegInfo(*T, createAMDGPUMCRegisterInfo); TargetRegistry::RegisterMCSubtargetInfo(*T, createAMDGPUMCSubtargetInfo); TargetRegistry::RegisterMCInstPrinter(*T, createAMDGPUMCInstPrinter); + TargetRegistry::RegisterMCInstrAnalysis(*T, createAMDGPUMCInstrAnalysis); TargetRegistry::RegisterMCAsmBackend(*T, createAMDGPUAsmBackend); TargetRegistry::RegisterELFStreamer(*T, createMCStreamer); } diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h index f3628d96d6e9..9754d31fee60 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h @@ -1,9 +1,8 @@ //===-- AMDGPUMCTargetDesc.h - AMDGPU Target Descriptions -----*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -34,9 +33,6 @@ class Target; class Triple; class raw_pwrite_stream; -Target &getTheAMDGPUTarget(); -Target &getTheGCNTarget(); - MCCodeEmitter *createR600MCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, MCContext &Ctx); @@ -53,7 +49,7 @@ MCAsmBackend *createAMDGPUAsmBackend(const Target &T, std::unique_ptr<MCObjectTargetWriter> createAMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI, - bool HasRelocationAddend); + bool HasRelocationAddend, uint8_t ABIVersion); } // End llvm namespace #define GET_REGINFO_ENUM diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index c17fe126546c..8f11433476f4 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -1,9 +1,8 @@ //===-- AMDGPUTargetStreamer.cpp - AMDGPU Target Streamer Methods ---------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -19,7 +18,6 @@ #include "llvm/ADT/Twine.h" #include "llvm/BinaryFormat/AMDGPUMetadataVerifier.h" #include "llvm/BinaryFormat/ELF.h" -#include "llvm/BinaryFormat/MsgPackTypes.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/Metadata.h" @@ -52,51 +50,53 @@ bool AMDGPUTargetStreamer::EmitHSAMetadataV2(StringRef HSAMetadataString) { } bool AMDGPUTargetStreamer::EmitHSAMetadataV3(StringRef HSAMetadataString) { - std::shared_ptr<msgpack::Node> HSAMetadataRoot; - yaml::Input YIn(HSAMetadataString); - YIn >> HSAMetadataRoot; - if (YIn.error()) + msgpack::Document HSAMetadataDoc; + if (!HSAMetadataDoc.fromYAML(HSAMetadataString)) return false; - return EmitHSAMetadata(HSAMetadataRoot, false); + return EmitHSAMetadata(HSAMetadataDoc, false); } StringRef AMDGPUTargetStreamer::getArchNameFromElfMach(unsigned ElfMach) { AMDGPU::GPUKind AK; switch (ElfMach) { - case ELF::EF_AMDGPU_MACH_R600_R600: AK = GK_R600; break; - case ELF::EF_AMDGPU_MACH_R600_R630: AK = GK_R630; break; - case ELF::EF_AMDGPU_MACH_R600_RS880: AK = GK_RS880; break; - case ELF::EF_AMDGPU_MACH_R600_RV670: AK = GK_RV670; break; - case ELF::EF_AMDGPU_MACH_R600_RV710: AK = GK_RV710; break; - case ELF::EF_AMDGPU_MACH_R600_RV730: AK = GK_RV730; break; - case ELF::EF_AMDGPU_MACH_R600_RV770: AK = GK_RV770; break; - case ELF::EF_AMDGPU_MACH_R600_CEDAR: AK = GK_CEDAR; break; - case ELF::EF_AMDGPU_MACH_R600_CYPRESS: AK = GK_CYPRESS; break; - case ELF::EF_AMDGPU_MACH_R600_JUNIPER: AK = GK_JUNIPER; break; - case ELF::EF_AMDGPU_MACH_R600_REDWOOD: AK = GK_REDWOOD; break; - case ELF::EF_AMDGPU_MACH_R600_SUMO: AK = GK_SUMO; break; - case ELF::EF_AMDGPU_MACH_R600_BARTS: AK = GK_BARTS; break; - case ELF::EF_AMDGPU_MACH_R600_CAICOS: AK = GK_CAICOS; break; - case ELF::EF_AMDGPU_MACH_R600_CAYMAN: AK = GK_CAYMAN; break; - case ELF::EF_AMDGPU_MACH_R600_TURKS: AK = GK_TURKS; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX600: AK = GK_GFX600; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX601: AK = GK_GFX601; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX700: AK = GK_GFX700; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX701: AK = GK_GFX701; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX702: AK = GK_GFX702; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX703: AK = GK_GFX703; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX704: AK = GK_GFX704; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX801: AK = GK_GFX801; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX802: AK = GK_GFX802; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX803: AK = GK_GFX803; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX810: AK = GK_GFX810; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX900: AK = GK_GFX900; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX902: AK = GK_GFX902; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX904: AK = GK_GFX904; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX906: AK = GK_GFX906; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX909: AK = GK_GFX909; break; - case ELF::EF_AMDGPU_MACH_NONE: AK = GK_NONE; break; + case ELF::EF_AMDGPU_MACH_R600_R600: AK = GK_R600; break; + case ELF::EF_AMDGPU_MACH_R600_R630: AK = GK_R630; break; + case ELF::EF_AMDGPU_MACH_R600_RS880: AK = GK_RS880; break; + case ELF::EF_AMDGPU_MACH_R600_RV670: AK = GK_RV670; break; + case ELF::EF_AMDGPU_MACH_R600_RV710: AK = GK_RV710; break; + case ELF::EF_AMDGPU_MACH_R600_RV730: AK = GK_RV730; break; + case ELF::EF_AMDGPU_MACH_R600_RV770: AK = GK_RV770; break; + case 
ELF::EF_AMDGPU_MACH_R600_CEDAR: AK = GK_CEDAR; break; + case ELF::EF_AMDGPU_MACH_R600_CYPRESS: AK = GK_CYPRESS; break; + case ELF::EF_AMDGPU_MACH_R600_JUNIPER: AK = GK_JUNIPER; break; + case ELF::EF_AMDGPU_MACH_R600_REDWOOD: AK = GK_REDWOOD; break; + case ELF::EF_AMDGPU_MACH_R600_SUMO: AK = GK_SUMO; break; + case ELF::EF_AMDGPU_MACH_R600_BARTS: AK = GK_BARTS; break; + case ELF::EF_AMDGPU_MACH_R600_CAICOS: AK = GK_CAICOS; break; + case ELF::EF_AMDGPU_MACH_R600_CAYMAN: AK = GK_CAYMAN; break; + case ELF::EF_AMDGPU_MACH_R600_TURKS: AK = GK_TURKS; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX600: AK = GK_GFX600; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX601: AK = GK_GFX601; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX700: AK = GK_GFX700; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX701: AK = GK_GFX701; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX702: AK = GK_GFX702; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX703: AK = GK_GFX703; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX704: AK = GK_GFX704; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX801: AK = GK_GFX801; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX802: AK = GK_GFX802; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX803: AK = GK_GFX803; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX810: AK = GK_GFX810; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX900: AK = GK_GFX900; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX902: AK = GK_GFX902; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX904: AK = GK_GFX904; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX906: AK = GK_GFX906; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX908: AK = GK_GFX908; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX909: AK = GK_GFX909; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010: AK = GK_GFX1010; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1011: AK = GK_GFX1011; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1012: AK = GK_GFX1012; break; + case ELF::EF_AMDGPU_MACH_NONE: AK = GK_NONE; break; } StringRef GPUName = getArchNameAMDGCN(AK); @@ -142,7 +142,11 @@ unsigned AMDGPUTargetStreamer::getElfMach(StringRef GPU) { case GK_GFX902: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX902; case GK_GFX904: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX904; case GK_GFX906: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX906; + case GK_GFX908: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX908; case GK_GFX909: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX909; + case GK_GFX1010: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010; + case GK_GFX1011: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1011; + case GK_GFX1012: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1012; case GK_NONE: return ELF::EF_AMDGPU_MACH_NONE; } @@ -157,6 +161,14 @@ AMDGPUTargetAsmStreamer::AMDGPUTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS) : AMDGPUTargetStreamer(S), OS(OS) { } +// A hook for emitting stuff at the end. +// We use it for emitting the accumulated PAL metadata as directives. 
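+// (MCTargetStreamer::finish runs when the parent streamer is finalized, so
+// everything codegen added to the PAL metadata is visible at this point.)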
+void AMDGPUTargetAsmStreamer::finish() { + std::string S; + getPALMetadata()->toString(S); + OS << S; +} + void AMDGPUTargetAsmStreamer::EmitDirectiveAMDGCNTarget(StringRef Target) { OS << "\t.amdgcn_target \"" << Target << "\"\n"; } @@ -196,6 +208,12 @@ void AMDGPUTargetAsmStreamer::EmitAMDGPUSymbolType(StringRef SymbolName, } } +void AMDGPUTargetAsmStreamer::emitAMDGPULDS(MCSymbol *Symbol, unsigned Size, + unsigned Align) { + OS << "\t.amdgpu_lds " << Symbol->getName() << ", " << Size << ", " << Align + << '\n'; +} + bool AMDGPUTargetAsmStreamer::EmitISAVersion(StringRef IsaVersionString) { OS << "\t.amd_amdgpu_isa \"" << IsaVersionString << "\"\n"; return true; @@ -214,15 +232,14 @@ bool AMDGPUTargetAsmStreamer::EmitHSAMetadata( } bool AMDGPUTargetAsmStreamer::EmitHSAMetadata( - std::shared_ptr<msgpack::Node> &HSAMetadataRoot, bool Strict) { + msgpack::Document &HSAMetadataDoc, bool Strict) { V3::MetadataVerifier Verifier(Strict); - if (!Verifier.verify(*HSAMetadataRoot)) + if (!Verifier.verify(HSAMetadataDoc.getRoot())) return false; std::string HSAMetadataString; raw_string_ostream StrOS(HSAMetadataString); - yaml::Output YOut(StrOS); - YOut << HSAMetadataRoot; + HSAMetadataDoc.toYAML(StrOS); OS << '\t' << V3::AssemblerDirectiveBegin << '\n'; OS << StrOS.str() << '\n'; @@ -230,13 +247,10 @@ bool AMDGPUTargetAsmStreamer::EmitHSAMetadata( return true; } -bool AMDGPUTargetAsmStreamer::EmitPALMetadata( - const PALMD::Metadata &PALMetadata) { - std::string PALMetadataString; - if (PALMD::toString(PALMetadata, PALMetadataString)) - return false; - - OS << '\t' << PALMD::AssemblerDirective << PALMetadataString << '\n'; +bool AMDGPUTargetAsmStreamer::EmitCodeEnd() { + const uint32_t Encoded_s_code_end = 0xbf9f0000; + OS << "\t.p2alignl 6, " << Encoded_s_code_end << '\n'; + OS << "\t.fill 32, 4, " << Encoded_s_code_end << '\n'; return true; } @@ -278,6 +292,10 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( PRINT_FIELD(OS, ".amdhsa_user_sgpr_private_segment_size", KD, kernel_code_properties, amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE); + if (IVersion.Major >= 10) + PRINT_FIELD(OS, ".amdhsa_wavefront_size32", KD, + kernel_code_properties, + amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32); PRINT_FIELD( OS, ".amdhsa_system_sgpr_private_segment_wavefront_offset", KD, compute_pgm_rsrc2, @@ -331,6 +349,17 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( PRINT_FIELD(OS, ".amdhsa_fp16_overflow", KD, compute_pgm_rsrc1, amdhsa::COMPUTE_PGM_RSRC1_FP16_OVFL); + if (IVersion.Major >= 10) { + PRINT_FIELD(OS, ".amdhsa_workgroup_processor_mode", KD, + compute_pgm_rsrc1, + amdhsa::COMPUTE_PGM_RSRC1_WGP_MODE); + PRINT_FIELD(OS, ".amdhsa_memory_ordered", KD, + compute_pgm_rsrc1, + amdhsa::COMPUTE_PGM_RSRC1_MEM_ORDERED); + PRINT_FIELD(OS, ".amdhsa_forward_progress", KD, + compute_pgm_rsrc1, + amdhsa::COMPUTE_PGM_RSRC1_FWD_PROGRESS); + } PRINT_FIELD( OS, ".amdhsa_exception_fp_ieee_invalid_op", KD, compute_pgm_rsrc2, @@ -387,6 +416,19 @@ MCELFStreamer &AMDGPUTargetELFStreamer::getStreamer() { return static_cast<MCELFStreamer &>(Streamer); } +// A hook for emitting stuff at the end. +// We use it for emitting the accumulated PAL metadata as a .note record. 
+void AMDGPUTargetELFStreamer::finish() { + std::string Blob; + const char *Vendor = getPALMetadata()->getVendor(); + unsigned Type = getPALMetadata()->getType(); + getPALMetadata()->toBlob(Type, Blob); + if (Blob.empty()) + return; + EmitNote(Vendor, MCConstantExpr::create(Blob.size(), getContext()), Type, + [&](MCELFStreamer &OS) { OS.EmitBytes(Blob); }); +} + void AMDGPUTargetELFStreamer::EmitNote( StringRef Name, const MCExpr *DescSZ, unsigned NoteType, function_ref<void(MCELFStreamer &)> EmitDesc) { @@ -463,6 +505,27 @@ void AMDGPUTargetELFStreamer::EmitAMDGPUSymbolType(StringRef SymbolName, Symbol->setType(Type); } +void AMDGPUTargetELFStreamer::emitAMDGPULDS(MCSymbol *Symbol, unsigned Size, + unsigned Align) { + assert(isPowerOf2_32(Align)); + + MCSymbolELF *SymbolELF = cast<MCSymbolELF>(Symbol); + SymbolELF->setType(ELF::STT_OBJECT); + + if (!SymbolELF->isBindingSet()) { + SymbolELF->setBinding(ELF::STB_GLOBAL); + SymbolELF->setExternal(true); + } + + if (SymbolELF->declareCommon(Size, Align, true)) { + report_fatal_error("Symbol: " + Symbol->getName() + + " redeclared as different type"); + } + + SymbolELF->setIndex(ELF::SHN_AMDGPU_LDS); + SymbolELF->setSize(MCConstantExpr::create(Size, getContext())); +} + bool AMDGPUTargetELFStreamer::EmitISAVersion(StringRef IsaVersionString) { // Create two labels to mark the beginning and end of the desc field // and a MCExpr to calculate the size of the desc field. @@ -482,16 +545,14 @@ bool AMDGPUTargetELFStreamer::EmitISAVersion(StringRef IsaVersionString) { return true; } -bool AMDGPUTargetELFStreamer::EmitHSAMetadata( - std::shared_ptr<msgpack::Node> &HSAMetadataRoot, bool Strict) { +bool AMDGPUTargetELFStreamer::EmitHSAMetadata(msgpack::Document &HSAMetadataDoc, + bool Strict) { V3::MetadataVerifier Verifier(Strict); - if (!Verifier.verify(*HSAMetadataRoot)) + if (!Verifier.verify(HSAMetadataDoc.getRoot())) return false; std::string HSAMetadataString; - raw_string_ostream StrOS(HSAMetadataString); - msgpack::Writer MPWriter(StrOS); - HSAMetadataRoot->write(MPWriter); + HSAMetadataDoc.writeToBlob(HSAMetadataString); // Create two labels to mark the beginning and end of the desc field // and a MCExpr to calculate the size of the desc field. 
@@ -505,7 +566,7 @@ bool AMDGPUTargetELFStreamer::EmitHSAMetadata( EmitNote(ElfNote::NoteNameV3, DescSZ, ELF::NT_AMDGPU_METADATA, [&](MCELFStreamer &OS) { OS.EmitLabel(DescBegin); - OS.EmitBytes(StrOS.str()); + OS.EmitBytes(HSAMetadataString); OS.EmitLabel(DescEnd); }); return true; @@ -535,15 +596,15 @@ bool AMDGPUTargetELFStreamer::EmitHSAMetadata( return true; } -bool AMDGPUTargetELFStreamer::EmitPALMetadata( - const PALMD::Metadata &PALMetadata) { - EmitNote(ElfNote::NoteNameV2, - MCConstantExpr::create(PALMetadata.size() * sizeof(uint32_t), - getContext()), - ELF::NT_AMD_AMDGPU_PAL_METADATA, [&](MCELFStreamer &OS) { - for (auto I : PALMetadata) - OS.EmitIntValue(I, sizeof(uint32_t)); - }); +bool AMDGPUTargetELFStreamer::EmitCodeEnd() { + const uint32_t Encoded_s_code_end = 0xbf9f0000; + + MCStreamer &OS = getStreamer(); + OS.PushSection(); + OS.EmitValueToAlignment(64, Encoded_s_code_end, 4); + for (unsigned I = 0; I < 32; ++I) + OS.EmitIntValue(Encoded_s_code_end, 4); + OS.PopSection(); return true; } @@ -555,16 +616,25 @@ void AMDGPUTargetELFStreamer::EmitAmdhsaKernelDescriptor( auto &Streamer = getStreamer(); auto &Context = Streamer.getContext(); + MCSymbolELF *KernelCodeSymbol = cast<MCSymbolELF>( + Context.getOrCreateSymbol(Twine(KernelName))); MCSymbolELF *KernelDescriptorSymbol = cast<MCSymbolELF>( Context.getOrCreateSymbol(Twine(KernelName) + Twine(".kd"))); - KernelDescriptorSymbol->setBinding(ELF::STB_GLOBAL); + + // Copy kernel descriptor symbol's binding, other and visibility from the + // kernel code symbol. + KernelDescriptorSymbol->setBinding(KernelCodeSymbol->getBinding()); + KernelDescriptorSymbol->setOther(KernelCodeSymbol->getOther()); + KernelDescriptorSymbol->setVisibility(KernelCodeSymbol->getVisibility()); + // Kernel descriptor symbol's type and size are fixed. KernelDescriptorSymbol->setType(ELF::STT_OBJECT); KernelDescriptorSymbol->setSize( MCConstantExpr::create(sizeof(KernelDescriptor), Context)); - MCSymbolELF *KernelCodeSymbol = cast<MCSymbolELF>( - Context.getOrCreateSymbol(Twine(KernelName))); - KernelCodeSymbol->setBinding(ELF::STB_LOCAL); + // The visibility of the kernel code symbol must be protected or less to allow + // static relocations from the kernel descriptor to be used. + if (KernelCodeSymbol->getVisibility() == ELF::STV_DEFAULT) + KernelCodeSymbol->setVisibility(ELF::STV_PROTECTED); Streamer.EmitLabel(KernelDescriptorSymbol); Streamer.EmitBytes(StringRef( diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h index 9a807c804f9f..683b3e363b9a 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h @@ -1,9 +1,8 @@ //===-- AMDGPUTargetStreamer.h - AMDGPU Target Streamer --------*- C++ -*--===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -11,7 +10,8 @@ #define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUTARGETSTREAMER_H #include "AMDKernelCodeT.h" -#include "llvm/BinaryFormat/MsgPackTypes.h" +#include "Utils/AMDGPUPALMetadata.h" +#include "llvm/BinaryFormat/MsgPackDocument.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/AMDGPUMetadata.h" @@ -29,12 +29,16 @@ class Module; class Type; class AMDGPUTargetStreamer : public MCTargetStreamer { + AMDGPUPALMetadata PALMetadata; + protected: MCContext &getContext() const { return Streamer.getContext(); } public: AMDGPUTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {} + AMDGPUPALMetadata *getPALMetadata() { return &PALMetadata; } + virtual void EmitDirectiveAMDGCNTarget(StringRef Target) = 0; virtual void EmitDirectiveHSACodeObjectVersion(uint32_t Major, @@ -49,6 +53,9 @@ public: virtual void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) = 0; + virtual void emitAMDGPULDS(MCSymbol *Symbol, unsigned Size, + unsigned Align) = 0; + /// \returns True on success, false on failure. virtual bool EmitISAVersion(StringRef IsaVersionString) = 0; @@ -65,14 +72,13 @@ public: /// the \p HSAMetadata structure is updated with the correct types. /// /// \returns True on success, false on failure. - virtual bool EmitHSAMetadata(std::shared_ptr<msgpack::Node> &HSAMetadata, - bool Strict) = 0; + virtual bool EmitHSAMetadata(msgpack::Document &HSAMetadata, bool Strict) = 0; /// \returns True on success, false on failure. virtual bool EmitHSAMetadata(const AMDGPU::HSAMD::Metadata &HSAMetadata) = 0; /// \returns True on success, false on failure. - virtual bool EmitPALMetadata(const AMDGPU::PALMD::Metadata &PALMetadata) = 0; + virtual bool EmitCodeEnd() = 0; virtual void EmitAmdhsaKernelDescriptor( const MCSubtargetInfo &STI, StringRef KernelName, @@ -89,6 +95,8 @@ class AMDGPUTargetAsmStreamer final : public AMDGPUTargetStreamer { public: AMDGPUTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS); + void finish() override; + void EmitDirectiveAMDGCNTarget(StringRef Target) override; void EmitDirectiveHSACodeObjectVersion(uint32_t Major, @@ -102,18 +110,19 @@ public: void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) override; + void emitAMDGPULDS(MCSymbol *Sym, unsigned Size, unsigned Align) override; + /// \returns True on success, false on failure. bool EmitISAVersion(StringRef IsaVersionString) override; /// \returns True on success, false on failure. - bool EmitHSAMetadata(std::shared_ptr<msgpack::Node> &HSAMetadata, - bool Strict) override; + bool EmitHSAMetadata(msgpack::Document &HSAMetadata, bool Strict) override; /// \returns True on success, false on failure. bool EmitHSAMetadata(const AMDGPU::HSAMD::Metadata &HSAMetadata) override; /// \returns True on success, false on failure. 
- bool EmitPALMetadata(const AMDGPU::PALMD::Metadata &PALMetadata) override; + bool EmitCodeEnd() override; void EmitAmdhsaKernelDescriptor( const MCSubtargetInfo &STI, StringRef KernelName, @@ -133,6 +142,8 @@ public: MCELFStreamer &getStreamer(); + void finish() override; + void EmitDirectiveAMDGCNTarget(StringRef Target) override; void EmitDirectiveHSACodeObjectVersion(uint32_t Major, @@ -146,18 +157,19 @@ public: void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) override; + void emitAMDGPULDS(MCSymbol *Sym, unsigned Size, unsigned Align) override; + /// \returns True on success, false on failure. bool EmitISAVersion(StringRef IsaVersionString) override; /// \returns True on success, false on failure. - bool EmitHSAMetadata(std::shared_ptr<msgpack::Node> &HSAMetadata, - bool Strict) override; + bool EmitHSAMetadata(msgpack::Document &HSAMetadata, bool Strict) override; /// \returns True on success, false on failure. bool EmitHSAMetadata(const AMDGPU::HSAMD::Metadata &HSAMetadata) override; /// \returns True on success, false on failure. - bool EmitPALMetadata(const AMDGPU::PALMD::Metadata &PALMetadata) override; + bool EmitCodeEnd() override; void EmitAmdhsaKernelDescriptor( const MCSubtargetInfo &STI, StringRef KernelName, diff --git a/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp index 28d4bc1829e2..2f1f4e7a0392 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp @@ -1,9 +1,8 @@ //===- R600MCCodeEmitter.cpp - Code Emitter for R600->Cayman GPU families -===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -65,9 +64,10 @@ private: uint64_t getBinaryCodeForInstr(const MCInst &MI, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const; - uint64_t computeAvailableFeatures(const FeatureBitset &FB) const; - void verifyInstructionPredicates(const MCInst &MI, - uint64_t AvailableFeatures) const; + FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const; + void + verifyInstructionPredicates(const MCInst &MI, + const FeatureBitset &AvailableFeatures) const; }; diff --git a/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp b/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp index 1c99a708e5ac..a4809af29daa 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp @@ -1,9 +1,8 @@ //===-- R600MCTargetDesc.cpp - R600 Target Descriptions -------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp index 36913bd04274..f8ec3c36f019 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp @@ -1,9 +1,8 @@ //===-- SIMCCodeEmitter.cpp - SI Code Emitter -----------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -14,9 +13,11 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "AMDGPURegisterInfo.h" #include "MCTargetDesc/AMDGPUFixupKinds.h" #include "MCTargetDesc/AMDGPUMCCodeEmitter.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIDefines.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCContext.h" @@ -77,6 +78,10 @@ public: unsigned getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const override; + + unsigned getAVOperandEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const override; }; } // end anonymous namespace @@ -233,6 +238,8 @@ uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO, case AMDGPU::OPERAND_REG_IMM_FP32: case AMDGPU::OPERAND_REG_INLINE_C_INT32: case AMDGPU::OPERAND_REG_INLINE_C_FP32: + case AMDGPU::OPERAND_REG_INLINE_AC_INT32: + case AMDGPU::OPERAND_REG_INLINE_AC_FP32: return getLit32Encoding(static_cast<uint32_t>(Imm), STI); case AMDGPU::OPERAND_REG_IMM_INT64: @@ -245,12 +252,21 @@ uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO, case AMDGPU::OPERAND_REG_IMM_FP16: case AMDGPU::OPERAND_REG_INLINE_C_INT16: case AMDGPU::OPERAND_REG_INLINE_C_FP16: + case AMDGPU::OPERAND_REG_INLINE_AC_INT16: + case AMDGPU::OPERAND_REG_INLINE_AC_FP16: // FIXME Is this correct? What do inline immediates do on SI for f16 src // which does not have f16 support? return getLit16Encoding(static_cast<uint16_t>(Imm), STI); + case AMDGPU::OPERAND_REG_IMM_V2INT16: + case AMDGPU::OPERAND_REG_IMM_V2FP16: + if (!isUInt<16>(Imm) && STI.getFeatureBits()[AMDGPU::FeatureVOP3Literal]) + return getLit32Encoding(static_cast<uint32_t>(Imm), STI); + LLVM_FALLTHROUGH; case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: - case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: { + case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: { uint16_t Lo16 = static_cast<uint16_t>(Imm); uint32_t Encoding = getLit16Encoding(Lo16, STI); return Encoding; @@ -274,7 +290,25 @@ void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, OS.write((uint8_t) ((Encoding >> (8 * i)) & 0xff)); } - if (bytes > 4) + // NSA encoding. 
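+  // Each extra address in the NSA form is appended as a single byte after
+  // the base encoding, then zero-padded to a dword boundary; e.g. three
+  // extra addresses take one pad byte, since (-3) & 3 == 1.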
+ if (AMDGPU::isGFX10(STI) && Desc.TSFlags & SIInstrFlags::MIMG) { + int vaddr0 = AMDGPU::getNamedOperandIdx(MI.getOpcode(), + AMDGPU::OpName::vaddr0); + int srsrc = AMDGPU::getNamedOperandIdx(MI.getOpcode(), + AMDGPU::OpName::srsrc); + assert(vaddr0 >= 0 && srsrc > vaddr0); + unsigned NumExtraAddrs = srsrc - vaddr0 - 1; + unsigned NumPadding = (-NumExtraAddrs) & 3; + + for (unsigned i = 0; i < NumExtraAddrs; ++i) + OS.write((uint8_t)getMachineOpValue(MI, MI.getOperand(vaddr0 + 1 + i), + Fixups, STI)); + for (unsigned i = 0; i < NumPadding; ++i) + OS.write(0); + } + + if ((bytes > 8 && STI.getFeatureBits()[AMDGPU::FeatureVOP3Literal]) || + (bytes > 4 && !STI.getFeatureBits()[AMDGPU::FeatureVOP3Literal])) return; // Check for additional literals in SRC0/1/2 (Op 1/2/3) @@ -366,7 +400,7 @@ SIMCCodeEmitter::getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo, const MCOperand &MO = MI.getOperand(OpNo); unsigned Reg = MO.getReg(); - if (Reg != AMDGPU::VCC) { + if (Reg != AMDGPU::VCC && Reg != AMDGPU::VCC_LO) { RegEnc |= MRI.getEncodingValue(Reg); RegEnc &= SDWA9EncValues::VOPC_DST_SGPR_MASK; RegEnc |= SDWA9EncValues::VOPC_DST_VCC_MASK; @@ -374,10 +408,31 @@ SIMCCodeEmitter::getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo, return RegEnc; } +unsigned +SIMCCodeEmitter::getAVOperandEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + unsigned Reg = MI.getOperand(OpNo).getReg(); + uint64_t Enc = MRI.getEncodingValue(Reg); + + // VGPR and AGPR have the same encoding, but SrcA and SrcB operands of mfma + // instructions use acc[0:1] modifier bits to distinguish. These bits are + // encoded as a virtual 9th bit of the register for these operands. + if (MRI.getRegClass(AMDGPU::AGPR_32RegClassID).contains(Reg) || + MRI.getRegClass(AMDGPU::AReg_64RegClassID).contains(Reg)) + Enc |= 512; + + return Enc; +} + static bool needsPCRel(const MCExpr *Expr) { switch (Expr->getKind()) { - case MCExpr::SymbolRef: - return true; + case MCExpr::SymbolRef: { + auto *SE = cast<MCSymbolRefExpr>(Expr); + MCSymbolRefExpr::VariantKind Kind = SE->getKind(); + return Kind != MCSymbolRefExpr::VK_AMDGPU_ABS32_LO && + Kind != MCSymbolRefExpr::VK_AMDGPU_ABS32_HI; + } case MCExpr::Binary: { auto *BE = cast<MCBinaryExpr>(Expr); if (BE->getOpcode() == MCBinaryExpr::Sub) @@ -416,7 +471,13 @@ uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI, Kind = FK_PCRel_4; else Kind = FK_Data_4; - Fixups.push_back(MCFixup::create(4, MO.getExpr(), Kind, MI.getLoc())); + + const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); + uint32_t Offset = Desc.getSize(); + assert(Offset == 4 || Offset == 8); + + Fixups.push_back( + MCFixup::create(Offset, MO.getExpr(), Kind, MI.getLoc())); } // Figure out the operand number, needed for isSrcOperand check @@ -429,7 +490,8 @@ uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI, const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); if (AMDGPU::isSISrcOperand(Desc, OpNo)) { uint32_t Enc = getLitEncoding(MO, Desc.OpInfo[OpNo], STI); - if (Enc != ~0U && (Enc != 255 || Desc.getSize() == 4)) + if (Enc != ~0U && + (Enc != 255 || Desc.getSize() == 4 || Desc.getSize() == 8)) return Enc; } else if (MO.isImm()) diff --git a/lib/Target/AMDGPU/MIMGInstructions.td b/lib/Target/AMDGPU/MIMGInstructions.td index 1c68dbd78e75..4735e6cb2446 100644 --- a/lib/Target/AMDGPU/MIMGInstructions.td +++ b/lib/Target/AMDGPU/MIMGInstructions.td @@ -1,9 +1,8 @@ //===-- MIMGInstructions.td - MIMG Instruction Defintions -----------------===// // -// The 
LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -12,10 +11,14 @@ // // - MIMGEncGfx6: encoding introduced with gfx6 (obsoleted for atomics in gfx8) // - MIMGEncGfx8: encoding introduced with gfx8 for atomics +// - MIMGEncGfx10Default: gfx default (non-NSA) encoding +// - MIMGEncGfx10NSA: gfx10 NSA encoding class MIMGEncoding; def MIMGEncGfx6 : MIMGEncoding; def MIMGEncGfx8 : MIMGEncoding; +def MIMGEncGfx10Default : MIMGEncoding; +def MIMGEncGfx10NSA : MIMGEncoding; def MIMGEncoding : GenericEnum { let FilterClass = "MIMGEncoding"; @@ -60,13 +63,28 @@ def MIMGDim : GenericEnum { def MIMGDimInfoTable : GenericTable { let FilterClass = "AMDGPUDimProps"; let CppTypeName = "MIMGDimInfo"; - let Fields = ["Dim", "NumCoords", "NumGradients", "DA"]; + let Fields = ["Dim", "NumCoords", "NumGradients", "DA", "Encoding", "AsmSuffix"]; GenericEnum TypeOf_Dim = MIMGDim; let PrimaryKey = ["Dim"]; let PrimaryKeyName = "getMIMGDimInfo"; } +def getMIMGDimInfoByEncoding : SearchIndex { + let Table = MIMGDimInfoTable; + let Key = ["Encoding"]; +} + +def getMIMGDimInfoByAsmSuffix : SearchIndex { + let Table = MIMGDimInfoTable; + let Key = ["AsmSuffix"]; +} + +class mimg <bits<8> si_gfx10, bits<8> vi = si_gfx10> { + field bits<8> SI_GFX10 = si_gfx10; + field bits<8> VI = vi; +} + class MIMGLZMapping<MIMGBaseOpcode l, MIMGBaseOpcode lz> { MIMGBaseOpcode L = l; MIMGBaseOpcode LZ = lz; @@ -83,12 +101,23 @@ def MIMGLZMappingTable : GenericTable { let PrimaryKeyName = "getMIMGLZMappingInfo"; } -class mimg <bits<7> si, bits<7> vi = si> { - field bits<7> SI = si; - field bits<7> VI = vi; +class MIMGMIPMapping<MIMGBaseOpcode mip, MIMGBaseOpcode nonmip> { + MIMGBaseOpcode MIP = mip; + MIMGBaseOpcode NONMIP = nonmip; } -class MIMG <dag outs, string dns = ""> +def MIMGMIPMappingTable : GenericTable { + let FilterClass = "MIMGMIPMapping"; + let CppTypeName = "MIMGMIPMappingInfo"; + let Fields = ["MIP", "NONMIP"]; + GenericEnum TypeOf_MIP = MIMGBaseOpcode; + GenericEnum TypeOf_NONMIP = MIMGBaseOpcode; + + let PrimaryKey = ["MIP"]; + let PrimaryKeyName = "getMIMGMIPMappingInfo"; +} + +class MIMG_Base <dag outs, string dns = ""> : InstSI <outs, (ins), "", []> { let VM_CNT = 1; @@ -97,20 +126,24 @@ class MIMG <dag outs, string dns = ""> let Uses = [EXEC]; let mayLoad = 1; let mayStore = 0; - let hasPostISelHook = 1; let SchedRW = [WriteVMEM]; let UseNamedOperandTable = 1; let hasSideEffects = 0; // XXX ???? - let SubtargetPredicate = isGCN; let DecoderNamespace = dns; let isAsmParserOnly = !if(!eq(dns,""), 1, 0); - let AsmMatchConverter = "cvtMIMG"; let usesCustomInserter = 1; +} + +class MIMG <dag outs, string dns = ""> + : MIMG_Base <outs, dns> { + + let hasPostISelHook = 1; + let AsmMatchConverter = "cvtMIMG"; Instruction Opcode = !cast<Instruction>(NAME); MIMGBaseOpcode BaseOpcode; - MIMGEncoding MIMGEncoding = MIMGEncGfx6; + MIMGEncoding MIMGEncoding; bits<8> VDataDwords; bits<8> VAddrDwords; } @@ -131,15 +164,66 @@ def getMIMGInfo : SearchIndex { let Key = ["Opcode"]; } -class MIMG_NoSampler_Helper <bits<7> op, string asm, +// This is a separate class so that TableGen memoizes the computations. 
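MIMGNSAHelper, defined just below, folds the address count into three things: the vaddr operand names, the bracketed assembly string, and the NSA dword count (vaddr0 rides in the base encoding; each NSA dword carries four more byte-encoded addresses). A rough C++ rendering of the same computation, for orientation only; the authoritative logic is the TableGen !foldl/!if that follows:

    #include <iostream>
    #include <string>
    #include <vector>

    // Mirror of MIMGNSAHelper's computation. Illustrative only.
    struct NSAHelper {
      std::vector<std::string> addrNames; // "vaddr0", "vaddr1", ...
      std::string addrAsm;                // "[$vaddr0, $vaddr1, ...]"
      int nsaWords;                       // -1 stands in for TableGen's '?'

      explicit NSAHelper(int numAddrs) {
        for (int i = 0; i < numAddrs; ++i)
          addrNames.push_back("vaddr" + std::to_string(i));
        addrAsm = "[";
        for (int i = 0; i < numAddrs; ++i)
          addrAsm += (i ? ", $" : "$") + addrNames[i];
        addrAsm += "]";
        // vaddr0 is encoded in the base instruction; each NSA dword adds 4.
        nsaWords = numAddrs <= 1 ? -1
                 : numAddrs <= 5 ? 1
                 : numAddrs <= 9 ? 2
                 : numAddrs <= 13 ? 3 : -1;
      }
    };

    int main() {
      NSAHelper h(3);
      // Prints: [$vaddr0, $vaddr1, $vaddr2] nsa=1
      std::cout << h.addrAsm << " nsa=" << h.nsaWords << "\n";
    }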
+class MIMGNSAHelper<int num_addrs> { + list<string> AddrAsmNames = + !foldl([]<string>, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], lhs, i, + !if(!lt(i, num_addrs), !listconcat(lhs, ["vaddr"#!size(lhs)]), lhs)); + dag AddrIns = !dag(ins, !foreach(arg, AddrAsmNames, VGPR_32), AddrAsmNames); + string AddrAsm = "[" # !foldl("$" # !head(AddrAsmNames), !tail(AddrAsmNames), lhs, rhs, + lhs # ", $" # rhs) # "]"; + + int NSA = !if(!le(num_addrs, 1), ?, + !if(!le(num_addrs, 5), 1, + !if(!le(num_addrs, 9), 2, + !if(!le(num_addrs, 13), 3, ?)))); +} + +// Base class of all pre-gfx10 MIMG instructions. +class MIMG_gfx6789<bits<8> op, dag outs, string dns = ""> + : MIMG<outs, dns>, MIMGe_gfx6789<op> { + let SubtargetPredicate = isGFX6GFX7GFX8GFX9; + let AssemblerPredicates = [isGFX6GFX7GFX8GFX9]; + + let MIMGEncoding = MIMGEncGfx6; + + let d16 = !if(BaseOpcode.HasD16, ?, 0); +} + +// Base class of all non-NSA gfx10 MIMG instructions. +class MIMG_gfx10<int op, dag outs, string dns = ""> + : MIMG<outs, dns>, MIMGe_gfx10<op> { + let SubtargetPredicate = isGFX10Plus; + let AssemblerPredicates = [isGFX10Plus]; + + let MIMGEncoding = MIMGEncGfx10Default; + + let d16 = !if(BaseOpcode.HasD16, ?, 0); + let nsa = 0; +} + +// Base class for all NSA MIMG instructions. Note that 1-dword addresses always +// use non-NSA variants. +class MIMG_nsa_gfx10<int op, dag outs, int num_addrs, string dns=""> + : MIMG<outs, dns>, MIMGe_gfx10<op> { + let SubtargetPredicate = isGFX10Plus; + let AssemblerPredicates = [isGFX10Plus]; + + let MIMGEncoding = MIMGEncGfx10NSA; + + MIMGNSAHelper nsah = MIMGNSAHelper<num_addrs>; + dag AddrIns = nsah.AddrIns; + string AddrAsm = nsah.AddrAsm; + + let d16 = !if(BaseOpcode.HasD16, ?, 0); + let nsa = nsah.NSA; +} + +class MIMG_NoSampler_Helper <bits<8> op, string asm, RegisterClass dst_rc, RegisterClass addr_rc, string dns=""> - : MIMG <(outs dst_rc:$vdata), dns>, - MIMGe<op> { - let ssamp = 0; - let d16 = !if(BaseOpcode.HasD16, ?, 0); - + : MIMG_gfx6789 <op, (outs dst_rc:$vdata), dns> { let InOperandList = !con((ins addr_rc:$vaddr, SReg_256:$srsrc, DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da), @@ -148,23 +232,66 @@ class MIMG_NoSampler_Helper <bits<7> op, string asm, #!if(BaseOpcode.HasD16, "$d16", ""); } -multiclass MIMG_NoSampler_Src_Helper <bits<7> op, string asm, +class MIMG_NoSampler_gfx10<int op, string opcode, + RegisterClass DataRC, RegisterClass AddrRC, + string dns=""> + : MIMG_gfx10<op, (outs DataRC:$vdata), dns> { + let InOperandList = !con((ins AddrRC:$vaddr0, SReg_256:$srsrc, DMask:$dmask, + Dim:$dim, UNorm:$unorm, DLC:$dlc, GLC:$glc, + SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe), + !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); + let AsmString = opcode#" $vdata, $vaddr0, $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$tfe$lwe" + #!if(BaseOpcode.HasD16, "$d16", ""); +} + +class MIMG_NoSampler_nsa_gfx10<int op, string opcode, + RegisterClass DataRC, int num_addrs, + string dns=""> + : MIMG_nsa_gfx10<op, (outs DataRC:$vdata), num_addrs, dns> { + let InOperandList = !con(AddrIns, + (ins SReg_256:$srsrc, DMask:$dmask, + Dim:$dim, UNorm:$unorm, DLC:$dlc, GLC:$glc, + SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe), + !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); + let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$tfe$lwe" + #!if(BaseOpcode.HasD16, "$d16", ""); +} + +multiclass MIMG_NoSampler_Src_Helper <bits<8> op, string asm, RegisterClass dst_rc, bit enableDisasm> { - let VAddrDwords = 1 in - def NAME # _V1 : 
MIMG_NoSampler_Helper <op, asm, dst_rc, VGPR_32, - !if(enableDisasm, "AMDGPU", "")>; - let VAddrDwords = 2 in - def NAME # _V2 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_64>; - let VAddrDwords = 3 in - def NAME # _V3 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_96>; - let VAddrDwords = 4 in - def NAME # _V4 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_128>; -} - -multiclass MIMG_NoSampler <bits<7> op, string asm, bit has_d16, bit mip = 0, + let ssamp = 0 in { + let VAddrDwords = 1 in { + def _V1 : MIMG_NoSampler_Helper <op, asm, dst_rc, VGPR_32, + !if(enableDisasm, "AMDGPU", "")>; + def _V1_gfx10 : MIMG_NoSampler_gfx10<op, asm, dst_rc, VGPR_32, + !if(enableDisasm, "AMDGPU", "")>; + } + + let VAddrDwords = 2 in { + def _V2 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_64>; + def _V2_gfx10 : MIMG_NoSampler_gfx10<op, asm, dst_rc, VReg_64>; + def _V2_nsa_gfx10 : MIMG_NoSampler_nsa_gfx10<op, asm, dst_rc, 2>; + } + + let VAddrDwords = 3 in { + def _V3 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_96>; + def _V3_gfx10 : MIMG_NoSampler_gfx10<op, asm, dst_rc, VReg_96>; + def _V3_nsa_gfx10 : MIMG_NoSampler_nsa_gfx10<op, asm, dst_rc, 3>; + } + + let VAddrDwords = 4 in { + def _V4 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_128>; + def _V4_gfx10 : MIMG_NoSampler_gfx10<op, asm, dst_rc, VReg_128>; + def _V4_nsa_gfx10 : MIMG_NoSampler_nsa_gfx10<op, asm, dst_rc, 4, + !if(enableDisasm, "AMDGPU", "")>; + } + } +} + +multiclass MIMG_NoSampler <bits<8> op, string asm, bit has_d16, bit mip = 0, bit isResInfo = 0> { - def "" : MIMGBaseOpcode { + def "" : MIMGBaseOpcode, PredicateControl { let Coordinates = !if(isResInfo, 0, 1); let LodOrClampOrMip = mip; let HasD16 = has_d16; @@ -180,26 +307,16 @@ multiclass MIMG_NoSampler <bits<7> op, string asm, bit has_d16, bit mip = 0, defm _V3 : MIMG_NoSampler_Src_Helper <op, asm, VReg_96, 0>; let VDataDwords = 4 in defm _V4 : MIMG_NoSampler_Src_Helper <op, asm, VReg_128, 0>; - let VDataDwords = 8 in - defm _V8 : MIMG_NoSampler_Src_Helper <op, asm, VReg_256, 0>; + let VDataDwords = 5 in + defm _V5 : MIMG_NoSampler_Src_Helper <op, asm, VReg_160, 0>; } } -class MIMG_Store_Helper <bits<7> op, string asm, +class MIMG_Store_Helper <bits<8> op, string asm, RegisterClass data_rc, RegisterClass addr_rc, string dns = ""> - : MIMG <(outs), dns>, - MIMGe<op> { - let ssamp = 0; - let d16 = !if(BaseOpcode.HasD16, ?, 0); - - let mayLoad = 0; - let mayStore = 1; - let hasSideEffects = 0; - let hasPostISelHook = 0; - let DisableWQM = 1; - + : MIMG_gfx6789<op, (outs), dns> { let InOperandList = !con((ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc, DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da), @@ -208,21 +325,63 @@ class MIMG_Store_Helper <bits<7> op, string asm, #!if(BaseOpcode.HasD16, "$d16", ""); } -multiclass MIMG_Store_Addr_Helper <bits<7> op, string asm, +class MIMG_Store_gfx10<int op, string opcode, + RegisterClass DataRC, RegisterClass AddrRC, + string dns=""> + : MIMG_gfx10<op, (outs), dns> { + let InOperandList = !con((ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256:$srsrc, + DMask:$dmask, Dim:$dim, UNorm:$unorm, DLC:$dlc, + GLC:$glc, SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe), + !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); + let AsmString = opcode#" $vdata, $vaddr0, $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$tfe$lwe" + #!if(BaseOpcode.HasD16, "$d16", ""); +} + +class MIMG_Store_nsa_gfx10<int op, string opcode, + RegisterClass DataRC, int num_addrs, + string dns=""> + : MIMG_nsa_gfx10<op, (outs), num_addrs, dns> { + let 
InOperandList = !con((ins DataRC:$vdata), + AddrIns, + (ins SReg_256:$srsrc, DMask:$dmask, + Dim:$dim, UNorm:$unorm, DLC:$dlc, GLC:$glc, + SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe), + !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); + let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$tfe$lwe" + #!if(BaseOpcode.HasD16, "$d16", ""); +} + +multiclass MIMG_Store_Addr_Helper <int op, string asm, RegisterClass data_rc, bit enableDisasm> { - let VAddrDwords = 1 in - def NAME # _V1 : MIMG_Store_Helper <op, asm, data_rc, VGPR_32, - !if(enableDisasm, "AMDGPU", "")>; - let VAddrDwords = 2 in - def NAME # _V2 : MIMG_Store_Helper <op, asm, data_rc, VReg_64>; - let VAddrDwords = 3 in - def NAME # _V3 : MIMG_Store_Helper <op, asm, data_rc, VReg_96>; - let VAddrDwords = 4 in - def NAME # _V4 : MIMG_Store_Helper <op, asm, data_rc, VReg_128>; -} - -multiclass MIMG_Store <bits<7> op, string asm, bit has_d16, bit mip = 0> { + let mayLoad = 0, mayStore = 1, hasSideEffects = 0, hasPostISelHook = 0, + DisableWQM = 1, ssamp = 0 in { + let VAddrDwords = 1 in { + def _V1 : MIMG_Store_Helper <op, asm, data_rc, VGPR_32, + !if(enableDisasm, "AMDGPU", "")>; + def _V1_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VGPR_32, + !if(enableDisasm, "AMDGPU", "")>; + } + let VAddrDwords = 2 in { + def _V2 : MIMG_Store_Helper <op, asm, data_rc, VReg_64>; + def _V2_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VReg_64>; + def _V2_nsa_gfx10 : MIMG_Store_nsa_gfx10 <op, asm, data_rc, 2>; + } + let VAddrDwords = 3 in { + def _V3 : MIMG_Store_Helper <op, asm, data_rc, VReg_96>; + def _V3_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VReg_96>; + def _V3_nsa_gfx10 : MIMG_Store_nsa_gfx10 <op, asm, data_rc, 3>; + } + let VAddrDwords = 4 in { + def _V4 : MIMG_Store_Helper <op, asm, data_rc, VReg_128>; + def _V4_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VReg_128>; + def _V4_nsa_gfx10 : MIMG_Store_nsa_gfx10 <op, asm, data_rc, 4, + !if(enableDisasm, "AMDGPU", "")>; + } + } +} + +multiclass MIMG_Store <bits<8> op, string asm, bit has_d16, bit mip = 0> { def "" : MIMGBaseOpcode { let Store = 1; let LodOrClampOrMip = mip; @@ -241,15 +400,9 @@ multiclass MIMG_Store <bits<7> op, string asm, bit has_d16, bit mip = 0> { } } -class MIMG_Atomic_Helper <string asm, RegisterClass data_rc, - RegisterClass addr_rc, string dns="", - bit enableDasm = 0> - : MIMG <(outs data_rc:$vdst), !if(enableDasm, dns, "")> { - let mayLoad = 1; - let mayStore = 1; - let hasSideEffects = 1; // FIXME: Remove this - let hasPostISelHook = 0; - let DisableWQM = 1; +class MIMG_Atomic_gfx6789_base <bits<8> op, string asm, RegisterClass data_rc, + RegisterClass addr_rc, string dns=""> + : MIMG_gfx6789 <op, (outs data_rc:$vdst), dns> { let Constraints = "$vdst = $vdata"; let AsmMatchConverter = "cvtMIMGAtomic"; @@ -259,39 +412,80 @@ class MIMG_Atomic_Helper <string asm, RegisterClass data_rc, let AsmString = asm#" $vdst, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da"; } -multiclass MIMG_Atomic_Helper_m <mimg op, string asm, RegisterClass data_rc, - RegisterClass addr_rc, bit enableDasm = 0> { - let ssamp = 0, d16 = 0 in { - def _si : MIMG_Atomic_Helper<asm, data_rc, addr_rc, "SICI", enableDasm>, - SIMCInstr<NAME, SIEncodingFamily.SI>, - MIMGe<op.SI> { - let AssemblerPredicates = [isSICI]; - let DisableDecoder = DisableSIDecoder; - } +class MIMG_Atomic_si<mimg op, string asm, RegisterClass data_rc, + RegisterClass addr_rc, bit enableDasm = 0> + : MIMG_Atomic_gfx6789_base<op.SI_GFX10, asm, data_rc, addr_rc, + !if(enableDasm, "GFX6GFX7", "")> { 
+ let AssemblerPredicates = [isGFX6GFX7]; +} - def _vi : MIMG_Atomic_Helper<asm, data_rc, addr_rc, "VI", enableDasm>, - SIMCInstr<NAME, SIEncodingFamily.VI>, - MIMGe<op.VI> { - let AssemblerPredicates = [isVI]; - let DisableDecoder = DisableVIDecoder; - let MIMGEncoding = MIMGEncGfx8; - } - } +class MIMG_Atomic_vi<mimg op, string asm, RegisterClass data_rc, + RegisterClass addr_rc, bit enableDasm = 0> + : MIMG_Atomic_gfx6789_base<op.VI, asm, data_rc, addr_rc, !if(enableDasm, "GFX8", "")> { + let AssemblerPredicates = [isGFX8GFX9]; + let MIMGEncoding = MIMGEncGfx8; +} + +class MIMG_Atomic_gfx10<mimg op, string opcode, + RegisterClass DataRC, RegisterClass AddrRC, + bit enableDisasm = 0> + : MIMG_gfx10<!cast<int>(op.SI_GFX10), (outs DataRC:$vdst), + !if(enableDisasm, "AMDGPU", "")> { + let Constraints = "$vdst = $vdata"; + let AsmMatchConverter = "cvtMIMGAtomic"; + + let InOperandList = (ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256:$srsrc, + DMask:$dmask, Dim:$dim, UNorm:$unorm, DLC:$dlc, + GLC:$glc, SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe); + let AsmString = opcode#" $vdst, $vaddr0, $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$tfe$lwe"; +} + +class MIMG_Atomic_nsa_gfx10<mimg op, string opcode, + RegisterClass DataRC, int num_addrs, + bit enableDisasm = 0> + : MIMG_nsa_gfx10<!cast<int>(op.SI_GFX10), (outs DataRC:$vdst), num_addrs, + !if(enableDisasm, "AMDGPU", "")> { + let Constraints = "$vdst = $vdata"; + let AsmMatchConverter = "cvtMIMGAtomic"; + + let InOperandList = !con((ins DataRC:$vdata), + AddrIns, + (ins SReg_256:$srsrc, DMask:$dmask, + Dim:$dim, UNorm:$unorm, DLC:$dlc, GLC:$glc, + SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe)); + let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$tfe$lwe"; } multiclass MIMG_Atomic_Addr_Helper_m <mimg op, string asm, RegisterClass data_rc, bit enableDasm = 0> { - // _V* variants have different address size, but the size is not encoded. - // So only one variant can be disassembled. V1 looks the safest to decode. 
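The comment removed just above records the constraint the rewritten multiclass still honors: pre-gfx10 MIMG encodings do not record the address size, so exactly one VAddrDwords variant per opcode can own the disassembler table entry, and _V1 is the safe default. A toy illustration of that single-default resolution (all names illustrative, not the real decoder):

    #include <cstdio>

    // Several variants share one binary opcode but differ only in address
    // register width, which the encoding does not record. A disassembler
    // must pick a single default; per the comment above, _V1 is chosen.
    struct Variant { const char *name; unsigned vaddrDwords; bool decodeDefault; };

    static const Variant AtomicAddVariants[] = {
        {"IMAGE_ATOMIC_ADD_V1", 1, true},   // owns the decode entry
        {"IMAGE_ATOMIC_ADD_V2", 2, false},
        {"IMAGE_ATOMIC_ADD_V3", 3, false},
        {"IMAGE_ATOMIC_ADD_V4", 4, false},
    };

    static const Variant *decode() {
      for (const Variant &V : AtomicAddVariants)
        if (V.decodeDefault)
          return &V;
      return nullptr;
    }

    int main() {
      std::printf("decoded as %s\n", decode()->name);
    }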
- let VAddrDwords = 1 in - defm _V1 : MIMG_Atomic_Helper_m <op, asm, data_rc, VGPR_32, enableDasm>; - let VAddrDwords = 2 in - defm _V2 : MIMG_Atomic_Helper_m <op, asm, data_rc, VReg_64>; - let VAddrDwords = 3 in - defm _V3 : MIMG_Atomic_Helper_m <op, asm, data_rc, VReg_96>; - let VAddrDwords = 4 in - defm _V4 : MIMG_Atomic_Helper_m <op, asm, data_rc, VReg_128>; + let hasSideEffects = 1, // FIXME: remove this + mayLoad = 1, mayStore = 1, hasPostISelHook = 0, DisableWQM = 1, + ssamp = 0 in { + let VAddrDwords = 1 in { + def _V1_si : MIMG_Atomic_si <op, asm, data_rc, VGPR_32, enableDasm>; + def _V1_vi : MIMG_Atomic_vi <op, asm, data_rc, VGPR_32, enableDasm>; + def _V1_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VGPR_32, enableDasm>; + } + let VAddrDwords = 2 in { + def _V2_si : MIMG_Atomic_si <op, asm, data_rc, VReg_64, 0>; + def _V2_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_64, 0>; + def _V2_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_64, 0>; + def _V2_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 2, 0>; + } + let VAddrDwords = 3 in { + def _V3_si : MIMG_Atomic_si <op, asm, data_rc, VReg_96, 0>; + def _V3_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_96, 0>; + def _V3_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_96, 0>; + def _V3_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 3, 0>; + } + let VAddrDwords = 4 in { + def _V4_si : MIMG_Atomic_si <op, asm, data_rc, VReg_128, 0>; + def _V4_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_128, 0>; + def _V4_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_128, 0>; + def _V4_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 4, enableDasm>; + } + } } multiclass MIMG_Atomic <mimg op, string asm, bit isCmpSwap = 0> { // 64-bit atomics @@ -311,12 +505,9 @@ multiclass MIMG_Atomic <mimg op, string asm, bit isCmpSwap = 0> { // 64-bit atom } } -class MIMG_Sampler_Helper <bits<7> op, string asm, RegisterClass dst_rc, +class MIMG_Sampler_Helper <bits<8> op, string asm, RegisterClass dst_rc, RegisterClass src_rc, string dns=""> - : MIMG <(outs dst_rc:$vdata), dns>, - MIMGe<op> { - let d16 = !if(BaseOpcode.HasD16, ?, 0); - + : MIMG_gfx6789 <op, (outs dst_rc:$vdata), dns> { let InOperandList = !con((ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp, DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da), @@ -325,6 +516,33 @@ class MIMG_Sampler_Helper <bits<7> op, string asm, RegisterClass dst_rc, #!if(BaseOpcode.HasD16, "$d16", ""); } +class MIMG_Sampler_gfx10<int op, string opcode, + RegisterClass DataRC, RegisterClass AddrRC, + string dns=""> + : MIMG_gfx10<op, (outs DataRC:$vdata), dns> { + let InOperandList = !con((ins AddrRC:$vaddr0, SReg_256:$srsrc, SReg_128:$ssamp, + DMask:$dmask, Dim:$dim, UNorm:$unorm, DLC:$dlc, + GLC:$glc, SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe), + !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); + let AsmString = opcode#" $vdata, $vaddr0, $srsrc, $ssamp$dmask$dim$unorm" + #"$dlc$glc$slc$r128$tfe$lwe" + #!if(BaseOpcode.HasD16, "$d16", ""); +} + +class MIMG_Sampler_nsa_gfx10<int op, string opcode, + RegisterClass DataRC, int num_addrs, + string dns=""> + : MIMG_nsa_gfx10<op, (outs DataRC:$vdata), num_addrs, dns> { + let InOperandList = !con(AddrIns, + (ins SReg_256:$srsrc, SReg_128:$ssamp, DMask:$dmask, + Dim:$dim, UNorm:$unorm, DLC:$dlc, GLC:$glc, + SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe), + !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); + let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc, $ssamp$dmask$dim$unorm" + #"$dlc$glc$slc$r128$tfe$lwe" + 
#!if(BaseOpcode.HasD16, "$d16", ""); +} + class MIMGAddrSize<int dw, bit enable_disasm> { int NumWords = dw; @@ -341,6 +559,11 @@ class MIMGAddrSize<int dw, bit enable_disasm> { bit Disassemble = enable_disasm; } +// Return whether x is in lst. +class isIntInList<int x, list<int> lst> { + bit ret = !foldl(0, lst, lhs, y, !or(lhs, !eq(x, y))); +} + // Return whether a value inside the range [min, max] (endpoints inclusive) // is in the given list. class isRangeInList<int min, int max, list<int> lst> { @@ -376,16 +599,41 @@ class MIMG_Sampler_AddrSizes<AMDGPUSampleVariant sample> { !listconcat(lhs.List, [MIMGAddrSize<dw, !empty(lhs.List)>]), !if(!eq(dw, 3), 3, !add(dw, 1))>, // we still need _V4 for codegen w/ 3 dwords lhs)).List; -} -multiclass MIMG_Sampler_Src_Helper <bits<7> op, string asm, + // For NSA, generate machine instructions for all possible numbers of words + // except 1 (which is already covered by the non-NSA case). + // The disassembler defaults to the largest number of arguments among the + // variants with the same number of NSA words, and custom code then derives + // the exact variant based on the sample variant and the image dimension. + list<MIMGAddrSize> NSAInstrs = + !foldl([]<MIMGAddrSize>, [[12, 11, 10], [9, 8, 7, 6], [5, 4, 3, 2]], prev, nsa_group, + !listconcat(prev, + !foldl([]<MIMGAddrSize>, nsa_group, lhs, dw, + !if(isIntInList<dw, AllNumAddrWords>.ret, + !listconcat(lhs, [MIMGAddrSize<dw, !empty(lhs)>]), + lhs)))); +} + +multiclass MIMG_Sampler_Src_Helper <bits<8> op, string asm, AMDGPUSampleVariant sample, RegisterClass dst_rc, bit enableDisasm = 0> { foreach addr = MIMG_Sampler_AddrSizes<sample>.MachineInstrs in { - let VAddrDwords = addr.NumWords in - def _V # addr.NumWords - : MIMG_Sampler_Helper <op, asm, dst_rc, addr.RegClass, - !if(!and(enableDisasm, addr.Disassemble), "AMDGPU", "")>; + let VAddrDwords = addr.NumWords in { + def _V # addr.NumWords + : MIMG_Sampler_Helper <op, asm, dst_rc, addr.RegClass, + !if(!and(enableDisasm, addr.Disassemble), "AMDGPU", "")>; + def _V # addr.NumWords # _gfx10 + : MIMG_Sampler_gfx10 <op, asm, dst_rc, addr.RegClass, + !if(!and(enableDisasm, addr.Disassemble), "AMDGPU", "")>; + } + } + + foreach addr = MIMG_Sampler_AddrSizes<sample>.NSAInstrs in { + let VAddrDwords = addr.NumWords in { + def _V # addr.NumWords # _nsa_gfx10 + : MIMG_Sampler_nsa_gfx10<op, asm, dst_rc, addr.NumWords, + !if(!and(enableDisasm, addr.Disassemble), "AMDGPU", "")>; + } } } @@ -397,7 +645,7 @@ class MIMG_Sampler_BaseOpcode<AMDGPUSampleVariant sample> let LodOrClampOrMip = !ne(sample.LodOrClamp, ""); } -multiclass MIMG_Sampler <bits<7> op, AMDGPUSampleVariant sample, bit wqm = 0, +multiclass MIMG_Sampler <bits<8> op, AMDGPUSampleVariant sample, bit wqm = 0, bit isGetLod = 0, string asm = "image_sample"#sample.LowerCaseMod> { def "" : MIMG_Sampler_BaseOpcode<sample> { @@ -414,15 +662,15 @@ multiclass MIMG_Sampler <bits<7> op, AMDGPUSampleVariant sample, bit wqm = 0, defm _V3 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_96>; let VDataDwords = 4 in defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_128>; - let VDataDwords = 8 in - defm _V8 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_256>; + let VDataDwords = 5 in + defm _V5 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_160>; } } -multiclass MIMG_Sampler_WQM <bits<7> op, AMDGPUSampleVariant sample> +multiclass MIMG_Sampler_WQM <bits<8> op, AMDGPUSampleVariant sample> : MIMG_Sampler<op, sample, 1>; -multiclass MIMG_Gather <bits<7> op, AMDGPUSampleVariant sample, bit wqm = 0, +multiclass 
MIMG_Gather <bits<8> op, AMDGPUSampleVariant sample, bit wqm = 0, string asm = "image_gather4"#sample.LowerCaseMod> { def "" : MIMG_Sampler_BaseOpcode<sample> { let HasD16 = 1; @@ -435,12 +683,12 @@ multiclass MIMG_Gather <bits<7> op, AMDGPUSampleVariant sample, bit wqm = 0, defm _V2 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_64>; /* for packed D16 only */ let VDataDwords = 4 in defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_128, 1>; - let VDataDwords = 8 in - defm _V8 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_256>; + let VDataDwords = 5 in + defm _V5 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_160>; } } -multiclass MIMG_Gather_WQM <bits<7> op, AMDGPUSampleVariant sample> +multiclass MIMG_Gather_WQM <bits<8> op, AMDGPUSampleVariant sample> : MIMG_Gather<op, sample, 1>; //===----------------------------------------------------------------------===// @@ -473,9 +721,11 @@ defm IMAGE_ATOMIC_OR : MIMG_Atomic <mimg<0x19>, "image_atomic_or">; defm IMAGE_ATOMIC_XOR : MIMG_Atomic <mimg<0x1a>, "image_atomic_xor">; defm IMAGE_ATOMIC_INC : MIMG_Atomic <mimg<0x1b>, "image_atomic_inc">; defm IMAGE_ATOMIC_DEC : MIMG_Atomic <mimg<0x1c>, "image_atomic_dec">; +//let FPAtomic = 1 in { //def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"image_atomic_fcmpswap", 0x0000001d, 1>; -- not on VI //def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"image_atomic_fmin", 0x0000001e>; -- not on VI //def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"image_atomic_fmax", 0x0000001f>; -- not on VI +//} // End let FPAtomic = 1 defm IMAGE_SAMPLE : MIMG_Sampler_WQM <0x00000020, AMDGPUSample>; defm IMAGE_SAMPLE_CL : MIMG_Sampler_WQM <0x00000021, AMDGPUSample_cl>; defm IMAGE_SAMPLE_D : MIMG_Sampler <0x00000022, AMDGPUSample_d>; @@ -581,3 +831,7 @@ def : MIMGLZMapping<IMAGE_GATHER4_L, IMAGE_GATHER4_LZ>; def : MIMGLZMapping<IMAGE_GATHER4_C_L, IMAGE_GATHER4_C_LZ>; def : MIMGLZMapping<IMAGE_GATHER4_L_O, IMAGE_GATHER4_LZ_O>; def : MIMGLZMapping<IMAGE_GATHER4_C_L_O, IMAGE_GATHER4_C_LZ_O>; + +// MIP to NONMIP Optimization Mapping +def : MIMGMIPMapping<IMAGE_LOAD_MIP, IMAGE_LOAD>; +def : MIMGMIPMapping<IMAGE_STORE_MIP, IMAGE_STORE>; diff --git a/lib/Target/AMDGPU/R600.td b/lib/Target/AMDGPU/R600.td index 5c9c1c1ed504..1d11da969474 100644 --- a/lib/Target/AMDGPU/R600.td +++ b/lib/Target/AMDGPU/R600.td @@ -1,9 +1,8 @@ //===-- R600.td - R600 Tablegen files ----------------------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/R600AsmPrinter.cpp b/lib/Target/AMDGPU/R600AsmPrinter.cpp index 68f8c30775b8..3fb18862fca8 100644 --- a/lib/Target/AMDGPU/R600AsmPrinter.cpp +++ b/lib/Target/AMDGPU/R600AsmPrinter.cpp @@ -1,9 +1,8 @@ //===-- R600AsmPrinter.cpp - R600 Assebly printer ------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
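One note on the MIMGMIPMapping entries added at the end of the MIMGInstructions.td hunk above: the table pairs each *_MIP opcode with its plain form, presumably so a load or store whose mip operand is known to be zero can be rewritten to the cheaper non-MIP opcode. A toy lookup in that shape (the entries mirror the two defs; the zero-mip folding condition is an assumption, not stated in the patch):

    #include <cstdio>
    #include <cstring>

    // Sketch of the MIP -> NONMIP mapping: pair each *_MIP opcode with its
    // plain form so a constant-zero mip level can be folded away.
    struct MIPMapping { const char *mip; const char *nonMip; };

    static const MIPMapping Table[] = {
        {"IMAGE_LOAD_MIP", "IMAGE_LOAD"},
        {"IMAGE_STORE_MIP", "IMAGE_STORE"},
    };

    static const char *getNonMipOpcode(const char *mipOpcode) {
      for (const MIPMapping &M : Table)
        if (!std::strcmp(M.mip, mipOpcode))
          return M.nonMip;
      return nullptr; // no mapping: keep the original opcode
    }

    int main() {
      std::printf("%s\n", getNonMipOpcode("IMAGE_LOAD_MIP")); // IMAGE_LOAD
    }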
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/R600AsmPrinter.h b/lib/Target/AMDGPU/R600AsmPrinter.h index 079fc707b03c..0da9526d716e 100644 --- a/lib/Target/AMDGPU/R600AsmPrinter.h +++ b/lib/Target/AMDGPU/R600AsmPrinter.h @@ -1,9 +1,8 @@ //===-- R600AsmPrinter.h - Print R600 assembly code -------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/R600ClauseMergePass.cpp b/lib/Target/AMDGPU/R600ClauseMergePass.cpp index 0c62d6a4b3d9..290a960ae901 100644 --- a/lib/Target/AMDGPU/R600ClauseMergePass.cpp +++ b/lib/Target/AMDGPU/R600ClauseMergePass.cpp @@ -1,9 +1,8 @@ //===-- R600ClauseMergePass - Merge consecutive CF_ALU -------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp b/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp index a19020276f35..8098b81d1ea2 100644 --- a/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp +++ b/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp @@ -1,9 +1,8 @@ //===- R600ControlFlowFinalizer.cpp - Finalize Control Flow Inst ----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/R600Defines.h b/lib/Target/AMDGPU/R600Defines.h index 0d33d82e8e0f..d72534908dcf 100644 --- a/lib/Target/AMDGPU/R600Defines.h +++ b/lib/Target/AMDGPU/R600Defines.h @@ -1,9 +1,8 @@ //===-- R600Defines.h - R600 Helper Macros ----------------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // /// \file //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp b/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp index 679cf18d2c20..b97e3c8b8dd7 100644 --- a/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp +++ b/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp @@ -1,9 +1,8 @@ //===-- R600EmitClauseMarkers.cpp - Emit CF_ALU ---------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp b/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp index b924ff019dd1..c6e8a060d8a0 100644 --- a/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp +++ b/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp @@ -1,9 +1,8 @@ //===- R600ExpandSpecialInstrs.cpp - Expand special instructions ----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/R600FrameLowering.cpp b/lib/Target/AMDGPU/R600FrameLowering.cpp index 37787b3c5f72..d9aa9ebe878d 100644 --- a/lib/Target/AMDGPU/R600FrameLowering.cpp +++ b/lib/Target/AMDGPU/R600FrameLowering.cpp @@ -1,9 +1,8 @@ //===----------------------- R600FrameLowering.cpp ------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //==-----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/R600FrameLowering.h b/lib/Target/AMDGPU/R600FrameLowering.h index fe367d73682f..950e238f4979 100644 --- a/lib/Target/AMDGPU/R600FrameLowering.h +++ b/lib/Target/AMDGPU/R600FrameLowering.h @@ -1,9 +1,8 @@ //===--------------------- R600FrameLowering.h ------------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp index e2a0f05d2b34..f80a53ba1dc6 100644 --- a/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -1,9 +1,8 @@ //===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -1240,11 +1239,13 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); + const bool TruncatingStore = StoreNode->isTruncatingStore(); + // Neither LOCAL nor PRIVATE can do vectors at the moment - if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS) && + if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS || + TruncatingStore) && VT.isVector()) { - if ((AS == AMDGPUAS::PRIVATE_ADDRESS) && - StoreNode->isTruncatingStore()) { + if ((AS == AMDGPUAS::PRIVATE_ADDRESS) && TruncatingStore) { // Add an extra level of chain to isolate this vector SDValue NewChain = DAG.getNode(AMDGPUISD::DUMMY_CHAIN, DL, MVT::Other, Chain); // TODO: can the chain be replaced without creating a new store? @@ -1260,7 +1261,8 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { unsigned Align = StoreNode->getAlignment(); if (Align < MemVT.getStoreSize() && - !allowsMisalignedMemoryAccesses(MemVT, AS, Align, nullptr)) { + !allowsMisalignedMemoryAccesses( + MemVT, AS, Align, StoreNode->getMemOperand()->getFlags(), nullptr)) { return expandUnalignedStore(StoreNode, DAG); } @@ -1270,7 +1272,7 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { if (AS == AMDGPUAS::GLOBAL_ADDRESS) { // It is beneficial to create MSKOR here instead of combiner to avoid // artificial dependencies introduced by RMW - if (StoreNode->isTruncatingStore()) { + if (TruncatingStore) { assert(VT.bitsLE(MVT::i32)); SDValue MaskConstant; if (MemVT == MVT::i8) { @@ -1310,8 +1312,8 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { // Convert pointer from byte address to dword address. 
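The MSKOR path in the hunk above lowers sub-dword truncating stores as a masked read-modify-write: pick the byte or halfword lane from the low address bits, clear it, and merge in the shifted value; the dword address itself comes from dropping the low two address bits, which is what the DWORDADDR conversion at this point performs. A standalone sketch of that merge, with illustrative names:

    #include <cassert>
    #include <cstdint>

    // Store an 8- or 16-bit value into a dword-addressed word using the
    // mask/shift/merge pattern the MSKOR lowering emits.
    static uint32_t storeSubDword(uint32_t word, uint32_t byteAddr,
                                  uint32_t value,
                                  uint32_t mask /* 0xFF or 0xFFFF */) {
      uint32_t shift = (byteAddr & 3) * 8;        // lane within the dword
      uint32_t cleared = word & ~(mask << shift); // knock out the old lane
      return cleared | ((value & mask) << shift); // merge the new bits
    }

    int main() {
      // Store byte 0xAB at byte offset 2 of the dword 0x11223344.
      uint32_t w = storeSubDword(0x11223344u, 2, 0xAB, 0xFF);
      assert(w == 0x11AB3344u);
      // The dword address would be byteAddr >> 2.
      return 0;
    }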
Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, PtrVT, DWordAddr); - if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) { - llvm_unreachable("Truncated and indexed stores not supported yet"); + if (StoreNode->isIndexed()) { + llvm_unreachable("Indexed stores not supported yet"); } else { Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand()); } @@ -1662,10 +1664,9 @@ bool R600TargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT, return true; } -bool R600TargetLowering::allowsMisalignedMemoryAccesses(EVT VT, - unsigned AddrSpace, - unsigned Align, - bool *IsFast) const { +bool R600TargetLowering::allowsMisalignedMemoryAccesses( + EVT VT, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags, + bool *IsFast) const { if (IsFast) *IsFast = false; @@ -1713,6 +1714,12 @@ static SDValue CompactSwizzlableVector( if (NewBldVec[i].isUndef()) continue; + // Fix spurious warning with gcc 7.3 -O3 + // warning: array subscript is above array bounds [-Warray-bounds] + // if (NewBldVec[i] == NewBldVec[j]) { + // ~~~~~~~~~~~^ + if (i >= 4) + continue; for (unsigned j = 0; j < i; j++) { if (NewBldVec[i] == NewBldVec[j]) { NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType()); diff --git a/lib/Target/AMDGPU/R600ISelLowering.h b/lib/Target/AMDGPU/R600ISelLowering.h index 767c3c7bd5bf..b560da8e91d9 100644 --- a/lib/Target/AMDGPU/R600ISelLowering.h +++ b/lib/Target/AMDGPU/R600ISelLowering.h @@ -1,9 +1,8 @@ //===-- R600ISelLowering.h - R600 DAG Lowering Interface -*- C++ -*--------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -50,9 +49,10 @@ public: bool canMergeStoresTo(unsigned AS, EVT MemVT, const SelectionDAG &DAG) const override; - bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, - unsigned Align, - bool *IsFast) const override; + bool allowsMisalignedMemoryAccesses( + EVT VT, unsigned AS, unsigned Align, + MachineMemOperand::Flags Flags = MachineMemOperand::MONone, + bool *IsFast = nullptr) const override; private: unsigned Gen; diff --git a/lib/Target/AMDGPU/R600InstrFormats.td b/lib/Target/AMDGPU/R600InstrFormats.td index 687a9affa138..f62e6313b148 100644 --- a/lib/Target/AMDGPU/R600InstrFormats.td +++ b/lib/Target/AMDGPU/R600InstrFormats.td @@ -1,9 +1,8 @@ //===-- R600InstrFormats.td - R600 Instruction Encodings ------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
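The allowsMisalignedMemoryAccesses hunks above widen the hook with a MachineMemOperand::Flags parameter; because the header declares both new parameters with defaults, existing three-argument callers keep compiling. A self-contained mock of that source-compatibility property (types and the align >= 4 policy are stand-ins, not R600's real rule):

    #include <cassert>

    // Mock of the widened hook: Flags and IsFast are defaulted, so
    // pre-existing three-argument call sites compile unchanged.
    enum MemFlags { MONone = 0, MOVolatile = 1 };

    struct TargetLoweringMock {
      bool allowsMisalignedMemoryAccesses(int vt, unsigned addrSpace,
                                          unsigned align,
                                          MemFlags flags = MONone,
                                          bool *isFast = nullptr) const {
        if (isFast)
          *isFast = false;
        (void)vt; (void)addrSpace; (void)flags;
        return align >= 4; // placeholder policy
      }
    };

    int main() {
      TargetLoweringMock TL;
      assert(TL.allowsMisalignedMemoryAccesses(0, 0, 4)); // old-style call
      bool fast = true;
      assert(!TL.allowsMisalignedMemoryAccesses(0, 0, 2, MOVolatile, &fast));
      assert(!fast);
    }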
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/R600InstrInfo.cpp b/lib/Target/AMDGPU/R600InstrInfo.cpp index 9cc3e5f3c314..d9e839fe2035 100644 --- a/lib/Target/AMDGPU/R600InstrInfo.cpp +++ b/lib/Target/AMDGPU/R600InstrInfo.cpp @@ -1,9 +1,8 @@ //===-- R600InstrInfo.cpp - R600 Instruction Information ------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -402,6 +401,7 @@ Swizzle(std::vector<std::pair<int, unsigned>> Src, } static unsigned getTransSwizzle(R600InstrInfo::BankSwizzle Swz, unsigned Op) { + assert(Op < 3 && "Out of range swizzle index"); switch (Swz) { case R600InstrInfo::ALU_VEC_012_SCL_210: { unsigned Cycles[3] = { 2, 1, 0}; diff --git a/lib/Target/AMDGPU/R600InstrInfo.h b/lib/Target/AMDGPU/R600InstrInfo.h index e6e34dc125f4..00d96c9676aa 100644 --- a/lib/Target/AMDGPU/R600InstrInfo.h +++ b/lib/Target/AMDGPU/R600InstrInfo.h @@ -1,9 +1,8 @@ //===-- R600InstrInfo.h - R600 Instruction Info Interface -------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/R600Instructions.td b/lib/Target/AMDGPU/R600Instructions.td index 10e873755222..f40eece859ee 100644 --- a/lib/Target/AMDGPU/R600Instructions.td +++ b/lib/Target/AMDGPU/R600Instructions.td @@ -1,9 +1,8 @@ //===-- R600Instructions.td - R600 Instruction defs -------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -296,6 +295,34 @@ class VTX_READ <string name, dag outs, list<dag> pattern> let VTXInst = 1; } +// FIXME: Deprecated. 
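The az_extload fragments defined below accept a load whose extension type is ZEXTLOAD or EXTLOAD (any-extend, high bits don't-care) and reject sign-extending and ordinary loads. The same predicate in plain C++ terms, with a stand-in enum for ISD::LoadExtType:

    #include <cassert>

    // Stand-in for ISD::LoadExtType; only the cases the predicate examines.
    enum LoadExtType { NON_EXTLOAD, EXTLOAD, SEXTLOAD, ZEXTLOAD };

    // az_extload: zero-extending or any-extending loads only.
    static bool isAZExtLoad(LoadExtType ext) {
      return ext == ZEXTLOAD || ext == EXTLOAD;
    }

    int main() {
      assert(isAZExtLoad(ZEXTLOAD));
      assert(isAZExtLoad(EXTLOAD));
      assert(!isAZExtLoad(SEXTLOAD));
      assert(!isAZExtLoad(NON_EXTLOAD));
    }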
+class LocalLoad <SDPatternOperator op> : LoadFrag <op>, LocalAddress; + +class AZExtLoadBase <SDPatternOperator ld_node>: PatFrag<(ops node:$ptr), + (ld_node node:$ptr), [{ + LoadSDNode *L = cast<LoadSDNode>(N); + return L->getExtensionType() == ISD::ZEXTLOAD || + L->getExtensionType() == ISD::EXTLOAD; +}]>; + +def az_extload : AZExtLoadBase <unindexedload>; + +def az_extloadi8 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{ + return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8; +}]>; + +def az_extloadi16 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{ + return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16; +}]>; + +def az_extloadi32 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{ + return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i32; +}]>; + +// FIXME: These are deprecated +def az_extloadi8_local : LocalLoad <az_extloadi8>; +def az_extloadi16_local : LocalLoad <az_extloadi16>; + class LoadParamFrag <PatFrag load_type> : PatFrag < (ops node:$ptr), (load_type node:$ptr), [{ return isConstantLoad(cast<LoadSDNode>(N), 0) || diff --git a/lib/Target/AMDGPU/R600MachineFunctionInfo.cpp b/lib/Target/AMDGPU/R600MachineFunctionInfo.cpp index 3ca319c6c6c2..65011a9eadf8 100644 --- a/lib/Target/AMDGPU/R600MachineFunctionInfo.cpp +++ b/lib/Target/AMDGPU/R600MachineFunctionInfo.cpp @@ -1,9 +1,8 @@ //===-- R600MachineFunctionInfo.cpp - R600 Machine Function Info-*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // /// \file //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/R600MachineFunctionInfo.h b/lib/Target/AMDGPU/R600MachineFunctionInfo.h index 29ac0920f997..6a5ac9023329 100644 --- a/lib/Target/AMDGPU/R600MachineFunctionInfo.h +++ b/lib/Target/AMDGPU/R600MachineFunctionInfo.h @@ -1,9 +1,8 @@ //===-- R600MachineFunctionInfo.h - R600 Machine Function Info ----*- C++ -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/R600MachineScheduler.cpp b/lib/Target/AMDGPU/R600MachineScheduler.cpp index 7769a35aadce..34267a909b5e 100644 --- a/lib/Target/AMDGPU/R600MachineScheduler.cpp +++ b/lib/Target/AMDGPU/R600MachineScheduler.cpp @@ -1,9 +1,8 @@ //===-- R600MachineScheduler.cpp - R600 Scheduler Interface -*- C++ -*-----===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/R600MachineScheduler.h b/lib/Target/AMDGPU/R600MachineScheduler.h index 8a9a8d3d1e23..bc66f2ef5907 100644 --- a/lib/Target/AMDGPU/R600MachineScheduler.h +++ b/lib/Target/AMDGPU/R600MachineScheduler.h @@ -1,9 +1,8 @@ //===-- R600MachineScheduler.h - R600 Scheduler Interface -*- C++ -*-------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp b/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp index 7de5e2c9577d..1fe92d2269d3 100644 --- a/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp +++ b/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp @@ -1,9 +1,8 @@ //===- R600OpenCLImageTypeLoweringPass.cpp ------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp index 692451cb8fe0..9f1cb6582b5c 100644 --- a/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp +++ b/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp @@ -1,9 +1,8 @@ //===- R600MergeVectorRegisters.cpp ---------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -57,17 +56,12 @@ using namespace llvm; #define DEBUG_TYPE "vec-merger" -static bool -isImplicitlyDef(MachineRegisterInfo &MRI, unsigned Reg) { - for (MachineRegisterInfo::def_instr_iterator It = MRI.def_instr_begin(Reg), - E = MRI.def_instr_end(); It != E; ++It) { - return (*It).isImplicitDef(); - } - if (MRI.isReserved(Reg)) { +static bool isImplicitlyDef(MachineRegisterInfo &MRI, unsigned Reg) { + assert(MRI.isSSA()); + if (TargetRegisterInfo::isPhysicalRegister(Reg)) return false; - } - llvm_unreachable("Reg without a def"); - return false; + const MachineInstr *MI = MRI.getUniqueVRegDef(Reg); + return MI && MI->isImplicitDef(); } namespace { diff --git a/lib/Target/AMDGPU/R600Packetizer.cpp b/lib/Target/AMDGPU/R600Packetizer.cpp index 612c62b514fd..df200baf11c1 100644 --- a/lib/Target/AMDGPU/R600Packetizer.cpp +++ b/lib/Target/AMDGPU/R600Packetizer.cpp @@ -1,9 +1,8 @@ //===----- R600Packetizer.cpp - VLIW packetizer ---------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -187,8 +186,8 @@ public: // Does MII and MIJ share the same pred_sel ? int OpI = TII->getOperandIdx(MII->getOpcode(), R600::OpName::pred_sel), OpJ = TII->getOperandIdx(MIJ->getOpcode(), R600::OpName::pred_sel); - unsigned PredI = (OpI > -1)?MII->getOperand(OpI).getReg():0, - PredJ = (OpJ > -1)?MIJ->getOperand(OpJ).getReg():0; + Register PredI = (OpI > -1)?MII->getOperand(OpI).getReg() : Register(), + PredJ = (OpJ > -1)?MIJ->getOperand(OpJ).getReg() : Register(); if (PredI != PredJ) return false; if (SUJ->isSucc(SUI)) { diff --git a/lib/Target/AMDGPU/R600Processors.td b/lib/Target/AMDGPU/R600Processors.td index f39b3dc1bfd4..fff884e4848e 100644 --- a/lib/Target/AMDGPU/R600Processors.td +++ b/lib/Target/AMDGPU/R600Processors.td @@ -1,9 +1,8 @@ //===-- R600Processors.td - R600 Processor definitions --------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
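The isImplicitlyDef rewrite in the R600OptimizeVectorRegisters.cpp hunk above leans on the SSA invariant that a virtual register has at most one defining instruction, so the old def-iterator walk collapses to getUniqueVRegDef plus an isImplicitDef check. A compilable mock of the new shape; every type here is a stand-in, including the physical/virtual register split:

    #include <cassert>

    struct MachineInstr {
      bool implicitDef;
      bool isImplicitDef() const { return implicitDef; }
    };

    struct MachineRegisterInfo {
      const MachineInstr *uniqueDef; // nullptr if the vreg has no def
      bool isSSA() const { return true; }
      const MachineInstr *getUniqueVRegDef(unsigned) const { return uniqueDef; }
    };

    // Mock split: treat small numbers as physical registers.
    static bool isPhysicalRegister(unsigned reg) { return reg < (1u << 30); }

    static bool isImplicitlyDef(const MachineRegisterInfo &MRI, unsigned reg) {
      assert(MRI.isSSA());
      if (isPhysicalRegister(reg))
        return false; // physical registers never count as IMPLICIT_DEF
      const MachineInstr *MI = MRI.getUniqueVRegDef(reg);
      return MI && MI->isImplicitDef();
    }

    int main() {
      MachineInstr def{true};
      MachineRegisterInfo mri{&def};
      assert(isImplicitlyDef(mri, 1u << 31)); // virtual reg, implicit def
      assert(!isImplicitlyDef(mri, 5));       // "physical" reg in this mock
    }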
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -41,23 +40,24 @@ def FeatureCFALUBug : SubtargetFeature<"cfalubug", "GPU has CF_ALU bug" >; -class R600SubtargetFeatureGeneration <string Value, +class R600SubtargetFeatureGeneration <string Value, string FeatureName, list<SubtargetFeature> Implies> : - SubtargetFeatureGeneration <Value, "R600Subtarget", Implies>; + SubtargetFeatureGeneration <Value, FeatureName, "R600Subtarget", Implies>; -def FeatureR600 : R600SubtargetFeatureGeneration<"R600", +def FeatureR600 : R600SubtargetFeatureGeneration<"R600", "r600", [FeatureR600ALUInst, FeatureFetchLimit8, FeatureLocalMemorySize0] >; -def FeatureR700 : R600SubtargetFeatureGeneration<"R700", +def FeatureR700 : R600SubtargetFeatureGeneration<"R700", "r700", [FeatureFetchLimit16, FeatureLocalMemorySize0] >; -def FeatureEvergreen : R600SubtargetFeatureGeneration<"EVERGREEN", +def FeatureEvergreen : R600SubtargetFeatureGeneration<"EVERGREEN", "evergreen", [FeatureFetchLimit16, FeatureLocalMemorySize32768] >; def FeatureNorthernIslands : R600SubtargetFeatureGeneration<"NORTHERN_ISLANDS", + "northern-islands", [FeatureFetchLimit16, FeatureWavefrontSize64, FeatureLocalMemorySize32768] >; diff --git a/lib/Target/AMDGPU/R600RegisterInfo.cpp b/lib/Target/AMDGPU/R600RegisterInfo.cpp index 38933e7616a0..685df74490fe 100644 --- a/lib/Target/AMDGPU/R600RegisterInfo.cpp +++ b/lib/Target/AMDGPU/R600RegisterInfo.cpp @@ -1,9 +1,8 @@ //===-- R600RegisterInfo.cpp - R600 Register Information ------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -68,7 +67,7 @@ const MCPhysReg *R600RegisterInfo::getCalleeSavedRegs( return &CalleeSavedReg; } -unsigned R600RegisterInfo::getFrameRegister(const MachineFunction &MF) const { +Register R600RegisterInfo::getFrameRegister(const MachineFunction &MF) const { return R600::NoRegister; } diff --git a/lib/Target/AMDGPU/R600RegisterInfo.h b/lib/Target/AMDGPU/R600RegisterInfo.h index c4c77172b299..9378b70ca580 100644 --- a/lib/Target/AMDGPU/R600RegisterInfo.h +++ b/lib/Target/AMDGPU/R600RegisterInfo.h @@ -1,9 +1,8 @@ //===-- R600RegisterInfo.h - R600 Register Info Interface ------*- C++ -*--===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -27,7 +26,7 @@ struct R600RegisterInfo final : public R600GenRegisterInfo { BitVector getReservedRegs(const MachineFunction &MF) const override; const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override; - unsigned getFrameRegister(const MachineFunction &MF) const override; + Register getFrameRegister(const MachineFunction &MF) const override; /// get the HW encoding for a register's channel. 
unsigned getHWRegChan(unsigned reg) const; diff --git a/lib/Target/AMDGPU/R600Schedule.td b/lib/Target/AMDGPU/R600Schedule.td index 70fb46c1a7d6..c998fe848193 100644 --- a/lib/Target/AMDGPU/R600Schedule.td +++ b/lib/Target/AMDGPU/R600Schedule.td @@ -1,9 +1,8 @@ //===-- R600Schedule.td - R600 Scheduling definitions ------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/R700Instructions.td b/lib/Target/AMDGPU/R700Instructions.td index 613a0d729bb3..9c9a03209ec2 100644 --- a/lib/Target/AMDGPU/R700Instructions.td +++ b/lib/Target/AMDGPU/R700Instructions.td @@ -1,9 +1,8 @@ //===-- R700Instructions.td - R700 Instruction defs -------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/SIAddIMGInit.cpp b/lib/Target/AMDGPU/SIAddIMGInit.cpp index 69cafef4a351..f8094e35816c 100644 --- a/lib/Target/AMDGPU/SIAddIMGInit.cpp +++ b/lib/Target/AMDGPU/SIAddIMGInit.cpp @@ -1,9 +1,8 @@ //===-- SIAddIMGInit.cpp - Add any required IMG inits ---------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp index 98e9ea662324..b764ca7d7061 100644 --- a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp +++ b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp @@ -1,9 +1,8 @@ //===- SIAnnotateControlFlow.cpp ------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -13,12 +12,13 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "AMDGPUSubtarget.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/LegacyDivergenceAnalysis.h" #include "llvm/Analysis/LoopInfo.h" -#include "llvm/Transforms/Utils/Local.h" +#include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constant.h" @@ -38,6 +38,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" #include <cassert> #include <utility> @@ -56,13 +57,13 @@ class SIAnnotateControlFlow : public FunctionPass { Type *Boolean; Type *Void; - Type *Int64; + Type *IntMask; Type *ReturnStruct; ConstantInt *BoolTrue; ConstantInt *BoolFalse; UndefValue *BoolUndef; - Constant *Int64Zero; + Constant *IntMaskZero; Function *If; Function *Else; @@ -75,6 +76,8 @@ class SIAnnotateControlFlow : public FunctionPass { LoopInfo *LI; + void initialize(Module &M, const GCNSubtarget &ST); + bool isUniform(BranchInst *T); bool isTopOfStack(BasicBlock *BB); @@ -104,8 +107,6 @@ public: SIAnnotateControlFlow() : FunctionPass(ID) {} - bool doInitialization(Module &M) override; - bool runOnFunction(Function &F) override; StringRef getPassName() const override { return "SI annotate control flow"; } @@ -115,6 +116,7 @@ public: AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<LegacyDivergenceAnalysis>(); AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addRequired<TargetPassConfig>(); FunctionPass::getAnalysisUsage(AU); } }; @@ -125,31 +127,34 @@ INITIALIZE_PASS_BEGIN(SIAnnotateControlFlow, DEBUG_TYPE, "Annotate SI Control Flow", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis) +INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) INITIALIZE_PASS_END(SIAnnotateControlFlow, DEBUG_TYPE, "Annotate SI Control Flow", false, false) char SIAnnotateControlFlow::ID = 0; /// Initialize all the types and constants used in the pass -bool SIAnnotateControlFlow::doInitialization(Module &M) { +void SIAnnotateControlFlow::initialize(Module &M, const GCNSubtarget &ST) { LLVMContext &Context = M.getContext(); Void = Type::getVoidTy(Context); Boolean = Type::getInt1Ty(Context); - Int64 = Type::getInt64Ty(Context); - ReturnStruct = StructType::get(Boolean, Int64); + IntMask = ST.isWave32() ? 
Type::getInt32Ty(Context) + : Type::getInt64Ty(Context); + ReturnStruct = StructType::get(Boolean, IntMask); BoolTrue = ConstantInt::getTrue(Context); BoolFalse = ConstantInt::getFalse(Context); BoolUndef = UndefValue::get(Boolean); - Int64Zero = ConstantInt::get(Int64, 0); - - If = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if); - Else = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_else); - IfBreak = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if_break); - Loop = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_loop); - EndCf = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_end_cf); - return false; + IntMaskZero = ConstantInt::get(IntMask, 0); + + If = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if, { IntMask }); + Else = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_else, + { IntMask, IntMask }); + IfBreak = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if_break, + { IntMask, IntMask }); + Loop = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_loop, { IntMask }); + EndCf = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_end_cf, { IntMask }); } /// Is the branch condition uniform or did the StructurizeCFG pass @@ -259,14 +264,23 @@ void SIAnnotateControlFlow::handleLoop(BranchInst *Term) { return; BasicBlock *Target = Term->getSuccessor(1); - PHINode *Broken = PHINode::Create(Int64, 0, "phi.broken", &Target->front()); + PHINode *Broken = PHINode::Create(IntMask, 0, "phi.broken", &Target->front()); Value *Cond = Term->getCondition(); Term->setCondition(BoolTrue); Value *Arg = handleLoopCondition(Cond, Broken, L, Term); - for (BasicBlock *Pred : predecessors(Target)) - Broken->addIncoming(Pred == BB ? Arg : Int64Zero, Pred); + for (BasicBlock *Pred : predecessors(Target)) { + Value *PHIValue = IntMaskZero; + if (Pred == BB) // Remember the value of the previous iteration. + PHIValue = Arg; + // If the backedge from Pred to Target could be executed before the exit + // of the loop at BB, it should not reset or change "Broken", which keeps + // track of the number of threads exited the loop at BB. + else if (L->contains(Pred) && DT->dominates(Pred, BB)) + PHIValue = Broken; + Broken->addIncoming(PHIValue, Pred); + } Term->setCondition(CallInst::Create(Loop, Arg, "", Term)); @@ -308,6 +322,10 @@ bool SIAnnotateControlFlow::runOnFunction(Function &F) { DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); DA = &getAnalysis<LegacyDivergenceAnalysis>(); + TargetPassConfig &TPC = getAnalysis<TargetPassConfig>(); + const TargetMachine &TM = TPC.getTM<TargetMachine>(); + + initialize(*F.getParent(), TM.getSubtarget<GCNSubtarget>(F)); for (df_iterator<BasicBlock *> I = df_begin(&F.getEntryBlock()), E = df_end(&F.getEntryBlock()); I != E; ++I) { diff --git a/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp b/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp deleted file mode 100644 index 7e884ad93a23..000000000000 --- a/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp +++ /dev/null @@ -1,97 +0,0 @@ -//===--- SIDebuggerInsertNops.cpp - Inserts nops for debugger usage -------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// Inserts one nop instruction for each high level source statement for -/// debugger usage. -/// -/// Tools, such as a debugger, need to pause execution based on user input (i.e. -/// breakpoint). 
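The SIAnnotateControlFlow rework above stops hard-coding i64 for the exec mask and instead derives an IntMask type from the subtarget's wave size, then requests the overloaded control-flow intrinsics against that type. As a rough sketch of just the selection step, assuming an LLVM module and a wave32 predicate such as GCNSubtarget::isWave32() (the helper name below is illustrative, not part of the pass):

#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"

using namespace llvm;

// Pick the mask type from the wave size; the overload argument then makes
// this resolve to llvm.amdgcn.if.i32 (wave32) or llvm.amdgcn.if.i64 (wave64).
static Function *getIfIntrinsicDecl(Module &M, bool IsWave32) {
  LLVMContext &Ctx = M.getContext();
  Type *IntMask = IsWave32 ? Type::getInt32Ty(Ctx)  // 32 lanes -> i32 mask
                           : Type::getInt64Ty(Ctx); // 64 lanes -> i64 mask
  return Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if, { IntMask });
}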
diff --git a/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp b/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp
deleted file mode 100644
index 7e884ad93a23..000000000000
--- a/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp
+++ /dev/null
@@ -1,97 +0,0 @@
-//===--- SIDebuggerInsertNops.cpp - Inserts nops for debugger usage -------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// Inserts one nop instruction for each high level source statement for
-/// debugger usage.
-///
-/// Tools, such as a debugger, need to pause execution based on user input (i.e.
-/// breakpoint). In order to do this, one nop instruction is inserted before the
-/// first isa instruction of each high level source statement. Further, the
-/// debugger may replace nop instructions with trap instructions based on user
-/// input.
-//
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPUSubtarget.h"
-#include "SIInstrInfo.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "si-debugger-insert-nops"
-#define PASS_NAME "SI Debugger Insert Nops"
-
-namespace {
-
-class SIDebuggerInsertNops : public MachineFunctionPass {
-public:
-  static char ID;
-
-  SIDebuggerInsertNops() : MachineFunctionPass(ID) { }
-  StringRef getPassName() const override { return PASS_NAME; }
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.setPreservesCFG();
-    MachineFunctionPass::getAnalysisUsage(AU);
-  }
-
-  bool runOnMachineFunction(MachineFunction &MF) override;
-};
-
-} // anonymous namespace
-
-INITIALIZE_PASS(SIDebuggerInsertNops, DEBUG_TYPE, PASS_NAME, false, false)
-
-char SIDebuggerInsertNops::ID = 0;
-char &llvm::SIDebuggerInsertNopsID = SIDebuggerInsertNops::ID;
-
-FunctionPass *llvm::createSIDebuggerInsertNopsPass() {
-  return new SIDebuggerInsertNops();
-}
-
-bool SIDebuggerInsertNops::runOnMachineFunction(MachineFunction &MF) {
-  // Skip this pass if "amdgpu-debugger-insert-nops" attribute was not
-  // specified.
-  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
-  if (!ST.debuggerInsertNops())
-    return false;
-
-  // Skip machine functions without debug info.
-  if (!MF.getMMI().hasDebugInfo())
-    return false;
-
-  // Target instruction info.
-  const SIInstrInfo *TII = ST.getInstrInfo();
-
-  // Set containing line numbers that have nop inserted.
-  DenseSet<unsigned> NopInserted;
-
-  for (auto &MBB : MF) {
-    for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
-      // Skip debug instructions and instructions without location.
-      if (MI->isDebugInstr() || !MI->getDebugLoc())
-        continue;
-
-      // Insert nop instruction if line number does not have nop inserted.
-      auto DL = MI->getDebugLoc();
-      if (NopInserted.find(DL.getLine()) == NopInserted.end()) {
-        BuildMI(MBB, *MI, DL, TII->get(AMDGPU::S_NOP))
-            .addImm(0);
-        NopInserted.insert(DL.getLine());
-      }
-    }
-  }
-
-  return true;
-}
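Stripped of the MachineFunction plumbing, the deleted pass inserts at most one nop per distinct source line by keying a set on debug-location line numbers. A self-contained sketch of that dedup idiom in plain C++, with illustrative names standing in for the DebugLoc/DenseSet machinery:

#include <cassert>
#include <set>
#include <vector>

int main() {
  // Line numbers of consecutive instructions, as a stand-in for DebugLocs.
  std::vector<unsigned> Lines = {10, 10, 12, 10, 13, 12};
  std::set<unsigned> NopInserted;
  unsigned NopsEmitted = 0;
  for (unsigned Line : Lines)
    if (NopInserted.insert(Line).second) // true only on first occurrence
      ++NopsEmitted;
  assert(NopsEmitted == 3); // one nop each for lines 10, 12 and 13
  return 0;
}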
diff --git a/lib/Target/AMDGPU/SIDefines.h b/lib/Target/AMDGPU/SIDefines.h
index 7f6abc34cff3..a0e1ec6ac235 100644
--- a/lib/Target/AMDGPU/SIDefines.h
+++ b/lib/Target/AMDGPU/SIDefines.h
@@ -1,9 +1,8 @@
 //===-- SIDefines.h - SI Helper Macros ----------------------*- C++ -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 /// \file
 //===----------------------------------------------------------------------===//
@@ -90,13 +89,22 @@ enum : uint64_t {
   // Is a D16 buffer instruction.
   D16Buf = UINT64_C(1) << 50,

+  // FLAT instruction accesses FLAT_GLBL or FLAT_SCRATCH segment.
+  IsNonFlatSeg = UINT64_C(1) << 51,
+
   // Uses floating point double precision rounding mode
-  FPDPRounding = UINT64_C(1) << 51
+  FPDPRounding = UINT64_C(1) << 52,
+
+  // Instruction is FP atomic.
+  FPAtomic = UINT64_C(1) << 53,
+
+  // Is a MFMA instruction.
+  IsMAI = UINT64_C(1) << 54
 };

 // v_cmp_class_* etc. use a 10-bit mask for what operation is checked.
 // The result is true if any of these tests are true.
-enum ClassFlags {
+enum ClassFlags : unsigned {
   S_NAN = 1 << 0,        // Signaling NaN
   Q_NAN = 1 << 1,        // Quiet NaN
   N_INFINITY = 1 << 2,   // Negative infinity
@@ -111,7 +119,7 @@ enum ClassFlags {
 }

 namespace AMDGPU {
-  enum OperandType {
+  enum OperandType : unsigned {
    /// Operands with register or 32-bit immediate
    OPERAND_REG_IMM_INT32 = MCOI::OPERAND_FIRST_TARGET,
    OPERAND_REG_IMM_INT64,
@@ -119,6 +127,8 @@ namespace AMDGPU {
    OPERAND_REG_IMM_FP32,
    OPERAND_REG_IMM_FP64,
    OPERAND_REG_IMM_FP16,
+   OPERAND_REG_IMM_V2FP16,
+   OPERAND_REG_IMM_V2INT16,

    /// Operands with register or inline constant
    OPERAND_REG_INLINE_C_INT16,
@@ -130,11 +140,22 @@ namespace AMDGPU {
    OPERAND_REG_INLINE_C_V2FP16,
    OPERAND_REG_INLINE_C_V2INT16,

+   /// Operands with an AccVGPR register or inline constant
+   OPERAND_REG_INLINE_AC_INT16,
+   OPERAND_REG_INLINE_AC_INT32,
+   OPERAND_REG_INLINE_AC_FP16,
+   OPERAND_REG_INLINE_AC_FP32,
+   OPERAND_REG_INLINE_AC_V2FP16,
+   OPERAND_REG_INLINE_AC_V2INT16,
+
    OPERAND_REG_IMM_FIRST = OPERAND_REG_IMM_INT32,
-   OPERAND_REG_IMM_LAST = OPERAND_REG_IMM_FP16,
+   OPERAND_REG_IMM_LAST = OPERAND_REG_IMM_V2INT16,

    OPERAND_REG_INLINE_C_FIRST = OPERAND_REG_INLINE_C_INT16,
-   OPERAND_REG_INLINE_C_LAST = OPERAND_REG_INLINE_C_V2INT16,
+   OPERAND_REG_INLINE_C_LAST = OPERAND_REG_INLINE_AC_V2INT16,
+
+   OPERAND_REG_INLINE_AC_FIRST = OPERAND_REG_INLINE_AC_INT16,
+   OPERAND_REG_INLINE_AC_LAST = OPERAND_REG_INLINE_AC_V2INT16,

    OPERAND_SRC_FIRST = OPERAND_REG_IMM_INT32,
    OPERAND_SRC_LAST = OPERAND_REG_INLINE_C_LAST,
@@ -151,17 +172,10 @@ namespace AMDGPU {
   };
 }

-namespace SIStackID {
-enum StackTypes : uint8_t {
-  SCRATCH = 0,
-  SGPR_SPILL = 1
-};
-}
-
 // Input operand modifiers bit-masks
 // NEG and SEXT share same bit-mask because they can't be set simultaneously.
 namespace SISrcMods {
-  enum {
+  enum : unsigned {
    NEG = 1 << 0,   // Floating-point negate modifier
    ABS = 1 << 1,   // Floating-point absolute modifier
    SEXT = 1 << 0,  // Integer sign-extend modifier
@@ -173,7 +187,7 @@ namespace SISrcMods {
 }

 namespace SIOutMods {
-  enum {
+  enum : unsigned {
    NONE = 0,
    MUL2 = 1,
    MUL4 = 2,
@@ -181,17 +195,33 @@ namespace SIOutMods {
   };
 }

+namespace AMDGPU {
 namespace VGPRIndexMode {
-  enum {
-    SRC0_ENABLE = 1 << 0,
-    SRC1_ENABLE = 1 << 1,
-    SRC2_ENABLE = 1 << 2,
-    DST_ENABLE = 1 << 3
-  };
-}
+
+enum Id : unsigned { // id of symbolic names
+  ID_SRC0 = 0,
+  ID_SRC1,
+  ID_SRC2,
+  ID_DST,
+
+  ID_MIN = ID_SRC0,
+  ID_MAX = ID_DST
+};
+
+enum EncBits : unsigned {
+  OFF = 0,
+  SRC0_ENABLE = 1 << ID_SRC0,
+  SRC1_ENABLE = 1 << ID_SRC1,
+  SRC2_ENABLE = 1 << ID_SRC2,
+  DST_ENABLE = 1 << ID_DST,
+  ENABLE_MASK = SRC0_ENABLE | SRC1_ENABLE | SRC2_ENABLE | DST_ENABLE
+};
+
+} // namespace VGPRIndexMode
+} // namespace AMDGPU

 namespace AMDGPUAsmVariants {
-  enum {
+  enum : unsigned {
    DEFAULT = 0,
    VOP3 = 1,
    SDWA = 2,
@@ -203,13 +233,14 @@ namespace AMDGPUAsmVariants {
 namespace AMDGPU {
 namespace EncValues { // Encoding values of enum9/8/7 operands

-enum {
+enum : unsigned {
   SGPR_MIN = 0,
-  SGPR_MAX = 101,
+  SGPR_MAX_SI = 101,
+  SGPR_MAX_GFX10 = 105,
   TTMP_VI_MIN = 112,
   TTMP_VI_MAX = 123,
-  TTMP_GFX9_MIN = 108,
-  TTMP_GFX9_MAX = 123,
+  TTMP_GFX9_GFX10_MIN = 108,
+  TTMP_GFX9_GFX10_MAX = 123,
   INLINE_INTEGER_C_MIN = 128,
   INLINE_INTEGER_C_POSITIVE_MAX = 192, // 64
   INLINE_INTEGER_C_MAX = 208,
@@ -231,6 +262,8 @@ enum Id { // Message ID, width(4) [3:0].
   ID_INTERRUPT = 1,
   ID_GS,
   ID_GS_DONE,
+  ID_GS_ALLOC_REQ = 9,
+  ID_GET_DOORBELL = 10,
   ID_SYSMSG = 15,
   ID_GAPS_LAST_, // Indicate that sequence has gaps.
   ID_GAPS_FIRST_ = ID_INTERRUPT,
@@ -242,27 +275,28 @@ enum Id { // Message ID, width(4) [3:0].
 enum Op { // Both GS and SYS operation IDs.
   OP_UNKNOWN_ = -1,
   OP_SHIFT_ = 4,
-  // width(2) [5:4]
+  OP_NONE_ = 0,
+  // Bits used for operation encoding
+  OP_WIDTH_ = 3,
+  OP_MASK_ = (((1 << OP_WIDTH_) - 1) << OP_SHIFT_),
+  // GS operations are encoded in bits 5:4
   OP_GS_NOP = 0,
   OP_GS_CUT,
   OP_GS_EMIT,
   OP_GS_EMIT_CUT,
   OP_GS_LAST_,
   OP_GS_FIRST_ = OP_GS_NOP,
-  OP_GS_WIDTH_ = 2,
-  OP_GS_MASK_ = (((1 << OP_GS_WIDTH_) - 1) << OP_SHIFT_),
-  // width(3) [6:4]
+  // SYS operations are encoded in bits 6:4
   OP_SYS_ECC_ERR_INTERRUPT = 1,
   OP_SYS_REG_RD,
   OP_SYS_HOST_TRAP_ACK,
   OP_SYS_TTRACE_PC,
   OP_SYS_LAST_,
   OP_SYS_FIRST_ = OP_SYS_ECC_ERR_INTERRUPT,
-  OP_SYS_WIDTH_ = 3,
-  OP_SYS_MASK_ = (((1 << OP_SYS_WIDTH_) - 1) << OP_SHIFT_)
 };

-enum StreamId { // Stream ID, (2) [9:8].
+enum StreamId : unsigned { // Stream ID, (2) [9:8].
+  STREAM_ID_NONE_ = 0,
   STREAM_ID_DEFAULT_ = 0,
   STREAM_ID_LAST_ = 4,
   STREAM_ID_FIRST_ = STREAM_ID_DEFAULT_,
@@ -287,23 +321,34 @@ enum Id { // HwRegCode, (6) [5:0]
   ID_IB_STS = 7,
   ID_MEM_BASES = 15,
   ID_SYMBOLIC_FIRST_GFX9_ = ID_MEM_BASES,
-  ID_SYMBOLIC_LAST_ = 16,
+  ID_TBA_LO = 16,
+  ID_SYMBOLIC_FIRST_GFX10_ = ID_TBA_LO,
+  ID_TBA_HI = 17,
+  ID_TMA_LO = 18,
+  ID_TMA_HI = 19,
+  ID_FLAT_SCR_LO = 20,
+  ID_FLAT_SCR_HI = 21,
+  ID_XNACK_MASK = 22,
+  ID_POPS_PACKER = 25,
+  ID_SYMBOLIC_LAST_ = 26,
   ID_SHIFT_ = 0,
   ID_WIDTH_ = 6,
   ID_MASK_ = (((1 << ID_WIDTH_) - 1) << ID_SHIFT_)
 };

-enum Offset { // Offset, (5) [10:6]
+enum Offset : unsigned { // Offset, (5) [10:6]
   OFFSET_DEFAULT_ = 0,
   OFFSET_SHIFT_ = 6,
   OFFSET_WIDTH_ = 5,
   OFFSET_MASK_ = (((1 << OFFSET_WIDTH_) - 1) << OFFSET_SHIFT_),

+  OFFSET_MEM_VIOL = 8,
+
   OFFSET_SRC_SHARED_BASE = 16,
   OFFSET_SRC_PRIVATE_BASE = 0
 };

-enum WidthMinusOne { // WidthMinusOne, (5) [15:11]
+enum WidthMinusOne : unsigned { // WidthMinusOne, (5) [15:11]
   WIDTH_M1_DEFAULT_ = 31,
   WIDTH_M1_SHIFT_ = 11,
   WIDTH_M1_WIDTH_ = 5,
@@ -313,11 +358,16 @@ enum WidthMinusOne { // WidthMinusOne, (5) [15:11]
   WIDTH_M1_SRC_PRIVATE_BASE = 15
 };

+// Some values from WidthMinusOne mapped into Width domain.
+enum Width : unsigned {
+  WIDTH_DEFAULT_ = WIDTH_M1_DEFAULT_ + 1,
+};
+
 } // namespace Hwreg

 namespace Swizzle { // Encoding of swizzle macro used in ds_swizzle_b32.

-enum Id { // id of symbolic names
+enum Id : unsigned { // id of symbolic names
   ID_QUAD_PERM = 0,
   ID_BITMASK_PERM,
   ID_SWAP,
@@ -325,7 +375,7 @@ enum Id { // id of symbolic names
   ID_BROADCAST
 };

-enum EncBits {
+enum EncBits : unsigned {

   // swizzle mode encodings

@@ -357,7 +407,7 @@ enum EncBits {

 namespace SDWA {

-enum SdwaSel {
+enum SdwaSel : unsigned {
   BYTE_0 = 0,
   BYTE_1 = 1,
   BYTE_2 = 2,
@@ -367,13 +417,13 @@ enum SdwaSel {
   DWORD = 6,
 };

-enum DstUnused {
+enum DstUnused : unsigned {
   UNUSED_PAD = 0,
   UNUSED_SEXT = 1,
   UNUSED_PRESERVE = 2,
 };

-enum SDWA9EncValues{
+enum SDWA9EncValues : unsigned {
   SRC_SGPR_MASK = 0x100,
   SRC_VGPR_MASK = 0xFF,
   VOPC_DST_VCC_MASK = 0x80,
@@ -382,7 +432,8 @@ enum SDWA9EncValues{
   SRC_VGPR_MIN = 0,
   SRC_VGPR_MAX = 255,
   SRC_SGPR_MIN = 256,
-  SRC_SGPR_MAX = 357,
+  SRC_SGPR_MAX_SI = 357,
+  SRC_SGPR_MAX_GFX10 = 361,
   SRC_TTMP_MIN = 364,
   SRC_TTMP_MAX = 379,
 };
@@ -391,7 +442,7 @@ enum SDWA9EncValues{

 namespace DPP {

-enum DppCtrl {
+enum DppCtrl : unsigned {
   QUAD_PERM_FIRST = 0,
   QUAD_PERM_LAST = 0xFF,
   DPP_UNUSED1 = 0x100,
@@ -422,7 +473,20 @@ enum DppCtrl {
   ROW_HALF_MIRROR = 0x141,
   BCAST15 = 0x142,
   BCAST31 = 0x143,
-  DPP_LAST = BCAST31
+  DPP_UNUSED8_FIRST = 0x144,
+  DPP_UNUSED8_LAST = 0x14F,
+  ROW_SHARE_FIRST = 0x150,
+  ROW_SHARE_LAST = 0x15F,
+  ROW_XMASK_FIRST = 0x160,
+  ROW_XMASK_LAST = 0x16F,
+  DPP_LAST = ROW_XMASK_LAST
+};
+
+enum DppFiMode {
+  DPP_FI_0 = 0,
+  DPP_FI_1 = 1,
+  DPP8_FI_0 = 0xE9,
+  DPP8_FI_1 = 0xEA,
 };

 } // namespace DPP
@@ -505,6 +569,15 @@ enum DppCtrl {
 #define S_00B848_IEEE_MODE(x) (((x) & 0x1) << 23)
 #define G_00B848_IEEE_MODE(x) (((x) >> 23) & 0x1)
 #define C_00B848_IEEE_MODE 0xFF7FFFFF
+#define S_00B848_WGP_MODE(x) (((x) & 0x1) << 29)
+#define G_00B848_WGP_MODE(x) (((x) >> 29) & 0x1)
+#define C_00B848_WGP_MODE 0xDFFFFFFF
+#define S_00B848_MEM_ORDERED(x) (((x) & 0x1) << 30)
+#define G_00B848_MEM_ORDERED(x) (((x) >> 30) & 0x1)
+#define C_00B848_MEM_ORDERED 0xBFFFFFFF
+#define S_00B848_FWD_PROGRESS(x) (((x) & 0x1) << 31)
+#define G_00B848_FWD_PROGRESS(x) (((x) >> 31) & 0x1)
+#define C_00B848_FWD_PROGRESS 0x7FFFFFFF

 // Helpers for setting FLOAT_MODE

@@ -535,6 +608,15 @@ enum DppCtrl {
 #define R_0286E8_SPI_TMPRING_SIZE 0x0286E8
 #define S_0286E8_WAVESIZE(x) (((x) & 0x1FFF) << 12)

+#define R_028B54_VGT_SHADER_STAGES_EN 0x028B54
+#define S_028B54_HS_W32_EN(x) (((x) & 0x1) << 21)
+#define S_028B54_GS_W32_EN(x) (((x) & 0x1) << 22)
+#define S_028B54_VS_W32_EN(x) (((x) & 0x1) << 23)
+#define R_0286D8_SPI_PS_IN_CONTROL 0x0286D8
+#define S_0286D8_PS_W32_EN(x) (((x) & 0x1) << 15)
+#define R_00B800_COMPUTE_DISPATCH_INITIATOR 0x00B800
+#define S_00B800_CS_W32_EN(x) (((x) & 0x1) << 15)
+
 #define R_SPILLED_SGPRS 0x4
 #define R_SPILLED_VGPRS 0x8
 } // End namespace llvm
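The register-field macros above always come in the same S_/G_/C_ triple: S_ shifts a field value into position, G_ extracts it, and C_ is the AND-mask that clears the field for a read-modify-write. A small self-contained illustration of the pattern on a hypothetical one-bit field at bit 23 (mirroring the IEEE_MODE accessors; these are not the LLVM definitions):

#include <cassert>
#include <cstdint>

#define S_FIELD(x) (((x) & 0x1) << 23)  // place a value into the field
#define G_FIELD(x) (((x) >> 23) & 0x1)  // read the field back out
#define C_FIELD 0xFF7FFFFF              // ~(1 << 23): clears the field

int main() {
  uint32_t Reg = 0xFFFFFFFFu;
  Reg = (Reg & C_FIELD) | S_FIELD(1); // clear the old value, then set it
  assert(G_FIELD(Reg) == 1);
  assert(G_FIELD(Reg & C_FIELD) == 0);
  return 0;
}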
diff --git a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index 809f5bab4693..624953963cf4 100644
--- a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -1,9 +1,8 @@
 //===- SIFixSGPRCopies.cpp - Remove potential VGPR => SGPR copies ---------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -104,7 +103,7 @@ using namespace llvm;

 static cl::opt<bool> EnableM0Merge(
   "amdgpu-enable-merge-m0",
   cl::desc("Merge and hoist M0 initializations"),
-  cl::init(false));
+  cl::init(true));

 namespace {

@@ -144,14 +143,15 @@ FunctionPass *llvm::createSIFixSGPRCopiesPass() {
   return new SIFixSGPRCopies();
 }

-static bool hasVGPROperands(const MachineInstr &MI, const SIRegisterInfo *TRI) {
+static bool hasVectorOperands(const MachineInstr &MI,
+                              const SIRegisterInfo *TRI) {
   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
     if (!MI.getOperand(i).isReg() ||
        !TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg()))
       continue;

-    if (TRI->hasVGPRs(MRI.getRegClass(MI.getOperand(i).getReg())))
+    if (TRI->hasVectorRegisters(MRI.getRegClass(MI.getOperand(i).getReg())))
       return true;
   }
   return false;
@@ -184,14 +184,14 @@ static bool isVGPRToSGPRCopy(const TargetRegisterClass *SrcRC,
                              const TargetRegisterClass *DstRC,
                              const SIRegisterInfo &TRI) {
   return SrcRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(DstRC) &&
-         TRI.hasVGPRs(SrcRC);
+         TRI.hasVectorRegisters(SrcRC);
 }

 static bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC,
                              const TargetRegisterClass *DstRC,
                              const SIRegisterInfo &TRI) {
   return DstRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(SrcRC) &&
-         TRI.hasVGPRs(DstRC);
+         TRI.hasVectorRegisters(DstRC);
 }

 static bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI,
@@ -278,6 +278,7 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI,
   // VGPRz = REG_SEQUENCE VGPRx, sub0

   MI.getOperand(0).setReg(CopyUse.getOperand(0).getReg());
+  bool IsAGPR = TRI->hasAGPRs(DstRC);

   for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) {
     unsigned SrcReg = MI.getOperand(I).getReg();
@@ -296,6 +297,17 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI,
                  TmpReg)
         .add(MI.getOperand(I));

+    if (IsAGPR) {
+      const TargetRegisterClass *NewSrcRC = TRI->getEquivalentAGPRClass(SrcRC);
+      unsigned TmpAReg = MRI.createVirtualRegister(NewSrcRC);
+      unsigned Opc = NewSrcRC == &AMDGPU::AGPR_32RegClass ?
+        AMDGPU::V_ACCVGPR_WRITE_B32 : AMDGPU::COPY;
+      BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(Opc),
+              TmpAReg)
+        .addReg(TmpReg, RegState::Kill);
+      TmpReg = TmpAReg;
+    }
+
     MI.getOperand(I).setReg(TmpReg);
   }

@@ -440,18 +452,32 @@ static bool isReachable(const MachineInstr *From,
                      (const MachineBasicBlock *MBB) { return MBB == MBBFrom; });
 }

+// Return the first non-prologue instruction in the block.
+static MachineBasicBlock::iterator
+getFirstNonPrologue(MachineBasicBlock *MBB, const TargetInstrInfo *TII) {
+  MachineBasicBlock::iterator I = MBB->getFirstNonPHI();
+  while (I != MBB->end() && TII->isBasicBlockPrologue(*I))
+    ++I;
+
+  return I;
+}
+
 // Hoist and merge identical SGPR initializations into a common predecessor.
 // This is intended to combine M0 initializations, but can work with any
 // SGPR. A VGPR cannot be processed since we cannot guarantee vector
 // execution.
 static bool hoistAndMergeSGPRInits(unsigned Reg,
                                    const MachineRegisterInfo &MRI,
-                                   MachineDominatorTree &MDT) {
+                                   MachineDominatorTree &MDT,
+                                   const TargetInstrInfo *TII) {
   // List of inits by immediate value.
   using InitListMap = std::map<unsigned, std::list<MachineInstr *>>;
   InitListMap Inits;
   // List of clobbering instructions.
   SmallVector<MachineInstr*, 8> Clobbers;
+  // List of instructions marked for deletion.
+  SmallSet<MachineInstr*, 8> MergedInstrs;
+
   bool Changed = false;

   for (auto &MI : MRI.def_instructions(Reg)) {
@@ -480,8 +506,8 @@ static bool hoistAndMergeSGPRInits(unsigned Reg,
         MachineInstr *MI2 = *I2;

         // Check any possible interference
-        auto intereferes = [&](MachineBasicBlock::iterator From,
-                               MachineBasicBlock::iterator To) -> bool {
+        auto interferes = [&](MachineBasicBlock::iterator From,
+                              MachineBasicBlock::iterator To) -> bool {

           assert(MDT.dominates(&*To, &*From));
@@ -513,23 +539,23 @@ static bool hoistAndMergeSGPRInits(unsigned Reg,
         };

         if (MDT.dominates(MI1, MI2)) {
-          if (!intereferes(MI2, MI1)) {
+          if (!interferes(MI2, MI1)) {
             LLVM_DEBUG(dbgs()
                        << "Erasing from "
                        << printMBBReference(*MI2->getParent()) << " " << *MI2);
-            MI2->eraseFromParent();
-            Defs.erase(I2++);
+            MergedInstrs.insert(MI2);
             Changed = true;
+            ++I2;
             continue;
           }
         } else if (MDT.dominates(MI2, MI1)) {
-          if (!intereferes(MI1, MI2)) {
+          if (!interferes(MI1, MI2)) {
             LLVM_DEBUG(dbgs()
                        << "Erasing from "
                        << printMBBReference(*MI1->getParent()) << " " << *MI1);
-            MI1->eraseFromParent();
-            Defs.erase(I1++);
+            MergedInstrs.insert(MI1);
             Changed = true;
+            ++I1;
             break;
           }
         } else {
@@ -540,8 +566,8 @@ static bool hoistAndMergeSGPRInits(unsigned Reg,
             continue;
           }

-          MachineBasicBlock::iterator I = MBB->getFirstNonPHI();
-          if (!intereferes(MI1, I) && !intereferes(MI2, I)) {
+          MachineBasicBlock::iterator I = getFirstNonPrologue(MBB, TII);
+          if (!interferes(MI1, I) && !interferes(MI2, I)) {
             LLVM_DEBUG(dbgs()
                        << "Erasing from "
                        << printMBBReference(*MI1->getParent()) << " " << *MI1
                        << "and moving from "
                        << printMBBReference(*MI2->getParent()) << " to "
                        << printMBBReference(*I->getParent()) << " " << *MI2);
             I->getParent()->splice(I, MI2->getParent(), MI2);
-            MI1->eraseFromParent();
-            Defs.erase(I1++);
+            MergedInstrs.insert(MI1);
             Changed = true;
+            ++I1;
             break;
           }
         }
@@ -561,6 +587,9 @@ static bool hoistAndMergeSGPRInits(unsigned Reg,
     }
   }

+  for (auto MI : MergedInstrs)
+    MI->removeFromParent();
+
   if (Changed)
     MRI.clearKillFlags(Reg);

@@ -679,11 +708,12 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
         LLVM_DEBUG(dbgs() << "Fixing PHI: " << MI);
         TII->moveToVALU(MI, MDT);
       }
+
       break;
     }
     case AMDGPU::REG_SEQUENCE:
-      if (TRI->hasVGPRs(TII->getOpRegClass(MI, 0)) ||
-          !hasVGPROperands(MI, TRI)) {
+      if (TRI->hasVectorRegisters(TII->getOpRegClass(MI, 0)) ||
+          !hasVectorOperands(MI, TRI)) {
         foldVGPRCopyIntoRegSequence(MI, TRI, TII, MRI);
         continue;
       }
@@ -698,7 +728,8 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
       Src0RC = MRI.getRegClass(MI.getOperand(1).getReg());
       Src1RC = MRI.getRegClass(MI.getOperand(2).getReg());
       if (TRI->isSGPRClass(DstRC) &&
-          (TRI->hasVGPRs(Src0RC) || TRI->hasVGPRs(Src1RC))) {
+          (TRI->hasVectorRegisters(Src0RC) ||
+           TRI->hasVectorRegisters(Src1RC))) {
         LLVM_DEBUG(dbgs() << " Fixing INSERT_SUBREG: " << MI);
         TII->moveToVALU(MI, MDT);
       }
@@ -709,7 +740,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
   }

   if (MF.getTarget().getOptLevel() > CodeGenOpt::None && EnableM0Merge)
-    hoistAndMergeSGPRInits(AMDGPU::M0, MRI, *MDT);
+    hoistAndMergeSGPRInits(AMDGPU::M0, MRI, *MDT, TII);

   return true;
 }
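A detail worth noting in the SIFixSGPRCopies change above: hoistAndMergeSGPRInits used to call eraseFromParent() while still iterating over def_instructions(Reg), which the rewrite avoids by collecting victims in a MergedInstrs set and removing them after the scan. A minimal stand-alone sketch of that deferred-removal idiom, with plain C++ containers standing in for the machine-IR lists:

#include <cassert>
#include <list>
#include <set>

int main() {
  std::list<int> Insts = {1, 2, 3, 4};
  std::set<const int *> Merged;
  // Pass 1: only mark; the container is never mutated mid-iteration.
  for (const int &I : Insts)
    if (I % 2 == 0)
      Merged.insert(&I); // addresses of std::list nodes are stable
  // Pass 2: flush everything that was marked.
  Insts.remove_if([&](const int &I) { return Merged.count(&I) != 0; });
  assert(Insts == (std::list<int>{1, 3}));
  return 0;
}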
diff --git a/lib/Target/AMDGPU/SIFixVGPRCopies.cpp b/lib/Target/AMDGPU/SIFixVGPRCopies.cpp
index 15ba78edf919..29484668a01d 100644
--- a/lib/Target/AMDGPU/SIFixVGPRCopies.cpp
+++ b/lib/Target/AMDGPU/SIFixVGPRCopies.cpp
@@ -1,9 +1,8 @@
 //===-- SIFixVGPRCopies.cpp - Fix VGPR Copies after regalloc --------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
diff --git a/lib/Target/AMDGPU/SIFixWWMLiveness.cpp b/lib/Target/AMDGPU/SIFixWWMLiveness.cpp
deleted file mode 100644
index 7761418c5336..000000000000
--- a/lib/Target/AMDGPU/SIFixWWMLiveness.cpp
+++ /dev/null
@@ -1,418 +0,0 @@
-//===-- SIFixWWMLiveness.cpp - Fix WWM live intervals ---------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// Computations in WWM can overwrite values in inactive channels for
-/// variables that the register allocator thinks are dead. This pass adds fake
-/// uses of those variables to their def(s) to make sure that they aren't
-/// overwritten.
-///
-/// As an example, consider this snippet:
-/// %vgpr0 = V_MOV_B32_e32 0.0
-/// if (...) {
-///   %vgpr1 = ...
-///   %vgpr2 = WWM killed %vgpr1
-///   ... = killed %vgpr2
-///   %vgpr0 = V_MOV_B32_e32 1.0
-/// }
-/// ... = %vgpr0
-///
-/// The live intervals of %vgpr0 don't overlap with those of %vgpr1. Normally,
-/// we can safely allocate %vgpr0 and %vgpr1 in the same register, since
-/// writing %vgpr1 would only write to channels that would be clobbered by the
-/// second write to %vgpr0 anyways. But if %vgpr1 is written with WWM enabled,
-/// it would clobber even the inactive channels for which the if-condition is
-/// false, for which %vgpr0 is supposed to be 0. This pass adds an implicit use
-/// of %vgpr0 to its def to make sure they aren't allocated to the
-/// same register.
-///
-/// In general, we need to figure out what registers might have their inactive
-/// channels which are eventually used accidentally clobbered by a WWM
-/// instruction. We do that by spotting three separate cases of registers:
-///
-/// 1. A "then phi": the value resulting from phi elimination of a phi node at
-///    the end of an if..endif. If there is WWM code in the "then", then we
-///    make the def at the end of the "then" branch a partial def by adding an
-///    implicit use of the register.
-///
-/// 2. A "loop exit register": a value written inside a loop but used outside the
-///    loop, where there is WWM code inside the loop (the case in the example
-///    above). We add an implicit_def of the register in the loop pre-header,
-///    and make the original def a partial def by adding an implicit use of the
-///    register.
-///
-/// 3. A "loop exit phi": the value resulting from phi elimination of a phi node
-///    in a loop header. If there is WWM code inside the loop, then we make all
-///    defs inside the loop partial defs by adding an implicit use of the
-///    register on each one.
-///
-/// Note that we do not need to consider an if..else..endif phi. We only need to
-/// consider non-uniform control flow, and control flow structurization would
-/// have transformed a non-uniform if..else..endif into two if..endifs.
-///
-/// The analysis to detect these cases relies on a property of the MIR
-/// arising from this pass running straight after PHIElimination and before any
-/// coalescing: that any virtual register with more than one definition must be
-/// the new register added to lower a phi node by PHIElimination.
-///
-/// FIXME: We should detect whether a register in one of the above categories is
-/// already live at the WWM code before deciding to add the implicit uses to
-/// synthesize its liveness.
-///
-/// FIXME: I believe this whole scheme may be flawed due to the possibility of
-/// the register allocator doing live interval splitting.
-///
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
-#include "SIInstrInfo.h"
-#include "SIRegisterInfo.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/SparseBitVector.h"
-#include "llvm/CodeGen/LiveIntervals.h"
-#include "llvm/CodeGen/MachineDominators.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineLoopInfo.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/TargetRegisterInfo.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "si-fix-wwm-liveness"
-
-namespace {
-
-class SIFixWWMLiveness : public MachineFunctionPass {
-private:
-  MachineDominatorTree *DomTree;
-  MachineLoopInfo *LoopInfo;
-  LiveIntervals *LIS = nullptr;
-  const SIInstrInfo *TII;
-  const SIRegisterInfo *TRI;
-  MachineRegisterInfo *MRI;
-
-  std::vector<MachineInstr *> WWMs;
-  std::vector<MachineOperand *> ThenDefs;
-  std::vector<std::pair<MachineOperand *, MachineLoop *>> LoopExitDefs;
-  std::vector<std::pair<MachineOperand *, MachineLoop *>> LoopPhiDefs;
-
-public:
-  static char ID;
-
-  SIFixWWMLiveness() : MachineFunctionPass(ID) {
-    initializeSIFixWWMLivenessPass(*PassRegistry::getPassRegistry());
-  }
-
-  bool runOnMachineFunction(MachineFunction &MF) override;
-
-  StringRef getPassName() const override { return "SI Fix WWM Liveness"; }
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.addRequiredID(MachineDominatorsID);
-    AU.addRequiredID(MachineLoopInfoID);
-    // Should preserve the same set that TwoAddressInstructions does.
-    AU.addPreserved<SlotIndexes>();
-    AU.addPreserved<LiveIntervals>();
-    AU.addPreservedID(LiveVariablesID);
-    AU.addPreservedID(MachineLoopInfoID);
-    AU.addPreservedID(MachineDominatorsID);
-    AU.setPreservesCFG();
-    MachineFunctionPass::getAnalysisUsage(AU);
-  }
-
-private:
-  void processDef(MachineOperand &DefOpnd);
-  bool processThenDef(MachineOperand *DefOpnd);
-  bool processLoopExitDef(MachineOperand *DefOpnd, MachineLoop *Loop);
-  bool processLoopPhiDef(MachineOperand *DefOpnd, MachineLoop *Loop);
-};
-
-} // End anonymous namespace.
-
-INITIALIZE_PASS_BEGIN(SIFixWWMLiveness, DEBUG_TYPE,
-                      "SI fix WWM liveness", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
-INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
-INITIALIZE_PASS_END(SIFixWWMLiveness, DEBUG_TYPE,
-                    "SI fix WWM liveness", false, false)
-
-char SIFixWWMLiveness::ID = 0;
-
-char &llvm::SIFixWWMLivenessID = SIFixWWMLiveness::ID;
-
-FunctionPass *llvm::createSIFixWWMLivenessPass() {
-  return new SIFixWWMLiveness();
-}
-
-bool SIFixWWMLiveness::runOnMachineFunction(MachineFunction &MF) {
-  LLVM_DEBUG(dbgs() << "SIFixWWMLiveness: function " << MF.getName() << "\n");
-  bool Modified = false;
-
-  // This doesn't actually need LiveIntervals, but we can preserve them.
-  LIS = getAnalysisIfAvailable<LiveIntervals>();
-
-  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
-
-  TII = ST.getInstrInfo();
-  TRI = &TII->getRegisterInfo();
-  MRI = &MF.getRegInfo();
-
-  DomTree = &getAnalysis<MachineDominatorTree>();
-  LoopInfo = &getAnalysis<MachineLoopInfo>();
-
-  // Scan the function to find the WWM sections and the candidate registers for
-  // having liveness modified.
-  for (MachineBasicBlock &MBB : MF) {
-    for (MachineInstr &MI : MBB) {
-      if (MI.getOpcode() == AMDGPU::EXIT_WWM)
-        WWMs.push_back(&MI);
-      else {
-        for (MachineOperand &DefOpnd : MI.defs()) {
-          if (DefOpnd.isReg()) {
-            unsigned Reg = DefOpnd.getReg();
-            if (TRI->isVGPR(*MRI, Reg))
-              processDef(DefOpnd);
-          }
-        }
-      }
-    }
-  }
-
-  if (!WWMs.empty()) {
-    // Synthesize liveness over WWM sections as required.
-    for (auto ThenDef : ThenDefs)
-      Modified |= processThenDef(ThenDef);
-    for (auto LoopExitDef : LoopExitDefs)
-      Modified |= processLoopExitDef(LoopExitDef.first, LoopExitDef.second);
-    for (auto LoopPhiDef : LoopPhiDefs)
-      Modified |= processLoopPhiDef(LoopPhiDef.first, LoopPhiDef.second);
-  }
-
-  WWMs.clear();
-  ThenDefs.clear();
-  LoopExitDefs.clear();
-  LoopPhiDefs.clear();
-
-  return Modified;
-}
-
-// During the function scan, process an operand that defines a VGPR.
-// This categorizes the register and puts it in the appropriate list for later
-// use when processing a WWM section.
-void SIFixWWMLiveness::processDef(MachineOperand &DefOpnd) {
-  unsigned Reg = DefOpnd.getReg();
-  // Get all the defining instructions. For convenience, make Defs[0] the def
-  // we are on now.
-  SmallVector<const MachineInstr *, 4> Defs;
-  Defs.push_back(DefOpnd.getParent());
-  for (auto &MI : MRI->def_instructions(Reg)) {
-    if (&MI != DefOpnd.getParent())
-      Defs.push_back(&MI);
-  }
-  // Check whether this def dominates all the others. If not, ignore this def.
-  // Either it is going to be processed when the scan encounters its other def
-  // that dominates all defs, or there is no def that dominates all others.
-  // The latter case is an eliminated phi from an if..else..endif or similar,
-  // which must be for uniform control flow so can be ignored.
-  // Because this pass runs shortly after PHIElimination, we assume that any
-  // multi-def register is a lowered phi, and thus has each def in a separate
-  // basic block.
-  for (unsigned I = 1; I != Defs.size(); ++I) {
-    if (!DomTree->dominates(Defs[0]->getParent(), Defs[I]->getParent()))
-      return;
-  }
-  // Check for the case of an if..endif lowered phi: It has two defs, one
-  // dominates the other, and there is a single use in a successor of the
-  // dominant def.
-  // Later we will spot any WWM code inside
-  // the "then" clause and turn the second def into a partial def so its
-  // liveness goes through the WWM code in the "then" clause.
-  if (Defs.size() == 2) {
-    auto DomDefBlock = Defs[0]->getParent();
-    if (DomDefBlock->succ_size() == 2 && MRI->hasOneUse(Reg)) {
-      auto UseBlock = MRI->use_begin(Reg)->getParent()->getParent();
-      for (auto Succ : DomDefBlock->successors()) {
-        if (Succ == UseBlock) {
-          LLVM_DEBUG(dbgs() << printReg(Reg, TRI) << " is a then phi reg\n");
-          ThenDefs.push_back(&DefOpnd);
-          return;
-        }
-      }
-    }
-  }
-  // Check for the case of a non-lowered-phi register (single def) that exits
-  // a loop, that is, it has a use that is outside a loop that the def is
-  // inside. We find the outermost loop that the def is inside but a use is
-  // outside. Later we will spot any WWM code inside that loop and then make
-  // the def a partial def so its liveness goes round the loop and through the
-  // WWM code.
-  if (Defs.size() == 1) {
-    auto Loop = LoopInfo->getLoopFor(Defs[0]->getParent());
-    if (!Loop)
-      return;
-    bool IsLoopExit = false;
-    for (auto &Use : MRI->use_instructions(Reg)) {
-      auto UseBlock = Use.getParent();
-      if (Loop->contains(UseBlock))
-        continue;
-      IsLoopExit = true;
-      while (auto Parent = Loop->getParentLoop()) {
-        if (Parent->contains(UseBlock))
-          break;
-        Loop = Parent;
-      }
-    }
-    if (!IsLoopExit)
-      return;
-    LLVM_DEBUG(dbgs() << printReg(Reg, TRI)
-               << " is a loop exit reg with loop header at "
-               << "bb." << Loop->getHeader()->getNumber() << "\n");
-    LoopExitDefs.push_back(std::pair<MachineOperand *, MachineLoop *>(
-        &DefOpnd, Loop));
-    return;
-  }
-  // Check for the case of a lowered single-preheader-loop phi, that is, a
-  // multi-def register where the dominating def is in the loop pre-header and
-  // all other defs are in backedges. Later we will spot any WWM code inside
-  // that loop and then make the backedge defs partial defs so the liveness
-  // goes through the WWM code.
-  // Note that we are ignoring multi-preheader loops on the basis that the
-  // structurizer does not allow that for non-uniform loops.
-  // There must be a single use in the loop header.
-  if (!MRI->hasOneUse(Reg))
-    return;
-  auto UseBlock = MRI->use_begin(Reg)->getParent()->getParent();
-  auto Loop = LoopInfo->getLoopFor(UseBlock);
-  if (!Loop || Loop->getHeader() != UseBlock
-      || Loop->contains(Defs[0]->getParent())) {
-    LLVM_DEBUG(dbgs() << printReg(Reg, TRI)
-               << " is multi-def but single use not in loop header\n");
-    return;
-  }
-  for (unsigned I = 1; I != Defs.size(); ++I) {
-    if (!Loop->contains(Defs[I]->getParent()))
-      return;
-  }
-  LLVM_DEBUG(dbgs() << printReg(Reg, TRI)
-             << " is a loop phi reg with loop header at "
-             << "bb." << Loop->getHeader()->getNumber() << "\n");
-  LoopPhiDefs.push_back(
-      std::pair<MachineOperand *, MachineLoop *>(&DefOpnd, Loop));
-}
-
-// Process a then phi def: It has two defs, one dominates the other, and there
-// is a single use in a successor of the dominant def. Here we spot any WWM
-// code inside the "then" clause and turn the second def into a partial def so
-// its liveness goes through the WWM code in the "then" clause.
-bool SIFixWWMLiveness::processThenDef(MachineOperand *DefOpnd) {
-  LLVM_DEBUG(dbgs() << "Processing then def: " << *DefOpnd->getParent());
-  if (DefOpnd->getParent()->getOpcode() == TargetOpcode::IMPLICIT_DEF) {
-    // Ignore if dominating def is undef.
-    LLVM_DEBUG(dbgs() << "  ignoring as dominating def is undef\n");
-    return false;
-  }
-  unsigned Reg = DefOpnd->getReg();
-  // Get the use block, which is the endif block.
-  auto UseBlock = MRI->use_instr_begin(Reg)->getParent();
-  // Check whether there is WWM code inside the then branch. The WWM code must
-  // be dominated by the if but not dominated by the endif.
-  bool ContainsWWM = false;
-  for (auto WWM : WWMs) {
-    if (DomTree->dominates(DefOpnd->getParent()->getParent(), WWM->getParent())
-        && !DomTree->dominates(UseBlock, WWM->getParent())) {
-      LLVM_DEBUG(dbgs() << "  contains WWM: " << *WWM);
-      ContainsWWM = true;
-      break;
-    }
-  }
-  if (!ContainsWWM)
-    return false;
-  // Get the other def.
-  MachineInstr *OtherDef = nullptr;
-  for (auto &MI : MRI->def_instructions(Reg)) {
-    if (&MI != DefOpnd->getParent())
-      OtherDef = &MI;
-  }
-  // Make it a partial def.
-  OtherDef->addOperand(MachineOperand::CreateReg(Reg, false, /*isImp=*/true));
-  LLVM_DEBUG(dbgs() << *OtherDef);
-  return true;
-}
-
-// Process a loop exit def, that is, a register with a single use in a loop
-// that has a use outside the loop. Here we spot any WWM code inside that loop
-// and then make the def a partial def so its liveness goes round the loop and
-// through the WWM code.
-bool SIFixWWMLiveness::processLoopExitDef(MachineOperand *DefOpnd,
-                                          MachineLoop *Loop) {
-  LLVM_DEBUG(dbgs() << "Processing loop exit def: " << *DefOpnd->getParent());
-  // Check whether there is WWM code inside the loop.
-  bool ContainsWWM = false;
-  for (auto WWM : WWMs) {
-    if (Loop->contains(WWM->getParent())) {
-      LLVM_DEBUG(dbgs() << "  contains WWM: " << *WWM);
-      ContainsWWM = true;
-      break;
-    }
-  }
-  if (!ContainsWWM)
-    return false;
-  unsigned Reg = DefOpnd->getReg();
-  // Add a new implicit_def in loop preheader(s).
-  for (auto Pred : Loop->getHeader()->predecessors()) {
-    if (!Loop->contains(Pred)) {
-      auto ImplicitDef = BuildMI(*Pred, Pred->getFirstTerminator(), DebugLoc(),
-                                 TII->get(TargetOpcode::IMPLICIT_DEF), Reg);
-      LLVM_DEBUG(dbgs() << *ImplicitDef);
-      (void)ImplicitDef;
-    }
-  }
-  // Make the original def partial.
-  DefOpnd->getParent()->addOperand(MachineOperand::CreateReg(
-      Reg, false, /*isImp=*/true));
-  LLVM_DEBUG(dbgs() << *DefOpnd->getParent());
-  return true;
-}
-
-// Process a loop phi def, that is, a multi-def register where the dominating
-// def is in the loop pre-header and all other defs are in backedges. Here we
-// spot any WWM code inside that loop and then make the backedge defs partial
-// defs so the liveness goes through the WWM code.
-bool SIFixWWMLiveness::processLoopPhiDef(MachineOperand *DefOpnd,
-                                         MachineLoop *Loop) {
-  LLVM_DEBUG(dbgs() << "Processing loop phi def: " << *DefOpnd->getParent());
-  // Check whether there is WWM code inside the loop.
-  bool ContainsWWM = false;
-  for (auto WWM : WWMs) {
-    if (Loop->contains(WWM->getParent())) {
-      LLVM_DEBUG(dbgs() << "  contains WWM: " << *WWM);
-      ContainsWWM = true;
-      break;
-    }
-  }
-  if (!ContainsWWM)
-    return false;
-  unsigned Reg = DefOpnd->getReg();
-  // Remove kill mark from uses.
-  for (auto &Use : MRI->use_operands(Reg))
-    Use.setIsKill(false);
-  // Make all defs except the dominating one partial defs.
-  SmallVector<MachineInstr *, 4> Defs;
-  for (auto &Def : MRI->def_instructions(Reg))
-    Defs.push_back(&Def);
-  for (auto Def : Defs) {
-    if (DefOpnd->getParent() == Def)
-      continue;
-    Def->addOperand(MachineOperand::CreateReg(Reg, false, /*isImp=*/true));
-    LLVM_DEBUG(dbgs() << *Def);
-  }
-  return true;
-}
-
diff --git a/lib/Target/AMDGPU/SIFixupVectorISel.cpp b/lib/Target/AMDGPU/SIFixupVectorISel.cpp
index ee39eb04d831..5b834c8de13a 100644
--- a/lib/Target/AMDGPU/SIFixupVectorISel.cpp
+++ b/lib/Target/AMDGPU/SIFixupVectorISel.cpp
@@ -1,9 +1,8 @@
 //===-- SIFixupVectorISel.cpp - Fixup post ISel vector issues -------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 /// \file
 /// SIFixupVectorISel pass cleans up post ISEL Vector issues.
@@ -198,6 +197,11 @@ static bool fixupGlobalSaddr(MachineBasicBlock &MBB,
       // Atomics don't have a GLC, so omit the field if not there.
       if (Glc)
         NewGlob->addOperand(MF, *Glc);
+
+      MachineOperand *DLC = TII->getNamedOperand(MI, AMDGPU::OpName::dlc);
+      if (DLC)
+        NewGlob->addOperand(MF, *DLC);
+
       NewGlob->addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::slc));
       // _D16 have a vdst_in operand, copy it in.
       MachineOperand *VDstInOp = TII->getNamedOperand(MI,
diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp
index f4e866958369..74d77d328019 100644
--- a/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1,9 +1,8 @@
 //===-- SIFoldOperands.cpp - Fold operands --- ----------------------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 /// \file
 //===----------------------------------------------------------------------===//
@@ -51,7 +50,7 @@ struct FoldCandidate {
     } else if (FoldOp->isFI()) {
       FrameIndexToFold = FoldOp->getIndex();
     } else {
-      assert(FoldOp->isReg());
+      assert(FoldOp->isReg() || FoldOp->isGlobal());
       OpToFold = FoldOp;
     }
   }
@@ -68,6 +67,8 @@ struct FoldCandidate {
     return Kind == MachineOperand::MO_Register;
   }

+  bool isGlobal() const { return Kind == MachineOperand::MO_GlobalAddress; }
+
   bool isCommuted() const {
     return Commuted;
   }
@@ -88,10 +89,11 @@ public:
   const SIInstrInfo *TII;
   const SIRegisterInfo *TRI;
   const GCNSubtarget *ST;
+  const SIMachineFunctionInfo *MFI;

   void foldOperand(MachineOperand &OpToFold,
                    MachineInstr *UseMI,
-                   unsigned UseOpIdx,
+                   int UseOpIdx,
                    SmallVectorImpl<FoldCandidate> &FoldList,
                    SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;

@@ -160,19 +162,34 @@ static bool isInlineConstantIfFolded(const SIInstrInfo *TII,
   }
 }

+// TODO: Add heuristic that the frame index might not fit in the addressing mode
+// immediate offset to avoid materializing in loops.
+static bool frameIndexMayFold(const SIInstrInfo *TII, + const MachineInstr &UseMI, + int OpNo, + const MachineOperand &OpToFold) { + return OpToFold.isFI() && + (TII->isMUBUF(UseMI) || TII->isFLATScratch(UseMI)) && + OpNo == AMDGPU::getNamedOperandIdx(UseMI.getOpcode(), AMDGPU::OpName::vaddr); +} + FunctionPass *llvm::createSIFoldOperandsPass() { return new SIFoldOperands(); } static bool updateOperand(FoldCandidate &Fold, const SIInstrInfo &TII, - const TargetRegisterInfo &TRI) { + const TargetRegisterInfo &TRI, + const GCNSubtarget &ST) { MachineInstr *MI = Fold.UseMI; MachineOperand &Old = MI->getOperand(Fold.UseOpNo); assert(Old.isReg()); if (Fold.isImm()) { - if (MI->getDesc().TSFlags & SIInstrFlags::IsPacked) { + if (MI->getDesc().TSFlags & SIInstrFlags::IsPacked && + !(MI->getDesc().TSFlags & SIInstrFlags::IsMAI) && + AMDGPU::isInlinableLiteralV216(static_cast<uint16_t>(Fold.ImmToFold), + ST.hasInv2PiInlineImm())) { // Set op_sel/op_sel_hi on this operand or bail out if op_sel is // already set. unsigned Opcode = MI->getOpcode(); @@ -190,77 +207,94 @@ static bool updateOperand(FoldCandidate &Fold, unsigned Val = Mod.getImm(); if ((Val & SISrcMods::OP_SEL_0) || !(Val & SISrcMods::OP_SEL_1)) return false; - // If upper part is all zero we do not need op_sel_hi. - if (!isUInt<16>(Fold.ImmToFold)) { - if (!(Fold.ImmToFold & 0xffff)) { - Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0); + // Only apply the following transformation if that operand requries + // a packed immediate. + switch (TII.get(Opcode).OpInfo[OpNo].OperandType) { + case AMDGPU::OPERAND_REG_IMM_V2FP16: + case AMDGPU::OPERAND_REG_IMM_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: + case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: + // If upper part is all zero we do not need op_sel_hi. 
+ if (!isUInt<16>(Fold.ImmToFold)) { + if (!(Fold.ImmToFold & 0xffff)) { + Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0); + Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1); + Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff); + return true; + } Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1); - Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff); + Old.ChangeToImmediate(Fold.ImmToFold & 0xffff); return true; } - Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1); + break; + default: + break; } } + } - if (Fold.needsShrink()) { - MachineBasicBlock *MBB = MI->getParent(); - auto Liveness = MBB->computeRegisterLiveness(&TRI, AMDGPU::VCC, MI); - if (Liveness != MachineBasicBlock::LQR_Dead) - return false; - - MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); - int Op32 = Fold.getShrinkOpcode(); - MachineOperand &Dst0 = MI->getOperand(0); - MachineOperand &Dst1 = MI->getOperand(1); - assert(Dst0.isDef() && Dst1.isDef()); - - bool HaveNonDbgCarryUse = !MRI.use_nodbg_empty(Dst1.getReg()); + if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) { + MachineBasicBlock *MBB = MI->getParent(); + auto Liveness = MBB->computeRegisterLiveness(&TRI, AMDGPU::VCC, MI); + if (Liveness != MachineBasicBlock::LQR_Dead) + return false; - const TargetRegisterClass *Dst0RC = MRI.getRegClass(Dst0.getReg()); - unsigned NewReg0 = MRI.createVirtualRegister(Dst0RC); - const TargetRegisterClass *Dst1RC = MRI.getRegClass(Dst1.getReg()); - unsigned NewReg1 = MRI.createVirtualRegister(Dst1RC); + MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + int Op32 = Fold.getShrinkOpcode(); + MachineOperand &Dst0 = MI->getOperand(0); + MachineOperand &Dst1 = MI->getOperand(1); + assert(Dst0.isDef() && Dst1.isDef()); - MachineInstr *Inst32 = TII.buildShrunkInst(*MI, Op32); + bool HaveNonDbgCarryUse = !MRI.use_nodbg_empty(Dst1.getReg()); - if (HaveNonDbgCarryUse) { - BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), Dst1.getReg()) - .addReg(AMDGPU::VCC, RegState::Kill); - } + const TargetRegisterClass *Dst0RC = MRI.getRegClass(Dst0.getReg()); + unsigned NewReg0 = MRI.createVirtualRegister(Dst0RC); - // Keep the old instruction around to avoid breaking iterators, but - // replace the outputs with dummy registers. - Dst0.setReg(NewReg0); - Dst1.setReg(NewReg1); + MachineInstr *Inst32 = TII.buildShrunkInst(*MI, Op32); - if (Fold.isCommuted()) - TII.commuteInstruction(*Inst32, false); - return true; + if (HaveNonDbgCarryUse) { + BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), Dst1.getReg()) + .addReg(AMDGPU::VCC, RegState::Kill); } - Old.ChangeToImmediate(Fold.ImmToFold); + // Keep the old instruction around to avoid breaking iterators, but + // replace it with a dummy instruction to remove uses. + // + // FIXME: We should not invert how this pass looks at operands to avoid + // this. Should track set of foldable movs instead of looking for uses + // when looking at a use. 
+ Dst0.setReg(NewReg0); + for (unsigned I = MI->getNumOperands() - 1; I > 0; --I) + MI->RemoveOperand(I); + MI->setDesc(TII.get(AMDGPU::IMPLICIT_DEF)); + + if (Fold.isCommuted()) + TII.commuteInstruction(*Inst32, false); return true; } assert(!Fold.needsShrink() && "not handled"); - if (Fold.isFI()) { - Old.ChangeToFrameIndex(Fold.FrameIndexToFold); + if (Fold.isImm()) { + Old.ChangeToImmediate(Fold.ImmToFold); return true; } - MachineOperand *New = Fold.OpToFold; - if (TargetRegisterInfo::isVirtualRegister(Old.getReg()) && - TargetRegisterInfo::isVirtualRegister(New->getReg())) { - Old.substVirtReg(New->getReg(), New->getSubReg(), TRI); - - Old.setIsUndef(New->isUndef()); + if (Fold.isGlobal()) { + Old.ChangeToGA(Fold.OpToFold->getGlobal(), Fold.OpToFold->getOffset(), + Fold.OpToFold->getTargetFlags()); return true; } - // FIXME: Handle physical registers. + if (Fold.isFI()) { + Old.ChangeToFrameIndex(Fold.FrameIndexToFold); + return true; + } - return false; + MachineOperand *New = Fold.OpToFold; + Old.substVirtReg(New->getReg(), New->getSubReg(), TRI); + Old.setIsUndef(New->isUndef()); + return true; } static bool isUseMIInFoldList(ArrayRef<FoldCandidate> FoldList, @@ -277,7 +311,6 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList, MachineOperand *OpToFold, const SIInstrInfo *TII) { if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) { - // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2 unsigned Opc = MI->getOpcode(); if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 || @@ -344,7 +377,7 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList, if ((Opc == AMDGPU::V_ADD_I32_e64 || Opc == AMDGPU::V_SUB_I32_e64 || Opc == AMDGPU::V_SUBREV_I32_e64) && // FIXME - OpToFold->isImm()) { + (OpToFold->isImm() || OpToFold->isFI() || OpToFold->isGlobal())) { MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); // Verify the other operand is a VGPR, otherwise we would violate the @@ -357,7 +390,10 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList, assert(MI->getOperand(1).isDef()); - int Op32 = AMDGPU::getVOPe32(Opc); + // Make sure to get the 32-bit version of the commuted opcode. 
+ unsigned MaybeCommutedOpc = MI->getOpcode(); + int Op32 = AMDGPU::getVOPe32(MaybeCommutedOpc); + FoldList.push_back(FoldCandidate(MI, CommuteOpNo, OpToFold, true, Op32)); return true; @@ -384,10 +420,75 @@ static bool isUseSafeToFold(const SIInstrInfo *TII, //return !MI.hasRegisterImplicitUseOperand(UseMO.getReg()); } +static bool tryToFoldACImm(const SIInstrInfo *TII, + const MachineOperand &OpToFold, + MachineInstr *UseMI, + unsigned UseOpIdx, + SmallVectorImpl<FoldCandidate> &FoldList) { + const MCInstrDesc &Desc = UseMI->getDesc(); + const MCOperandInfo *OpInfo = Desc.OpInfo; + if (!OpInfo || UseOpIdx >= Desc.getNumOperands()) + return false; + + uint8_t OpTy = OpInfo[UseOpIdx].OperandType; + if (OpTy < AMDGPU::OPERAND_REG_INLINE_AC_FIRST || + OpTy > AMDGPU::OPERAND_REG_INLINE_AC_LAST) + return false; + + if (OpToFold.isImm() && TII->isInlineConstant(OpToFold, OpTy)) { + UseMI->getOperand(UseOpIdx).ChangeToImmediate(OpToFold.getImm()); + return true; + } + + if (!OpToFold.isReg()) + return false; + + unsigned UseReg = OpToFold.getReg(); + if (!TargetRegisterInfo::isVirtualRegister(UseReg)) + return false; + + if (llvm::find_if(FoldList, [UseMI](const FoldCandidate &FC) { + return FC.UseMI == UseMI; }) != FoldList.end()) + return false; + + MachineRegisterInfo &MRI = UseMI->getParent()->getParent()->getRegInfo(); + const MachineInstr *Def = MRI.getUniqueVRegDef(UseReg); + if (!Def || !Def->isRegSequence()) + return false; + + int64_t Imm; + MachineOperand *Op; + for (unsigned I = 1, E = Def->getNumExplicitOperands(); I < E; I += 2) { + const MachineOperand &Sub = Def->getOperand(I); + if (!Sub.isReg() || Sub.getSubReg()) + return false; + MachineInstr *SubDef = MRI.getUniqueVRegDef(Sub.getReg()); + while (SubDef && !SubDef->isMoveImmediate() && + !SubDef->getOperand(1).isImm() && TII->isFoldableCopy(*SubDef)) + SubDef = MRI.getUniqueVRegDef(SubDef->getOperand(1).getReg()); + if (!SubDef || !SubDef->isMoveImmediate() || !SubDef->getOperand(1).isImm()) + return false; + Op = &SubDef->getOperand(1); + auto SubImm = Op->getImm(); + if (I == 1) { + if (!TII->isInlineConstant(SubDef->getOperand(1), OpTy)) + return false; + + Imm = SubImm; + continue; + } + if (Imm != SubImm) + return false; // Can only fold splat constants + } + + FoldList.push_back(FoldCandidate(UseMI, UseOpIdx, Op)); + return true; +} + void SIFoldOperands::foldOperand( MachineOperand &OpToFold, MachineInstr *UseMI, - unsigned UseOpIdx, + int UseOpIdx, SmallVectorImpl<FoldCandidate> &FoldList, SmallVectorImpl<MachineInstr *> &CopiesToReplace) const { const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx); @@ -420,11 +521,18 @@ void SIFoldOperands::foldOperand( unsigned RegSeqDstReg = UseMI->getOperand(0).getReg(); unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm(); + MachineRegisterInfo::use_iterator Next; for (MachineRegisterInfo::use_iterator RSUse = MRI->use_begin(RegSeqDstReg), RSE = MRI->use_end(); - RSUse != RSE; ++RSUse) { + RSUse != RSE; RSUse = Next) { + Next = std::next(RSUse); MachineInstr *RSUseMI = RSUse->getParent(); + + if (tryToFoldACImm(TII, UseMI->getOperand(0), RSUseMI, + RSUse.getOperandNo(), FoldList)) + continue; + if (RSUse->getSubReg() != RegSeqDstSubReg) continue; @@ -435,10 +543,32 @@ void SIFoldOperands::foldOperand( return; } + if (tryToFoldACImm(TII, OpToFold, UseMI, UseOpIdx, FoldList)) + return; - bool FoldingImm = OpToFold.isImm(); + if (frameIndexMayFold(TII, *UseMI, UseOpIdx, OpToFold)) { + // Sanity check that this is a stack access. 
+ // FIXME: Should probably use stack pseudos before frame lowering. + MachineOperand *SOff = TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset); + if (!SOff->isReg() || (SOff->getReg() != MFI->getScratchWaveOffsetReg() && + SOff->getReg() != MFI->getStackPtrOffsetReg())) + return; + + if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() != + MFI->getScratchRSrcReg()) + return; - if (FoldingImm && UseMI->isCopy()) { + // A frame index will resolve to a positive constant, so it should always be + // safe to fold the addressing mode, even pre-GFX9. + UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getIndex()); + SOff->setReg(MFI->getStackPtrOffsetReg()); + return; + } + + bool FoldingImmLike = + OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal(); + + if (FoldingImmLike && UseMI->isCopy()) { unsigned DestReg = UseMI->getOperand(0).getReg(); const TargetRegisterClass *DestRC = TargetRegisterInfo::isVirtualRegister(DestReg) ? @@ -449,7 +579,7 @@ void SIFoldOperands::foldOperand( if (TargetRegisterInfo::isVirtualRegister(DestReg) && TargetRegisterInfo::isVirtualRegister(SrcReg)) { const TargetRegisterClass * SrcRC = MRI->getRegClass(SrcReg); - if (TRI->isSGPRClass(SrcRC) && TRI->hasVGPRs(DestRC)) { + if (TRI->isSGPRClass(SrcRC) && TRI->hasVectorRegisters(DestRC)) { MachineRegisterInfo::use_iterator NextUse; SmallVector<FoldCandidate, 4> CopyUses; for (MachineRegisterInfo::use_iterator @@ -467,6 +597,14 @@ void SIFoldOperands::foldOperand( } } + if (DestRC == &AMDGPU::AGPR_32RegClass && + TII->isInlineConstant(OpToFold, AMDGPU::OPERAND_REG_INLINE_C_INT32)) { + UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32)); + UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm()); + CopiesToReplace.push_back(UseMI); + return; + } + // In order to fold immediates into copies, we need to change the // copy to a MOV. 
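The frameIndexMayFold path added above rewrites the scratch access in place: the frame index replaces the vaddr operand that a V_MOV previously materialized, and soffset is redirected to the stack pointer. A rough before/after sketch in MIR-style notation; %val and %rsrc are illustrative names and the trailing immediate operands are elided:

    %vaddr = V_MOV_B32 %stack.0
    BUFFER_STORE_DWORD_OFFEN %val, %vaddr, %rsrc, %scratch_wave_offset, ...
      =>
    BUFFER_STORE_DWORD_OFFEN %val, %stack.0, %rsrc, %sp, ...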
@@ -479,18 +617,71 @@ void SIFoldOperands::foldOperand( } else { if (UseMI->isCopy() && OpToFold.isReg() && TargetRegisterInfo::isVirtualRegister(UseMI->getOperand(0).getReg()) && - TargetRegisterInfo::isVirtualRegister(UseMI->getOperand(1).getReg()) && - TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) && - TRI->isVGPR(*MRI, UseMI->getOperand(1).getReg()) && + TRI->isVectorRegister(*MRI, UseMI->getOperand(0).getReg()) && + TRI->isVectorRegister(*MRI, UseMI->getOperand(1).getReg()) && !UseMI->getOperand(1).getSubReg()) { + unsigned Size = TII->getOpSize(*UseMI, 1); UseMI->getOperand(1).setReg(OpToFold.getReg()); UseMI->getOperand(1).setSubReg(OpToFold.getSubReg()); UseMI->getOperand(1).setIsKill(false); CopiesToReplace.push_back(UseMI); OpToFold.setIsKill(false); + if (Size != 4) + return; + if (TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) && + TRI->isVGPR(*MRI, UseMI->getOperand(1).getReg())) + UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32)); + else if (TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) && + TRI->isAGPR(*MRI, UseMI->getOperand(1).getReg())) + UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_READ_B32)); return; } + unsigned UseOpc = UseMI->getOpcode(); + if (UseOpc == AMDGPU::V_READFIRSTLANE_B32 || + (UseOpc == AMDGPU::V_READLANE_B32 && + (int)UseOpIdx == + AMDGPU::getNamedOperandIdx(UseOpc, AMDGPU::OpName::src0))) { + // %vgpr = V_MOV_B32 imm + // %sgpr = V_READFIRSTLANE_B32 %vgpr + // => + // %sgpr = S_MOV_B32 imm + if (FoldingImmLike) { + if (execMayBeModifiedBeforeUse(*MRI, + UseMI->getOperand(UseOpIdx).getReg(), + *OpToFold.getParent(), + *UseMI)) + return; + + UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32)); + + // FIXME: ChangeToImmediate should clear subreg + UseMI->getOperand(1).setSubReg(0); + if (OpToFold.isImm()) + UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm()); + else + UseMI->getOperand(1).ChangeToFrameIndex(OpToFold.getIndex()); + UseMI->RemoveOperand(2); // Remove exec read (or src1 for readlane) + return; + } + + if (OpToFold.isReg() && TRI->isSGPRReg(*MRI, OpToFold.getReg())) { + if (execMayBeModifiedBeforeUse(*MRI, + UseMI->getOperand(UseOpIdx).getReg(), + *OpToFold.getParent(), + *UseMI)) + return; + + // %vgpr = COPY %sgpr0 + // %sgpr1 = V_READFIRSTLANE_B32 %vgpr + // => + // %sgpr1 = COPY %sgpr0 + UseMI->setDesc(TII->get(AMDGPU::COPY)); + UseMI->RemoveOperand(2); // Remove exec read (or src1 for readlane) + return; + } + } + const MCInstrDesc &UseDesc = UseMI->getDesc(); // Don't fold into target independent nodes. Target independent opcodes @@ -501,7 +692,7 @@ void SIFoldOperands::foldOperand( return; } - if (!FoldingImm) { + if (!FoldingImmLike) { tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII); // FIXME: We could try to change the instruction from 64-bit to 32-bit @@ -515,14 +706,10 @@ void SIFoldOperands::foldOperand( const TargetRegisterClass *FoldRC = TRI->getRegClass(FoldDesc.OpInfo[0].RegClass); - // Split 64-bit constants into 32-bits for folding. if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(FoldRC->getID()) == 64) { unsigned UseReg = UseOp.getReg(); - const TargetRegisterClass *UseRC - = TargetRegisterInfo::isVirtualRegister(UseReg) ? 
- MRI->getRegClass(UseReg) : - TRI->getPhysRegClass(UseReg); + const TargetRegisterClass *UseRC = MRI->getRegClass(UseReg); if (AMDGPU::getRegBitWidth(UseRC->getID()) != 64) return; @@ -763,14 +950,23 @@ static bool tryFoldInst(const SIInstrInfo *TII, Opc == AMDGPU::V_CNDMASK_B64_PSEUDO) { const MachineOperand *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0); const MachineOperand *Src1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src1); - if (Src1->isIdenticalTo(*Src0)) { + int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers); + int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers); + if (Src1->isIdenticalTo(*Src0) && + (Src1ModIdx == -1 || !MI->getOperand(Src1ModIdx).getImm()) && + (Src0ModIdx == -1 || !MI->getOperand(Src0ModIdx).getImm())) { LLVM_DEBUG(dbgs() << "Folded " << *MI << " into "); + auto &NewDesc = + TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY : getMovOpc(false)); int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); if (Src2Idx != -1) MI->RemoveOperand(Src2Idx); MI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1)); - mutateCopyOp(*MI, TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY - : getMovOpc(false))); + if (Src1ModIdx != -1) + MI->RemoveOperand(Src1ModIdx); + if (Src0ModIdx != -1) + MI->RemoveOperand(Src0ModIdx); + mutateCopyOp(*MI, NewDesc); LLVM_DEBUG(dbgs() << *MI << '\n'); return true; } @@ -788,7 +984,7 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI, SmallVector<FoldCandidate, 4> FoldList; MachineOperand &Dst = MI.getOperand(0); - bool FoldingImm = OpToFold.isImm() || OpToFold.isFI(); + bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal(); if (FoldingImm) { unsigned NumLiteralUses = 0; MachineOperand *NonInlineUse = nullptr; @@ -840,6 +1036,9 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI, // in some cases. A better heuristic is needed. if (isInlineConstantIfFolded(TII, *UseMI, OpNo, OpToFold)) { foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace); + } else if (frameIndexMayFold(TII, *UseMI, OpNo, OpToFold)) { + foldOperand(OpToFold, UseMI, OpNo, FoldList, + CopiesToReplace); } else { if (++NumLiteralUses == 1) { NonInlineUse = &*Use; @@ -874,7 +1073,7 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI, Copy->addImplicitDefUseOperands(*MF); for (FoldCandidate &Fold : FoldList) { - if (updateOperand(Fold, *TII, *TRI)) { + if (updateOperand(Fold, *TII, *TRI, *ST)) { // Clear kill flags. if (Fold.isReg()) { assert(Fold.OpToFold && Fold.OpToFold->isReg()); @@ -926,7 +1125,8 @@ const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const { // Having a 0 op_sel_hi would require swizzling the output in the source // instruction, which we can't do. - unsigned UnsetMods = (Op == AMDGPU::V_PK_MAX_F16) ? SISrcMods::OP_SEL_1 : 0; + unsigned UnsetMods = (Op == AMDGPU::V_PK_MAX_F16) ? SISrcMods::OP_SEL_1 + : 0u; if (Src0Mods != UnsetMods && Src1Mods != UnsetMods) return nullptr; return Src0; @@ -1105,13 +1305,13 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { ST = &MF.getSubtarget<GCNSubtarget>(); TII = ST->getInstrInfo(); TRI = &TII->getRegisterInfo(); - - const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + MFI = MF.getInfo<SIMachineFunctionInfo>(); // omod is ignored by hardware if IEEE bit is enabled. omod also does not // correctly handle signed zeros. 
// - bool IsIEEEMode = ST->enableIEEEBit(MF); + // FIXME: Also need to check strictfp + bool IsIEEEMode = MFI->getMode().IEEE; bool HasNSZ = MFI->hasNoSignedZerosFPMath(); for (MachineBasicBlock *MBB : depth_first(&MF)) { @@ -1132,7 +1332,8 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { } MachineOperand &OpToFold = MI.getOperand(1); - bool FoldingImm = OpToFold.isImm() || OpToFold.isFI(); + bool FoldingImm = + OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal(); // FIXME: We could also be folding things like TargetIndexes. if (!FoldingImm && !OpToFold.isReg()) diff --git a/lib/Target/AMDGPU/SIFormMemoryClauses.cpp b/lib/Target/AMDGPU/SIFormMemoryClauses.cpp index aa976d5141f8..f3c9ad63a80a 100644 --- a/lib/Target/AMDGPU/SIFormMemoryClauses.cpp +++ b/lib/Target/AMDGPU/SIFormMemoryClauses.cpp @@ -1,9 +1,8 @@ //===-- SIFormMemoryClauses.cpp -------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -119,6 +118,17 @@ static bool isValidClauseInst(const MachineInstr &MI, bool IsVMEMClause) { return false; if (!IsVMEMClause && !isSMEMClauseInst(MI)) return false; + // If this is a load instruction where the result has been coalesced with an operand, then we cannot clause it. + for (const MachineOperand &ResMO : MI.defs()) { + unsigned ResReg = ResMO.getReg(); + for (const MachineOperand &MO : MI.uses()) { + if (!MO.isReg() || MO.isDef()) + continue; + if (MO.getReg() == ResReg) + return false; + } + break; // Only check the first def. + } return true; } @@ -309,6 +319,8 @@ bool SIFormMemoryClauses::runOnMachineFunction(MachineFunction &MF) { MaxVGPRs = TRI->getAllocatableSet(MF, &AMDGPU::VGPR_32RegClass).count(); MaxSGPRs = TRI->getAllocatableSet(MF, &AMDGPU::SGPR_32RegClass).count(); + unsigned FuncMaxClause = AMDGPU::getIntegerAttribute( + MF.getFunction(), "amdgpu-max-memory-clause", MaxClause); for (MachineBasicBlock &MBB : MF) { MachineBasicBlock::instr_iterator Next; @@ -329,7 +341,7 @@ bool SIFormMemoryClauses::runOnMachineFunction(MachineFunction &MF) { continue; unsigned Length = 1; - for ( ; Next != E && Length < MaxClause; ++Next) { + for ( ; Next != E && Length < FuncMaxClause; ++Next) { if (!isValidClauseInst(*Next, IsVMEM)) break; diff --git a/lib/Target/AMDGPU/SIFrameLowering.cpp b/lib/Target/AMDGPU/SIFrameLowering.cpp index e4633c88e18f..feab6bed2603 100644 --- a/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -1,9 +1,8 @@ //===----------------------- SIFrameLowering.cpp --------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //==-----------------------------------------------------------------------===//
@@ -22,6 +21,8 @@
 using namespace llvm;
+#define DEBUG_TYPE "frame-info"
+
 static ArrayRef<MCPhysReg> getAllSGPR128(const GCNSubtarget &ST,
                                          const MachineFunction &MF) {
@@ -35,6 +36,150 @@ static ArrayRef<MCPhysReg> getAllSGPRs(const GCNSubtarget &ST,
                      ST.getMaxNumSGPRs(MF));
 }
+// Find a scratch register that we can use at the start of the prologue to
+// re-align the stack pointer. We avoid using callee-save registers since they
+// may appear to be free when this is called from canUseAsPrologue (during
+// shrink wrapping), but then no longer be free when this is called from
+// emitPrologue.
+//
+// FIXME: This is a bit conservative, since in the above case we could use one
+// of the callee-save registers as a scratch temp to re-align the stack pointer,
+// but we would then have to make sure that we were in fact saving at least one
+// callee-save register in the prologue, which is additional complexity that
+// doesn't seem worth the benefit.
+static unsigned findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI,
+                                                 LivePhysRegs &LiveRegs,
+                                                 const TargetRegisterClass &RC,
+                                                 bool Unused = false) {
+  // Mark callee saved registers as used so we will not choose them.
+  const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
+  for (unsigned i = 0; CSRegs[i]; ++i)
+    LiveRegs.addReg(CSRegs[i]);
+
+  if (Unused) {
+    // We are looking for a register that can be used throughout the entire
+    // function, so any use is unacceptable.
+    for (unsigned Reg : RC) {
+      if (!MRI.isPhysRegUsed(Reg) && LiveRegs.available(MRI, Reg))
+        return Reg;
+    }
+  } else {
+    for (unsigned Reg : RC) {
+      if (LiveRegs.available(MRI, Reg))
+        return Reg;
+    }
+  }
+
+  // When an unused register is required, the caller treats failure as an
+  // option and has an alternative plan. In all other contexts this must
+  // succeed.
+  if (!Unused)
+    report_fatal_error("failed to find free scratch register");
+
+  return AMDGPU::NoRegister;
+}
+
+static MCPhysReg findUnusedSGPRNonCalleeSaved(MachineRegisterInfo &MRI) {
+  LivePhysRegs LiveRegs;
+  LiveRegs.init(*MRI.getTargetRegisterInfo());
+  return findScratchNonCalleeSaveRegister(
+    MRI, LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass, true);
+}
+
+// We need to emit these stack operations specially because the prologue and
+// epilogue use a different frame register than the rest of the function (the
+// one getFrameRegister would return).
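Both helpers below hinge on the same encoding question: a scratch offset that fits the MUBUF instruction's unsigned 12-bit immediate field can use the _OFFSET form directly, while anything larger must first be materialized into a free VGPR and addressed with the _OFFEN form. A minimal, self-contained sketch of that check (fitsInMUBUFImmField is an illustrative name, not part of the patch; the cutoff values follow from the field width):

    #include "llvm/Support/MathExtras.h"
    #include <cassert>

    // A 12-bit unsigned field encodes byte offsets 0..4095 inclusive.
    static bool fitsInMUBUFImmField(int64_t Offset) {
      return llvm::isUInt<12>(Offset);
    }

    int main() {
      assert(fitsInMUBUFImmField(4095));   // largest directly encodable offset
      assert(!fitsInMUBUFImmField(4096));  // falls back to V_MOV_B32 + _OFFEN
    }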
+static void buildPrologSpill(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + const SIInstrInfo *TII, unsigned SpillReg, + unsigned ScratchRsrcReg, unsigned SPReg, int FI) { + MachineFunction *MF = MBB.getParent(); + MachineFrameInfo &MFI = MF->getFrameInfo(); + + int64_t Offset = MFI.getObjectOffset(FI); + + MachineMemOperand *MMO = MF->getMachineMemOperand( + MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore, 4, + MFI.getObjectAlignment(FI)); + + if (isUInt<12>(Offset)) { + BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFSET)) + .addReg(SpillReg, RegState::Kill) + .addReg(ScratchRsrcReg) + .addReg(SPReg) + .addImm(Offset) + .addImm(0) // glc + .addImm(0) // slc + .addImm(0) // tfe + .addImm(0) // dlc + .addMemOperand(MMO); + return; + } + + MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister( + MF->getRegInfo(), LiveRegs, AMDGPU::VGPR_32RegClass); + + BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), OffsetReg) + .addImm(Offset); + + BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFEN)) + .addReg(SpillReg, RegState::Kill) + .addReg(OffsetReg, RegState::Kill) + .addReg(ScratchRsrcReg) + .addReg(SPReg) + .addImm(0) + .addImm(0) // glc + .addImm(0) // slc + .addImm(0) // tfe + .addImm(0) // dlc + .addMemOperand(MMO); +} + +static void buildEpilogReload(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + const SIInstrInfo *TII, unsigned SpillReg, + unsigned ScratchRsrcReg, unsigned SPReg, int FI) { + MachineFunction *MF = MBB.getParent(); + MachineFrameInfo &MFI = MF->getFrameInfo(); + int64_t Offset = MFI.getObjectOffset(FI); + + MachineMemOperand *MMO = MF->getMachineMemOperand( + MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad, 4, + MFI.getObjectAlignment(FI)); + + if (isUInt<12>(Offset)) { + BuildMI(MBB, I, DebugLoc(), + TII->get(AMDGPU::BUFFER_LOAD_DWORD_OFFSET), SpillReg) + .addReg(ScratchRsrcReg) + .addReg(SPReg) + .addImm(Offset) + .addImm(0) // glc + .addImm(0) // slc + .addImm(0) // tfe + .addImm(0) // dlc + .addMemOperand(MMO); + return; + } + + MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister( + MF->getRegInfo(), LiveRegs, AMDGPU::VGPR_32RegClass); + + BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), OffsetReg) + .addImm(Offset); + + BuildMI(MBB, I, DebugLoc(), + TII->get(AMDGPU::BUFFER_LOAD_DWORD_OFFEN), SpillReg) + .addReg(OffsetReg, RegState::Kill) + .addReg(ScratchRsrcReg) + .addReg(SPReg) + .addImm(0) + .addImm(0) // glc + .addImm(0) // slc + .addImm(0) // tfe + .addImm(0) // dlc + .addMemOperand(MMO); +} + void SIFrameLowering::emitFlatScratchInit(const GCNSubtarget &ST, MachineFunction &MF, MachineBasicBlock &MBB) const { @@ -71,6 +216,24 @@ void SIFrameLowering::emitFlatScratchInit(const GCNSubtarget &ST, // Do a 64-bit pointer add. if (ST.flatScratchIsPointer()) { + if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) { + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo) + .addReg(FlatScrInitLo) + .addReg(ScratchWaveOffsetReg); + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), FlatScrInitHi) + .addReg(FlatScrInitHi) + .addImm(0); + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)). + addReg(FlatScrInitLo). + addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_LO | + (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_))); + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)). + addReg(FlatScrInitHi). 
+ addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_HI | + (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_))); + return; + } + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO) .addReg(FlatScrInitLo) .addReg(ScratchWaveOffsetReg); @@ -81,6 +244,8 @@ void SIFrameLowering::emitFlatScratchInit(const GCNSubtarget &ST, return; } + assert(ST.getGeneration() < AMDGPUSubtarget::GFX10); + // Copy the size in bytes. BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO) .addReg(FlatScrInitHi, RegState::Kill); @@ -145,34 +310,30 @@ unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg( return ScratchRsrcReg; } -// Shift down registers reserved for the scratch wave offset and stack pointer -// SGPRs. -std::pair<unsigned, unsigned> +// Shift down registers reserved for the scratch wave offset. +std::pair<unsigned, bool> SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg( - const GCNSubtarget &ST, - const SIInstrInfo *TII, - const SIRegisterInfo *TRI, - SIMachineFunctionInfo *MFI, - MachineFunction &MF) const { + const GCNSubtarget &ST, const SIInstrInfo *TII, const SIRegisterInfo *TRI, + SIMachineFunctionInfo *MFI, MachineFunction &MF) const { MachineRegisterInfo &MRI = MF.getRegInfo(); unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg(); + assert(MFI->isEntryFunction()); + // No replacement necessary. if (ScratchWaveOffsetReg == AMDGPU::NoRegister || - !MRI.isPhysRegUsed(ScratchWaveOffsetReg)) { - assert(MFI->getStackPtrOffsetReg() == AMDGPU::SP_REG); - return std::make_pair(AMDGPU::NoRegister, AMDGPU::NoRegister); + (!hasFP(MF) && !MRI.isPhysRegUsed(ScratchWaveOffsetReg))) { + return std::make_pair(AMDGPU::NoRegister, false); } - unsigned SPReg = MFI->getStackPtrOffsetReg(); if (ST.hasSGPRInitBug()) - return std::make_pair(ScratchWaveOffsetReg, SPReg); + return std::make_pair(ScratchWaveOffsetReg, false); unsigned NumPreloaded = MFI->getNumPreloadedSGPRs(); ArrayRef<MCPhysReg> AllSGPRs = getAllSGPRs(ST, MF); if (NumPreloaded > AllSGPRs.size()) - return std::make_pair(ScratchWaveOffsetReg, SPReg); + return std::make_pair(ScratchWaveOffsetReg, false); AllSGPRs = AllSGPRs.slice(NumPreloaded); @@ -193,10 +354,11 @@ SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg( unsigned ReservedRegCount = 13; if (AllSGPRs.size() < ReservedRegCount) - return std::make_pair(ScratchWaveOffsetReg, SPReg); + return std::make_pair(ScratchWaveOffsetReg, false); bool HandledScratchWaveOffsetReg = ScratchWaveOffsetReg != TRI->reservedPrivateSegmentWaveByteOffsetReg(MF); + bool FPAdjusted = false; for (MCPhysReg Reg : AllSGPRs.drop_back(ReservedRegCount)) { // Pick the first unallocated SGPR. Be careful not to pick an alias of the @@ -206,24 +368,25 @@ SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg( HandledScratchWaveOffsetReg = true; MRI.replaceRegWith(ScratchWaveOffsetReg, Reg); + if (MFI->getScratchWaveOffsetReg() == MFI->getStackPtrOffsetReg()) { + assert(!hasFP(MF)); + MFI->setStackPtrOffsetReg(Reg); + } + MFI->setScratchWaveOffsetReg(Reg); + MFI->setFrameOffsetReg(Reg); ScratchWaveOffsetReg = Reg; + FPAdjusted = true; break; } } } - return std::make_pair(ScratchWaveOffsetReg, SPReg); + return std::make_pair(ScratchWaveOffsetReg, FPAdjusted); } void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { - // Emit debugger prologue if "amdgpu-debugger-emit-prologue" attribute was - // specified. 
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - if (ST.debuggerEmitPrologue()) - emitDebuggerPrologue(MF, MBB); - assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported"); SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); @@ -234,6 +397,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, // FIXME: We should be cleaning up these unused SGPR spill frame indices // somewhere. + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo *TRI = &TII->getRegisterInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -251,38 +415,13 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, if (MFI->hasFlatScratchInit()) emitFlatScratchInit(ST, MF, MBB); - unsigned SPReg = MFI->getStackPtrOffsetReg(); - if (SPReg != AMDGPU::SP_REG) { - assert(MRI.isReserved(SPReg) && "SPReg used but not reserved"); - - DebugLoc DL; - const MachineFrameInfo &FrameInfo = MF.getFrameInfo(); - int64_t StackSize = FrameInfo.getStackSize(); - - if (StackSize == 0) { - BuildMI(MBB, MBB.begin(), DL, TII->get(AMDGPU::COPY), SPReg) - .addReg(MFI->getScratchWaveOffsetReg()); - } else { - BuildMI(MBB, MBB.begin(), DL, TII->get(AMDGPU::S_ADD_U32), SPReg) - .addReg(MFI->getScratchWaveOffsetReg()) - .addImm(StackSize * ST.getWavefrontSize()); - } - } - unsigned ScratchRsrcReg = getReservedPrivateSegmentBufferReg(ST, TII, TRI, MFI, MF); unsigned ScratchWaveOffsetReg; - std::tie(ScratchWaveOffsetReg, SPReg) - = getReservedPrivateSegmentWaveByteOffsetReg(ST, TII, TRI, MFI, MF); - - // It's possible to have uses of only ScratchWaveOffsetReg without - // ScratchRsrcReg if it's only used for the initialization of flat_scratch, - // but the inverse is not true. - if (ScratchWaveOffsetReg == AMDGPU::NoRegister) { - assert(ScratchRsrcReg == AMDGPU::NoRegister); - return; - } + bool FPAdjusted; + std::tie(ScratchWaveOffsetReg, FPAdjusted) = + getReservedPrivateSegmentWaveByteOffsetReg(ST, TII, TRI, MFI, MF); // We need to insert initialization of the scratch resource descriptor. unsigned PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg( @@ -294,18 +433,19 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER); } - bool OffsetRegUsed = MRI.isPhysRegUsed(ScratchWaveOffsetReg); + bool OffsetRegUsed = ScratchWaveOffsetReg != AMDGPU::NoRegister && + MRI.isPhysRegUsed(ScratchWaveOffsetReg); bool ResourceRegUsed = ScratchRsrcReg != AMDGPU::NoRegister && MRI.isPhysRegUsed(ScratchRsrcReg); + // FIXME: Hack to not crash in situations which emitted an error. + if (PreloadedScratchWaveOffsetReg == AMDGPU::NoRegister) + return; + // We added live-ins during argument lowering, but since they were not used // they were deleted. We're adding the uses now, so add them back. 
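One detail worth calling out before the hunk below: a preloaded physical register has to be recorded in two places, since MachineRegisterInfo keeps the function-level live-in list while the machine verifier checks the entry block's own live-in set. A minimal sketch of the pattern, using a hypothetical helper name:

    #include "llvm/CodeGen/MachineBasicBlock.h"
    #include "llvm/CodeGen/MachineFunction.h"
    #include "llvm/CodeGen/MachineRegisterInfo.h"

    // Hypothetical helper (not part of the patch): registers Reg as live into
    // the function and into its entry block, mirroring the paired addLiveIn
    // calls in the hunk below.
    static void addEntryLiveIn(llvm::MachineFunction &MF, unsigned Reg) {
      MF.getRegInfo().addLiveIn(Reg); // function-level live-in list
      MF.front().addLiveIn(Reg);      // entry-block live-in set (verifier-checked)
    }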
- if (OffsetRegUsed) { - assert(PreloadedScratchWaveOffsetReg != AMDGPU::NoRegister && - "scratch wave offset input is required"); - MRI.addLiveIn(PreloadedScratchWaveOffsetReg); - MBB.addLiveIn(PreloadedScratchWaveOffsetReg); - } + MRI.addLiveIn(PreloadedScratchWaveOffsetReg); + MBB.addLiveIn(PreloadedScratchWaveOffsetReg); if (ResourceRegUsed && PreloadedPrivateBufferReg != AMDGPU::NoRegister) { assert(ST.isAmdHsaOrMesa(F) || ST.isMesaGfxShader(F)); @@ -318,7 +458,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, if (&OtherBB == &MBB) continue; - if (OffsetRegUsed) + if (OffsetRegUsed || FPAdjusted) OtherBB.addLiveIn(ScratchWaveOffsetReg); if (ResourceRegUsed) @@ -346,11 +486,16 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, .addReg(PreloadedPrivateBufferReg, RegState::Kill); } - if (OffsetRegUsed && - PreloadedScratchWaveOffsetReg != ScratchWaveOffsetReg) { + unsigned SPReg = MFI->getStackPtrOffsetReg(); + assert(SPReg != AMDGPU::SP_REG); + + // FIXME: Remove the isPhysRegUsed checks + const bool HasFP = hasFP(MF); + + if (HasFP || OffsetRegUsed) { + assert(ScratchWaveOffsetReg); BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg) - .addReg(PreloadedScratchWaveOffsetReg, - MRI.isPhysRegUsed(ScratchWaveOffsetReg) ? 0 : RegState::Kill); + .addReg(PreloadedScratchWaveOffsetReg, HasFP ? RegState::Kill : 0); } if (CopyBuffer && !CopyBufferFirst) { @@ -358,9 +503,26 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, .addReg(PreloadedPrivateBufferReg, RegState::Kill); } - if (ResourceRegUsed) + if (ResourceRegUsed) { emitEntryFunctionScratchSetup(ST, MF, MBB, MFI, I, PreloadedPrivateBufferReg, ScratchRsrcReg); + } + + if (HasFP) { + DebugLoc DL; + const MachineFrameInfo &FrameInfo = MF.getFrameInfo(); + int64_t StackSize = FrameInfo.getStackSize(); + + // On kernel entry, the private scratch wave offset is the SP value. + if (StackSize == 0) { + BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), SPReg) + .addReg(MFI->getScratchWaveOffsetReg()); + } else { + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), SPReg) + .addReg(MFI->getScratchWaveOffsetReg()) + .addImm(StackSize * ST.getWavefrontSize()); + } + } } // Emit scratch setup code for AMDPAL or Mesa, assuming ResourceRegUsed is set. @@ -405,7 +567,7 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST, } } MF.getRegInfo().addLiveIn(GitPtrLo); - MF.front().addLiveIn(GitPtrLo); + MBB.addLiveIn(GitPtrLo); BuildMI(MBB, I, DL, SMovB32, RsrcLo) .addReg(GitPtrLo) .addReg(ScratchRsrcReg, RegState::ImplicitDefine); @@ -421,12 +583,15 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable, - 0, 0); + 16, 4); unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ? 
16 : 0; + const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>(); + unsigned EncodedOffset = AMDGPU::getSMRDEncodedOffset(Subtarget, Offset); BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg) .addReg(Rsrc01) - .addImm(Offset) // offset + .addImm(EncodedOffset) // offset .addImm(0) // glc + .addImm(0) // dlc .addReg(ScratchRsrcReg, RegState::ImplicitDefine) .addMemOperand(MMO); return; @@ -462,13 +627,17 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable, - 0, 0); + 8, 4); BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01) .addReg(MFI->getImplicitBufferPtrUserSGPR()) .addImm(0) // offset .addImm(0) // glc + .addImm(0) // dlc .addMemOperand(MMO) .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + + MF.getRegInfo().addLiveIn(MFI->getImplicitBufferPtrUserSGPR()); + MBB.addLiveIn(MFI->getImplicitBufferPtrUserSGPR()); } } else { unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); @@ -494,38 +663,14 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST, } } -// Find a scratch register that we can use at the start of the prologue to -// re-align the stack pointer. We avoid using callee-save registers since they -// may appear to be free when this is called from canUseAsPrologue (during -// shrink wrapping), but then no longer be free when this is called from -// emitPrologue. -// -// FIXME: This is a bit conservative, since in the above case we could use one -// of the callee-save registers as a scratch temp to re-align the stack pointer, -// but we would then have to make sure that we were in fact saving at least one -// callee-save register in the prologue, which is additional complexity that -// doesn't seem worth the benefit. -static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock &MBB) { - MachineFunction *MF = MBB.getParent(); - - const GCNSubtarget &Subtarget = MF->getSubtarget<GCNSubtarget>(); - const SIRegisterInfo &TRI = *Subtarget.getRegisterInfo(); - LivePhysRegs LiveRegs(TRI); - LiveRegs.addLiveIns(MBB); - - // Mark callee saved registers as used so we will not choose them. - const MCPhysReg *CSRegs = TRI.getCalleeSavedRegs(MF); - for (unsigned i = 0; CSRegs[i]; ++i) - LiveRegs.addReg(CSRegs[i]); - - MachineRegisterInfo &MRI = MF->getRegInfo(); - - for (unsigned Reg : AMDGPU::SReg_32_XM0RegClass) { - if (LiveRegs.available(MRI, Reg)) - return Reg; +bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const { + switch (ID) { + case TargetStackID::Default: + case TargetStackID::NoAlloc: + case TargetStackID::SGPRSpill: + return true; } - - return AMDGPU::NoRegister; + llvm_unreachable("Invalid TargetStackID::Value"); } void SIFrameLowering::emitPrologue(MachineFunction &MF, @@ -537,31 +682,105 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, } const MachineFrameInfo &MFI = MF.getFrameInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo &TRI = TII->getRegisterInfo(); unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg(); unsigned FramePtrReg = FuncInfo->getFrameOffsetReg(); + LivePhysRegs LiveRegs; MachineBasicBlock::iterator MBBI = MBB.begin(); DebugLoc DL; - // XXX - Is this the right predicate? 
- - bool NeedFP = hasFP(MF); + bool HasFP = false; uint32_t NumBytes = MFI.getStackSize(); uint32_t RoundedSize = NumBytes; - const bool NeedsRealignment = TRI.needsStackRealignment(MF); + // To avoid clobbering VGPRs in lanes that weren't active on function entry, + // turn on all lanes before doing the spill to memory. + unsigned ScratchExecCopy = AMDGPU::NoRegister; + + // Emit the copy if we need an FP, and are using a free SGPR to save it. + if (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister) { + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FuncInfo->SGPRForFPSaveRestoreCopy) + .addReg(FramePtrReg) + .setMIFlag(MachineInstr::FrameSetup); + } + + for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg + : FuncInfo->getSGPRSpillVGPRs()) { + if (!Reg.FI.hasValue()) + continue; + + if (ScratchExecCopy == AMDGPU::NoRegister) { + if (LiveRegs.empty()) { + LiveRegs.init(TRI); + LiveRegs.addLiveIns(MBB); + if (FuncInfo->SGPRForFPSaveRestoreCopy) + LiveRegs.removeReg(FuncInfo->SGPRForFPSaveRestoreCopy); + } + + ScratchExecCopy + = findScratchNonCalleeSaveRegister(MRI, LiveRegs, + *TRI.getWaveMaskRegClass()); + assert(FuncInfo->SGPRForFPSaveRestoreCopy != ScratchExecCopy); + + const unsigned OrSaveExec = ST.isWave32() ? + AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64; + BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), + ScratchExecCopy) + .addImm(-1); + } - if (NeedsRealignment) { - assert(NeedFP); + buildPrologSpill(LiveRegs, MBB, MBBI, TII, Reg.VGPR, + FuncInfo->getScratchRSrcReg(), + StackPtrReg, + Reg.FI.getValue()); + } + + if (ScratchExecCopy != AMDGPU::NoRegister) { + // FIXME: Split block and make terminator. + unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; + unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; + BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec) + .addReg(ScratchExecCopy, RegState::Kill); + LiveRegs.addReg(ScratchExecCopy); + } + + + if (FuncInfo->FramePointerSaveIndex) { + const int FI = FuncInfo->FramePointerSaveIndex.getValue(); + assert(!MFI.isDeadObjectIndex(FI) && + MFI.getStackID(FI) == TargetStackID::SGPRSpill); + ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill + = FuncInfo->getSGPRToVGPRSpills(FI); + assert(Spill.size() == 1); + + // Save FP before setting it up. + // FIXME: This should respect spillSGPRToVGPR; + BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32), + Spill[0].VGPR) + .addReg(FramePtrReg) + .addImm(Spill[0].Lane) + .addReg(Spill[0].VGPR, RegState::Undef); + } + + if (TRI.needsStackRealignment(MF)) { + HasFP = true; const unsigned Alignment = MFI.getMaxAlignment(); RoundedSize += Alignment; + if (LiveRegs.empty()) { + LiveRegs.init(TRI); + LiveRegs.addLiveIns(MBB); + LiveRegs.addReg(FuncInfo->SGPRForFPSaveRestoreCopy); + } - unsigned ScratchSPReg = findScratchNonCalleeSaveRegister(MBB); - assert(ScratchSPReg != AMDGPU::NoRegister); + unsigned ScratchSPReg = findScratchNonCalleeSaveRegister( + MRI, LiveRegs, AMDGPU::SReg_32_XM0RegClass); + assert(ScratchSPReg != AMDGPU::NoRegister && + ScratchSPReg != FuncInfo->SGPRForFPSaveRestoreCopy); // s_add_u32 tmp_reg, s32, NumBytes // s_and_b32 s32, tmp_reg, 0b111...0000 @@ -574,7 +793,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, .addImm(-Alignment * ST.getWavefrontSize()) .setMIFlag(MachineInstr::FrameSetup); FuncInfo->setIsStackRealigned(true); - } else if (NeedFP) { + } else if ((HasFP = hasFP(MF))) { // If we need a base pointer, set it up here. It's whatever the value of // the stack pointer is at this point. 
Any variable size objects will be
    // allocated after this, so we can still use the base pointer to reference
    // locals.
@@ -584,21 +803,20 @@
       .setMIFlag(MachineInstr::FrameSetup);
   }
 
-  if (RoundedSize != 0 && hasSP(MF)) {
+  if (HasFP && RoundedSize != 0) {
     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg)
       .addReg(StackPtrReg)
       .addImm(RoundedSize * ST.getWavefrontSize())
       .setMIFlag(MachineInstr::FrameSetup);
   }
 
-  for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
-       : FuncInfo->getSGPRSpillVGPRs()) {
-    if (!Reg.FI.hasValue())
-      continue;
-    TII->storeRegToStackSlot(MBB, MBBI, Reg.VGPR, true,
-                             Reg.FI.getValue(), &AMDGPU::VGPR_32RegClass,
-                             &TII->getRegisterInfo());
-  }
+  assert((!HasFP || (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister ||
+                     FuncInfo->FramePointerSaveIndex)) &&
+         "Needed to save FP but didn't save it anywhere");
+
+  assert((HasFP || (FuncInfo->SGPRForFPSaveRestoreCopy == AMDGPU::NoRegister &&
+                    !FuncInfo->FramePointerSaveIndex)) &&
+         "Saved FP but didn't need it");
 }
 
 void SIFrameLowering::emitEpilogue(MachineFunction &MF,
@@ -609,39 +827,87 @@
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
   MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
+  LivePhysRegs LiveRegs;
+  DebugLoc DL;
+
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  uint32_t NumBytes = MFI.getStackSize();
+  uint32_t RoundedSize = FuncInfo->isStackRealigned() ?
+    NumBytes + MFI.getMaxAlignment() : NumBytes;
+
+  if (RoundedSize != 0 && hasFP(MF)) {
+    const unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
+    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg)
+      .addReg(StackPtrReg)
+      .addImm(RoundedSize * ST.getWavefrontSize())
+      .setMIFlag(MachineInstr::FrameDestroy);
+  }
+
+  if (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister) {
+    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FuncInfo->getFrameOffsetReg())
+      .addReg(FuncInfo->SGPRForFPSaveRestoreCopy)
+      .setMIFlag(MachineInstr::FrameDestroy);
+  }
+
+  if (FuncInfo->FramePointerSaveIndex) {
+    const int FI = FuncInfo->FramePointerSaveIndex.getValue();
+
+    assert(!MF.getFrameInfo().isDeadObjectIndex(FI) &&
+           MF.getFrameInfo().getStackID(FI) == TargetStackID::SGPRSpill);
+
+    ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill
+      = FuncInfo->getSGPRToVGPRSpills(FI);
+    assert(Spill.size() == 1);
+    BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
+            FuncInfo->getFrameOffsetReg())
+      .addReg(Spill[0].VGPR)
+      .addImm(Spill[0].Lane);
+  }
+
   unsigned ScratchExecCopy = AMDGPU::NoRegister;
   for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
        : FuncInfo->getSGPRSpillVGPRs()) {
     if (!Reg.FI.hasValue())
       continue;
-    TII->loadRegFromStackSlot(MBB, MBBI, Reg.VGPR,
-                              Reg.FI.getValue(), &AMDGPU::VGPR_32RegClass,
-                              &TII->getRegisterInfo());
-  }
-  unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
-  if (StackPtrReg == AMDGPU::NoRegister)
-    return;
+    const SIRegisterInfo &TRI = TII->getRegisterInfo();
+    if (ScratchExecCopy == AMDGPU::NoRegister) {
+      // See emitPrologue
+      if (LiveRegs.empty()) {
+        LiveRegs.init(*ST.getRegisterInfo());
+        LiveRegs.addLiveOuts(MBB);
+        LiveRegs.stepBackward(*MBBI);
+      }
-  const MachineFrameInfo &MFI = MF.getFrameInfo();
-  uint32_t NumBytes = MFI.getStackSize();
+      ScratchExecCopy = findScratchNonCalleeSaveRegister(
+        MRI, LiveRegs, *TRI.getWaveMaskRegClass());
+
LiveRegs.removeReg(ScratchExecCopy); - DebugLoc DL; + const unsigned OrSaveExec = + ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64; - // FIXME: Clarify distinction between no set SP and SP. For callee functions, - // it's really whether we need SP to be accurate or not. + BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), ScratchExecCopy) + .addImm(-1); + } - if (NumBytes != 0 && hasSP(MF)) { - uint32_t RoundedSize = FuncInfo->isStackRealigned() ? - NumBytes + MFI.getMaxAlignment() : NumBytes; + buildEpilogReload(LiveRegs, MBB, MBBI, TII, Reg.VGPR, + FuncInfo->getScratchRSrcReg(), + FuncInfo->getStackPtrOffsetReg(), Reg.FI.getValue()); + } - BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg) - .addReg(StackPtrReg) - .addImm(RoundedSize * ST.getWavefrontSize()); + if (ScratchExecCopy != AMDGPU::NoRegister) { + // FIXME: Split block and make terminator. + unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; + unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; + BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec) + .addReg(ScratchExecCopy, RegState::Kill); } } +// Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not +// memory. They should have been removed by now. static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) { for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd(); I != E; ++I) { @@ -652,6 +918,22 @@ static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) { return true; } +#ifndef NDEBUG +static bool allSGPRSpillsAreDead(const MachineFrameInfo &MFI, + Optional<int> FramePointerSaveIndex) { + for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd(); + I != E; ++I) { + if (!MFI.isDeadObjectIndex(I) && + MFI.getStackID(I) == TargetStackID::SGPRSpill && + FramePointerSaveIndex && I != FramePointerSaveIndex) { + return false; + } + } + + return true; +} +#endif + int SIFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg) const { const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo(); @@ -665,81 +947,145 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized( RegScavenger *RS) const { MachineFrameInfo &MFI = MF.getFrameInfo(); - if (!MFI.hasStackObjects()) - return; - const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - const SIInstrInfo *TII = ST.getInstrInfo(); - const SIRegisterInfo &TRI = TII->getRegisterInfo(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); - bool AllSGPRSpilledToVGPRs = false; - - if (TRI.spillSGPRToVGPR() && FuncInfo->hasSpilledSGPRs()) { - AllSGPRSpilledToVGPRs = true; - - // Process all SGPR spills before frame offsets are finalized. Ideally SGPRs - // are spilled to VGPRs, in which case we can eliminate the stack usage. - // - // XXX - This operates under the assumption that only other SGPR spills are - // users of the frame index. I'm not 100% sure this is correct. The - // StackColoring pass has a comment saying a future improvement would be to - // merging of allocas with spill slots, but for now according to - // MachineFrameInfo isSpillSlot can't alias any other object. 
- for (MachineBasicBlock &MBB : MF) { - MachineBasicBlock::iterator Next; - for (auto I = MBB.begin(), E = MBB.end(); I != E; I = Next) { - MachineInstr &MI = *I; - Next = std::next(I); - - if (TII->isSGPRSpill(MI)) { - int FI = TII->getNamedOperand(MI, AMDGPU::OpName::addr)->getIndex(); - assert(MFI.getStackID(FI) == SIStackID::SGPR_SPILL); - if (FuncInfo->allocateSGPRSpillToVGPR(MF, FI)) { - bool Spilled = TRI.eliminateSGPRToVGPRSpillFrameIndex(MI, FI, RS); - (void)Spilled; - assert(Spilled && "failed to spill SGPR to VGPR when allocated"); - } else - AllSGPRSpilledToVGPRs = false; - } - } - } - FuncInfo->removeSGPRToVGPRFrameIndices(MFI); - } + FuncInfo->removeDeadFrameIndices(MFI); + assert(allSGPRSpillsAreDead(MFI, None) && + "SGPR spill should have been removed in SILowerSGPRSpills"); // FIXME: The other checks should be redundant with allStackObjectsAreDead, // but currently hasNonSpillStackObjects is set only from source // allocas. Stack temps produced from legalization are not counted currently. - if (FuncInfo->hasNonSpillStackObjects() || FuncInfo->hasSpilledVGPRs() || - !AllSGPRSpilledToVGPRs || !allStackObjectsAreDead(MFI)) { + if (!allStackObjectsAreDead(MFI)) { assert(RS && "RegScavenger required if spilling"); - // We force this to be at offset 0 so no user object ever has 0 as an - // address, so we may use 0 as an invalid pointer value. This is because - // LLVM assumes 0 is an invalid pointer in address space 0. Because alloca - // is required to be address space 0, we are forced to accept this for - // now. Ideally we could have the stack in another address space with 0 as a - // valid pointer, and -1 as the null value. - // - // This will also waste additional space when user stack objects require > 4 - // byte alignment. - // - // The main cost here is losing the offset for addressing modes. However - // this also ensures we shouldn't need a register for the offset when - // emergency scavenging. - int ScavengeFI = MFI.CreateFixedObject( - TRI.getSpillSize(AMDGPU::SGPR_32RegClass), 0, false); - RS->addScavengingFrameIndex(ScavengeFI); + if (FuncInfo->isEntryFunction()) { + int ScavengeFI = MFI.CreateFixedObject( + TRI->getSpillSize(AMDGPU::SGPR_32RegClass), 0, false); + RS->addScavengingFrameIndex(ScavengeFI); + } else { + int ScavengeFI = MFI.CreateStackObject( + TRI->getSpillSize(AMDGPU::SGPR_32RegClass), + TRI->getSpillAlignment(AMDGPU::SGPR_32RegClass), + false); + RS->addScavengingFrameIndex(ScavengeFI); + } } } -void SIFrameLowering::determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, +// Only report VGPRs to generic code. +void SIFrameLowering::determineCalleeSaves(MachineFunction &MF, + BitVector &SavedVGPRs, RegScavenger *RS) const { + TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS); + SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + if (MFI->isEntryFunction()) + return; + + const MachineFrameInfo &FrameInfo = MF.getFrameInfo(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + + // Ignore the SGPRs the default implementation found. + SavedVGPRs.clearBitsNotInMask(TRI->getAllVGPRRegMask()); + + // hasFP only knows about stack objects that already exist. We're now + // determining the stack slots that will be created, so we have to predict + // them. Stack objects force FP usage with calls. + // + // Note a new VGPR CSR may be introduced if one is used for the spill, but we + // don't want to report it here. 
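+  //
+  // (The save strategy implemented further down is tried in order: reuse a
+  // free lane in a VGPR already used for SGPR spills, then copy the FP to an
+  // unused non-callee-saved SGPR, and only as a last resort spill another
+  // VGPR to free up a lane.)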
+ // + // FIXME: Is this really hasReservedCallFrame? + const bool WillHaveFP = + FrameInfo.hasCalls() && + (SavedVGPRs.any() || !allStackObjectsAreDead(FrameInfo)); + + // VGPRs used for SGPR spilling need to be specially inserted in the prolog, + // so don't allow the default insertion to handle them. + for (auto SSpill : MFI->getSGPRSpillVGPRs()) + SavedVGPRs.reset(SSpill.VGPR); + + const bool HasFP = WillHaveFP || hasFP(MF); + if (!HasFP) + return; + + if (MFI->haveFreeLanesForSGPRSpill(MF, 1)) { + int NewFI = MF.getFrameInfo().CreateStackObject(4, 4, true, nullptr, + TargetStackID::SGPRSpill); + + // If there is already a VGPR with free lanes, use it. We may already have + // to pay the penalty for spilling a CSR VGPR. + if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI)) + llvm_unreachable("allocate SGPR spill should have worked"); + + MFI->FramePointerSaveIndex = NewFI; + + LLVM_DEBUG( + auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front(); + dbgs() << "Spilling FP to " << printReg(Spill.VGPR, TRI) + << ':' << Spill.Lane << '\n'); + return; + } + + MFI->SGPRForFPSaveRestoreCopy = findUnusedSGPRNonCalleeSaved(MF.getRegInfo()); + + if (!MFI->SGPRForFPSaveRestoreCopy) { + // There's no free lane to spill, and no free register to save FP, so we're + // forced to spill another VGPR to use for the spill. + int NewFI = MF.getFrameInfo().CreateStackObject(4, 4, true, nullptr, + TargetStackID::SGPRSpill); + if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI)) + llvm_unreachable("allocate SGPR spill should have worked"); + MFI->FramePointerSaveIndex = NewFI; + + LLVM_DEBUG( + auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front(); + dbgs() << "FP requires fallback spill to " << printReg(Spill.VGPR, TRI) + << ':' << Spill.Lane << '\n';); + } else { + LLVM_DEBUG(dbgs() << "Saving FP with copy to " << + printReg(MFI->SGPRForFPSaveRestoreCopy, TRI) << '\n'); + } +} + +void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF, + BitVector &SavedRegs, + RegScavenger *RS) const { TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + if (MFI->isEntryFunction()) + return; + + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); // The SP is specifically managed and we don't want extra spills of it. SavedRegs.reset(MFI->getStackPtrOffsetReg()); + SavedRegs.clearBitsInMask(TRI->getAllVGPRRegMask()); +} + +bool SIFrameLowering::assignCalleeSavedSpillSlots( + MachineFunction &MF, const TargetRegisterInfo *TRI, + std::vector<CalleeSavedInfo> &CSI) const { + if (CSI.empty()) + return true; // Early exit if no callee saved registers are modified! + + const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); + if (!FuncInfo->SGPRForFPSaveRestoreCopy) + return false; + + for (auto &CS : CSI) { + if (CS.getReg() == FuncInfo->getFrameOffsetReg()) { + if (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister) + CS.setDstReg(FuncInfo->SGPRForFPSaveRestoreCopy); + break; + } + } + + return false; } MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr( @@ -757,8 +1103,7 @@ MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr( bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode(); uint64_t CalleePopAmount = IsDestroy ? 
I->getOperand(1).getImm() : 0; - const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); - if (!TFI->hasReservedCallFrame(MF)) { + if (!hasReservedCallFrame(MF)) { unsigned Align = getStackAlignment(); Amount = alignTo(Amount, Align); @@ -777,60 +1122,25 @@ MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr( return MBB.erase(I); } -void SIFrameLowering::emitDebuggerPrologue(MachineFunction &MF, - MachineBasicBlock &MBB) const { - const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - const SIInstrInfo *TII = ST.getInstrInfo(); - const SIRegisterInfo *TRI = &TII->getRegisterInfo(); - const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - - MachineBasicBlock::iterator I = MBB.begin(); - DebugLoc DL; - - // For each dimension: - for (unsigned i = 0; i < 3; ++i) { - // Get work group ID SGPR, and make it live-in again. - unsigned WorkGroupIDSGPR = MFI->getWorkGroupIDSGPR(i); - MF.getRegInfo().addLiveIn(WorkGroupIDSGPR); - MBB.addLiveIn(WorkGroupIDSGPR); - - // Since SGPRs are spilled into VGPRs, copy work group ID SGPR to VGPR in - // order to spill it to scratch. - unsigned WorkGroupIDVGPR = - MF.getRegInfo().createVirtualRegister(&AMDGPU::VGPR_32RegClass); - BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), WorkGroupIDVGPR) - .addReg(WorkGroupIDSGPR); - - // Spill work group ID. - int WorkGroupIDObjectIdx = MFI->getDebuggerWorkGroupIDStackObjectIndex(i); - TII->storeRegToStackSlot(MBB, I, WorkGroupIDVGPR, false, - WorkGroupIDObjectIdx, &AMDGPU::VGPR_32RegClass, TRI); - - // Get work item ID VGPR, and make it live-in again. - unsigned WorkItemIDVGPR = MFI->getWorkItemIDVGPR(i); - MF.getRegInfo().addLiveIn(WorkItemIDVGPR); - MBB.addLiveIn(WorkItemIDVGPR); - - // Spill work item ID. - int WorkItemIDObjectIdx = MFI->getDebuggerWorkItemIDStackObjectIndex(i); - TII->storeRegToStackSlot(MBB, I, WorkItemIDVGPR, false, - WorkItemIDObjectIdx, &AMDGPU::VGPR_32RegClass, TRI); - } -} - bool SIFrameLowering::hasFP(const MachineFunction &MF) const { - // All stack operations are relative to the frame offset SGPR. - // TODO: Still want to eliminate sometimes. const MachineFrameInfo &MFI = MF.getFrameInfo(); + if (MFI.hasCalls()) { + // All offsets are unsigned, so need to be addressed in the same direction + // as stack growth. + + // FIXME: This function is pretty broken, since it can be called before the + // frame layout is determined or CSR spills are inserted. + if (MFI.getStackSize() != 0) + return true; + + // For the entry point, the input wave scratch offset must be copied to the + // API SP if there are calls. + if (MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) + return true; + } - // XXX - Is this only called after frame is finalized? Should be able to check - // frame size. - return MFI.hasStackObjects() && !allStackObjectsAreDead(MFI); -} - -bool SIFrameLowering::hasSP(const MachineFunction &MF) const { - const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo(); - // All stack operations are relative to the frame offset SGPR. 
-  const MachineFrameInfo &MFI = MF.getFrameInfo();
-  return MFI.hasCalls() || MFI.hasVarSizedObjects() || TRI->needsStackRealignment(MF);
+  return MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() ||
+    MFI.hasStackMap() || MFI.hasPatchPoint() ||
+    MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->needsStackRealignment(MF) ||
+    MF.getTarget().Options.DisableFramePointerElim(MF);
 }
diff --git a/lib/Target/AMDGPU/SIFrameLowering.h b/lib/Target/AMDGPU/SIFrameLowering.h
index 2f35b3631cdc..c644f4726e2c 100644
--- a/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/lib/Target/AMDGPU/SIFrameLowering.h
@@ -1,9 +1,8 @@
 //===--------------------- SIFrameLowering.h --------------------*- C++ -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
@@ -37,6 +36,14 @@ public:
   void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
                             RegScavenger *RS = nullptr) const override;
+  void determineCalleeSavesSGPR(MachineFunction &MF, BitVector &SavedRegs,
+                                RegScavenger *RS = nullptr) const;
+  bool
+  assignCalleeSavedSpillSlots(MachineFunction &MF,
+                              const TargetRegisterInfo *TRI,
+                              std::vector<CalleeSavedInfo> &CSI) const override;
+
+  bool isSupportedStackID(TargetStackID::Value ID) const override;

   void processFunctionBeforeFrameFinalized(
     MachineFunction &MF,
@@ -59,15 +66,9 @@ private:
     SIMachineFunctionInfo *MFI,
     MachineFunction &MF) const;

-  std::pair<unsigned, unsigned> getReservedPrivateSegmentWaveByteOffsetReg(
-    const GCNSubtarget &ST,
-    const SIInstrInfo *TII,
-    const SIRegisterInfo *TRI,
-    SIMachineFunctionInfo *MFI,
-    MachineFunction &MF) const;
-
-  /// Emits debugger prologue.
-  void emitDebuggerPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+  std::pair<unsigned, bool> getReservedPrivateSegmentWaveByteOffsetReg(
+      const GCNSubtarget &ST, const SIInstrInfo *TII, const SIRegisterInfo *TRI,
+      SIMachineFunctionInfo *MFI, MachineFunction &MF) const;

   // Emit scratch setup code for AMDPAL or Mesa, assuming ResourceRegUsed is set.
   void emitEntryFunctionScratchSetup(const GCNSubtarget &ST, MachineFunction &MF,
@@ -77,7 +78,6 @@ private:

 public:
   bool hasFP(const MachineFunction &MF) const override;
-  bool hasSP(const MachineFunction &MF) const;
 };

 } // end namespace llvm
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index 0ba921647097..db0782e2bf3e 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1,9 +1,8 @@
 //===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -19,7 +18,6 @@
 #include "SIISelLowering.h"
 #include "AMDGPU.h"
-#include "AMDGPUIntrinsicInfo.h"
 #include "AMDGPUSubtarget.h"
 #include "AMDGPUTargetMachine.h"
 #include "SIDefines.h"
@@ -95,11 +93,10 @@ static cl::opt<bool> EnableVGPRIndexMode(
   cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
   cl::init(false));

-static cl::opt<unsigned> AssumeFrameIndexHighZeroBits(
-  "amdgpu-frame-index-zero-bits",
-  cl::desc("High bits of frame index assumed to be zero"),
-  cl::init(5),
-  cl::ReallyHidden);
+static cl::opt<bool> DisableLoopAlignment(
+  "amdgpu-disable-loop-alignment",
+  cl::desc("Do not align and prefetch loops"),
+  cl::init(false));

 static unsigned findFirstFreeSGPR(CCState &CCInfo) {
   unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
@@ -125,12 +122,18 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
   addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);

+  addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
+  addRegisterClass(MVT::v3f32, &AMDGPU::VReg_96RegClass);
+
   addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass);
   addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass);

   addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
   addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);

+  addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
+  addRegisterClass(MVT::v5f32, &AMDGPU::VReg_160RegClass);
+
   addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass);
   addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);

@@ -148,18 +151,27 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
     addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
   }

+  if (Subtarget->hasMAIInsts()) {
+    addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
+    addRegisterClass(MVT::v32f32, &AMDGPU::VReg_1024RegClass);
+  }
+
   computeRegisterProperties(Subtarget->getRegisterInfo());

   // We need to custom lower vector stores from local memory
   setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
+  setOperationAction(ISD::LOAD, MVT::v3i32, Custom);
   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
+  setOperationAction(ISD::LOAD, MVT::v5i32, Custom);
   setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
   setOperationAction(ISD::LOAD, MVT::v16i32, Custom);
   setOperationAction(ISD::LOAD, MVT::i1, Custom);
   setOperationAction(ISD::LOAD, MVT::v32i32, Custom);

   setOperationAction(ISD::STORE, MVT::v2i32, Custom);
+  setOperationAction(ISD::STORE, MVT::v3i32, Custom);
   setOperationAction(ISD::STORE, MVT::v4i32, Custom);
+  setOperationAction(ISD::STORE, MVT::v5i32, Custom);
   setOperationAction(ISD::STORE, MVT::v8i32, Custom);
   setOperationAction(ISD::STORE, MVT::v16i32, Custom);
   setOperationAction(ISD::STORE, MVT::i1, Custom);
@@ -218,11 +230,15 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom);
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v8f16, Custom);
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
+  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i16, Custom);
+  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);

   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
   setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom);
   setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom);
   setOperationAction(ISD::INTRINSIC_VOID, MVT::v4f16, Custom);
+  setOperationAction(ISD::INTRINSIC_VOID, MVT::i16, Custom);
+  setOperationAction(ISD::INTRINSIC_VOID, MVT::i8, Custom);

   setOperationAction(ISD::BRCOND, MVT::Other, Custom);
   setOperationAction(ISD::BR_CC, MVT::i1, Expand);
@@ -248,8 +264,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,

   // We only support LOAD/STORE and vector manipulation ops for vectors
   // with > 4 elements.
-  for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
-        MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16, MVT::v32i32 }) {
+  for (MVT VT : { MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
+                  MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16,
+                  MVT::v32i32, MVT::v32f32 }) {
     for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
       switch (Op) {
       case ISD::LOAD:
@@ -323,6 +340,18 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom);

+  // Deal with vec3 vector operations when widened to vec4.
+  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v3i32, Custom);
+  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v3f32, Custom);
+  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i32, Custom);
+  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4f32, Custom);
+
+  // Deal with vec5 vector operations when widened to vec8.
+  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v5i32, Custom);
+  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v5f32, Custom);
+  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i32, Custom);
+  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8f32, Custom);
+
   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
   // and output demarshalling
   setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
@@ -400,7 +429,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,

   setOperationAction(ISD::FMAXNUM_IEEE, MVT::f64, Legal);

-  if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
+  if (Subtarget->haveRoundOpsF64()) {
     setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
     setOperationAction(ISD::FCEIL, MVT::f64, Legal);
     setOperationAction(ISD::FRINT, MVT::f64, Legal);
@@ -492,7 +521,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,

   // F16 - VOP3 Actions.
   setOperationAction(ISD::FMA, MVT::f16, Legal);
-  if (!Subtarget->hasFP16Denormals())
+  if (!Subtarget->hasFP16Denormals() && STI.hasMadF16())
     setOperationAction(ISD::FMAD, MVT::f16, Legal);

   for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16}) {
@@ -607,6 +636,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);

+    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f16, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom);
+
     setOperationAction(ISD::SHL, MVT::v4i16, Custom);
     setOperationAction(ISD::SRA, MVT::v4i16, Custom);
     setOperationAction(ISD::SRL, MVT::v4i16, Custom);
@@ -679,6 +711,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   setTargetDAGCombine(ISD::FCANONICALIZE);
   setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
   setTargetDAGCombine(ISD::ZERO_EXTEND);
+  setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
   setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);

@@ -701,13 +734,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX);
   setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN);
   setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX);
+  setTargetDAGCombine(ISD::ATOMIC_LOAD_FADD);

   setSchedulingPreference(Sched::RegPressure);
-
-  // SI at least has hardware support for floating point exceptions, but no way
-  // of using or handling them is implemented. They are also optional in OpenCL
-  // (Section 7.3)
-  setHasFloatingPointExceptions(Subtarget->hasFPExceptions());
 }

 const GCNSubtarget *SITargetLowering::getSubtarget() const {
@@ -910,6 +939,8 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
   switch (IntrID) {
   case Intrinsic::amdgcn_atomic_inc:
   case Intrinsic::amdgcn_atomic_dec:
+  case Intrinsic::amdgcn_ds_ordered_add:
+  case Intrinsic::amdgcn_ds_ordered_swap:
   case Intrinsic::amdgcn_ds_fadd:
   case Intrinsic::amdgcn_ds_fmin:
   case Intrinsic::amdgcn_ds_fmax: {
@@ -919,13 +950,75 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
     Info.align = 0;
     Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;

+    const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
+    if (!Vol->isZero())
+      Info.flags |= MachineMemOperand::MOVolatile;
+
+    return true;
+  }
+  case Intrinsic::amdgcn_buffer_atomic_fadd: {
+    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+    Info.opc = ISD::INTRINSIC_VOID;
+    Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
+    Info.ptrVal = MFI->getBufferPSV(
+      *MF.getSubtarget<GCNSubtarget>().getInstrInfo(),
+      CI.getArgOperand(1));
+    Info.align = 0;
+    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+
     const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4));
     if (!Vol || !Vol->isZero())
       Info.flags |= MachineMemOperand::MOVolatile;

     return true;
   }
+  case Intrinsic::amdgcn_global_atomic_fadd: {
+    Info.opc = ISD::INTRINSIC_VOID;
+    Info.memVT = MVT::getVT(CI.getOperand(0)->getType()
+                            ->getPointerElementType());
+    Info.ptrVal = CI.getOperand(0);
+    Info.align = 0;
+    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+
+    return true;
+  }
+  case Intrinsic::amdgcn_ds_append:
+  case Intrinsic::amdgcn_ds_consume: {
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = MVT::getVT(CI.getType());
+    Info.ptrVal = CI.getOperand(0);
+    Info.align = 0;
+    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+
+    const ConstantInt *Vol =
+        cast<ConstantInt>(CI.getOperand(1));
+    if (!Vol->isZero())
+      Info.flags |= MachineMemOperand::MOVolatile;
+
+    return true;
+  }
+  case Intrinsic::amdgcn_ds_gws_init:
+  case Intrinsic::amdgcn_ds_gws_barrier:
+  case Intrinsic::amdgcn_ds_gws_sema_v:
+  case Intrinsic::amdgcn_ds_gws_sema_br:
+  case Intrinsic::amdgcn_ds_gws_sema_p:
+  case Intrinsic::amdgcn_ds_gws_sema_release_all: {
+    Info.opc = ISD::INTRINSIC_VOID;
+
+    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+    Info.ptrVal =
+        MFI->getGWSPSV(*MF.getSubtarget<GCNSubtarget>().getInstrInfo());
+    // This is an abstract access, but we need to specify a type and size.
+    Info.memVT = MVT::i32;
+    Info.size = 4;
+    Info.align = 4;
+
+    Info.flags = MachineMemOperand::MOStore;
+    if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
+      Info.flags = MachineMemOperand::MOLoad;
+    return true;
+  }
   default:
     return false;
   }
@@ -937,6 +1030,8 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
   switch (II->getIntrinsicID()) {
   case Intrinsic::amdgcn_atomic_inc:
   case Intrinsic::amdgcn_atomic_dec:
+  case Intrinsic::amdgcn_ds_ordered_add:
+  case Intrinsic::amdgcn_ds_ordered_swap:
   case Intrinsic::amdgcn_ds_fadd:
   case Intrinsic::amdgcn_ds_fmin:
   case Intrinsic::amdgcn_ds_fmax: {
@@ -960,6 +1055,13 @@ bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
   // GFX9 added a 13-bit signed offset. When using regular flat instructions,
   // the sign bit is ignored and is treated as a 12-bit unsigned offset.

+  // GFX10 shrank the signed offset to 12 bits. When using regular flat
+  // instructions, the sign bit is also ignored and is treated as an 11-bit
+  // unsigned offset.
+
+  if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
+    return isUInt<11>(AM.BaseOffs) && AM.Scale == 0;
+
   // Just r + i
   return isUInt<12>(AM.BaseOffs) && AM.Scale == 0;
 }
@@ -1030,7 +1132,8 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
     return isLegalGlobalAddressingMode(AM);

   if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
-      AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
+      AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
+      AS == AMDGPUAS::BUFFER_FAT_POINTER) {
     // If the offset isn't a multiple of 4, it probably isn't going to be
     // correctly aligned.
     // FIXME: Can we get the real alignment here?
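[Editor's note] The isLegalFlatAddressingMode hunk above is easy to misread once the offsets are flattened, so here is a minimal standalone sketch of the rule it encodes, with a plain IsGFX10 flag standing in for the Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10 check (an illustration, not code from the patch):

    #include "llvm/Support/MathExtras.h"
    using llvm::isUInt;

    // Immediate-offset legality for regular flat instructions, per the
    // comments in the hunk: GFX9 has a 13-bit signed offset whose sign bit
    // is ignored (12 usable bits); GFX10 shrank it to 12 bits, leaving an
    // 11-bit unsigned offset. Only the reg + imm form (Scale == 0) applies.
    static bool isLegalFlatImmOffset(bool IsGFX10, int64_t BaseOffs,
                                     int64_t Scale) {
      if (Scale != 0)
        return false;
      return IsGFX10 ? isUInt<11>(BaseOffs) : isUInt<12>(BaseOffs);
    }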
@@ -1106,16 +1209,15 @@ bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
   } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
     unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
     return (MemVT.getSizeInBits() <= MaxPrivateBits);
-  } else if (AS == AMDGPUAS::LOCAL_ADDRESS) {
+  } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
     return (MemVT.getSizeInBits() <= 2 * 32);
   }
   return true;
 }

-bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
-                                                      unsigned AddrSpace,
-                                                      unsigned Align,
-                                                      bool *IsFast) const {
+bool SITargetLowering::allowsMisalignedMemoryAccesses(
+    EVT VT, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags,
+    bool *IsFast) const {
   if (IsFast)
     *IsFast = false;

@@ -1178,11 +1280,10 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
   return VT.bitsGT(MVT::i32) && Align % 4 == 0;
 }

-EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
-                                          unsigned SrcAlign, bool IsMemset,
-                                          bool ZeroMemset,
-                                          bool MemcpyStrSrc,
-                                          MachineFunction &MF) const {
+EVT SITargetLowering::getOptimalMemOpType(
+    uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
+    bool ZeroMemset, bool MemcpyStrSrc,
+    const AttributeList &FuncAttributes) const {
   // FIXME: Should account for address space here.

   // The default fallback uses the private pointer size as a guess for a type to
@@ -1201,7 +1302,8 @@ EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,

 static bool isFlatGlobalAddrSpace(unsigned AS) {
   return AS == AMDGPUAS::GLOBAL_ADDRESS ||
          AS == AMDGPUAS::FLAT_ADDRESS ||
-         AS == AMDGPUAS::CONSTANT_ADDRESS;
+         AS == AMDGPUAS::CONSTANT_ADDRESS ||
+         AS > AMDGPUAS::MAX_AMDGPU_ADDRESS;
 }

 bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
@@ -1216,8 +1318,8 @@ bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
   return I && I->getMetadata("amdgpu.noclobber");
 }

-bool SITargetLowering::isCheapAddrSpaceCast(unsigned SrcAS,
-                                            unsigned DestAS) const {
+bool SITargetLowering::isFreeAddrSpaceCast(unsigned SrcAS,
+                                           unsigned DestAS) const {
   // Flat -> private/local is a simple truncate.
   // Flat -> global is no-op
   if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
@@ -1305,6 +1407,17 @@ SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
                                          const SDLoc &SL, SDValue Val,
                                          bool Signed,
                                          const ISD::InputArg *Arg) const {
+  // First, if it is a widened vector, narrow it.
+  if (VT.isVector() &&
+      VT.getVectorNumElements() != MemVT.getVectorNumElements()) {
+    EVT NarrowedVT =
+        EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(),
+                         VT.getVectorNumElements());
+    Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
+                      DAG.getConstant(0, SL, MVT::i32));
+  }
+
+  // Then convert the vector elements or scalar value.
   if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
       VT.bitsLT(MemVT)) {
     unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
@@ -1441,8 +1554,7 @@ static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
     // First check if it's a PS input addr.
     if (CallConv == CallingConv::AMDGPU_PS &&
-        !Arg->Flags.isInReg() && !Arg->Flags.isByVal() && PSInputNum <= 15) {
-
+        !Arg->Flags.isInReg() && PSInputNum <= 15) {
       bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);

       // Inconveniently only the first part of the split is marked as isSplit,
@@ -1508,7 +1620,13 @@ static void allocateSpecialEntryInputVGPRs(CCState &CCInfo,

 // Try to allocate a VGPR at the end of the argument list, or if no argument
 // VGPRs are left allocating a stack slot.
-static ArgDescriptor allocateVGPR32Input(CCState &CCInfo) {
+// If \p Mask is given, it indicates the bitfield position in the register.
+// If \p Arg is given, use it with the new \p Mask instead of allocating a new one.
+static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
+                                         ArgDescriptor Arg = ArgDescriptor()) {
+  if (Arg.isSet())
+    return ArgDescriptor::createArg(Arg, Mask);
+
   ArrayRef<MCPhysReg> ArgVGPRs
     = makeArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
   unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
@@ -1516,7 +1634,7 @@ static ArgDescriptor allocateVGPR32Input(CCState &CCInfo) {
     // Spill to stack required.
     int64_t Offset = CCInfo.AllocateStack(4, 4);

-    return ArgDescriptor::createStack(Offset);
+    return ArgDescriptor::createStack(Offset, Mask);
   }

   unsigned Reg = ArgVGPRs[RegIdx];
@@ -1525,7 +1643,7 @@ static ArgDescriptor allocateVGPR32Input(CCState &CCInfo) {

   MachineFunction &MF = CCInfo.getMachineFunction();
   MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
-  return ArgDescriptor::createRegister(Reg);
+  return ArgDescriptor::createRegister(Reg, Mask);
 }

 static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
@@ -1557,14 +1675,21 @@ static void allocateSpecialInputVGPRs(CCState &CCInfo,
                                       MachineFunction &MF,
                                       const SIRegisterInfo &TRI,
                                       SIMachineFunctionInfo &Info) {
-  if (Info.hasWorkItemIDX())
-    Info.setWorkItemIDX(allocateVGPR32Input(CCInfo));
+  const unsigned Mask = 0x3ff;
+  ArgDescriptor Arg;
+
+  if (Info.hasWorkItemIDX()) {
+    Arg = allocateVGPR32Input(CCInfo, Mask);
+    Info.setWorkItemIDX(Arg);
+  }

-  if (Info.hasWorkItemIDY())
-    Info.setWorkItemIDY(allocateVGPR32Input(CCInfo));
+  if (Info.hasWorkItemIDY()) {
+    Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
+    Info.setWorkItemIDY(Arg);
+  }

   if (Info.hasWorkItemIDZ())
-    Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo));
+    Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
 }

 static void allocateSpecialInputSGPRs(CCState &CCInfo,
@@ -1714,6 +1839,7 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM,
   // should reserve the arguments and use them directly.
   MachineFrameInfo &MFI = MF.getFrameInfo();
   bool HasStackObjects = MFI.hasStackObjects();
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

   // Record that we know we have non-spill stack objects so we don't need to
   // check all stack objects later.
@@ -1729,65 +1855,89 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM,
   // the scratch registers to pass in.
   bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();

-  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
-  if (ST.isAmdHsaOrMesa(MF.getFunction())) {
-    if (RequiresStackAccess) {
-      // If we have stack objects, we unquestionably need the private buffer
-      // resource. For the Code Object V2 ABI, this will be the first 4 user
-      // SGPR inputs. We can reserve those and use them directly.
-
-      unsigned PrivateSegmentBufferReg = Info.getPreloadedReg(
-        AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
-      Info.setScratchRSrcReg(PrivateSegmentBufferReg);
-
-      if (MFI.hasCalls()) {
-        // If we have calls, we need to keep the frame register in a register
-        // that won't be clobbered by a call, so ensure it is copied somewhere.
-
-        // This is not a problem for the scratch wave offset, because the same
-        // registers are reserved in all functions.
-
-        // FIXME: Nothing is really ensuring this is a call preserved register,
-        // it's just selected from the end so it happens to be.
-        unsigned ReservedOffsetReg
-          = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
-        Info.setScratchWaveOffsetReg(ReservedOffsetReg);
-      } else {
-        unsigned PrivateSegmentWaveByteOffsetReg = Info.getPreloadedReg(
-          AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
-        Info.setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
-      }
-    }
-  } else {
-    unsigned ReservedBufferReg
-      = TRI.reservedPrivateSegmentBufferReg(MF);
-    unsigned ReservedOffsetReg
-      = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
-
-    // We tentatively reserve the last registers (skipping the last two
-    // which may contain VCC). After register allocation, we'll replace
-    // these with the ones immediately after those which were really
-    // allocated. In the prologue copies will be inserted from the argument
-    // to these reserved registers.
-    Info.setScratchRSrcReg(ReservedBufferReg);
-    Info.setScratchWaveOffsetReg(ReservedOffsetReg);
-  }
+  if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
+    // If we have stack objects, we unquestionably need the private buffer
+    // resource. For the Code Object V2 ABI, this will be the first 4 user
+    // SGPR inputs. We can reserve those and use them directly.
+
+    unsigned PrivateSegmentBufferReg =
+        Info.getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
+    Info.setScratchRSrcReg(PrivateSegmentBufferReg);
   } else {
     unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
+    // We tentatively reserve the last registers (skipping those which may
+    // contain VCC, FLAT_SCR, and XNACK). After register allocation, we'll
+    // replace these with the ones immediately after those which were really
+    // allocated. In the prologue copies will be inserted from the argument
+    // to these reserved registers.
     // Without HSA, relocations are used for the scratch pointer and the
     // buffer resource setup is always inserted in the prologue. Scratch wave
     // offset is still in an input SGPR.
     Info.setScratchRSrcReg(ReservedBufferReg);
+  }

-  if (HasStackObjects && !MFI.hasCalls()) {
-    unsigned ScratchWaveOffsetReg = Info.getPreloadedReg(
-      AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
-    Info.setScratchWaveOffsetReg(ScratchWaveOffsetReg);
+  // hasFP should be accurate for kernels even before the frame is finalized.
+  if (ST.getFrameLowering()->hasFP(MF)) {
+    MachineRegisterInfo &MRI = MF.getRegInfo();
+
+    // Try to use s32 as the SP, but move it if it would interfere with input
+    // arguments. This won't work with calls though.
+    //
+    // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
+    // registers.
+    if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
+      Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
     } else {
-      unsigned ReservedOffsetReg
-        = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
+      assert(AMDGPU::isShader(MF.getFunction().getCallingConv()));
+
+      if (MFI.hasCalls())
+        report_fatal_error("call in graphics shader with too many input SGPRs");
+
+      for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
+        if (!MRI.isLiveIn(Reg)) {
+          Info.setStackPtrOffsetReg(Reg);
+          break;
+        }
+      }
+
+      if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
+        report_fatal_error("failed to find register for SP");
+    }
+
+    if (MFI.hasCalls()) {
+      Info.setScratchWaveOffsetReg(AMDGPU::SGPR33);
+      Info.setFrameOffsetReg(AMDGPU::SGPR33);
+    } else {
+      unsigned ReservedOffsetReg =
+        TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
       Info.setScratchWaveOffsetReg(ReservedOffsetReg);
+      Info.setFrameOffsetReg(ReservedOffsetReg);
     }
+  } else if (RequiresStackAccess) {
+    assert(!MFI.hasCalls());
+    // We know there are accesses and they will be done relative to SP, so just
+    // pin it to the input.
+    //
+    // FIXME: Should not do this if inline asm is reading/writing these
+    // registers.
+    unsigned PreloadedSP = Info.getPreloadedReg(
+        AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
+
+    Info.setStackPtrOffsetReg(PreloadedSP);
+    Info.setScratchWaveOffsetReg(PreloadedSP);
+    Info.setFrameOffsetReg(PreloadedSP);
+  } else {
+    assert(!MFI.hasCalls());
+
+    // There may not be stack access at all. There may still be spills, or
+    // access of a constant pointer (in which cases an extra copy will be
+    // emitted in the prolog).
+    unsigned ReservedOffsetReg
+      = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
+    Info.setStackPtrOffsetReg(ReservedOffsetReg);
+    Info.setScratchWaveOffsetReg(ReservedOffsetReg);
+    Info.setFrameOffsetReg(ReservedOffsetReg);
   }
 }

@@ -1845,7 +1995,6 @@ SDValue SITargetLowering::LowerFormalArguments(
   const Function &Fn = MF.getFunction();
   FunctionType *FType = MF.getFunction().getFunctionType();
   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
-  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

   if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
     DiagnosticInfoUnsupported NoGraphicsHSA(
@@ -1854,11 +2003,6 @@ SDValue SITargetLowering::LowerFormalArguments(
     return DAG.getEntryNode();
   }

-  // Create stack objects that are used for emitting debugger prologue if
-  // "amdgpu-debugger-emit-prologue" attribute was specified.
-  if (ST.debuggerEmitPrologue())
-    createDebuggerPrologueStackObjects(MF);
-
   SmallVector<ISD::InputArg, 16> Splits;
   SmallVector<CCValAssign, 16> ArgLocs;
   BitVector Skipped(Ins.size());
@@ -1869,12 +2013,6 @@ SDValue SITargetLowering::LowerFormalArguments(
   bool IsKernel = AMDGPU::isKernel(CallConv);
   bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);

-  if (!IsEntryFunc) {
-    // 4 bytes are reserved at offset 0 for the emergency stack slot. Skip over
-    // this when allocating argument fixed offsets.
-    CCInfo.AllocateStack(4, 4);
-  }
-
   if (IsShader) {
     processShaderInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
@@ -1975,7 +2113,8 @@ SDValue SITargetLowering::LowerFormalArguments(
       auto *ParamTy =
         dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
       if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
-          ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+          ParamTy && (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
+                      ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
         // On SI local pointers are just offsets into LDS, so they are always
         // less than 16-bits. On CI and newer they could potentially be
         // real pointers, so we can't guarantee their size.
@@ -2002,13 +2141,14 @@ SDValue SITargetLowering::LowerFormalArguments(
       Reg = MF.addLiveIn(Reg, RC);
       SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);

-      if (Arg.Flags.isSRet() && !getSubtarget()->enableHugePrivateBuffer()) {
+      if (Arg.Flags.isSRet()) {
         // The return object should be reasonably addressable.

         // FIXME: This helps when the return is a real sret. If it is an
         // automatically inserted sret (i.e. CanLowerReturn returns false), an
         // extra copy is inserted in SelectionDAGBuilder which obscures this.
-        unsigned NumBits = 32 - AssumeFrameIndexHighZeroBits;
+        unsigned NumBits
+          = 32 - getSubtarget()->getKnownHighZeroBitsForFrameIndex();
         Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
           DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
       }
@@ -2126,16 +2266,13 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
     SDValue ReturnAddrReg = CreateLiveInRegister(
       DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);

-    // FIXME: Should be able to use a vreg here, but need a way to prevent it
-    // from being allcoated to a CSR.
-
-    SDValue PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
-                                                MVT::i64);
-
-    Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, Flag);
+    SDValue ReturnAddrVirtualReg = DAG.getRegister(
+        MF.getRegInfo().createVirtualRegister(&AMDGPU::CCR_SGPR_64RegClass),
+        MVT::i64);
+    Chain =
+        DAG.getCopyToReg(Chain, DL, ReturnAddrVirtualReg, ReturnAddrReg, Flag);
     Flag = Chain.getValue(1);
-
-    RetOps.push_back(PhysReturnAddrReg);
+    RetOps.push_back(ReturnAddrVirtualReg);
   }

   // Copy the result values into the output registers.
@@ -2295,9 +2432,6 @@ void SITargetLowering::passSpecialInputs(
     AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
     AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
     AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
-    AMDGPUFunctionArgInfo::WORKITEM_ID_X,
-    AMDGPUFunctionArgInfo::WORKITEM_ID_Y,
-    AMDGPUFunctionArgInfo::WORKITEM_ID_Z,
     AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR
   };

@@ -2337,6 +2471,71 @@ void SITargetLowering::passSpecialInputs(
       MemOpChains.push_back(ArgStore);
     }
   }
+
+  // Pack workitem IDs into a single register or pass it as is if already
+  // packed.
+  const ArgDescriptor *OutgoingArg;
+  const TargetRegisterClass *ArgRC;
+
+  std::tie(OutgoingArg, ArgRC) =
+    CalleeArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
+  if (!OutgoingArg)
+    std::tie(OutgoingArg, ArgRC) =
+      CalleeArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
+  if (!OutgoingArg)
+    std::tie(OutgoingArg, ArgRC) =
+      CalleeArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
+  if (!OutgoingArg)
+    return;
+
+  const ArgDescriptor *IncomingArgX
+    = CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X).first;
+  const ArgDescriptor *IncomingArgY
+    = CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y).first;
+  const ArgDescriptor *IncomingArgZ
+    = CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z).first;
+
+  SDValue InputReg;
+  SDLoc SL;
+
+  // If incoming ids are not packed we need to pack them.
+  if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo.WorkItemIDX)
+    InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
+
+  if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo.WorkItemIDY) {
+    SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
+    Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
+                    DAG.getShiftAmountConstant(10, MVT::i32, SL));
+    InputReg = InputReg.getNode() ?
+                 DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y) : Y;
+  }
+
+  if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo.WorkItemIDZ) {
+    SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
+    Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
+                    DAG.getShiftAmountConstant(20, MVT::i32, SL));
+    InputReg = InputReg.getNode() ?
+                 DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z) : Z;
+  }
+
+  if (!InputReg.getNode()) {
+    // Workitem IDs are already packed; any of the present incoming arguments
+    // will carry all required fields.
+    ArgDescriptor IncomingArg = ArgDescriptor::createArg(
+      IncomingArgX ? *IncomingArgX :
+      IncomingArgY ? *IncomingArgY :
+                     *IncomingArgZ, ~0u);
+    InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
+  }
+
+  if (OutgoingArg->isRegister()) {
+    RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
+  } else {
+    unsigned SpecialArgOffset = CCInfo.AllocateStack(4, 4);
+    SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
+                                            SpecialArgOffset);
+    MemOpChains.push_back(ArgStore);
+  }
 }

 static bool canGuaranteeTCO(CallingConv::ID CC) {
@@ -2478,7 +2677,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
                        "unsupported call from graphics shader of function ");
  }

-  // The first 4 bytes are reserved for the callee's emergency stack slot.
   if (IsTailCall) {
     IsTailCall = isEligibleForTailCallOptimization(
       Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
@@ -2505,9 +2703,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
   CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
   CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);

-  // The first 4 bytes are reserved for the callee's emergency stack slot.
-  CCInfo.AllocateStack(4, 4);
-
   CCInfo.AnalyzeCallOperands(Outs, AssignFn);

   // Get a count of how many bytes are to be pushed on the stack.
@@ -2528,31 +2723,19 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
   MachineFrameInfo &MFI = MF.getFrameInfo();

   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
-  SDValue CallerSavedFP;
-
   // Adjust the stack pointer for the new arguments...
   // These operations are automatically eliminated by the prolog/epilog pass
   if (!IsSibCall) {
     Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);

-    unsigned OffsetReg = Info->getScratchWaveOffsetReg();
+    SmallVector<SDValue, 4> CopyFromChains;

     // In the HSA case, this should be an identity copy.
     SDValue ScratchRSrcReg
       = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
     RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
-
-    // TODO: Don't hardcode these registers and get from the callee function.
-    SDValue ScratchWaveOffsetReg
-      = DAG.getCopyFromReg(Chain, DL, OffsetReg, MVT::i32);
-    RegsToPass.emplace_back(AMDGPU::SGPR4, ScratchWaveOffsetReg);
-
-    if (!Info->isEntryFunction()) {
-      // Avoid clobbering this function's FP value. In the current convention
-      // callee will overwrite this, so do save/restore around the call site.
-      CallerSavedFP = DAG.getCopyFromReg(Chain, DL,
-                                         Info->getFrameOffsetReg(), MVT::i32);
-    }
+    CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
+    Chain = DAG.getTokenFactor(DL, CopyFromChains);
   }

   SmallVector<SDValue, 8> MemOpChains;
@@ -2694,6 +2877,11 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
   std::vector<SDValue> Ops;
   Ops.push_back(Chain);
   Ops.push_back(Callee);
+  // Add a redundant copy of the callee global which will not be legalized, as
+  // we need direct access to the callee later.
+  GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Callee);
+  const GlobalValue *GV = GSD->getGlobal();
+  Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));

   if (IsTailCall) {
     // Each tail call may have to adjust the stack by a different amount, so
@@ -2735,12 +2923,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
   Chain = Call.getValue(0);
   InFlag = Call.getValue(1);

-  if (CallerSavedFP) {
-    SDValue FPReg = DAG.getRegister(Info->getFrameOffsetReg(), MVT::i32);
-    Chain = DAG.getCopyToReg(Chain, DL, FPReg, CallerSavedFP, InFlag);
-    InFlag = Chain.getValue(1);
-  }
-
   uint64_t CalleePopBytes = NumBytes;
   Chain = DAG.getCALLSEQ_END(Chain, DAG.getTargetConstant(0, DL, MVT::i32),
                              DAG.getTargetConstant(CalleePopBytes, DL, MVT::i32),
@@ -2773,8 +2955,8 @@ unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
  }

-  if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
-      Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
+  if (!Subtarget->hasFlatScrRegister() &&
+       Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
     report_fatal_error(Twine("invalid register \""
                              + StringRef(RegName)  + "\" for subtarget."));
   }
@@ -2830,6 +3012,107 @@ MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI,
   return SplitBB;
 }

+// Split block \p MBB at \p MI, so as to insert a loop. If \p InstInLoop is
+// true, \p MI will be the only instruction in the loop body block. Otherwise,
+// it will be the first instruction in the remainder block.
+//
+/// \returns { LoopBody, Remainder }
+static std::pair<MachineBasicBlock *, MachineBasicBlock *>
+splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop) {
+  MachineFunction *MF = MBB.getParent();
+  MachineBasicBlock::iterator I(&MI);
+
+  // To insert the loop we need to split the block. Move everything after this
+  // point to a new block, and insert a new empty block between the two.
+  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
+  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
+  MachineFunction::iterator MBBI(MBB);
+  ++MBBI;
+
+  MF->insert(MBBI, LoopBB);
+  MF->insert(MBBI, RemainderBB);
+
+  LoopBB->addSuccessor(LoopBB);
+  LoopBB->addSuccessor(RemainderBB);
+
+  // Move the rest of the block into a new block.
+  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
+
+  if (InstInLoop) {
+    auto Next = std::next(I);
+
+    // Move instruction to loop body.
+    LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
+
+    // Move the rest of the block.
+    RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
+  } else {
+    RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
+  }
+
+  MBB.addSuccessor(LoopBB);
+
+  return std::make_pair(LoopBB, RemainderBB);
+}
+
+MachineBasicBlock *
+SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI,
+                                         MachineBasicBlock *BB) const {
+  const DebugLoc &DL = MI.getDebugLoc();
+
+  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+
+  MachineBasicBlock *LoopBB;
+  MachineBasicBlock *RemainderBB;
+  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+
+  MachineBasicBlock::iterator Prev = std::prev(MI.getIterator());
+
+  std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, *BB, true);
+
+  MachineBasicBlock::iterator I = LoopBB->end();
+  MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0);
+
+  const unsigned EncodedReg = AMDGPU::Hwreg::encodeHwreg(
+    AMDGPU::Hwreg::ID_TRAPSTS, AMDGPU::Hwreg::OFFSET_MEM_VIOL, 1);
+
+  // Clear TRAP_STS.MEM_VIOL
+  BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
+    .addImm(0)
+    .addImm(EncodedReg);
+
+  // This is a pain, but we're not allowed to have physical register live-ins
+  // yet. Insert a pair of copies if the VGPR0 hack is necessary.
+  if (Src && TargetRegisterInfo::isPhysicalRegister(Src->getReg())) {
+    unsigned Data0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    BuildMI(*BB, std::next(Prev), DL, TII->get(AMDGPU::COPY), Data0)
+      .add(*Src);
+
+    BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::COPY), Src->getReg())
+      .addReg(Data0);
+
+    MRI.setSimpleHint(Data0, Src->getReg());
+  }
+
+  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_WAITCNT))
+    .addImm(0);
+
+  unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+
+  // Load and check TRAP_STS.MEM_VIOL
+  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
+    .addImm(EncodedReg);
+
+  // FIXME: Do we need to use an isel pseudo that may clobber scc?
+  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
+    .addReg(Reg, RegState::Kill)
+    .addImm(0);
+  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
+    .addMBB(LoopBB);
+
+  return RemainderBB;
+}
+
 // Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
 // wavefront. If the value is uniform and just happens to be in a VGPR, this
 // will only do one iteration. In the worst case, this will loop 64 times.
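[Editor's note] The hunks below mechanically rework this waterfall loop rather than change its shape, so as a reading aid, here is a scalar model of what one wave executes, assuming wave64, an Exec bitmask, and an IdxPerLane array standing in for the VGPR that holds the index (a sketch only, not code from the patch):

    #include <cstdint>

    // One iteration handles every lane that shares the first active lane's
    // index (v_readfirstlane_b32 + v_cmp_eq_u32 + s_and_saveexec_b64), so a
    // uniform index finishes in one pass and a fully divergent one in 64.
    inline void waterfallModel(uint64_t Exec, const uint32_t IdxPerLane[64]) {
      while (Exec) {
        unsigned FirstLane = __builtin_ctzll(Exec);  // v_readfirstlane_b32
        uint32_t Idx = IdxPerLane[FirstLane];
        uint64_t SameIdx = 0;
        for (unsigned L = 0; L != 64; ++L)           // v_cmp_eq_u32
          if (((Exec >> L) & 1) && IdxPerLane[L] == Idx)
            SameIdx |= UINT64_C(1) << L;
        // ... perform the indirect move for the lanes in SameIdx ...
        Exec &= ~SameIdx;                            // s_xor_b64: retire lanes
      }
    }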
@@ -2849,12 +3132,16 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop( int Offset, bool UseGPRIdxMode, bool IsIndirectSrc) { + MachineFunction *MF = OrigBB.getParent(); + const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); MachineBasicBlock::iterator I = LoopBB.begin(); - unsigned PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); - unsigned NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + const TargetRegisterClass *BoolRC = TRI->getBoolRC(); + unsigned PhiExec = MRI.createVirtualRegister(BoolRC); + unsigned NewExec = MRI.createVirtualRegister(BoolRC); unsigned CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - unsigned CondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + unsigned CondReg = MRI.createVirtualRegister(BoolRC); BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg) .addReg(InitReg) @@ -2878,7 +3165,9 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop( .addReg(IdxReg.getReg(), 0, IdxReg.getSubReg()); // Update EXEC, save the original EXEC value to VCC. - BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec) + BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 + : AMDGPU::S_AND_SAVEEXEC_B64), + NewExec) .addReg(CondReg, RegState::Kill); MRI.setSimpleHint(NewExec, CondReg); @@ -2894,7 +3183,7 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop( .addImm(Offset); } unsigned IdxMode = IsIndirectSrc ? - VGPRIndexMode::SRC0_ENABLE : VGPRIndexMode::DST_ENABLE; + AMDGPU::VGPRIndexMode::SRC0_ENABLE : AMDGPU::VGPRIndexMode::DST_ENABLE; MachineInstr *SetOn = BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON)) .addReg(IdxReg, RegState::Kill) @@ -2913,10 +3202,12 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop( } // Update EXEC, switch all done bits to 0 and all todo bits to 1. + unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; MachineInstr *InsertPt = - BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC) - .addReg(AMDGPU::EXEC) - .addReg(NewExec); + BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term + : AMDGPU::S_XOR_B64_term), Exec) + .addReg(Exec) + .addReg(NewExec); // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use // s_cbranch_scc0? @@ -2942,38 +3233,28 @@ static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, bool UseGPRIdxMode, bool IsIndirectSrc) { MachineFunction *MF = MBB.getParent(); + const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); MachineRegisterInfo &MRI = MF->getRegInfo(); const DebugLoc &DL = MI.getDebugLoc(); MachineBasicBlock::iterator I(&MI); + const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); unsigned DstReg = MI.getOperand(0).getReg(); - unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); - unsigned TmpExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); + unsigned SaveExec = MRI.createVirtualRegister(BoolXExecRC); + unsigned TmpExec = MRI.createVirtualRegister(BoolXExecRC); + unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; + unsigned MovExecOpc = ST.isWave32() ? 
AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec); // Save the EXEC mask - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), SaveExec) - .addReg(AMDGPU::EXEC); + BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec) + .addReg(Exec); - // To insert the loop we need to split the block. Move everything after this - // point to a new block, and insert a new empty block between the two. - MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock(); - MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock(); - MachineFunction::iterator MBBI(MBB); - ++MBBI; - - MF->insert(MBBI, LoopBB); - MF->insert(MBBI, RemainderBB); - - LoopBB->addSuccessor(LoopBB); - LoopBB->addSuccessor(RemainderBB); - - // Move the rest of the block into a new block. - RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB); - RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end()); - - MBB.addSuccessor(LoopBB); + MachineBasicBlock *LoopBB; + MachineBasicBlock *RemainderBB; + std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, MBB, false); const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx); @@ -2982,7 +3263,7 @@ static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, Offset, UseGPRIdxMode, IsIndirectSrc); MachineBasicBlock::iterator First = RemainderBB->begin(); - BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC) + BuildMI(*RemainderBB, First, DL, TII->get(MovExecOpc), Exec) .addReg(SaveExec); return InsPt; @@ -3025,7 +3306,7 @@ static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII, if (UseGPRIdxMode) { unsigned IdxMode = IsIndirectSrc ? - VGPRIndexMode::SRC0_ENABLE : VGPRIndexMode::DST_ENABLE; + AMDGPU::VGPRIndexMode::SRC0_ENABLE : AMDGPU::VGPRIndexMode::DST_ENABLE; if (Offset == 0) { MachineInstr *SetOn = BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON)) @@ -3274,6 +3555,9 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( case AMDGPU::S_ADD_U64_PSEUDO: case AMDGPU::S_SUB_U64_PSEUDO: { MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + const TargetRegisterClass *BoolRC = TRI->getBoolRC(); const DebugLoc &DL = MI.getDebugLoc(); MachineOperand &Dest = MI.getOperand(0); @@ -3284,17 +3568,17 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(MI, MRI, - Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub0, + Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32_XM0RegClass); MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(MI, MRI, - Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub1, + Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32_XM0RegClass); MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(MI, MRI, - Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub0, + Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32_XM0RegClass); MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(MI, MRI, - Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub1, + Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32_XM0RegClass); bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO); @@ -3330,6 +3614,14 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( MI.eraseFromParent(); return BB; + case AMDGPU::SI_INIT_EXEC_LO: + // This should be before all vector instructions. 
+ BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), + AMDGPU::EXEC_LO) + .addImm(MI.getOperand(0).getImm()); + MI.eraseFromParent(); + return BB; + case AMDGPU::SI_INIT_EXEC_FROM_INPUT: { // Extract the thread count from an SGPR input and set EXEC accordingly. // Since BFM can't shift by 64, handle that case with CMP + CMOV. @@ -3363,24 +3655,31 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( (void)Found; // This should be before all vector instructions. + unsigned Mask = (getSubtarget()->getWavefrontSize() << 1) - 1; + bool isWave32 = getSubtarget()->isWave32(); + unsigned Exec = isWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC; BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFE_U32), CountReg) .addReg(InputReg) - .addImm((MI.getOperand(1).getImm() & 0x7f) | 0x70000); - BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFM_B64), - AMDGPU::EXEC) + .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000); + BuildMI(*BB, FirstMI, DebugLoc(), + TII->get(isWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), + Exec) .addReg(CountReg) .addImm(0); BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMP_EQ_U32)) .addReg(CountReg, RegState::Kill) - .addImm(64); - BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMOV_B64), - AMDGPU::EXEC) + .addImm(getSubtarget()->getWavefrontSize()); + BuildMI(*BB, FirstMI, DebugLoc(), + TII->get(isWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64), + Exec) .addImm(-1); MI.eraseFromParent(); return BB; } case AMDGPU::GET_GROUPSTATICSIZE: { + assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA || + getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL); DebugLoc DL = MI.getDebugLoc(); BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32)) .add(MI.getOperand(0)) @@ -3405,6 +3704,8 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( return splitKillBlock(MI, BB); case AMDGPU::V_CNDMASK_B64_PSEUDO: { MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); unsigned Dst = MI.getOperand(0).getReg(); unsigned Src0 = MI.getOperand(1).getReg(); @@ -3414,16 +3715,21 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( unsigned DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); unsigned DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - unsigned SrcCondCopy = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); + const auto *CondRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); + unsigned SrcCondCopy = MRI.createVirtualRegister(CondRC); BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy) .addReg(SrcCond); BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo) + .addImm(0) .addReg(Src0, 0, AMDGPU::sub0) + .addImm(0) .addReg(Src1, 0, AMDGPU::sub0) .addReg(SrcCondCopy); BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi) + .addImm(0) .addReg(Src0, 0, AMDGPU::sub1) + .addImm(0) .addReg(Src1, 0, AMDGPU::sub1) .addReg(SrcCondCopy); @@ -3457,40 +3763,60 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( .addReg(Info->getFrameOffsetReg(), RegState::Implicit); return BB; } - case AMDGPU::SI_CALL_ISEL: - case AMDGPU::SI_TCRETURN_ISEL: { + case AMDGPU::SI_CALL_ISEL: { const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); const DebugLoc &DL = MI.getDebugLoc(); + unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF); - MachineRegisterInfo &MRI = MF->getRegInfo(); - unsigned 
GlobalAddrReg = MI.getOperand(0).getReg(); - MachineInstr *PCRel = MRI.getVRegDef(GlobalAddrReg); - assert(PCRel->getOpcode() == AMDGPU::SI_PC_ADD_REL_OFFSET); + MachineInstrBuilder MIB; + MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg); - const GlobalValue *G = PCRel->getOperand(1).getGlobal(); + for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) + MIB.add(MI.getOperand(I)); - MachineInstrBuilder MIB; - if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) { - MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg) - .add(MI.getOperand(0)) - .addGlobalAddress(G); - } else { - MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_TCRETURN)) - .add(MI.getOperand(0)) - .addGlobalAddress(G); + MIB.cloneMemRefs(MI); + MI.eraseFromParent(); + return BB; + } + case AMDGPU::V_ADD_I32_e32: + case AMDGPU::V_SUB_I32_e32: + case AMDGPU::V_SUBREV_I32_e32: { + // TODO: Define distinct V_*_I32_Pseudo instructions instead. + const DebugLoc &DL = MI.getDebugLoc(); + unsigned Opc = MI.getOpcode(); - // There is an additional imm operand for tcreturn, but it should be in the - // right place already. + bool NeedClampOperand = false; + if (TII->pseudoToMCOpcode(Opc) == -1) { + Opc = AMDGPU::getVOPe64(Opc); + NeedClampOperand = true; } - for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I) - MIB.add(MI.getOperand(I)); + auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg()); + if (TII->isVOP3(*I)) { + const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + I.addReg(TRI->getVCC(), RegState::Define); + } + I.add(MI.getOperand(1)) + .add(MI.getOperand(2)); + if (NeedClampOperand) + I.addImm(0); // clamp bit for e64 encoding + + TII->legalizeOperands(*I); - MIB.cloneMemRefs(MI); MI.eraseFromParent(); return BB; } + case AMDGPU::DS_GWS_INIT: + case AMDGPU::DS_GWS_SEMA_V: + case AMDGPU::DS_GWS_SEMA_BR: + case AMDGPU::DS_GWS_SEMA_P: + case AMDGPU::DS_GWS_SEMA_RELEASE_ALL: + case AMDGPU::DS_GWS_BARRIER: + if (getSubtarget()->hasGWSAutoReplay()) + return BB; + return emitGWSMemViolTestLoop(MI, BB); default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); } @@ -3617,6 +3943,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); case ISD::BRCOND: return LowerBRCOND(Op, DAG); + case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); case ISD::LOAD: { SDValue Result = LowerLOAD(Op, DAG); assert((!Result.getNode() || @@ -3641,10 +3968,14 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG); case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG); case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG); + case ISD::INSERT_SUBVECTOR: + return lowerINSERT_SUBVECTOR(Op, DAG); case ISD::INSERT_VECTOR_ELT: return lowerINSERT_VECTOR_ELT(Op, DAG); case ISD::EXTRACT_VECTOR_ELT: return lowerEXTRACT_VECTOR_ELT(Op, DAG); + case ISD::VECTOR_SHUFFLE: + return lowerVECTOR_SHUFFLE(Op, DAG); case ISD::BUILD_VECTOR: return lowerBUILD_VECTOR(Op, DAG); case ISD::FP_ROUND: @@ -3742,10 +4073,7 @@ SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); - const auto *CD = dyn_cast<ConstantSDNode>(N->getOperand(3)); - if (!CD) - return DAG.getUNDEF(VT); - + const auto *CD = 
cast<ConstantSDNode>(N->getOperand(3)); int CondCode = CD->getSExtValue(); if (CondCode < ICmpInst::Predicate::FIRST_ICMP_PREDICATE || CondCode > ICmpInst::Predicate::LAST_ICMP_PREDICATE) @@ -3753,7 +4081,6 @@ static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode); - SDValue LHS = N->getOperand(1); SDValue RHS = N->getOperand(2); @@ -3769,16 +4096,20 @@ static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, ISD::CondCode CCOpcode = getICmpCondCode(IcInput); - return DAG.getNode(AMDGPUISD::SETCC, DL, VT, LHS, RHS, - DAG.getCondCode(CCOpcode)); + unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize(); + EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize); + + SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS, + DAG.getCondCode(CCOpcode)); + if (VT.bitsEq(CCVT)) + return SetCC; + return DAG.getZExtOrTrunc(SetCC, DL, VT); } static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); - const auto *CD = dyn_cast<ConstantSDNode>(N->getOperand(3)); - if (!CD) - return DAG.getUNDEF(VT); + const auto *CD = cast<ConstantSDNode>(N->getOperand(3)); int CondCode = CD->getSExtValue(); if (CondCode < FCmpInst::Predicate::FIRST_FCMP_PREDICATE || @@ -3798,8 +4129,13 @@ static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode); ISD::CondCode CCOpcode = getFCmpCondCode(IcInput); - return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src0, - Src1, DAG.getCondCode(CCOpcode)); + unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize(); + EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize); + SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0, + Src1, DAG.getCondCode(CCOpcode)); + if (VT.bitsEq(CCVT)) + return SetCC; + return DAG.getZExtOrTrunc(SetCC, SL, VT); } void SITargetLowering::ReplaceNodeResults(SDNode *N, @@ -3957,32 +4293,6 @@ unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const { return 0; } -void SITargetLowering::createDebuggerPrologueStackObjects( - MachineFunction &MF) const { - // Create stack objects that are used for emitting debugger prologue. - // - // Debugger prologue writes work group IDs and work item IDs to scratch memory - // at fixed location in the following format: - // offset 0: work group ID x - // offset 4: work group ID y - // offset 8: work group ID z - // offset 16: work item ID x - // offset 20: work item ID y - // offset 24: work item ID z - SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); - int ObjectIdx = 0; - - // For each dimension: - for (unsigned i = 0; i < 3; ++i) { - // Create fixed stack object for work group ID. - ObjectIdx = MF.getFrameInfo().CreateFixedObject(4, i * 4, true); - Info->setDebuggerWorkGroupIDStackObjectIndex(i, ObjectIdx); - // Create fixed stack object for work item ID. 
- ObjectIdx = MF.getFrameInfo().CreateFixedObject(4, i * 4 + 16, true); - Info->setDebuggerWorkItemIDStackObjectIndex(i, ObjectIdx); - } -} - bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const { const Triple &TT = getTargetMachine().getTargetTriple(); return (GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS || @@ -3991,7 +4301,10 @@ bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const { } bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const { - return (GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS || + // FIXME: Either avoid relying on address space here or change the default + // address space for functions to avoid the explicit check. + return (GV->getValueType()->isFunctionTy() || + GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS || GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS || GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) && !shouldEmitFixup(GV) && @@ -4103,6 +4416,31 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, return Chain; } +SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, + SelectionDAG &DAG) const { + MVT VT = Op.getSimpleValueType(); + SDLoc DL(Op); + // Checking the depth + if (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue() != 0) + return DAG.getConstant(0, DL, VT); + + MachineFunction &MF = DAG.getMachineFunction(); + const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + // Check for kernel and shader functions + if (Info->isEntryFunction()) + return DAG.getConstant(0, DL, VT); + + MachineFrameInfo &MFI = MF.getFrameInfo(); + // There is a call to @llvm.returnaddress in this function + MFI.setReturnAddressIsTaken(true); + + const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); + // Get the return address reg and mark it as an implicit live-in + unsigned Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF), getRegClassFor(VT, Op.getNode()->isDivergent())); + + return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT); +} + SDValue SITargetLowering::getFPExtOrFPTrunc(SelectionDAG &DAG, SDValue Op, const SDLoc &DL, @@ -4131,7 +4469,9 @@ SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); - bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction()); + const MachineFunction &MF = DAG.getMachineFunction(); + const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + bool IsIEEEMode = Info->getMode().IEEE; // FIXME: Assert during eslection that this is only selected for // ieee_mode. Currently a combine can produce the ieee version for non-ieee @@ -4302,6 +4642,32 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op, return DAG.getUNDEF(ASC->getValueType(0)); } +// This lowers an INSERT_SUBVECTOR by extracting the individual elements from +// the small vector and inserting them into the big vector. That is better than +// the default expansion of doing it via a stack slot. Even though the use of +// the stack slot would be optimized away afterwards, the stack slot itself +// remains. 
+SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op, + SelectionDAG &DAG) const { + SDValue Vec = Op.getOperand(0); + SDValue Ins = Op.getOperand(1); + SDValue Idx = Op.getOperand(2); + EVT VecVT = Vec.getValueType(); + EVT InsVT = Ins.getValueType(); + EVT EltVT = VecVT.getVectorElementType(); + unsigned InsNumElts = InsVT.getVectorNumElements(); + unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + SDLoc SL(Op); + + for (unsigned I = 0; I != InsNumElts; ++I) { + SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins, + DAG.getConstant(I, SL, MVT::i32)); + Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt, + DAG.getConstant(IdxVal + I, SL, MVT::i32)); + } + return Vec; +} + SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { SDValue Vec = Op.getOperand(0); @@ -4352,12 +4718,12 @@ SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op, MVT IntVT = MVT::getIntegerVT(VecSize); // Avoid stack access for dynamic indexing. - SDValue Val = InsVal; - if (InsVal.getValueType() == MVT::f16) - Val = DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal); - // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec - SDValue ExtVal = DAG.getNode(ISD::ZERO_EXTEND, SL, IntVT, Val); + + // Create a congruent vector with the target value in each element so that + // the required element can be masked and ORed into the target vector. + SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT, + DAG.getSplatBuildVector(VecVT, SL, InsVal)); assert(isPowerOf2_32(EltSize)); SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32); @@ -4419,6 +4785,63 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT); } +static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) { + assert(Elt % 2 == 0); + return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0); +} + +SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op, + SelectionDAG &DAG) const { + SDLoc SL(Op); + EVT ResultVT = Op.getValueType(); + ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op); + + EVT PackVT = ResultVT.isInteger() ? MVT::v2i16 : MVT::v2f16; + EVT EltVT = PackVT.getVectorElementType(); + int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements(); + + // vector_shuffle <0,1,6,7> lhs, rhs + // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2) + // + // vector_shuffle <6,7,2,3> lhs, rhs + // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2) + // + // vector_shuffle <6,7,0,1> lhs, rhs + // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0) + + // Avoid scalarizing when both halves are reading from consecutive elements. + SmallVector<SDValue, 4> Pieces; + for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) { + if (elementPairIsContiguous(SVN->getMask(), I)) { + const int Idx = SVN->getMaskElt(I); + int VecIdx = Idx < SrcNumElts ? 0 : 1; + int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts; + SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, + PackVT, SVN->getOperand(VecIdx), + DAG.getConstant(EltIdx, SL, MVT::i32)); + Pieces.push_back(SubVec); + } else { + const int Idx0 = SVN->getMaskElt(I); + const int Idx1 = SVN->getMaskElt(I + 1); + int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1; + int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1; + int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts; + int EltIdx1 = Idx1 < SrcNumElts ? 
Idx1 : Idx1 - SrcNumElts; + + SDValue Vec0 = SVN->getOperand(VecIdx0); + SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, + Vec0, DAG.getConstant(EltIdx0, SL, MVT::i32)); + + SDValue Vec1 = SVN->getOperand(VecIdx1); + SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, + Vec1, DAG.getConstant(EltIdx1, SL, MVT::i32)); + Pieces.push_back(DAG.getBuildVector(PackVT, SL, { Elt0, Elt1 })); + } + } + + return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces); +} + SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { SDLoc SL(Op); @@ -4512,11 +4935,18 @@ buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too // small. This requires us to add 4 to the global variable offset in order to // compute the correct address. - SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, - GAFlags); - SDValue PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, - GAFlags == SIInstrInfo::MO_NONE ? - GAFlags : GAFlags + 1); + unsigned LoFlags = GAFlags; + if (LoFlags == SIInstrInfo::MO_NONE) + LoFlags = SIInstrInfo::MO_REL32; + SDValue PtrLo = + DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, LoFlags); + SDValue PtrHi; + if (GAFlags == SIInstrInfo::MO_NONE) { + PtrHi = DAG.getTargetConstant(0, DL, MVT::i32); + } else { + PtrHi = + DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, GAFlags + 1); + } return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi); } @@ -4525,7 +4955,10 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, SelectionDAG &DAG) const { GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op); const GlobalValue *GV = GSD->getGlobal(); - if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || + if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && + (!GV->hasExternalLinkage() || + getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA || + getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL)) || GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS || GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG); @@ -4533,7 +4966,12 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDLoc DL(GSD); EVT PtrVT = Op.getValueType(); - // FIXME: Should not make address space based decisions here. + if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { + SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(), + SIInstrInfo::MO_ABS32_LO); + return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA); + } + if (shouldEmitFixup(GV)) return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT); else if (shouldEmitPCReloc(GV)) @@ -4641,10 +5079,8 @@ static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, } static bool parseCachePolicy(SDValue CachePolicy, SelectionDAG &DAG, - SDValue *GLC, SDValue *SLC) { - auto CachePolicyConst = dyn_cast<ConstantSDNode>(CachePolicy.getNode()); - if (!CachePolicyConst) - return false; + SDValue *GLC, SDValue *SLC, SDValue *DLC) { + auto CachePolicyConst = cast<ConstantSDNode>(CachePolicy.getNode()); uint64_t Value = CachePolicyConst->getZExtValue(); SDLoc DL(CachePolicy); @@ -4656,6 +5092,10 @@ static bool parseCachePolicy(SDValue CachePolicy, SelectionDAG &DAG, *SLC = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32); Value &= ~(uint64_t)0x2; } + if (DLC) { + *DLC = DAG.getTargetConstant((Value & 0x4) ? 
1 : 0, DL, MVT::i32); + Value &= ~(uint64_t)0x4; + } return Value == 0; } @@ -4689,14 +5129,14 @@ static SDValue constructRetValue(SelectionDAG &DAG, EVT CastVT = NumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, NumElts) : AdjEltVT; - // Special case for v8f16. Rather than add support for this, use v4i32 to + // Special case for v6f16. Rather than add support for this, use v3i32 to // extract the data elements - bool V8F16Special = false; - if (CastVT == MVT::v8f16) { - CastVT = MVT::v4i32; + bool V6F16Special = false; + if (NumElts == 6) { + CastVT = EVT::getVectorVT(Context, MVT::i32, NumElts / 2); DMaskPop >>= 1; ReqRetNumElts >>= 1; - V8F16Special = true; + V6F16Special = true; AdjVT = MVT::v2i32; } @@ -4726,7 +5166,7 @@ static SDValue constructRetValue(SelectionDAG &DAG, PreTFCRes = BVElts[0]; } - if (V8F16Special) + if (V6F16Special) PreTFCRes = DAG.getNode(ISD::BITCAST, DL, MVT::v4f16, PreTFCRes); if (!IsTexFail) { @@ -4745,9 +5185,7 @@ static SDValue constructRetValue(SelectionDAG &DAG, static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE, SDValue *LWE, bool &IsTexFail) { - auto TexFailCtrlConst = dyn_cast<ConstantSDNode>(TexFailCtrl.getNode()); - if (!TexFailCtrlConst) - return false; + auto TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode()); uint64_t Value = TexFailCtrlConst->getZExtValue(); if (Value) { @@ -4774,7 +5212,10 @@ SDValue SITargetLowering::lowerImage(SDValue Op, const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim); const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode); + const AMDGPU::MIMGMIPMappingInfo *MIPMappingInfo = + AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode); unsigned IntrOpcode = Intr->BaseOpcode; + bool IsGFX10 = Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10; SmallVector<EVT, 3> ResultTypes(Op->value_begin(), Op->value_end()); SmallVector<EVT, 3> OrigResultTypes(Op->value_begin(), Op->value_end()); @@ -4810,9 +5251,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op, } } else { unsigned DMaskIdx = BaseOpcode->Store ? 3 : isa<MemSDNode>(Op) ? 2 : 1; - auto DMaskConst = dyn_cast<ConstantSDNode>(Op.getOperand(DMaskIdx)); - if (!DMaskConst) - return Op; + auto DMaskConst = cast<ConstantSDNode>(Op.getOperand(DMaskIdx)); DMask = DMaskConst->getZExtValue(); DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask); @@ -4821,8 +5260,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op, MVT StoreVT = VData.getSimpleValueType(); if (StoreVT.getScalarType() == MVT::f16) { - if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS || - !BaseOpcode->HasD16) + if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16) return Op; // D16 is unsupported for this instruction IsD16 = true; @@ -4835,8 +5273,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op, // and whether packing is supported. 
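// For context, "packing" here means two f16 lanes per 32-bit register; the
// v6f16 special case in constructRetValue above relies on the same layout
// (6 x f16 viewed as 3 x i32). A hypothetical stand-alone encoding:
//
//   static uint32_t packHalves(uint16_t Lo, uint16_t Hi) {
//     return uint32_t(Lo) | (uint32_t(Hi) << 16);  // Lo in [15:0], Hi in [31:16]
//   }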
MVT LoadVT = ResultTypes[0].getSimpleVT(); if (LoadVT.getScalarType() == MVT::f16) { - if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS || - !BaseOpcode->HasD16) + if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16) return Op; // D16 is unsupported for this instruction IsD16 = true; @@ -4878,6 +5315,17 @@ SDValue SITargetLowering::lowerImage(SDValue Op, } } + // Optimize _mip away, when 'lod' is zero + if (MIPMappingInfo) { + if (auto ConstantLod = + dyn_cast<ConstantSDNode>(Op.getOperand(AddrIdx+NumVAddrs-1))) { + if (ConstantLod->isNullValue()) { + IntrOpcode = MIPMappingInfo->NONMIP; // set new opcode to variant without _mip + NumMIVAddrs--; // remove 'lod' + } + } + } + // Check for 16 bit addresses and pack if true. unsigned DimIdx = AddrIdx + BaseOpcode->NumExtraArgs; MVT VAddrVT = Op.getOperand(DimIdx).getSimpleValueType(); @@ -4915,7 +5363,22 @@ SDValue SITargetLowering::lowerImage(SDValue Op, VAddrs.push_back(Op.getOperand(AddrIdx + i)); } - SDValue VAddr = getBuildDwordsVector(DAG, DL, VAddrs); + // If the register allocator cannot place the address registers contiguously + // without introducing moves, then using the non-sequential address encoding + // is always preferable, since it saves VALU instructions and is usually a + // wash in terms of code size or even better. + // + // However, we currently have no way of hinting to the register allocator that + // MIMG addresses should be placed contiguously when it is possible to do so, + // so force non-NSA for the common 2-address case as a heuristic. + // + // SIShrinkInstructions will convert NSA encodings to non-NSA after register + // allocation when possible. + bool UseNSA = + ST->hasFeature(AMDGPU::FeatureNSAEncoding) && VAddrs.size() >= 3; + SDValue VAddr; + if (!UseNSA) + VAddr = getBuildDwordsVector(DAG, DL, VAddrs); SDValue True = DAG.getTargetConstant(1, DL, MVT::i1); SDValue False = DAG.getTargetConstant(0, DL, MVT::i1); @@ -4926,9 +5389,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op, CtrlIdx = AddrIdx + NumVAddrs + 1; } else { auto UnormConst = - dyn_cast<ConstantSDNode>(Op.getOperand(AddrIdx + NumVAddrs + 2)); - if (!UnormConst) - return Op; + cast<ConstantSDNode>(Op.getOperand(AddrIdx + NumVAddrs + 2)); Unorm = UnormConst->getZExtValue() ? True : False; CtrlIdx = AddrIdx + NumVAddrs + 3; @@ -4965,9 +5426,6 @@ SDValue SITargetLowering::lowerImage(SDValue Op, return Undef; } - // Have to use a power of 2 number of dwords - NumVDataDwords = 1 << Log2_32_Ceil(NumVDataDwords); - EVT NewVT = NumVDataDwords > 1 ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumVDataDwords) : MVT::f32; @@ -4983,45 +5441,66 @@ SDValue SITargetLowering::lowerImage(SDValue Op, SDValue GLC; SDValue SLC; + SDValue DLC; if (BaseOpcode->Atomic) { GLC = True; // TODO no-return optimization - if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, nullptr, &SLC)) + if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, nullptr, &SLC, + IsGFX10 ? &DLC : nullptr)) return Op; } else { - if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, &GLC, &SLC)) + if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, &GLC, &SLC, + IsGFX10 ? 
&DLC : nullptr)) return Op; } - SmallVector<SDValue, 14> Ops; + SmallVector<SDValue, 26> Ops; if (BaseOpcode->Store || BaseOpcode->Atomic) Ops.push_back(VData); // vdata - Ops.push_back(VAddr); + if (UseNSA) { + for (const SDValue &Addr : VAddrs) + Ops.push_back(Addr); + } else { + Ops.push_back(VAddr); + } Ops.push_back(Op.getOperand(AddrIdx + NumVAddrs)); // rsrc if (BaseOpcode->Sampler) Ops.push_back(Op.getOperand(AddrIdx + NumVAddrs + 1)); // sampler Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32)); + if (IsGFX10) + Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32)); Ops.push_back(Unorm); + if (IsGFX10) + Ops.push_back(DLC); Ops.push_back(GLC); Ops.push_back(SLC); Ops.push_back(IsA16 && // a16 or r128 ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False); Ops.push_back(TFE); // tfe Ops.push_back(LWE); // lwe - Ops.push_back(DimInfo->DA ? True : False); + if (!IsGFX10) + Ops.push_back(DimInfo->DA ? True : False); if (BaseOpcode->HasD16) Ops.push_back(IsD16 ? True : False); if (isa<MemSDNode>(Op)) Ops.push_back(Op.getOperand(0)); // chain - int NumVAddrDwords = VAddr.getValueType().getSizeInBits() / 32; + int NumVAddrDwords = + UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32; int Opcode = -1; - if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) - Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8, - NumVDataDwords, NumVAddrDwords); - if (Opcode == -1) - Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6, + if (IsGFX10) { + Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, + UseNSA ? AMDGPU::MIMGEncGfx10NSA + : AMDGPU::MIMGEncGfx10Default, NumVDataDwords, NumVAddrDwords); + } else { + if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) + Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8, + NumVDataDwords, NumVAddrDwords); + if (Opcode == -1) + Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6, + NumVDataDwords, NumVAddrDwords); + } assert(Opcode != -1); MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops); @@ -5046,7 +5525,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op, } SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, - SDValue Offset, SDValue GLC, + SDValue Offset, SDValue GLC, SDValue DLC, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); MachineMemOperand *MMO = MF.getMachineMemOperand( @@ -5059,7 +5538,8 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, SDValue Ops[] = { Rsrc, Offset, // Offset - GLC // glc + GLC, + DLC, }; return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(VT), Ops, VT, MMO); @@ -5263,16 +5743,18 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32, SDLoc(DAG.getEntryNode()), MFI->getArgInfo().WorkItemIDZ); - case SIIntrinsic::SI_load_const: { - SDValue Load = - lowerSBuffer(MVT::i32, DL, Op.getOperand(1), Op.getOperand(2), - DAG.getTargetConstant(0, DL, MVT::i1), DAG); - return DAG.getNode(ISD::BITCAST, DL, MVT::f32, Load); - } + case Intrinsic::amdgcn_wavefrontsize: + return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(), + SDLoc(Op), MVT::i32); case Intrinsic::amdgcn_s_buffer_load: { - unsigned Cache = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); - return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2), - DAG.getTargetConstant(Cache & 1, DL, MVT::i1), DAG); + bool IsGFX10 = Subtarget->getGeneration() >= 
AMDGPUSubtarget::GFX10; + SDValue GLC; + SDValue DLC = DAG.getTargetConstant(0, DL, MVT::i1); + if (!parseCachePolicy(Op.getOperand(3), DAG, &GLC, nullptr, + IsGFX10 ? &DLC : nullptr)) + return Op; + return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2), GLC, DLC, + DAG); } case Intrinsic::amdgcn_fdiv_fast: return lowerFDIV_FAST(Op, DAG); @@ -5295,12 +5777,70 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4), Glue); } + case Intrinsic::amdgcn_interp_p1_f16: { + SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5)); + SDValue Glue = M0.getValue(1); + if (getSubtarget()->getLDSBankCount() == 16) { + // 16 bank LDS + SDValue S = DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32, + DAG.getConstant(2, DL, MVT::i32), // P0 + Op.getOperand(2), // Attrchan + Op.getOperand(3), // Attr + Glue); + SDValue Ops[] = { + Op.getOperand(1), // Src0 + Op.getOperand(2), // Attrchan + Op.getOperand(3), // Attr + DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers + S, // Src2 - holds two f16 values selected by high + DAG.getConstant(0, DL, MVT::i32), // $src2_modifiers + Op.getOperand(4), // high + DAG.getConstant(0, DL, MVT::i1), // $clamp + DAG.getConstant(0, DL, MVT::i32) // $omod + }; + return DAG.getNode(AMDGPUISD::INTERP_P1LV_F16, DL, MVT::f32, Ops); + } else { + // 32 bank LDS + SDValue Ops[] = { + Op.getOperand(1), // Src0 + Op.getOperand(2), // Attrchan + Op.getOperand(3), // Attr + DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers + Op.getOperand(4), // high + DAG.getConstant(0, DL, MVT::i1), // $clamp + DAG.getConstant(0, DL, MVT::i32), // $omod + Glue + }; + return DAG.getNode(AMDGPUISD::INTERP_P1LL_F16, DL, MVT::f32, Ops); + } + } + case Intrinsic::amdgcn_interp_p2_f16: { + SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(6)); + SDValue Glue = SDValue(M0.getNode(), 1); + SDValue Ops[] = { + Op.getOperand(2), // Src0 + Op.getOperand(3), // Attrchan + Op.getOperand(4), // Attr + DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers + Op.getOperand(1), // Src2 + DAG.getConstant(0, DL, MVT::i32), // $src2_modifiers + Op.getOperand(5), // high + DAG.getConstant(0, DL, MVT::i1), // $clamp + Glue + }; + return DAG.getNode(AMDGPUISD::INTERP_P2_F16, DL, MVT::f16, Ops); + } case Intrinsic::amdgcn_sin: return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1)); case Intrinsic::amdgcn_cos: return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1)); + case Intrinsic::amdgcn_mul_u24: + return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::amdgcn_mul_i24: + return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::amdgcn_log_clamp: { if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) return SDValue(); @@ -5334,10 +5874,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT, Op.getOperand(1), Op.getOperand(2)); case Intrinsic::amdgcn_div_scale: { - // 3rd parameter required to be a constant. - const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3)); - if (!Param) - return DAG.getMergeValues({ DAG.getUNDEF(VT), DAG.getUNDEF(MVT::i1) }, DL); + const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3)); // Translate to the operands expected by the machine instruction. The // first parameter must be the same as the first instruction. 
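// Side note on the cachepolicy immediate parsed by parseCachePolicy above:
// it packs one flag per bit, and parsing fails if any unknown bit remains
// set. A minimal sketch, assuming all three flags are requested
// (decodeCachePolicy is an illustrative name, not an LLVM API):
//
//   // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10 only)
//   static bool decodeCachePolicy(uint64_t V, bool &GLC, bool &SLC, bool &DLC) {
//     GLC = V & 0x1;
//     SLC = V & 0x2;
//     DLC = V & 0x4;
//     return (V & ~UINT64_C(0x7)) == 0;   // mirrors the `Value == 0` check
//   }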
@@ -5423,6 +5960,23 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::amdgcn_fmad_ftz: return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + + case Intrinsic::amdgcn_if_break: + return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT, + Op->getOperand(1), Op->getOperand(2)), 0); + + case Intrinsic::amdgcn_groupstaticsize: { + Triple::OSType OS = getTargetMachine().getTargetTriple().getOS(); + if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) + return Op; + + const Module *M = MF.getFunction().getParent(); + const GlobalValue *GV = + M->getNamedValue(Intrinsic::getName(Intrinsic::amdgcn_groupstaticsize)); + SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0, + SIInstrInfo::MO_ABS32_LO); + return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0}; + } default: if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrinsicID)) @@ -5438,9 +5992,99 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SDLoc DL(Op); switch (IntrID) { + case Intrinsic::amdgcn_ds_ordered_add: + case Intrinsic::amdgcn_ds_ordered_swap: { + MemSDNode *M = cast<MemSDNode>(Op); + SDValue Chain = M->getOperand(0); + SDValue M0 = M->getOperand(2); + SDValue Value = M->getOperand(3); + unsigned IndexOperand = M->getConstantOperandVal(7); + unsigned WaveRelease = M->getConstantOperandVal(8); + unsigned WaveDone = M->getConstantOperandVal(9); + unsigned ShaderType; + unsigned Instruction; + + unsigned OrderedCountIndex = IndexOperand & 0x3f; + IndexOperand &= ~0x3f; + unsigned CountDw = 0; + + if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) { + CountDw = (IndexOperand >> 24) & 0xf; + IndexOperand &= ~(0xf << 24); + + if (CountDw < 1 || CountDw > 4) { + report_fatal_error( + "ds_ordered_count: dword count must be between 1 and 4"); + } + } + + if (IndexOperand) + report_fatal_error("ds_ordered_count: bad index operand"); + + switch (IntrID) { + case Intrinsic::amdgcn_ds_ordered_add: + Instruction = 0; + break; + case Intrinsic::amdgcn_ds_ordered_swap: + Instruction = 1; + break; + } + + if (WaveDone && !WaveRelease) + report_fatal_error("ds_ordered_count: wave_done requires wave_release"); + + switch (DAG.getMachineFunction().getFunction().getCallingConv()) { + case CallingConv::AMDGPU_CS: + case CallingConv::AMDGPU_KERNEL: + ShaderType = 0; + break; + case CallingConv::AMDGPU_PS: + ShaderType = 1; + break; + case CallingConv::AMDGPU_VS: + ShaderType = 2; + break; + case CallingConv::AMDGPU_GS: + ShaderType = 3; + break; + default: + report_fatal_error("ds_ordered_count unsupported for this calling conv"); + } + + unsigned Offset0 = OrderedCountIndex << 2; + unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) | + (Instruction << 4); + + if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) + Offset1 |= (CountDw - 1) << 6; + + unsigned Offset = Offset0 | (Offset1 << 8); + + SDValue Ops[] = { + Chain, + Value, + DAG.getTargetConstant(Offset, DL, MVT::i16), + copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue + }; + return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL, + M->getVTList(), Ops, M->getMemoryVT(), + M->getMemOperand()); + } + case Intrinsic::amdgcn_ds_fadd: { + MemSDNode *M = cast<MemSDNode>(Op); + unsigned Opc; + switch (IntrID) { + case Intrinsic::amdgcn_ds_fadd: + Opc = ISD::ATOMIC_LOAD_FADD; + break; + } + + return DAG.getAtomic(Opc, SDLoc(Op), M->getMemoryVT(), + M->getOperand(0), M->getOperand(2), M->getOperand(3), + 
M->getMemOperand()); + } case Intrinsic::amdgcn_atomic_inc: case Intrinsic::amdgcn_atomic_dec: - case Intrinsic::amdgcn_ds_fadd: case Intrinsic::amdgcn_ds_fmin: case Intrinsic::amdgcn_ds_fmax: { MemSDNode *M = cast<MemSDNode>(Op); @@ -5452,9 +6096,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_atomic_dec: Opc = AMDGPUISD::ATOMIC_DEC; break; - case Intrinsic::amdgcn_ds_fadd: - Opc = AMDGPUISD::ATOMIC_LOAD_FADD; - break; case Intrinsic::amdgcn_ds_fmin: Opc = AMDGPUISD::ATOMIC_LOAD_FMIN; break; @@ -5503,8 +6144,14 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, if (LoadVT.getScalarType() == MVT::f16) return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops); - return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, - M->getMemOperand()); + + // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics + if (LoadVT.getScalarType() == MVT::i8 || + LoadVT.getScalarType() == MVT::i16) + return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M); + + return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, + M->getMemOperand(), DAG); } case Intrinsic::amdgcn_raw_buffer_load: case Intrinsic::amdgcn_raw_buffer_load_format: { @@ -5531,8 +6178,14 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, if (LoadVT.getScalarType() == MVT::f16) return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops); - return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, - M->getMemOperand()); + + // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics + if (LoadVT.getScalarType() == MVT::i8 || + LoadVT.getScalarType() == MVT::i16) + return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M); + + return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, + M->getMemOperand(), DAG); } case Intrinsic::amdgcn_struct_buffer_load: case Intrinsic::amdgcn_struct_buffer_load_format: { @@ -5559,8 +6212,14 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, if (LoadVT.getScalarType() == MVT::f16) return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops); - return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, - M->getMemOperand()); + + // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics + if (LoadVT.getScalarType() == MVT::i8 || + LoadVT.getScalarType() == MVT::i16) + return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M); + + return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, + M->getMemOperand(), DAG); } case Intrinsic::amdgcn_tbuffer_load: { MemSDNode *M = cast<MemSDNode>(Op); @@ -5588,9 +6247,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, if (LoadVT.getScalarType() == MVT::f16) return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG, Ops); - return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL, - Op->getVTList(), Ops, LoadVT, - M->getMemOperand()); + return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL, + Op->getVTList(), Ops, LoadVT, M->getMemOperand(), + DAG); } case Intrinsic::amdgcn_raw_tbuffer_load: { MemSDNode *M = cast<MemSDNode>(Op); @@ -5612,9 +6271,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, if (LoadVT.getScalarType() == MVT::f16) return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG, Ops); - return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL, - Op->getVTList(), Ops, LoadVT, - M->getMemOperand()); + return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL, + 
Op->getVTList(), Ops, LoadVT, M->getMemOperand(), + DAG); } case Intrinsic::amdgcn_struct_tbuffer_load: { MemSDNode *M = cast<MemSDNode>(Op); @@ -5636,9 +6295,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, if (LoadVT.getScalarType() == MVT::f16) return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG, Ops); - return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL, - Op->getVTList(), Ops, LoadVT, - M->getMemOperand()); + return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL, + Op->getVTList(), Ops, LoadVT, M->getMemOperand(), + DAG); } case Intrinsic::amdgcn_buffer_atomic_swap: case Intrinsic::amdgcn_buffer_atomic_add: @@ -5913,6 +6572,39 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, } } +// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to +// dwordx4 if on SI. +SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL, + SDVTList VTList, + ArrayRef<SDValue> Ops, EVT MemVT, + MachineMemOperand *MMO, + SelectionDAG &DAG) const { + EVT VT = VTList.VTs[0]; + EVT WidenedVT = VT; + EVT WidenedMemVT = MemVT; + if (!Subtarget->hasDwordx3LoadStores() && + (WidenedVT == MVT::v3i32 || WidenedVT == MVT::v3f32)) { + WidenedVT = EVT::getVectorVT(*DAG.getContext(), + WidenedVT.getVectorElementType(), 4); + WidenedMemVT = EVT::getVectorVT(*DAG.getContext(), + WidenedMemVT.getVectorElementType(), 4); + MMO = DAG.getMachineFunction().getMachineMemOperand(MMO, 0, 16); + } + + assert(VTList.NumVTs == 2); + SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]); + + auto NewOp = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops, + WidenedMemVT, MMO); + if (WidenedVT != VT) { + auto Extract = DAG.getNode( + ISD::EXTRACT_SUBVECTOR, DL, VT, NewOp, + DAG.getConstant(0, DL, getVectorIdxTy(DAG.getDataLayout()))); + NewOp = DAG.getMergeValues({ Extract, SDValue(NewOp.getNode(), 1) }, DL); + } + return NewOp; +} + SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG) const { EVT StoreVT = VData.getValueType(); @@ -6129,6 +6821,12 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT; Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc; MemSDNode *M = cast<MemSDNode>(Op); + + // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics + EVT VDataType = VData.getValueType().getScalarType(); + if (VDataType == MVT::i8 || VDataType == MVT::i16) + return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M); + return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, M->getMemoryVT(), M->getMemOperand()); } @@ -6155,6 +6853,12 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT; Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc; MemSDNode *M = cast<MemSDNode>(Op); + + // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics + EVT VDataType = VData.getValueType().getScalarType(); + if (VDataType == MVT::i8 || VDataType == MVT::i16) + return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M); + return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, M->getMemoryVT(), M->getMemOperand()); } @@ -6181,10 +6885,63 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT; Opc = IsD16 ? 
AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc; MemSDNode *M = cast<MemSDNode>(Op); + + // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics + EVT VDataType = VData.getValueType().getScalarType(); + if (VDataType == MVT::i8 || VDataType == MVT::i16) + return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M); + return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, M->getMemoryVT(), M->getMemOperand()); } + case Intrinsic::amdgcn_buffer_atomic_fadd: { + unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue(); + unsigned IdxEn = 1; + if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4))) + IdxEn = Idx->getZExtValue() != 0; + SDValue Ops[] = { + Chain, + Op.getOperand(2), // vdata + Op.getOperand(3), // rsrc + Op.getOperand(4), // vindex + SDValue(), // voffset -- will be set by setBufferOffsets + SDValue(), // soffset -- will be set by setBufferOffsets + SDValue(), // offset -- will be set by setBufferOffsets + DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy + DAG.getConstant(IdxEn, DL, MVT::i1), // idxen + }; + setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]); + EVT VT = Op.getOperand(2).getValueType(); + + auto *M = cast<MemSDNode>(Op); + unsigned Opcode = VT.isVector() ? AMDGPUISD::BUFFER_ATOMIC_PK_FADD + : AMDGPUISD::BUFFER_ATOMIC_FADD; + + return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT, + M->getMemOperand()); + } + + case Intrinsic::amdgcn_global_atomic_fadd: { + SDValue Ops[] = { + Chain, + Op.getOperand(2), // ptr + Op.getOperand(3) // vdata + }; + EVT VT = Op.getOperand(3).getValueType(); + + auto *M = cast<MemSDNode>(Op); + unsigned Opcode = VT.isVector() ? AMDGPUISD::ATOMIC_PK_FADD + : AMDGPUISD::ATOMIC_FADD; + + return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT, + M->getMemOperand()); + } + + case Intrinsic::amdgcn_end_cf: + return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other, + Op->getOperand(2), Chain), 0); + default: { if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrinsicID)) @@ -6283,6 +7040,38 @@ void SITargetLowering::setBufferOffsets(SDValue CombinedOffset, Offsets[2] = DAG.getConstant(0, DL, MVT::i32); } +// Handle 8 bit and 16 bit buffer loads +SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG, + EVT LoadVT, SDLoc DL, + ArrayRef<SDValue> Ops, + MemSDNode *M) const { + EVT IntVT = LoadVT.changeTypeToInteger(); + unsigned Opc = (LoadVT.getScalarType() == MVT::i8) ? + AMDGPUISD::BUFFER_LOAD_UBYTE : AMDGPUISD::BUFFER_LOAD_USHORT; + + SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other); + SDValue BufferLoad = DAG.getMemIntrinsicNode(Opc, DL, ResList, + Ops, IntVT, + M->getMemOperand()); + SDValue BufferLoadTrunc = DAG.getNode(ISD::TRUNCATE, DL, + LoadVT.getScalarType(), BufferLoad); + return DAG.getMergeValues({BufferLoadTrunc, BufferLoad.getValue(1)}, DL); +} + +// Handle 8 bit and 16 bit buffer stores +SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG, + EVT VDataType, SDLoc DL, + SDValue Ops[], + MemSDNode *M) const { + SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]); + Ops[1] = BufferStoreExt; + unsigned Opc = (VDataType == MVT::i8) ? 
AMDGPUISD::BUFFER_STORE_BYTE : + AMDGPUISD::BUFFER_STORE_SHORT; + ArrayRef<SDValue> OpsRef = makeArrayRef(&Ops[0], 9); + return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType, + M->getMemOperand()); +} + static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT) { @@ -6395,8 +7184,25 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, BasePtr, RealMemVT, MMO); + if (!MemVT.isVector()) { + SDValue Ops[] = { + DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD), + NewLD.getValue(1) + }; + + return DAG.getMergeValues(Ops, DL); + } + + SmallVector<SDValue, 3> Elts; + for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) { + SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD, + DAG.getConstant(I, DL, MVT::i32)); + + Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt)); + } + SDValue Ops[] = { - DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD), + DAG.getBuildVector(MemVT, DL, Elts), NewLD.getValue(1) }; @@ -6409,15 +7215,21 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { assert(Op.getValueType().getVectorElementType() == MVT::i32 && "Custom lowering for non-i32 vectors hasn't been implemented."); - unsigned Alignment = Load->getAlignment(); - unsigned AS = Load->getAddressSpace(); if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT, - AS, Alignment)) { + *Load->getMemOperand())) { SDValue Ops[2]; std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG); return DAG.getMergeValues(Ops, DL); } + unsigned Alignment = Load->getAlignment(); + unsigned AS = Load->getAddressSpace(); + if (Subtarget->hasLDSMisalignedBug() && + AS == AMDGPUAS::FLAT_ADDRESS && + Alignment < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) { + return SplitVectorLoad(Op, DAG); + } + MachineFunction &MF = DAG.getMachineFunction(); SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); // If there is a possibilty that flat instruction access scratch memory @@ -6430,8 +7242,13 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { if (AS == AMDGPUAS::CONSTANT_ADDRESS || AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { - if (!Op->isDivergent() && Alignment >= 4 && NumElements < 32) - return SDValue(); + if (!Op->isDivergent() && Alignment >= 4 && NumElements < 32) { + if (MemVT.isPow2VectorType()) + return SDValue(); + if (NumElements == 3) + return WidenVectorLoad(Op, DAG); + return SplitVectorLoad(Op, DAG); + } // Non-uniform loads will be selected to MUBUF instructions, so they // have the same legalization requirements as global and private // loads. @@ -6443,8 +7260,13 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { AS == AMDGPUAS::GLOBAL_ADDRESS) { if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() && !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load) && - Alignment >= 4 && NumElements < 32) - return SDValue(); + Alignment >= 4 && NumElements < 32) { + if (MemVT.isPow2VectorType()) + return SDValue(); + if (NumElements == 3) + return WidenVectorLoad(Op, DAG); + return SplitVectorLoad(Op, DAG); + } // Non-uniform loads will be selected to MUBUF instructions, so they // have the same legalization requirements as global and private // loads. 
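// The uniform-load handling added in both branches above follows one small
// decision rule; condensed into a sketch with illustrative names:
//
//   enum class LoadAction { Legal, Widen, Split };
//
//   // Power-of-two vectors keep the default lowering, a 3-element vector
//   // is widened to 4 elements, everything else is split.
//   static LoadAction classifyUniformLoad(unsigned NumElts, bool IsPow2) {
//     if (IsPow2)
//       return LoadAction::Legal;
//     return NumElts == 3 ? LoadAction::Widen : LoadAction::Split;
//   }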
@@ -6456,7 +7278,10 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { AS == AMDGPUAS::FLAT_ADDRESS) { if (NumElements > 4) return SplitVectorLoad(Op, DAG); - // v4 loads are supported for private and global memory. + // v3 loads not supported on SI. + if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores()) + return WidenVectorLoad(Op, DAG); + // v3 and v4 loads are supported for private and global memory. return SDValue(); } if (AS == AMDGPUAS::PRIVATE_ADDRESS) { @@ -6474,11 +7299,14 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { // Same as global/flat if (NumElements > 4) return SplitVectorLoad(Op, DAG); + // v3 loads not supported on SI. + if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores()) + return WidenVectorLoad(Op, DAG); return SDValue(); default: llvm_unreachable("unsupported private_element_size"); } - } else if (AS == AMDGPUAS::LOCAL_ADDRESS) { + } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { // Use ds_read_b128 if possible. if (Subtarget->useDS128() && Load->getAlignment() >= 16 && MemVT.getStoreSize() == 16) @@ -6794,7 +7622,7 @@ SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const { SDValue Scale; - if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) { + if (!Subtarget->hasUsableDivScaleConditionOutput()) { // Workaround a hardware bug on SI where the condition output from div_scale // is not usable. @@ -6856,12 +7684,18 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { assert(VT.isVector() && Store->getValue().getValueType().getScalarType() == MVT::i32); - unsigned AS = Store->getAddressSpace(); if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT, - AS, Store->getAlignment())) { + *Store->getMemOperand())) { return expandUnalignedStore(Store, DAG); } + unsigned AS = Store->getAddressSpace(); + if (Subtarget->hasLDSMisalignedBug() && + AS == AMDGPUAS::FLAT_ADDRESS && + Store->getAlignment() < VT.getStoreSize() && VT.getSizeInBits() > 32) { + return SplitVectorStore(Op, DAG); + } + MachineFunction &MF = DAG.getMachineFunction(); SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); // If there is a possibilty that flat instruction access scratch memory @@ -6875,6 +7709,9 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { AS == AMDGPUAS::FLAT_ADDRESS) { if (NumElements > 4) return SplitVectorStore(Op, DAG); + // v3 stores not supported on SI. + if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores()) + return SplitVectorStore(Op, DAG); return SDValue(); } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) { switch (Subtarget->getMaxPrivateElementSize()) { @@ -6885,16 +7722,16 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { return SplitVectorStore(Op, DAG); return SDValue(); case 16: - if (NumElements > 4) + if (NumElements > 4 || NumElements == 3) return SplitVectorStore(Op, DAG); return SDValue(); default: llvm_unreachable("unsupported private_element_size"); } - } else if (AS == AMDGPUAS::LOCAL_ADDRESS) { + } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { // Use ds_write_b128 if possible. 
if (Subtarget->useDS128() && Store->getAlignment() >= 16 && - VT.getStoreSize() == 16) + VT.getStoreSize() == 16 && NumElements != 3) return SDValue(); if (NumElements > 2) @@ -6905,7 +7742,7 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { // out-of-bounds even if base + offsets is in bounds. Split vectorized // stores here to avoid emitting ds_write2_b32. We may re-combine the // store later in the SILoadStoreOptimizer. - if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS && + if (!Subtarget->hasUsableDSOffset() && NumElements == 2 && VT.getStoreSize() == 8 && Store->getAlignment() < 8) { return SplitVectorStore(Op, DAG); @@ -7614,6 +8451,43 @@ SDValue SITargetLowering::performZeroExtendCombine(SDNode *N, return SDValue(); } +SDValue SITargetLowering::performSignExtendInRegCombine(SDNode *N, + DAGCombinerInfo &DCI) + const { + SDValue Src = N->getOperand(0); + auto *VTSign = cast<VTSDNode>(N->getOperand(1)); + + if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE && + VTSign->getVT() == MVT::i8) || + (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT && + VTSign->getVT() == MVT::i16)) && + Src.hasOneUse()) { + auto *M = cast<MemSDNode>(Src); + SDValue Ops[] = { + Src.getOperand(0), // Chain + Src.getOperand(1), // rsrc + Src.getOperand(2), // vindex + Src.getOperand(3), // voffset + Src.getOperand(4), // soffset + Src.getOperand(5), // offset + Src.getOperand(6), + Src.getOperand(7) + }; + // replace with BUFFER_LOAD_BYTE/SHORT + SDVTList ResList = DCI.DAG.getVTList(MVT::i32, + Src.getOperand(0).getValueType()); + unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE) ? + AMDGPUISD::BUFFER_LOAD_BYTE : AMDGPUISD::BUFFER_LOAD_SHORT; + SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(Opc, SDLoc(N), + ResList, + Ops, M->getMemoryVT(), + M->getMemOperand()); + return DCI.DAG.getMergeValues({BufferLoadSignExt, + BufferLoadSignExt.getValue(1)}, SDLoc(N)); + } + return SDValue(); +} + SDValue SITargetLowering::performClassCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -8013,9 +8887,12 @@ SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG, if (Cmp == APFloat::cmpGreaterThan) return SDValue(); + const MachineFunction &MF = DAG.getMachineFunction(); + const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + // TODO: Check IEEE bit enabled? EVT VT = Op0.getValueType(); - if (Subtarget->enableDX10Clamp()) { + if (Info->getMode().DX10Clamp) { // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the // hardware fmed3 behavior converting to a min. // FIXME: Should this be allowing -0.0? @@ -8059,10 +8936,10 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N, // Only do this if the inner op has one use since this will just increases // register pressure for no benefit. 
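// Shape targeted by the combine below, written at source level (the guards
// restrict it to i32/f32, or f16/i16 when 16-bit min3/max3 is available):
//
//   static int max3(int a, int b, int c) {
//     return std::max(std::max(a, b), c);   // folds to a single v_max3
//   }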
- if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY && - !VT.isVector() && VT != MVT::f64 && - ((VT != MVT::f16 && VT != MVT::i16) || Subtarget->hasMin3Max3_16())) { + !VT.isVector() && + (VT == MVT::i32 || VT == MVT::f32 || + ((VT == MVT::f16 || VT == MVT::i16) && Subtarget->hasMin3Max3_16()))) { // max(max(a, b), c) -> max3(a, b, c) // min(min(a, b), c) -> min3(a, b, c) if (Op0.getOpcode() == Opc && Op0.hasOneUse()) { @@ -8149,9 +9026,12 @@ SDValue SITargetLowering::performFMed3Combine(SDNode *N, return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2); } + const MachineFunction &MF = DAG.getMachineFunction(); + const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother // handling no dx10-clamp? - if (Subtarget->enableDX10Clamp()) { + if (Info->getMode().DX10Clamp) { // If NaNs is clamped to 0, we are free to reorder the inputs. if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1)) @@ -8342,8 +9222,10 @@ unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG, // Only do this if we are not trying to support denormals. v_mad_f32 does not // support denormals ever. - if ((VT == MVT::f32 && !Subtarget->hasFP32Denormals()) || - (VT == MVT::f16 && !Subtarget->hasFP16Denormals())) + if (((VT == MVT::f32 && !Subtarget->hasFP32Denormals()) || + (VT == MVT::f16 && !Subtarget->hasFP16Denormals() && + getSubtarget()->hasMadF16())) && + isOperationLegal(ISD::FMAD, VT)) return ISD::FMAD; const TargetOptions &Options = DAG.getTarget().Options; @@ -8357,6 +9239,46 @@ unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG, return 0; } +// For a reassociatable opcode perform: +// op x, (op y, z) -> op (op x, z), y, if x and z are uniform +SDValue SITargetLowering::reassociateScalarOps(SDNode *N, + SelectionDAG &DAG) const { + EVT VT = N->getValueType(0); + if (VT != MVT::i32 && VT != MVT::i64) + return SDValue(); + + unsigned Opc = N->getOpcode(); + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + + if (!(Op0->isDivergent() ^ Op1->isDivergent())) + return SDValue(); + + if (Op0->isDivergent()) + std::swap(Op0, Op1); + + if (Op1.getOpcode() != Opc || !Op1.hasOneUse()) + return SDValue(); + + SDValue Op2 = Op1.getOperand(1); + Op1 = Op1.getOperand(0); + if (!(Op1->isDivergent() ^ Op2->isDivergent())) + return SDValue(); + + if (Op1->isDivergent()) + std::swap(Op1, Op2); + + // If either operand is constant this will conflict with + // DAGCombiner::ReassociateOps(). 
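// Worked example of the transform above with x and z uniform, y divergent
// (integer add, so the reassociation is exact even with wraparound):
//
//   x + (y + z)  ==>  (x + z) + y
//
// (x + z) can then be computed once on the scalar unit, leaving a single
// VALU add for the divergent y.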
+ if (DAG.isConstantIntBuildVectorOrConstantInt(Op0) || + DAG.isConstantIntBuildVectorOrConstantInt(Op1)) + return SDValue(); + + SDLoc SL(N); + SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1); + return DAG.getNode(Opc, SL, VT, Add1, Op2); +} + static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, @@ -8405,6 +9327,10 @@ SDValue SITargetLowering::performAddCombine(SDNode *N, return SDValue(); } + if (SDValue V = reassociateScalarOps(N, DAG)) { + return V; + } + if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG()) return SDValue(); @@ -8452,14 +9378,10 @@ SDValue SITargetLowering::performSubCombine(SDNode *N, SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); - unsigned Opc = LHS.getOpcode(); - if (Opc != ISD::SUBCARRY) - std::swap(RHS, LHS); - if (LHS.getOpcode() == ISD::SUBCARRY) { // sub (subcarry x, 0, cc), y => subcarry x, y, cc auto C = dyn_cast<ConstantSDNode>(LHS.getOperand(1)); - if (!C || C->getZExtValue() != 0) + if (!C || !C->isNullValue()) return SDValue(); SDValue Args[] = { LHS.getOperand(0), RHS, LHS.getOperand(2) }; return DAG.getNode(ISD::SUBCARRY, SDLoc(N), LHS->getVTList(), Args); @@ -8587,7 +9509,7 @@ SDValue SITargetLowering::performFMACombine(SDNode *N, EVT VT = N->getValueType(0); SDLoc SL(N); - if (!Subtarget->hasDotInsts() || VT != MVT::f32) + if (!Subtarget->hasDot2Insts() || VT != MVT::f32) return SDValue(); // FMA((F32)S0.x, (F32)S1. x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) -> @@ -8801,11 +9723,13 @@ SDValue SITargetLowering::performClampCombine(SDNode *N, if (!CSrc) return SDValue(); + const MachineFunction &MF = DCI.DAG.getMachineFunction(); const APFloat &F = CSrc->getValueAPF(); APFloat Zero = APFloat::getZero(F.getSemantics()); APFloat::cmpResult Cmp0 = F.compare(Zero); if (Cmp0 == APFloat::cmpLessThan || - (Cmp0 == APFloat::cmpUnordered && Subtarget->enableDX10Clamp())) { + (Cmp0 == APFloat::cmpUnordered && + MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) { return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0)); } @@ -8822,7 +9746,6 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { if (getTargetMachine().getOptLevel() == CodeGenOpt::None) return SDValue(); - switch (N->getOpcode()) { default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); @@ -8873,11 +9796,11 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, case ISD::ATOMIC_LOAD_MAX: case ISD::ATOMIC_LOAD_UMIN: case ISD::ATOMIC_LOAD_UMAX: + case ISD::ATOMIC_LOAD_FADD: case AMDGPUISD::ATOMIC_INC: case AMDGPUISD::ATOMIC_DEC: - case AMDGPUISD::ATOMIC_LOAD_FADD: case AMDGPUISD::ATOMIC_LOAD_FMIN: - case AMDGPUISD::ATOMIC_LOAD_FMAX: // TODO: Target mem intrinsics. + case AMDGPUISD::ATOMIC_LOAD_FMAX: // TODO: Target mem intrinsics. if (DCI.isBeforeLegalize()) break; return performMemSDNodeCombine(cast<MemSDNode>(N), DCI); @@ -8889,6 +9812,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, return performXorCombine(N, DCI); case ISD::ZERO_EXTEND: return performZeroExtendCombine(N, DCI); + case ISD::SIGN_EXTEND_INREG: + return performSignExtendInRegCombine(N , DCI); case AMDGPUISD::FP_CLASS: return performClassCombine(N, DCI); case ISD::FCANONICALIZE: @@ -9034,6 +9959,10 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node, // Don't allow 0 dmask, as hardware assumes one channel enabled. bool NoChannels = !NewDmask; if (NoChannels) { + if (!UsesTFC) { + // No uses of the result and not using TFC. Then do nothing. 
+ return Node; + } // If the original dmask has one channel - then nothing to do if (OldBitsSet == 1) return Node; @@ -9205,7 +10134,8 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, break; MVT VT = Src0.getValueType().getSimpleVT(); - const TargetRegisterClass *RC = getRegClassFor(VT); + const TargetRegisterClass *RC = + getRegClassFor(VT, Src0.getNode()->isDivergent()); MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT); @@ -9238,6 +10168,24 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, Ops.push_back(ImpDef.getValue(1)); return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops); } + case AMDGPU::V_PERMLANE16_B32: + case AMDGPU::V_PERMLANEX16_B32: { + ConstantSDNode *FI = cast<ConstantSDNode>(Node->getOperand(0)); + ConstantSDNode *BC = cast<ConstantSDNode>(Node->getOperand(2)); + if (!FI->getZExtValue() && !BC->getZExtValue()) + break; + SDValue VDstIn = Node->getOperand(6); + if (VDstIn.isMachineOpcode() + && VDstIn.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) + break; + MachineSDNode *ImpDef = DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, + SDLoc(Node), MVT::i32); + SmallVector<SDValue, 8> Ops = { SDValue(FI, 0), Node->getOperand(1), + SDValue(BC, 0), Node->getOperand(3), + Node->getOperand(4), Node->getOperand(5), + SDValue(ImpDef, 0), Node->getOperand(7) }; + return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops); + } default: break; } @@ -9256,6 +10204,36 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, if (TII->isVOP3(MI.getOpcode())) { // Make sure constant bus requirements are respected. TII->legalizeOperandsVOP3(MRI, MI); + + // Prefer VGPRs over AGPRs in mAI instructions where possible. + // This saves a chain-copy of registers and better ballance register + // use between vgpr and agpr as agpr tuples tend to be big. + if (const MCOperandInfo *OpInfo = MI.getDesc().OpInfo) { + unsigned Opc = MI.getOpcode(); + const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); + for (auto I : { AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0), + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) }) { + if (I == -1) + break; + MachineOperand &Op = MI.getOperand(I); + if ((OpInfo[I].RegClass != llvm::AMDGPU::AV_64RegClassID && + OpInfo[I].RegClass != llvm::AMDGPU::AV_32RegClassID) || + !TargetRegisterInfo::isVirtualRegister(Op.getReg()) || + !TRI->isAGPR(MRI, Op.getReg())) + continue; + auto *Src = MRI.getUniqueVRegDef(Op.getReg()); + if (!Src || !Src->isCopy() || + !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg())) + continue; + auto *RC = TRI->getRegClassForReg(MRI, Op.getReg()); + auto *NewRC = TRI->getEquivalentVGPRClass(RC); + // All uses of agpr64 and agpr32 can also accept vgpr except for + // v_accvgpr_read, but we do not produce agpr reads during selection, + // so no use checks are needed. 
+ MRI.setRegClass(Op.getReg(), NewRC); + } + } + return; } @@ -9391,9 +10369,15 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, case 64: RC = &AMDGPU::SGPR_64RegClass; break; + case 96: + RC = &AMDGPU::SReg_96RegClass; + break; case 128: RC = &AMDGPU::SReg_128RegClass; break; + case 160: + RC = &AMDGPU::SReg_160RegClass; + break; case 256: RC = &AMDGPU::SReg_256RegClass; break; @@ -9419,6 +10403,9 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, case 128: RC = &AMDGPU::VReg_128RegClass; break; + case 160: + RC = &AMDGPU::VReg_160RegClass; + break; case 256: RC = &AMDGPU::VReg_256RegClass; break; @@ -9427,6 +10414,29 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, break; } break; + case 'a': + switch (VT.getSizeInBits()) { + default: + return std::make_pair(0U, nullptr); + case 32: + case 16: + RC = &AMDGPU::AGPR_32RegClass; + break; + case 64: + RC = &AMDGPU::AReg_64RegClass; + break; + case 128: + RC = &AMDGPU::AReg_128RegClass; + break; + case 512: + RC = &AMDGPU::AReg_512RegClass; + break; + case 1024: + RC = &AMDGPU::AReg_1024RegClass; + // v32 types are not legal but we support them here. + return std::make_pair(0U, RC); + } + break; } // We actually support i128, i16 and f16 as inline parameters // even if they are not reported as legal @@ -9440,6 +10450,8 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, RC = &AMDGPU::VGPR_32RegClass; } else if (Constraint[1] == 's') { RC = &AMDGPU::SGPR_32RegClass; + } else if (Constraint[1] == 'a') { + RC = &AMDGPU::AGPR_32RegClass; } if (RC) { @@ -9459,6 +10471,7 @@ SITargetLowering::getConstraintType(StringRef Constraint) const { default: break; case 's': case 'v': + case 'a': return C_RegisterClass; } } @@ -9471,7 +10484,7 @@ SITargetLowering::getConstraintType(StringRef Constraint) const { void SITargetLowering::finalizeLowering(MachineFunction &MF) const { MachineRegisterInfo &MRI = MF.getRegInfo(); SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); - const MachineFrameInfo &MFI = MF.getFrameInfo(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); if (Info->isEntryFunction()) { @@ -9479,31 +10492,45 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const { reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info); } - // We have to assume the SP is needed in case there are calls in the function - // during lowering. Calls are only detected after the function is - // lowered. We're about to reserve registers, so don't bother using it if we - // aren't really going to use it. - bool NeedSP = !Info->isEntryFunction() || - MFI.hasVarSizedObjects() || - MFI.hasCalls(); + assert(!TRI->isSubRegister(Info->getScratchRSrcReg(), + Info->getStackPtrOffsetReg())); + if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG) + MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg()); - if (NeedSP) { - unsigned ReservedStackPtrOffsetReg = TRI->reservedStackPtrOffsetReg(MF); - Info->setStackPtrOffsetReg(ReservedStackPtrOffsetReg); + // We need to worry about replacing the default register with itself in case + // of MIR testcases missing the MFI. 
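// The guarded replacement used for each placeholder below can be read as a
// tiny helper (hypothetical, not an LLVM API):
//
//   static void replacePlaceholder(MachineRegisterInfo &MRI,
//                                  unsigned Placeholder, unsigned Real) {
//     if (Real != Placeholder)   // never replace a register with itself
//       MRI.replaceRegWith(Placeholder, Real);
//   }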
+ if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG) + MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg()); - assert(Info->getStackPtrOffsetReg() != Info->getFrameOffsetReg()); - assert(!TRI->isSubRegister(Info->getScratchRSrcReg(), - Info->getStackPtrOffsetReg())); - MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg()); - } + if (Info->getFrameOffsetReg() != AMDGPU::FP_REG) + MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg()); - MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg()); - MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg()); - MRI.replaceRegWith(AMDGPU::SCRATCH_WAVE_OFFSET_REG, - Info->getScratchWaveOffsetReg()); + if (Info->getScratchWaveOffsetReg() != AMDGPU::SCRATCH_WAVE_OFFSET_REG) { + MRI.replaceRegWith(AMDGPU::SCRATCH_WAVE_OFFSET_REG, + Info->getScratchWaveOffsetReg()); + } Info->limitOccupancy(MF); + if (ST.isWave32() && !MF.empty()) { + // Add VCC_HI def because many instructions marked as imp-use VCC where + // we may only define VCC_LO. If nothing defines VCC_HI we may end up + // having a use of undef. + + const SIInstrInfo *TII = ST.getInstrInfo(); + DebugLoc DL; + + MachineBasicBlock &MBB = MF.front(); + MachineBasicBlock::iterator I = MBB.getFirstNonDebugInstr(); + BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), AMDGPU::VCC_HI); + + for (auto &MBB : MF) { + for (auto &MI : MBB) { + TII->fixImplicitOperands(MI); + } + } + } + TargetLoweringBase::finalizeLowering(MF); } @@ -9515,14 +10542,81 @@ void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op, TargetLowering::computeKnownBitsForFrameIndex(Op, Known, DemandedElts, DAG, Depth); - if (getSubtarget()->enableHugePrivateBuffer()) - return; - - // Technically it may be possible to have a dispatch with a single workitem - // that uses the full private memory size, but that's not really useful. We - // can't use vaddr in MUBUF instructions if we don't know the address + // Set the high bits to zero based on the maximum allowed scratch size per + // wave. We can't use vaddr in MUBUF instructions if we don't know the address // calculation won't overflow, so assume the sign bit is never set. - Known.Zero.setHighBits(AssumeFrameIndexHighZeroBits); + Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex()); +} + +unsigned SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const { + const unsigned PrefAlign = TargetLowering::getPrefLoopAlignment(ML); + const unsigned CacheLineAlign = 6; // log2(64) + + // Pre-GFX10 target did not benefit from loop alignment + if (!ML || DisableLoopAlignment || + (getSubtarget()->getGeneration() < AMDGPUSubtarget::GFX10) || + getSubtarget()->hasInstFwdPrefetchBug()) + return PrefAlign; + + // On GFX10 I$ is 4 x 64 bytes cache lines. + // By default prefetcher keeps one cache line behind and reads two ahead. + // We can modify it with S_INST_PREFETCH for larger loops to have two lines + // behind and one ahead. + // Therefor we can benefit from aligning loop headers if loop fits 192 bytes. + // If loop fits 64 bytes it always spans no more than two cache lines and + // does not need an alignment. + // Else if loop is less or equal 128 bytes we do not need to modify prefetch, + // Else if loop is less or equal 192 bytes we need two lines behind. + + const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); + const MachineBasicBlock *Header = ML->getHeader(); + if (Header->getAlignment() != PrefAlign) + return Header->getAlignment(); // Already processed. 
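// Condensed decision table for the heuristic below (gfx10 I$ is 4 x 64-byte
// lines; the prefetcher defaults to one line behind, two ahead):
//
//   LoopSize <= 64  : keep default alignment (fits two cache lines anyway)
//   LoopSize <= 128 : align the header to 64 bytes, default prefetch is fine
//   LoopSize <= 192 : align to 64 bytes and emit S_INST_PREFETCH to keep two
//                     lines behind and one ahead
//   LoopSize >  192 : keep default alignment, no prefetch change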
+ + unsigned LoopSize = 0; + for (const MachineBasicBlock *MBB : ML->blocks()) { + // If inner loop block is aligned assume in average half of the alignment + // size to be added as nops. + if (MBB != Header) + LoopSize += (1 << MBB->getAlignment()) / 2; + + for (const MachineInstr &MI : *MBB) { + LoopSize += TII->getInstSizeInBytes(MI); + if (LoopSize > 192) + return PrefAlign; + } + } + + if (LoopSize <= 64) + return PrefAlign; + + if (LoopSize <= 128) + return CacheLineAlign; + + // If any of parent loops is surrounded by prefetch instructions do not + // insert new for inner loop, which would reset parent's settings. + for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) { + if (MachineBasicBlock *Exit = P->getExitBlock()) { + auto I = Exit->getFirstNonDebugInstr(); + if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH) + return CacheLineAlign; + } + } + + MachineBasicBlock *Pre = ML->getLoopPreheader(); + MachineBasicBlock *Exit = ML->getExitBlock(); + + if (Pre && Exit) { + BuildMI(*Pre, Pre->getFirstTerminator(), DebugLoc(), + TII->get(AMDGPU::S_INST_PREFETCH)) + .addImm(1); // prefetch 2 lines behind PC + + BuildMI(*Exit, Exit->getFirstNonDebugInstr(), DebugLoc(), + TII->get(AMDGPU::S_INST_PREFETCH)) + .addImm(2); // prefetch 1 line behind PC + } + + return CacheLineAlign; } LLVM_ATTRIBUTE_UNUSED @@ -9531,7 +10625,8 @@ static bool isCopyFromRegOfInlineAsm(const SDNode *N) { do { // Follow the chain until we find an INLINEASM node. N = N->getOperand(0).getNode(); - if (N->getOpcode() == ISD::INLINEASM) + if (N->getOpcode() == ISD::INLINEASM || + N->getOpcode() == ISD::INLINEASM_BR) return true; } while (N->getOpcode() == ISD::CopyFromReg); return false; @@ -9616,7 +10711,10 @@ bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op, bool SNaN, unsigned Depth) const { if (Op.getOpcode() == AMDGPUISD::CLAMP) { - if (Subtarget->enableDX10Clamp()) + const MachineFunction &MF = DAG.getMachineFunction(); + const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + + if (Info->getMode().DX10Clamp) return true; // Clamped to 0. return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1); } @@ -9624,3 +10722,29 @@ bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op, return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DAG, SNaN, Depth); } + +TargetLowering::AtomicExpansionKind +SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { + switch (RMW->getOperation()) { + case AtomicRMWInst::FAdd: { + Type *Ty = RMW->getType(); + + // We don't have a way to support 16-bit atomics now, so just leave them + // as-is. + if (Ty->isHalfTy()) + return AtomicExpansionKind::None; + + if (!Ty->isFloatTy()) + return AtomicExpansionKind::CmpXChg; + + // TODO: Do have these for flat. Older targets also had them for buffers. + unsigned AS = RMW->getPointerAddressSpace(); + return (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomics()) ? 
+ AtomicExpansionKind::None : AtomicExpansionKind::CmpXChg; + } + default: + break; + } + + return AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(RMW); +} diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h index bcef519ee663..21a215e16ce7 100644 --- a/lib/Target/AMDGPU/SIISelLowering.h +++ b/lib/Target/AMDGPU/SIISelLowering.h @@ -1,9 +1,8 @@ //===-- SIISelLowering.h - SI DAG Lowering Interface ------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -61,7 +60,7 @@ private: SDValue lowerImage(SDValue Op, const AMDGPU::ImageDimIntrinsicInfo *Intr, SelectionDAG &DAG) const; SDValue lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, SDValue Offset, - SDValue GLC, SelectionDAG &DAG) const; + SDValue GLC, SDValue DLC, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const; @@ -90,11 +89,17 @@ private: SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const; SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; - + SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; SDValue adjustLoadValueType(unsigned Opcode, MemSDNode *M, SelectionDAG &DAG, ArrayRef<SDValue> Ops, bool IsIntrinsic = false) const; + // Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to + // dwordx4 if on SI. 
+ SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList, + ArrayRef<SDValue> Ops, EVT MemVT, + MachineMemOperand *MMO, SelectionDAG &DAG) const; + SDValue handleD16VData(SDValue VData, SelectionDAG &DAG) const; /// Converts \p Op, which must be of floating point type, to the @@ -116,8 +121,10 @@ private: SelectionDAG &DAG) const; SDValue lowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue lowerTRAP(SDValue Op, SelectionDAG &DAG) const; SDValue lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const; @@ -141,6 +148,7 @@ private: SDValue performOrCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performXorCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performZeroExtendCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performSignExtendInRegCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performClassCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue getCanonicalConstantFP(SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const; @@ -156,6 +164,7 @@ private: SDValue performExtractVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performInsertVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue reassociateScalarOps(SDNode *N, SelectionDAG &DAG) const; unsigned getFusedOpcode(const SelectionDAG &DAG, const SDNode *N0, const SDNode *N1) const; SDValue performAddCombine(SDNode *N, DAGCombinerInfo &DCI) const; @@ -174,8 +183,6 @@ private: unsigned isCFIntrinsic(const SDNode *Intr) const; - void createDebuggerPrologueStackObjects(MachineFunction &MF) const; - /// \returns True if fixup needs to be emitted for given global value \p GV, /// false otherwise. 
bool shouldEmitFixup(const GlobalValue *GV) const; @@ -194,6 +201,15 @@ private: void setBufferOffsets(SDValue CombinedOffset, SelectionDAG &DAG, SDValue *Offsets, unsigned Align = 4) const; + // Handle 8 bit and 16 bit buffer loads + SDValue handleByteShortBufferLoads(SelectionDAG &DAG, EVT LoadVT, SDLoc DL, + ArrayRef<SDValue> Ops, MemSDNode *M) const; + + // Handle 8 bit and 16 bit buffer stores + SDValue handleByteShortBufferStores(SelectionDAG &DAG, EVT VDataType, + SDLoc DL, SDValue Ops[], + MemSDNode *M) const; + public: SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI); @@ -219,20 +235,21 @@ public: bool canMergeStoresTo(unsigned AS, EVT MemVT, const SelectionDAG &DAG) const override; - bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, - unsigned Align, - bool *IsFast) const override; + bool allowsMisalignedMemoryAccesses( + EVT VT, unsigned AS, unsigned Align, + MachineMemOperand::Flags Flags = MachineMemOperand::MONone, + bool *IsFast = nullptr) const override; EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, - MachineFunction &MF) const override; + const AttributeList &FuncAttributes) const override; bool isMemOpUniform(const SDNode *N) const; bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const; bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override; - bool isCheapAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override; + bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override; TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override; @@ -298,6 +315,9 @@ public: MachineBasicBlock *splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const; + MachineBasicBlock *emitGWSMemViolTestLoop(MachineInstr &MI, + MachineBasicBlock *BB) const; + MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override; @@ -352,6 +372,9 @@ public: const SelectionDAG &DAG, bool SNaN = false, unsigned Depth = 0) const override; + AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override; + + unsigned getPrefLoopAlignment(MachineLoop *ML) const override; }; } // End namespace llvm diff --git a/lib/Target/AMDGPU/SIInsertSkips.cpp b/lib/Target/AMDGPU/SIInsertSkips.cpp index ba21a5ce1293..87e63fcc4a04 100644 --- a/lib/Target/AMDGPU/SIInsertSkips.cpp +++ b/lib/Target/AMDGPU/SIInsertSkips.cpp @@ -1,9 +1,8 @@ //===-- SIInsertSkips.cpp - Use predicates for control flow ---------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -93,15 +92,13 @@ INITIALIZE_PASS(SIInsertSkips, DEBUG_TYPE, char &llvm::SIInsertSkipsPassID = SIInsertSkips::ID; -static bool opcodeEmitsNoInsts(unsigned Opc) { - switch (Opc) { - case TargetOpcode::IMPLICIT_DEF: - case TargetOpcode::KILL: - case TargetOpcode::BUNDLE: - case TargetOpcode::CFI_INSTRUCTION: - case TargetOpcode::EH_LABEL: - case TargetOpcode::GC_LABEL: - case TargetOpcode::DBG_VALUE: +static bool opcodeEmitsNoInsts(const MachineInstr &MI) { + if (MI.isMetaInstruction()) + return true; + + // Handle target specific opcodes. 
+ switch (MI.getOpcode()) { + case AMDGPU::SI_MASK_BRANCH: return true; default: return false; @@ -110,9 +107,6 @@ static bool opcodeEmitsNoInsts(unsigned Opc) { bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From, const MachineBasicBlock &To) const { - if (From.succ_empty()) - return false; - unsigned NumInstr = 0; const MachineFunction *MF = From.getParent(); @@ -122,7 +116,7 @@ bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From, for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end(); NumInstr < SkipThreshold && I != E; ++I) { - if (opcodeEmitsNoInsts(I->getOpcode())) + if (opcodeEmitsNoInsts(*I)) continue; // FIXME: Since this is required for correctness, this should be inserted @@ -138,6 +132,11 @@ bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From, if (TII->hasUnwantedEffectsWhenEXECEmpty(*I)) return true; + // These instructions are potentially expensive even if EXEC = 0. + if (TII->isSMRD(*I) || TII->isVMEM(*I) || TII->isFLAT(*I) || + I->getOpcode() == AMDGPU::S_WAITCNT) + return true; + ++NumInstr; if (NumInstr >= SkipThreshold) return true; @@ -177,7 +176,7 @@ bool SIInsertSkips::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) { .addImm(0); // en // ... and terminate wavefront. - BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM)); + BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM)).addImm(0); return true; } @@ -245,6 +244,10 @@ void SIInsertSkips::kill(MachineInstr &MI) { llvm_unreachable("invalid ISD:SET cond code"); } + const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>(); + if (ST.hasNoSdstCMPX()) + Opcode = AMDGPU::getVCMPXNoSDstOp(Opcode); + assert(MI.getOperand(0).isReg()); if (TRI->isVGPR(MBB.getParent()->getRegInfo(), @@ -254,17 +257,23 @@ void SIInsertSkips::kill(MachineInstr &MI) { .add(MI.getOperand(1)) .add(MI.getOperand(0)); } else { - BuildMI(MBB, &MI, DL, TII->get(Opcode)) - .addReg(AMDGPU::VCC, RegState::Define) - .addImm(0) // src0 modifiers - .add(MI.getOperand(1)) - .addImm(0) // src1 modifiers - .add(MI.getOperand(0)) - .addImm(0); // omod + auto I = BuildMI(MBB, &MI, DL, TII->get(Opcode)); + if (!ST.hasNoSdstCMPX()) + I.addReg(AMDGPU::VCC, RegState::Define); + + I.addImm(0) // src0 modifiers + .add(MI.getOperand(1)) + .addImm(0) // src1 modifiers + .add(MI.getOperand(0)); + + I.addImm(0); // omod } break; } case AMDGPU::SI_KILL_I1_TERMINATOR: { + const MachineFunction *MF = MI.getParent()->getParent(); + const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); + unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; const MachineOperand &Op = MI.getOperand(0); int64_t KillVal = MI.getOperand(1).getImm(); assert(KillVal == 0 || KillVal == -1); @@ -275,14 +284,17 @@ void SIInsertSkips::kill(MachineInstr &MI) { assert(Imm == 0 || Imm == -1); if (Imm == KillVal) - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC) + BuildMI(MBB, &MI, DL, TII->get(ST.isWave32() ? AMDGPU::S_MOV_B32 + : AMDGPU::S_MOV_B64), Exec) .addImm(0); break; } unsigned Opcode = KillVal ? AMDGPU::S_ANDN2_B64 : AMDGPU::S_AND_B64; - BuildMI(MBB, &MI, DL, TII->get(Opcode), AMDGPU::EXEC) - .addReg(AMDGPU::EXEC) + if (ST.isWave32()) + Opcode = KillVal ? 
AMDGPU::S_ANDN2_B32 : AMDGPU::S_AND_B32; + BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec) + .addReg(Exec) .add(Op); break; } @@ -331,9 +343,11 @@ bool SIInsertSkips::optimizeVccBranch(MachineInstr &MI) const { // S_CBRANCH_EXEC[N]Z bool Changed = false; MachineBasicBlock &MBB = *MI.getParent(); - const unsigned CondReg = AMDGPU::VCC; - const unsigned ExecReg = AMDGPU::EXEC; - const unsigned And = AMDGPU::S_AND_B64; + const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>(); + const bool IsWave32 = ST.isWave32(); + const unsigned CondReg = TRI->getVCC(); + const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC; + const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(), E = MBB.rend(); diff --git a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index afc0b4467610..c89d5b71ec5c 100644 --- a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -1,9 +1,8 @@ //===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -69,10 +68,10 @@ DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE"-forcelgkm", DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE"-forcevm", "Force emit s_waitcnt vmcnt(0) instrs"); -static cl::opt<unsigned> ForceEmitZeroFlag( +static cl::opt<bool> ForceEmitZeroFlag( "amdgpu-waitcnt-forcezero", cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"), - cl::init(0), cl::Hidden); + cl::init(false), cl::Hidden); namespace { @@ -101,7 +100,7 @@ public: #define CNT_MASK(t) (1u << (t)) -enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, NUM_INST_CNTS }; +enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, VS_CNT, NUM_INST_CNTS }; iterator_range<enum_iterator<InstCounterType>> inst_counter_types() { return make_range(enum_iterator<InstCounterType>(VM_CNT), @@ -114,6 +113,7 @@ struct { uint32_t VmcntMax; uint32_t ExpcntMax; uint32_t LgkmcntMax; + uint32_t VscntMax; int32_t NumVGPRsMax; int32_t NumSGPRsMax; } HardwareLimits; @@ -127,6 +127,8 @@ struct { enum WaitEventType { VMEM_ACCESS, // vector-memory read & write + VMEM_READ_ACCESS, // vector-memory read + VMEM_WRITE_ACCESS,// vector-memory write LDS_ACCESS, // lds read & write GDS_ACCESS, // gds read & write SQ_MESSAGE, // send message @@ -140,11 +142,12 @@ enum WaitEventType { }; static const uint32_t WaitEventMaskForInst[NUM_INST_CNTS] = { - (1 << VMEM_ACCESS), + (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS), (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) | (1 << SQ_MESSAGE), (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) | (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS), + (1 << VMEM_WRITE_ACCESS) }; // The mapping is: @@ -172,6 +175,9 @@ void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) { case LGKM_CNT: Wait.LgkmCnt = std::min(Wait.LgkmCnt, Count); break; + case VS_CNT: + Wait.VsCnt = std::min(Wait.VsCnt, Count); + break; default: llvm_unreachable("bad InstCounterType"); } @@ -200,6 +206,8 @@ public: return 
HardwareLimits.LgkmcntMax;
     case EXP_CNT:
       return HardwareLimits.ExpcntMax;
+    case VS_CNT:
+      return HardwareLimits.VscntMax;
     default:
       break;
     }
@@ -222,10 +230,12 @@ public:
 
   // Mapping from event to counter.
   InstCounterType eventCounter(WaitEventType E) {
-    if (E == VMEM_ACCESS)
+    if (WaitEventMaskForInst[VM_CNT] & (1 << E))
       return VM_CNT;
     if (WaitEventMaskForInst[LGKM_CNT] & (1 << E))
       return LGKM_CNT;
+    if (WaitEventMaskForInst[VS_CNT] & (1 << E))
+      return VS_CNT;
     assert(WaitEventMaskForInst[EXP_CNT] & (1 << E));
     return EXP_CNT;
   }
@@ -453,7 +463,7 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
                                             unsigned OpNo, bool Def) const {
   const MachineOperand &Op = MI->getOperand(OpNo);
   if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()) ||
-      (Def && !Op.isDef()))
+      (Def && !Op.isDef()) || TRI->isAGPR(*MRI, Op.getReg()))
     return {-1, -1};
 
   // A use via a PW operand does not need a waitcnt.
@@ -526,20 +536,22 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
     // Put score on the source vgprs. If this is a store, just use those
     // specific register(s).
     if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
+      int AddrOpIdx =
+          AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr);
       // All GDS operations must protect their address register (same as
       // export.)
-      if (Inst.getOpcode() != AMDGPU::DS_APPEND &&
-          Inst.getOpcode() != AMDGPU::DS_CONSUME) {
-        setExpScore(
-            &Inst, TII, TRI, MRI,
-            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr),
-            CurrScore);
+      if (AddrOpIdx != -1) {
+        setExpScore(&Inst, TII, TRI, MRI, AddrOpIdx, CurrScore);
       }
+
       if (Inst.mayStore()) {
-        setExpScore(
-            &Inst, TII, TRI, MRI,
-            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
-            CurrScore);
+        if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
+                                       AMDGPU::OpName::data0) != -1) {
+          setExpScore(
+              &Inst, TII, TRI, MRI,
+              AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
+              CurrScore);
+        }
         if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
                                        AMDGPU::OpName::data1) != -1) {
           setExpScore(&Inst, TII, TRI, MRI,
@@ -663,6 +675,9 @@ void WaitcntBrackets::print(raw_ostream &OS) {
       case EXP_CNT:
         OS << " EXP_CNT(" << UB - LB << "): ";
         break;
+      case VS_CNT:
+        OS << " VS_CNT(" << UB - LB << "): ";
+        break;
       default:
         OS << " UNKNOWN(" << UB - LB << "): ";
         break;
@@ -702,7 +717,8 @@ void WaitcntBrackets::print(raw_ostream &OS) {
 bool WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
   return simplifyWaitcnt(VM_CNT, Wait.VmCnt) |
          simplifyWaitcnt(EXP_CNT, Wait.ExpCnt) |
-         simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
+         simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt) |
+         simplifyWaitcnt(VS_CNT, Wait.VsCnt);
 }
 
 bool WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
@@ -745,6 +761,7 @@ void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
   applyWaitcnt(VM_CNT, Wait.VmCnt);
   applyWaitcnt(EXP_CNT, Wait.ExpCnt);
   applyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
+  applyWaitcnt(VS_CNT, Wait.VsCnt);
 }
 
 void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
@@ -790,6 +807,21 @@ static bool readsVCCZ(const MachineInstr &MI) {
          !MI.getOperand(1).isUndef();
 }
 
+/// \returns true if the callee inserts an s_waitcnt 0 on function entry.
+static bool callWaitsOnFunctionEntry(const MachineInstr &MI) {
+  // Currently all conventions wait, but this may not always be the case.
+  //
+  // TODO: If IPRA is enabled, and the callee is isSafeForNoCSROpt, it may make
+  // sense to omit the wait and do it in the caller.
+ return true; +} + +/// \returns true if the callee is expected to wait for any outstanding waits +/// before returning. +static bool callWaitsOnFunctionReturn(const MachineInstr &MI) { + return true; +} + /// Generate s_waitcnt instruction to be placed before cur_Inst. /// Instructions of a given type are returned in order, /// but instructions of different types can complete out of order. @@ -815,7 +847,9 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore( // TODO: Handle other cases of NeedsWaitcntVmBefore() if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 || MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC || - MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) { + MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL || + MI.getOpcode() == AMDGPU::BUFFER_GL0_INV || + MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) { Wait.VmCnt = 0; } @@ -823,8 +857,9 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore( // NOTE: this could be improved with knowledge of all call sites or // with knowledge of the called routines. if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG || - MI.getOpcode() == AMDGPU::S_SETPC_B64_return) { - Wait = AMDGPU::Waitcnt::allZero(); + MI.getOpcode() == AMDGPU::S_SETPC_B64_return || + (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) { + Wait = Wait.combined(AMDGPU::Waitcnt::allZero(IV)); } // Resolve vm waits before gs-done. else if ((MI.getOpcode() == AMDGPU::S_SENDMSG || @@ -903,91 +938,91 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore( } } -#if 0 // TODO: the following code to handle CALL. - // The argument passing for CALLs should suffice for VM_CNT and LGKM_CNT. - // However, there is a problem with EXP_CNT, because the call cannot - // easily tell if a register is used in the function, and if it did, then - // the referring instruction would have to have an S_WAITCNT, which is - // dependent on all call sites. So Instead, force S_WAITCNT for EXP_CNTs - // before the call. - if (MI.getOpcode() == SC_CALL) { - if (ScoreBrackets->getScoreUB(EXP_CNT) > - ScoreBrackets->getScoreLB(EXP_CNT)) { - ScoreBrackets->setScoreLB(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT)); - EmitWaitcnt |= CNT_MASK(EXP_CNT); - } - } -#endif - - // FIXME: Should not be relying on memoperands. - // Look at the source operands of every instruction to see if - // any of them results from a previous memory operation that affects - // its current usage. If so, an s_waitcnt instruction needs to be - // emitted. - // If the source operand was defined by a load, add the s_waitcnt - // instruction. - for (const MachineMemOperand *Memop : MI.memoperands()) { - unsigned AS = Memop->getAddrSpace(); - if (AS != AMDGPUAS::LOCAL_ADDRESS) - continue; - unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS; - // VM_CNT is only relevant to vgpr or LDS. - ScoreBrackets.determineWait( - VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait); - } + if (MI.isCall() && callWaitsOnFunctionEntry(MI)) { + // Don't bother waiting on anything except the call address. The function + // is going to insert a wait on everything in its prolog. This still needs + // to be careful if the call target is a load (e.g. a GOT load). 
+ Wait = AMDGPU::Waitcnt(); - for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { - const MachineOperand &Op = MI.getOperand(I); - const MachineRegisterInfo &MRIA = *MRI; - RegInterval Interval = - ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, false); + int CallAddrOpIdx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0); + RegInterval Interval = ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, + CallAddrOpIdx, false); for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { - if (TRI->isVGPR(MRIA, Op.getReg())) { - // VM_CNT is only relevant to vgpr or LDS. - ScoreBrackets.determineWait( - VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait); - } ScoreBrackets.determineWait( LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait); } - } - // End of for loop that looks at all source operands to decide vm_wait_cnt - // and lgk_wait_cnt. - - // Two cases are handled for destination operands: - // 1) If the destination operand was defined by a load, add the s_waitcnt - // instruction to guarantee the right WAW order. - // 2) If a destination operand that was used by a recent export/store ins, - // add s_waitcnt on exp_cnt to guarantee the WAR order. - if (MI.mayStore()) { + } else { // FIXME: Should not be relying on memoperands. + // Look at the source operands of every instruction to see if + // any of them results from a previous memory operation that affects + // its current usage. If so, an s_waitcnt instruction needs to be + // emitted. + // If the source operand was defined by a load, add the s_waitcnt + // instruction. for (const MachineMemOperand *Memop : MI.memoperands()) { unsigned AS = Memop->getAddrSpace(); if (AS != AMDGPUAS::LOCAL_ADDRESS) continue; unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS; + // VM_CNT is only relevant to vgpr or LDS. ScoreBrackets.determineWait( VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait); - ScoreBrackets.determineWait( - EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait); } - } - for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { - MachineOperand &Def = MI.getOperand(I); - const MachineRegisterInfo &MRIA = *MRI; - RegInterval Interval = - ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, true); - for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { - if (TRI->isVGPR(MRIA, Def.getReg())) { + + for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { + const MachineOperand &Op = MI.getOperand(I); + const MachineRegisterInfo &MRIA = *MRI; + RegInterval Interval = + ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, false); + for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { + if (TRI->isVGPR(MRIA, Op.getReg())) { + // VM_CNT is only relevant to vgpr or LDS. + ScoreBrackets.determineWait( + VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait); + } + ScoreBrackets.determineWait( + LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait); + } + } + // End of for loop that looks at all source operands to decide vm_wait_cnt + // and lgk_wait_cnt. + + // Two cases are handled for destination operands: + // 1) If the destination operand was defined by a load, add the s_waitcnt + // instruction to guarantee the right WAW order. + // 2) If a destination operand that was used by a recent export/store ins, + // add s_waitcnt on exp_cnt to guarantee the WAR order. + if (MI.mayStore()) { + // FIXME: Should not be relying on memoperands. 
+ for (const MachineMemOperand *Memop : MI.memoperands()) { + unsigned AS = Memop->getAddrSpace(); + if (AS != AMDGPUAS::LOCAL_ADDRESS) + continue; + unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS; ScoreBrackets.determineWait( VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait); ScoreBrackets.determineWait( EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait); } - ScoreBrackets.determineWait( - LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait); } - } // End of for loop that looks at all dest operands. + for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { + MachineOperand &Def = MI.getOperand(I); + const MachineRegisterInfo &MRIA = *MRI; + RegInterval Interval = + ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, true); + for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { + if (TRI->isVGPR(MRIA, Def.getReg())) { + ScoreBrackets.determineWait( + VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait); + ScoreBrackets.determineWait( + EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait); + } + ScoreBrackets.determineWait( + LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait); + } + } // End of for loop that looks at all dest operands. + } } // Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0 @@ -996,13 +1031,13 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore( // requiring a WAITCNT beforehand. if (MI.getOpcode() == AMDGPU::S_BARRIER && !ST->hasAutoWaitcntBeforeBarrier()) { - Wait = AMDGPU::Waitcnt::allZero(); + Wait = Wait.combined(AMDGPU::Waitcnt::allZero(IV)); } // TODO: Remove this work-around, enable the assert for Bug 457939 // after fixing the scheduler. Also, the Shader Compiler code is // independent of target. - if (readsVCCZ(MI) && ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS) { + if (readsVCCZ(MI) && ST->hasReadVCCZBug()) { if (ScoreBrackets.getScoreLB(LGKM_CNT) < ScoreBrackets.getScoreUB(LGKM_CNT) && ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) { @@ -1014,21 +1049,31 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore( if (!ScoreBrackets.simplifyWaitcnt(Wait) && !IsForceEmitWaitcnt) { bool Modified = false; if (OldWaitcntInstr) { - if (TrackedWaitcntSet.count(OldWaitcntInstr)) { - TrackedWaitcntSet.erase(OldWaitcntInstr); - OldWaitcntInstr->eraseFromParent(); - Modified = true; - } else { - int64_t Imm = OldWaitcntInstr->getOperand(0).getImm(); - ScoreBrackets.applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm)); + for (auto II = OldWaitcntInstr->getIterator(), NextI = std::next(II); + &*II != &MI; II = NextI, ++NextI) { + if (II->isDebugInstr()) + continue; + + if (TrackedWaitcntSet.count(&*II)) { + TrackedWaitcntSet.erase(&*II); + II->eraseFromParent(); + Modified = true; + } else if (II->getOpcode() == AMDGPU::S_WAITCNT) { + int64_t Imm = II->getOperand(0).getImm(); + ScoreBrackets.applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm)); + } else { + assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT); + assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL); + ScoreBrackets.applyWaitcnt( + AMDGPU::Waitcnt(0, 0, 0, II->getOperand(1).getImm())); + } } - Modified = true; } return Modified; } if (ForceEmitZeroWaitcnts) - Wait = AMDGPU::Waitcnt::allZero(); + Wait = AMDGPU::Waitcnt::allZero(IV); if (ForceEmitWaitcnt[VM_CNT]) Wait.VmCnt = 0; @@ -1036,39 +1081,88 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore( Wait.ExpCnt = 0; if (ForceEmitWaitcnt[LGKM_CNT]) Wait.LgkmCnt = 0; + if (ForceEmitWaitcnt[VS_CNT]) + Wait.VsCnt = 0; ScoreBrackets.applyWaitcnt(Wait); AMDGPU::Waitcnt OldWait; + bool Modified = 
false; + if (OldWaitcntInstr) { - OldWait = - AMDGPU::decodeWaitcnt(IV, OldWaitcntInstr->getOperand(0).getImm()); - } - if (OldWait.dominates(Wait)) - return false; + for (auto II = OldWaitcntInstr->getIterator(), NextI = std::next(II); + &*II != &MI; II = NextI, NextI++) { + if (II->isDebugInstr()) + continue; - if (OldWaitcntInstr && !TrackedWaitcntSet.count(OldWaitcntInstr)) - Wait = Wait.combined(OldWait); + if (II->getOpcode() == AMDGPU::S_WAITCNT) { + unsigned IEnc = II->getOperand(0).getImm(); + AMDGPU::Waitcnt IWait = AMDGPU::decodeWaitcnt(IV, IEnc); + OldWait = OldWait.combined(IWait); + if (!TrackedWaitcntSet.count(&*II)) + Wait = Wait.combined(IWait); + unsigned NewEnc = AMDGPU::encodeWaitcnt(IV, Wait); + if (IEnc != NewEnc) { + II->getOperand(0).setImm(NewEnc); + Modified = true; + } + Wait.VmCnt = ~0u; + Wait.LgkmCnt = ~0u; + Wait.ExpCnt = ~0u; + } else { + assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT); + assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL); + + unsigned ICnt = II->getOperand(1).getImm(); + OldWait.VsCnt = std::min(OldWait.VsCnt, ICnt); + if (!TrackedWaitcntSet.count(&*II)) + Wait.VsCnt = std::min(Wait.VsCnt, ICnt); + if (Wait.VsCnt != ICnt) { + II->getOperand(1).setImm(Wait.VsCnt); + Modified = true; + } + Wait.VsCnt = ~0u; + } - unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); - if (OldWaitcntInstr) { - OldWaitcntInstr->getOperand(0).setImm(Enc); + LLVM_DEBUG(dbgs() << "updateWaitcntInBlock\n" + << "Old Instr: " << MI << '\n' + << "New Instr: " << *II << '\n'); - LLVM_DEBUG(dbgs() << "updateWaitcntInBlock\n" - << "Old Instr: " << MI << '\n' - << "New Instr: " << *OldWaitcntInstr << '\n'); - } else { + if (!Wait.hasWait()) + return Modified; + } + } + + if (Wait.VmCnt != ~0u || Wait.LgkmCnt != ~0u || Wait.ExpCnt != ~0u) { + unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT)) .addImm(Enc); TrackedWaitcntSet.insert(SWaitInst); + Modified = true; LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n" << "Old Instr: " << MI << '\n' << "New Instr: " << *SWaitInst << '\n'); } - return true; + if (Wait.VsCnt != ~0u) { + assert(ST->hasVscnt()); + + auto SWaitInst = + BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(), + TII->get(AMDGPU::S_WAITCNT_VSCNT)) + .addReg(AMDGPU::SGPR_NULL, RegState::Undef) + .addImm(Wait.VsCnt); + TrackedWaitcntSet.insert(SWaitInst); + Modified = true; + + LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n" + << "Old Instr: " << MI << '\n' + << "New Instr: " << *SWaitInst << '\n'); + } + + return Modified; } // This is a flat memory operation. Check to see if it has memory @@ -1093,7 +1187,8 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, // bracket and the destination operand scores. // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere. 
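// ---------------------------------------------------------------------------
// An aside on the merge loop just above: a newly required wait is folded into
// any pre-existing s_waitcnt by taking the per-counter minimum (the stricter
// count), and ~0u encodes "no wait required". The standalone mock below is
// hypothetical (not the LLVM AMDGPU::Waitcnt type); the struct and field
// names merely mirror the diff.

#include <algorithm>
#include <cassert>

struct WaitcntSketch {
  unsigned VmCnt = ~0u, ExpCnt = ~0u, LgkmCnt = ~0u, VsCnt = ~0u;

  // Combining two waits keeps the smaller (stricter) count per counter.
  WaitcntSketch combined(const WaitcntSketch &O) const {
    return {std::min(VmCnt, O.VmCnt), std::min(ExpCnt, O.ExpCnt),
            std::min(LgkmCnt, O.LgkmCnt), std::min(VsCnt, O.VsCnt)};
  }
  bool hasWait() const {
    return VmCnt != ~0u || ExpCnt != ~0u || LgkmCnt != ~0u || VsCnt != ~0u;
  }
};

int main() {
  WaitcntSketch Existing; // models an s_waitcnt already present in the block
  Existing.VmCnt = 3;
  WaitcntSketch Needed;   // models what the score brackets now require
  Needed.VmCnt = 1;
  Needed.LgkmCnt = 0;

  WaitcntSketch Merged = Existing.combined(Needed);
  assert(Merged.VmCnt == 1 && Merged.LgkmCnt == 0 && Merged.hasWait());
  return 0;
}

// (end of illustrative sketch)
// ---------------------------------------------------------------------------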
   if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
-    if (TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
+    if (TII->isAlwaysGDS(Inst.getOpcode()) ||
+        TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
       ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
       ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
     } else {
@@ -1102,8 +1197,15 @@
   } else if (TII->isFLAT(Inst)) {
     assert(Inst.mayLoad() || Inst.mayStore());
 
-    if (TII->usesVM_CNT(Inst))
-      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
+    if (TII->usesVM_CNT(Inst)) {
+      if (!ST->hasVscnt())
+        ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
+      else if (Inst.mayLoad() &&
+               AMDGPU::getAtomicRetOp(Inst.getOpcode()) == -1)
+        ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_READ_ACCESS, Inst);
+      else
+        ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_WRITE_ACCESS, Inst);
+    }
 
     if (TII->usesLGKM_CNT(Inst)) {
       ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
@@ -1118,14 +1220,33 @@
       // TODO: get a better carve out.
       Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1 &&
       Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_SC &&
-      Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL) {
-    ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
+      Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL &&
+      Inst.getOpcode() != AMDGPU::BUFFER_GL0_INV &&
+      Inst.getOpcode() != AMDGPU::BUFFER_GL1_INV) {
+    if (!ST->hasVscnt())
+      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
+    else if ((Inst.mayLoad() &&
+              AMDGPU::getAtomicRetOp(Inst.getOpcode()) == -1) ||
+             /* IMAGE_GET_RESINFO / IMAGE_GET_LOD */
+             (TII->isMIMG(Inst) && !Inst.mayLoad() && !Inst.mayStore()))
+      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_READ_ACCESS, Inst);
+    else if (Inst.mayStore())
+      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_WRITE_ACCESS, Inst);
+
     if (ST->vmemWriteNeedsExpWaitcnt() &&
         (Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1)) {
       ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
     }
   } else if (TII->isSMRD(Inst)) {
     ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
+  } else if (Inst.isCall()) {
+    if (callWaitsOnFunctionReturn(Inst)) {
+      // Act as a wait on everything
+      ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt::allZero(IV));
+    } else {
+      // We may need to wait for anything.
+      ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
+    }
   } else {
     switch (Inst.getOpcode()) {
     case AMDGPU::S_SENDMSG:
@@ -1236,31 +1357,18 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
 
   // Walk over the instructions.
   MachineInstr *OldWaitcntInstr = nullptr;
 
-  for (MachineBasicBlock::iterator Iter = Block.begin(), E = Block.end();
+  for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
+                                         E = Block.instr_end();
        Iter != E;) {
     MachineInstr &Inst = *Iter;
 
-    // Remove any previously existing waitcnts.
-    if (Inst.getOpcode() == AMDGPU::S_WAITCNT) {
-      if (OldWaitcntInstr) {
-        if (TrackedWaitcntSet.count(OldWaitcntInstr)) {
-          TrackedWaitcntSet.erase(OldWaitcntInstr);
-          OldWaitcntInstr->eraseFromParent();
-          OldWaitcntInstr = nullptr;
-        } else if (!TrackedWaitcntSet.count(&Inst)) {
-          // Two successive s_waitcnt's, both of which are pre-existing and
-          // are therefore preserved.
- int64_t Imm = OldWaitcntInstr->getOperand(0).getImm(); - ScoreBrackets.applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm)); - } else { - ++Iter; - Inst.eraseFromParent(); - Modified = true; - continue; - } - } - - OldWaitcntInstr = &Inst; + // Track pre-existing waitcnts from earlier iterations. + if (Inst.getOpcode() == AMDGPU::S_WAITCNT || + (Inst.getOpcode() == AMDGPU::S_WAITCNT_VSCNT && + Inst.getOperand(0).isReg() && + Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL)) { + if (!OldWaitcntInstr) + OldWaitcntInstr = &Inst; ++Iter; continue; } @@ -1299,27 +1407,16 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, ScoreBrackets.dump(); }); - // Check to see if this is a GWS instruction. If so, and if this is CI or - // VI, then the generated code sequence will include an S_WAITCNT 0. - // TODO: Are these the only GWS instructions? - if (Inst.getOpcode() == AMDGPU::DS_GWS_INIT || - Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_V || - Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_BR || - Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_P || - Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER) { - // TODO: && context->target_info->GwsRequiresMemViolTest() ) { - ScoreBrackets.applyWaitcnt(AMDGPU::Waitcnt::allZero()); - } - // TODO: Remove this work-around after fixing the scheduler and enable the // assert above. if (VCCZBugWorkAround) { // Restore the vccz bit. Any time a value is written to vcc, the vcc // bit is updated, so we can restore the bit by reading the value of // vcc and then writing it back to the register. - BuildMI(Block, Inst, Inst.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64), - AMDGPU::VCC) - .addReg(AMDGPU::VCC); + BuildMI(Block, Inst, Inst.getDebugLoc(), + TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64), + TRI->getVCC()) + .addReg(TRI->getVCC()); VCCZBugHandledSet.insert(&Inst); Modified = true; } @@ -1345,6 +1442,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV); HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV); HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV); + HardwareLimits.VscntMax = ST->hasVscnt() ? 63 : 0; HardwareLimits.NumVGPRsMax = ST->getAddressableNumVGPRs(); HardwareLimits.NumSGPRsMax = ST->getAddressableNumSGPRs(); @@ -1480,6 +1578,11 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { // TODO: Could insert earlier and schedule more liberally with operations // that only use caller preserved registers. MachineBasicBlock &EntryBB = MF.front(); + if (ST->hasVscnt()) + BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), + TII->get(AMDGPU::S_WAITCNT_VSCNT)) + .addReg(AMDGPU::SGPR_NULL, RegState::Undef) + .addImm(0); BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT)) .addImm(0); diff --git a/lib/Target/AMDGPU/SIInstrFormats.td b/lib/Target/AMDGPU/SIInstrFormats.td index 65ffc27b8b60..561a16c3e351 100644 --- a/lib/Target/AMDGPU/SIInstrFormats.td +++ b/lib/Target/AMDGPU/SIInstrFormats.td @@ -1,9 +1,8 @@ //===-- SIInstrFormats.td - SI Instruction Encodings ----------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -11,19 +10,9 @@ // //===----------------------------------------------------------------------===// -def isGCN : Predicate<"Subtarget->getGeneration() " - ">= AMDGPUSubtarget::SOUTHERN_ISLANDS">, - AssemblerPredicate<"FeatureGCN">; -def isSI : Predicate<"Subtarget->getGeneration() " - "== AMDGPUSubtarget::SOUTHERN_ISLANDS">, - AssemblerPredicate<"FeatureSouthernIslands">; - - class InstSI <dag outs, dag ins, string asm = "", list<dag> pattern = []> : AMDGPUInst<outs, ins, asm, pattern>, GCNPredicateControl { - let SubtargetPredicate = isGCN; - // Low bits - basic encoding information. field bit SALU = 0; field bit VALU = 0; @@ -121,10 +110,20 @@ class InstSI <dag outs, dag ins, string asm = "", // This bit indicates that this is a D16 buffer instruction. field bit D16Buf = 0; + // This field indicates that FLAT instruction accesses FLAT_GLBL or + // FLAT_SCRATCH segment. Must be 0 for non-FLAT instructions. + field bit IsNonFlatSeg = 0; + // This bit indicates that this uses the floating point double precision // rounding mode flags field bit FPDPRounding = 0; + // Instruction is FP atomic. + field bit FPAtomic = 0; + + // This bit indicates that this is one of MFMA instructions. + field bit IsMAI = 0; + // These need to be kept in sync with the enum in SIInstrFlags. let TSFlags{0} = SALU; let TSFlags{1} = VALU; @@ -182,7 +181,13 @@ class InstSI <dag outs, dag ins, string asm = "", let TSFlags{50} = D16Buf; - let TSFlags{51} = FPDPRounding; + let TSFlags{51} = IsNonFlatSeg; + + let TSFlags{52} = FPDPRounding; + + let TSFlags{53} = FPAtomic; + + let TSFlags{54} = IsMAI; let SchedRW = [Write32Bit]; @@ -251,38 +256,59 @@ class VINTRPe <bits<2> op> : Enc32 { let Inst{31-26} = 0x32; // encoding } -class MIMGe <bits<7> op> : Enc64 { +class MIMGe : Enc64 { bits<8> vdata; bits<4> dmask; bits<1> unorm; bits<1> glc; - bits<1> da; bits<1> r128; bits<1> tfe; bits<1> lwe; bits<1> slc; bit d16; - bits<8> vaddr; bits<7> srsrc; bits<7> ssamp; let Inst{11-8} = dmask; let Inst{12} = unorm; let Inst{13} = glc; - let Inst{14} = da; let Inst{15} = r128; let Inst{16} = tfe; let Inst{17} = lwe; - let Inst{24-18} = op; let Inst{25} = slc; let Inst{31-26} = 0x3c; - let Inst{39-32} = vaddr; let Inst{47-40} = vdata; let Inst{52-48} = srsrc{6-2}; let Inst{57-53} = ssamp{6-2}; let Inst{63} = d16; } +class MIMGe_gfx6789 <bits<8> op> : MIMGe { + bits<8> vaddr; + bits<1> da; + + let Inst{0} = op{7}; + let Inst{14} = da; + let Inst{24-18} = op{6-0}; + let Inst{39-32} = vaddr; +} + +class MIMGe_gfx10 <bits<8> op> : MIMGe { + bits<8> vaddr0; + bits<3> dim; + bits<2> nsa; + bits<1> dlc; + bits<1> a16 = 0; // TODO: this should be an operand + + let Inst{0} = op{7}; + let Inst{2-1} = nsa; + let Inst{5-3} = dim; + let Inst{7} = dlc; + let Inst{24-18} = op{6-0}; + let Inst{39-32} = vaddr0; + let Inst{62} = a16; +} + class EXPe : Enc64 { bits<4> en; bits<6> tgt; diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp index 2370d5fa7b27..ba8ed6993a56 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1,9 +1,8 @@ //===- SIInstrInfo.cpp - SI Instruction Information ----------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -14,7 +13,6 @@ #include "SIInstrInfo.h" #include "AMDGPU.h" -#include "AMDGPUIntrinsicInfo.h" #include "AMDGPUSubtarget.h" #include "GCNHazardRecognizer.h" #include "SIDefines.h" @@ -100,12 +98,6 @@ static unsigned getNumOperandsNoGlue(SDNode *Node) { return N; } -static SDValue findChainOperand(SDNode *Load) { - SDValue LastOp = Load->getOperand(getNumOperandsNoGlue(Load) - 1); - assert(LastOp.getValueType() == MVT::Other && "Chain missing from load node"); - return LastOp; -} - /// Returns true if both nodes have the same value for the given /// operand \p Op, or if both nodes do not have this operand. static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) { @@ -142,7 +134,8 @@ bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, case AMDGPU::V_MOV_B32_e32: case AMDGPU::V_MOV_B32_e64: case AMDGPU::V_MOV_B64_PSEUDO: - return true; + // No implicit operands. + return MI.getNumOperands() == MI.getDesc().getNumOperands(); default: return false; } @@ -168,22 +161,25 @@ bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, return false; // Check base reg. - if (Load0->getOperand(1) != Load1->getOperand(1)) - return false; - - // Check chain. - if (findChainOperand(Load0) != findChainOperand(Load1)) + if (Load0->getOperand(0) != Load1->getOperand(0)) return false; // Skip read2 / write2 variants for simplicity. // TODO: We should report true if the used offsets are adjacent (excluded // st64 versions). - if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::data1) != -1 || - AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::data1) != -1) + int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset); + int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset); + if (Offset0Idx == -1 || Offset1Idx == -1) return false; - Offset0 = cast<ConstantSDNode>(Load0->getOperand(2))->getZExtValue(); - Offset1 = cast<ConstantSDNode>(Load1->getOperand(2))->getZExtValue(); + // XXX - be careful of datalesss loads + // getNamedOperandIdx returns the index for MachineInstrs. Since they + // include the output in the operand list, but SDNodes don't, we need to + // subtract the index by one. + Offset0Idx -= get(Opc0).NumDefs; + Offset1Idx -= get(Opc1).NumDefs; + Offset0 = cast<ConstantSDNode>(Load0->getOperand(Offset0Idx))->getZExtValue(); + Offset1 = cast<ConstantSDNode>(Load1->getOperand(Offset1Idx))->getZExtValue(); return true; } @@ -207,10 +203,6 @@ bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, if (!Load0Offset || !Load1Offset) return false; - // Check chain. - if (findChainOperand(Load0) != findChainOperand(Load1)) - return false; - Offset0 = Load0Offset->getZExtValue(); Offset1 = Load1Offset->getZExtValue(); return true; @@ -221,7 +213,6 @@ bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, // MUBUF and MTBUF have vaddr at different indices. 
if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) || - findChainOperand(Load0) != findChainOperand(Load1) || !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) || !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc)) return false; @@ -233,10 +224,10 @@ bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, return false; // getNamedOperandIdx returns the index for MachineInstrs. Since they - // inlcude the output in the operand list, but SDNodes don't, we need to + // include the output in the operand list, but SDNodes don't, we need to // subtract the index by one. - --OffIdx0; - --OffIdx1; + OffIdx0 -= get(Opc0).NumDefs; + OffIdx1 -= get(Opc1).NumDefs; SDValue Off0 = Load0->getOperand(OffIdx0); SDValue Off1 = Load1->getOperand(OffIdx1); @@ -265,8 +256,8 @@ static bool isStride64(unsigned Opc) { } } -bool SIInstrInfo::getMemOperandWithOffset(MachineInstr &LdSt, - MachineOperand *&BaseOp, +bool SIInstrInfo::getMemOperandWithOffset(const MachineInstr &LdSt, + const MachineOperand *&BaseOp, int64_t &Offset, const TargetRegisterInfo *TRI) const { unsigned Opc = LdSt.getOpcode(); @@ -277,6 +268,11 @@ bool SIInstrInfo::getMemOperandWithOffset(MachineInstr &LdSt, if (OffsetImm) { // Normal, single offset LDS instruction. BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr); + // TODO: ds_consume/ds_append use M0 for the base address. Is it safe to + // report that here? + if (!BaseOp) + return false; + Offset = OffsetImm->getImm(); assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base " "operands of type register."); @@ -325,7 +321,7 @@ bool SIInstrInfo::getMemOperandWithOffset(MachineInstr &LdSt, if (SOffset && SOffset->isReg()) return false; - MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::vaddr); + const MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::vaddr); if (!AddrReg) return false; @@ -348,7 +344,7 @@ bool SIInstrInfo::getMemOperandWithOffset(MachineInstr &LdSt, if (!OffsetImm) return false; - MachineOperand *SBaseReg = getNamedOperand(LdSt, AMDGPU::OpName::sbase); + const MachineOperand *SBaseReg = getNamedOperand(LdSt, AMDGPU::OpName::sbase); BaseOp = SBaseReg; Offset = OffsetImm->getImm(); assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base " @@ -357,7 +353,7 @@ bool SIInstrInfo::getMemOperandWithOffset(MachineInstr &LdSt, } if (isFLAT(LdSt)) { - MachineOperand *VAddr = getNamedOperand(LdSt, AMDGPU::OpName::vaddr); + const MachineOperand *VAddr = getNamedOperand(LdSt, AMDGPU::OpName::vaddr); if (VAddr) { // Can't analyze 2 offsets. 
if (getNamedOperand(LdSt, AMDGPU::OpName::saddr)) @@ -413,11 +409,11 @@ static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, return Base1 == Base2; } -bool SIInstrInfo::shouldClusterMemOps(MachineOperand &BaseOp1, - MachineOperand &BaseOp2, +bool SIInstrInfo::shouldClusterMemOps(const MachineOperand &BaseOp1, + const MachineOperand &BaseOp2, unsigned NumLoads) const { - MachineInstr &FirstLdSt = *BaseOp1.getParent(); - MachineInstr &SecondLdSt = *BaseOp2.getParent(); + const MachineInstr &FirstLdSt = *BaseOp1.getParent(); + const MachineInstr &SecondLdSt = *BaseOp2.getParent(); if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOp1, SecondLdSt, BaseOp2)) return false; @@ -461,7 +457,12 @@ bool SIInstrInfo::shouldClusterMemOps(MachineOperand &BaseOp1, const MachineRegisterInfo &MRI = FirstLdSt.getParent()->getParent()->getRegInfo(); - const TargetRegisterClass *DstRC = MRI.getRegClass(FirstDst->getReg()); + + const unsigned Reg = FirstDst->getReg(); + + const TargetRegisterClass *DstRC = TargetRegisterInfo::isVirtualRegister(Reg) + ? MRI.getRegClass(Reg) + : RI.getPhysRegClass(Reg); return (NumLoads * (RI.getRegSizeInBits(*DstRC) / 8)) <= LoadClusterThreshold; } @@ -511,8 +512,11 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, if (RC == &AMDGPU::VGPR_32RegClass) { assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) || - AMDGPU::SReg_32RegClass.contains(SrcReg)); - BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg) + AMDGPU::SReg_32RegClass.contains(SrcReg) || + AMDGPU::AGPR_32RegClass.contains(SrcReg)); + unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ? + AMDGPU::V_ACCVGPR_READ_B32 : AMDGPU::V_MOV_B32_e32; + BuildMI(MBB, MI, DL, get(Opc), DestReg) .addReg(SrcReg, getKillRegState(KillSrc)); return; } @@ -526,6 +530,21 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, return; } + if (DestReg == AMDGPU::VCC_LO) { + if (AMDGPU::SReg_32RegClass.contains(SrcReg)) { + BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::VCC_LO) + .addReg(SrcReg, getKillRegState(KillSrc)); + } else { + // FIXME: Hack until VReg_1 removed. + assert(AMDGPU::VGPR_32RegClass.contains(SrcReg)); + BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32)) + .addImm(0) + .addReg(SrcReg, getKillRegState(KillSrc)); + } + + return; + } + if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) { reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); return; @@ -570,10 +589,83 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, return; } + if (RC == &AMDGPU::AGPR_32RegClass) { + assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) || + AMDGPU::SReg_32RegClass.contains(SrcReg) || + AMDGPU::AGPR_32RegClass.contains(SrcReg)); + if (!AMDGPU::VGPR_32RegClass.contains(SrcReg)) { + // First try to find defining accvgpr_write to avoid temporary registers. + for (auto Def = MI, E = MBB.begin(); Def != E; ) { + --Def; + if (!Def->definesRegister(SrcReg, &RI)) + continue; + if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32) + break; + + MachineOperand &DefOp = Def->getOperand(1); + assert(DefOp.isReg() || DefOp.isImm()); + + if (DefOp.isReg()) { + // Check that register source operand if not clobbered before MI. + // Immediate operands are always safe to propagate. 
+ bool SafeToPropagate = true; + for (auto I = Def; I != MI && SafeToPropagate; ++I) + if (I->modifiesRegister(DefOp.getReg(), &RI)) + SafeToPropagate = false; + + if (!SafeToPropagate) + break; + + DefOp.setIsKill(false); + } + + BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32), DestReg) + .add(DefOp); + return; + } + + RegScavenger RS; + RS.enterBasicBlock(MBB); + RS.forward(MI); + + // Ideally we want to have three registers for a long reg_sequence copy + // to hide 2 waitstates between v_mov_b32 and accvgpr_write. + unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass, + *MBB.getParent()); + + // Registers in the sequence are allocated contiguously so we can just + // use register number to pick one of three round-robin temps. + unsigned RegNo = DestReg % 3; + unsigned Tmp = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0); + if (!Tmp) + report_fatal_error("Cannot scavenge VGPR to copy to AGPR"); + RS.setRegUsed(Tmp); + // Only loop through if there are any free registers left, otherwise + // scavenger may report a fatal error without emergency spill slot + // or spill with the slot. + while (RegNo-- && RS.FindUnusedReg(&AMDGPU::VGPR_32RegClass)) { + unsigned Tmp2 = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0); + if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs) + break; + Tmp = Tmp2; + RS.setRegUsed(Tmp); + } + copyPhysReg(MBB, MI, DL, Tmp, SrcReg, KillSrc); + BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32), DestReg) + .addReg(Tmp, RegState::Kill); + return; + } + + BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + return; + } + unsigned EltSize = 4; unsigned Opcode = AMDGPU::V_MOV_B32_e32; if (RI.isSGPRClass(RC)) { - if (RI.getRegSizeInBits(*RC) > 32) { + // TODO: Copy vec3/vec5 with s_mov_b64s then final s_mov_b32. + if (!(RI.getRegSizeInBits(*RC) % 64)) { Opcode = AMDGPU::S_MOV_B64; EltSize = 8; } else { @@ -585,6 +677,11 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); return; } + } else if (RI.hasAGPRs(RC)) { + Opcode = RI.hasVGPRs(RI.getPhysRegClass(SrcReg)) ? 
+ AMDGPU::V_ACCVGPR_WRITE_B32 : AMDGPU::COPY; + } else if (RI.hasVGPRs(RC) && RI.hasAGPRs(RI.getPhysRegClass(SrcReg))) { + Opcode = AMDGPU::V_ACCVGPR_READ_B32; } ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize); @@ -597,6 +694,12 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, else SubIdx = SubIndices[SubIndices.size() - Idx - 1]; + if (Opcode == TargetOpcode::COPY) { + copyPhysReg(MBB, MI, DL, RI.getSubReg(DestReg, SubIdx), + RI.getSubReg(SrcReg, SubIdx), KillSrc); + continue; + } + MachineInstrBuilder Builder = BuildMI(MBB, MI, DL, get(Opcode), RI.getSubReg(DestReg, SubIdx)); @@ -696,38 +799,50 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB, unsigned TrueReg, unsigned FalseReg) const { MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + MachineFunction *MF = MBB.getParent(); + const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); + const TargetRegisterClass *BoolXExecRC = + RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass && "Not a VGPR32 reg"); if (Cond.size() == 1) { - unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); + unsigned SReg = MRI.createVirtualRegister(BoolXExecRC); BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg) .add(Cond[0]); BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) + .addImm(0) .addReg(FalseReg) + .addImm(0) .addReg(TrueReg) .addReg(SReg); } else if (Cond.size() == 2) { assert(Cond[0].isImm() && "Cond[0] is not an immediate"); switch (Cond[0].getImm()) { case SIInstrInfo::SCC_TRUE: { - unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); - BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg) + unsigned SReg = MRI.createVirtualRegister(BoolXExecRC); + BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32 + : AMDGPU::S_CSELECT_B64), SReg) .addImm(-1) .addImm(0); BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) + .addImm(0) .addReg(FalseReg) + .addImm(0) .addReg(TrueReg) .addReg(SReg); break; } case SIInstrInfo::SCC_FALSE: { - unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); - BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg) + unsigned SReg = MRI.createVirtualRegister(BoolXExecRC); + BuildMI(MBB, I, DL, get(ST.isWave32() ? 
AMDGPU::S_CSELECT_B32 + : AMDGPU::S_CSELECT_B64), SReg) .addImm(0) .addImm(-1); BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) + .addImm(0) .addReg(FalseReg) + .addImm(0) .addReg(TrueReg) .addReg(SReg); break; @@ -735,11 +850,13 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB, case SIInstrInfo::VCCNZ: { MachineOperand RegOp = Cond[1]; RegOp.setImplicit(false); - unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); + unsigned SReg = MRI.createVirtualRegister(BoolXExecRC); BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg) .add(RegOp); BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) + .addImm(0) .addReg(FalseReg) + .addImm(0) .addReg(TrueReg) .addReg(SReg); break; @@ -747,39 +864,49 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB, case SIInstrInfo::VCCZ: { MachineOperand RegOp = Cond[1]; RegOp.setImplicit(false); - unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); + unsigned SReg = MRI.createVirtualRegister(BoolXExecRC); BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg) .add(RegOp); BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) + .addImm(0) .addReg(TrueReg) + .addImm(0) .addReg(FalseReg) .addReg(SReg); break; } case SIInstrInfo::EXECNZ: { - unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); - unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); - BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2) + unsigned SReg = MRI.createVirtualRegister(BoolXExecRC); + unsigned SReg2 = MRI.createVirtualRegister(RI.getBoolRC()); + BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 + : AMDGPU::S_OR_SAVEEXEC_B64), SReg2) .addImm(0); - BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg) + BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32 + : AMDGPU::S_CSELECT_B64), SReg) .addImm(-1) .addImm(0); BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) + .addImm(0) .addReg(FalseReg) + .addImm(0) .addReg(TrueReg) .addReg(SReg); break; } case SIInstrInfo::EXECZ: { - unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); - unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); - BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2) + unsigned SReg = MRI.createVirtualRegister(BoolXExecRC); + unsigned SReg2 = MRI.createVirtualRegister(RI.getBoolRC()); + BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 + : AMDGPU::S_OR_SAVEEXEC_B64), SReg2) .addImm(0); - BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg) + BuildMI(MBB, I, DL, get(ST.isWave32() ? 
AMDGPU::S_CSELECT_B32 + : AMDGPU::S_CSELECT_B64), SReg) .addImm(0) .addImm(-1); BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) + .addImm(0) .addReg(FalseReg) + .addImm(0) .addReg(TrueReg) .addReg(SReg); llvm_unreachable("Unhandled branch predicate EXECZ"); @@ -798,7 +925,7 @@ unsigned SIInstrInfo::insertEQ(MachineBasicBlock *MBB, const DebugLoc &DL, unsigned SrcReg, int Value) const { MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); - unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + unsigned Reg = MRI.createVirtualRegister(RI.getBoolRC()); BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg) .addImm(Value) .addReg(SrcReg); @@ -811,7 +938,7 @@ unsigned SIInstrInfo::insertNE(MachineBasicBlock *MBB, const DebugLoc &DL, unsigned SrcReg, int Value) const { MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); - unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + unsigned Reg = MRI.createVirtualRegister(RI.getBoolRC()); BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg) .addImm(Value) .addReg(SrcReg); @@ -821,6 +948,8 @@ unsigned SIInstrInfo::insertNE(MachineBasicBlock *MBB, unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const { + if (RI.hasAGPRs(DstRC)) + return AMDGPU::COPY; if (RI.getRegSizeInBits(*DstRC) == 32) { return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; } else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) { @@ -837,12 +966,18 @@ static unsigned getSGPRSpillSaveOpcode(unsigned Size) { return AMDGPU::SI_SPILL_S32_SAVE; case 8: return AMDGPU::SI_SPILL_S64_SAVE; + case 12: + return AMDGPU::SI_SPILL_S96_SAVE; case 16: return AMDGPU::SI_SPILL_S128_SAVE; + case 20: + return AMDGPU::SI_SPILL_S160_SAVE; case 32: return AMDGPU::SI_SPILL_S256_SAVE; case 64: return AMDGPU::SI_SPILL_S512_SAVE; + case 128: + return AMDGPU::SI_SPILL_S1024_SAVE; default: llvm_unreachable("unknown register size"); } @@ -858,10 +993,31 @@ static unsigned getVGPRSpillSaveOpcode(unsigned Size) { return AMDGPU::SI_SPILL_V96_SAVE; case 16: return AMDGPU::SI_SPILL_V128_SAVE; + case 20: + return AMDGPU::SI_SPILL_V160_SAVE; case 32: return AMDGPU::SI_SPILL_V256_SAVE; case 64: return AMDGPU::SI_SPILL_V512_SAVE; + case 128: + return AMDGPU::SI_SPILL_V1024_SAVE; + default: + llvm_unreachable("unknown register size"); + } +} + +static unsigned getAGPRSpillSaveOpcode(unsigned Size) { + switch (Size) { + case 4: + return AMDGPU::SI_SPILL_A32_SAVE; + case 8: + return AMDGPU::SI_SPILL_A64_SAVE; + case 16: + return AMDGPU::SI_SPILL_A128_SAVE; + case 64: + return AMDGPU::SI_SPILL_A512_SAVE; + case 128: + return AMDGPU::SI_SPILL_A1024_SAVE; default: llvm_unreachable("unknown register size"); } @@ -906,12 +1062,12 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, .addFrameIndex(FrameIndex) // addr .addMemOperand(MMO) .addReg(MFI->getScratchRSrcReg(), RegState::Implicit) - .addReg(MFI->getFrameOffsetReg(), RegState::Implicit); + .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit); // Add the scratch resource registers as implicit uses because we may end up // needing them, and need to ensure that the reserved registers are // correctly handled. - - FrameInfo.setStackID(FrameIndex, SIStackID::SGPR_SPILL); + if (RI.spillSGPRToVGPR()) + FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill); if (ST.hasScalarStores()) { // m0 is used for offset to scalar stores if used to spill. 
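The spill-save helpers above dispatch purely on the register size in bytes. A minimal standalone sketch of that dispatch, with a plain enum standing in for the real SI_SPILL_S*_SAVE pseudo-opcodes (the names here are illustrative, not the LLVM API):

```cpp
#include <stdexcept>

// Illustrative stand-ins for the SI_SPILL_S*_SAVE pseudo-opcodes; the real
// helper returns members of the generated AMDGPU opcode enum instead.
enum class SGPRSpillSave { S32, S64, S96, S128, S160, S256, S512, S1024 };

// Same dispatch as getSGPRSpillSaveOpcode: the size is given in bytes, so
// 4 = one 32-bit SGPR, 12 = an SGPR triple, 128 = the new 1024-bit tuple.
SGPRSpillSave sgprSpillSaveOpcode(unsigned SizeInBytes) {
  switch (SizeInBytes) {
  case 4:   return SGPRSpillSave::S32;
  case 8:   return SGPRSpillSave::S64;
  case 12:  return SGPRSpillSave::S96;   // added by this change
  case 16:  return SGPRSpillSave::S128;
  case 20:  return SGPRSpillSave::S160;  // added by this change
  case 32:  return SGPRSpillSave::S256;
  case 64:  return SGPRSpillSave::S512;
  case 128: return SGPRSpillSave::S1024; // added by this change
  default:  throw std::invalid_argument("unknown register size");
  }
}
```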
Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead); @@ -920,17 +1076,22 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, return; } - assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected"); - - unsigned Opcode = getVGPRSpillSaveOpcode(SpillSize); + unsigned Opcode = RI.hasAGPRs(RC) ? getAGPRSpillSaveOpcode(SpillSize) + : getVGPRSpillSaveOpcode(SpillSize); MFI->setHasSpilledVGPRs(); - BuildMI(MBB, MI, DL, get(Opcode)) - .addReg(SrcReg, getKillRegState(isKill)) // data - .addFrameIndex(FrameIndex) // addr - .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc - .addReg(MFI->getFrameOffsetReg()) // scratch_offset - .addImm(0) // offset - .addMemOperand(MMO); + + auto MIB = BuildMI(MBB, MI, DL, get(Opcode)); + if (RI.hasAGPRs(RC)) { + MachineRegisterInfo &MRI = MF->getRegInfo(); + unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + MIB.addReg(Tmp, RegState::Define); + } + MIB.addReg(SrcReg, getKillRegState(isKill)) // data + .addFrameIndex(FrameIndex) // addr + .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc + .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset + .addImm(0) // offset + .addMemOperand(MMO); } static unsigned getSGPRSpillRestoreOpcode(unsigned Size) { @@ -939,12 +1100,18 @@ static unsigned getSGPRSpillRestoreOpcode(unsigned Size) { return AMDGPU::SI_SPILL_S32_RESTORE; case 8: return AMDGPU::SI_SPILL_S64_RESTORE; + case 12: + return AMDGPU::SI_SPILL_S96_RESTORE; case 16: return AMDGPU::SI_SPILL_S128_RESTORE; + case 20: + return AMDGPU::SI_SPILL_S160_RESTORE; case 32: return AMDGPU::SI_SPILL_S256_RESTORE; case 64: return AMDGPU::SI_SPILL_S512_RESTORE; + case 128: + return AMDGPU::SI_SPILL_S1024_RESTORE; default: llvm_unreachable("unknown register size"); } @@ -960,10 +1127,31 @@ static unsigned getVGPRSpillRestoreOpcode(unsigned Size) { return AMDGPU::SI_SPILL_V96_RESTORE; case 16: return AMDGPU::SI_SPILL_V128_RESTORE; + case 20: + return AMDGPU::SI_SPILL_V160_RESTORE; case 32: return AMDGPU::SI_SPILL_V256_RESTORE; case 64: return AMDGPU::SI_SPILL_V512_RESTORE; + case 128: + return AMDGPU::SI_SPILL_V1024_RESTORE; + default: + llvm_unreachable("unknown register size"); + } +} + +static unsigned getAGPRSpillRestoreOpcode(unsigned Size) { + switch (Size) { + case 4: + return AMDGPU::SI_SPILL_A32_RESTORE; + case 8: + return AMDGPU::SI_SPILL_A64_RESTORE; + case 16: + return AMDGPU::SI_SPILL_A128_RESTORE; + case 64: + return AMDGPU::SI_SPILL_A512_RESTORE; + case 128: + return AMDGPU::SI_SPILL_A1024_RESTORE; default: llvm_unreachable("unknown register size"); } @@ -999,12 +1187,13 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass); } - FrameInfo.setStackID(FrameIndex, SIStackID::SGPR_SPILL); + if (RI.spillSGPRToVGPR()) + FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill); MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc, DestReg) .addFrameIndex(FrameIndex) // addr .addMemOperand(MMO) .addReg(MFI->getScratchRSrcReg(), RegState::Implicit) - .addReg(MFI->getFrameOffsetReg(), RegState::Implicit); + .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit); if (ST.hasScalarStores()) { // m0 is used for offset to scalar stores if used to spill. 
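storeRegToStackSlot and loadRegFromStackSlot now fork on whether the register class contains AGPRs, and the AGPR pseudos additionally define a scratch VGPR: on the accelerator-register subtargets this patch targets, AGPR contents are staged through an ordinary VGPR on the way to and from memory. A toy rendering of that decision, with hypothetical names:

```cpp
// Illustrative stand-ins for the two pseudo-opcode families selected above.
enum class SpillOpc { VGPRSave, AGPRSave };

struct SpillPlan {
  SpillOpc Opcode;
  bool NeedsTmpVGPR; // the extra RegState::Define operand in the real code
};

// Mirrors the fork added to storeRegToStackSlot: AGPR data cannot be sent
// to scratch directly, so the AGPR pseudo also defines a staging VGPR.
SpillPlan planVectorSpill(bool ClassHasAGPRs) {
  return ClassHasAGPRs ? SpillPlan{SpillOpc::AGPRSave, true}
                       : SpillPlan{SpillOpc::VGPRSave, false};
}
```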
@@ -1014,15 +1203,19 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, return; } - assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected"); - - unsigned Opcode = getVGPRSpillRestoreOpcode(SpillSize); - BuildMI(MBB, MI, DL, get(Opcode), DestReg) - .addFrameIndex(FrameIndex) // vaddr - .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc - .addReg(MFI->getFrameOffsetReg()) // scratch_offset - .addImm(0) // offset - .addMemOperand(MMO); + unsigned Opcode = RI.hasAGPRs(RC) ? getAGPRSpillRestoreOpcode(SpillSize) + : getVGPRSpillRestoreOpcode(SpillSize); + auto MIB = BuildMI(MBB, MI, DL, get(Opcode), DestReg); + if (RI.hasAGPRs(RC)) { + MachineRegisterInfo &MRI = MF->getRegInfo(); + unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + MIB.addReg(Tmp, RegState::Define); + } + MIB.addFrameIndex(FrameIndex) // vaddr + .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc + .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset + .addImm(0) // offset + .addMemOperand(MMO); } /// \param @Offset Offset in bytes of the FrameIndex being spilled @@ -1089,7 +1282,8 @@ unsigned SIInstrInfo::calculateLDSSpillAddress( // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)) + TIDIG.Z getAddNoCarry(Entry, Insert, DL, TIDReg) .addReg(TIDReg) - .addReg(TIDIGZReg); + .addReg(TIDIGZReg) + .addImm(0); // clamp bit } else { // Get the wave id BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64), @@ -1114,7 +1308,8 @@ unsigned SIInstrInfo::calculateLDSSpillAddress( unsigned LDSOffset = MFI->getLDSSize() + (FrameOffset * WorkGroupSize); getAddNoCarry(MBB, MI, DL, TmpReg) .addImm(LDSOffset) - .addReg(TIDReg); + .addReg(TIDReg) + .addImm(0); // clamp bit return TmpReg; } @@ -1148,13 +1343,17 @@ void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const { if (MBB.succ_empty()) { bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end(); - if (HasNoTerminator) - BuildMI(MBB, MBB.end(), DebugLoc(), - get(Info->returnsVoid() ? AMDGPU::S_ENDPGM : AMDGPU::SI_RETURN_TO_EPILOG)); + if (HasNoTerminator) { + if (Info->returnsVoid()) { + BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0); + } else { + BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG)); + } + } } } -unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) const { +unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) { switch (MI.getOpcode()) { default: return 1; // FIXME: Do wait states equal cycles? @@ -1174,18 +1373,42 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MI.setDesc(get(AMDGPU::S_MOV_B64)); break; + case AMDGPU::S_MOV_B32_term: + // This is only a terminator to get the correct spill code placement during + // register allocation. + MI.setDesc(get(AMDGPU::S_MOV_B32)); + break; + case AMDGPU::S_XOR_B64_term: // This is only a terminator to get the correct spill code placement during // register allocation. MI.setDesc(get(AMDGPU::S_XOR_B64)); break; + case AMDGPU::S_XOR_B32_term: + // This is only a terminator to get the correct spill code placement during + // register allocation. + MI.setDesc(get(AMDGPU::S_XOR_B32)); + break; + + case AMDGPU::S_OR_B32_term: + // This is only a terminator to get the correct spill code placement during + // register allocation. + MI.setDesc(get(AMDGPU::S_OR_B32)); + break; + case AMDGPU::S_ANDN2_B64_term: // This is only a terminator to get the correct spill code placement during // register allocation. 
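Each of the new *_B32_term pseudos exists only so the register allocator treats the instruction as a block terminator when placing spill code; once allocation is done, expandPostRAPseudo swaps in the plain scalar opcode. The same decay, condensed over an illustrative enum (not the LLVM opcode table):

```cpp
enum class Opc {
  S_MOV_B32_term, S_XOR_B32_term, S_OR_B32_term, S_ANDN2_B32_term,
  S_MOV_B32, S_XOR_B32, S_OR_B32, S_ANDN2_B32
};

// After register allocation the terminator property no longer matters, so
// each _term pseudo decays to the ordinary scalar ALU instruction.
Opc expandTerminatorPseudo(Opc O) {
  switch (O) {
  case Opc::S_MOV_B32_term:   return Opc::S_MOV_B32;
  case Opc::S_XOR_B32_term:   return Opc::S_XOR_B32;
  case Opc::S_OR_B32_term:    return Opc::S_OR_B32;
  case Opc::S_ANDN2_B32_term: return Opc::S_ANDN2_B32;
  default:                    return O; // already a real instruction
  }
}
```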
MI.setDesc(get(AMDGPU::S_ANDN2_B64)); break; + case AMDGPU::S_ANDN2_B32_term: + // This is only a terminator to get the correct spill code placement during + // register allocation. + MI.setDesc(get(AMDGPU::S_ANDN2_B32)); + break; + case AMDGPU::V_MOV_B64_PSEUDO: { unsigned Dst = MI.getOperand(0).getReg(); unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0); @@ -1215,24 +1438,28 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { break; } case AMDGPU::V_SET_INACTIVE_B32: { - BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC) - .addReg(AMDGPU::EXEC); + unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64; + unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; + BuildMI(MBB, MI, DL, get(NotOpc), Exec) + .addReg(Exec); BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg()) .add(MI.getOperand(2)); - BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC) - .addReg(AMDGPU::EXEC); + BuildMI(MBB, MI, DL, get(NotOpc), Exec) + .addReg(Exec); MI.eraseFromParent(); break; } case AMDGPU::V_SET_INACTIVE_B64: { - BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC) - .addReg(AMDGPU::EXEC); + unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64; + unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; + BuildMI(MBB, MI, DL, get(NotOpc), Exec) + .addReg(Exec); MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), MI.getOperand(0).getReg()) .add(MI.getOperand(2)); expandPostRAPseudo(*Copy); - BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC) - .addReg(AMDGPU::EXEC); + BuildMI(MBB, MI, DL, get(NotOpc), Exec) + .addReg(Exec); MI.eraseFromParent(); break; } @@ -1282,10 +1509,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi) .addReg(RegHi); - if (MI.getOperand(2).getTargetFlags() == SIInstrInfo::MO_NONE) - MIB.addImm(0); - else - MIB.add(MI.getOperand(2)); + MIB.add(MI.getOperand(2)); Bundler.append(MIB); finalizeBundle(MBB, Bundler.begin()); @@ -1293,10 +1517,17 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MI.eraseFromParent(); break; } + case AMDGPU::ENTER_WWM: { + // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when + // WWM is entered. + MI.setDesc(get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 + : AMDGPU::S_OR_SAVEEXEC_B64)); + break; + } case AMDGPU::EXIT_WWM: { - // This only gets its own opcode so that SIFixWWMLiveness can tell when WWM - // is exited. - MI.setDesc(get(AMDGPU::S_MOV_B64)); + // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when + // WWM is exited. + MI.setDesc(get(ST.isWave32() ? 
AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64)); break; } case TargetOpcode::BUNDLE: { @@ -1492,7 +1723,7 @@ unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32)) .addReg(PCReg, RegState::Define, AMDGPU::sub0) .addReg(PCReg, 0, AMDGPU::sub0) - .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_FORWARD); + .addMBB(&DestBB, MO_LONG_BRANCH_FORWARD); BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32)) .addReg(PCReg, RegState::Define, AMDGPU::sub1) .addReg(PCReg, 0, AMDGPU::sub1) @@ -1502,7 +1733,7 @@ unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, BuildMI(MBB, I, DL, get(AMDGPU::S_SUB_U32)) .addReg(PCReg, RegState::Define, AMDGPU::sub0) .addReg(PCReg, 0, AMDGPU::sub0) - .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_BACKWARD); + .addMBB(&DestBB, MO_LONG_BRANCH_BACKWARD); BuildMI(MBB, I, DL, get(AMDGPU::S_SUBB_U32)) .addReg(PCReg, RegState::Define, AMDGPU::sub1) .addReg(PCReg, 0, AMDGPU::sub1) @@ -1659,6 +1890,10 @@ bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, case AMDGPU::S_MOV_B64_term: case AMDGPU::S_XOR_B64_term: case AMDGPU::S_ANDN2_B64_term: + case AMDGPU::S_MOV_B32_term: + case AMDGPU::S_XOR_B32_term: + case AMDGPU::S_OR_B32_term: + case AMDGPU::S_ANDN2_B32_term: break; case AMDGPU::SI_IF: case AMDGPU::SI_ELSE: @@ -1826,7 +2061,7 @@ bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB, CondCycles = TrueCycles = FalseCycles = NumInsts; // ??? // Limit to equal cost for branch vs. N v_cndmask_b32s. - return !RI.isSGPRClass(RC) && NumInsts <= 6; + return RI.hasVGPRs(RC) && NumInsts <= 6; } case SCC_TRUE: case SCC_FALSE: { @@ -1907,14 +2142,18 @@ void SIInstrInfo::insertSelect(MachineBasicBlock &MBB, const int16_t *SubIndices = Sub0_15; int NElts = DstSize / 32; - // 64-bit select is only avaialble for SALU. + // 64-bit select is only available for SALU. + // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit. if (Pred == SCC_TRUE) { - SelOp = AMDGPU::S_CSELECT_B64; - EltRC = &AMDGPU::SGPR_64RegClass; - SubIndices = Sub0_15_64; - - assert(NElts % 2 == 0); - NElts /= 2; + if (NElts % 2) { + SelOp = AMDGPU::S_CSELECT_B32; + EltRC = &AMDGPU::SGPR_32RegClass; + } else { + SelOp = AMDGPU::S_CSELECT_B64; + EltRC = &AMDGPU::SGPR_64RegClass; + SubIndices = Sub0_15_64; + NElts /= 2; + } } MachineInstrBuilder MIB = BuildMI( @@ -1934,6 +2173,7 @@ void SIInstrInfo::insertSelect(MachineBasicBlock &MBB, .addReg(FalseReg, 0, SubIdx) .addReg(TrueReg, 0, SubIdx); preserveCondRegFlags(Select->getOperand(3), Cond[1]); + fixImplicitOperands(*Select); MIB.addReg(DstElt) .addImm(SubIdx); @@ -1955,6 +2195,8 @@ bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) const { case AMDGPU::S_MOV_B32: case AMDGPU::S_MOV_B64: case AMDGPU::COPY: + case AMDGPU::V_ACCVGPR_WRITE_B32: + case AMDGPU::V_ACCVGPR_READ_B32: return true; default: return false; @@ -2007,6 +2249,7 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, case AMDGPU::V_MOV_B32_e32: case AMDGPU::S_MOV_B32: + case AMDGPU::V_ACCVGPR_WRITE_B32: break; } @@ -2020,6 +2263,11 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, if (Opc == AMDGPU::COPY) { bool isVGPRCopy = RI.isVGPR(*MRI, UseMI.getOperand(0).getReg()); unsigned NewOpc = isVGPRCopy ? 
AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32; + if (RI.isAGPR(*MRI, UseMI.getOperand(0).getReg())) { + if (!isInlineConstant(*ImmOp, AMDGPU::OPERAND_REG_INLINE_AC_INT32)) + return false; + NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32; + } UseMI.setDesc(get(NewOpc)); UseMI.getOperand(1).ChangeToImmediate(ImmOp->getImm()); UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent()); @@ -2027,7 +2275,9 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, } if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 || - Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64) { + Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64 || + Opc == AMDGPU::V_FMA_F32 || Opc == AMDGPU::V_FMAC_F32_e64 || + Opc == AMDGPU::V_FMA_F16 || Opc == AMDGPU::V_FMAC_F16_e64) { // Don't fold if we are using source or output modifiers. The new VOP2 // instructions don't have them. if (hasAnyModifiersSet(UseMI)) @@ -2042,7 +2292,10 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, if (isInlineConstant(UseMI, *Src0, *ImmOp)) return false; - bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64; + bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 || + Opc == AMDGPU::V_FMA_F32 || Opc == AMDGPU::V_FMAC_F32_e64; + bool IsFMA = Opc == AMDGPU::V_FMA_F32 || Opc == AMDGPU::V_FMAC_F32_e64 || + Opc == AMDGPU::V_FMA_F16 || Opc == AMDGPU::V_FMAC_F16_e64; MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1); MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2); @@ -2055,6 +2308,12 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg()))) return false; + unsigned NewOpc = + IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32 : AMDGPU::V_FMAMK_F16) + : (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16); + if (pseudoToMCOpcode(NewOpc) == -1) + return false; + // We need to swap operands 0 and 1 since madmk constant is at operand 1. const int64_t Imm = ImmOp->getImm(); @@ -2075,14 +2334,16 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Src0->setIsKill(Src1->isKill()); if (Opc == AMDGPU::V_MAC_F32_e64 || - Opc == AMDGPU::V_MAC_F16_e64) + Opc == AMDGPU::V_MAC_F16_e64 || + Opc == AMDGPU::V_FMAC_F32_e64 || + Opc == AMDGPU::V_FMAC_F16_e64) UseMI.untieRegOperand( AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); Src1->ChangeToImmediate(Imm); removeModOperands(UseMI); - UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16)); + UseMI.setDesc(get(NewOpc)); bool DeleteDef = MRI->hasOneNonDBGUse(Reg); if (DeleteDef) @@ -2107,9 +2368,11 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Src0->ChangeToImmediate(Def->getOperand(1).getImm()); Src0Inlined = true; } else if ((RI.isPhysicalRegister(Src0->getReg()) && - RI.isSGPRClass(RI.getPhysRegClass(Src0->getReg()))) || + (ST.getConstantBusLimit(Opc) <= 1 && + RI.isSGPRClass(RI.getPhysRegClass(Src0->getReg())))) || (RI.isVirtualRegister(Src0->getReg()) && - RI.isSGPRClass(MRI->getRegClass(Src0->getReg())))) + (ST.getConstantBusLimit(Opc) <= 1 && + RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))) return false; // VGPR is okay as Src0 - fallthrough } @@ -2130,6 +2393,12 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, // VGPR is okay as Src1 - fallthrough } + unsigned NewOpc = + IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32 : AMDGPU::V_FMAAK_F16) + : (IsF32 ? 
AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16); + if (pseudoToMCOpcode(NewOpc) == -1) + return false; + const int64_t Imm = ImmOp->getImm(); // FIXME: This would be a lot easier if we could return a new instruction @@ -2142,7 +2411,9 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp)); if (Opc == AMDGPU::V_MAC_F32_e64 || - Opc == AMDGPU::V_MAC_F16_e64) + Opc == AMDGPU::V_MAC_F16_e64 || + Opc == AMDGPU::V_FMAC_F32_e64 || + Opc == AMDGPU::V_FMAC_F16_e64) UseMI.untieRegOperand( AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); @@ -2151,7 +2422,11 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, // These come before src2. removeModOperands(UseMI); - UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16)); + UseMI.setDesc(get(NewOpc)); + // It might happen that UseMI was commuted + // and we now have SGPR as SRC1. If so 2 inlined + // constant and SGPR are illegal. + legalizeOperands(UseMI); bool DeleteDef = MRI->hasOneNonDBGUse(Reg); if (DeleteDef) @@ -2172,9 +2447,9 @@ static bool offsetsDoNotOverlap(int WidthA, int OffsetA, return LowOffset + LowWidth <= HighOffset; } -bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr &MIa, - MachineInstr &MIb) const { - MachineOperand *BaseOp0, *BaseOp1; +bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa, + const MachineInstr &MIb) const { + const MachineOperand *BaseOp0, *BaseOp1; int64_t Offset0, Offset1; if (getMemOperandWithOffset(MIa, BaseOp0, Offset0, &RI) && @@ -2196,8 +2471,8 @@ bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr &MIa, return false; } -bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr &MIa, - MachineInstr &MIb, +bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, + const MachineInstr &MIb, AliasAnalysis *AA) const { assert((MIa.mayLoad() || MIa.mayStore()) && "MIa must load from or modify a memory location"); @@ -2211,17 +2486,6 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr &MIa, if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef()) return false; - if (AA && MIa.hasOneMemOperand() && MIb.hasOneMemOperand()) { - const MachineMemOperand *MMOa = *MIa.memoperands_begin(); - const MachineMemOperand *MMOb = *MIb.memoperands_begin(); - if (MMOa->getValue() && MMOb->getValue()) { - MemoryLocation LocA(MMOa->getValue(), MMOa->getSize(), MMOa->getAAInfo()); - MemoryLocation LocB(MMOb->getValue(), MMOb->getSize(), MMOb->getAAInfo()); - if (!AA->alias(LocA, LocB)) - return true; - } - } - // TODO: Should we check the address space from the MachineMemOperand? 
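The disjointness test used by areMemAccessesTriviallyDisjoint reduces to a one-dimensional interval check; its key return statement appears in the offsetsDoNotOverlap helper earlier in this hunk. Reconstructed as a self-contained function with a small sanity check:

```cpp
#include <cassert>

// Accesses [OffsetA, OffsetA+WidthA) and [OffsetB, OffsetB+WidthB) cannot
// alias if the lower interval ends at or before the higher one begins.
static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
                                int WidthB, int OffsetB) {
  int LowOffset  = OffsetA < OffsetB ? OffsetA : OffsetB;
  int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
  int LowWidth   = (LowOffset == OffsetA) ? WidthA : WidthB;
  return LowOffset + LowWidth <= HighOffset;
}

int main() {
  assert(offsetsDoNotOverlap(4, 0, 4, 4));  // adjacent dwords: disjoint
  assert(!offsetsDoNotOverlap(8, 0, 4, 4)); // 8-byte access spans the second
}
```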
That // would allow us to distinguish objects we know don't alias based on the // underlying address space, even if it was lowered to a different one, @@ -2275,18 +2539,21 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB, LiveVariables *LV) const { unsigned Opc = MI.getOpcode(); bool IsF16 = false; - bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64; + bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 || + Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64; switch (Opc) { default: return nullptr; case AMDGPU::V_MAC_F16_e64: + case AMDGPU::V_FMAC_F16_e64: IsF16 = true; LLVM_FALLTHROUGH; case AMDGPU::V_MAC_F32_e64: case AMDGPU::V_FMAC_F32_e64: break; case AMDGPU::V_MAC_F16_e32: + case AMDGPU::V_FMAC_F16_e32: IsF16 = true; LLVM_FALLTHROUGH; case AMDGPU::V_MAC_F32_e32: @@ -2315,30 +2582,38 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB, const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp); const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod); - if (!IsFMA && !Src0Mods && !Src1Mods && !Clamp && !Omod && + if (!Src0Mods && !Src1Mods && !Clamp && !Omod && // If we have an SGPR input, we will violate the constant bus restriction. - (!Src0->isReg() || !RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg()))) { + (ST.getConstantBusLimit(Opc) > 1 || + !Src0->isReg() || + !RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg()))) { if (auto Imm = getFoldableImm(Src2)) { - return BuildMI(*MBB, MI, MI.getDebugLoc(), - get(IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32)) - .add(*Dst) - .add(*Src0) - .add(*Src1) - .addImm(Imm); + unsigned NewOpc = + IsFMA ? (IsF16 ? AMDGPU::V_FMAAK_F16 : AMDGPU::V_FMAAK_F32) + : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32); + if (pseudoToMCOpcode(NewOpc) != -1) + return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc)) + .add(*Dst) + .add(*Src0) + .add(*Src1) + .addImm(Imm); } + unsigned NewOpc = + IsFMA ? (IsF16 ? AMDGPU::V_FMAMK_F16 : AMDGPU::V_FMAMK_F32) + : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32); if (auto Imm = getFoldableImm(Src1)) { - return BuildMI(*MBB, MI, MI.getDebugLoc(), - get(IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32)) - .add(*Dst) - .add(*Src0) - .addImm(Imm) - .add(*Src2); + if (pseudoToMCOpcode(NewOpc) != -1) + return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc)) + .add(*Dst) + .add(*Src0) + .addImm(Imm) + .add(*Src2); } if (auto Imm = getFoldableImm(Src0)) { - if (isOperandLegal(MI, AMDGPU::getNamedOperandIdx(AMDGPU::V_MADMK_F32, + if (pseudoToMCOpcode(NewOpc) != -1 && + isOperandLegal(MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0), Src1)) - return BuildMI(*MBB, MI, MI.getDebugLoc(), - get(IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32)) + return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc)) .add(*Dst) .add(*Src1) .addImm(Imm) @@ -2346,9 +2621,11 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB, } } - assert((!IsFMA || !IsF16) && "fmac only expected with f32"); - unsigned NewOpc = IsFMA ? AMDGPU::V_FMA_F32 : - (IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32); + unsigned NewOpc = IsFMA ? (IsF16 ? AMDGPU::V_FMA_F16 : AMDGPU::V_FMA_F32) + : (IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32); + if (pseudoToMCOpcode(NewOpc) == -1) + return nullptr; + return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc)) .add(*Dst) .addImm(Src0Mods ? 
Src0Mods->getImm() : 0) @@ -2390,12 +2667,26 @@ bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI, changesVGPRIndexingMode(MI); } +bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const { + return Opcode == AMDGPU::DS_ORDERED_COUNT || + Opcode == AMDGPU::DS_GWS_INIT || + Opcode == AMDGPU::DS_GWS_SEMA_V || + Opcode == AMDGPU::DS_GWS_SEMA_BR || + Opcode == AMDGPU::DS_GWS_SEMA_P || + Opcode == AMDGPU::DS_GWS_SEMA_RELEASE_ALL || + Opcode == AMDGPU::DS_GWS_BARRIER; +} + bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const { unsigned Opcode = MI.getOpcode(); if (MI.mayStore() && isSMRD(MI)) return true; // scalar store or atomic + // This will terminate the function when other lanes may need to continue. + if (MI.isReturn()) + return true; + // These instructions cause shader I/O that may cause hardware lockups // when executed with an empty EXEC mask. // @@ -2403,10 +2694,12 @@ bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const // EXEC = 0, but checking for that case here seems not worth it // given the typical code patterns. if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT || - Opcode == AMDGPU::EXP || Opcode == AMDGPU::EXP_DONE) + Opcode == AMDGPU::EXP || Opcode == AMDGPU::EXP_DONE || + Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::S_TRAP || + Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_BARRIER) return true; - if (MI.isInlineAsm()) + if (MI.isCall() || MI.isInlineAsm()) return true; // conservative assumption // These are like SALU instructions in terms of effects, so it's questionable @@ -2420,8 +2713,36 @@ bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const return false; } +bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI, + const MachineInstr &MI) const { + if (MI.isMetaInstruction()) + return false; + + // This won't read exec if this is an SGPR->SGPR copy. + if (MI.isCopyLike()) { + if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg())) + return true; + + // Make sure this isn't copying exec as a normal operand + return MI.readsRegister(AMDGPU::EXEC, &RI); + } + + // Make a conservative assumption about the callee. + if (MI.isCall()) + return true; + + // Be conservative with any unhandled generic opcodes. + if (!isTargetSpecificOpcode(MI.getOpcode())) + return true; + + return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI); +} + bool SIInstrInfo::isInlineConstant(const APInt &Imm) const { switch (Imm.getBitWidth()) { + case 1: // This likely will be a condition code mask. 
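The new 1-bit case covers condition-mask immediates, which are always inlinable; the 32-bit case defers to AMDGPU::isInlinableLiteral32. A standalone rendering of that test using the standard SI inline-constant set (integers -16..64 plus a fixed list of float bit patterns, with 1/(2*pi) gated on the subtarget feature):

```cpp
#include <cstdint>
#include <cstring>

static uint32_t floatBits(float F) {
  uint32_t B;
  std::memcpy(&B, &F, sizeof B);
  return B;
}

// Small integers encode directly in the operand field, as do a handful of
// float constants; 1/(2*pi) is inlinable only when hasInv2PiInlineImm().
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi) {
  if (Literal >= -16 && Literal <= 64)
    return true;
  const uint32_t Val = static_cast<uint32_t>(Literal);
  return Val == floatBits(0.5f)  || Val == floatBits(-0.5f) ||
         Val == floatBits(1.0f)  || Val == floatBits(-1.0f) ||
         Val == floatBits(2.0f)  || Val == floatBits(-2.0f) ||
         Val == floatBits(4.0f)  || Val == floatBits(-4.0f) ||
         (Val == 0x3e22f983u && HasInv2Pi); // bit pattern of 1/(2*pi)
}
```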
+ return true; + case 32: return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(), ST.hasInv2PiInlineImm()); @@ -2454,7 +2775,9 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, case AMDGPU::OPERAND_REG_IMM_INT32: case AMDGPU::OPERAND_REG_IMM_FP32: case AMDGPU::OPERAND_REG_INLINE_C_INT32: - case AMDGPU::OPERAND_REG_INLINE_C_FP32: { + case AMDGPU::OPERAND_REG_INLINE_C_FP32: + case AMDGPU::OPERAND_REG_INLINE_AC_INT32: + case AMDGPU::OPERAND_REG_INLINE_AC_FP32: { int32_t Trunc = static_cast<int32_t>(Imm); return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm()); } @@ -2467,7 +2790,9 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, case AMDGPU::OPERAND_REG_IMM_INT16: case AMDGPU::OPERAND_REG_IMM_FP16: case AMDGPU::OPERAND_REG_INLINE_C_INT16: - case AMDGPU::OPERAND_REG_INLINE_C_FP16: { + case AMDGPU::OPERAND_REG_INLINE_C_FP16: + case AMDGPU::OPERAND_REG_INLINE_AC_INT16: + case AMDGPU::OPERAND_REG_INLINE_AC_FP16: { if (isInt<16>(Imm) || isUInt<16>(Imm)) { // A few special case instructions have 16-bit operands on subtargets // where 16-bit instructions are not legal. @@ -2480,19 +2805,14 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, return false; } + case AMDGPU::OPERAND_REG_IMM_V2INT16: + case AMDGPU::OPERAND_REG_IMM_V2FP16: case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: - case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: { - if (isUInt<16>(Imm)) { - int16_t Trunc = static_cast<int16_t>(Imm); - return ST.has16BitInsts() && - AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm()); - } - if (!(Imm & 0xffff)) { - return ST.has16BitInsts() && - AMDGPU::isInlinableLiteral16(Imm >> 16, ST.hasInv2PiInlineImm()); - } + case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: { uint32_t Trunc = static_cast<uint32_t>(Imm); - return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm()); + return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm()); } default: llvm_unreachable("invalid bitwidth"); @@ -2534,9 +2854,10 @@ static bool compareMachineOp(const MachineOperand &Op0, bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo, const MachineOperand &MO) const { - const MCOperandInfo &OpInfo = get(MI.getOpcode()).OpInfo[OpNo]; + const MCInstrDesc &InstDesc = MI.getDesc(); + const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpNo]; - assert(MO.isImm() || MO.isTargetIndex() || MO.isFI()); + assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal()); if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE) return true; @@ -2547,7 +2868,15 @@ bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo, if (MO.isImm() && isInlineConstant(MO, OpInfo)) return RI.opCanUseInlineConstant(OpInfo.OperandType); - return RI.opCanUseLiteralConstant(OpInfo.OperandType); + if (!RI.opCanUseLiteralConstant(OpInfo.OperandType)) + return false; + + if (!isVOP3(MI) || !AMDGPU::isSISrcOperand(InstDesc, OpNo)) + return true; + + const MachineFunction *MF = MI.getParent()->getParent(); + const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); + return ST.hasVOP3Literal(); } bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const { @@ -2586,7 +2915,8 @@ bool SIInstrInfo::canShrink(const MachineInstr &MI, // Can't shrink instruction with three operands. // FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add // a special case for it. It can only be shrunk if the third operand - // is vcc. 
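isImmOperandLegal, shown above, now ends with a subtarget query: a VOP3 source operand may carry a literal only where hasVOP3Literal() holds (GFX10). The decision chain, condensed into a plain predicate whose parameter names are descriptive stand-ins for the OpInfo and subtarget queries:

```cpp
// Follows the structure of isImmOperandLegal: inline constants are checked
// first, then plain literal support, then the VOP3 encoding restriction.
bool immOperandIsLegal(bool ValueIsInline, bool OperandTakesInline,
                       bool OperandTakesLiteral, bool IsVOP3SrcOperand,
                       bool SubtargetHasVOP3Literal) {
  if (ValueIsInline)
    return OperandTakesInline;
  if (!OperandTakesLiteral)
    return false;
  if (!IsVOP3SrcOperand)
    return true; // non-VOP3 encodings already supported one literal
  return SubtargetHasVOP3Literal;
}
```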
We should handle this the same way we handle vopc, by addding + // is vcc, and src0_modifiers and src1_modifiers are not set. + // We should handle this the same way we handle vopc, by addding // a register allocation hint pre-regalloc and then do the shrinking // post-regalloc. if (Src2) { @@ -2606,6 +2936,7 @@ bool SIInstrInfo::canShrink(const MachineInstr &MI, case AMDGPU::V_MAC_F32_e64: case AMDGPU::V_MAC_F16_e64: case AMDGPU::V_FMAC_F32_e64: + case AMDGPU::V_FMAC_F16_e64: if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) || hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers)) return false; @@ -2662,7 +2993,8 @@ MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI, // dst Inst32.add(MI.getOperand(0)); } else { - assert(MI.getOperand(0).getReg() == AMDGPU::VCC && + assert(((MI.getOperand(0).getReg() == AMDGPU::VCC) || + (MI.getOperand(0).getReg() == AMDGPU::VCC_LO)) && "Unexpected case"); } @@ -2707,19 +3039,19 @@ bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI, if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) return RI.isSGPRClass(MRI.getRegClass(MO.getReg())); - // FLAT_SCR is just an SGPR pair. - if (!MO.isImplicit() && (MO.getReg() == AMDGPU::FLAT_SCR)) - return true; - - // EXEC register uses the constant bus. - if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC) - return true; + // Null is free + if (MO.getReg() == AMDGPU::SGPR_NULL) + return false; // SGPRs use the constant bus - return (MO.getReg() == AMDGPU::VCC || MO.getReg() == AMDGPU::M0 || - (!MO.isImplicit() && - (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) || - AMDGPU::SGPR_64RegClass.contains(MO.getReg())))); + if (MO.isImplicit()) { + return MO.getReg() == AMDGPU::M0 || + MO.getReg() == AMDGPU::VCC || + MO.getReg() == AMDGPU::VCC_LO; + } else { + return AMDGPU::SReg_32RegClass.contains(MO.getReg()) || + AMDGPU::SReg_64RegClass.contains(MO.getReg()); + } } static unsigned findImplicitSGPRRead(const MachineInstr &MI) { @@ -2730,6 +3062,8 @@ static unsigned findImplicitSGPRRead(const MachineInstr &MI) { switch (MO.getReg()) { case AMDGPU::VCC: + case AMDGPU::VCC_LO: + case AMDGPU::VCC_HI: case AMDGPU::M0: case AMDGPU::FLAT_SCR: return MO.getReg(); @@ -2746,10 +3080,12 @@ static bool shouldReadExec(const MachineInstr &MI) { if (SIInstrInfo::isVALU(MI)) { switch (MI.getOpcode()) { case AMDGPU::V_READLANE_B32: - case AMDGPU::V_READLANE_B32_si: + case AMDGPU::V_READLANE_B32_gfx6_gfx7: + case AMDGPU::V_READLANE_B32_gfx10: case AMDGPU::V_READLANE_B32_vi: case AMDGPU::V_WRITELANE_B32: - case AMDGPU::V_WRITELANE_B32_si: + case AMDGPU::V_WRITELANE_B32_gfx6_gfx7: + case AMDGPU::V_WRITELANE_B32_gfx10: case AMDGPU::V_WRITELANE_B32_vi: return false; } @@ -2830,7 +3166,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, switch (Desc.OpInfo[i].OperandType) { case MCOI::OPERAND_REGISTER: - if (MI.getOperand(i).isImm()) { + if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) { ErrInfo = "Illegal immediate value for operand."; return false; } @@ -2843,7 +3179,11 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, case AMDGPU::OPERAND_REG_INLINE_C_INT64: case AMDGPU::OPERAND_REG_INLINE_C_FP64: case AMDGPU::OPERAND_REG_INLINE_C_INT16: - case AMDGPU::OPERAND_REG_INLINE_C_FP16: { + case AMDGPU::OPERAND_REG_INLINE_C_FP16: + case AMDGPU::OPERAND_REG_INLINE_AC_INT32: + case AMDGPU::OPERAND_REG_INLINE_AC_FP32: + case AMDGPU::OPERAND_REG_INLINE_AC_INT16: + case AMDGPU::OPERAND_REG_INLINE_AC_FP16: { const MachineOperand &MO = MI.getOperand(i); if (!MO.isReg() && (!MO.isImm() || 
!isInlineConstant(MI, i))) { ErrInfo = "Illegal immediate value for operand."; @@ -3022,9 +3362,12 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1) ++ConstantBusCount; + SmallVector<unsigned, 2> SGPRsUsed; unsigned SGPRUsed = findImplicitSGPRRead(MI); - if (SGPRUsed != AMDGPU::NoRegister) + if (SGPRUsed != AMDGPU::NoRegister) { ++ConstantBusCount; + SGPRsUsed.push_back(SGPRUsed); + } for (int OpIdx : OpIndices) { if (OpIdx == -1) @@ -3032,23 +3375,37 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, const MachineOperand &MO = MI.getOperand(OpIdx); if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) { if (MO.isReg()) { - if (MO.getReg() != SGPRUsed) - ++ConstantBusCount; SGPRUsed = MO.getReg(); + if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) { + return !RI.regsOverlap(SGPRUsed, SGPR); + })) { + ++ConstantBusCount; + SGPRsUsed.push_back(SGPRUsed); + } } else { ++ConstantBusCount; ++LiteralCount; } } } - if (ConstantBusCount > 1) { - ErrInfo = "VOP* instruction uses the constant bus more than once"; + const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); + // v_writelane_b32 is an exception from constant bus restriction: + // vsrc0 can be sgpr, const or m0 and lane select sgpr, m0 or inline-const + if (ConstantBusCount > ST.getConstantBusLimit(Opcode) && + Opcode != AMDGPU::V_WRITELANE_B32) { + ErrInfo = "VOP* instruction violates constant bus restriction"; return false; } if (isVOP3(MI) && LiteralCount) { - ErrInfo = "VOP3 instruction uses literal"; - return false; + if (LiteralCount && !ST.hasVOP3Literal()) { + ErrInfo = "VOP3 instruction uses literal"; + return false; + } + if (LiteralCount > 1) { + ErrInfo = "VOP3 instruction uses more than one literal"; + return false; + } } } @@ -3067,17 +3424,43 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } } + if (isSOP2(MI) || isSOPC(MI)) { + const MachineOperand &Src0 = MI.getOperand(Src0Idx); + const MachineOperand &Src1 = MI.getOperand(Src1Idx); + unsigned Immediates = 0; + + if (!Src0.isReg() && + !isInlineConstant(Src0, Desc.OpInfo[Src0Idx].OperandType)) + Immediates++; + if (!Src1.isReg() && + !isInlineConstant(Src1, Desc.OpInfo[Src1Idx].OperandType)) + Immediates++; + + if (Immediates > 1) { + ErrInfo = "SOP2/SOPC instruction requires too many immediate constants"; + return false; + } + } + if (isSOPK(MI)) { - int64_t Imm = getNamedOperand(MI, AMDGPU::OpName::simm16)->getImm(); - if (sopkIsZext(MI)) { - if (!isUInt<16>(Imm)) { - ErrInfo = "invalid immediate for SOPK instruction"; + auto Op = getNamedOperand(MI, AMDGPU::OpName::simm16); + if (Desc.isBranch()) { + if (!Op->isMBB()) { + ErrInfo = "invalid branch target for SOPK instruction"; return false; } } else { - if (!isInt<16>(Imm)) { - ErrInfo = "invalid immediate for SOPK instruction"; - return false; + uint64_t Imm = Op->getImm(); + if (sopkIsZext(MI)) { + if (!isUInt<16>(Imm)) { + ErrInfo = "invalid immediate for SOPK instruction"; + return false; + } + } else { + if (!isInt<16>(Imm)) { + ErrInfo = "invalid immediate for SOPK instruction"; + return false; + } } } } @@ -3155,6 +3538,53 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } } + if (isMIMG(MI)) { + const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim); + if (DimOp) { + int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode, + AMDGPU::OpName::vaddr0); + int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc); + const AMDGPU::MIMGInfo *Info = 
AMDGPU::getMIMGInfo(Opcode); + const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = + AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode); + const AMDGPU::MIMGDimInfo *Dim = + AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm()); + + if (!Dim) { + ErrInfo = "dim is out of range"; + return false; + } + + bool IsNSA = SRsrcIdx - VAddr0Idx > 1; + unsigned AddrWords = BaseOpcode->NumExtraArgs + + (BaseOpcode->Gradients ? Dim->NumGradients : 0) + + (BaseOpcode->Coordinates ? Dim->NumCoords : 0) + + (BaseOpcode->LodOrClampOrMip ? 1 : 0); + + unsigned VAddrWords; + if (IsNSA) { + VAddrWords = SRsrcIdx - VAddr0Idx; + } else { + const TargetRegisterClass *RC = getOpRegClass(MI, VAddr0Idx); + VAddrWords = MRI.getTargetRegisterInfo()->getRegSizeInBits(*RC) / 32; + if (AddrWords > 8) + AddrWords = 16; + else if (AddrWords > 4) + AddrWords = 8; + else if (AddrWords == 3 && VAddrWords == 4) { + // CodeGen uses the V4 variant of instructions for three addresses, + // because the selection DAG does not support non-power-of-two types. + AddrWords = 4; + } + } + + if (VAddrWords != AddrWords) { + ErrInfo = "bad vaddr size"; + return false; + } + } + } + const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl); if (DppCt) { using namespace AMDGPU::DPP; @@ -3165,10 +3595,29 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) || (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) || (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) || - (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST)) { + (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) || + (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) { ErrInfo = "Invalid dpp_ctrl value"; return false; } + if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 && + ST.getGeneration() >= AMDGPUSubtarget::GFX10) { + ErrInfo = "Invalid dpp_ctrl value: " + "wavefront shifts are not supported on GFX10+"; + return false; + } + if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 && + ST.getGeneration() >= AMDGPUSubtarget::GFX10) { + ErrInfo = "Invalid dpp_ctrl value: " + "broadcasts are not supported on GFX10+"; + return false; + } + if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST && + ST.getGeneration() < AMDGPUSubtarget::GFX10) { + ErrInfo = "Invalid dpp_ctrl value: " + "row_share and row_xmask are not supported before GFX10"; + return false; + } } return true; @@ -3183,9 +3632,12 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const { case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG; case AMDGPU::WQM: return AMDGPU::WQM; case AMDGPU::WWM: return AMDGPU::WWM; - case AMDGPU::S_MOV_B32: - return MI.getOperand(1).isReg() ? + case AMDGPU::S_MOV_B32: { + const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + return MI.getOperand(1).isReg() || + RI.isAGPR(MRI, MI.getOperand(0).getReg()) ? AMDGPU::COPY : AMDGPU::V_MOV_B32_e32; + } case AMDGPU::S_ADD_I32: return ST.hasAddNoCarry() ? 
AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_I32_e32; case AMDGPU::S_ADDC_U32: @@ -3199,7 +3651,9 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const { case AMDGPU::S_SUB_U32: return AMDGPU::V_SUB_I32_e32; case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32; - case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32; + case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32; + case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32; + case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32; case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64; case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64; case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64; @@ -3244,6 +3698,8 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const { case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ; case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ; } + llvm_unreachable( + "Unexpected scalar opcode without corresponding vector one!"); } const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, @@ -3263,30 +3719,21 @@ const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, return RI.getRegClass(RCID); } -bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const { - switch (MI.getOpcode()) { - case AMDGPU::COPY: - case AMDGPU::REG_SEQUENCE: - case AMDGPU::PHI: - case AMDGPU::INSERT_SUBREG: - return RI.hasVGPRs(getOpRegClass(MI, 0)); - default: - return RI.hasVGPRs(getOpRegClass(MI, OpNo)); - } -} - void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const { MachineBasicBlock::iterator I = MI; MachineBasicBlock *MBB = MI.getParent(); MachineOperand &MO = MI.getOperand(OpIdx); MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + const SIRegisterInfo *TRI = + static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo()); unsigned RCID = get(MI.getOpcode()).OpInfo[OpIdx].RegClass; const TargetRegisterClass *RC = RI.getRegClass(RCID); - unsigned Opcode = AMDGPU::V_MOV_B32_e32; + unsigned Size = TRI->getRegSizeInBits(*RC); + unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO : AMDGPU::V_MOV_B32_e32; if (MO.isReg()) Opcode = AMDGPU::COPY; else if (RI.isSGPRClass(RC)) - Opcode = AMDGPU::S_MOV_B32; + Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32; const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC); if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC)) @@ -3396,37 +3843,53 @@ bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI, return isLegalRegOperand(MRI, OpInfo, MO); // Handle non-register types that are treated like immediates. - assert(MO.isImm() || MO.isTargetIndex() || MO.isFI()); + assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal()); return true; } bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineOperand *MO) const { - const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + const MachineFunction &MF = *MI.getParent()->getParent(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); const MCInstrDesc &InstDesc = MI.getDesc(); const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx]; + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const TargetRegisterClass *DefinedRC = OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr; if (!MO) MO = &MI.getOperand(OpIdx); + int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode()); + int VOP3LiteralLimit = ST.hasVOP3Literal() ? 
1 : 0; if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) { + if (isVOP3(MI) && isLiteralConstantLike(*MO, OpInfo) && !VOP3LiteralLimit--) + return false; - RegSubRegPair SGPRUsed; + SmallDenseSet<RegSubRegPair> SGPRsUsed; if (MO->isReg()) - SGPRUsed = RegSubRegPair(MO->getReg(), MO->getSubReg()); + SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg())); for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { if (i == OpIdx) continue; const MachineOperand &Op = MI.getOperand(i); if (Op.isReg()) { - if ((Op.getReg() != SGPRUsed.Reg || Op.getSubReg() != SGPRUsed.SubReg) && + RegSubRegPair SGPR(Op.getReg(), Op.getSubReg()); + if (!SGPRsUsed.count(SGPR) && usesConstantBus(MRI, Op, InstDesc.OpInfo[i])) { - return false; + if (--ConstantBusLimit <= 0) + return false; + SGPRsUsed.insert(SGPR); } } else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) { - return false; + if (--ConstantBusLimit <= 0) + return false; + } else if (isVOP3(MI) && AMDGPU::isSISrcOperand(InstDesc, i) && + isLiteralConstantLike(Op, InstDesc.OpInfo[i])) { + if (!VOP3LiteralLimit--) + return false; + if (--ConstantBusLimit <= 0) + return false; } } } @@ -3437,7 +3900,7 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, } // Handle non-register types that are treated like immediates. - assert(MO->isImm() || MO->isTargetIndex() || MO->isFI()); + assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal()); if (!DefinedRC) { // This operand expects an immediate. @@ -3452,30 +3915,24 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, unsigned Opc = MI.getOpcode(); const MCInstrDesc &InstrDesc = get(Opc); + int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); + MachineOperand &Src0 = MI.getOperand(Src0Idx); + int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); MachineOperand &Src1 = MI.getOperand(Src1Idx); // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32 - // we need to only have one constant bus use. - // - // Note we do not need to worry about literal constants here. They are - // disabled for the operand type for instructions because they will always - // violate the one constant bus use rule. + // we need to only have one constant bus use before GFX10. bool HasImplicitSGPR = findImplicitSGPRRead(MI) != AMDGPU::NoRegister; - if (HasImplicitSGPR) { - int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); - MachineOperand &Src0 = MI.getOperand(Src0Idx); - - if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) - legalizeOpWithMove(MI, Src0Idx); - } + if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 && + Src0.isReg() && (RI.isSGPRReg(MRI, Src0.getReg()) || + isLiteralConstantLike(Src0, InstrDesc.OpInfo[Src0Idx]))) + legalizeOpWithMove(MI, Src0Idx); // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for // both the value to write (src0) and lane select (src1). Fix up non-SGPR // src0/src1 with V_READFIRSTLANE. if (Opc == AMDGPU::V_WRITELANE_B32) { - int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); - MachineOperand &Src0 = MI.getOperand(Src0Idx); const DebugLoc &DL = MI.getDebugLoc(); if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) { unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); @@ -3493,6 +3950,13 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, return; } + // No VOP2 instructions support AGPRs. 
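isOperandLegal now does explicit budget bookkeeping rather than assuming a single constant-bus slot: getConstantBusLimit() is 1 before GFX10 and 2 from GFX10 on (see the comment in legalizeOperandsVOP3 below), only GFX10's hasVOP3Literal() permits a literal in a VOP3 encoding, and re-reading an SGPR that is already on the bus is free. A toy version of that accounting, simplified from the real loop:

```cpp
#include <set>

struct OperandBudget {
  int ConstantBus;  // ST.getConstantBusLimit(Opc)
  int VOP3Literals; // ST.hasVOP3Literal() ? 1 : 0
  std::set<unsigned> SGPRsUsed;

  bool chargeSGPR(unsigned Reg) {
    if (SGPRsUsed.count(Reg))
      return true; // the same SGPR twice occupies only one bus slot
    if (ConstantBus <= 0)
      return false;
    --ConstantBus;
    SGPRsUsed.insert(Reg);
    return true;
  }

  bool chargeLiteral(bool IsVOP3Encoding) {
    if (IsVOP3Encoding) {
      if (VOP3Literals <= 0)
        return false;
      --VOP3Literals;
    }
    if (ConstantBus <= 0)
      return false;
    --ConstantBus;
    return true;
  }
};
```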
+ if (Src0.isReg() && RI.isAGPR(MRI, Src0.getReg())) + legalizeOpWithMove(MI, Src0Idx); + + if (Src1.isReg() && RI.isAGPR(MRI, Src1.getReg())) + legalizeOpWithMove(MI, Src1Idx); + // VOP2 src0 instructions support all operand types, so we don't need to check // their legality. If src1 is already legal, we don't need to do anything. if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1)) @@ -3520,9 +3984,6 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, return; } - int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); - MachineOperand &Src0 = MI.getOperand(Src0Idx); - // If src0 can be used as src1, commuting will make the operands legal. // Otherwise we have to give up and insert a move. // @@ -3556,12 +4017,11 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill); Src1.setSubReg(Src0SubReg); + fixImplicitOperands(MI); } -// Legalize VOP3 operands. Because all operand types are supported for any -// operand, and since literal constants are not allowed and should never be -// seen, we only need to worry about inserting copies if we use multiple SGPR -// operands. +// Legalize VOP3 operands. All operand types are supported for any operand +// but only one literal constant and only starting from GFX10. void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr &MI) const { unsigned Opc = MI.getOpcode(); @@ -3572,8 +4032,35 @@ void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI, AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2) }; + if (Opc == AMDGPU::V_PERMLANE16_B32 || + Opc == AMDGPU::V_PERMLANEX16_B32) { + // src1 and src2 must be scalar + MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]); + MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]); + const DebugLoc &DL = MI.getDebugLoc(); + if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) { + unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) + .add(Src1); + Src1.ChangeToRegister(Reg, false); + } + if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) { + unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) + .add(Src2); + Src2.ChangeToRegister(Reg, false); + } + } + // Find the one SGPR operand we are allowed to use. + int ConstantBusLimit = ST.getConstantBusLimit(Opc); + int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0; + SmallDenseSet<unsigned> SGPRsUsed; unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx); + if (SGPRReg != AMDGPU::NoRegister) { + SGPRsUsed.insert(SGPRReg); + --ConstantBusLimit; + } for (unsigned i = 0; i < 3; ++i) { int Idx = VOP3Idx[i]; @@ -3581,16 +4068,38 @@ void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI, break; MachineOperand &MO = MI.getOperand(Idx); - // We should never see a VOP3 instruction with an illegal immediate operand. 
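V_PERMLANE16/V_PERMLANEX16 (handled just below) and V_WRITELANE (above) require scalar values in operand slots where a VGPR may arrive, and the fix is always the same shape: define a fresh SGPR with V_READFIRSTLANE_B32 and rewrite the operand to use it. A toy model of that pattern; the helper here is a hypothetical stand-in for the BuildMI call:

```cpp
#include <cstdio>

struct Operand {
  bool IsReg;
  bool IsSGPR;
  unsigned Reg;
};

// Stand-in for BuildMI(..., get(AMDGPU::V_READFIRSTLANE_B32), NewReg):
// logs the copy and hands back a fresh "virtual register" number.
static unsigned NextVirtReg = 1000;
static unsigned emitReadFirstLane(unsigned VGPR) {
  unsigned SGPR = NextVirtReg++;
  std::printf("v_readfirstlane_b32 s%u, v%u\n", SGPR, VGPR);
  return SGPR;
}

void legalizeScalarOnlyOperand(Operand &Op) {
  if (Op.IsReg && !Op.IsSGPR) {
    Op.Reg = emitReadFirstLane(Op.Reg); // value is now in an SGPR
    Op.IsSGPR = true;
  }
}
```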
- if (!MO.isReg()) + if (!MO.isReg()) { + if (!isLiteralConstantLike(MO, get(Opc).OpInfo[Idx])) + continue; + + if (LiteralLimit > 0 && ConstantBusLimit > 0) { + --LiteralLimit; + --ConstantBusLimit; + continue; + } + + --LiteralLimit; + --ConstantBusLimit; + legalizeOpWithMove(MI, Idx); continue; + } + + if (RI.hasAGPRs(MRI.getRegClass(MO.getReg())) && + !isOperandLegal(MI, Idx, &MO)) { + legalizeOpWithMove(MI, Idx); + continue; + } if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg()))) continue; // VGPRs are legal - if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) { - SGPRReg = MO.getReg(); - // We can use one SGPR in each VOP3 instruction. + // We can use one SGPR in each VOP3 instruction prior to GFX10 + // and two starting from GFX10. + if (SGPRsUsed.count(MO.getReg())) + continue; + if (ConstantBusLimit > 0) { + SGPRsUsed.insert(MO.getReg()); + --ConstantBusLimit; continue; } @@ -3607,6 +4116,15 @@ unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI, unsigned DstReg = MRI.createVirtualRegister(SRC); unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32; + if (RI.hasAGPRs(VRC)) { + VRC = RI.getEquivalentVGPRClass(VRC); + unsigned NewSrcReg = MRI.createVirtualRegister(VRC); + BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), + get(TargetOpcode::COPY), NewSrcReg) + .addReg(SrcReg); + SrcReg = NewSrcReg; + } + if (SubRegs == 1) { BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), get(AMDGPU::V_READFIRSTLANE_B32), DstReg) @@ -3691,15 +4209,27 @@ static void emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, MachineOperand &Rsrc) { + MachineFunction &MF = *OrigBB.getParent(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; + unsigned SaveExecOpc = + ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64; + unsigned XorTermOpc = + ST.isWave32() ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term; + unsigned AndOpc = + ST.isWave32() ? 
AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; + const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); + MachineBasicBlock::iterator I = LoopBB.begin(); unsigned VRsrc = Rsrc.getReg(); unsigned VRsrcUndef = getUndefRegState(Rsrc.isUndef()); - unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); - unsigned CondReg0 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); - unsigned CondReg1 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); - unsigned AndCond = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + unsigned SaveExec = MRI.createVirtualRegister(BoolXExecRC); + unsigned CondReg0 = MRI.createVirtualRegister(BoolXExecRC); + unsigned CondReg1 = MRI.createVirtualRegister(BoolXExecRC); + unsigned AndCond = MRI.createVirtualRegister(BoolXExecRC); unsigned SRsrcSub0 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); unsigned SRsrcSub1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); unsigned SRsrcSub2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); @@ -3737,22 +4267,22 @@ emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), CondReg1) .addReg(SRsrc, 0, AMDGPU::sub2_sub3) .addReg(VRsrc, 0, AMDGPU::sub2_sub3); - BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_AND_B64), AndCond) + BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndCond) .addReg(CondReg0) .addReg(CondReg1); MRI.setSimpleHint(SaveExec, AndCond); // Update EXEC to matching lanes, saving original to SaveExec. - BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_AND_SAVEEXEC_B64), SaveExec) + BuildMI(LoopBB, I, DL, TII.get(SaveExecOpc), SaveExec) .addReg(AndCond, RegState::Kill); // The original instruction is here; we insert the terminators after it. I = LoopBB.end(); // Update EXEC, switch all done bits to 0 and all todo bits to 1. - BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_XOR_B64_term), AMDGPU::EXEC) - .addReg(AMDGPU::EXEC) + BuildMI(LoopBB, I, DL, TII.get(XorTermOpc), Exec) + .addReg(Exec) .addReg(SaveExec); BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(&LoopBB); } @@ -3763,15 +4293,19 @@ static void loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc, MachineDominatorTree *MDT) { MachineBasicBlock &MBB = *MI.getParent(); MachineFunction &MF = *MBB.getParent(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); MachineBasicBlock::iterator I(&MI); const DebugLoc &DL = MI.getDebugLoc(); + unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; + unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; + const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); - unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); + unsigned SaveExec = MRI.createVirtualRegister(BoolXExecRC); // Save the EXEC mask - BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B64), SaveExec) - .addReg(AMDGPU::EXEC); + BuildMI(MBB, I, DL, TII.get(MovExecOpc), SaveExec).addReg(Exec); // Killed uses in the instruction we are waterfalling around will be // incorrect due to the added control-flow. 
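emitLoadSRsrcFromVGPRLoop serializes a divergent resource descriptor: each trip through the loop services exactly the group of lanes that share one value. A scalar simulation of the control flow it emits (readfirstlane, compare, and_saveexec, run the op, xor exec, branch while execnz); lane count and types are simplified here:

```cpp
#include <array>
#include <cstdint>
#include <cstdio>

constexpr int Lanes = 8; // toy wavefront; real waves have 32 or 64 lanes

void waterfall(const std::array<uint64_t, Lanes> &VRsrc, uint64_t Exec) {
  while (Exec) {
    // v_readfirstlane_b32: take the descriptor of the first active lane.
    int First = __builtin_ctzll(Exec);
    uint64_t SRsrc = VRsrc[First];

    // v_cmp_eq + s_and + s_and_saveexec: lanes agreeing with that value.
    uint64_t Group = 0;
    for (int L = 0; L < Lanes; ++L)
      if (((Exec >> L) & 1) && VRsrc[L] == SRsrc)
        Group |= 1ull << L;

    // The waterfalled instruction runs once with a now-uniform descriptor.
    std::printf("run op with rsrc=%llu for lane mask 0x%llx\n",
                (unsigned long long)SRsrc, (unsigned long long)Group);

    // s_xor (the _term form): retire the serviced lanes from EXEC.
    Exec ^= Group;
  } // s_cbranch_execnz: loop while any lane remains
}
```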
@@ -3820,8 +4354,7 @@ static void loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, // Restore the EXEC mask MachineBasicBlock::iterator First = RemainderBB->begin(); - BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_MOV_B64), AMDGPU::EXEC) - .addReg(SaveExec); + BuildMI(*RemainderBB, First, DL, TII.get(MovExecOpc), Exec).addReg(SaveExec); } // Extract pointer from Rsrc and return a zero-value Rsrc replacement. @@ -3901,7 +4434,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI, continue; const TargetRegisterClass *OpRC = MRI.getRegClass(MI.getOperand(i).getReg()); - if (RI.hasVGPRs(OpRC)) { + if (RI.hasVectorRegisters(OpRC)) { VRC = OpRC; } else { SRC = OpRC; @@ -3914,7 +4447,8 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI, if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) { if (!VRC) { assert(SRC); - VRC = RI.getEquivalentVGPRClass(SRC); + VRC = RI.hasAGPRs(getOpRegClass(MI, 0)) ? RI.getEquivalentAGPRClass(SRC) + : RI.getEquivalentVGPRClass(SRC); } RC = VRC; } else { @@ -3983,7 +4517,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI, // Legalize SI_INIT_M0 if (MI.getOpcode() == AMDGPU::SI_INIT_M0) { MachineOperand &Src = MI.getOperand(0); - if (Src.isReg() && RI.hasVGPRs(MRI.getRegClass(Src.getReg()))) + if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg()))) Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI)); return; } @@ -4047,19 +4581,28 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI, unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); + const auto *BoolXExecRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); + unsigned CondReg0 = MRI.createVirtualRegister(BoolXExecRC); + unsigned CondReg1 = MRI.createVirtualRegister(BoolXExecRC); + unsigned RsrcPtr, NewSRsrc; std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc); // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0 - DebugLoc DL = MI.getDebugLoc(); - BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo) - .addReg(RsrcPtr, 0, AMDGPU::sub0) - .addReg(VAddr->getReg(), 0, AMDGPU::sub0); + const DebugLoc &DL = MI.getDebugLoc(); + BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e64), NewVAddrLo) + .addDef(CondReg0) + .addReg(RsrcPtr, 0, AMDGPU::sub0) + .addReg(VAddr->getReg(), 0, AMDGPU::sub0) + .addImm(0); // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1 - BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e32), NewVAddrHi) - .addReg(RsrcPtr, 0, AMDGPU::sub1) - .addReg(VAddr->getReg(), 0, AMDGPU::sub1); + BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi) + .addDef(CondReg1, RegState::Dead) + .addReg(RsrcPtr, 0, AMDGPU::sub1) + .addReg(VAddr->getReg(), 0, AMDGPU::sub1) + .addReg(CondReg0, RegState::Kill) + .addImm(0); // NewVaddr = {NewVaddrHi, NewVaddrLo} BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr) @@ -4106,6 +4649,10 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI, getNamedOperand(MI, AMDGPU::OpName::glc)) { MIB.addImm(GLC->getImm()); } + if (const MachineOperand *DLC = + getNamedOperand(MI, AMDGPU::OpName::dlc)) { + MIB.addImm(DLC->getImm()); + } MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc)); @@ -4235,37 +4782,37 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst, continue; case AMDGPU::S_LSHL_B32: - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + if (ST.hasOnlyRevVALUShifts()) { NewOpcode = AMDGPU::V_LSHLREV_B32_e64; swapOperands(Inst); } break; case AMDGPU::S_ASHR_I32: - if (ST.getGeneration() >= 
AMDGPUSubtarget::VOLCANIC_ISLANDS) { + if (ST.hasOnlyRevVALUShifts()) { NewOpcode = AMDGPU::V_ASHRREV_I32_e64; swapOperands(Inst); } break; case AMDGPU::S_LSHR_B32: - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + if (ST.hasOnlyRevVALUShifts()) { NewOpcode = AMDGPU::V_LSHRREV_B32_e64; swapOperands(Inst); } break; case AMDGPU::S_LSHL_B64: - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + if (ST.hasOnlyRevVALUShifts()) { NewOpcode = AMDGPU::V_LSHLREV_B64; swapOperands(Inst); } break; case AMDGPU::S_ASHR_I64: - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + if (ST.hasOnlyRevVALUShifts()) { NewOpcode = AMDGPU::V_ASHRREV_I64; swapOperands(Inst); } break; case AMDGPU::S_LSHR_B64: - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + if (ST.hasOnlyRevVALUShifts()) { NewOpcode = AMDGPU::V_LSHRREV_B64; swapOperands(Inst); } @@ -4279,10 +4826,16 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst, case AMDGPU::S_CBRANCH_SCC0: case AMDGPU::S_CBRANCH_SCC1: // Clear unused bits of vcc - BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B64), - AMDGPU::VCC) - .addReg(AMDGPU::EXEC) - .addReg(AMDGPU::VCC); + if (ST.isWave32()) + BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B32), + AMDGPU::VCC_LO) + .addReg(AMDGPU::EXEC_LO) + .addReg(AMDGPU::VCC_LO); + else + BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B64), + AMDGPU::VCC) + .addReg(AMDGPU::EXEC) + .addReg(AMDGPU::VCC); break; case AMDGPU::S_BFE_U64: @@ -4339,8 +4892,10 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst, for (unsigned i = Inst.getNumOperands() - 1; i > 0; --i) { MachineOperand &Op = Inst.getOperand(i); if (Op.isReg() && Op.getReg() == AMDGPU::SCC) { + // Only propagate through live-def of SCC. + if (Op.isDef() && !Op.isDead()) + addSCCDefUsersToVALUWorklist(Op, Inst, Worklist); Inst.RemoveOperand(i); - addSCCDefUsersToVALUWorklist(Inst, Worklist); } } @@ -4358,6 +4913,7 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst, } Inst.addImplicitDefUseOperands(*Inst.getParent()->getParent()); + fixImplicitOperands(Inst); if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) { const MachineOperand &OffsetWidthOp = Inst.getOperand(2); @@ -4445,6 +5001,7 @@ bool SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst, Inst.RemoveOperand(3); Inst.setDesc(get(NewOpc)); + Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit Inst.addImplicitDefUseOperands(*MBB.getParent()); MRI.replaceRegWith(OldDstReg, ResultReg); legalizeOperands(Inst, MDT); @@ -4514,8 +5071,7 @@ void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist, RI.isSGPRClass(MRI.getRegClass(Src0.getReg())); bool Src1IsSGPR = Src1.isReg() && RI.isSGPRClass(MRI.getRegClass(Src1.getReg())); - MachineInstr *Not = nullptr; - MachineInstr *Xor = nullptr; + MachineInstr *Xor; unsigned Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); @@ -4523,14 +5079,12 @@ void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist, // The next iteration over the work list will lower these to the vector // unit as necessary. 
if (Src0IsSGPR) { - Not = BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp) - .add(Src0); + BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0); Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest) .addReg(Temp) .add(Src1); } else if (Src1IsSGPR) { - Not = BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp) - .add(Src1); + BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1); Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest) .add(Src0) .addReg(Temp); @@ -4538,8 +5092,8 @@ void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist, Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp) .add(Src0) .add(Src1); - Not = BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest) - .addReg(Temp); + MachineInstr *Not = + BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp); Worklist.insert(Not); } @@ -4670,13 +5224,14 @@ void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist, MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); unsigned FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - unsigned CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); - unsigned DeadCarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); + unsigned CarryReg = MRI.createVirtualRegister(CarryRC); + unsigned DeadCarryReg = MRI.createVirtualRegister(CarryRC); MachineOperand &Dest = Inst.getOperand(0); MachineOperand &Src0 = Inst.getOperand(1); @@ -4705,7 +5260,8 @@ void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist, BuildMI(MBB, MII, DL, get(LoOpc), DestSub0) .addReg(CarryReg, RegState::Define) .add(SrcReg0Sub0) - .add(SrcReg1Sub0); + .add(SrcReg1Sub0) + .addImm(0); // clamp bit unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64; MachineInstr *HiHalf = @@ -4713,7 +5269,8 @@ void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist, .addReg(DeadCarryReg, RegState::Define | RegState::Dead) .add(SrcReg0Sub1) .add(SrcReg1Sub1) - .addReg(CarryReg, RegState::Kill); + .addReg(CarryReg, RegState::Kill) + .addImm(0); // clamp bit BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) .addReg(DestSub0) @@ -4943,7 +5500,23 @@ void SIInstrInfo::addUsersToMoveToVALUWorklist( for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg), E = MRI.use_end(); I != E;) { MachineInstr &UseMI = *I->getParent(); - if (!canReadVGPR(UseMI, I.getOperandNo())) { + + unsigned OpNo = 0; + + switch (UseMI.getOpcode()) { + case AMDGPU::COPY: + case AMDGPU::WQM: + case AMDGPU::WWM: + case AMDGPU::REG_SEQUENCE: + case AMDGPU::PHI: + case AMDGPU::INSERT_SUBREG: + break; + default: + OpNo = I.getOperandNo(); + break; + } + + if (!RI.hasVectorRegisters(getOpRegClass(UseMI, OpNo))) { Worklist.insert(&UseMI); do { @@ -5017,19 +5590,23 @@ void SIInstrInfo::movePackToVALU(SetVectorType &Worklist, addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); } -void SIInstrInfo::addSCCDefUsersToVALUWorklist( - MachineInstr &SCCDefInst, SetVectorType &Worklist) const { +void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op, + MachineInstr &SCCDefInst, + SetVectorType &Worklist) const { + // Ensure that def inst defines SCC, which is still live. 
+ assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() && + !Op.isDead() && Op.getParent() == &SCCDefInst); // This assumes that all the users of SCC are in the same block // as the SCC def. - for (MachineInstr &MI : - make_range(MachineBasicBlock::iterator(SCCDefInst), - SCCDefInst.getParent()->end())) { + for (MachineInstr &MI : // Skip the def inst itself. + make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)), + SCCDefInst.getParent()->end())) { + // Check if SCC is used first. + if (MI.findRegisterUseOperandIdx(AMDGPU::SCC, false, &RI) != -1) + Worklist.insert(&MI); // Exit if we find another SCC def. if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) != -1) return; - - if (MI.findRegisterUseOperandIdx(AMDGPU::SCC, false, &RI) != -1) - Worklist.insert(&MI); } } @@ -5046,14 +5623,26 @@ const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass( case AMDGPU::REG_SEQUENCE: case AMDGPU::INSERT_SUBREG: case AMDGPU::WQM: - case AMDGPU::WWM: - if (RI.hasVGPRs(NewDstRC)) - return nullptr; + case AMDGPU::WWM: { + const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1); + if (RI.hasAGPRs(SrcRC)) { + if (RI.hasAGPRs(NewDstRC)) + return nullptr; + + NewDstRC = RI.getEquivalentAGPRClass(NewDstRC); + if (!NewDstRC) + return nullptr; + } else { + if (RI.hasVGPRs(NewDstRC)) + return nullptr; + + NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); + if (!NewDstRC) + return nullptr; + } - NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); - if (!NewDstRC) - return nullptr; return NewDstRC; + } default: return NewDstRC; } @@ -5139,6 +5728,12 @@ MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI, } uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const { + if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) { + return (22ULL << 44) | // IMG_FORMAT_32_FLOAT + (1ULL << 56) | // RESOURCE_LEVEL = 1 + (3ULL << 60); // OOB_SELECT = 3 + } + uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT; if (ST.isAmdHsaOS()) { // Set ATC = 1. GFX9 doesn't have this bit. @@ -5165,12 +5760,14 @@ uint64_t SIInstrInfo::getScratchRsrcWords23() const { Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT; } - // IndexStride = 64. - Rsrc23 |= UINT64_C(3) << AMDGPU::RSRC_INDEX_STRIDE_SHIFT; + // IndexStride = 64 / 32. + uint64_t IndexStride = ST.getWavefrontSize() == 64 ? 3 : 2; + Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT; // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17]. // Clear them unless we want a huge stride. - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && + ST.getGeneration() <= AMDGPUSubtarget::GFX9) Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT; return Rsrc23; @@ -5267,25 +5864,35 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { return DescSize; // No operands. if (isLiteralConstantLike(MI.getOperand(Src0Idx), Desc.OpInfo[Src0Idx])) - return DescSize + 4; + return isVOP3(MI) ? 12 : (DescSize + 4); int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); if (Src1Idx == -1) return DescSize; if (isLiteralConstantLike(MI.getOperand(Src1Idx), Desc.OpInfo[Src1Idx])) - return DescSize + 4; + return isVOP3(MI) ? 12 : (DescSize + 4); int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); if (Src2Idx == -1) return DescSize; if (isLiteralConstantLike(MI.getOperand(Src2Idx), Desc.OpInfo[Src2Idx])) - return DescSize + 4; + return isVOP3(MI) ? 12 : (DescSize + 4); return DescSize; } + // Check whether we have extra NSA words. 
+ if (isMIMG(MI)) { + int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0); + if (VAddr0Idx < 0) + return 8; + + int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc); + return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4); + } + switch (Opc) { case TargetOpcode::IMPLICIT_DEF: case TargetOpcode::KILL: @@ -5294,10 +5901,12 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { return 0; case TargetOpcode::BUNDLE: return getInstBundleSize(MI); - case TargetOpcode::INLINEASM: { + case TargetOpcode::INLINEASM: + case TargetOpcode::INLINEASM_BR: { const MachineFunction *MF = MI.getParent()->getParent(); const char *AsmStr = MI.getOperand(0).getSymbolName(); - return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo()); + return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), + &MF->getSubtarget()); } default: return DescSize; @@ -5332,7 +5941,7 @@ void SIInstrInfo::convertNonUniformIfRegion(MachineBasicBlock *IfEntry, MachineRegisterInfo &MRI = IfEntry->getParent()->getRegInfo(); if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) { - unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + unsigned DstReg = MRI.createVirtualRegister(RI.getBoolRC()); MachineInstr *SIIF = BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_IF), DstReg) .add(Branch->getOperand(0)) @@ -5359,8 +5968,8 @@ void SIInstrInfo::convertNonUniformLoopRegion( if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) { - unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); - unsigned BackEdgeReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + unsigned DstReg = MRI.createVirtualRegister(RI.getBoolRC()); + unsigned BackEdgeReg = MRI.createVirtualRegister(RI.getBoolRC()); MachineInstrBuilder HeaderPHIBuilder = BuildMI(*(MF), Branch->getDebugLoc(), get(TargetOpcode::PHI), DstReg); for (MachineBasicBlock::pred_iterator PI = LoopEntry->pred_begin(), @@ -5370,7 +5979,7 @@ void SIInstrInfo::convertNonUniformLoopRegion( HeaderPHIBuilder.addReg(BackEdgeReg); } else { MachineBasicBlock *PMBB = *PI; - unsigned ZeroReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + unsigned ZeroReg = MRI.createVirtualRegister(RI.getBoolRC()); materializeImmediate(*PMBB, PMBB->getFirstTerminator(), DebugLoc(), ZeroReg, 0); HeaderPHIBuilder.addReg(ZeroReg); @@ -5432,7 +6041,9 @@ SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const { { MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo" }, { MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi" }, { MO_REL32_LO, "amdgpu-rel32-lo" }, - { MO_REL32_HI, "amdgpu-rel32-hi" } + { MO_REL32_HI, "amdgpu-rel32-hi" }, + { MO_ABS32_LO, "amdgpu-abs32-lo" }, + { MO_ABS32_HI, "amdgpu-abs32-hi" }, }; return makeArrayRef(TargetFlags); @@ -5452,8 +6063,8 @@ SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB, return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); - unsigned UnusedCarry = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); - MRI.setRegAllocationHint(UnusedCarry, 0, AMDGPU::VCC); + unsigned UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC()); + MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC()); return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_I32_e64), DestReg) .addReg(UnusedCarry, RegState::Define | RegState::Dead); @@ -5480,6 +6091,20 @@ const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) con } } +void SIInstrInfo::fixImplicitOperands(MachineInstr &MI) const { + MachineBasicBlock *MBB = 
MI.getParent(); + MachineFunction *MF = MBB->getParent(); + const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); + + if (!ST.isWave32()) + return; + + for (auto &Op : MI.implicit_operands()) { + if (Op.isReg() && Op.getReg() == AMDGPU::VCC) + Op.setReg(AMDGPU::VCC_LO); + } +} + bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const { if (!isSMRD(MI)) return false; @@ -5493,6 +6118,25 @@ bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const { return RCID == AMDGPU::SReg_128RegClassID; } +bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, + bool Signed) const { + // TODO: Should 0 be special cased? + if (!ST.hasFlatInstOffsets()) + return false; + + if (ST.hasFlatSegmentOffsetBug() && AddrSpace == AMDGPUAS::FLAT_ADDRESS) + return false; + + if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) { + return (Signed && isInt<12>(Offset)) || + (!Signed && isUInt<11>(Offset)); + } + + return (Signed && isInt<13>(Offset)) || + (!Signed && isUInt<12>(Offset)); +} + + // This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td enum SIEncodingFamily { SI = 0, @@ -5500,7 +6144,9 @@ enum SIEncodingFamily { SDWA = 2, SDWA9 = 3, GFX80 = 4, - GFX9 = 5 + GFX9 = 5, + GFX10 = 6, + SDWA10 = 7 }; static SIEncodingFamily subtargetEncodingFamily(const GCNSubtarget &ST) { @@ -5513,6 +6159,8 @@ static SIEncodingFamily subtargetEncodingFamily(const GCNSubtarget &ST) { case AMDGPUSubtarget::VOLCANIC_ISLANDS: case AMDGPUSubtarget::GFX9: return SIEncodingFamily::VI; + case AMDGPUSubtarget::GFX10: + return SIEncodingFamily::GFX10; } llvm_unreachable("Unknown subtarget generation!"); } @@ -5521,18 +6169,29 @@ int SIInstrInfo::pseudoToMCOpcode(int Opcode) const { SIEncodingFamily Gen = subtargetEncodingFamily(ST); if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 && - ST.getGeneration() >= AMDGPUSubtarget::GFX9) + ST.getGeneration() == AMDGPUSubtarget::GFX9) Gen = SIEncodingFamily::GFX9; - if (get(Opcode).TSFlags & SIInstrFlags::SDWA) - Gen = ST.getGeneration() == AMDGPUSubtarget::GFX9 ? SIEncodingFamily::SDWA9 - : SIEncodingFamily::SDWA; // Adjust the encoding family to GFX80 for D16 buffer instructions when the // subtarget has UnpackedD16VMem feature. // TODO: remove this when we discard GFX80 encoding. if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf)) Gen = SIEncodingFamily::GFX80; + if (get(Opcode).TSFlags & SIInstrFlags::SDWA) { + switch (ST.getGeneration()) { + default: + Gen = SIEncodingFamily::SDWA; + break; + case AMDGPUSubtarget::GFX9: + Gen = SIEncodingFamily::SDWA9; + break; + case AMDGPUSubtarget::GFX10: + Gen = SIEncodingFamily::SDWA10; + break; + } + } + int MCOp = AMDGPU::getMCOpcode(Opcode, Gen); // -1 means that Opcode is already a native instruction. @@ -5627,3 +6286,77 @@ MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, } return nullptr; } + +bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, + Register VReg, + const MachineInstr &DefMI, + const MachineInstr &UseMI) { + assert(MRI.isSSA() && "Must be run on SSA"); + + auto *TRI = MRI.getTargetRegisterInfo(); + auto *DefBB = DefMI.getParent(); + + // Don't bother searching between blocks, although it is possible this block + // doesn't modify exec. + if (UseMI.getParent() != DefBB) + return true; + + const int MaxInstScan = 20; + int NumInst = 0; + + // Stop scan at the use. 
+ auto E = UseMI.getIterator(); + for (auto I = std::next(DefMI.getIterator()); I != E; ++I) { + if (I->isDebugInstr()) + continue; + + if (++NumInst > MaxInstScan) + return true; + + if (I->modifiesRegister(AMDGPU::EXEC, TRI)) + return true; + } + + return false; +} + +bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, + Register VReg, + const MachineInstr &DefMI) { + assert(MRI.isSSA() && "Must be run on SSA"); + + auto *TRI = MRI.getTargetRegisterInfo(); + auto *DefBB = DefMI.getParent(); + + const int MaxUseInstScan = 10; + int NumUseInst = 0; + + for (auto &UseInst : MRI.use_nodbg_instructions(VReg)) { + // Don't bother searching between blocks, although it is possible this block + // doesn't modify exec. + if (UseInst.getParent() != DefBB) + return true; + + if (++NumUseInst > MaxUseInstScan) + return true; + } + + const int MaxInstScan = 20; + int NumInst = 0; + + // Stop scan when we have seen all the uses. + for (auto I = std::next(DefMI.getIterator()); ; ++I) { + if (I->isDebugInstr()) + continue; + + if (++NumInst > MaxInstScan) + return true; + + if (I->readsRegister(VReg)) + if (--NumUseInst == 0) + return false; + + if (I->modifiesRegister(AMDGPU::EXEC, TRI)) + return true; + } +} diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h index 5b1a05f3785e..3ff35da0b963 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.h +++ b/lib/Target/AMDGPU/SIInstrInfo.h @@ -1,9 +1,8 @@ //===- SIInstrInfo.h - SI Instruction Info Interface ------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -121,14 +120,15 @@ private: void addUsersToMoveToVALUWorklist(unsigned Reg, MachineRegisterInfo &MRI, SetVectorType &Worklist) const; - void - addSCCDefUsersToVALUWorklist(MachineInstr &SCCDefInst, - SetVectorType &Worklist) const; + void addSCCDefUsersToVALUWorklist(MachineOperand &Op, + MachineInstr &SCCDefInst, + SetVectorType &Worklist) const; const TargetRegisterClass * getDestEquivalentVGPRClass(const MachineInstr &Inst) const; - bool checkInstOffsetsDoNotOverlap(MachineInstr &MIa, MachineInstr &MIb) const; + bool checkInstOffsetsDoNotOverlap(const MachineInstr &MIa, + const MachineInstr &MIb) const; unsigned findUsedSGPR(const MachineInstr &MI, int OpIndices[3]) const; @@ -143,7 +143,7 @@ protected: public: enum TargetOperandFlags { - MO_MASK = 0x7, + MO_MASK = 0xf, MO_NONE = 0, // MO_GOTPCREL -> symbol@GOTPCREL -> R_AMDGPU_GOTPCREL. @@ -157,7 +157,13 @@ public: MO_REL32 = 4, MO_REL32_LO = 4, // MO_REL32_HI -> symbol@rel32@hi -> R_AMDGPU_REL32_HI. 
- MO_REL32_HI = 5 + MO_REL32_HI = 5, + + MO_LONG_BRANCH_FORWARD = 6, + MO_LONG_BRANCH_BACKWARD = 7, + + MO_ABS32_LO = 8, + MO_ABS32_HI = 9, }; explicit SIInstrInfo(const GCNSubtarget &ST); @@ -173,11 +179,13 @@ public: int64_t &Offset1, int64_t &Offset2) const override; - bool getMemOperandWithOffset(MachineInstr &LdSt, MachineOperand *&BaseOp, + bool getMemOperandWithOffset(const MachineInstr &LdSt, + const MachineOperand *&BaseOp, int64_t &Offset, const TargetRegisterInfo *TRI) const final; - bool shouldClusterMemOps(MachineOperand &BaseOp1, MachineOperand &BaseOp2, + bool shouldClusterMemOps(const MachineOperand &BaseOp1, + const MachineOperand &BaseOp2, unsigned NumLoads) const override; bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0, @@ -294,7 +302,8 @@ public: unsigned Kind) const override; bool - areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb, + areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, + const MachineInstr &MIb, AliasAnalysis *AA = nullptr) const override; bool isFoldableCopy(const MachineInstr &MI) const; @@ -376,6 +385,14 @@ public: return get(Opcode).TSFlags & SIInstrFlags::SOPP; } + static bool isPacked(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::IsPacked; + } + + bool isPacked(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::IsPacked; + } + static bool isVOP1(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::VOP1; } @@ -450,6 +467,8 @@ public: return get(Opcode).TSFlags & SIInstrFlags::DS; } + bool isAlwaysGDS(uint16_t Opcode) const; + static bool isMIMG(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::MIMG; } @@ -477,6 +496,11 @@ public: return (Flags & SIInstrFlags::FLAT) && !(Flags & SIInstrFlags::LGKM_CNT); } + // FIXME: Make this more precise + static bool isFLATScratch(const MachineInstr &MI) { + return isSegmentSpecificFLAT(MI); + } + // Any FLAT encoded instruction, including global_* and scratch_*. bool isFLAT(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::FLAT; @@ -546,6 +570,14 @@ public: return get(Opcode).TSFlags & SIInstrFlags::VINTRP; } + static bool isMAI(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::IsMAI; + } + + bool isMAI(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::IsMAI; + } + static bool isScalarUnit(const MachineInstr &MI) { return MI.getDesc().TSFlags & (SIInstrFlags::SALU | SIInstrFlags::SMRD); } @@ -612,6 +644,14 @@ public: return get(Opcode).TSFlags & SIInstrFlags::FPDPRounding; } + static bool isFPAtomic(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::FPAtomic; + } + + bool isFPAtomic(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::FPAtomic; + } + bool isVGPRCopy(const MachineInstr &MI) const { assert(MI.isCopy()); unsigned Dest = MI.getOperand(0).getReg(); @@ -620,9 +660,21 @@ public: return !RI.isSGPRReg(MRI, Dest); } + bool hasVGPRUses(const MachineInstr &MI) const { + const MachineFunction &MF = *MI.getParent()->getParent(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + return llvm::any_of(MI.explicit_uses(), + [&MRI, this](const MachineOperand &MO) { + return MO.isReg() && RI.isVGPR(MRI, MO.getReg());}); + } + /// Whether we must prevent this instruction from executing with EXEC = 0. bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const; + /// Returns true if the instruction could potentially depend on the value of + /// exec. 
If false, exec dependencies may safely be ignored. + bool mayReadEXEC(const MachineRegisterInfo &MRI, const MachineInstr &MI) const; + bool isInlineConstant(const APInt &Imm) const; bool isInlineConstant(const MachineOperand &MO, uint8_t OperandType) const; @@ -761,10 +813,6 @@ public: return RI.getRegSizeInBits(*getOpRegClass(MI, OpNo)) / 8; } - /// \returns true if it is legal for the operand at index \p OpNo - /// to read a VGPR. - bool canReadVGPR(const MachineInstr &MI, unsigned OpNo) const; - /// Legalize the \p OpIndex operand of this instruction by inserting /// a MOV. For example: /// ADD_I32_e32 VGPR0, 15 @@ -836,7 +884,7 @@ public: void insertReturn(MachineBasicBlock &MBB) const; /// Return the number of wait states that result from executing this /// instruction. - unsigned getNumWaitStates(const MachineInstr &MI) const; + static unsigned getNumWaitStates(const MachineInstr &MI); /// Returns the operand named \p Op. If \p MI does not have an /// operand named \c Op, this function returns nullptr. @@ -922,10 +970,27 @@ public: return isUInt<12>(Imm); } + /// Returns if \p Offset is legal for the subtarget as the offset to a FLAT + /// encoded instruction. If \p Signed, this is for an instruction that + /// interprets the offset as signed. + bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, + bool Signed) const; + /// \brief Return a target-specific opcode if Opcode is a pseudo instruction. /// Return -1 if the target-specific opcode for the pseudo instruction does /// not exist. If Opcode is not a pseudo instruction, this is identity. int pseudoToMCOpcode(int Opcode) const; + + const TargetRegisterClass *getRegClass(const MCInstrDesc &TID, unsigned OpNum, + const TargetRegisterInfo *TRI, + const MachineFunction &MF) + const override { + if (OpNum >= TID.getNumOperands()) + return nullptr; + return RI.getRegClass(TID.OpInfo[OpNum].RegClass); + } + + void fixImplicitOperands(MachineInstr &MI) const; }; /// \brief Returns true if a reg:subreg pair P has a TRC class @@ -956,6 +1021,21 @@ TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI, MachineInstr *getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, MachineRegisterInfo &MRI); +/// \brief Return false if EXEC is not changed between the def of \p VReg at \p +/// DefMI and the use at \p UseMI. Should be run on SSA. Currently does not +/// attempt to track between blocks. +bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, + Register VReg, + const MachineInstr &DefMI, + const MachineInstr &UseMI); + +/// \brief Return false if EXEC is not changed between the def of \p VReg at \p +/// DefMI and all its uses. Should be run on SSA. Currently does not attempt to +/// track between blocks. +bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, + Register VReg, + const MachineInstr &DefMI); + namespace AMDGPU { LLVM_READONLY @@ -1003,17 +1083,14 @@ namespace AMDGPU { LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode); + LLVM_READONLY + int getVCMPXNoSDstOp(uint16_t Opcode); + const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL; const uint64_t RSRC_ELEMENT_SIZE_SHIFT = (32 + 19); const uint64_t RSRC_INDEX_STRIDE_SHIFT = (32 + 21); const uint64_t RSRC_TID_ENABLE = UINT64_C(1) << (32 + 23); - // For MachineOperands. 
- enum TargetFlags { - TF_LONG_BRANCH_FORWARD = 1 << 0, - TF_LONG_BRANCH_BACKWARD = 1 << 1 - }; - } // end namespace AMDGPU namespace SI { diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td index 13afa4d4974b..c382c816e0b4 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.td +++ b/lib/Target/AMDGPU/SIInstrInfo.td @@ -1,25 +1,21 @@ //===-- SIInstrInfo.td - SI Instruction Infos -------------*- tablegen -*--===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -def isCI : Predicate<"Subtarget->getGeneration() " - ">= AMDGPUSubtarget::SEA_ISLANDS">; -def isCIOnly : Predicate<"Subtarget->getGeneration() ==" - "AMDGPUSubtarget::SEA_ISLANDS">, - AssemblerPredicate <"FeatureSeaIslands">; -def isVIOnly : Predicate<"Subtarget->getGeneration() ==" - "AMDGPUSubtarget::VOLCANIC_ISLANDS">, - AssemblerPredicate <"FeatureVolcanicIslands">; + +def isWave32 : Predicate<"Subtarget->getWavefrontSize() == 32">, + AssemblerPredicate <"FeatureWavefrontSize32">; +def isWave64 : Predicate<"Subtarget->getWavefrontSize() == 64">, + AssemblerPredicate <"FeatureWavefrontSize64">; def DisableInst : Predicate <"false">, AssemblerPredicate<"FeatureDisable">; class GCNPredicateControl : PredicateControl { - Predicate SIAssemblerPredicate = isSICI; - Predicate VIAssemblerPredicate = isVI; + Predicate SIAssemblerPredicate = isGFX6GFX7; + Predicate VIAssemblerPredicate = isGFX8GFX9; } // Except for the NONE field, this must be kept in sync with the @@ -32,6 +28,8 @@ def SIEncodingFamily { int SDWA9 = 3; int GFX80 = 4; int GFX9 = 5; + int GFX10 = 6; + int SDWA10 = 7; } //===----------------------------------------------------------------------===// @@ -41,10 +39,16 @@ def SIEncodingFamily { def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPUnaryOp>; def SIsbuffer_load : SDNode<"AMDGPUISD::SBUFFER_LOAD", - SDTypeProfile<1, 3, [SDTCisVT<1, v4i32>, SDTCisVT<2, i32>, SDTCisVT<3, i1>]>, + SDTypeProfile<1, 4, [SDTCisVT<1, v4i32>, SDTCisVT<2, i32>, SDTCisVT<3, i1>, + SDTCisVT<4, i1>]>, [SDNPMayLoad, SDNPMemOperand] >; +def SIds_ordered_count : SDNode<"AMDGPUISD::DS_ORDERED_COUNT", + SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, i16>]>, + [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain, SDNPInGlue] +>; + def SIatomic_inc : SDNode<"AMDGPUISD::ATOMIC_INC", SDTAtomic2, [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain] >; @@ -57,10 +61,6 @@ def SDTAtomic2_f32 : SDTypeProfile<1, 2, [ SDTCisSameAs<0,2>, SDTCisFP<0>, SDTCisPtrTy<1> ]>; -def SIatomic_fadd : SDNode<"AMDGPUISD::ATOMIC_LOAD_FADD", SDTAtomic2_f32, - [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain] ->; - def SIatomic_fmin : SDNode<"AMDGPUISD::ATOMIC_LOAD_FMIN", SDTAtomic2_f32, [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain] >; @@ -69,6 +69,13 @@ def SIatomic_fmax : SDNode<"AMDGPUISD::ATOMIC_LOAD_FMAX", SDTAtomic2_f32, [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain] >; +// load_d16_{lo|hi} ptr, tied_input +def SIload_d16 : SDTypeProfile<1, 2, [ + SDTCisPtrTy<1>, + SDTCisSameAs<0, 2> +]>; + + def SDTtbuffer_load : SDTypeProfile<1, 8, [ // vdata SDTCisVT<1, v4i32>, // rsrc @@ -101,9 +108,6 @@ def
SDTtbuffer_store : SDTypeProfile<0, 9, def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT", SDTtbuffer_store, [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; -def SItbuffer_store_x3 : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT_X3", - SDTtbuffer_store, - [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; def SItbuffer_store_d16 : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT_D16", SDTtbuffer_store, [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; @@ -120,6 +124,14 @@ def SDTBufferLoad : SDTypeProfile<1, 7, def SIbuffer_load : SDNode <"AMDGPUISD::BUFFER_LOAD", SDTBufferLoad, [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>; +def SIbuffer_load_ubyte : SDNode <"AMDGPUISD::BUFFER_LOAD_UBYTE", SDTBufferLoad, + [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>; +def SIbuffer_load_ushort : SDNode <"AMDGPUISD::BUFFER_LOAD_USHORT", SDTBufferLoad, + [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>; +def SIbuffer_load_byte : SDNode <"AMDGPUISD::BUFFER_LOAD_BYTE", SDTBufferLoad, + [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>; +def SIbuffer_load_short: SDNode <"AMDGPUISD::BUFFER_LOAD_SHORT", SDTBufferLoad, + [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>; def SIbuffer_load_format : SDNode <"AMDGPUISD::BUFFER_LOAD_FORMAT", SDTBufferLoad, [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>; def SIbuffer_load_format_d16 : SDNode <"AMDGPUISD::BUFFER_LOAD_FORMAT_D16", @@ -138,6 +150,12 @@ def SDTBufferStore : SDTypeProfile<0, 8, def SIbuffer_store : SDNode <"AMDGPUISD::BUFFER_STORE", SDTBufferStore, [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; +def SIbuffer_store_byte: SDNode <"AMDGPUISD::BUFFER_STORE_BYTE", + SDTBufferStore, + [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; +def SIbuffer_store_short : SDNode <"AMDGPUISD::BUFFER_STORE_SHORT", + SDTBufferStore, + [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; def SIbuffer_store_format : SDNode <"AMDGPUISD::BUFFER_STORE_FORMAT", SDTBufferStore, [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; @@ -147,9 +165,7 @@ def SIbuffer_store_format_d16 : SDNode <"AMDGPUISD::BUFFER_STORE_FORMAT_D16", class SDBufferAtomic<string opcode> : SDNode <opcode, SDTypeProfile<1, 8, - [SDTCisVT<0, i32>, // dst - SDTCisVT<1, i32>, // vdata - SDTCisVT<2, v4i32>, // rsrc + [SDTCisVT<2, v4i32>, // rsrc SDTCisVT<3, i32>, // vindex(VGPR) SDTCisVT<4, i32>, // voffset(VGPR) SDTCisVT<5, i32>, // soffset(SGPR) @@ -159,6 +175,19 @@ class SDBufferAtomic<string opcode> : SDNode <opcode, [SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore] >; +class SDBufferAtomicNoRtn<string opcode, ValueType ty> : SDNode <opcode, + SDTypeProfile<0, 8, + [SDTCisVT<0, ty>, // vdata + SDTCisVT<1, v4i32>, // rsrc + SDTCisVT<2, i32>, // vindex(VGPR) + SDTCisVT<3, i32>, // voffset(VGPR) + SDTCisVT<4, i32>, // soffset(SGPR) + SDTCisVT<5, i32>, // offset(imm) + SDTCisVT<6, i32>, // cachepolicy(imm) + SDTCisVT<7, i1>]>, // idxen(imm) + [SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore] +>; + def SIbuffer_atomic_swap : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_SWAP">; def SIbuffer_atomic_add : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_ADD">; def SIbuffer_atomic_sub : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_SUB">; @@ -169,6 +198,8 @@ def SIbuffer_atomic_umax : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_UMAX">; def SIbuffer_atomic_and : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_AND">; def SIbuffer_atomic_or : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_OR">; def SIbuffer_atomic_xor : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_XOR">; +def SIbuffer_atomic_fadd : SDBufferAtomicNoRtn <"AMDGPUISD::BUFFER_ATOMIC_FADD", f32>; +def 
SIbuffer_atomic_pk_fadd : SDBufferAtomicNoRtn <"AMDGPUISD::BUFFER_ATOMIC_PK_FADD", v2f16>; def SIbuffer_atomic_cmpswap : SDNode <"AMDGPUISD::BUFFER_ATOMIC_CMPSWAP", SDTypeProfile<1, 9, @@ -185,10 +216,54 @@ def SIbuffer_atomic_cmpswap : SDNode <"AMDGPUISD::BUFFER_ATOMIC_CMPSWAP", [SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore] >; +class SDGlobalAtomicNoRtn<string opcode, ValueType ty> : SDNode <opcode, + SDTypeProfile<0, 2, + [SDTCisPtrTy<0>, // vaddr + SDTCisVT<1, ty>]>, // vdata + [SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore] +>; + +def SIglobal_atomic_fadd : SDGlobalAtomicNoRtn <"AMDGPUISD::ATOMIC_FADD", f32>; +def SIglobal_atomic_pk_fadd : SDGlobalAtomicNoRtn <"AMDGPUISD::ATOMIC_PK_FADD", v2f16>; + def SIpc_add_rel_offset : SDNode<"AMDGPUISD::PC_ADD_REL_OFFSET", SDTypeProfile<1, 2, [SDTCisVT<0, iPTR>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]> >; +def SIlds : SDNode<"AMDGPUISD::LDS", + SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisSameAs<0,1>]> +>; + +def SIload_d16_lo : SDNode<"AMDGPUISD::LOAD_D16_LO", + SIload_d16, + [SDNPMayLoad, SDNPMemOperand, SDNPHasChain] +>; + +def SIload_d16_lo_u8 : SDNode<"AMDGPUISD::LOAD_D16_LO_U8", + SIload_d16, + [SDNPMayLoad, SDNPMemOperand, SDNPHasChain] +>; + +def SIload_d16_lo_i8 : SDNode<"AMDGPUISD::LOAD_D16_LO_I8", + SIload_d16, + [SDNPMayLoad, SDNPMemOperand, SDNPHasChain] +>; + +def SIload_d16_hi : SDNode<"AMDGPUISD::LOAD_D16_HI", + SIload_d16, + [SDNPMayLoad, SDNPMemOperand, SDNPHasChain] +>; + +def SIload_d16_hi_u8 : SDNode<"AMDGPUISD::LOAD_D16_HI_U8", + SIload_d16, + [SDNPMayLoad, SDNPMemOperand, SDNPHasChain] +>; + +def SIload_d16_hi_i8 : SDNode<"AMDGPUISD::LOAD_D16_HI_I8", + SIload_d16, + [SDNPMayLoad, SDNPMemOperand, SDNPHasChain] +>; + //===----------------------------------------------------------------------===// // ValueType helpers //===----------------------------------------------------------------------===// @@ -201,7 +276,8 @@ class isFloatType<ValueType SrcVT> { !if(!eq(SrcVT.Value, f32.Value), 1, !if(!eq(SrcVT.Value, f64.Value), 1, !if(!eq(SrcVT.Value, v2f16.Value), 1, - 0)))); + !if(!eq(SrcVT.Value, v4f16.Value), 1, + 0))))); } class isIntType<ValueType SrcVT> { @@ -215,8 +291,9 @@ class isIntType<ValueType SrcVT> { class isPackedType<ValueType SrcVT> { bit ret = !if(!eq(SrcVT.Value, v2i16.Value), 1, - !if(!eq(SrcVT.Value, v2f16.Value), 1, 0) - ); + !if(!eq(SrcVT.Value, v2f16.Value), 1, + !if(!eq(SrcVT.Value, v4f16.Value), 1, 0) + )); } //===----------------------------------------------------------------------===// @@ -228,7 +305,7 @@ defm atomic_dec_global : global_binary_atomic_op<SIatomic_dec>; def atomic_inc_local : local_binary_atomic_op<SIatomic_inc>; def atomic_dec_local : local_binary_atomic_op<SIatomic_dec>; -def atomic_load_fadd_local : local_binary_atomic_op<SIatomic_fadd>; +def atomic_load_fadd_local : local_binary_atomic_op<atomic_load_fadd>; def atomic_load_fmin_local : local_binary_atomic_op<SIatomic_fmin>; def atomic_load_fmax_local : local_binary_atomic_op<SIatomic_fmax>; @@ -250,13 +327,13 @@ def AMDGPUatomic_ld_glue : SDNode <"ISD::ATOMIC_LOAD", SDTAtomicLoad, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand, SDNPInGlue] >; -def unindexedload_glue : PatFrag <(ops node:$ptr), (AMDGPUld_glue node:$ptr), [{ - return cast<LoadSDNode>(N)->getAddressingMode() == ISD::UNINDEXED; -}]>; +def unindexedload_glue : PatFrag <(ops node:$ptr), (AMDGPUld_glue node:$ptr)> { + let IsUnindexed = 1; +} -def load_glue : PatFrag <(ops node:$ptr), (unindexedload_glue node:$ptr), [{ - return 
cast<LoadSDNode>(N)->getExtensionType() == ISD::NON_EXTLOAD; -}]>; +def load_glue : PatFrag <(ops node:$ptr), (unindexedload_glue node:$ptr)> { + let IsNonExtLoad = 1; +} def atomic_load_32_glue : PatFrag<(ops node:$ptr), (AMDGPUatomic_ld_glue node:$ptr)> { @@ -270,35 +347,49 @@ def atomic_load_64_glue : PatFrag<(ops node:$ptr), let MemoryVT = i64; } -def extload_glue : PatFrag<(ops node:$ptr), (load_glue node:$ptr), [{ - return cast<LoadSDNode>(N)->getExtensionType() == ISD::EXTLOAD; -}]>; +def extload_glue : PatFrag<(ops node:$ptr), (load_glue node:$ptr)> { + let IsLoad = 1; + let IsAnyExtLoad = 1; +} def sextload_glue : PatFrag<(ops node:$ptr), (unindexedload_glue node:$ptr), [{ return cast<LoadSDNode>(N)->getExtensionType() == ISD::SEXTLOAD; }]>; -def zextload_glue : PatFrag<(ops node:$ptr), (unindexedload_glue node:$ptr), [{ - return cast<LoadSDNode>(N)->getExtensionType() == ISD::ZEXTLOAD; -}]>; +def zextload_glue : PatFrag<(ops node:$ptr), (unindexedload_glue node:$ptr)> { + let IsLoad = 1; + let IsZeroExtLoad = 1; +} -def az_extload_glue : AZExtLoadBase <unindexedload_glue>; +def extloadi8_glue : PatFrag<(ops node:$ptr), (extload_glue node:$ptr)> { + let IsLoad = 1; + let MemoryVT = i8; +} -def az_extloadi8_glue : PatFrag<(ops node:$ptr), (az_extload_glue node:$ptr), [{ - return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8; -}]>; +def zextloadi8_glue : PatFrag<(ops node:$ptr), (zextload_glue node:$ptr)> { + let IsLoad = 1; + let MemoryVT = i8; +} -def az_extloadi16_glue : PatFrag<(ops node:$ptr), (az_extload_glue node:$ptr), [{ - return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16; -}]>; +def extloadi16_glue : PatFrag<(ops node:$ptr), (extload_glue node:$ptr)> { + let IsLoad = 1; + let MemoryVT = i16; +} -def sextloadi8_glue : PatFrag<(ops node:$ptr), (sextload_glue node:$ptr), [{ - return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8; -}]>; +def zextloadi16_glue : PatFrag<(ops node:$ptr), (zextload_glue node:$ptr)> { + let IsLoad = 1; + let MemoryVT = i16; +} -def sextloadi16_glue : PatFrag<(ops node:$ptr), (sextload_glue node:$ptr), [{ - return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16; -}]>; +def sextloadi8_glue : PatFrag<(ops node:$ptr), (sextload_glue node:$ptr)> { + let IsLoad = 1; + let MemoryVT = i8; +} + +def sextloadi16_glue : PatFrag<(ops node:$ptr), (sextload_glue node:$ptr)> { + let IsLoad = 1; + let MemoryVT = i16; +} def load_glue_align8 : Aligned8Bytes < (ops node:$ptr), (load_glue node:$ptr) @@ -311,8 +402,10 @@ def load_glue_align16 : Aligned16Bytes < def load_local_m0 : LoadFrag<load_glue>, LocalAddress; def sextloadi8_local_m0 : LoadFrag<sextloadi8_glue>, LocalAddress; def sextloadi16_local_m0 : LoadFrag<sextloadi16_glue>, LocalAddress; -def az_extloadi8_local_m0 : LoadFrag<az_extloadi8_glue>, LocalAddress; -def az_extloadi16_local_m0 : LoadFrag<az_extloadi16_glue>, LocalAddress; +def extloadi8_local_m0 : LoadFrag<extloadi8_glue>, LocalAddress; +def zextloadi8_local_m0 : LoadFrag<zextloadi8_glue>, LocalAddress; +def extloadi16_local_m0 : LoadFrag<extloadi16_glue>, LocalAddress; +def zextloadi16_local_m0 : LoadFrag<zextloadi16_glue>, LocalAddress; def load_align8_local_m0 : LoadFrag <load_glue_align8>, LocalAddress; def load_align16_local_m0 : LoadFrag <load_glue_align16>, LocalAddress; def atomic_load_32_local_m0 : LoadFrag<atomic_load_32_glue>, LocalAddress; @@ -386,6 +479,51 @@ def si_setcc_uniform : PatFrag < return true; }]>; +//===----------------------------------------------------------------------===// +// SDNodes PatFrags for d16 loads 
+//===----------------------------------------------------------------------===// + +class LoadD16Frag <SDPatternOperator op> : PatFrag<(ops node:$ptr, node:$tied_in), (op node:$ptr, node:$tied_in)>; +class LocalLoadD16 <SDPatternOperator op> : LoadD16Frag <op>, LocalAddress; +class GlobalLoadD16 <SDPatternOperator op> : LoadD16Frag <op>, GlobalLoadAddress; +class PrivateLoadD16 <SDPatternOperator op> : LoadD16Frag <op>, PrivateAddress; +class FlatLoadD16 <SDPatternOperator op> : LoadD16Frag <op>, FlatLoadAddress; + +def load_d16_hi_local : LocalLoadD16 <SIload_d16_hi>; +def az_extloadi8_d16_hi_local : LocalLoadD16 <SIload_d16_hi_u8>; +def sextloadi8_d16_hi_local : LocalLoadD16 <SIload_d16_hi_i8>; + +def load_d16_hi_global : GlobalLoadD16 <SIload_d16_hi>; +def az_extloadi8_d16_hi_global : GlobalLoadD16 <SIload_d16_hi_u8>; +def sextloadi8_d16_hi_global : GlobalLoadD16 <SIload_d16_hi_i8>; + +def load_d16_hi_private : PrivateLoadD16 <SIload_d16_hi>; +def az_extloadi8_d16_hi_private : PrivateLoadD16 <SIload_d16_hi_u8>; +def sextloadi8_d16_hi_private : PrivateLoadD16 <SIload_d16_hi_i8>; + +def load_d16_hi_flat : FlatLoadD16 <SIload_d16_hi>; +def az_extloadi8_d16_hi_flat : FlatLoadD16 <SIload_d16_hi_u8>; +def sextloadi8_d16_hi_flat : FlatLoadD16 <SIload_d16_hi_i8>; + + +def load_d16_lo_local : LocalLoadD16 <SIload_d16_lo>; +def az_extloadi8_d16_lo_local : LocalLoadD16 <SIload_d16_lo_u8>; +def sextloadi8_d16_lo_local : LocalLoadD16 <SIload_d16_lo_i8>; + +def load_d16_lo_global : GlobalLoadD16 <SIload_d16_lo>; +def az_extloadi8_d16_lo_global : GlobalLoadD16 <SIload_d16_lo_u8>; +def sextloadi8_d16_lo_global : GlobalLoadD16 <SIload_d16_lo_i8>; + +def load_d16_lo_private : PrivateLoadD16 <SIload_d16_lo>; +def az_extloadi8_d16_lo_private : PrivateLoadD16 <SIload_d16_lo_u8>; +def sextloadi8_d16_lo_private : PrivateLoadD16 <SIload_d16_lo_i8>; + +def load_d16_lo_flat : FlatLoadD16 <SIload_d16_lo>; +def az_extloadi8_d16_lo_flat : FlatLoadD16 <SIload_d16_lo_u8>; +def sextloadi8_d16_lo_flat : FlatLoadD16 <SIload_d16_lo_i8>; + + + def lshr_rev : PatFrag < (ops node:$src1, node:$src0), (srl $src0, $src1) @@ -410,6 +548,7 @@ multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0, >; def _local_m0 : local_binary_atomic_op <!cast<SDNode>(NAME#"_glue")>; + def _region_m0 : region_binary_atomic_op <!cast<SDNode>(NAME#"_glue")>; } defm atomic_load_add : SIAtomicM0Glue2 <"LOAD_ADD">; @@ -424,7 +563,7 @@ defm atomic_load_xor : SIAtomicM0Glue2 <"LOAD_XOR">; defm atomic_load_umin : SIAtomicM0Glue2 <"LOAD_UMIN">; defm atomic_load_umax : SIAtomicM0Glue2 <"LOAD_UMAX">; defm atomic_swap : SIAtomicM0Glue2 <"SWAP">; -defm atomic_load_fadd : SIAtomicM0Glue2 <"LOAD_FADD", 1, SDTAtomic2_f32>; +defm atomic_load_fadd : SIAtomicM0Glue2 <"LOAD_FADD", 0, SDTAtomic2_f32>; defm atomic_load_fmin : SIAtomicM0Glue2 <"LOAD_FMIN", 1, SDTAtomic2_f32>; defm atomic_load_fmax : SIAtomicM0Glue2 <"LOAD_FMAX", 1, SDTAtomic2_f32>; @@ -433,6 +572,7 @@ def atomic_cmp_swap_glue : SDNode <"ISD::ATOMIC_CMP_SWAP", SDTAtomic3, >; def atomic_cmp_swap_local_m0 : AtomicCmpSwapLocal<atomic_cmp_swap_glue>; +def atomic_cmp_swap_region_m0 : AtomicCmpSwapRegion<atomic_cmp_swap_glue>; def as_i1imm : SDNodeXForm<imm, [{ @@ -482,8 +622,12 @@ class bitextract_imm<int bitnum> : SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(Bit, SDLoc(N), MVT::i1); }]>; -def SIMM16bit : PatLeaf <(imm), - [{return isInt<16>(N->getSExtValue());}] +def SIMM16bit : ImmLeaf <i32, + [{return isInt<16>(Imm);}] +>; + +def UIMM16bit : ImmLeaf <i32, + [{return isUInt<16>(Imm); }] 
>; class InlineImm <ValueType vt> : PatLeaf <(vt imm), [{ @@ -515,6 +659,22 @@ def ShiftAmt32Imm : PatLeaf <(imm), [{ return N->getZExtValue() < 32; }]>; +def getNegV2I16Imm : SDNodeXForm<build_vector, [{ + return SDValue(packNegConstantV2I16(N, *CurDAG), 0); +}]>; + +def NegSubInlineConstV216 : PatLeaf<(build_vector), [{ + assert(N->getNumOperands() == 2); + assert(N->getOperand(0).getValueType().getSizeInBits() == 16); + SDValue Src0 = N->getOperand(0); + SDValue Src1 = N->getOperand(1); + if (Src0 == Src1) + return isNegInlineImmediate(Src0.getNode()); + + return (isNullConstantOrUndef(Src0) && isNegInlineImmediate(Src1.getNode())) || + (isNullConstantOrUndef(Src1) && isNegInlineImmediate(Src0.getNode())); +}], getNegV2I16Imm>; + //===----------------------------------------------------------------------===// // Custom Operands //===----------------------------------------------------------------------===// @@ -588,6 +748,14 @@ def SwizzleMatchClass : AsmOperandClass { let IsOptional = 1; } +def EndpgmMatchClass : AsmOperandClass { + let Name = "EndpgmImm"; + let PredicateMethod = "isEndpgm"; + let ParserMethod = "parseEndpgmOp"; + let RenderMethod = "addImmOperands"; + let IsOptional = 1; +} + def ExpTgtMatchClass : AsmOperandClass { let Name = "ExpTgt"; let PredicateMethod = "isExpTgt"; @@ -605,6 +773,11 @@ def SwizzleImm : Operand<i16> { let ParserMatchClass = SwizzleMatchClass; } +def EndpgmImm : Operand<i16> { + let PrintMethod = "printEndpgm"; + let ParserMatchClass = EndpgmMatchClass; +} + def SWaitMatchClass : AsmOperandClass { let Name = "SWaitCnt"; let RenderMethod = "addImmOperands"; @@ -619,11 +792,41 @@ def VReg32OrOffClass : AsmOperandClass { def WAIT_FLAG : Operand <i32> { let ParserMatchClass = SWaitMatchClass; let PrintMethod = "printWaitFlag"; + let OperandType = "OPERAND_IMMEDIATE"; } include "SIInstrFormats.td" include "VIInstrFormats.td" +def BoolReg : AsmOperandClass { + let Name = "BoolReg"; + let ParserMethod = "parseBoolReg"; + let RenderMethod = "addRegOperands"; +} + +class BoolRC : RegisterOperand<SReg_1> { + let ParserMatchClass = BoolReg; + let DecoderMethod = "decodeBoolReg"; +} + +def SSrc_i1 : RegisterOperand<SReg_1_XEXEC> { + let ParserMatchClass = BoolReg; + let DecoderMethod = "decodeBoolReg"; +} + +def VOPDstS64orS32 : BoolRC { + let PrintMethod = "printVOPDst"; +} + +// SCSrc_i1 is the operand for pseudo instructions only. +// Boolean immediates shall not be exposed to codegen instructions. +def SCSrc_i1 : RegisterOperand<SReg_1_XEXEC> { + let OperandNamespace = "AMDGPU"; + let OperandType = "OPERAND_REG_IMM_INT32"; + let ParserMatchClass = BoolReg; + let DecoderMethod = "decodeBoolReg"; +} + // ===----------------------------------------------------------------------===// // ExpSrc* Special cases for exp src operands which are printed as // "off" depending on en operand.
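Among the operand changes above, SIMM16bit and UIMM16bit switch from PatLeaf to ImmLeaf wrapping the isInt<16>/isUInt<16> predicates. A self-contained sketch of what those predicates accept, not LLVM code (isIntN/isUIntN here are illustrative stand-ins mirroring llvm::isInt/llvm::isUInt semantics, valid for widths below 64):

#include <cstdint>
#include <cstdio>

// Signed N-bit range: [-2^(N-1), 2^(N-1)).
template <unsigned N> bool isIntN(int64_t X) {
  return X >= -(INT64_C(1) << (N - 1)) && X < (INT64_C(1) << (N - 1));
}
// Unsigned N-bit range: [0, 2^N).
template <unsigned N> bool isUIntN(uint64_t X) {
  return X < (UINT64_C(1) << N);
}

int main() {
  printf("%d %d\n", isIntN<16>(-32768), isUIntN<16>(65535)); // 1 1
  printf("%d %d\n", isIntN<16>(32768), isUIntN<16>(65536));  // 0 0
  return 0;
}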
@@ -662,11 +865,12 @@ def SDWASrc_i16 : SDWASrc<i16>; def SDWASrc_f32 : SDWASrc<f32>; def SDWASrc_f16 : SDWASrc<f16>; -def SDWAVopcDst : VOPDstOperand<SReg_64> { +def SDWAVopcDst : BoolRC { let OperandNamespace = "AMDGPU"; let OperandType = "OPERAND_SDWA_VOPC_DST"; let EncoderMethod = "getSDWAVopcDstEncoding"; let DecoderMethod = "decodeSDWAVopcDst"; + let PrintMethod = "printVOPDst"; } class NamedMatchClass<string CName, bit Optional = 1> : AsmOperandClass { @@ -688,21 +892,11 @@ class NamedOperandU8<string Name, AsmOperandClass MatchClass> : Operand<i8> { let ParserMatchClass = MatchClass; } -class NamedOperandU12<string Name, AsmOperandClass MatchClass> : Operand<i16> { - let PrintMethod = "print"#Name; - let ParserMatchClass = MatchClass; -} - class NamedOperandU16<string Name, AsmOperandClass MatchClass> : Operand<i16> { let PrintMethod = "print"#Name; let ParserMatchClass = MatchClass; } -class NamedOperandS13<string Name, AsmOperandClass MatchClass> : Operand<i16> { - let PrintMethod = "print"#Name; - let ParserMatchClass = MatchClass; -} - class NamedOperandU32<string Name, AsmOperandClass MatchClass> : Operand<i32> { let PrintMethod = "print"#Name; let ParserMatchClass = MatchClass; @@ -720,8 +914,7 @@ def offen : NamedOperandBit<"Offen", NamedMatchClass<"Offen">>; def idxen : NamedOperandBit<"Idxen", NamedMatchClass<"Idxen">>; def addr64 : NamedOperandBit<"Addr64", NamedMatchClass<"Addr64">>; -def offset_u12 : NamedOperandU12<"Offset", NamedMatchClass<"OffsetU12">>; -def offset_s13 : NamedOperandS13<"OffsetS13", NamedMatchClass<"OffsetS13">>; +def flat_offset : NamedOperandU16<"FlatOffset", NamedMatchClass<"FlatOffset">>; def offset : NamedOperandU16<"Offset", NamedMatchClass<"Offset">>; def offset0 : NamedOperandU8<"Offset0", NamedMatchClass<"Offset0">>; def offset1 : NamedOperandU8<"Offset1", NamedMatchClass<"Offset1">>; @@ -732,6 +925,7 @@ def omod : NamedOperandU32<"OModSI", NamedMatchClass<"OModSI">>; def clampmod : NamedOperandBit<"ClampSI", NamedMatchClass<"ClampSI">>; def highmod : NamedOperandBit<"High", NamedMatchClass<"High">>; +def DLC : NamedOperandBit<"DLC", NamedMatchClass<"DLC">>; def GLC : NamedOperandBit<"GLC", NamedMatchClass<"GLC">>; def SLC : NamedOperandBit<"SLC", NamedMatchClass<"SLC">>; def TFE : NamedOperandBit<"TFE", NamedMatchClass<"TFE">>; @@ -746,11 +940,15 @@ def exp_vm : NamedOperandBit<"ExpVM", NamedMatchClass<"ExpVM">>; def FORMAT : NamedOperandU8<"FORMAT", NamedMatchClass<"FORMAT">>; def DMask : NamedOperandU16<"DMask", NamedMatchClass<"DMask">>; +def Dim : NamedOperandU8<"Dim", NamedMatchClass<"Dim", 0>>; + +def dpp8 : NamedOperandU32<"DPP8", NamedMatchClass<"DPP8", 0>>; def dpp_ctrl : NamedOperandU32<"DPPCtrl", NamedMatchClass<"DPPCtrl", 0>>; def row_mask : NamedOperandU32<"RowMask", NamedMatchClass<"RowMask">>; def bank_mask : NamedOperandU32<"BankMask", NamedMatchClass<"BankMask">>; def bound_ctrl : NamedOperandBit<"BoundCtrl", NamedMatchClass<"BoundCtrl">>; +def FI : NamedOperandU32<"FI", NamedMatchClass<"FI">>; def dst_sel : NamedOperandU32<"SDWADstSel", NamedMatchClass<"SDWADstSel">>; def src0_sel : NamedOperandU32<"SDWASrc0Sel", NamedMatchClass<"SDWASrc0Sel">>; @@ -762,6 +960,10 @@ def op_sel_hi : NamedOperandU32Default0<"OpSelHi", NamedMatchClass<"OpSelHi">>; def neg_lo : NamedOperandU32Default0<"NegLo", NamedMatchClass<"NegLo">>; def neg_hi : NamedOperandU32Default0<"NegHi", NamedMatchClass<"NegHi">>; +def blgp : NamedOperandU32<"BLGP", NamedMatchClass<"BLGP">>; +def cbsz : NamedOperandU32<"CBSZ", NamedMatchClass<"CBSZ">>; +def abid : 
NamedOperandU32<"ABID", NamedMatchClass<"ABID">>; + def hwreg : NamedOperandU16<"Hwreg", NamedMatchClass<"Hwreg", 0>>; def exp_tgt : NamedOperandU8<"ExpTgt", NamedMatchClass<"ExpTgt", 0>> { @@ -793,9 +995,6 @@ def f32kimm : kimmOperand<i32>; def KImmFP16MatchClass : KImmMatchClass<16>; def f16kimm : kimmOperand<i16>; - -def VOPDstS64 : VOPDstOperand <SReg_64>; - class FPInputModsMatchClass <int opSize> : AsmOperandClass { let Name = "RegOrImmWithFP"#opSize#"InputMods"; let ParserMethod = "parseRegOrImmWithFPInputMods"; @@ -863,7 +1062,7 @@ def FP32SDWAInputMods : FPSDWAInputMods<FP32SDWAInputModsMatchClass>; def FPVRegInputModsMatchClass : AsmOperandClass { let Name = "VRegWithFPInputMods"; let ParserMethod = "parseRegWithFPInputMods"; - let PredicateMethod = "isVReg"; + let PredicateMethod = "isVReg32"; } def FPVRegInputMods : InputMods <FPVRegInputModsMatchClass> { @@ -890,7 +1089,7 @@ def Int32SDWAInputMods : IntSDWAInputMods<Int32SDWAInputModsMatchClass>; def IntVRegInputModsMatchClass : AsmOperandClass { let Name = "VRegWithIntInputMods"; let ParserMethod = "parseRegWithIntInputMods"; - let PredicateMethod = "isVReg"; + let PredicateMethod = "isVReg32"; } def IntVRegInputMods : InputMods <IntVRegInputModsMatchClass> { @@ -941,6 +1140,8 @@ def VOP3Mods : ComplexPattern<untyped, 2, "SelectVOP3Mods">; def VOP3NoMods : ComplexPattern<untyped, 1, "SelectVOP3NoMods">; // VOP3Mods, but the input source is known to never be NaN. def VOP3Mods_nnan : ComplexPattern<fAny, 2, "SelectVOP3Mods_NNaN">; +// VOP3Mods, but only allowed for f32 operands. +def VOP3Mods_f32 : ComplexPattern<fAny, 2, "SelectVOP3Mods_f32">; def VOP3OMods : ComplexPattern<untyped, 3, "SelectVOP3OMods">; @@ -995,6 +1196,31 @@ def TRAPID{ int LLVM_DEBUG_TRAP = 3; } +def HWREG { + int MODE = 1; + int STATUS = 2; + int TRAPSTS = 3; + int HW_ID = 4; + int GPR_ALLOC = 5; + int LDS_ALLOC = 6; + int IB_STS = 7; + int MEM_BASES = 15; + int TBA_LO = 16; + int TBA_HI = 17; + int TMA_LO = 18; + int TMA_HI = 19; + int FLAT_SCR_LO = 20; + int FLAT_SCR_HI = 21; + int XNACK_MASK = 22; + int POPS_PACKER = 25; +} + +class getHwRegImm<int Reg, int Offset = 0, int Size = 32> { + int ret = !or(Reg, + !or(!shl(Offset, 6), + !shl(!add(Size, -1), 11))); +} + //===----------------------------------------------------------------------===// // // SI Instruction multiclass helpers. 
@@ -1045,18 +1271,26 @@ multiclass EXP_m<bit done, SDPatternOperator node> { def _si : EXP_Helper<done>, SIMCInstr <"exp"#!if(done, "_done", ""), SIEncodingFamily.SI>, EXPe { - let AssemblerPredicates = [isSICI]; - let DecoderNamespace = "SICI"; + let AssemblerPredicates = [isGFX6GFX7]; + let DecoderNamespace = "GFX6GFX7"; let DisableDecoder = DisableSIDecoder; } def _vi : EXP_Helper<done>, SIMCInstr <"exp"#!if(done, "_done", ""), SIEncodingFamily.VI>, EXPe_vi { - let AssemblerPredicates = [isVI]; - let DecoderNamespace = "VI"; + let AssemblerPredicates = [isGFX8GFX9]; + let DecoderNamespace = "GFX8"; let DisableDecoder = DisableVIDecoder; } + + def _gfx10 : EXP_Helper<done>, + SIMCInstr <"exp"#!if(done, "_done", ""), SIEncodingFamily.GFX10>, + EXPe { + let AssemblerPredicates = [isGFX10Plus]; + let DecoderNamespace = "GFX10"; + let DisableDecoder = DisableSIDecoder; + } } } } @@ -1080,7 +1314,19 @@ class getVALUDstForVT<ValueType VT> { !if(!eq(VT.Size, 128), VOPDstOperand<VReg_128>, !if(!eq(VT.Size, 64), VOPDstOperand<VReg_64>, !if(!eq(VT.Size, 16), VOPDstOperand<VGPR_32>, - VOPDstOperand<SReg_64>)))); // else VT == i1 + VOPDstS64orS32)))); // else VT == i1 +} + +// Returns true if VT is floating point. +class getIsFP<ValueType VT> { + bit ret = !if(!eq(VT.Value, f16.Value), 1, + !if(!eq(VT.Value, v2f16.Value), 1, + !if(!eq(VT.Value, v4f16.Value), 1, + !if(!eq(VT.Value, f32.Value), 1, + !if(!eq(VT.Value, v2f32.Value), 1, + !if(!eq(VT.Value, f64.Value), 1, + !if(!eq(VT.Value, v2f64.Value), 1, + 0))))))); } // Returns the register class to use for the destination of VOP[12C] @@ -1094,11 +1340,7 @@ class getSDWADstForVT<ValueType VT> { // Returns the register class to use for source 0 of VOP[12C] // instructions for the given VT. class getVOPSrc0ForVT<ValueType VT> { - bit isFP = !if(!eq(VT.Value, f16.Value), 1, - !if(!eq(VT.Value, v2f16.Value), 1, - !if(!eq(VT.Value, f32.Value), 1, - !if(!eq(VT.Value, f64.Value), 1, - 0)))); + bit isFP = getIsFP<VT>.ret; RegisterOperand ret = !if(isFP, @@ -1107,8 +1349,11 @@ class getVOPSrc0ForVT<ValueType VT> { !if(!eq(VT.Value, f16.Value), VSrc_f16, !if(!eq(VT.Value, v2f16.Value), - VCSrc_v2f16, - VSrc_f32 + VSrc_v2f16, + !if(!eq(VT.Value, v4f16.Value), + AVSrc_64, + VSrc_f32 + ) ) ) ), @@ -1117,7 +1362,7 @@ class getVOPSrc0ForVT<ValueType VT> { !if(!eq(VT.Value, i16.Value), VSrc_b16, !if(!eq(VT.Value, v2i16.Value), - VCSrc_v2b16, + VSrc_v2b16, VSrc_b32 ) ) @@ -1132,9 +1377,7 @@ class getVregSrcForVT<ValueType VT> { } class getSDWASrcForVT <ValueType VT> { - bit isFP = !if(!eq(VT.Value, f16.Value), 1, - !if(!eq(VT.Value, f32.Value), 1, - 0)); + bit isFP = getIsFP<VT>.ret; RegisterOperand retFlt = !if(!eq(VT.Size, 16), SDWASrc_f16, SDWASrc_f32); RegisterOperand retInt = !if(!eq(VT.Size, 16), SDWASrc_i16, SDWASrc_i32); RegisterOperand ret = !if(isFP, retFlt, retInt); @@ -1143,33 +1386,32 @@ class getSDWASrcForVT <ValueType VT> { // Returns the register class to use for sources of VOP3 instructions for the // given VT. 
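// For example (a sketch of the results below): f32 -> VSrc_f32,
// f64 -> VSrc_f64, v2f16 -> VSrc_v2f16, v4f16 -> AVSrc_64 (a VGPR or AGPR
// pair, used by the MAI instructions), i1 -> SSrc_i1. The move below from
// the VCSrc_* constant-bus-only classes to VSrc_* is consistent with GFX10's
// ability to encode a 32-bit literal directly in a VOP3 encoding.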
class getVOP3SrcForVT<ValueType VT> { - bit isFP = !if(!eq(VT.Value, f16.Value), 1, - !if(!eq(VT.Value, v2f16.Value), 1, - !if(!eq(VT.Value, f32.Value), 1, - !if(!eq(VT.Value, f64.Value), 1, - 0)))); + bit isFP = getIsFP<VT>.ret; RegisterOperand ret = !if(!eq(VT.Size, 128), VSrc_128, !if(!eq(VT.Size, 64), !if(isFP, - VCSrc_f64, - VCSrc_b64), + VSrc_f64, + VSrc_b64), !if(!eq(VT.Value, i1.Value), - SCSrc_i1, + SSrc_i1, !if(isFP, !if(!eq(VT.Value, f16.Value), - VCSrc_f16, + VSrc_f16, !if(!eq(VT.Value, v2f16.Value), - VCSrc_v2f16, - VCSrc_f32 + VSrc_v2f16, + !if(!eq(VT.Value, v4f16.Value), + AVSrc_64, + VSrc_f32 + ) ) ), !if(!eq(VT.Value, i16.Value), - VCSrc_b16, + VSrc_b16, !if(!eq(VT.Value, v2i16.Value), - VCSrc_v2b16, - VCSrc_b32 + VSrc_v2b16, + VSrc_b32 ) ) ) @@ -1190,11 +1432,8 @@ class isModifierType<ValueType SrcVT> { } // Return type of input modifiers operand for specified input operand -class getSrcMod <ValueType VT> { - bit isFP = !if(!eq(VT.Value, f16.Value), 1, - !if(!eq(VT.Value, f32.Value), 1, - !if(!eq(VT.Value, f64.Value), 1, - 0))); +class getSrcMod <ValueType VT, bit EnableF32SrcMods> { + bit isFP = getIsFP<VT>.ret; bit isPacked = isPackedType<VT>.ret; Operand ret = !if(!eq(VT.Size, 64), !if(isFP, FP64InputMods, Int64InputMods), @@ -1203,7 +1442,7 @@ class getSrcMod <ValueType VT> { FP16InputMods, FP32InputMods ), - Int32InputMods) + !if(EnableF32SrcMods, FP32InputMods, Int32InputMods)) ); } @@ -1213,10 +1452,7 @@ class getOpSelMod <ValueType VT> { // Return type of input modifiers operand specified input operand for DPP class getSrcModExt <ValueType VT> { - bit isFP = !if(!eq(VT.Value, f16.Value), 1, - !if(!eq(VT.Value, f32.Value), 1, - !if(!eq(VT.Value, f64.Value), 1, - 0))); + bit isFP = getIsFP<VT>.ret; Operand ret = !if(isFP, FPVRegInputMods, IntVRegInputMods); } @@ -1238,7 +1474,7 @@ class getIns32 <RegisterOperand Src0RC, RegisterClass Src1RC, int NumSrcArgs> { // Returns the input arguments for VOP3 instructions for the given SrcVT. 
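// For example, a two-source f32 VOP3 with output modifiers yields (sketch):
//   (ins FP32InputMods:$src0_modifiers, VSrc_f32:$src0,
//        FP32InputMods:$src1_modifiers, VSrc_f32:$src1,
//        clampmod:$clamp, omod:$omod)
// The HasSrc2Mods leg introduced below keeps $src2 bare for three-source
// instructions whose tied accumulator operand accepts no source modifiers.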
class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC, RegisterOperand Src2RC, int NumSrcArgs, - bit HasIntClamp, bit HasModifiers, bit HasOMod, + bit HasIntClamp, bit HasModifiers, bit HasSrc2Mods, bit HasOMod, Operand Src0Mod, Operand Src1Mod, Operand Src2Mod> { dag ret = @@ -1276,16 +1512,33 @@ class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC, /* endif */ ) /* NumSrcArgs == 3 */, !if (!eq(HasModifiers, 1), - // VOP3 with modifiers - !if (!eq(HasOMod, 1), - (ins Src0Mod:$src0_modifiers, Src0RC:$src0, - Src1Mod:$src1_modifiers, Src1RC:$src1, - Src2Mod:$src2_modifiers, Src2RC:$src2, - clampmod:$clamp, omod:$omod), - (ins Src0Mod:$src0_modifiers, Src0RC:$src0, - Src1Mod:$src1_modifiers, Src1RC:$src1, - Src2Mod:$src2_modifiers, Src2RC:$src2, - clampmod:$clamp)) + !if (!eq(HasSrc2Mods, 1), + // VOP3 with modifiers + !if (!eq(HasOMod, 1), + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + Src1Mod:$src1_modifiers, Src1RC:$src1, + Src2Mod:$src2_modifiers, Src2RC:$src2, + clampmod:$clamp, omod:$omod), + !if (!eq(HasIntClamp, 1), + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + Src1Mod:$src1_modifiers, Src1RC:$src1, + Src2Mod:$src2_modifiers, Src2RC:$src2, + clampmod:$clamp), + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + Src1Mod:$src1_modifiers, Src1RC:$src1, + Src2Mod:$src2_modifiers, Src2RC:$src2))), + // VOP3 with modifiers except src2 + !if (!eq(HasOMod, 1), + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + Src1Mod:$src1_modifiers, Src1RC:$src1, + Src2RC:$src2, clampmod:$clamp, omod:$omod), + !if (!eq(HasIntClamp, 1), + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + Src1Mod:$src1_modifiers, Src1RC:$src1, + Src2RC:$src2, clampmod:$clamp), + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + Src1Mod:$src1_modifiers, Src1RC:$src1, + Src2RC:$src2)))) /* else */, // VOP3 without modifiers !if (!eq(HasIntClamp, 1), @@ -1398,6 +1651,42 @@ class getInsDPP <RegisterOperand DstRC, RegisterClass Src0RC, RegisterClass Src1 /* endif */))); } +class getInsDPP16 <RegisterOperand DstRC, RegisterClass Src0RC, RegisterClass Src1RC, + int NumSrcArgs, bit HasModifiers, + Operand Src0Mod, Operand Src1Mod> { + dag ret = !con(getInsDPP<DstRC, Src0RC, Src1RC, NumSrcArgs, + HasModifiers, Src0Mod, Src1Mod>.ret, + (ins FI:$fi)); +} + +class getInsDPP8 <RegisterOperand DstRC, RegisterClass Src0RC, RegisterClass Src1RC, + int NumSrcArgs, bit HasModifiers, + Operand Src0Mod, Operand Src1Mod> { + dag ret = !if (!eq(NumSrcArgs, 0), + // VOP1 without input operands (V_NOP) + (ins dpp8:$dpp8, FI:$fi), + !if (!eq(NumSrcArgs, 1), + !if (!eq(HasModifiers, 1), + // VOP1_DPP with modifiers + (ins DstRC:$old, Src0Mod:$src0_modifiers, + Src0RC:$src0, dpp8:$dpp8, FI:$fi) + /* else */, + // VOP1_DPP without modifiers + (ins DstRC:$old, Src0RC:$src0, dpp8:$dpp8, FI:$fi) + /* endif */) + /* NumSrcArgs == 2 */, + !if (!eq(HasModifiers, 1), + // VOP2_DPP with modifiers + (ins DstRC:$old, + Src0Mod:$src0_modifiers, Src0RC:$src0, + Src1Mod:$src1_modifiers, Src1RC:$src1, + dpp8:$dpp8, FI:$fi) + /* else */, + // VOP2_DPP without modifiers + (ins DstRC:$old, + Src0RC:$src0, Src1RC:$src1, dpp8:$dpp8, FI:$fi) + /* endif */))); +} // Ins for SDWA @@ -1556,6 +1845,26 @@ class getAsmDPP <bit HasDst, int NumSrcArgs, bit HasModifiers, ValueType DstVT = string ret = dst#args#" $dpp_ctrl$row_mask$bank_mask$bound_ctrl"; } +class getAsmDPP16 <bit HasDst, int NumSrcArgs, bit HasModifiers, ValueType DstVT = i32> { + string ret = getAsmDPP<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret#"$fi"; +} + +class getAsmDPP8 <bit HasDst, int NumSrcArgs, bit 
HasModifiers, ValueType DstVT = i32> { + string dst = !if(HasDst, + !if(!eq(DstVT.Size, 1), + "$sdst", + "$vdst"), + ""); // use $sdst for VOPC + string src0 = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,"); + string src1 = !if(!eq(NumSrcArgs, 1), "", + !if(!eq(NumSrcArgs, 2), " $src1_modifiers", + " $src1_modifiers,")); + string args = !if(!eq(HasModifiers, 0), + getAsm32<0, NumSrcArgs, DstVT>.ret, + ", "#src0#src1); + string ret = dst#args#"$dpp8$fi"; +} + class getAsmSDWA <bit HasDst, int NumSrcArgs, ValueType DstVT = i32> { string dst = !if(HasDst, !if(!eq(DstVT.Size, 1), @@ -1650,9 +1959,12 @@ def PatGenMode { int Pattern = 1; } -class VOPProfile <list<ValueType> _ArgVT> { +class VOPProfile <list<ValueType> _ArgVT, bit _EnableF32SrcMods = 0, + bit _EnableClamp = 0> { field list<ValueType> ArgVT = _ArgVT; + field bit EnableF32SrcMods = _EnableF32SrcMods; + field bit EnableClamp = _EnableClamp; field ValueType DstVT = ArgVT[0]; field ValueType Src0VT = ArgVT[1]; @@ -1670,9 +1982,9 @@ class VOPProfile <list<ValueType> _ArgVT> { field RegisterClass Src1DPP = getVregSrcForVT<Src1VT>.ret; field RegisterOperand Src0SDWA = getSDWASrcForVT<Src0VT>.ret; field RegisterOperand Src1SDWA = getSDWASrcForVT<Src0VT>.ret; - field Operand Src0Mod = getSrcMod<Src0VT>.ret; - field Operand Src1Mod = getSrcMod<Src1VT>.ret; - field Operand Src2Mod = getSrcMod<Src2VT>.ret; + field Operand Src0Mod = getSrcMod<Src0VT, EnableF32SrcMods>.ret; + field Operand Src1Mod = getSrcMod<Src1VT, EnableF32SrcMods>.ret; + field Operand Src2Mod = getSrcMod<Src2VT, EnableF32SrcMods>.ret; field Operand Src0ModDPP = getSrcModExt<Src0VT>.ret; field Operand Src1ModDPP = getSrcModExt<Src1VT>.ret; field Operand Src0ModSDWA = getSrcModSDWA<Src0VT>.ret; @@ -1688,12 +2000,16 @@ class VOPProfile <list<ValueType> _ArgVT> { field bit HasSrc2 = !if(!eq(Src2VT.Value, untyped.Value), 0, 1); // TODO: Modifiers logic is somewhat adhoc here, to be refined later - field bit HasModifiers = isModifierType<Src0VT>.ret; + // HasModifiers affects the normal and DPP encodings. We take note of EnableF32SrcMods, which + // enables modifiers for i32 type. + field bit HasModifiers = BitOr<isModifierType<Src0VT>.ret, EnableF32SrcMods>.ret; + // HasSrc*FloatMods affects the SDWA encoding. We ignore EnableF32SrcMods. field bit HasSrc0FloatMods = isFloatType<Src0VT>.ret; field bit HasSrc1FloatMods = isFloatType<Src1VT>.ret; field bit HasSrc2FloatMods = isFloatType<Src2VT>.ret; + // HasSrc*IntMods affects the SDWA encoding. We ignore EnableF32SrcMods. 
field bit HasSrc0IntMods = isIntType<Src0VT>.ret; field bit HasSrc1IntMods = isIntType<Src1VT>.ret; field bit HasSrc2IntMods = isIntType<Src2VT>.ret; @@ -1702,7 +2018,7 @@ class VOPProfile <list<ValueType> _ArgVT> { field bit HasSrc1Mods = !if(HasModifiers, BitOr<HasSrc1FloatMods, HasSrc1IntMods>.ret, 0); field bit HasSrc2Mods = !if(HasModifiers, BitOr<HasSrc2FloatMods, HasSrc2IntMods>.ret, 0); - field bit HasClamp = HasModifiers; + field bit HasClamp = BitOr<isModifierType<Src0VT>.ret, EnableClamp>.ret; field bit HasSDWAClamp = EmitDst; field bit HasFPClamp = BitAnd<isFloatType<DstVT>.ret, HasClamp>.ret; field bit HasIntClamp = !if(isFloatType<DstVT>.ret, 0, HasClamp); @@ -1721,6 +2037,8 @@ class VOPProfile <list<ValueType> _ArgVT> { field bit HasExtSDWA9 = HasExt; field int NeedPatGen = PatGenMode.NoPattern; + field bit IsMAI = 0; + field Operand Src0PackedMod = !if(HasSrc0FloatMods, PackedF16InputMods, PackedI16InputMods); field Operand Src1PackedMod = !if(HasSrc1FloatMods, PackedF16InputMods, PackedI16InputMods); field Operand Src2PackedMod = !if(HasSrc2FloatMods, PackedF16InputMods, PackedI16InputMods); @@ -1732,12 +2050,13 @@ class VOPProfile <list<ValueType> _ArgVT> { field dag Outs32 = Outs; field dag Outs64 = Outs; field dag OutsDPP = getOutsExt<HasDst, DstVT, DstRCDPP>.ret; + field dag OutsDPP8 = getOutsExt<HasDst, DstVT, DstRCDPP>.ret; field dag OutsSDWA = getOutsSDWA<HasDst, DstVT, DstRCSDWA>.ret; field dag Ins32 = getIns32<Src0RC32, Src1RC32, NumSrcArgs>.ret; field dag Ins64 = getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs, - HasIntClamp, HasModifiers, HasOMod, Src0Mod, Src1Mod, - Src2Mod>.ret; + HasIntClamp, HasModifiers, HasSrc2Mods, + HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret; field dag InsVOP3P = getInsVOP3P<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs, HasClamp, Src0PackedMod, Src1PackedMod, Src2PackedMod>.ret; @@ -1751,6 +2070,10 @@ class VOPProfile <list<ValueType> _ArgVT> { getInsDPP<DstRCDPP, Src0DPP, Src1DPP, NumSrcArgs, HasModifiers, Src0ModDPP, Src1ModDPP>.ret, (ins)); + field dag InsDPP16 = getInsDPP16<DstRCDPP, Src0DPP, Src1DPP, NumSrcArgs, + HasModifiers, Src0ModDPP, Src1ModDPP>.ret; + field dag InsDPP8 = getInsDPP8<DstRCDPP, Src0DPP, Src1DPP, NumSrcArgs, 0, + Src0ModDPP, Src1ModDPP>.ret; field dag InsSDWA = getInsSDWA<Src0SDWA, Src1SDWA, NumSrcArgs, HasSDWAOMod, Src0ModSDWA, Src1ModSDWA, DstVT>.ret; @@ -1766,8 +2089,12 @@ class VOPProfile <list<ValueType> _ArgVT> { HasSrc2FloatMods>.ret; field string AsmDPP = !if(HasExtDPP, getAsmDPP<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret, ""); + field string AsmDPP16 = getAsmDPP16<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret; + field string AsmDPP8 = getAsmDPP8<HasDst, NumSrcArgs, 0, DstVT>.ret; field string AsmSDWA = getAsmSDWA<HasDst, NumSrcArgs, DstVT>.ret; field string AsmSDWA9 = getAsmSDWA9<HasDst, HasSDWAOMod, NumSrcArgs, DstVT>.ret; + + field string TieRegDPP = "$old"; } class VOP_NO_EXT <VOPProfile p> : VOPProfile <p.ArgVT> { @@ -1828,6 +2155,7 @@ def VOP_F64_F64_I32 : VOPProfile <[f64, f64, i32, untyped]>; def VOP_I32_F32_F32 : VOPProfile <[i32, f32, f32, untyped]>; def VOP_I32_F32_I32 : VOPProfile <[i32, f32, i32, untyped]>; def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, untyped]>; +def VOP_I32_I32_I32_ARITH : VOPProfile <[i32, i32, i32, untyped], 0, /*EnableClamp=*/1>; def VOP_V2F16_F32_F32 : VOPProfile <[v2f16, f32, f32, untyped]>; def VOP_F32_F16_F16_F16 : VOPProfile <[f32, f16, f16, f16]>; @@ -1848,6 +2176,19 @@ def VOP_V4I32_I64_I32_V4I32 : VOPProfile <[v4i32, i64, i32, v4i32]>; def VOP_F32_V2F16_V2F16_F32 
: VOPProfile <[f32, v2f16, v2f16, f32]>; def VOP_I32_V2I16_V2I16_I32 : VOPProfile <[i32, v2i16, v2i16, i32]>; +def VOP_V4F32_F32_F32_V4F32 : VOPProfile <[v4f32, f32, f32, v4f32]>; +def VOP_V16F32_F32_F32_V16F32 : VOPProfile <[v16f32, f32, f32, v16f32]>; +def VOP_V32F32_F32_F32_V32F32 : VOPProfile <[v32f32, f32, f32, v32f32]>; +def VOP_V4F32_V4F16_V4F16_V4F32 : VOPProfile <[v4f32, v4f16, v4f16, v4f32]>; +def VOP_V16F32_V4F16_V4F16_V16F32 : VOPProfile <[v16f32, v4f16, v4f16, v16f32]>; +def VOP_V32F32_V4F16_V4F16_V32F32 : VOPProfile <[v32f32, v4f16, v4f16, v32f32]>; +def VOP_V4F32_V2I16_V2I16_V4F32 : VOPProfile <[v4f32, v2i16, v2i16, v4f32]>; +def VOP_V16F32_V2I16_V2I16_V16F32 : VOPProfile <[v16f32, v2i16, v2i16, v16f32]>; +def VOP_V32F32_V2I16_V2I16_V32F32 : VOPProfile <[v32f32, v2i16, v2i16, v32f32]>; +def VOP_V4I32_I32_I32_V4I32 : VOPProfile <[v4i32, i32, i32, v4i32]>; +def VOP_V16I32_I32_I32_V16I32 : VOPProfile <[v16i32, i32, i32, v16i32]>; +def VOP_V32I32_I32_I32_V32I32 : VOPProfile <[v32i32, i32, i32, v32i32]>; + class Commutable_REV <string revOp, bit isOrig> { string RevOp = revOp; bit IsOrig = isOrig; @@ -1871,13 +2212,12 @@ class VINTRP_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : let isCodeGenOnly = 1; } +// FIXME-GFX10: WIP. class VINTRP_Real_si <bits <2> op, string opName, dag outs, dag ins, - string asm> : + string asm, int encodingFamily> : VINTRPCommon <outs, ins, asm, []>, VINTRPe <op>, - SIMCInstr<opName, SIEncodingFamily.SI> { - let AssemblerPredicate = SIAssemblerPredicate; - let DecoderNamespace = "SICI"; + SIMCInstr<opName, encodingFamily> { let DisableDecoder = DisableSIDecoder; } @@ -1887,19 +2227,25 @@ class VINTRP_Real_vi <bits <2> op, string opName, dag outs, dag ins, VINTRPe_vi <op>, SIMCInstr<opName, SIEncodingFamily.VI> { let AssemblerPredicate = VIAssemblerPredicate; - let DecoderNamespace = "VI"; + let DecoderNamespace = "GFX8"; let DisableDecoder = DisableVIDecoder; } +// FIXME-GFX10: WIP. multiclass VINTRP_m <bits <2> op, dag outs, dag ins, string asm, list<dag> pattern = []> { def "" : VINTRP_Pseudo <NAME, outs, ins, pattern>; - def _si : VINTRP_Real_si <op, NAME, outs, ins, asm>; + let AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" in { + def _si : VINTRP_Real_si <op, NAME, outs, ins, asm, SIEncodingFamily.SI>; + } // End AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" def _vi : VINTRP_Real_vi <op, NAME, outs, ins, asm>; -} + let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { + def _gfx10 : VINTRP_Real_si<op, NAME, outs, ins, asm, SIEncodingFamily.GFX10>; + } // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" +} //===----------------------------------------------------------------------===// // Vector instruction mappings //===----------------------------------------------------------------------===// @@ -1981,7 +2327,9 @@ def getMCOpcodeGen : InstrMapping { // does not actually change the encoding, and thus may be // removed later. [!cast<string>(SIEncodingFamily.GFX80)], - [!cast<string>(SIEncodingFamily.GFX9)]]; + [!cast<string>(SIEncodingFamily.GFX9)], + [!cast<string>(SIEncodingFamily.GFX10)], + [!cast<string>(SIEncodingFamily.SDWA10)]]; } // Get equivalent SOPK instruction. @@ -2044,6 +2392,24 @@ def getGlobalSaddrOp : InstrMapping { let ValueCols = [["1"]]; } +// Maps a v_cmpx opcode with sdst to opcode without sdst. 
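// TableGen emits a C++ lookup for each InstrMapping; the expected use of the
// one below is roughly (a sketch, assuming the usual generated signature):
//   int NewOpc = AMDGPU::getVCMPXNoSDstOp(MI.getOpcode()); // -1 if unmapped
// letting GFX10 code replace a v_cmpx that carries an explicit sdst with the
// form that writes only exec.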
+def getVCMPXNoSDstOp : InstrMapping { + let FilterClass = "VCMPXNoSDstTable"; + let RowFields = ["NoSDstOp"]; + let ColFields = ["HasSDst"]; + let KeyCol = ["1"]; + let ValueCols = [["0"]]; +} + +// Maps a SOPP to a SOPP with S_NOP +def getSOPPWithRelaxation : InstrMapping { + let FilterClass = "Base_SOPP"; + let RowFields = ["AsmString"]; + let ColFields = ["Size"]; + let KeyCol = ["4"]; + let ValueCols = [["8"]]; +} + include "SIInstructions.td" include "DSInstructions.td" diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td index b6b00c2e4257..70f20bb69370 100644 --- a/lib/Target/AMDGPU/SIInstructions.td +++ b/lib/Target/AMDGPU/SIInstructions.td @@ -1,9 +1,8 @@ //===-- SIInstructions.td - SI Instruction Defintions ---------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // This file was originally auto-generated from a GPU register header file and @@ -12,7 +11,7 @@ //===----------------------------------------------------------------------===// class GCNPat<dag pattern, dag result> : Pat<pattern, result>, GCNPredicateControl { - let SubtargetPredicate = isGCN; + } include "SOPInstructions.td" @@ -122,7 +121,14 @@ def WWM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>; } // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] -def EXIT_WWM : SPseudoInstSI <(outs SReg_64:$sdst), (ins SReg_64:$src0)> { +def ENTER_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins i64imm:$src0)> { + let Defs = [EXEC]; + let hasSideEffects = 0; + let mayLoad = 0; + let mayStore = 0; +} + +def EXIT_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins SReg_1:$src0)> { let hasSideEffects = 0; let mayLoad = 0; let mayStore = 0; @@ -155,13 +161,12 @@ def S_SUB_U64_PSEUDO : SPseudoInstSI < >; def S_ADD_U64_CO_PSEUDO : SPseudoInstSI < - (outs SReg_64:$vdst, VOPDstS64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1) + (outs SReg_64:$vdst, VOPDstS64orS32:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1) >; def S_SUB_U64_CO_PSEUDO : SPseudoInstSI < - (outs SReg_64:$vdst, VOPDstS64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1) + (outs SReg_64:$vdst, VOPDstS64orS32:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1) >; - } // End usesCustomInserter = 1, Defs = [SCC] let usesCustomInserter = 1 in { @@ -169,23 +174,30 @@ def GET_GROUPSTATICSIZE : SPseudoInstSI <(outs SReg_32:$sdst), (ins), [(set SReg_32:$sdst, (int_amdgcn_groupstaticsize))]>; } // End let usesCustomInserter = 1, SALU = 1 -def S_MOV_B64_term : SPseudoInstSI<(outs SReg_64:$dst), - (ins SSrc_b64:$src0)> { - let isAsCheapAsAMove = 1; +// Wrap an instruction by duplicating it, except for setting isTerminator. 
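// For instance, WrapTerminatorInst<S_MOV_B64> below yields a terminator twin
// of S_MOV_B64 that inherits its operands, Uses/Defs, and flags. Marking
// these exec-mask updates as terminators keeps the register allocator from
// inserting spill or copy code between them and the end of the block.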
+class WrapTerminatorInst<SOP_Pseudo base_inst> : SPseudoInstSI< + base_inst.OutOperandList, + base_inst.InOperandList> { + let Uses = base_inst.Uses; + let Defs = base_inst.Defs; let isTerminator = 1; + let isAsCheapAsAMove = base_inst.isAsCheapAsAMove; + let hasSideEffects = base_inst.hasSideEffects; + let UseNamedOperandTable = base_inst.UseNamedOperandTable; + let CodeSize = base_inst.CodeSize; } -def S_XOR_B64_term : SPseudoInstSI<(outs SReg_64:$dst), - (ins SSrc_b64:$src0, SSrc_b64:$src1)> { - let isAsCheapAsAMove = 1; - let isTerminator = 1; - let Defs = [SCC]; +let WaveSizePredicate = isWave64 in { +def S_MOV_B64_term : WrapTerminatorInst<S_MOV_B64>; +def S_XOR_B64_term : WrapTerminatorInst<S_XOR_B64>; +def S_ANDN2_B64_term : WrapTerminatorInst<S_ANDN2_B64>; } -def S_ANDN2_B64_term : SPseudoInstSI<(outs SReg_64:$dst), - (ins SSrc_b64:$src0, SSrc_b64:$src1)> { - let isAsCheapAsAMove = 1; - let isTerminator = 1; +let WaveSizePredicate = isWave32 in { +def S_MOV_B32_term : WrapTerminatorInst<S_MOV_B32>; +def S_XOR_B32_term : WrapTerminatorInst<S_XOR_B32>; +def S_OR_B32_term : WrapTerminatorInst<S_OR_B32>; +def S_ANDN2_B32_term : WrapTerminatorInst<S_ANDN2_B32>; } def WAVE_BARRIER : SPseudoInstSI<(outs), (ins), @@ -195,7 +207,6 @@ def WAVE_BARRIER : SPseudoInstSI<(outs), (ins), let hasSideEffects = 1; let mayLoad = 1; let mayStore = 1; - let isBarrier = 1; let isConvergent = 1; let FixedSize = 1; let Size = 0; @@ -222,30 +233,30 @@ let isTerminator = 1 in { let OtherPredicates = [EnableLateCFGStructurize] in { def SI_NON_UNIFORM_BRCOND_PSEUDO : CFPseudoInstSI < (outs), - (ins SReg_64:$vcc, brtarget:$target), + (ins SReg_1:$vcc, brtarget:$target), [(brcond i1:$vcc, bb:$target)]> { let Size = 12; } } def SI_IF: CFPseudoInstSI < - (outs SReg_64:$dst), (ins SReg_64:$vcc, brtarget:$target), - [(set i64:$dst, (AMDGPUif i1:$vcc, bb:$target))], 1, 1> { + (outs SReg_1:$dst), (ins SReg_1:$vcc, brtarget:$target), + [(set i1:$dst, (AMDGPUif i1:$vcc, bb:$target))], 1, 1> { let Constraints = ""; let Size = 12; let hasSideEffects = 1; } def SI_ELSE : CFPseudoInstSI < - (outs SReg_64:$dst), - (ins SReg_64:$src, brtarget:$target, i1imm:$execfix), [], 1, 1> { + (outs SReg_1:$dst), + (ins SReg_1:$src, brtarget:$target, i1imm:$execfix), [], 1, 1> { let Size = 12; let hasSideEffects = 1; } def SI_LOOP : CFPseudoInstSI < - (outs), (ins SReg_64:$saved, brtarget:$target), - [(AMDGPUloop i64:$saved, bb:$target)], 1, 1> { + (outs), (ins SReg_1:$saved, brtarget:$target), + [(AMDGPUloop i1:$saved, bb:$target)], 1, 1> { let Size = 8; let isBranch = 1; let hasSideEffects = 1; @@ -254,8 +265,7 @@ def SI_LOOP : CFPseudoInstSI < } // End isTerminator = 1 def SI_END_CF : CFPseudoInstSI < - (outs), (ins SReg_64:$saved), - [(int_amdgcn_end_cf i64:$saved)], 1, 1> { + (outs), (ins SReg_1:$saved), [], 1, 1> { let Size = 4; let isAsCheapAsAMove = 1; let isReMaterializable = 1; @@ -265,8 +275,7 @@ def SI_END_CF : CFPseudoInstSI < } def SI_IF_BREAK : CFPseudoInstSI < - (outs SReg_64:$dst), (ins SReg_64:$vcc, SReg_64:$src), - [(set i64:$dst, (int_amdgcn_if_break i1:$vcc, i64:$src))]> { + (outs SReg_1:$dst), (ins SReg_1:$vcc, SReg_1:$src), []> { let Size = 4; let isAsCheapAsAMove = 1; let isReMaterializable = 1; @@ -292,7 +301,7 @@ multiclass PseudoInstKill <dag ins> { } } -defm SI_KILL_I1 : PseudoInstKill <(ins SSrc_b64:$src, i1imm:$killvalue)>; +defm SI_KILL_I1 : PseudoInstKill <(ins SCSrc_i1:$src, i1imm:$killvalue)>; defm SI_KILL_F32_COND_IMM : PseudoInstKill <(ins VSrc_b32:$src0, i32imm:$src1, i32imm:$cond)>; let Defs = 
[EXEC,VCC] in @@ -311,7 +320,7 @@ def SI_BR_UNDEF : SPseudoInstSI <(outs), (ins sopp_brtarget:$simm16)> { } def SI_PS_LIVE : PseudoInstSI < - (outs SReg_64:$dst), (ins), + (outs SReg_1:$dst), (ins), [(set i1:$dst, (int_amdgcn_ps_live))]> { let SALU = 1; } @@ -340,6 +349,15 @@ def SI_INIT_EXEC : SPseudoInstSI < let Defs = [EXEC]; let usesCustomInserter = 1; let isAsCheapAsAMove = 1; + let WaveSizePredicate = isWave64; +} + +def SI_INIT_EXEC_LO : SPseudoInstSI < + (outs), (ins i32imm:$src), []> { + let Defs = [EXEC_LO]; + let usesCustomInserter = 1; + let isAsCheapAsAMove = 1; + let WaveSizePredicate = isWave32; } def SI_INIT_EXEC_FROM_INPUT : SPseudoInstSI < @@ -374,11 +392,14 @@ def SI_RETURN : SPseudoInstSI < // This version is only needed so we can fill in the output register in // the custom inserter. def SI_CALL_ISEL : SPseudoInstSI < - (outs), (ins SSrc_b64:$src0), [(AMDGPUcall i64:$src0)]> { + (outs), (ins SSrc_b64:$src0, unknown:$callee), + [(AMDGPUcall i64:$src0, tglobaladdr:$callee)]> { let Size = 4; let isCall = 1; let SchedRW = [WriteBranch]; let usesCustomInserter = 1; + // TODO: Should really base this on the call target + let isConvergent = 1; } // Wrapper around s_swappc_b64 with extra $callee parameter to track @@ -389,23 +410,14 @@ def SI_CALL : SPseudoInstSI < let isCall = 1; let UseNamedOperandTable = 1; let SchedRW = [WriteBranch]; + // TODO: Should really base this on the call target + let isConvergent = 1; } // Tail call handling pseudo -def SI_TCRETURN_ISEL : SPseudoInstSI<(outs), - (ins SSrc_b64:$src0, i32imm:$fpdiff), - [(AMDGPUtc_return i64:$src0, i32:$fpdiff)]> { - let isCall = 1; - let isTerminator = 1; - let isReturn = 1; - let isBarrier = 1; - let SchedRW = [WriteBranch]; - let usesCustomInserter = 1; -} - -def SI_TCRETURN : SPseudoInstSI < - (outs), - (ins SSrc_b64:$src0, unknown:$callee, i32imm:$fpdiff)> { +def SI_TCRETURN : SPseudoInstSI <(outs), + (ins SSrc_b64:$src0, unknown:$callee, i32imm:$fpdiff), + [(AMDGPUtc_return i64:$src0, tglobaladdr:$callee, i32:$fpdiff)]> { let Size = 4; let isCall = 1; let isTerminator = 1; @@ -413,6 +425,8 @@ def SI_TCRETURN : SPseudoInstSI < let isBarrier = 1; let UseNamedOperandTable = 1; let SchedRW = [WriteBranch]; + // TODO: Should really base this on the call target + let isConvergent = 1; } @@ -424,6 +438,8 @@ def ADJCALLSTACKUP : SPseudoInstSI< let FixedSize = 1; let hasSideEffects = 1; let usesCustomInserter = 1; + let SchedRW = [WriteSALU]; + let Defs = [SCC]; } def ADJCALLSTACKDOWN : SPseudoInstSI< @@ -433,6 +449,8 @@ def ADJCALLSTACKDOWN : SPseudoInstSI< let Size = 8; // Worst case. (s_add_u32 + constant) let hasSideEffects = 1; let usesCustomInserter = 1; + let SchedRW = [WriteSALU]; + let Defs = [SCC]; } let Defs = [M0, EXEC, SCC], @@ -490,9 +508,12 @@ multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> { // SI_SPILL_32_* instructions. 
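// Worked example for the Size clamp used in the VGPR/AGPR spill multiclasses
// below: VReg_512 is 512 bits -> 512>>5 = 16 dword subregs, so (16<<3)+8 =
// 136 bytes, which fits; VReg_1024 gives (32<<3)+8 = 264, which overflows the
// unsigned char Size field, hence the !if(...) clamp to 252.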
defm SI_SPILL_S32 : SI_SPILL_SGPR <SReg_32>; defm SI_SPILL_S64 : SI_SPILL_SGPR <SReg_64>; +defm SI_SPILL_S96 : SI_SPILL_SGPR <SReg_96>; defm SI_SPILL_S128 : SI_SPILL_SGPR <SReg_128>; +defm SI_SPILL_S160 : SI_SPILL_SGPR <SReg_160>; defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>; defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>; +defm SI_SPILL_S1024 : SI_SPILL_SGPR <SReg_1024>; multiclass SI_SPILL_VGPR <RegisterClass vgpr_class> { let UseNamedOperandTable = 1, VGPRSpill = 1, @@ -504,7 +525,9 @@ multiclass SI_SPILL_VGPR <RegisterClass vgpr_class> { let mayStore = 1; let mayLoad = 0; // (2 * 4) + (8 * num_subregs) bytes maximum - let Size = !add(!shl(!srl(vgpr_class.Size, 5), 3), 8); + int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), 3), 8); + // Size field is unsigned char and cannot fit more. + let Size = !if(!le(MaxSize, 256), MaxSize, 252); } def _RESTORE : VPseudoInstSI < @@ -515,7 +538,9 @@ multiclass SI_SPILL_VGPR <RegisterClass vgpr_class> { let mayLoad = 1; // (2 * 4) + (8 * num_subregs) bytes maximum - let Size = !add(!shl(!srl(vgpr_class.Size, 5), 3), 8); + int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), 3), 8); + // Size field is unsigned char and cannot fit more. + let Size = !if(!le(MaxSize, 256), MaxSize, 252); } } // End UseNamedOperandTable = 1, VGPRSpill = 1, SchedRW = [WriteVMEM] } @@ -524,21 +549,74 @@ defm SI_SPILL_V32 : SI_SPILL_VGPR <VGPR_32>; defm SI_SPILL_V64 : SI_SPILL_VGPR <VReg_64>; defm SI_SPILL_V96 : SI_SPILL_VGPR <VReg_96>; defm SI_SPILL_V128 : SI_SPILL_VGPR <VReg_128>; +defm SI_SPILL_V160 : SI_SPILL_VGPR <VReg_160>; defm SI_SPILL_V256 : SI_SPILL_VGPR <VReg_256>; defm SI_SPILL_V512 : SI_SPILL_VGPR <VReg_512>; +defm SI_SPILL_V1024 : SI_SPILL_VGPR <VReg_1024>; + +multiclass SI_SPILL_AGPR <RegisterClass vgpr_class> { + let UseNamedOperandTable = 1, VGPRSpill = 1, + Constraints = "@earlyclobber $tmp", + SchedRW = [WriteVMEM] in { + def _SAVE : VPseudoInstSI < + (outs VGPR_32:$tmp), + (ins vgpr_class:$vdata, i32imm:$vaddr, SReg_128:$srsrc, + SReg_32:$soffset, i32imm:$offset)> { + let mayStore = 1; + let mayLoad = 0; + // (2 * 4) + (16 * num_subregs) bytes maximum + int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), 4), 8); + // Size field is unsigned char and cannot fit more. + let Size = !if(!le(MaxSize, 256), MaxSize, 252); + } + + def _RESTORE : VPseudoInstSI < + (outs vgpr_class:$vdata, VGPR_32:$tmp), + (ins i32imm:$vaddr, SReg_128:$srsrc, SReg_32:$soffset, + i32imm:$offset)> { + let mayStore = 0; + let mayLoad = 1; + + // (2 * 4) + (16 * num_subregs) bytes maximum + int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), 4), 8); + // Size field is unsigned char and cannot fit more. 
+ let Size = !if(!le(MaxSize, 256), MaxSize, 252); + } + } // End UseNamedOperandTable = 1, VGPRSpill = 1, SchedRW = [WriteVMEM] +} + +defm SI_SPILL_A32 : SI_SPILL_AGPR <AGPR_32>; +defm SI_SPILL_A64 : SI_SPILL_AGPR <AReg_64>; +defm SI_SPILL_A128 : SI_SPILL_AGPR <AReg_128>; +defm SI_SPILL_A512 : SI_SPILL_AGPR <AReg_512>; +defm SI_SPILL_A1024 : SI_SPILL_AGPR <AReg_1024>; def SI_PC_ADD_REL_OFFSET : SPseudoInstSI < (outs SReg_64:$dst), (ins si_ga:$ptr_lo, si_ga:$ptr_hi), [(set SReg_64:$dst, - (i64 (SIpc_add_rel_offset (tglobaladdr:$ptr_lo), (tglobaladdr:$ptr_hi))))]> { + (i64 (SIpc_add_rel_offset tglobaladdr:$ptr_lo, tglobaladdr:$ptr_hi)))]> { let Defs = [SCC]; } def : GCNPat < + (SIpc_add_rel_offset tglobaladdr:$ptr_lo, 0), + (SI_PC_ADD_REL_OFFSET $ptr_lo, (i32 0)) +>; + +def : GCNPat < (AMDGPUinit_exec i64:$src), (SI_INIT_EXEC (as_i64imm $src)) ->; +> { + let WaveSizePredicate = isWave64; +} + +def : GCNPat < + (AMDGPUinit_exec i64:$src), + (SI_INIT_EXEC_LO (as_i32imm $src)) +> { + let WaveSizePredicate = isWave32; +} def : GCNPat < (AMDGPUinit_exec_from_input i32:$input, i32:$shift), @@ -551,7 +629,7 @@ def : GCNPat< >; def : GCNPat< - (AMDGPUelse i64:$src, bb:$target), + (AMDGPUelse i1:$src, bb:$target), (SI_ELSE $src, $target, 0) >; @@ -584,7 +662,12 @@ def : Pat < // TODO: we could add more variants for other types of conditionals def : Pat < - (int_amdgcn_icmp i1:$src, (i1 0), (i32 33)), + (i64 (int_amdgcn_icmp i1:$src, (i1 0), (i32 33))), + (COPY $src) // Return the SGPRs representing i1 src +>; + +def : Pat < + (i32 (int_amdgcn_icmp i1:$src, (i1 0), (i32 33))), (COPY $src) // Return the SGPRs representing i1 src >; @@ -592,7 +675,7 @@ def : Pat < // VOP1 Patterns //===----------------------------------------------------------------------===// -let SubtargetPredicate = isGCN, OtherPredicates = [UnsafeFPMath] in { +let OtherPredicates = [UnsafeFPMath] in { //def : RcpPat<V_RCP_F64_e32, f64>; //defm : RsqPat<V_RSQ_F64_e32, f64>; @@ -615,7 +698,7 @@ def : GCNPat < (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE) >; -} // End SubtargetPredicate = isGCN, OtherPredicates = [UnsafeFPMath] +} // End OtherPredicates = [UnsafeFPMath] // f16_to_fp patterns @@ -706,17 +789,18 @@ def : FMADModsPat<V_MAD_F16, AMDGPUfmad_ftz, f16> { let SubtargetPredicate = Has16BitInsts; } -multiclass SelectPat <ValueType vt, Instruction inst> { +multiclass SelectPat <ValueType vt> { def : GCNPat < - (vt (select i1:$src0, vt:$src1, vt:$src2)), - (inst $src2, $src1, $src0) + (vt (select i1:$src0, (VOP3Mods_f32 vt:$src1, i32:$src1_mods), + (VOP3Mods_f32 vt:$src2, i32:$src2_mods))), + (V_CNDMASK_B32_e64 $src2_mods, $src2, $src1_mods, $src1, $src0) >; } -defm : SelectPat <i16, V_CNDMASK_B32_e64>; -defm : SelectPat <i32, V_CNDMASK_B32_e64>; -defm : SelectPat <f16, V_CNDMASK_B32_e64>; -defm : SelectPat <f32, V_CNDMASK_B32_e64>; +defm : SelectPat <i16>; +defm : SelectPat <i32>; +defm : SelectPat <f16>; +defm : SelectPat <f32>; let AddedComplexity = 1 in { def : GCNPat < @@ -749,6 +833,22 @@ foreach Index = 0-2 in { >; } +foreach Index = 0-2 in { + def Extract_Element_v3i32_#Index : Extract_Element < + i32, v3i32, Index, !cast<SubRegIndex>(sub#Index) + >; + def Insert_Element_v3i32_#Index : Insert_Element < + i32, v3i32, Index, !cast<SubRegIndex>(sub#Index) + >; + + def Extract_Element_v3f32_#Index : Extract_Element < + f32, v3f32, Index, !cast<SubRegIndex>(sub#Index) + >; + def Insert_Element_v3f32_#Index : Insert_Element < + f32, v3f32, Index, !cast<SubRegIndex>(sub#Index) + >; +} + foreach Index = 0-3 in { def 
Extract_Element_v4i32_#Index : Extract_Element < i32, v4i32, Index, !cast<SubRegIndex>(sub#Index) @@ -765,6 +865,22 @@ foreach Index = 0-3 in { >; } +foreach Index = 0-4 in { + def Extract_Element_v5i32_#Index : Extract_Element < + i32, v5i32, Index, !cast<SubRegIndex>(sub#Index) + >; + def Insert_Element_v5i32_#Index : Insert_Element < + i32, v5i32, Index, !cast<SubRegIndex>(sub#Index) + >; + + def Extract_Element_v5f32_#Index : Extract_Element < + f32, v5f32, Index, !cast<SubRegIndex>(sub#Index) + >; + def Insert_Element_v5f32_#Index : Insert_Element < + f32, v5f32, Index, !cast<SubRegIndex>(sub#Index) + >; +} + foreach Index = 0-7 in { def Extract_Element_v8i32_#Index : Extract_Element < i32, v8i32, Index, !cast<SubRegIndex>(sub#Index) @@ -818,7 +934,23 @@ def : Pat < (v2f16 (EXTRACT_SUBREG v4f16:$vec, sub1)) >; -let SubtargetPredicate = isGCN in { +foreach Index = 0-31 in { + def Extract_Element_v32i32_#Index : Extract_Element < + i32, v32i32, Index, !cast<SubRegIndex>(sub#Index) + >; + + def Insert_Element_v32i32_#Index : Insert_Element < + i32, v32i32, Index, !cast<SubRegIndex>(sub#Index) + >; + + def Extract_Element_v32f32_#Index : Extract_Element < + f32, v32f32, Index, !cast<SubRegIndex>(sub#Index) + >; + + def Insert_Element_v32f32_#Index : Insert_Element < + f32, v32f32, Index, !cast<SubRegIndex>(sub#Index) + >; +} // FIXME: Why do only some of these type combinations for SReg and // VReg? @@ -882,6 +1014,10 @@ def : BitConvert <i64, v4f16, VReg_64>; def : BitConvert <v4i32, v4f32, VReg_128>; def : BitConvert <v4f32, v4i32, VReg_128>; +// 96-bit bitcast +def : BitConvert <v3i32, v3f32, SGPR_96>; +def : BitConvert <v3f32, v3i32, SGPR_96>; + // 128-bit bitcast def : BitConvert <v2i64, v4i32, SReg_128>; def : BitConvert <v4i32, v2i64, SReg_128>; @@ -892,6 +1028,10 @@ def : BitConvert <v4i32, v2f64, VReg_128>; def : BitConvert <v2i64, v2f64, VReg_128>; def : BitConvert <v2f64, v2i64, VReg_128>; +// 160-bit bitcast +def : BitConvert <v5i32, v5f32, SGPR_160>; +def : BitConvert <v5f32, v5i32, SGPR_160>; + // 256-bit bitcast def : BitConvert <v8i32, v8f32, SReg_256>; def : BitConvert <v8f32, v8i32, SReg_256>; @@ -902,7 +1042,9 @@ def : BitConvert <v8f32, v8i32, VReg_256>; def : BitConvert <v16i32, v16f32, VReg_512>; def : BitConvert <v16f32, v16i32, VReg_512>; -} // End SubtargetPredicate = isGCN +// 1024-bit bitcast +def : BitConvert <v32i32, v32f32, VReg_1024>; +def : BitConvert <v32f32, v32i32, VReg_1024>; /********** =================== **********/ /********** Src & Dst modifiers **********/ @@ -1070,6 +1212,16 @@ def : GCNPat < (S_MOV_B32 imm:$imm) >; +def : GCNPat < + (VGPRImm<(SIlds tglobaladdr:$ga)>), + (V_MOV_B32_e32 $ga) +>; + +def : GCNPat < + (SIlds tglobaladdr:$ga), + (S_MOV_B32 $ga) +>; + // FIXME: Workaround for ordering issue with peephole optimizer where // a register class copy interferes with immediate folding. 
Should // use s_mov_b32, which can be shrunk to s_movk_i32 @@ -1104,7 +1256,16 @@ def : GCNPat < def : GCNPat < (i1 imm:$imm), (S_MOV_B64 (i64 (as_i64imm $imm))) ->; +> { + let WaveSizePredicate = isWave64; +} + +def : GCNPat < + (i1 imm:$imm), + (S_MOV_B32 (i32 (as_i32imm $imm))) +> { + let WaveSizePredicate = isWave32; +} def : GCNPat < (f64 InlineFPImm<f64>:$imm), @@ -1115,18 +1276,18 @@ def : GCNPat < /********** Intrinsic Patterns **********/ /********** ================== **********/ -let SubtargetPredicate = isGCN in { def : POW_Common <V_LOG_F32_e32, V_EXP_F32_e32, V_MUL_LEGACY_F32_e32>; -} def : GCNPat < (i32 (sext i1:$src0)), - (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src0) + (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), + /*src1mod*/(i32 0), /*src1*/(i32 -1), $src0) >; class Ext32Pat <SDNode ext> : GCNPat < (i32 (ext i1:$src0)), - (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src0) + (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), + /*src1mod*/(i32 0), /*src1*/(i32 1), $src0) >; def : Ext32Pat <zext>; @@ -1144,8 +1305,6 @@ def : GCNPat < // VOP3 Patterns //===----------------------------------------------------------------------===// -let SubtargetPredicate = isGCN in { - def : IMad24Pat<V_MAD_I32_I24, 1>; def : UMad24Pat<V_MAD_U32_U24, 1>; @@ -1153,8 +1312,6 @@ def : UMad24Pat<V_MAD_U32_U24, 1>; defm : BFIPatterns <V_BFI_B32, S_MOV_B32, SReg_64>; def : ROTRPattern <V_ALIGNBIT_B32>; -} - def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))), (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)), (i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>; @@ -1261,8 +1418,9 @@ def : GCNPat < class ZExt_i64_i1_Pat <SDNode ext> : GCNPat < (i64 (ext i1:$src)), (REG_SEQUENCE VReg_64, - (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src), sub0, - (S_MOV_B32 (i32 0)), sub1) + (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), + /*src1mod*/(i32 0), /*src1*/(i32 1), $src), + sub0, (S_MOV_B32 (i32 0)), sub1) >; @@ -1280,8 +1438,10 @@ def : GCNPat < def : GCNPat < (i64 (sext i1:$src)), (REG_SEQUENCE VReg_64, - (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src), sub0, - (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src), sub1) + (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), + /*src1mod*/(i32 0), /*src1*/(i32 -1), $src), sub0, + (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), + /*src1mod*/(i32 0), /*src1*/(i32 -1), $src), sub1) >; class FPToI1Pat<Instruction Inst, int KOne, ValueType kone_type, ValueType vt, SDPatternOperator fp_to_int> : GCNPat < @@ -1296,10 +1456,12 @@ def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_NEG_ONE, i64, f64, fp_to_sint>; // If we need to perform a logical operation on i1 values, we need to // use vector comparisons since there is only one SCC register. Vector -// comparisons still write to a pair of SGPRs, so treat these as -// 64-bit comparisons. When legalizing SGPR copies, instructions -// resulting in the copies from SCC to these instructions will be -// moved to the VALU. +// comparisons may write to a pair of SGPRs or a single SGPR, so treat +// these as 32 or 64-bit comparisons. When legalizing SGPR copies, +// instructions resulting in the copies from SCC to these instructions +// will be moved to the VALU. 
+ +let WaveSizePredicate = isWave64 in { def : GCNPat < (i1 (and i1:$src0, i1:$src1)), (S_AND_B64 $src0, $src1) @@ -1336,35 +1498,89 @@ def : GCNPat < (S_NOT_B64 $src0) >; } +} // end isWave64 + +let WaveSizePredicate = isWave32 in { +def : GCNPat < + (i1 (and i1:$src0, i1:$src1)), + (S_AND_B32 $src0, $src1) +>; + +def : GCNPat < + (i1 (or i1:$src0, i1:$src1)), + (S_OR_B32 $src0, $src1) +>; + +def : GCNPat < + (i1 (xor i1:$src0, i1:$src1)), + (S_XOR_B32 $src0, $src1) +>; + +def : GCNPat < + (i1 (add i1:$src0, i1:$src1)), + (S_XOR_B32 $src0, $src1) +>; + +def : GCNPat < + (i1 (sub i1:$src0, i1:$src1)), + (S_XOR_B32 $src0, $src1) +>; + +let AddedComplexity = 1 in { +def : GCNPat < + (i1 (add i1:$src0, (i1 -1))), + (S_NOT_B32 $src0) +>; + +def : GCNPat < + (i1 (sub i1:$src0, (i1 -1))), + (S_NOT_B32 $src0) +>; +} +} // end isWave32 def : GCNPat < (f16 (sint_to_fp i1:$src)), - (V_CVT_F16_F32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_NEG_ONE), $src)) + (V_CVT_F16_F32_e32 ( + V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), + /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_NEG_ONE), + $src)) >; def : GCNPat < (f16 (uint_to_fp i1:$src)), - (V_CVT_F16_F32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_ONE), $src)) + (V_CVT_F16_F32_e32 ( + V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), + /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE), + $src)) >; def : GCNPat < (f32 (sint_to_fp i1:$src)), - (V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_NEG_ONE), $src) + (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), + /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_NEG_ONE), + $src) >; def : GCNPat < (f32 (uint_to_fp i1:$src)), - (V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_ONE), $src) + (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), + /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE), + $src) >; def : GCNPat < (f64 (sint_to_fp i1:$src)), - (V_CVT_F64_I32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src)) + (V_CVT_F64_I32_e32 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), + /*src1mod*/(i32 0), /*src1*/(i32 -1), + $src)) >; def : GCNPat < (f64 (uint_to_fp i1:$src)), - (V_CVT_F64_U32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src)) + (V_CVT_F64_U32_e32 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), + /*src1mod*/(i32 0), /*src1*/(i32 1), + $src)) >; //===----------------------------------------------------------------------===// @@ -1417,7 +1633,7 @@ def : GCNPat< def : GCNPat< (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))), - (V_PK_MUL_F16 0, (i32 CONST.V2FP16_ONE), $src_mods, $src, DSTCLAMP.NONE) + (V_PK_MUL_F16 0, (i32 CONST.FP16_ONE), $src_mods, $src, DSTCLAMP.NONE) >; } @@ -1478,6 +1694,14 @@ def : GCNPat < >; } // End OtherPredicates = [HasDLInsts] +let SubtargetPredicate = isGFX10Plus in +def : GCNPat < + (fma (f16 (VOP3Mods0 f32:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), + (f16 (VOP3Mods f32:$src1, i32:$src1_modifiers)), + (f16 (VOP3NoMods f32:$src2))), + (V_FMAC_F16_e64 $src0_modifiers, $src0, $src1_modifiers, $src1, + SRCMODS.NONE, $src2, $clamp, $omod) +>; // Allow integer inputs class ExpPattern<SDPatternOperator node, ValueType vt, Instruction Inst> : GCNPat< @@ -1568,7 +1792,7 @@ def : GCNPat < // Fract Patterns //===----------------------------------------------------------------------===// -let SubtargetPredicate = isSI in { +let SubtargetPredicate = isGFX6 in { // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) is // used instead. 
However, SI doesn't have V_FLOOR_F64, so the most efficient @@ -1595,7 +1819,7 @@ def : GCNPat < DSTCLAMP.NONE, DSTOMOD.NONE) >; -} // End SubtargetPredicates = isSI +} // End SubtargetPredicates = isGFX6 //============================================================================// // Miscellaneous Optimization Patterns @@ -1609,6 +1833,13 @@ def : GCNPat< (S_SUB_I32 $src0, NegSubInlineConst32:$src1) >; +// Avoid pointlessly materializing a constant in VGPR. +// FIXME: Should also do this for readlane, but tablegen crashes on +// the ignored src1. +def : GCNPat< + (int_amdgcn_readfirstlane (i32 imm:$src)), + (S_MOV_B32 $src) +>; multiclass BFMPatterns <ValueType vt, InstSI BFM, InstSI MOV> { def : GCNPat < @@ -1622,8 +1853,6 @@ multiclass BFMPatterns <ValueType vt, InstSI BFM, InstSI MOV> { >; } -let SubtargetPredicate = isGCN in { - defm : BFMPatterns <i32, S_BFM_B32, S_MOV_B32>; // FIXME: defm : BFMPatterns <i64, S_BFM_B64, S_MOV_B64>; @@ -1633,8 +1862,6 @@ defm : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e64, SReg_64>; defm : IntMed3Pat<V_MED3_I32, smin, smax, smin_oneuse, smax_oneuse>; defm : IntMed3Pat<V_MED3_U32, umin, umax, umin_oneuse, umax_oneuse>; -} - // This matches 16 permutations of // max(min(x, y), min(max(x, y), z)) class FPMed3Pat<ValueType vt, @@ -1683,8 +1910,8 @@ multiclass Int16Med3Pat<Instruction med3Inst, def : FPMed3Pat<f32, V_MED3_F32>; -let OtherPredicates = [isGFX9] in { +let OtherPredicates = [isGFX9Plus] in { def : FP16Med3Pat<f16, V_MED3_F16>; defm : Int16Med3Pat<V_MED3_I16, smin, smax, smax_oneuse, smin_oneuse>; defm : Int16Med3Pat<V_MED3_U16, umin, umax, umax_oneuse, umin_oneuse>; -} // End Predicates = [isGFX9] +} // End Predicates = [isGFX9Plus] diff --git a/lib/Target/AMDGPU/SIIntrinsics.td b/lib/Target/AMDGPU/SIIntrinsics.td deleted file mode 100644 index e51ff4b4bc50..000000000000 --- a/lib/Target/AMDGPU/SIIntrinsics.td +++ /dev/null @@ -1,19 +0,0 @@ -//===-- SIIntrinsics.td - SI Intrinsic defs ----------------*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// Backend internal SI Intrinsic Definitions. User code should not -// directly use these. -// -//===----------------------------------------------------------------------===// - - -let TargetPrefix = "SI", isTarget = 1 in { - def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>; - -} // End TargetPrefix = "SI", isTarget = 1 diff --git a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index be291b127301..ae8b967893a2 100644 --- a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -1,9 +1,8 @@ //===- SILoadStoreOptimizer.cpp -------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -132,6 +131,8 @@ class SILoadStoreOptimizer : public MachineFunctionPass { bool GLC1; bool SLC0; bool SLC1; + bool DLC0; + bool DLC1; bool UseST64; SmallVector<MachineInstr *, 8> InstsToMove; }; @@ -257,13 +258,11 @@ static void addDefsUsesToList(const MachineInstr &MI, static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A, MachineBasicBlock::iterator B, - const SIInstrInfo *TII, AliasAnalysis *AA) { // RAW or WAR - cannot reorder // WAW - cannot reorder // RAR - safe to reorder - return !(A->mayStore() || B->mayStore()) || - TII->areMemAccessesTriviallyDisjoint(*A, *B, AA); + return !(A->mayStore() || B->mayStore()) || !A->mayAlias(AA, *B, true); } // Add MI and its defs to the lists if MI reads one of the defs that are @@ -282,6 +281,7 @@ static bool addToListsIfDependent(MachineInstr &MI, DenseSet<unsigned> &RegDefs, // registers are in SSA form. if (Use.isReg() && ((Use.readsReg() && RegDefs.count(Use.getReg())) || + (Use.isDef() && RegDefs.count(Use.getReg())) || (Use.isDef() && TargetRegisterInfo::isPhysicalRegister(Use.getReg()) && PhysRegUses.count(Use.getReg())))) { Insts.push_back(&MI); @@ -295,13 +295,13 @@ static bool addToListsIfDependent(MachineInstr &MI, DenseSet<unsigned> &RegDefs, static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp, ArrayRef<MachineInstr *> InstsToMove, - const SIInstrInfo *TII, AliasAnalysis *AA) { + AliasAnalysis *AA) { assert(MemOp.mayLoadOrStore()); for (MachineInstr *InstToMove : InstsToMove) { if (!InstToMove->mayLoadOrStore()) continue; - if (!memAccessesCanBeReordered(MemOp, *InstToMove, TII, AA)) + if (!memAccessesCanBeReordered(MemOp, *InstToMove, AA)) return false; } return true; @@ -326,7 +326,7 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) { if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) { return (EltOffset0 + CI.Width0 == EltOffset1 || EltOffset1 + CI.Width1 == EltOffset0) && - CI.GLC0 == CI.GLC1 && + CI.GLC0 == CI.GLC1 && CI.DLC0 == CI.DLC1 && (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1); } @@ -567,8 +567,8 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) { } if (MBBI->mayLoadOrStore() && - (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) || - !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))) { + (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) || + !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))) { // We fail condition #1, but we may still be able to satisfy condition // #2. Add this instruction to the move list and then we will check // if condition #2 holds once we have selected the matching instruction. @@ -640,6 +640,8 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) { CI.SLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::slc)->getImm(); CI.SLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::slc)->getImm(); } + CI.DLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::dlc)->getImm(); + CI.DLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::dlc)->getImm(); } // Check both offsets fit in the reduced range. @@ -647,7 +649,7 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) { // move and make sure they are all safe to move down past the merged // instruction. 
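// As a sketch of a pair the checks below accept: two S_BUFFER_LOAD_DWORD_IMM
// uses at element offsets 4 and 5 with matching glc (and, new here, dlc) bits
// satisfy offsetsCanBeCombined and are merged into a single
// S_BUFFER_LOAD_DWORDX2_IMM; the DLC0/DLC1 fields extend the equality check
// to GFX10's device-level-coherent bit.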
if (widthsFit(*STM, CI) && offsetsCanBeCombined(CI)) - if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA)) + if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA)) return true; } @@ -656,8 +658,8 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) { // it was safe to move I and also all the instruction in InstsToMove // down past this instruction. // check if we can move I across MBBI and if we can move all I's users - if (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) || - !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA)) + if (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) || + !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA)) break; } return false; @@ -726,7 +728,8 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI) { TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg) .addReg(ImmReg) - .addReg(AddrReg->getReg(), 0, BaseSubReg); + .addReg(AddrReg->getReg(), 0, BaseSubReg) + .addImm(0); // clamp bit BaseSubReg = 0; } @@ -819,7 +822,8 @@ SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI) { TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg) .addReg(ImmReg) - .addReg(AddrReg->getReg(), 0, BaseSubReg); + .addReg(AddrReg->getReg(), 0, BaseSubReg) + .addImm(0); // clamp bit BaseSubReg = 0; } @@ -858,6 +862,7 @@ SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) { .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase)) .addImm(MergedOffset) // offset .addImm(CI.GLC0) // glc + .addImm(CI.DLC0) // dlc .cloneMergedMemRefs({&*CI.I, &*CI.Paired}); std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI); @@ -910,6 +915,7 @@ SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) { .addImm(CI.GLC0) // glc .addImm(CI.SLC0) // slc .addImm(0) // tfe + .addImm(CI.DLC0) // dlc .cloneMergedMemRefs({&*CI.I, &*CI.Paired}); std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI); @@ -1089,9 +1095,10 @@ SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) { MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) .addImm(std::min(CI.Offset0, CI.Offset1)) // offset - .addImm(CI.GLC0) // glc - .addImm(CI.SLC0) // slc - .addImm(0) // tfe + .addImm(CI.GLC0) // glc + .addImm(CI.SLC0) // slc + .addImm(0) // tfe + .addImm(CI.DLC0) // dlc .cloneMergedMemRefs({&*CI.I, &*CI.Paired}); moveInstsAfter(MIB, CI.InstsToMove); @@ -1137,9 +1144,10 @@ unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI, MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI); MachineOperand OffsetHi = createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI); - unsigned CarryReg = MRI->createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); - unsigned DeadCarryReg = - MRI->createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); + + const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); + unsigned CarryReg = MRI->createVirtualRegister(CarryRC); + unsigned DeadCarryReg = MRI->createVirtualRegister(CarryRC); unsigned DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); unsigned DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); @@ -1147,7 +1155,8 @@ unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI, BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_I32_e64), DestSub0) .addReg(CarryReg, RegState::Define) .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg) - .add(OffsetLo); + .add(OffsetLo) + .addImm(0); // clamp bit (void)LoHalf; LLVM_DEBUG(dbgs() << " "; LoHalf->dump();); @@ -1156,7 +1165,8 @@ unsigned 
SILoadStoreOptimizer::computeBase(MachineInstr &MI, .addReg(DeadCarryReg, RegState::Define | RegState::Dead) .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg) .add(OffsetHi) - .addReg(CarryReg, RegState::Kill); + .addReg(CarryReg, RegState::Kill) + .addImm(0); // clamp bit (void)HiHalf; LLVM_DEBUG(dbgs() << " "; HiHalf->dump();); diff --git a/lib/Target/AMDGPU/SILowerControlFlow.cpp b/lib/Target/AMDGPU/SILowerControlFlow.cpp index 1aa1feebbdae..78f409cd9555 100644 --- a/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -1,9 +1,8 @@ //===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -83,6 +82,16 @@ private: LiveIntervals *LIS = nullptr; MachineRegisterInfo *MRI = nullptr; + const TargetRegisterClass *BoolRC = nullptr; + unsigned AndOpc; + unsigned OrOpc; + unsigned XorOpc; + unsigned MovTermOpc; + unsigned Andn2TermOpc; + unsigned XorTermrOpc; + unsigned OrSaveExecOpc; + unsigned Exec; + void emitIf(MachineInstr &MI); void emitElse(MachineInstr &MI); void emitIfBreak(MachineInstr &MI); @@ -176,7 +185,7 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) { assert(SaveExec.getSubReg() == AMDGPU::NoSubRegister && Cond.getSubReg() == AMDGPU::NoSubRegister); - unsigned SaveExecReg = SaveExec.getReg(); + Register SaveExecReg = SaveExec.getReg(); MachineOperand &ImpDefSCC = MI.getOperand(4); assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef()); @@ -188,26 +197,26 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) { // Add an implicit def of exec to discourage scheduling VALU after this which // will interfere with trying to form s_and_saveexec_b64 later. - unsigned CopyReg = SimpleIf ? SaveExecReg - : MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); + Register CopyReg = SimpleIf ? SaveExecReg + : MRI->createVirtualRegister(BoolRC); MachineInstr *CopyExec = BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), CopyReg) - .addReg(AMDGPU::EXEC) - .addReg(AMDGPU::EXEC, RegState::ImplicitDefine); + .addReg(Exec) + .addReg(Exec, RegState::ImplicitDefine); - unsigned Tmp = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); + unsigned Tmp = MRI->createVirtualRegister(BoolRC); MachineInstr *And = - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_AND_B64), Tmp) + BuildMI(MBB, I, DL, TII->get(AndOpc), Tmp) .addReg(CopyReg) - //.addReg(AMDGPU::EXEC) - .addReg(Cond.getReg()); + .add(Cond); + setImpSCCDefDead(*And, true); MachineInstr *Xor = nullptr; if (!SimpleIf) { Xor = - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_XOR_B64), SaveExecReg) + BuildMI(MBB, I, DL, TII->get(XorOpc), SaveExecReg) .addReg(Tmp) .addReg(CopyReg); setImpSCCDefDead(*Xor, ImpDefSCC.isDead()); @@ -216,7 +225,7 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) { // Use a copy that is a terminator to get correct spill code placement it with // fast regalloc. MachineInstr *SetExec = - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64_term), AMDGPU::EXEC) + BuildMI(MBB, I, DL, TII->get(MovTermOpc), Exec) .addReg(Tmp, RegState::Kill); // Insert a pseudo terminator to help keep the verifier happy. 
This will also @@ -240,7 +249,7 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) { LIS->InsertMachineInstrInMaps(*SetExec); LIS->InsertMachineInstrInMaps(*NewBr); - LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC, TRI)); + LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC); MI.eraseFromParent(); // FIXME: Is there a better way of adjusting the liveness? It shouldn't be @@ -257,7 +266,7 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); const DebugLoc &DL = MI.getDebugLoc(); - unsigned DstReg = MI.getOperand(0).getReg(); + Register DstReg = MI.getOperand(0).getReg(); assert(MI.getOperand(0).getSubReg() == AMDGPU::NoSubRegister); bool ExecModified = MI.getOperand(3).getImm() != 0; @@ -266,17 +275,17 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) { // We are running before TwoAddressInstructions, and si_else's operands are // tied. In order to correctly tie the registers, split this into a copy of // the src like it does. - unsigned CopyReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); + Register CopyReg = MRI->createVirtualRegister(BoolRC); MachineInstr *CopyExec = BuildMI(MBB, Start, DL, TII->get(AMDGPU::COPY), CopyReg) .add(MI.getOperand(1)); // Saved EXEC // This must be inserted before phis and any spill code inserted before the // else. - unsigned SaveReg = ExecModified ? - MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass) : DstReg; + Register SaveReg = ExecModified ? + MRI->createVirtualRegister(BoolRC) : DstReg; MachineInstr *OrSaveExec = - BuildMI(MBB, Start, DL, TII->get(AMDGPU::S_OR_SAVEEXEC_B64), SaveReg) + BuildMI(MBB, Start, DL, TII->get(OrSaveExecOpc), SaveReg) .addReg(CopyReg); MachineBasicBlock *DestBB = MI.getOperand(2).getMBB(); @@ -285,8 +294,8 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) { if (ExecModified) { MachineInstr *And = - BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_AND_B64), DstReg) - .addReg(AMDGPU::EXEC) + BuildMI(MBB, ElsePt, DL, TII->get(AndOpc), DstReg) + .addReg(Exec) .addReg(SaveReg); if (LIS) @@ -294,8 +303,8 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) { } MachineInstr *Xor = - BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_XOR_B64_term), AMDGPU::EXEC) - .addReg(AMDGPU::EXEC) + BuildMI(MBB, ElsePt, DL, TII->get(XorTermrOpc), Exec) + .addReg(Exec) .addReg(DstReg); MachineInstr *Branch = @@ -324,7 +333,7 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) { LIS->createAndComputeVirtRegInterval(SaveReg); // Let this be recomputed. - LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC, TRI)); + LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC); } void SILowerControlFlow::emitIfBreak(MachineInstr &MI) { @@ -348,14 +357,14 @@ void SILowerControlFlow::emitIfBreak(MachineInstr &MI) { // exit" mask. 
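The mask algebra this pass emits is compact enough to sanity-check on plain integers. The sketch below models wave64 lane masks as uint64_t values; lowerIf and ifBreak are hypothetical names for illustration, not functions of this pass, and on wave32 targets the same algebra runs on 32-bit masks through the AndOpc/XorOpc/MovTermOpc fields this patch introduces.

#include <cassert>
#include <cstdint>

// Models the SI_IF lowering shown in emitIf above:
// COPY $exec, S_AND_B64, S_XOR_B64, S_MOV_B64_term.
struct IfMasks { uint64_t ThenExec, SavedLanes; };
static IfMasks lowerIf(uint64_t Exec, uint64_t Cond) {
  uint64_t Copy = Exec;       // COPY of the incoming exec mask
  uint64_t Tmp = Copy & Cond; // lanes taking the 'then' side
  uint64_t Save = Tmp ^ Copy; // lanes to re-enable when 'then' ends
  return {Tmp, Save};
}

// Models the SI_IF_BREAK lowering (the And/Or pair just below):
// Dst = (exec & BreakCond) | AccumulatedExitMask.
static uint64_t ifBreak(uint64_t Exec, uint64_t BreakCond, uint64_t Accum) {
  return (Exec & BreakCond) | Accum;
}

int main() {
  IfMasks M = lowerIf(0xFF, 0x0F);
  assert(M.ThenExec == 0x0F && M.SavedLanes == 0xF0);
  assert(ifBreak(0x0F, 0x03, 0xF0) == 0xF3);
  return 0;
}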
MachineInstr *And = nullptr, *Or = nullptr; if (!SkipAnding) { - And = BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64), Dst) - .addReg(AMDGPU::EXEC) + And = BuildMI(MBB, &MI, DL, TII->get(AndOpc), Dst) + .addReg(Exec) .add(MI.getOperand(1)); - Or = BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst) + Or = BuildMI(MBB, &MI, DL, TII->get(OrOpc), Dst) .addReg(Dst) .add(MI.getOperand(2)); } else - Or = BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst) + Or = BuildMI(MBB, &MI, DL, TII->get(OrOpc), Dst) .add(MI.getOperand(1)) .add(MI.getOperand(2)); @@ -373,8 +382,8 @@ void SILowerControlFlow::emitLoop(MachineInstr &MI) { const DebugLoc &DL = MI.getDebugLoc(); MachineInstr *AndN2 = - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64_term), AMDGPU::EXEC) - .addReg(AMDGPU::EXEC) + BuildMI(MBB, &MI, DL, TII->get(Andn2TermOpc), Exec) + .addReg(Exec) .add(MI.getOperand(0)); MachineInstr *Branch = @@ -395,8 +404,8 @@ void SILowerControlFlow::emitEndCf(MachineInstr &MI) { MachineBasicBlock::iterator InsPt = MBB.begin(); MachineInstr *NewMI = - BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC) - .addReg(AMDGPU::EXEC) + BuildMI(MBB, InsPt, DL, TII->get(OrOpc), Exec) + .addReg(Exec) .add(MI.getOperand(0)); if (LIS) @@ -428,13 +437,13 @@ void SILowerControlFlow::findMaskOperands(MachineInstr &MI, unsigned OpNo, // does not really modify exec. for (auto I = Def->getIterator(); I != MI.getIterator(); ++I) if (I->modifiesRegister(AMDGPU::EXEC, TRI) && - !(I->isCopy() && I->getOperand(0).getReg() != AMDGPU::EXEC)) + !(I->isCopy() && I->getOperand(0).getReg() != Exec)) return; for (const auto &SrcOp : Def->explicit_operands()) if (SrcOp.isReg() && SrcOp.isUse() && (TargetRegisterInfo::isVirtualRegister(SrcOp.getReg()) || - SrcOp.getReg() == AMDGPU::EXEC)) + SrcOp.getReg() == Exec)) Src.push_back(SrcOp); } @@ -472,6 +481,27 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { // This doesn't actually need LiveIntervals, but we can preserve them. LIS = getAnalysisIfAvailable<LiveIntervals>(); MRI = &MF.getRegInfo(); + BoolRC = TRI->getBoolRC(); + + if (ST.isWave32()) { + AndOpc = AMDGPU::S_AND_B32; + OrOpc = AMDGPU::S_OR_B32; + XorOpc = AMDGPU::S_XOR_B32; + MovTermOpc = AMDGPU::S_MOV_B32_term; + Andn2TermOpc = AMDGPU::S_ANDN2_B32_term; + XorTermrOpc = AMDGPU::S_XOR_B32_term; + OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B32; + Exec = AMDGPU::EXEC_LO; + } else { + AndOpc = AMDGPU::S_AND_B64; + OrOpc = AMDGPU::S_OR_B64; + XorOpc = AMDGPU::S_XOR_B64; + MovTermOpc = AMDGPU::S_MOV_B64_term; + Andn2TermOpc = AMDGPU::S_ANDN2_B64_term; + XorTermrOpc = AMDGPU::S_XOR_B64_term; + OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B64; + Exec = AMDGPU::EXEC; + } MachineFunction::iterator NextBB; for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); @@ -508,6 +538,8 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { case AMDGPU::S_AND_B64: case AMDGPU::S_OR_B64: + case AMDGPU::S_AND_B32: + case AMDGPU::S_OR_B32: // Cleanup bit manipulations on exec mask combineMasks(MI); Last = I; diff --git a/lib/Target/AMDGPU/SILowerI1Copies.cpp b/lib/Target/AMDGPU/SILowerI1Copies.cpp index eb038bb5d5fc..1c0f836f07e6 100644 --- a/lib/Target/AMDGPU/SILowerI1Copies.cpp +++ b/lib/Target/AMDGPU/SILowerI1Copies.cpp @@ -1,15 +1,14 @@ //===-- SILowerI1Copies.cpp - Lower I1 Copies -----------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This pass lowers all occurrences of i1 values (with a vreg_1 register class) -// to lane masks (64-bit scalar registers). The pass assumes machine SSA form -// and a wave-level control flow graph. +// to lane masks (32 / 64-bit scalar registers). The pass assumes machine SSA +// form and a wave-level control flow graph. // // Before this pass, values that are semantically i1 and are defined and used // within the same basic block are already represented as lane masks in scalar @@ -51,6 +50,7 @@ public: static char ID; private: + bool IsWave32 = false; MachineFunction *MF = nullptr; MachineDominatorTree *DT = nullptr; MachinePostDominatorTree *PDT = nullptr; @@ -58,6 +58,14 @@ private: const GCNSubtarget *ST = nullptr; const SIInstrInfo *TII = nullptr; + unsigned ExecReg; + unsigned MovOp; + unsigned AndOp; + unsigned OrOp; + unsigned XorOp; + unsigned AndN2Op; + unsigned OrN2Op; + DenseSet<unsigned> ConstrainRegs; public: @@ -87,6 +95,11 @@ private: MachineBasicBlock::iterator getSaluInsertionAtEnd(MachineBasicBlock &MBB) const; + bool isVreg1(unsigned Reg) const { + return TargetRegisterInfo::isVirtualRegister(Reg) && + MRI->getRegClass(Reg) == &AMDGPU::VReg_1RegClass; + } + bool isLaneMaskReg(unsigned Reg) const { return TII->getRegisterInfo().isSGPRReg(*MRI, Reg) && TII->getRegisterInfo().getRegSizeInBits(Reg, *MRI) == @@ -412,8 +425,10 @@ FunctionPass *llvm::createSILowerI1CopiesPass() { } static unsigned createLaneMaskReg(MachineFunction &MF) { + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); MachineRegisterInfo &MRI = MF.getRegInfo(); - return MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + return MRI.createVirtualRegister(ST.isWave32() ? &AMDGPU::SReg_32RegClass + : &AMDGPU::SReg_64RegClass); } static unsigned insertUndefLaneMask(MachineBasicBlock &MBB) { @@ -443,13 +458,32 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &TheMF) { ST = &MF->getSubtarget<GCNSubtarget>(); TII = ST->getInstrInfo(); + IsWave32 = ST->isWave32(); + + if (IsWave32) { + ExecReg = AMDGPU::EXEC_LO; + MovOp = AMDGPU::S_MOV_B32; + AndOp = AMDGPU::S_AND_B32; + OrOp = AMDGPU::S_OR_B32; + XorOp = AMDGPU::S_XOR_B32; + AndN2Op = AMDGPU::S_ANDN2_B32; + OrN2Op = AMDGPU::S_ORN2_B32; + } else { + ExecReg = AMDGPU::EXEC; + MovOp = AMDGPU::S_MOV_B64; + AndOp = AMDGPU::S_AND_B64; + OrOp = AMDGPU::S_OR_B64; + XorOp = AMDGPU::S_XOR_B64; + AndN2Op = AMDGPU::S_ANDN2_B64; + OrN2Op = AMDGPU::S_ORN2_B64; + } lowerCopiesFromI1(); lowerPhis(); lowerCopiesToI1(); for (unsigned Reg : ConstrainRegs) - MRI->constrainRegClass(Reg, &AMDGPU::SReg_64_XEXECRegClass); + MRI->constrainRegClass(Reg, &AMDGPU::SReg_1_XEXECRegClass); ConstrainRegs.clear(); return true; @@ -465,13 +499,10 @@ void SILowerI1Copies::lowerCopiesFromI1() { unsigned DstReg = MI.getOperand(0).getReg(); unsigned SrcReg = MI.getOperand(1).getReg(); - if (!TargetRegisterInfo::isVirtualRegister(SrcReg) || - MRI->getRegClass(SrcReg) != &AMDGPU::VReg_1RegClass) + if (!isVreg1(SrcReg)) continue; - if (isLaneMaskReg(DstReg) || - (TargetRegisterInfo::isVirtualRegister(DstReg) && - MRI->getRegClass(DstReg) == &AMDGPU::VReg_1RegClass)) + if (isLaneMaskReg(DstReg) || isVreg1(DstReg)) continue; // Copy into a 32-bit vector register. 
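A lane mask holds one bit per lane of the wave, which is why createLaneMaskReg above picks a 32-bit SGPR class on wave32 and a 64-bit one on wave64. Copying such a mask back into a per-lane vector value, as the V_CNDMASK_B32 in the next hunk does, is an all-lanes select between 0 and -1; the two extra zero immediates added in that hunk are presumably the explicit source-modifier operands of the VOP3 (_e64) encoding. A scalar model of one lane (a sketch; laneValue is a hypothetical helper, not LLVM API):

#include <cassert>
#include <cstdint>

// One lane's view of "copy i1 out of a lane mask": the lane reads -1 (all
// ones) if its mask bit is set, otherwise 0, matching V_CNDMASK_B32 with
// constant sources 0 and -1.
static int32_t laneValue(uint64_t LaneMask, unsigned Lane) {
  return ((LaneMask >> Lane) & 1) ? -1 : 0;
}

int main() {
  assert(laneValue(0b01, 0) == -1);
  assert(laneValue(0b01, 1) == 0);
  return 0;
}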
@@ -484,6 +515,8 @@ void SILowerI1Copies::lowerCopiesFromI1() { ConstrainRegs.insert(SrcReg); BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstReg) .addImm(0) + .addImm(0) + .addImm(0) .addImm(-1) .addReg(SrcReg); DeadCopies.push_back(&MI); @@ -503,18 +536,22 @@ void SILowerI1Copies::lowerPhis() { SmallVector<MachineBasicBlock *, 4> IncomingBlocks; SmallVector<unsigned, 4> IncomingRegs; SmallVector<unsigned, 4> IncomingUpdated; +#ifndef NDEBUG + DenseSet<unsigned> PhiRegisters; +#endif for (MachineBasicBlock &MBB : *MF) { LF.initialize(MBB); for (MachineInstr &MI : MBB.phis()) { unsigned DstReg = MI.getOperand(0).getReg(); - if (MRI->getRegClass(DstReg) != &AMDGPU::VReg_1RegClass) + if (!isVreg1(DstReg)) continue; LLVM_DEBUG(dbgs() << "Lower PHI: " << MI); - MRI->setRegClass(DstReg, &AMDGPU::SReg_64RegClass); + MRI->setRegClass(DstReg, IsWave32 ? &AMDGPU::SReg_32RegClass + : &AMDGPU::SReg_64RegClass); // Collect incoming values. for (unsigned i = 1; i < MI.getNumOperands(); i += 2) { @@ -525,18 +562,22 @@ void SILowerI1Copies::lowerPhis() { if (IncomingDef->getOpcode() == AMDGPU::COPY) { IncomingReg = IncomingDef->getOperand(1).getReg(); - assert(isLaneMaskReg(IncomingReg)); + assert(isLaneMaskReg(IncomingReg) || isVreg1(IncomingReg)); assert(!IncomingDef->getOperand(1).getSubReg()); } else if (IncomingDef->getOpcode() == AMDGPU::IMPLICIT_DEF) { continue; } else { - assert(IncomingDef->isPHI()); + assert(IncomingDef->isPHI() || PhiRegisters.count(IncomingReg)); } IncomingBlocks.push_back(IncomingMBB); IncomingRegs.push_back(IncomingReg); } +#ifndef NDEBUG + PhiRegisters.insert(DstReg); +#endif + // Phis in a loop that are observed outside the loop receive a simple but // conservatively correct treatment. MachineBasicBlock *PostDomBound = &MBB; @@ -629,8 +670,7 @@ void SILowerI1Copies::lowerCopiesToI1() { continue; unsigned DstReg = MI.getOperand(0).getReg(); - if (!TargetRegisterInfo::isVirtualRegister(DstReg) || - MRI->getRegClass(DstReg) != &AMDGPU::VReg_1RegClass) + if (!isVreg1(DstReg)) continue; if (MRI->use_empty(DstReg)) { @@ -640,7 +680,8 @@ void SILowerI1Copies::lowerCopiesToI1() { LLVM_DEBUG(dbgs() << "Lower Other: " << MI); - MRI->setRegClass(DstReg, &AMDGPU::SReg_64RegClass); + MRI->setRegClass(DstReg, IsWave32 ? 
&AMDGPU::SReg_32RegClass + : &AMDGPU::SReg_64RegClass); if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF) continue; @@ -649,7 +690,7 @@ void SILowerI1Copies::lowerCopiesToI1() { assert(!MI.getOperand(1).getSubReg()); if (!TargetRegisterInfo::isVirtualRegister(SrcReg) || - !isLaneMaskReg(SrcReg)) { + (!isLaneMaskReg(SrcReg) && !isVreg1(SrcReg))) { assert(TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 32); unsigned TmpReg = createLaneMaskReg(*MF); BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64), TmpReg) @@ -699,7 +740,7 @@ bool SILowerI1Copies::isConstantLaneMask(unsigned Reg, bool &Val) const { return false; } - if (MI->getOpcode() != AMDGPU::S_MOV_B64) + if (MI->getOpcode() != MovOp) return false; if (!MI->getOperand(1).isImm()) @@ -774,10 +815,10 @@ void SILowerI1Copies::buildMergeLaneMasks(MachineBasicBlock &MBB, if (PrevVal == CurVal) { BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(CurReg); } else if (CurVal) { - BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(AMDGPU::EXEC); + BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(ExecReg); } else { - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_XOR_B64), DstReg) - .addReg(AMDGPU::EXEC) + BuildMI(MBB, I, DL, TII->get(XorOp), DstReg) + .addReg(ExecReg) .addImm(-1); } return; @@ -790,9 +831,9 @@ void SILowerI1Copies::buildMergeLaneMasks(MachineBasicBlock &MBB, PrevMaskedReg = PrevReg; } else { PrevMaskedReg = createLaneMaskReg(*MF); - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ANDN2_B64), PrevMaskedReg) + BuildMI(MBB, I, DL, TII->get(AndN2Op), PrevMaskedReg) .addReg(PrevReg) - .addReg(AMDGPU::EXEC); + .addReg(ExecReg); } } if (!CurConstant) { @@ -801,9 +842,9 @@ void SILowerI1Copies::buildMergeLaneMasks(MachineBasicBlock &MBB, CurMaskedReg = CurReg; } else { CurMaskedReg = createLaneMaskReg(*MF); - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_AND_B64), CurMaskedReg) + BuildMI(MBB, I, DL, TII->get(AndOp), CurMaskedReg) .addReg(CurReg) - .addReg(AMDGPU::EXEC); + .addReg(ExecReg); } } @@ -814,12 +855,12 @@ void SILowerI1Copies::buildMergeLaneMasks(MachineBasicBlock &MBB, BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg) .addReg(PrevMaskedReg); } else if (PrevConstant && PrevVal) { - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ORN2_B64), DstReg) + BuildMI(MBB, I, DL, TII->get(OrN2Op), DstReg) .addReg(CurMaskedReg) - .addReg(AMDGPU::EXEC); + .addReg(ExecReg); } else { - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_OR_B64), DstReg) + BuildMI(MBB, I, DL, TII->get(OrOp), DstReg) .addReg(PrevMaskedReg) - .addReg(CurMaskedReg ? CurMaskedReg : (unsigned)AMDGPU::EXEC); + .addReg(CurMaskedReg ? CurMaskedReg : ExecReg); } } diff --git a/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/lib/Target/AMDGPU/SILowerSGPRSpills.cpp new file mode 100644 index 000000000000..a82047473370 --- /dev/null +++ b/lib/Target/AMDGPU/SILowerSGPRSpills.cpp @@ -0,0 +1,323 @@ +//===-- SILowerSGPRSpills.cpp ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Handle SGPR spills. This pass takes the place of PrologEpilogInserter for all +// SGPR spills, so must insert CSR SGPR spills as well as expand them. +// +// This pass must never create new SGPR virtual registers. +// +// FIXME: Must stop RegScavenger spills in later passes.
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/VirtRegMap.h" +#include "llvm/Target/TargetMachine.h" + +using namespace llvm; + +#define DEBUG_TYPE "si-lower-sgpr-spills" + +using MBBVector = SmallVector<MachineBasicBlock *, 4>; + +namespace { + +static cl::opt<bool> EnableSpillVGPRToAGPR( + "amdgpu-spill-vgpr-to-agpr", + cl::desc("Enable spilling VGPRs to AGPRs"), + cl::ReallyHidden, + cl::init(true)); + +class SILowerSGPRSpills : public MachineFunctionPass { +private: + const SIRegisterInfo *TRI = nullptr; + const SIInstrInfo *TII = nullptr; + VirtRegMap *VRM = nullptr; + LiveIntervals *LIS = nullptr; + + // Save and Restore blocks of the current function. Typically there is a + // single save block, unless Windows EH funclets are involved. + MBBVector SaveBlocks; + MBBVector RestoreBlocks; + +public: + static char ID; + + SILowerSGPRSpills() : MachineFunctionPass(ID) {} + + void calculateSaveRestoreBlocks(MachineFunction &MF); + bool spillCalleeSavedRegs(MachineFunction &MF); + + bool runOnMachineFunction(MachineFunction &MF) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // end anonymous namespace + +char SILowerSGPRSpills::ID = 0; + +INITIALIZE_PASS_BEGIN(SILowerSGPRSpills, DEBUG_TYPE, + "SI lower SGPR spill instructions", false, false) +INITIALIZE_PASS_DEPENDENCY(VirtRegMap) +INITIALIZE_PASS_END(SILowerSGPRSpills, DEBUG_TYPE, + "SI lower SGPR spill instructions", false, false) + +char &llvm::SILowerSGPRSpillsID = SILowerSGPRSpills::ID; + +/// Insert save code for the callee-saved registers used in the function. +static void insertCSRSaves(MachineBasicBlock &SaveBlock, + ArrayRef<CalleeSavedInfo> CSI, + LiveIntervals *LIS) { + MachineFunction &MF = *SaveBlock.getParent(); + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + + MachineBasicBlock::iterator I = SaveBlock.begin(); + if (!TFI->spillCalleeSavedRegisters(SaveBlock, I, CSI, TRI)) { + for (const CalleeSavedInfo &CS : CSI) { + // Insert the spill to the stack frame. + unsigned Reg = CS.getReg(); + + MachineInstrSpan MIS(I, &SaveBlock); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + + TII.storeRegToStackSlot(SaveBlock, I, Reg, true, CS.getFrameIdx(), RC, + TRI); + + if (LIS) { + assert(std::distance(MIS.begin(), I) == 1); + MachineInstr &Inst = *std::prev(I); + + LIS->InsertMachineInstrInMaps(Inst); + LIS->removeAllRegUnitsForPhysReg(Reg); + } + } + } +} + +/// Insert restore code for the callee-saved registers used in the function.
+static void insertCSRRestores(MachineBasicBlock &RestoreBlock, + std::vector<CalleeSavedInfo> &CSI, + LiveIntervals *LIS) { + MachineFunction &MF = *RestoreBlock.getParent(); + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + + // Restore all registers immediately before the return and any + // terminators that precede it. + MachineBasicBlock::iterator I = RestoreBlock.getFirstTerminator(); + + // FIXME: Just emit the readlane/writelane directly + if (!TFI->restoreCalleeSavedRegisters(RestoreBlock, I, CSI, TRI)) { + for (const CalleeSavedInfo &CI : reverse(CSI)) { + unsigned Reg = CI.getReg(); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + + TII.loadRegFromStackSlot(RestoreBlock, I, Reg, CI.getFrameIdx(), RC, TRI); + assert(I != RestoreBlock.begin() && + "loadRegFromStackSlot didn't insert any code!"); + // Insert in reverse order. loadRegFromStackSlot can insert + // multiple instructions. + + if (LIS) { + MachineInstr &Inst = *std::prev(I); + LIS->InsertMachineInstrInMaps(Inst); + LIS->removeAllRegUnitsForPhysReg(Reg); + } + } + } +} + +/// Compute the sets of entry and return blocks for saving and restoring +/// callee-saved registers, and placing prolog and epilog code. +void SILowerSGPRSpills::calculateSaveRestoreBlocks(MachineFunction &MF) { + const MachineFrameInfo &MFI = MF.getFrameInfo(); + + // Even when we do not change any CSR, we still want to insert the + // prologue and epilogue of the function. + // So set the save points for those. + + // Use the points found by shrink-wrapping, if any. + if (MFI.getSavePoint()) { + SaveBlocks.push_back(MFI.getSavePoint()); + assert(MFI.getRestorePoint() && "Both restore and save must be set"); + MachineBasicBlock *RestoreBlock = MFI.getRestorePoint(); + // If RestoreBlock does not have any successor and is not a return block + // then the end point is unreachable and we do not need to insert any + // epilogue. + if (!RestoreBlock->succ_empty() || RestoreBlock->isReturnBlock()) + RestoreBlocks.push_back(RestoreBlock); + return; + } + + // Save refs to entry and return blocks. + SaveBlocks.push_back(&MF.front()); + for (MachineBasicBlock &MBB : MF) { + if (MBB.isEHFuncletEntry()) + SaveBlocks.push_back(&MBB); + if (MBB.isReturnBlock()) + RestoreBlocks.push_back(&MBB); + } +} + +bool SILowerSGPRSpills::spillCalleeSavedRegs(MachineFunction &MF) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + const Function &F = MF.getFunction(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + const SIFrameLowering *TFI = ST.getFrameLowering(); + MachineFrameInfo &MFI = MF.getFrameInfo(); + RegScavenger *RS = nullptr; + + // Determine which of the registers in the callee save list should be saved. + BitVector SavedRegs; + TFI->determineCalleeSavesSGPR(MF, SavedRegs, RS); + + // Add the code to save and restore the callee saved registers. + if (!F.hasFnAttribute(Attribute::Naked)) { + // FIXME: This is a lie. The CalleeSavedInfo is incomplete, but this is + // necessary for verifier liveness checks. 
+ MFI.setCalleeSavedInfoValid(true); + + std::vector<CalleeSavedInfo> CSI; + const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs(); + + for (unsigned I = 0; CSRegs[I]; ++I) { + unsigned Reg = CSRegs[I]; + if (SavedRegs.test(Reg)) { + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + int JunkFI = MFI.CreateStackObject(TRI->getSpillSize(*RC), + TRI->getSpillAlignment(*RC), + true); + + CSI.push_back(CalleeSavedInfo(Reg, JunkFI)); + } + } + + if (!CSI.empty()) { + for (MachineBasicBlock *SaveBlock : SaveBlocks) + insertCSRSaves(*SaveBlock, CSI, LIS); + + for (MachineBasicBlock *RestoreBlock : RestoreBlocks) + insertCSRRestores(*RestoreBlock, CSI, LIS); + return true; + } + } + + return false; +} + +bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + TII = ST.getInstrInfo(); + TRI = &TII->getRegisterInfo(); + + VRM = getAnalysisIfAvailable<VirtRegMap>(); + + assert(SaveBlocks.empty() && RestoreBlocks.empty()); + + // First, expose any CSR SGPR spills. This is mostly the same as what PEI + // does, but somewhat simpler. + calculateSaveRestoreBlocks(MF); + bool HasCSRs = spillCalleeSavedRegs(MF); + + MachineFrameInfo &MFI = MF.getFrameInfo(); + if (!MFI.hasStackObjects() && !HasCSRs) { + SaveBlocks.clear(); + RestoreBlocks.clear(); + return false; + } + + MachineRegisterInfo &MRI = MF.getRegInfo(); + SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); + const bool SpillVGPRToAGPR = ST.hasMAIInsts() && FuncInfo->hasSpilledVGPRs() + && EnableSpillVGPRToAGPR; + + bool MadeChange = false; + + const bool SpillToAGPR = EnableSpillVGPRToAGPR && ST.hasMAIInsts(); + + // TODO: CSR VGPRs will never be spilled to AGPRs. These can probably be + // handled as SpilledToReg in regular PrologEpilogInserter. + if ((TRI->spillSGPRToVGPR() && (HasCSRs || FuncInfo->hasSpilledSGPRs())) || + SpillVGPRToAGPR) { + // Process all SGPR spills before frame offsets are finalized. Ideally SGPRs + // are spilled to VGPRs, in which case we can eliminate the stack usage. + // + // This operates under the assumption that only other SGPR spills are users + // of the frame index. + for (MachineBasicBlock &MBB : MF) { + MachineBasicBlock::iterator Next; + for (auto I = MBB.begin(), E = MBB.end(); I != E; I = Next) { + MachineInstr &MI = *I; + Next = std::next(I); + + if (SpillToAGPR && TII->isVGPRSpill(MI)) { + // Try to eliminate stack used by VGPR spills before frame + // finalization. 
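Whether that elimination succeeds hinges on allocateVGPRSpillToAGPR, defined later in SIMachineFunctionInfo.cpp: every 4-byte lane of the frame index must be mapped to a free AGPR (or VGPR, for the opposite direction). A standalone model of that scan, assuming register state is just three bit sets rather than the real MachineRegisterInfo queries (helper names are illustrative):

#include <algorithm>
#include <cassert>
#include <vector>

using PhysReg = unsigned;

struct RegStateModel {
  std::vector<bool> Allocatable, Used, CalleeSaved;
};

// Walk the candidate registers once, grabbing the next free one per lane.
// A partial allocation means the spill keeps (part of) its stack slot,
// mirroring Spill.FullyAllocated in the real pass.
static bool allocateLanes(unsigned NumLanes,
                          const std::vector<PhysReg> &Candidates,
                          RegStateModel &RS, std::vector<PhysReg> &Lanes) {
  auto Next = Candidates.begin();
  for (unsigned I = 0; I < NumLanes; ++I) {
    Next = std::find_if(Next, Candidates.end(), [&RS](PhysReg R) {
      return RS.Allocatable[R] && !RS.Used[R] && !RS.CalleeSaved[R];
    });
    if (Next == Candidates.end())
      return false;          // registers exhausted
    RS.Used[*Next] = true;   // reserve it for this spill
    Lanes.push_back(*Next++);
  }
  return true;               // fully allocated; the frame index can go away
}

int main() {
  RegStateModel RS{{true, true, true}, {false, true, false},
                   {false, false, false}};
  std::vector<PhysReg> Lanes;
  // Two lanes fit into registers 0 and 2; register 1 is already in use.
  assert(allocateLanes(2, {0, 1, 2}, RS, Lanes) &&
         Lanes[0] == 0 && Lanes[1] == 2);
  return 0;
}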
+ unsigned FIOp = AMDGPU::getNamedOperandIdx(MI.getOpcode(), + AMDGPU::OpName::vaddr); + int FI = MI.getOperand(FIOp).getIndex(); + unsigned VReg = TII->getNamedOperand(MI, AMDGPU::OpName::vdata) + ->getReg(); + if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI, + TRI->isAGPR(MRI, VReg))) { + TRI->eliminateFrameIndex(MI, 0, FIOp, nullptr); + continue; + } + } + + if (!TII->isSGPRSpill(MI)) + continue; + + int FI = TII->getNamedOperand(MI, AMDGPU::OpName::addr)->getIndex(); + assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); + if (FuncInfo->allocateSGPRSpillToVGPR(MF, FI)) { + bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex(MI, FI, nullptr); + (void)Spilled; + assert(Spilled && "failed to spill SGPR to VGPR when allocated"); + } + } + } + + for (MachineBasicBlock &MBB : MF) { + for (auto SSpill : FuncInfo->getSGPRSpillVGPRs()) + MBB.addLiveIn(SSpill.VGPR); + + for (MCPhysReg Reg : FuncInfo->getVGPRSpillAGPRs()) + MBB.addLiveIn(Reg); + + for (MCPhysReg Reg : FuncInfo->getAGPRSpillVGPRs()) + MBB.addLiveIn(Reg); + + MBB.sortUniqueLiveIns(); + } + + MadeChange = true; + } + + SaveBlocks.clear(); + RestoreBlocks.clear(); + + return MadeChange; +} diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 181cc41bd5ff..46da974a2f45 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -1,9 +1,8 @@ //===- SIMachineFunctionInfo.cpp - SI Machine Function Info ---------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -29,6 +28,7 @@ using namespace llvm; SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) : AMDGPUMachineFunction(MF), + Mode(MF.getFunction()), PrivateSegmentBuffer(false), DispatchPtr(false), QueuePtr(false), @@ -46,7 +46,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) ImplicitBufferPtr(false), ImplicitArgPtr(false), GITPtrHigh(0xffffffff), - HighBitsOf32BitAddress(0) { + HighBitsOf32BitAddress(0), + GDSSize(0) { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const Function &F = MF.getFunction(); FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F); @@ -69,8 +70,10 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) // Non-entry functions have no special inputs for now, other registers // required for scratch access. ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3; - ScratchWaveOffsetReg = AMDGPU::SGPR4; - FrameOffsetReg = AMDGPU::SGPR5; + ScratchWaveOffsetReg = AMDGPU::SGPR33; + + // TODO: Pick a high register, and shift down, similar to a kernel. + FrameOffsetReg = AMDGPU::SGPR34; StackPtrOffsetReg = AMDGPU::SGPR32; ArgInfo.PrivateSegmentBuffer = @@ -88,33 +91,23 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) } } - if (ST.debuggerEmitPrologue()) { - // Enable everything. 
+ if (F.hasFnAttribute("amdgpu-work-group-id-x")) WorkGroupIDX = true; - WorkGroupIDY = true; - WorkGroupIDZ = true; - WorkItemIDX = true; - WorkItemIDY = true; - WorkItemIDZ = true; - } else { - if (F.hasFnAttribute("amdgpu-work-group-id-x")) - WorkGroupIDX = true; - if (F.hasFnAttribute("amdgpu-work-group-id-y")) - WorkGroupIDY = true; + if (F.hasFnAttribute("amdgpu-work-group-id-y")) + WorkGroupIDY = true; - if (F.hasFnAttribute("amdgpu-work-group-id-z")) - WorkGroupIDZ = true; + if (F.hasFnAttribute("amdgpu-work-group-id-z")) + WorkGroupIDZ = true; - if (F.hasFnAttribute("amdgpu-work-item-id-x")) - WorkItemIDX = true; + if (F.hasFnAttribute("amdgpu-work-item-id-x")) + WorkItemIDX = true; - if (F.hasFnAttribute("amdgpu-work-item-id-y")) - WorkItemIDY = true; + if (F.hasFnAttribute("amdgpu-work-item-id-y")) + WorkItemIDY = true; - if (F.hasFnAttribute("amdgpu-work-item-id-z")) - WorkItemIDZ = true; - } + if (F.hasFnAttribute("amdgpu-work-item-id-z")) + WorkItemIDZ = true; const MachineFrameInfo &FrameInfo = MF.getFrameInfo(); bool HasStackObjects = FrameInfo.hasStackObjects(); @@ -154,9 +147,20 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) KernargSegmentPtr = true; if (ST.hasFlatAddressSpace() && isEntryFunction() && isAmdHsaOrMesa) { + auto hasNonSpillStackObjects = [&]() { + // Avoid expensive checking if there's no stack objects. + if (!HasStackObjects) + return false; + for (auto OI = FrameInfo.getObjectIndexBegin(), + OE = FrameInfo.getObjectIndexEnd(); OI != OE; ++OI) + if (!FrameInfo.isSpillSlotObjectIndex(OI)) + return true; + // All stack objects are spill slots. + return false; + }; // TODO: This could be refined a lot. The attribute is a poor way of // detecting calls that may require it before argument lowering. - if (HasStackObjects || F.hasFnAttribute("amdgpu-flat-scratch")) + if (hasNonSpillStackObjects() || F.hasFnAttribute("amdgpu-flat-scratch")) FlatScratchInit = true; } @@ -169,6 +173,10 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) S = A.getValueAsString(); if (!S.empty()) S.consumeInteger(0, HighBitsOf32BitAddress); + + S = F.getFnAttribute("amdgpu-gds-size").getValueAsString(); + if (!S.empty()) + S.consumeInteger(0, GDSSize); } void SIMachineFunctionInfo::limitOccupancy(const MachineFunction &MF) { @@ -239,6 +247,17 @@ static bool isCalleeSavedReg(const MCPhysReg *CSRegs, MCPhysReg Reg) { return false; } +/// \returns true if \p NumNeed slots are available in VGPRs already used for +/// SGPR spilling. +// +// FIXME: This only works after processFunctionBeforeFrameFinalized bool SIMachineFunctionInfo::haveFreeLanesForSGPRSpill(const MachineFunction &MF, + unsigned NumNeed) const { + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + unsigned WaveSize = ST.getWavefrontSize(); + return NumVGPRSpillLanes + NumNeed <= WaveSize * SpillVGPRs.size(); +} + /// Reserve a slice of a VGPR to support spilling for FrameIndex \p FI. bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF, int FI) { @@ -260,7 +279,7 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF, int NumLanes = Size / 4; - const MCPhysReg *CSRegs = TRI->getCalleeSavedRegs(&MF); + const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs(); // Make sure to handle the case where a wide SGPR spill may span between two // VGPRs.
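The lane bookkeeping behind these spills is plain arithmetic: an SGPR spill of Size bytes needs Size / 4 lanes, each reserved VGPR supplies WavefrontSize lanes (32 on wave32, 64 on wave64), and a wide spill that starts near the end of one VGPR simply continues in the next, which is the spanning case the comment above calls out. A sketch of the arithmetic (helper names are illustrative, not the pass's API):

#include <cassert>

// Which spill VGPR, and which lane within it, does the Nth allocated lane use?
struct LaneSlot { unsigned VGPRIndex, LaneIndex; };
static LaneSlot slotForLane(unsigned Lane, unsigned WaveSize) {
  return {Lane / WaveSize, Lane % WaveSize};
}

// Mirrors haveFreeLanesForSGPRSpill: NumNeed more lanes fit only if the
// VGPRs reserved so far still have capacity.
static bool haveFreeLanes(unsigned NumVGPRSpillLanes, unsigned NumNeed,
                          unsigned WaveSize, unsigned NumSpillVGPRs) {
  return NumVGPRSpillLanes + NumNeed <= WaveSize * NumSpillVGPRs;
}

int main() {
  // A 128-bit SGPR tuple (4 lanes) starting at lane 62 on wave64 spans two
  // VGPRs: lanes 62 and 63 of VGPR 0, then lanes 0 and 1 of VGPR 1.
  assert(slotForLane(62, 64).VGPRIndex == 0 && slotForLane(62, 64).LaneIndex == 62);
  assert(slotForLane(64, 64).VGPRIndex == 1 && slotForLane(64, 64).LaneIndex == 0);
  assert(!haveFreeLanes(/*NumVGPRSpillLanes=*/63, /*NumNeed=*/4,
                        /*WaveSize=*/64, /*NumSpillVGPRs=*/1));
  return 0;
}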
@@ -300,26 +319,92 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF, return true; } -void SIMachineFunctionInfo::removeSGPRToVGPRFrameIndices(MachineFrameInfo &MFI) { - for (auto &R : SGPRToVGPRSpills) - MFI.RemoveStackObject(R.first); +/// Reserve AGPRs or VGPRs to support spilling for FrameIndex \p FI. +/// Either AGPR is spilled to VGPR or vice versa. +/// Returns true if \p FI can be eliminated completely. +bool SIMachineFunctionInfo::allocateVGPRSpillToAGPR(MachineFunction &MF, + int FI, + bool isAGPRtoVGPR) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + MachineFrameInfo &FrameInfo = MF.getFrameInfo(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + + assert(ST.hasMAIInsts() && FrameInfo.isSpillSlotObjectIndex(FI)); + + auto &Spill = VGPRToAGPRSpills[FI]; + + // This has already been allocated. + if (!Spill.Lanes.empty()) + return Spill.FullyAllocated; + + unsigned Size = FrameInfo.getObjectSize(FI); + unsigned NumLanes = Size / 4; + Spill.Lanes.resize(NumLanes, AMDGPU::NoRegister); + + const TargetRegisterClass &RC = + isAGPRtoVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::AGPR_32RegClass; + auto Regs = RC.getRegisters(); + + auto &SpillRegs = isAGPRtoVGPR ? SpillAGPR : SpillVGPR; + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + Spill.FullyAllocated = true; + + // FIXME: Move allocation logic out of MachineFunctionInfo and initialize + // once. + BitVector OtherUsedRegs; + OtherUsedRegs.resize(TRI->getNumRegs()); + + const uint32_t *CSRMask = + TRI->getCallPreservedMask(MF, MF.getFunction().getCallingConv()); + if (CSRMask) + OtherUsedRegs.setBitsInMask(CSRMask); + + // TODO: Should include register tuples, but doesn't matter with current + // usage. + for (MCPhysReg Reg : SpillAGPR) + OtherUsedRegs.set(Reg); + for (MCPhysReg Reg : SpillVGPR) + OtherUsedRegs.set(Reg); + + SmallVectorImpl<MCPhysReg>::const_iterator NextSpillReg = Regs.begin(); + for (unsigned I = 0; I < NumLanes; ++I) { + NextSpillReg = std::find_if( + NextSpillReg, Regs.end(), [&MRI, &OtherUsedRegs](MCPhysReg Reg) { + return MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg) && + !OtherUsedRegs[Reg]; + }); + + if (NextSpillReg == Regs.end()) { // Registers exhausted + Spill.FullyAllocated = false; + break; + } + + OtherUsedRegs.set(*NextSpillReg); + SpillRegs.push_back(*NextSpillReg); + Spill.Lanes[I] = *NextSpillReg++; + } + + return Spill.FullyAllocated; } +void SIMachineFunctionInfo::removeDeadFrameIndices(MachineFrameInfo &MFI) { + // The FP spill hasn't been inserted yet, so keep it around. + for (auto &R : SGPRToVGPRSpills) { + if (R.first != FramePointerSaveIndex) + MFI.RemoveStackObject(R.first); + } -/// \returns VGPR used for \p Dim' work item ID. -unsigned SIMachineFunctionInfo::getWorkItemIDVGPR(unsigned Dim) const { - switch (Dim) { - case 0: - assert(hasWorkItemIDX()); - return AMDGPU::VGPR0; - case 1: - assert(hasWorkItemIDY()); - return AMDGPU::VGPR1; - case 2: - assert(hasWorkItemIDZ()); - return AMDGPU::VGPR2; + // All other SGPRs must be allocated on the default stack, so reset the stack + // ID.
+ for (int i = MFI.getObjectIndexBegin(), e = MFI.getObjectIndexEnd(); i != e; + ++i) + if (i != FramePointerSaveIndex) + MFI.setStackID(i, TargetStackID::Default); + + for (auto &R : VGPRToAGPRSpills) { + if (R.second.FullyAllocated) + MFI.RemoveStackObject(R.first); } - llvm_unreachable("unexpected dimension"); } MCPhysReg SIMachineFunctionInfo::getNextUserSGPR() const { @@ -330,3 +415,97 @@ MCPhysReg SIMachineFunctionInfo::getNextUserSGPR() const { MCPhysReg SIMachineFunctionInfo::getNextSystemSGPR() const { return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs; } + +static yaml::StringValue regToString(unsigned Reg, + const TargetRegisterInfo &TRI) { + yaml::StringValue Dest; + { + raw_string_ostream OS(Dest.Value); + OS << printReg(Reg, &TRI); + } + return Dest; +} + +static Optional<yaml::SIArgumentInfo> +convertArgumentInfo(const AMDGPUFunctionArgInfo &ArgInfo, + const TargetRegisterInfo &TRI) { + yaml::SIArgumentInfo AI; + + auto convertArg = [&](Optional<yaml::SIArgument> &A, + const ArgDescriptor &Arg) { + if (!Arg) + return false; + + // Create a register or stack argument. + yaml::SIArgument SA = yaml::SIArgument::createArgument(Arg.isRegister()); + if (Arg.isRegister()) { + raw_string_ostream OS(SA.RegisterName.Value); + OS << printReg(Arg.getRegister(), &TRI); + } else + SA.StackOffset = Arg.getStackOffset(); + // Check and update the optional mask. + if (Arg.isMasked()) + SA.Mask = Arg.getMask(); + + A = SA; + return true; + }; + + bool Any = false; + Any |= convertArg(AI.PrivateSegmentBuffer, ArgInfo.PrivateSegmentBuffer); + Any |= convertArg(AI.DispatchPtr, ArgInfo.DispatchPtr); + Any |= convertArg(AI.QueuePtr, ArgInfo.QueuePtr); + Any |= convertArg(AI.KernargSegmentPtr, ArgInfo.KernargSegmentPtr); + Any |= convertArg(AI.DispatchID, ArgInfo.DispatchID); + Any |= convertArg(AI.FlatScratchInit, ArgInfo.FlatScratchInit); + Any |= convertArg(AI.PrivateSegmentSize, ArgInfo.PrivateSegmentSize); + Any |= convertArg(AI.WorkGroupIDX, ArgInfo.WorkGroupIDX); + Any |= convertArg(AI.WorkGroupIDY, ArgInfo.WorkGroupIDY); + Any |= convertArg(AI.WorkGroupIDZ, ArgInfo.WorkGroupIDZ); + Any |= convertArg(AI.WorkGroupInfo, ArgInfo.WorkGroupInfo); + Any |= convertArg(AI.PrivateSegmentWaveByteOffset, + ArgInfo.PrivateSegmentWaveByteOffset); + Any |= convertArg(AI.ImplicitArgPtr, ArgInfo.ImplicitArgPtr); + Any |= convertArg(AI.ImplicitBufferPtr, ArgInfo.ImplicitBufferPtr); + Any |= convertArg(AI.WorkItemIDX, ArgInfo.WorkItemIDX); + Any |= convertArg(AI.WorkItemIDY, ArgInfo.WorkItemIDY); + Any |= convertArg(AI.WorkItemIDZ, ArgInfo.WorkItemIDZ); + + if (Any) + return AI; + + return None; +} + +yaml::SIMachineFunctionInfo::SIMachineFunctionInfo( + const llvm::SIMachineFunctionInfo& MFI, + const TargetRegisterInfo &TRI) + : ExplicitKernArgSize(MFI.getExplicitKernArgSize()), + MaxKernArgAlign(MFI.getMaxKernArgAlign()), + LDSSize(MFI.getLDSSize()), + IsEntryFunction(MFI.isEntryFunction()), + NoSignedZerosFPMath(MFI.hasNoSignedZerosFPMath()), + MemoryBound(MFI.isMemoryBound()), + WaveLimiter(MFI.needsWaveLimiter()), + ScratchRSrcReg(regToString(MFI.getScratchRSrcReg(), TRI)), + ScratchWaveOffsetReg(regToString(MFI.getScratchWaveOffsetReg(), TRI)), + FrameOffsetReg(regToString(MFI.getFrameOffsetReg(), TRI)), + StackPtrOffsetReg(regToString(MFI.getStackPtrOffsetReg(), TRI)), + ArgInfo(convertArgumentInfo(MFI.getArgInfo(), TRI)), + Mode(MFI.getMode()) {} + +void yaml::SIMachineFunctionInfo::mappingImpl(yaml::IO &YamlIO) { + MappingTraits<SIMachineFunctionInfo>::mapping(YamlIO, *this); +} + +bool 
SIMachineFunctionInfo::initializeBaseYamlFields( + const yaml::SIMachineFunctionInfo &YamlMFI) { + ExplicitKernArgSize = YamlMFI.ExplicitKernArgSize; + MaxKernArgAlign = YamlMFI.MaxKernArgAlign; + LDSSize = YamlMFI.LDSSize; + IsEntryFunction = YamlMFI.IsEntryFunction; + NoSignedZerosFPMath = YamlMFI.NoSignedZerosFPMath; + MemoryBound = YamlMFI.MemoryBound; + WaveLimiter = YamlMFI.WaveLimiter; + return false; +} diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/lib/Target/AMDGPU/SIMachineFunctionInfo.h index ef91d1e43075..f19b20ceb5da 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -1,9 +1,8 @@ //==- SIMachineFunctionInfo.h - SIMachineFunctionInfo interface --*- C++ -*-==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -16,13 +15,16 @@ #include "AMDGPUArgumentUsageInfo.h" #include "AMDGPUMachineFunction.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIInstrInfo.h" #include "SIRegisterInfo.h" -#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/Optional.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/SparseBitVector.h" +#include "llvm/CodeGen/MIRYamlMapping.h" #include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" @@ -38,12 +40,19 @@ class MachineFrameInfo; class MachineFunction; class TargetRegisterClass; -class AMDGPUImagePseudoSourceValue : public PseudoSourceValue { +class AMDGPUPseudoSourceValue : public PseudoSourceValue { public: - // TODO: Is the img rsrc useful? - explicit AMDGPUImagePseudoSourceValue(const TargetInstrInfo &TII) : - PseudoSourceValue(PseudoSourceValue::TargetCustom, TII) {} + enum AMDGPUPSVKind : unsigned { + PSVBuffer = PseudoSourceValue::TargetCustom, + PSVImage, + GWSResource + }; + +protected: + AMDGPUPseudoSourceValue(unsigned Kind, const TargetInstrInfo &TII) + : PseudoSourceValue(Kind, TII) {} +public: bool isConstant(const MachineFrameInfo *) const override { // This should probably be true for most images, but we will start by being // conservative. @@ -59,29 +68,250 @@ public: } }; -class AMDGPUBufferPseudoSourceValue : public PseudoSourceValue { +class AMDGPUBufferPseudoSourceValue final : public AMDGPUPseudoSourceValue { public: - explicit AMDGPUBufferPseudoSourceValue(const TargetInstrInfo &TII) : - PseudoSourceValue(PseudoSourceValue::TargetCustom, TII) { } + explicit AMDGPUBufferPseudoSourceValue(const TargetInstrInfo &TII) + : AMDGPUPseudoSourceValue(PSVBuffer, TII) {} - bool isConstant(const MachineFrameInfo *) const override { - // This should probably be true for most images, but we will start by being - // conservative. - return false; + static bool classof(const PseudoSourceValue *V) { + return V->kind() == PSVBuffer; } +}; +class AMDGPUImagePseudoSourceValue final : public AMDGPUPseudoSourceValue { +public: + // TODO: Is the img rsrc useful? 
+ explicit AMDGPUImagePseudoSourceValue(const TargetInstrInfo &TII) + : AMDGPUPseudoSourceValue(PSVImage, TII) {} + + static bool classof(const PseudoSourceValue *V) { + return V->kind() == PSVImage; + } +}; + +class AMDGPUGWSResourcePseudoSourceValue final : public AMDGPUPseudoSourceValue { +public: + explicit AMDGPUGWSResourcePseudoSourceValue(const TargetInstrInfo &TII) + : AMDGPUPseudoSourceValue(GWSResource, TII) {} + + static bool classof(const PseudoSourceValue *V) { + return V->kind() == GWSResource; + } + + // These are inaccessible memory from IR. bool isAliased(const MachineFrameInfo *) const override { - return true; + return false; } + // These are inaccessible memory from IR. bool mayAlias(const MachineFrameInfo *) const override { - return true; + return false; + } + + void printCustom(raw_ostream &OS) const override { + OS << "GWSResource"; + } +}; + +namespace yaml { + +struct SIArgument { + bool IsRegister; + union { + StringValue RegisterName; + unsigned StackOffset; + }; + Optional<unsigned> Mask; + + // Default constructor, which creates a stack argument. + SIArgument() : IsRegister(false), StackOffset(0) {} + SIArgument(const SIArgument &Other) { + IsRegister = Other.IsRegister; + if (IsRegister) { + ::new ((void *)std::addressof(RegisterName)) + StringValue(Other.RegisterName); + } else + StackOffset = Other.StackOffset; + Mask = Other.Mask; + } + SIArgument &operator=(const SIArgument &Other) { + IsRegister = Other.IsRegister; + if (IsRegister) { + ::new ((void *)std::addressof(RegisterName)) + StringValue(Other.RegisterName); + } else + StackOffset = Other.StackOffset; + Mask = Other.Mask; + return *this; + } + ~SIArgument() { + if (IsRegister) + RegisterName.~StringValue(); + } + + // Helper to create a register or stack argument. + static inline SIArgument createArgument(bool IsReg) { + if (IsReg) + return SIArgument(IsReg); + return SIArgument(); + } + +private: + // Construct a register argument. 
+ SIArgument(bool) : IsRegister(true), RegisterName() {} +}; + +template <> struct MappingTraits<SIArgument> { + static void mapping(IO &YamlIO, SIArgument &A) { + if (YamlIO.outputting()) { + if (A.IsRegister) + YamlIO.mapRequired("reg", A.RegisterName); + else + YamlIO.mapRequired("offset", A.StackOffset); + } else { + auto Keys = YamlIO.keys(); + if (is_contained(Keys, "reg")) { + A = SIArgument::createArgument(true); + YamlIO.mapRequired("reg", A.RegisterName); + } else if (is_contained(Keys, "offset")) + YamlIO.mapRequired("offset", A.StackOffset); + else + YamlIO.setError("missing required key 'reg' or 'offset'"); + } + YamlIO.mapOptional("mask", A.Mask); + } + static const bool flow = true; +}; + +struct SIArgumentInfo { + Optional<SIArgument> PrivateSegmentBuffer; + Optional<SIArgument> DispatchPtr; + Optional<SIArgument> QueuePtr; + Optional<SIArgument> KernargSegmentPtr; + Optional<SIArgument> DispatchID; + Optional<SIArgument> FlatScratchInit; + Optional<SIArgument> PrivateSegmentSize; + + Optional<SIArgument> WorkGroupIDX; + Optional<SIArgument> WorkGroupIDY; + Optional<SIArgument> WorkGroupIDZ; + Optional<SIArgument> WorkGroupInfo; + Optional<SIArgument> PrivateSegmentWaveByteOffset; + + Optional<SIArgument> ImplicitArgPtr; + Optional<SIArgument> ImplicitBufferPtr; + + Optional<SIArgument> WorkItemIDX; + Optional<SIArgument> WorkItemIDY; + Optional<SIArgument> WorkItemIDZ; +}; + +template <> struct MappingTraits<SIArgumentInfo> { + static void mapping(IO &YamlIO, SIArgumentInfo &AI) { + YamlIO.mapOptional("privateSegmentBuffer", AI.PrivateSegmentBuffer); + YamlIO.mapOptional("dispatchPtr", AI.DispatchPtr); + YamlIO.mapOptional("queuePtr", AI.QueuePtr); + YamlIO.mapOptional("kernargSegmentPtr", AI.KernargSegmentPtr); + YamlIO.mapOptional("dispatchID", AI.DispatchID); + YamlIO.mapOptional("flatScratchInit", AI.FlatScratchInit); + YamlIO.mapOptional("privateSegmentSize", AI.PrivateSegmentSize); + + YamlIO.mapOptional("workGroupIDX", AI.WorkGroupIDX); + YamlIO.mapOptional("workGroupIDY", AI.WorkGroupIDY); + YamlIO.mapOptional("workGroupIDZ", AI.WorkGroupIDZ); + YamlIO.mapOptional("workGroupInfo", AI.WorkGroupInfo); + YamlIO.mapOptional("privateSegmentWaveByteOffset", + AI.PrivateSegmentWaveByteOffset); + + YamlIO.mapOptional("implicitArgPtr", AI.ImplicitArgPtr); + YamlIO.mapOptional("implicitBufferPtr", AI.ImplicitBufferPtr); + + YamlIO.mapOptional("workItemIDX", AI.WorkItemIDX); + YamlIO.mapOptional("workItemIDY", AI.WorkItemIDY); + YamlIO.mapOptional("workItemIDZ", AI.WorkItemIDZ); + } +}; + +// Default to default mode for default calling convention. 
+struct SIMode { + bool IEEE = true; + bool DX10Clamp = true; + + SIMode() = default; + + + SIMode(const AMDGPU::SIModeRegisterDefaults &Mode) { + IEEE = Mode.IEEE; + DX10Clamp = Mode.DX10Clamp; } + + bool operator ==(const SIMode Other) const { + return IEEE == Other.IEEE && DX10Clamp == Other.DX10Clamp; + } +}; + +template <> struct MappingTraits<SIMode> { + static void mapping(IO &YamlIO, SIMode &Mode) { + YamlIO.mapOptional("ieee", Mode.IEEE, true); + YamlIO.mapOptional("dx10-clamp", Mode.DX10Clamp, true); + } +}; + +struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo { + uint64_t ExplicitKernArgSize = 0; + unsigned MaxKernArgAlign = 0; + unsigned LDSSize = 0; + bool IsEntryFunction = false; + bool NoSignedZerosFPMath = false; + bool MemoryBound = false; + bool WaveLimiter = false; + + StringValue ScratchRSrcReg = "$private_rsrc_reg"; + StringValue ScratchWaveOffsetReg = "$scratch_wave_offset_reg"; + StringValue FrameOffsetReg = "$fp_reg"; + StringValue StackPtrOffsetReg = "$sp_reg"; + + Optional<SIArgumentInfo> ArgInfo; + SIMode Mode; + + SIMachineFunctionInfo() = default; + SIMachineFunctionInfo(const llvm::SIMachineFunctionInfo &, + const TargetRegisterInfo &TRI); + + void mappingImpl(yaml::IO &YamlIO) override; + ~SIMachineFunctionInfo() = default; }; +template <> struct MappingTraits<SIMachineFunctionInfo> { + static void mapping(IO &YamlIO, SIMachineFunctionInfo &MFI) { + YamlIO.mapOptional("explicitKernArgSize", MFI.ExplicitKernArgSize, + UINT64_C(0)); + YamlIO.mapOptional("maxKernArgAlign", MFI.MaxKernArgAlign, 0u); + YamlIO.mapOptional("ldsSize", MFI.LDSSize, 0u); + YamlIO.mapOptional("isEntryFunction", MFI.IsEntryFunction, false); + YamlIO.mapOptional("noSignedZerosFPMath", MFI.NoSignedZerosFPMath, false); + YamlIO.mapOptional("memoryBound", MFI.MemoryBound, false); + YamlIO.mapOptional("waveLimiter", MFI.WaveLimiter, false); + YamlIO.mapOptional("scratchRSrcReg", MFI.ScratchRSrcReg, + StringValue("$private_rsrc_reg")); + YamlIO.mapOptional("scratchWaveOffsetReg", MFI.ScratchWaveOffsetReg, + StringValue("$scratch_wave_offset_reg")); + YamlIO.mapOptional("frameOffsetReg", MFI.FrameOffsetReg, + StringValue("$fp_reg")); + YamlIO.mapOptional("stackPtrOffsetReg", MFI.StackPtrOffsetReg, + StringValue("$sp_reg")); + YamlIO.mapOptional("argumentInfo", MFI.ArgInfo); + YamlIO.mapOptional("mode", MFI.Mode, SIMode()); + } +}; + +} // end namespace yaml + /// This class keeps track of the SPI_SP_INPUT_ADDR config register, which /// tells the hardware which interpolation parameters to load. class SIMachineFunctionInfo final : public AMDGPUMachineFunction { + friend class GCNTargetMachine; + unsigned TIDReg = AMDGPU::NoRegister; // Registers that may be reserved for spilling purposes. These may be the same @@ -99,6 +329,9 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction { AMDGPUFunctionArgInfo ArgInfo; + // State of MODE register, assumed FP mode. + AMDGPU::SIModeRegisterDefaults Mode; + // Graphics info. unsigned PSInputAddr = 0; unsigned PSInputEnable = 0; @@ -124,16 +357,11 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction { // unit. Minimum - first, maximum - second. std::pair<unsigned, unsigned> WavesPerEU = {0, 0}; - // Stack object indices for work group IDs. - std::array<int, 3> DebuggerWorkGroupIDStackObjectIndices = {{0, 0, 0}}; - - // Stack object indices for work item IDs. 
- std::array<int, 3> DebuggerWorkItemIDStackObjectIndices = {{0, 0, 0}}; - DenseMap<const Value *, std::unique_ptr<const AMDGPUBufferPseudoSourceValue>> BufferPSVs; DenseMap<const Value *, std::unique_ptr<const AMDGPUImagePseudoSourceValue>> ImagePSVs; + std::unique_ptr<const AMDGPUGWSResourcePseudoSourceValue> GWSResourcePSV; private: unsigned LDSWaveSpillSize = 0; @@ -182,6 +410,7 @@ private: unsigned GITPtrHigh; unsigned HighBitsOf32BitAddress; + unsigned GDSSize; // Current recorded maximum possible occupancy. unsigned Occupancy; @@ -213,6 +442,15 @@ public: SGPRSpillVGPRCSR(unsigned V, Optional<int> F) : VGPR(V), FI(F) {} }; + struct VGPRSpillToAGPR { + SmallVector<MCPhysReg, 32> Lanes; + bool FullyAllocated = false; + }; + + SparseBitVector<> WWMReservedRegs; + + void ReserveWWMRegister(unsigned reg) { WWMReservedRegs.set(reg); } + private: // SGPR->VGPR spilling support. using SpillRegMask = std::pair<unsigned, unsigned>; @@ -223,9 +461,25 @@ private: unsigned NumVGPRSpillLanes = 0; SmallVector<SGPRSpillVGPRCSR, 2> SpillVGPRs; + DenseMap<int, VGPRSpillToAGPR> VGPRToAGPRSpills; + + // AGPRs used for VGPR spills. + SmallVector<MCPhysReg, 32> SpillAGPR; + + // VGPRs used for AGPR spills. + SmallVector<MCPhysReg, 32> SpillVGPR; + +public: // FIXME + /// If this is set, an SGPR used for save/restore of the register used for the + /// frame pointer. + unsigned SGPRForFPSaveRestoreCopy = 0; + Optional<int> FramePointerSaveIndex; + public: SIMachineFunctionInfo(const MachineFunction &MF); + bool initializeBaseYamlFields(const yaml::SIMachineFunctionInfo &YamlMFI); + ArrayRef<SpilledReg> getSGPRToVGPRSpills(int FrameIndex) const { auto I = SGPRToVGPRSpills.find(FrameIndex); return (I == SGPRToVGPRSpills.end()) ? @@ -236,8 +490,29 @@ public: return SpillVGPRs; } + ArrayRef<MCPhysReg> getAGPRSpillVGPRs() const { + return SpillAGPR; + } + + ArrayRef<MCPhysReg> getVGPRSpillAGPRs() const { + return SpillVGPR; + } + + MCPhysReg getVGPRToAGPRSpill(int FrameIndex, unsigned Lane) const { + auto I = VGPRToAGPRSpills.find(FrameIndex); + return (I == VGPRToAGPRSpills.end()) ? (MCPhysReg)AMDGPU::NoRegister + : I->second.Lanes[Lane]; + } + + AMDGPU::SIModeRegisterDefaults getMode() const { + return Mode; + } + + bool haveFreeLanesForSGPRSpill(const MachineFunction &MF, + unsigned NumLane) const; bool allocateSGPRSpillToVGPR(MachineFunction &MF, int FI); - void removeSGPRToVGPRFrameIndices(MachineFrameInfo &MFI); + bool allocateVGPRSpillToAGPR(MachineFunction &MF, int FI, bool isAGPRtoVGPR); + void removeDeadFrameIndices(MachineFrameInfo &MFI); bool hasCalculatedTID() const { return TIDReg != 0; }; unsigned getTIDReg() const { return TIDReg; }; @@ -386,8 +661,9 @@ public: return ArgInfo.getPreloadedValue(Value); } - unsigned getPreloadedReg(AMDGPUFunctionArgInfo::PreloadedValue Value) const { - return ArgInfo.getPreloadedValue(Value).first->getRegister(); + Register getPreloadedReg(AMDGPUFunctionArgInfo::PreloadedValue Value) const { + auto Arg = ArgInfo.getPreloadedValue(Value).first; + return Arg ? 
Arg->getRegister() : Register(); } unsigned getGITPtrHigh() const { @@ -398,6 +674,10 @@ public: return HighBitsOf32BitAddress; } + unsigned getGDSSize() const { + return GDSSize; + } + unsigned getNumUserSGPRs() const { return NumUserSGPRs; } @@ -429,6 +709,11 @@ public: return FrameOffsetReg; } + void setFrameOffsetReg(unsigned Reg) { + assert(Reg != 0 && "Should never be unset"); + FrameOffsetReg = Reg; + } + void setStackPtrOffsetReg(unsigned Reg) { assert(Reg != 0 && "Should never be unset"); StackPtrOffsetReg = Reg; @@ -445,8 +730,6 @@ public: void setScratchWaveOffsetReg(unsigned Reg) { assert(Reg != 0 && "Should never be unset"); ScratchWaveOffsetReg = Reg; - if (isEntryFunction()) - FrameOffsetReg = ScratchWaveOffsetReg; } unsigned getQueuePtrUserSGPR() const { @@ -565,30 +848,6 @@ public: return WavesPerEU.second; } - /// \returns Stack object index for \p Dim's work group ID. - int getDebuggerWorkGroupIDStackObjectIndex(unsigned Dim) const { - assert(Dim < 3); - return DebuggerWorkGroupIDStackObjectIndices[Dim]; - } - - /// Sets stack object index for \p Dim's work group ID to \p ObjectIdx. - void setDebuggerWorkGroupIDStackObjectIndex(unsigned Dim, int ObjectIdx) { - assert(Dim < 3); - DebuggerWorkGroupIDStackObjectIndices[Dim] = ObjectIdx; - } - - /// \returns Stack object index for \p Dim's work item ID. - int getDebuggerWorkItemIDStackObjectIndex(unsigned Dim) const { - assert(Dim < 3); - return DebuggerWorkItemIDStackObjectIndices[Dim]; - } - - /// Sets stack object index for \p Dim's work item ID to \p ObjectIdx. - void setDebuggerWorkItemIDStackObjectIndex(unsigned Dim, int ObjectIdx) { - assert(Dim < 3); - DebuggerWorkItemIDStackObjectIndices[Dim] = ObjectIdx; - } - /// \returns SGPR used for \p Dim's work group ID. unsigned getWorkGroupIDSGPR(unsigned Dim) const { switch (Dim) { @@ -605,9 +864,6 @@ public: llvm_unreachable("unexpected dimension"); } - /// \returns VGPR used for \p Dim' work item ID. - unsigned getWorkItemIDVGPR(unsigned Dim) const; - unsigned getLDSWaveSpillSize() const { return LDSWaveSpillSize; } @@ -630,6 +886,15 @@ public: return PSV.first->second.get(); } + const AMDGPUGWSResourcePseudoSourceValue *getGWSPSV(const SIInstrInfo &TII) { + if (!GWSResourcePSV) { + GWSResourcePSV = + llvm::make_unique<AMDGPUGWSResourcePseudoSourceValue>(TII); + } + + return GWSResourcePSV.get(); + } + unsigned getOccupancy() const { return Occupancy; } diff --git a/lib/Target/AMDGPU/SIMachineScheduler.cpp b/lib/Target/AMDGPU/SIMachineScheduler.cpp index fb7e670068fe..ebbdf80f9567 100644 --- a/lib/Target/AMDGPU/SIMachineScheduler.cpp +++ b/lib/Target/AMDGPU/SIMachineScheduler.cpp @@ -1,9 +1,8 @@ //===-- SIMachineScheduler.cpp - SI Scheduler Interface -------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -1875,6 +1874,8 @@ void SIScheduleDAGMI::moveLowLatencies() { bool CopyForLowLat = false; for (SDep& SuccDep : SU->Succs) { SUnit *Succ = SuccDep.getSUnit(); + if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) + continue; if (SITII->isLowLatencyInstruction(*Succ->getInstr())) { CopyForLowLat = true; } @@ -1955,7 +1956,7 @@ void SIScheduleDAGMI::schedule() for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) { SUnit *SU = &SUnits[i]; - MachineOperand *BaseLatOp; + const MachineOperand *BaseLatOp; int64_t OffLatReg; if (SITII->isLowLatencyInstruction(*SU->getInstr())) { IsLowLatencySU[i] = 1; diff --git a/lib/Target/AMDGPU/SIMachineScheduler.h b/lib/Target/AMDGPU/SIMachineScheduler.h index 0ce68ac6a897..c28a7be4d03a 100644 --- a/lib/Target/AMDGPU/SIMachineScheduler.h +++ b/lib/Target/AMDGPU/SIMachineScheduler.h @@ -1,9 +1,8 @@ //===-- SIMachineScheduler.h - SI Scheduler Interface -----------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index b4a4e9e33133..4320e6c957a0 100644 --- a/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -1,9 +1,8 @@ //===- SIMemoryLegalizer.cpp ----------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -146,7 +145,7 @@ private: // only contains a single address space. if ((OrderingAddrSpace == InstrAddrSpace) && isPowerOf2_32(uint32_t(InstrAddrSpace))) - IsCrossAddressSpaceOrdering = false; + this->IsCrossAddressSpaceOrdering = false; } public: @@ -353,6 +352,40 @@ public: }; +class SIGfx10CacheControl : public SIGfx7CacheControl { +protected: + bool CuMode = false; + + /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI + /// is modified, false otherwise. 
+ bool enableDLCBit(const MachineBasicBlock::iterator &MI) const { + return enableNamedBit<AMDGPU::OpName::dlc>(MI); + } + +public: + + SIGfx10CacheControl(const GCNSubtarget &ST, bool CuMode) : + SIGfx7CacheControl(ST), CuMode(CuMode) {}; + + bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const override; + + bool enableNonTemporal(const MachineBasicBlock::iterator &MI) const override; + + bool insertCacheInvalidate(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + Position Pos) const override; + + bool insertWait(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + SIMemOp Op, + bool IsCrossAddrSpaceOrdering, + Position Pos) const override; +}; + class SIMemoryLegalizer final : public MachineFunctionPass { private: @@ -418,35 +451,46 @@ void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI, Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>> SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrScope) const { - /// TODO: For now assume OpenCL memory model which treats each - /// address space as having a separate happens-before relation, and - /// so an instruction only has ordering with respect to the address - /// space it accesses, and if it accesses multiple address spaces it - /// does not require ordering of operations in different address - /// spaces. - if (SSID == SyncScope::System) + if (SSID == SyncScope::System) + return std::make_tuple(SIAtomicScope::SYSTEM, + SIAtomicAddrSpace::ATOMIC, + true); + if (SSID == MMI->getAgentSSID()) + return std::make_tuple(SIAtomicScope::AGENT, + SIAtomicAddrSpace::ATOMIC, + true); + if (SSID == MMI->getWorkgroupSSID()) + return std::make_tuple(SIAtomicScope::WORKGROUP, + SIAtomicAddrSpace::ATOMIC, + true); + if (SSID == MMI->getWavefrontSSID()) + return std::make_tuple(SIAtomicScope::WAVEFRONT, + SIAtomicAddrSpace::ATOMIC, + true); + if (SSID == SyncScope::SingleThread) + return std::make_tuple(SIAtomicScope::SINGLETHREAD, + SIAtomicAddrSpace::ATOMIC, + true); + if (SSID == MMI->getSystemOneAddressSpaceSSID()) return std::make_tuple(SIAtomicScope::SYSTEM, SIAtomicAddrSpace::ATOMIC & InstrScope, false); - if (SSID == MMI->getAgentSSID()) + if (SSID == MMI->getAgentOneAddressSpaceSSID()) return std::make_tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC & InstrScope, false); - if (SSID == MMI->getWorkgroupSSID()) + if (SSID == MMI->getWorkgroupOneAddressSpaceSSID()) return std::make_tuple(SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC & InstrScope, false); - if (SSID == MMI->getWavefrontSSID()) + if (SSID == MMI->getWavefrontOneAddressSpaceSSID()) return std::make_tuple(SIAtomicScope::WAVEFRONT, SIAtomicAddrSpace::ATOMIC & InstrScope, false); - if (SSID == SyncScope::SingleThread) + if (SSID == MMI->getSingleThreadOneAddressSpaceSSID()) return std::make_tuple(SIAtomicScope::SINGLETHREAD, SIAtomicAddrSpace::ATOMIC & InstrScope, false); - /// TODO: To support HSA Memory Model need to add additional memory - /// scopes that specify that do require cross address space - /// ordering. 
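+ // The *-one-as scopes above order only the address space(s) actually + // accessed by the instruction (hence IsCrossAddressSpaceOrdering is false + // for them), while the default scopes order all atomic address spaces.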
return None; } @@ -613,7 +657,9 @@ std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) { GCNSubtarget::Generation Generation = ST.getGeneration(); if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS) return make_unique<SIGfx6CacheControl>(ST); - return make_unique<SIGfx7CacheControl>(ST); + if (Generation < AMDGPUSubtarget::GFX10) + return make_unique<SIGfx7CacheControl>(ST); + return make_unique<SIGfx10CacheControl>(ST, ST.isCuModeEnabled()); } bool SIGfx6CacheControl::enableLoadCacheBypass( @@ -722,13 +768,12 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI, bool VMCnt = false; bool LGKMCnt = false; - bool EXPCnt = false; if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { switch (Scope) { case SIAtomicScope::SYSTEM: case SIAtomicScope::AGENT: - VMCnt = true; + VMCnt |= true; break; case SIAtomicScope::WORKGROUP: case SIAtomicScope::WAVEFRONT: @@ -752,7 +797,7 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI, // also synchronizing with global/GDS memory as LDS operations // could be reordered with respect to later global/GDS memory // operations of the same wave. - LGKMCnt = IsCrossAddrSpaceOrdering; + LGKMCnt |= IsCrossAddrSpaceOrdering; break; case SIAtomicScope::WAVEFRONT: case SIAtomicScope::SINGLETHREAD: @@ -774,7 +819,7 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI, // also synchronizing with global/LDS memory as GDS operations // could be reordered with respect to later global/LDS memory // operations of the same wave. - EXPCnt = IsCrossAddrSpaceOrdering; + LGKMCnt |= IsCrossAddrSpaceOrdering; break; case SIAtomicScope::WORKGROUP: case SIAtomicScope::WAVEFRONT: @@ -787,11 +832,11 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI, } } - if (VMCnt || LGKMCnt || EXPCnt) { + if (VMCnt || LGKMCnt) { unsigned WaitCntImmediate = AMDGPU::encodeWaitcnt(IV, VMCnt ? 0 : getVmcntBitMask(IV), - EXPCnt ? 0 : getExpcntBitMask(IV), + getExpcntBitMask(IV), LGKMCnt ? 0 : getLgkmcntBitMask(IV)); BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate); Changed = true; @@ -851,6 +896,231 @@ bool SIGfx7CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI, return Changed; } +bool SIGfx10CacheControl::enableLoadCacheBypass( + const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const { + assert(MI->mayLoad() && !MI->mayStore()); + bool Changed = false; + + if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + /// TODO Do not set glc for rmw atomic operations as they + /// implicitly bypass the L0/L1 caches. + + switch (Scope) { + case SIAtomicScope::SYSTEM: + case SIAtomicScope::AGENT: + Changed |= enableGLCBit(MI); + Changed |= enableDLCBit(MI); + break; + case SIAtomicScope::WORKGROUP: + // In WGP mode the waves of a work-group can be executing on either CU of + // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in + // CU mode and all waves of a work-group are on the same CU, and so the + // L0 does not need to be bypassed. + if (!CuMode) Changed |= enableGLCBit(MI); + break; + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // No cache to bypass. 
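+ // (The L0 already keeps memory operations in order for work-items in + // the same wavefront, so these scopes need no cache bypass.)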
+ break; + default: + llvm_unreachable("Unsupported synchronization scope"); + } + } + + /// The scratch address space does not need the global memory caches + /// to be bypassed as all memory operations by the same thread are + /// sequentially consistent, and no other thread can access scratch + /// memory. + + /// Other address spaces do not have a cache. + + return Changed; +} + +bool SIGfx10CacheControl::enableNonTemporal( + const MachineBasicBlock::iterator &MI) const { + assert(MI->mayLoad() ^ MI->mayStore()); + bool Changed = false; + + Changed |= enableSLCBit(MI); + /// TODO for store (non-rmw atomic) instructions also enableGLCBit(MI) + + return Changed; +} + +bool SIGfx10CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + Position Pos) const { + bool Changed = false; + + MachineBasicBlock &MBB = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + + if (Pos == Position::AFTER) + ++MI; + + if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + switch (Scope) { + case SIAtomicScope::SYSTEM: + case SIAtomicScope::AGENT: + BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV)); + BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV)); + Changed = true; + break; + case SIAtomicScope::WORKGROUP: + // In WGP mode the waves of a work-group can be executing on either CU of + // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise + // in CU mode and all waves of a work-group are on the same CU, and so the + // L0 does not need to be invalidated. + if (!CuMode) { + BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV)); + Changed = true; + } + break; + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // No cache to invalidate. + break; + default: + llvm_unreachable("Unsupported synchronization scope"); + } + } + + /// The scratch address space does not need the global memory cache + /// to be flushed as all memory operations by the same thread are + /// sequentially consistent, and no other thread can access scratch + /// memory. + + /// Other address spaces do not have a cache. + + if (Pos == Position::AFTER) + --MI; + + return Changed; +} + +bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + SIMemOp Op, + bool IsCrossAddrSpaceOrdering, + Position Pos) const { + bool Changed = false; + + MachineBasicBlock &MBB = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + + if (Pos == Position::AFTER) + ++MI; + + bool VMCnt = false; + bool VSCnt = false; + bool LGKMCnt = false; + + if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + switch (Scope) { + case SIAtomicScope::SYSTEM: + case SIAtomicScope::AGENT: + if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) + VMCnt |= true; + if ((Op & SIMemOp::STORE) != SIMemOp::NONE) + VSCnt |= true; + break; + case SIAtomicScope::WORKGROUP: + // In WGP mode the waves of a work-group can be executing on either CU of + // the WGP. Therefore need to wait for operations to complete to ensure + // they are visible to waves in the other CU as the L0 is per CU. + // Otherwise in CU mode and all waves of a work-group are on the same CU + // which shares the same L0.
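+ // GFX10 counts loads and stores separately: vmcnt tracks outstanding + // loads while the new vscnt counter tracks outstanding stores, so the + // two are waited on independently here and below.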
+ if (!CuMode) { + if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) + VMCnt |= true; + if ((Op & SIMemOp::STORE) != SIMemOp::NONE) + VSCnt |= true; + } + break; + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // The L0 cache keeps all memory operations in order for + // work-items in the same wavefront. + break; + default: + llvm_unreachable("Unsupported synchronization scope"); + } + } + + if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) { + switch (Scope) { + case SIAtomicScope::SYSTEM: + case SIAtomicScope::AGENT: + case SIAtomicScope::WORKGROUP: + // If no cross address space ordering then an LDS waitcnt is not + // needed as LDS operations for all waves are executed in a + // total global ordering as observed by all waves. Required if + // also synchronizing with global/GDS memory as LDS operations + // could be reordered with respect to later global/GDS memory + // operations of the same wave. + LGKMCnt |= IsCrossAddrSpaceOrdering; + break; + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // The LDS keeps all memory operations in order for + // the same wavefront. + break; + default: + llvm_unreachable("Unsupported synchronization scope"); + } + } + + if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) { + switch (Scope) { + case SIAtomicScope::SYSTEM: + case SIAtomicScope::AGENT: + // If no cross address space ordering then a GDS waitcnt is not + // needed as GDS operations for all waves are executed in a + // total global ordering as observed by all waves. Required if + // also synchronizing with global/LDS memory as GDS operations + // could be reordered with respect to later global/LDS memory + // operations of the same wave. + LGKMCnt |= IsCrossAddrSpaceOrdering; + break; + case SIAtomicScope::WORKGROUP: + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // The GDS keeps all memory operations in order for + // the same work-group. + break; + default: + llvm_unreachable("Unsupported synchronization scope"); + } + } + + if (VMCnt || LGKMCnt) { + unsigned WaitCntImmediate = + AMDGPU::encodeWaitcnt(IV, + VMCnt ? 0 : getVmcntBitMask(IV), + getExpcntBitMask(IV), + LGKMCnt ? 0 : getLgkmcntBitMask(IV)); + BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate); + Changed = true; + } + + if (VSCnt) { + BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT)) + .addReg(AMDGPU::SGPR_NULL, RegState::Undef) + .addImm(0); + Changed = true; + } + + if (Pos == Position::AFTER) + --MI; + + return Changed; +} + bool SIMemoryLegalizer::removeAtomicPseudoMIs() { if (AtomicPseudoMIs.empty()) return false; diff --git a/lib/Target/AMDGPU/SIModeRegister.cpp b/lib/Target/AMDGPU/SIModeRegister.cpp index 883fd308f2f4..a5edd7b3554a 100644 --- a/lib/Target/AMDGPU/SIModeRegister.cpp +++ b/lib/Target/AMDGPU/SIModeRegister.cpp @@ -1,9 +1,8 @@ //===-- SIModeRegister.cpp - Mode Register --------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file @@ -45,7 +44,7 @@ struct Status { Status() : Mask(0), Mode(0){}; - Status(unsigned Mask, unsigned Mode) : Mask(Mask), Mode(Mode) { + Status(unsigned NewMask, unsigned NewMode) : Mask(NewMask), Mode(NewMode) { Mode &= Mask; }; diff --git a/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp index ebcad30a1866..3227bff20513 100644 --- a/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp +++ b/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp @@ -1,9 +1,8 @@ //===-- SIOptimizeExecMasking.cpp -----------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -57,13 +56,16 @@ char SIOptimizeExecMasking::ID = 0; char &llvm::SIOptimizeExecMaskingID = SIOptimizeExecMasking::ID; /// If \p MI is a copy from exec, return the register copied to. -static unsigned isCopyFromExec(const MachineInstr &MI) { +static unsigned isCopyFromExec(const MachineInstr &MI, const GCNSubtarget &ST) { switch (MI.getOpcode()) { case AMDGPU::COPY: case AMDGPU::S_MOV_B64: - case AMDGPU::S_MOV_B64_term: { + case AMDGPU::S_MOV_B64_term: + case AMDGPU::S_MOV_B32: + case AMDGPU::S_MOV_B32_term: { const MachineOperand &Src = MI.getOperand(1); - if (Src.isReg() && Src.getReg() == AMDGPU::EXEC) + if (Src.isReg() && + Src.getReg() == (ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC)) return MI.getOperand(0).getReg(); } } @@ -72,16 +74,20 @@ static unsigned isCopyFromExec(const MachineInstr &MI) { } /// If \p MI is a copy to exec, return the register copied from. -static unsigned isCopyToExec(const MachineInstr &MI) { +static unsigned isCopyToExec(const MachineInstr &MI, const GCNSubtarget &ST) { switch (MI.getOpcode()) { case AMDGPU::COPY: - case AMDGPU::S_MOV_B64: { + case AMDGPU::S_MOV_B64: + case AMDGPU::S_MOV_B32: { const MachineOperand &Dst = MI.getOperand(0); - if (Dst.isReg() && Dst.getReg() == AMDGPU::EXEC && MI.getOperand(1).isReg()) + if (Dst.isReg() && + Dst.getReg() == (ST.isWave32() ? 
AMDGPU::EXEC_LO : AMDGPU::EXEC) && + MI.getOperand(1).isReg()) return MI.getOperand(1).getReg(); break; } case AMDGPU::S_MOV_B64_term: + case AMDGPU::S_MOV_B32_term: llvm_unreachable("should have been replaced"); } @@ -106,6 +112,23 @@ static unsigned isLogicalOpOnExec(const MachineInstr &MI) { const MachineOperand &Src2 = MI.getOperand(2); if (Src2.isReg() && Src2.getReg() == AMDGPU::EXEC) return MI.getOperand(0).getReg(); + break; + } + case AMDGPU::S_AND_B32: + case AMDGPU::S_OR_B32: + case AMDGPU::S_XOR_B32: + case AMDGPU::S_ANDN2_B32: + case AMDGPU::S_ORN2_B32: + case AMDGPU::S_NAND_B32: + case AMDGPU::S_NOR_B32: + case AMDGPU::S_XNOR_B32: { + const MachineOperand &Src1 = MI.getOperand(1); + if (Src1.isReg() && Src1.getReg() == AMDGPU::EXEC_LO) + return MI.getOperand(0).getReg(); + const MachineOperand &Src2 = MI.getOperand(2); + if (Src2.isReg() && Src2.getReg() == AMDGPU::EXEC_LO) + return MI.getOperand(0).getReg(); + break; } } @@ -130,6 +153,22 @@ static unsigned getSaveExecOp(unsigned Opc) { return AMDGPU::S_NOR_SAVEEXEC_B64; case AMDGPU::S_XNOR_B64: return AMDGPU::S_XNOR_SAVEEXEC_B64; + case AMDGPU::S_AND_B32: + return AMDGPU::S_AND_SAVEEXEC_B32; + case AMDGPU::S_OR_B32: + return AMDGPU::S_OR_SAVEEXEC_B32; + case AMDGPU::S_XOR_B32: + return AMDGPU::S_XOR_SAVEEXEC_B32; + case AMDGPU::S_ANDN2_B32: + return AMDGPU::S_ANDN2_SAVEEXEC_B32; + case AMDGPU::S_ORN2_B32: + return AMDGPU::S_ORN2_SAVEEXEC_B32; + case AMDGPU::S_NAND_B32: + return AMDGPU::S_NAND_SAVEEXEC_B32; + case AMDGPU::S_NOR_B32: + return AMDGPU::S_NOR_SAVEEXEC_B32; + case AMDGPU::S_XNOR_B32: + return AMDGPU::S_XNOR_SAVEEXEC_B32; default: return AMDGPU::INSTRUCTION_LIST_END; } @@ -140,7 +179,8 @@ static unsigned getSaveExecOp(unsigned Opc) { // these is expected per block. static bool removeTerminatorBit(const SIInstrInfo &TII, MachineInstr &MI) { switch (MI.getOpcode()) { - case AMDGPU::S_MOV_B64_term: { + case AMDGPU::S_MOV_B64_term: + case AMDGPU::S_MOV_B32_term: { MI.setDesc(TII.get(AMDGPU::COPY)); return true; } @@ -150,12 +190,30 @@ static bool removeTerminatorBit(const SIInstrInfo &TII, MachineInstr &MI) { MI.setDesc(TII.get(AMDGPU::S_XOR_B64)); return true; } + case AMDGPU::S_XOR_B32_term: { + // This is only a terminator to get the correct spill code placement during + // register allocation. + MI.setDesc(TII.get(AMDGPU::S_XOR_B32)); + return true; + } + case AMDGPU::S_OR_B32_term: { + // This is only a terminator to get the correct spill code placement during + // register allocation. + MI.setDesc(TII.get(AMDGPU::S_OR_B32)); + return true; + } case AMDGPU::S_ANDN2_B64_term: { // This is only a terminator to get the correct spill code placement during // register allocation. MI.setDesc(TII.get(AMDGPU::S_ANDN2_B64)); return true; } + case AMDGPU::S_ANDN2_B32_term: { + // This is only a terminator to get the correct spill code placement during + // register allocation. 
+ MI.setDesc(TII.get(AMDGPU::S_ANDN2_B32)); + return true; + } default: return false; } @@ -178,6 +236,7 @@ static MachineBasicBlock::reverse_iterator fixTerminators( static MachineBasicBlock::reverse_iterator findExecCopy( const SIInstrInfo &TII, + const GCNSubtarget &ST, MachineBasicBlock &MBB, MachineBasicBlock::reverse_iterator I, unsigned CopyToExec) { @@ -185,7 +244,7 @@ static MachineBasicBlock::reverse_iterator findExecCopy( auto E = MBB.rend(); for (unsigned N = 0; N <= InstLimit && I != E; ++I, ++N) { - unsigned CopyFromExec = isCopyFromExec(*I); + unsigned CopyFromExec = isCopyFromExec(*I, ST); if (CopyFromExec != AMDGPU::NoRegister) return I; } @@ -194,8 +253,8 @@ static MachineBasicBlock::reverse_iterator findExecCopy( } // XXX - Seems LivePhysRegs doesn't work correctly since it will incorrectly -// repor tthe register as unavailable because a super-register with a lane mask -// as unavailable. +// report the register as unavailable because a super-register with a lane mask +// is unavailable. static bool isLiveOut(const MachineBasicBlock &MBB, unsigned Reg) { for (MachineBasicBlock *Succ : MBB.successors()) { if (Succ->isLiveIn(Reg)) @@ -212,6 +271,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); const SIInstrInfo *TII = ST.getInstrInfo(); + unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; // Optimize sequences emitted for control flow lowering. They are originally // emitted as the separate operations because spill code may need to be @@ -230,13 +290,13 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) { if (I == E) continue; - unsigned CopyToExec = isCopyToExec(*I); + unsigned CopyToExec = isCopyToExec(*I, ST); if (CopyToExec == AMDGPU::NoRegister) continue; // Scan backwards to find the def. auto CopyToExecInst = &*I; - auto CopyFromExecInst = findExecCopy(*TII, MBB, I, CopyToExec); + auto CopyFromExecInst = findExecCopy(*TII, ST, MBB, I, CopyToExec); if (CopyFromExecInst == E) { auto PrepareExecInst = std::next(I); if (PrepareExecInst == E) @@ -246,7 +306,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) { isLogicalOpOnExec(*PrepareExecInst) == CopyToExec) { LLVM_DEBUG(dbgs() << "Fold exec copy: " << *PrepareExecInst); - PrepareExecInst->getOperand(0).setReg(AMDGPU::EXEC); + PrepareExecInst->getOperand(0).setReg(Exec); LLVM_DEBUG(dbgs() << "into: " << *PrepareExecInst << '\n'); @@ -269,7 +329,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) { for (MachineBasicBlock::iterator J = std::next(CopyFromExecInst->getIterator()), JE = I->getIterator(); J != JE; ++J) { - if (SaveExecInst && J->readsRegister(AMDGPU::EXEC, TRI)) { + if (SaveExecInst && J->readsRegister(Exec, TRI)) { LLVM_DEBUG(dbgs() << "exec read prevents saveexec: " << *J << '\n'); // Make sure this is inserted after any VALU ops that may have been // scheduled in between. 
@@ -353,7 +413,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) { CopyToExecInst->eraseFromParent(); for (MachineInstr *OtherInst : OtherUseInsts) { - OtherInst->substituteRegister(CopyToExec, AMDGPU::EXEC, + OtherInst->substituteRegister(CopyToExec, Exec, AMDGPU::NoSubRegister, *TRI); } } diff --git a/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp b/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp index c671fed34bdf..7e10316eab92 100644 --- a/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp +++ b/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp @@ -1,9 +1,8 @@ //===-- SIOptimizeExecMaskingPreRA.cpp ------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -34,10 +33,22 @@ using namespace llvm; namespace { class SIOptimizeExecMaskingPreRA : public MachineFunctionPass { +private: + const SIRegisterInfo *TRI; + const SIInstrInfo *TII; + MachineRegisterInfo *MRI; + public: - static char ID; + MachineBasicBlock::iterator skipIgnoreExecInsts( + MachineBasicBlock::iterator I, MachineBasicBlock::iterator E) const; + + MachineBasicBlock::iterator skipIgnoreExecInstsTrivialSucc( + MachineBasicBlock *&MBB, + MachineBasicBlock::iterator It) const; public: + static char ID; + SIOptimizeExecMaskingPreRA() : MachineFunctionPass(ID) { initializeSIOptimizeExecMaskingPreRAPass(*PassRegistry::getPassRegistry()); } @@ -71,38 +82,93 @@ FunctionPass *llvm::createSIOptimizeExecMaskingPreRAPass() { return new SIOptimizeExecMaskingPreRA(); } -static bool isEndCF(const MachineInstr& MI, const SIRegisterInfo* TRI) { +static bool isEndCF(const MachineInstr &MI, const SIRegisterInfo *TRI, + const GCNSubtarget &ST) { + if (ST.isWave32()) { + return MI.getOpcode() == AMDGPU::S_OR_B32 && + MI.modifiesRegister(AMDGPU::EXEC_LO, TRI); + } + return MI.getOpcode() == AMDGPU::S_OR_B64 && MI.modifiesRegister(AMDGPU::EXEC, TRI); } -static bool isFullExecCopy(const MachineInstr& MI) { - return MI.isFullCopy() && MI.getOperand(1).getReg() == AMDGPU::EXEC; +static bool isFullExecCopy(const MachineInstr& MI, const GCNSubtarget& ST) { + unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; + + if (MI.isCopy() && MI.getOperand(1).getReg() == Exec) { + assert(MI.isFullCopy()); + return true; + } + + return false; } static unsigned getOrNonExecReg(const MachineInstr &MI, - const SIInstrInfo &TII) { + const SIInstrInfo &TII, + const GCNSubtarget& ST) { + unsigned Exec = ST.isWave32() ? 
AMDGPU::EXEC_LO : AMDGPU::EXEC; auto Op = TII.getNamedOperand(MI, AMDGPU::OpName::src1); - if (Op->isReg() && Op->getReg() != AMDGPU::EXEC) + if (Op->isReg() && Op->getReg() != Exec) return Op->getReg(); Op = TII.getNamedOperand(MI, AMDGPU::OpName::src0); - if (Op->isReg() && Op->getReg() != AMDGPU::EXEC) + if (Op->isReg() && Op->getReg() != Exec) return Op->getReg(); return AMDGPU::NoRegister; } static MachineInstr* getOrExecSource(const MachineInstr &MI, const SIInstrInfo &TII, - const MachineRegisterInfo &MRI) { - auto SavedExec = getOrNonExecReg(MI, TII); + const MachineRegisterInfo &MRI, + const GCNSubtarget& ST) { + auto SavedExec = getOrNonExecReg(MI, TII, ST); if (SavedExec == AMDGPU::NoRegister) return nullptr; auto SaveExecInst = MRI.getUniqueVRegDef(SavedExec); - if (!SaveExecInst || !isFullExecCopy(*SaveExecInst)) + if (!SaveExecInst || !isFullExecCopy(*SaveExecInst, ST)) return nullptr; return SaveExecInst; } +/// Skip over instructions that don't care about the exec mask. +MachineBasicBlock::iterator SIOptimizeExecMaskingPreRA::skipIgnoreExecInsts( + MachineBasicBlock::iterator I, MachineBasicBlock::iterator E) const { + for ( ; I != E; ++I) { + if (TII->mayReadEXEC(*MRI, *I)) + break; + } + + return I; +} + +// Skip to the next instruction, ignoring debug instructions, and trivial block +// boundaries (blocks that have one (typically fallthrough) successor, and the +// successor has one predecessor). +MachineBasicBlock::iterator +SIOptimizeExecMaskingPreRA::skipIgnoreExecInstsTrivialSucc( + MachineBasicBlock *&MBB, + MachineBasicBlock::iterator It) const { + + do { + It = skipIgnoreExecInsts(It, MBB->end()); + if (It != MBB->end() || MBB->succ_size() != 1) + break; + + // If there is one trivial successor, advance to the next block. + MachineBasicBlock *Succ = *MBB->succ_begin(); + + // TODO: Is this really necessary? + if (!MBB->isLayoutSuccessor(Succ)) + break; + + It = Succ->begin(); + MBB = Succ; + } while (true); + + return It; +} + + // Optimize sequence // %sel = V_CNDMASK_B32_e64 0, 1, %cc // %cmp = V_CMP_NE_U32 1, %1 @@ -125,10 +191,11 @@ static unsigned optimizeVcndVcmpPair(MachineBasicBlock &MBB, LiveIntervals *LIS) { const SIRegisterInfo *TRI = ST.getRegisterInfo(); const SIInstrInfo *TII = ST.getInstrInfo(); - const unsigned AndOpc = AMDGPU::S_AND_B64; - const unsigned Andn2Opc = AMDGPU::S_ANDN2_B64; - const unsigned CondReg = AMDGPU::VCC; - const unsigned ExecReg = AMDGPU::EXEC; + bool Wave32 = ST.isWave32(); + const unsigned AndOpc = Wave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; + const unsigned Andn2Opc = Wave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64; + const unsigned CondReg = Wave32 ? AMDGPU::VCC_LO : AMDGPU::VCC; + const unsigned ExecReg = Wave32 ? 
AMDGPU::EXEC_LO : AMDGPU::EXEC; auto I = llvm::find_if(MBB.terminators(), [](const MachineInstr &MI) { unsigned Opc = MI.getOpcode(); @@ -172,6 +239,10 @@ static unsigned optimizeVcndVcmpPair(MachineBasicBlock &MBB, if (!Sel || Sel->getOpcode() != AMDGPU::V_CNDMASK_B32_e64) return AMDGPU::NoRegister; + if (TII->hasModifiersSet(*Sel, AMDGPU::OpName::src0_modifiers) || + TII->hasModifiersSet(*Sel, AMDGPU::OpName::src1_modifiers)) + return AMDGPU::NoRegister; + Op1 = TII->getNamedOperand(*Sel, AMDGPU::OpName::src0); Op2 = TII->getNamedOperand(*Sel, AMDGPU::OpName::src1); MachineOperand *CC = TII->getNamedOperand(*Sel, AMDGPU::OpName::src2); @@ -187,7 +258,7 @@ static unsigned optimizeVcndVcmpPair(MachineBasicBlock &MBB, MachineInstr *Andn2 = BuildMI(MBB, *And, And->getDebugLoc(), TII->get(Andn2Opc), And->getOperand(0).getReg()) .addReg(ExecReg) - .addReg(CCReg, CC->getSubReg()); + .addReg(CCReg, 0, CC->getSubReg()); And->eraseFromParent(); LIS->InsertMachineInstrInMaps(*Andn2); @@ -224,11 +295,14 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) { return false; const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - const SIRegisterInfo *TRI = ST.getRegisterInfo(); - const SIInstrInfo *TII = ST.getInstrInfo(); + TRI = ST.getRegisterInfo(); + TII = ST.getInstrInfo(); + MRI = &MF.getRegInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); LiveIntervals *LIS = &getAnalysis<LiveIntervals>(); DenseSet<unsigned> RecalcRegs({AMDGPU::EXEC_LO, AMDGPU::EXEC_HI}); + unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; bool Changed = false; for (MachineBasicBlock &MBB : MF) { @@ -248,9 +322,10 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) { // Skip this if the endpgm has any implicit uses, otherwise we would need // to be careful to update / remove them. + // S_ENDPGM always has a single imm operand that is not used other than to + // end up in the encoding MachineInstr &Term = MBB.back(); - if (Term.getOpcode() != AMDGPU::S_ENDPGM || - Term.getNumOperands() != 0) + if (Term.getOpcode() != AMDGPU::S_ENDPGM || Term.getNumOperands() != 1) continue; SmallVector<MachineBasicBlock*, 4> Blocks({&MBB}); @@ -304,32 +379,21 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) { } // Try to collapse adjacent endifs. 
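// Roughly: an end-cf restore such as // exec = S_OR_B64 exec, %saved_inner // that is immediately followed, possibly across a trivial fallthrough // block, by another restore whose source is a full copy of exec is // redundant; the first restore can be erased and the exec copy folded.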
- auto Lead = MBB.begin(), E = MBB.end(); - if (MBB.succ_size() != 1 || Lead == E || !isEndCF(*Lead, TRI)) - continue; - - const MachineBasicBlock* Succ = *MBB.succ_begin(); - if (!MBB.isLayoutSuccessor(Succ)) - continue; - - auto I = std::next(Lead); - - for ( ; I != E; ++I) - if (!TII->isSALU(*I) || I->readsRegister(AMDGPU::EXEC, TRI)) - break; - - if (I != E) + auto E = MBB.end(); + auto Lead = skipDebugInstructionsForward(MBB.begin(), E); + if (MBB.succ_size() != 1 || Lead == E || !isEndCF(*Lead, TRI, ST)) continue; - const auto NextLead = Succ->begin(); - if (NextLead == Succ->end() || !isEndCF(*NextLead, TRI) || - !getOrExecSource(*NextLead, *TII, MRI)) + MachineBasicBlock *TmpMBB = &MBB; + auto NextLead = skipIgnoreExecInstsTrivialSucc(TmpMBB, std::next(Lead)); + if (NextLead == TmpMBB->end() || !isEndCF(*NextLead, TRI, ST) || + !getOrExecSource(*NextLead, *TII, MRI, ST)) continue; LLVM_DEBUG(dbgs() << "Redundant EXEC = S_OR_B64 found: " << *Lead << '\n'); - auto SaveExec = getOrExecSource(*Lead, *TII, MRI); - unsigned SaveExecReg = getOrNonExecReg(*Lead, *TII); + auto SaveExec = getOrExecSource(*Lead, *TII, MRI, ST); + unsigned SaveExecReg = getOrNonExecReg(*Lead, *TII, ST); for (auto &Op : Lead->operands()) { if (Op.isReg()) RecalcRegs.insert(Op.getReg()); @@ -363,7 +427,7 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) { if (SafeToReplace) { LIS->RemoveMachineInstrFromMaps(*SaveExec); SaveExec->eraseFromParent(); - MRI.replaceRegWith(SavedExec, AMDGPU::EXEC); + MRI.replaceRegWith(SavedExec, Exec); LIS->removeInterval(SavedExec); } } @@ -375,8 +439,7 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) { if (!MRI.reg_empty(Reg)) LIS->createAndComputeVirtRegInterval(Reg); } else { - for (MCRegUnitIterator U(Reg, TRI); U.isValid(); ++U) - LIS->removeRegUnit(*U); + LIS->removeAllRegUnitsForPhysReg(Reg); } } } diff --git a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index 2d43d5d05ef6..2d71abc0612a 100644 --- a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -1,9 +1,8 @@ //===- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions ---===// // -// The LLVM Compiler Infrastructure -- // -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -348,8 +347,8 @@ uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII, if (Abs || Neg) { assert(!Sext && "Float and integer src modifiers can't be set simultaneously"); - Mods |= Abs ? SISrcMods::ABS : 0; - Mods ^= Neg ? SISrcMods::NEG : 0; + Mods |= Abs ? SISrcMods::ABS : 0u; + Mods ^= Neg ? 
SISrcMods::NEG : 0u; } else if (Sext) { Mods |= SISrcMods::SEXT; } @@ -419,7 +418,9 @@ bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) { } assert(Src && Src->isReg()); - if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa || + if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa || + MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa || + MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa || MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) && !isSameReg(*Src, *getReplacedOperand())) { // In case of v_mac_f16/32_sdwa this pass can try to apply src operand to @@ -461,7 +462,9 @@ MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII) { bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) { // Replace vdst operand in MI with target operand. Set dst_sel and dst_unused - if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa || + if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa || + MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa || + MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa || MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) && getDstSel() != AMDGPU::SDWA::DWORD) { // v_mac_f16/32_sdwa allow dst_sel to be equal only to DWORD @@ -951,7 +954,8 @@ bool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI, if (TII->isVOPC(Opc)) { if (!ST.hasSDWASdst()) { const MachineOperand *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst); - if (SDst && SDst->getReg() != AMDGPU::VCC) + if (SDst && (SDst->getReg() != AMDGPU::VCC && + SDst->getReg() != AMDGPU::VCC_LO)) return false; } @@ -965,10 +969,16 @@ bool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI, return false; } - if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_MAC_F16_e32 || + if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_FMAC_F16_e32 || + Opc == AMDGPU::V_FMAC_F32_e32 || + Opc == AMDGPU::V_MAC_F16_e32 || Opc == AMDGPU::V_MAC_F32_e32)) return false; + // Check if target supports this SDWA opcode + if (TII->pseudoToMCOpcode(Opc) == -1) + return false; + // FIXME: has SDWA but require handling of implicit VCC use if (Opc == AMDGPU::V_CNDMASK_B32_e32) return false; @@ -1010,7 +1020,7 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI, SDWAInst.add(*Dst); } else { assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1); - SDWAInst.addReg(AMDGPU::VCC, RegState::Define); + SDWAInst.addReg(TRI->getVCC(), RegState::Define); } // Copy src0, initialize src0_modifiers. All sdwa instructions has src0 and @@ -1039,7 +1049,9 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI, SDWAInst.add(*Src1); } - if (SDWAOpcode == AMDGPU::V_MAC_F16_sdwa || + if (SDWAOpcode == AMDGPU::V_FMAC_F16_sdwa || + SDWAOpcode == AMDGPU::V_FMAC_F32_sdwa || + SDWAOpcode == AMDGPU::V_MAC_F16_sdwa || SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) { // v_mac_f16/32 has additional src2 operand tied to vdst MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2); diff --git a/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp b/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp new file mode 100644 index 000000000000..f9bfe96f65cb --- /dev/null +++ b/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp @@ -0,0 +1,221 @@ +//===- SIPreAllocateWWMRegs.cpp - WWM Register Pre-allocation -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// Pass to pre-allocate WWM registers +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "SIInstrInfo.h" +#include "SIRegisterInfo.h" +#include "SIMachineFunctionInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/CodeGen/VirtRegMap.h" +#include "llvm/CodeGen/LiveInterval.h" +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/LiveRegMatrix.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/RegisterClassInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "si-pre-allocate-wwm-regs" + +namespace { + +class SIPreAllocateWWMRegs : public MachineFunctionPass { +private: + const SIInstrInfo *TII; + const SIRegisterInfo *TRI; + MachineRegisterInfo *MRI; + LiveIntervals *LIS; + LiveRegMatrix *Matrix; + VirtRegMap *VRM; + RegisterClassInfo RegClassInfo; + + std::vector<unsigned> RegsToRewrite; + +public: + static char ID; + + SIPreAllocateWWMRegs() : MachineFunctionPass(ID) { + initializeSIPreAllocateWWMRegsPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<LiveIntervals>(); + AU.addPreserved<LiveIntervals>(); + AU.addRequired<VirtRegMap>(); + AU.addRequired<LiveRegMatrix>(); + AU.addPreserved<SlotIndexes>(); + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } + +private: + bool processDef(MachineOperand &MO); + void rewriteRegs(MachineFunction &MF); +}; + +} // End anonymous namespace.
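+ +// Pass registration boilerplate: this publishes the pass under its +// DEBUG_TYPE command-line name and declares the analyses it depends on, +// matching getAnalysisUsage() above. A target then schedules it by ID, +// e.g. (sketch; the exact pass-config hook varies): +// addPass(&SIPreAllocateWWMRegsID);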
+ +INITIALIZE_PASS_BEGIN(SIPreAllocateWWMRegs, DEBUG_TYPE, + "SI Pre-allocate WWM Registers", false, false) +INITIALIZE_PASS_DEPENDENCY(LiveIntervals) +INITIALIZE_PASS_DEPENDENCY(VirtRegMap) +INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix) +INITIALIZE_PASS_END(SIPreAllocateWWMRegs, DEBUG_TYPE, + "SI Pre-allocate WWM Registers", false, false) + +char SIPreAllocateWWMRegs::ID = 0; + +char &llvm::SIPreAllocateWWMRegsID = SIPreAllocateWWMRegs::ID; + +FunctionPass *llvm::createSIPreAllocateWWMRegsPass() { + return new SIPreAllocateWWMRegs(); +} + +bool SIPreAllocateWWMRegs::processDef(MachineOperand &MO) { + if (!MO.isReg()) + return false; + + unsigned Reg = MO.getReg(); + + if (!TRI->isVGPR(*MRI, Reg)) + return false; + + if (TRI->isPhysicalRegister(Reg)) + return false; + + if (VRM->hasPhys(Reg)) + return false; + + LiveInterval &LI = LIS->getInterval(Reg); + + for (unsigned PhysReg : RegClassInfo.getOrder(MRI->getRegClass(Reg))) { + if (!MRI->isPhysRegUsed(PhysReg) && + Matrix->checkInterference(LI, PhysReg) == LiveRegMatrix::IK_Free) { + Matrix->assign(LI, PhysReg); + assert(PhysReg != 0); + RegsToRewrite.push_back(Reg); + return true; + } + } + + llvm_unreachable("physreg not found for WWM expression"); + return false; +} + +void SIPreAllocateWWMRegs::rewriteRegs(MachineFunction &MF) { + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + for (MachineOperand &MO : MI.operands()) { + if (!MO.isReg()) + continue; + + const unsigned VirtReg = MO.getReg(); + if (TRI->isPhysicalRegister(VirtReg)) + continue; + + if (!VRM->hasPhys(VirtReg)) + continue; + + unsigned PhysReg = VRM->getPhys(VirtReg); + const unsigned SubReg = MO.getSubReg(); + if (SubReg != 0) { + PhysReg = TRI->getSubReg(PhysReg, SubReg); + MO.setSubReg(0); + } + + MO.setReg(PhysReg); + MO.setIsRenamable(false); + } + } + } + + SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + + for (unsigned Reg : RegsToRewrite) { + LIS->removeInterval(Reg); + + const unsigned PhysReg = VRM->getPhys(Reg); + assert(PhysReg != 0); + MFI->ReserveWWMRegister(PhysReg); + } + + RegsToRewrite.clear(); + + // Update the set of reserved registers to include WWM ones. + MRI->freezeReservedRegs(MF); +} + +bool SIPreAllocateWWMRegs::runOnMachineFunction(MachineFunction &MF) { + LLVM_DEBUG(dbgs() << "SIPreAllocateWWMRegs: function " << MF.getName() << "\n"); + + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + + TII = ST.getInstrInfo(); + TRI = &TII->getRegisterInfo(); + MRI = &MF.getRegInfo(); + + LIS = &getAnalysis<LiveIntervals>(); + Matrix = &getAnalysis<LiveRegMatrix>(); + VRM = &getAnalysis<VirtRegMap>(); + + RegClassInfo.runOnMachineFunction(MF); + + bool RegsAssigned = false; + + // We use a reverse post-order traversal of the control-flow graph to + // guarantee that we visit definitions in dominance order. Since WWM + // expressions are guaranteed to never involve phi nodes, and we can only + // escape WWM through the special WWM instruction, this means that this is a + // perfect elimination order, so we can never do any better. 
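+ // (A reverse post-order visits every block before any of its successors, + // back edges aside, so each def is processed before the uses it + // dominates.)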
+ ReversePostOrderTraversal<MachineFunction*> RPOT(&MF); + + for (MachineBasicBlock *MBB : RPOT) { + bool InWWM = false; + for (MachineInstr &MI : *MBB) { + if (MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B32 || + MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64) + RegsAssigned |= processDef(MI.getOperand(0)); + + if (MI.getOpcode() == AMDGPU::ENTER_WWM) { + LLVM_DEBUG(dbgs() << "entering WWM region: " << MI << "\n"); + InWWM = true; + continue; + } + + if (MI.getOpcode() == AMDGPU::EXIT_WWM) { + LLVM_DEBUG(dbgs() << "exiting WWM region: " << MI << "\n"); + InWWM = false; + } + + if (!InWWM) + continue; + + LLVM_DEBUG(dbgs() << "processing " << MI << "\n"); + + for (MachineOperand &DefOpnd : MI.defs()) { + RegsAssigned |= processDef(DefOpnd); + } + } + } + + if (!RegsAssigned) + return false; + + rewriteRegs(MF); + return true; +} diff --git a/lib/Target/AMDGPU/SIProgramInfo.h b/lib/Target/AMDGPU/SIProgramInfo.h index 383f6b575808..168f05f8fdd6 100644 --- a/lib/Target/AMDGPU/SIProgramInfo.h +++ b/lib/Target/AMDGPU/SIProgramInfo.h @@ -1,9 +1,8 @@ //===--- SIProgramInfo.h ----------------------------------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -29,6 +28,8 @@ struct SIProgramInfo { uint32_t DX10Clamp = 0; uint32_t DebugMode = 0; uint32_t IEEEMode = 0; + uint32_t WgpMode = 0; // GFX10+ + uint32_t MemOrdered = 0; // GFX10+ uint64_t ScratchSize = 0; uint64_t ComputePGMRSrc1 = 0; @@ -50,18 +51,6 @@ struct SIProgramInfo { // Number of VGPRs that meets number of waves per execution unit request. uint32_t NumVGPRsForWavesPerEU = 0; - // Fixed SGPR number used to hold wave scratch offset for entire kernel - // execution, or std::numeric_limits<uint16_t>::max() if the register is not - // used or not known. - uint16_t DebuggerWavefrontPrivateSegmentOffsetSGPR = - std::numeric_limits<uint16_t>::max(); - - // Fixed SGPR number of the first 4 SGPRs used to hold scratch V# for entire - // kernel execution, or std::numeric_limits<uint16_t>::max() if the register - // is not used or not known. - uint16_t DebuggerPrivateSegmentBufferSGPR = - std::numeric_limits<uint16_t>::max(); - // Whether there is recursion, dynamic allocas, indirect calls or some other // reason there may be statically unknown stack usage. bool DynamicCallStack = false; diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp index 97cfde2b2354..f152deb28004 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -1,9 +1,8 @@ //===-- SIRegisterInfo.cpp - SI Register Information ---------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -17,6 +16,7 @@ #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" +#include "MCTargetDesc/AMDGPUInstPrinter.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/MachineDominators.h" @@ -63,8 +63,10 @@ SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) : AMDGPURegisterInfo(), SGPRPressureSets(getNumRegPressureSets()), VGPRPressureSets(getNumRegPressureSets()), + AGPRPressureSets(getNumRegPressureSets()), SpillSGPRToVGPR(false), - SpillSGPRToSMEM(false) { + SpillSGPRToSMEM(false), + isWave32(ST.isWave32()) { if (EnableSpillSGPRToSMEM && ST.hasScalarStores()) SpillSGPRToSMEM = true; else if (EnableSpillSGPRToVGPR) @@ -74,10 +76,12 @@ SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) : SGPRSetID = NumRegPressureSets; VGPRSetID = NumRegPressureSets; + AGPRSetID = NumRegPressureSets; for (unsigned i = 0; i < NumRegPressureSets; ++i) { classifyPressureSet(i, AMDGPU::SGPR0, SGPRPressureSets); classifyPressureSet(i, AMDGPU::VGPR0, VGPRPressureSets); + classifyPressureSet(i, AMDGPU::AGPR0, AGPRPressureSets); } // Determine the number of reg units for each pressure set. @@ -89,7 +93,7 @@ SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) : } } - unsigned VGPRMax = 0, SGPRMax = 0; + unsigned VGPRMax = 0, SGPRMax = 0, AGPRMax = 0; for (unsigned i = 0; i < NumRegPressureSets; ++i) { if (isVGPRPressureSet(i) && PressureSetRegUnits[i] > VGPRMax) { VGPRSetID = i; @@ -100,10 +104,16 @@ SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) : SGPRSetID = i; SGPRMax = PressureSetRegUnits[i]; } + if (isAGPRPressureSet(i) && PressureSetRegUnits[i] > AGPRMax) { + AGPRSetID = i; + AGPRMax = PressureSetRegUnits[i]; + continue; + } } assert(SGPRSetID < NumRegPressureSets && - VGPRSetID < NumRegPressureSets); + VGPRSetID < NumRegPressureSets && + AGPRSetID < NumRegPressureSets); } unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg( @@ -139,11 +149,6 @@ unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg( return AMDGPU::SGPR_32RegClass.getRegister(Reg); } -unsigned SIRegisterInfo::reservedStackPtrOffsetReg( - const MachineFunction &MF) const { - return AMDGPU::SGPR32; -} - BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); @@ -155,15 +160,26 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { // M0 has to be reserved so that llvm accepts it as a live-in into a block. reserveRegisterTuples(Reserved, AMDGPU::M0); + // Reserve src_vccz, src_execz, src_scc. + reserveRegisterTuples(Reserved, AMDGPU::SRC_VCCZ); + reserveRegisterTuples(Reserved, AMDGPU::SRC_EXECZ); + reserveRegisterTuples(Reserved, AMDGPU::SRC_SCC); + // Reserve the memory aperture registers. reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE); reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT); reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE); reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT); + // Reserve src_pops_exiting_wave_id - support is not implemented in Codegen. + reserveRegisterTuples(Reserved, AMDGPU::SRC_POPS_EXITING_WAVE_ID); + // Reserve xnack_mask registers - support is not implemented in Codegen. reserveRegisterTuples(Reserved, AMDGPU::XNACK_MASK); + // Reserve lds_direct register - support is not implemented in Codegen. 
+ reserveRegisterTuples(Reserved, AMDGPU::LDS_DIRECT); + // Reserve Trap Handler registers - support is not implemented in Codegen. reserveRegisterTuples(Reserved, AMDGPU::TBA); reserveRegisterTuples(Reserved, AMDGPU::TMA); @@ -176,6 +192,16 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { reserveRegisterTuples(Reserved, AMDGPU::TTMP12_TTMP13); reserveRegisterTuples(Reserved, AMDGPU::TTMP14_TTMP15); + // Reserve null register - it shall never be allocated + reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL); + + // Disallow vcc_hi allocation in wave32. It may be allocated but most likely + // will result in bugs. + if (isWave32) { + Reserved.set(AMDGPU::VCC); + Reserved.set(AMDGPU::VCC_HI); + } + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF); @@ -190,6 +216,8 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) { unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i); reserveRegisterTuples(Reserved, Reg); + Reg = AMDGPU::AGPR_32RegClass.getRegister(i); + reserveRegisterTuples(Reserved, Reg); } const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); @@ -225,9 +253,33 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { assert(!isSubRegister(ScratchRSrcReg, FrameReg)); } + for (unsigned Reg : MFI->WWMReservedRegs) { + reserveRegisterTuples(Reserved, Reg); + } + + // FIXME: Stop using reserved registers for this. + for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs()) + reserveRegisterTuples(Reserved, Reg); + + for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs()) + reserveRegisterTuples(Reserved, Reg); + return Reserved; } +bool SIRegisterInfo::canRealignStack(const MachineFunction &MF) const { + const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + // On entry, the base address is 0, so it can't possibly need any more + // alignment. + + // FIXME: Should be able to specify the entry frame alignment per calling + // convention instead. + if (Info->isEntryFunction()) + return false; + + return TargetRegisterInfo::canRealignStack(MF); +} + bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const { const SIMachineFunctionInfo *Info = Fn.getInfo<SIMachineFunctionInfo>(); if (Info->isEntryFunction()) { @@ -252,11 +304,20 @@ bool SIRegisterInfo::requiresFrameIndexScavenging( bool SIRegisterInfo::requiresFrameIndexReplacementScavenging( const MachineFunction &MF) const { - // m0 is needed for the scalar store offset. m0 is unallocatable, so we can't - // create a virtual register for it during frame index elimination, so the - // scavenger is directly needed. - return MF.getFrameInfo().hasStackObjects() && - MF.getSubtarget<GCNSubtarget>().hasScalarStores() && + const MachineFrameInfo &MFI = MF.getFrameInfo(); + if (!MFI.hasStackObjects()) + return false; + + // The scavenger is used for large frames which may require finding a free + // register for large offsets. + if (!isUInt<12>(MFI.getStackSize())) + return true; + + // If using scalar stores, for spills, m0 is needed for the scalar store + // offset (pre-GFX9). m0 is unallocatable, so we can't create a virtual + // register for it during frame index elimination, so the scavenger is + // directly needed. 
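+ // (isUInt<12> matches the 12-bit unsigned immediate offset field of + // MUBUF scratch accesses; frames larger than that need the offset held + // in a scavenged register.)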
+ return MF.getSubtarget<GCNSubtarget>().hasScalarStores() && MF.getInfo<SIMachineFunctionInfo>()->hasSpilledSGPRs(); } @@ -332,7 +393,8 @@ void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, TII->getAddNoCarry(*MBB, Ins, DL, BaseReg) .addReg(OffsetReg, RegState::Kill) - .addReg(FIReg); + .addReg(FIReg) + .addImm(0); // clamp bit } void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, @@ -394,21 +456,39 @@ const TargetRegisterClass *SIRegisterInfo::getPointerRegClass( static unsigned getNumSubRegsForSpillOp(unsigned Op) { switch (Op) { + case AMDGPU::SI_SPILL_S1024_SAVE: + case AMDGPU::SI_SPILL_S1024_RESTORE: + case AMDGPU::SI_SPILL_V1024_SAVE: + case AMDGPU::SI_SPILL_V1024_RESTORE: + case AMDGPU::SI_SPILL_A1024_SAVE: + case AMDGPU::SI_SPILL_A1024_RESTORE: + return 32; case AMDGPU::SI_SPILL_S512_SAVE: case AMDGPU::SI_SPILL_S512_RESTORE: case AMDGPU::SI_SPILL_V512_SAVE: case AMDGPU::SI_SPILL_V512_RESTORE: + case AMDGPU::SI_SPILL_A512_SAVE: + case AMDGPU::SI_SPILL_A512_RESTORE: return 16; case AMDGPU::SI_SPILL_S256_SAVE: case AMDGPU::SI_SPILL_S256_RESTORE: case AMDGPU::SI_SPILL_V256_SAVE: case AMDGPU::SI_SPILL_V256_RESTORE: return 8; + case AMDGPU::SI_SPILL_S160_SAVE: + case AMDGPU::SI_SPILL_S160_RESTORE: + case AMDGPU::SI_SPILL_V160_SAVE: + case AMDGPU::SI_SPILL_V160_RESTORE: + return 5; case AMDGPU::SI_SPILL_S128_SAVE: case AMDGPU::SI_SPILL_S128_RESTORE: case AMDGPU::SI_SPILL_V128_SAVE: case AMDGPU::SI_SPILL_V128_RESTORE: + case AMDGPU::SI_SPILL_A128_SAVE: + case AMDGPU::SI_SPILL_A128_RESTORE: return 4; + case AMDGPU::SI_SPILL_S96_SAVE: + case AMDGPU::SI_SPILL_S96_RESTORE: case AMDGPU::SI_SPILL_V96_SAVE: case AMDGPU::SI_SPILL_V96_RESTORE: return 3; @@ -416,11 +496,15 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) { case AMDGPU::SI_SPILL_S64_RESTORE: case AMDGPU::SI_SPILL_V64_SAVE: case AMDGPU::SI_SPILL_V64_RESTORE: + case AMDGPU::SI_SPILL_A64_SAVE: + case AMDGPU::SI_SPILL_A64_RESTORE: return 2; case AMDGPU::SI_SPILL_S32_SAVE: case AMDGPU::SI_SPILL_S32_RESTORE: case AMDGPU::SI_SPILL_V32_SAVE: case AMDGPU::SI_SPILL_V32_RESTORE: + case AMDGPU::SI_SPILL_A32_SAVE: + case AMDGPU::SI_SPILL_A32_RESTORE: return 1; default: llvm_unreachable("Invalid spill opcode"); } @@ -480,6 +564,35 @@ static int getOffsetMUBUFLoad(unsigned Opc) { } } +static MachineInstrBuilder spillVGPRtoAGPR(MachineBasicBlock::iterator MI, + int Index, + unsigned Lane, + unsigned ValueReg, + bool IsKill) { + MachineBasicBlock *MBB = MI->getParent(); + MachineFunction *MF = MI->getParent()->getParent(); + SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); + const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); + const SIInstrInfo *TII = ST.getInstrInfo(); + + MCPhysReg Reg = MFI->getVGPRToAGPRSpill(Index, Lane); + + if (Reg == AMDGPU::NoRegister) + return MachineInstrBuilder(); + + bool IsStore = MI->mayStore(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + auto *TRI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo()); + + unsigned Dst = IsStore ? Reg : ValueReg; + unsigned Src = IsStore ? ValueReg : Reg; + unsigned Opc = (IsStore ^ TRI->isVGPR(MRI, Reg)) ? AMDGPU::V_ACCVGPR_WRITE_B32 + : AMDGPU::V_ACCVGPR_READ_B32; + + return BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(Opc), Dst) + .addReg(Src, getKillRegState(IsKill)); +} + // This differs from buildSpillLoadStore by only scavenging a VGPR. It does not // need to handle the case where an SGPR may need to be spilled while spilling. 
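// (If spillVGPRtoAGPR above has mapped this stack slot to an AGPR lane, // the access becomes a single v_accvgpr_write/read register copy and no // MUBUF instruction is emitted at all.)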
static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII, @@ -498,6 +611,9 @@ static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII, return false; const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata); + if (spillVGPRtoAGPR(MI, Index, 0, Reg->getReg(), false).getInstr()) + return true; + MachineInstrBuilder NewMI = BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp)) .add(*Reg) @@ -507,6 +623,7 @@ static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII, .addImm(0) // glc .addImm(0) // slc .addImm(0) // tfe + .addImm(0) // dlc .cloneMemRefs(*MI); const MachineOperand *VDataIn = TII->getNamedOperand(*MI, @@ -549,6 +666,10 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, unsigned Align = MFI.getObjectAlignment(Index); const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo(); + Register TmpReg = + hasAGPRs(RC) ? TII->getNamedOperand(*MI, AMDGPU::OpName::tmp)->getReg() + : Register(); + assert((Offset % EltSize) == 0 && "unexpected VGPR spill offset"); if (!isUInt<12>(Offset + Size - EltSize)) { @@ -562,7 +683,7 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, // We don't have access to the register scavenger if this function is called // during PEI::scavengeFrameVirtualRegs(). if (RS) - SOffset = RS->FindUnusedReg(&AMDGPU::SGPR_32RegClass); + SOffset = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0, false); if (SOffset == AMDGPU::NoRegister) { // There are no free SGPRs, and since we are in the process of spilling @@ -597,20 +718,38 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, SrcDstRegState |= getKillRegState(IsKill); } - MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(EltSize * i); - MachineMemOperand *NewMMO - = MF->getMachineMemOperand(PInfo, MMO->getFlags(), - EltSize, MinAlign(Align, EltSize * i)); - - auto MIB = BuildMI(*MBB, MI, DL, Desc) - .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill)) - .addReg(ScratchRsrcReg) - .addReg(SOffset, SOffsetRegState) - .addImm(Offset) - .addImm(0) // glc - .addImm(0) // slc - .addImm(0) // tfe - .addMemOperand(NewMMO); + auto MIB = spillVGPRtoAGPR(MI, Index, i, SubReg, IsKill); + + if (!MIB.getInstr()) { + unsigned FinalReg = SubReg; + if (TmpReg != AMDGPU::NoRegister) { + if (IsStore) + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_READ_B32), TmpReg) + .addReg(SubReg, getKillRegState(IsKill)); + SubReg = TmpReg; + } + + MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(EltSize * i); + MachineMemOperand *NewMMO + = MF->getMachineMemOperand(PInfo, MMO->getFlags(), + EltSize, MinAlign(Align, EltSize * i)); + + MIB = BuildMI(*MBB, MI, DL, Desc) + .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill)) + .addReg(ScratchRsrcReg) + .addReg(SOffset, SOffsetRegState) + .addImm(Offset) + .addImm(0) // glc + .addImm(0) // slc + .addImm(0) // tfe + .addImm(0) // dlc + .addMemOperand(NewMMO); + + if (!IsStore && TmpReg != AMDGPU::NoRegister) + MIB = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32), + FinalReg) + .addReg(TmpReg, RegState::Kill); + } if (NumSubRegs > 1) MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState); @@ -669,6 +808,8 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, if (SpillToSMEM && OnlyToVGPR) return false; + Register FrameReg = getFrameRegister(*MF); + assert(SpillToVGPR || (SuperReg != MFI->getStackPtrOffsetReg() && SuperReg != MFI->getFrameOffsetReg() && SuperReg != MFI->getScratchWaveOffsetReg())); @@ -728,11 +869,11 @@ bool 
SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
     int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i);
     if (Offset != 0) {
       BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg)
-        .addReg(MFI->getFrameOffsetReg())
+        .addReg(FrameReg)
         .addImm(Offset);
     } else {
       BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
-        .addReg(MFI->getFrameOffsetReg());
+        .addReg(FrameReg);
     }

     BuildMI(*MBB, MI, DL, TII->get(ScalarStoreOp))
@@ -740,6 +881,7 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
       .addReg(MFI->getScratchRSrcReg())  // sbase
       .addReg(OffsetReg, RegState::Kill) // soff
       .addImm(0)                         // glc
+      .addImm(0)                         // dlc
       .addMemOperand(MMO);

     continue;
@@ -799,11 +941,11 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
       = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
                                  EltSize, MinAlign(Align, EltSize * i));
     BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_SAVE))
-      .addReg(TmpReg, RegState::Kill)    // src
-      .addFrameIndex(Index)              // vaddr
-      .addReg(MFI->getScratchRSrcReg())  // srsrc
-      .addReg(MFI->getFrameOffsetReg())  // soffset
-      .addImm(i * 4)                     // offset
+      .addReg(TmpReg, RegState::Kill)       // src
+      .addFrameIndex(Index)                 // vaddr
+      .addReg(MFI->getScratchRSrcReg())     // srsrc
+      .addReg(MFI->getStackPtrOffsetReg())  // soffset
+      .addImm(i * 4)                        // offset
       .addMemOperand(MMO);
   }
 }
@@ -859,6 +1001,8 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
   unsigned EltSize = 4;
   unsigned ScalarLoadOp;

+  Register FrameReg = getFrameRegister(*MF);
+
   const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
   if (SpillToSMEM && isSGPRClass(RC)) {
     // XXX - if private_element_size is larger than 4 it might be useful to be
@@ -890,18 +1034,19 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
     int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i);
     if (Offset != 0) {
       BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg)
-        .addReg(MFI->getFrameOffsetReg())
+        .addReg(FrameReg)
         .addImm(Offset);
     } else {
       BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
-        .addReg(MFI->getFrameOffsetReg());
+        .addReg(FrameReg);
     }

     auto MIB = BuildMI(*MBB, MI, DL, TII->get(ScalarLoadOp), SubReg)
-      .addReg(MFI->getScratchRSrcReg())  // sbase
-      .addReg(OffsetReg, RegState::Kill) // soff
-      .addImm(0)                         // glc
+      .addReg(MFI->getScratchRSrcReg())  // sbase
+      .addReg(OffsetReg, RegState::Kill) // soff
+      .addImm(0)                         // glc
+      .addImm(0)                         // dlc
       .addMemOperand(MMO);

     if (NumSubRegs > 1 && i == 0)
@@ -937,10 +1082,10 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
                                MinAlign(Align, EltSize * i));

       BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpReg)
-        .addFrameIndex(Index)              // vaddr
-        .addReg(MFI->getScratchRSrcReg())  // srsrc
-        .addReg(MFI->getFrameOffsetReg())  // soffset
-        .addImm(i * 4)                     // offset
+        .addFrameIndex(Index)                 // vaddr
+        .addReg(MFI->getScratchRSrcReg())     // srsrc
+        .addReg(MFI->getStackPtrOffsetReg())  // soffset
+        .addImm(i * 4)                        // offset
         .addMemOperand(MMO);

       auto MIB =
@@ -969,15 +1114,21 @@ bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex(
   int FI,
   RegScavenger *RS) const {
   switch (MI->getOpcode()) {
+  case AMDGPU::SI_SPILL_S1024_SAVE:
   case AMDGPU::SI_SPILL_S512_SAVE:
   case AMDGPU::SI_SPILL_S256_SAVE:
+  case AMDGPU::SI_SPILL_S160_SAVE:
   case AMDGPU::SI_SPILL_S128_SAVE:
+  case AMDGPU::SI_SPILL_S96_SAVE:
   case AMDGPU::SI_SPILL_S64_SAVE:
   case AMDGPU::SI_SPILL_S32_SAVE:
     return spillSGPR(MI, FI, RS, true);
+  case AMDGPU::SI_SPILL_S1024_RESTORE:
  case
AMDGPU::SI_SPILL_S512_RESTORE: case AMDGPU::SI_SPILL_S256_RESTORE: + case AMDGPU::SI_SPILL_S160_RESTORE: case AMDGPU::SI_SPILL_S128_RESTORE: + case AMDGPU::SI_SPILL_S96_RESTORE: case AMDGPU::SI_SPILL_S64_RESTORE: case AMDGPU::SI_SPILL_S32_RESTORE: return restoreSGPR(MI, FI, RS, true); @@ -998,14 +1149,21 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, const SIInstrInfo *TII = ST.getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); + assert(SPAdj == 0 && "unhandled SP adjustment in call sequence?"); + MachineOperand &FIOp = MI->getOperand(FIOperandNum); int Index = MI->getOperand(FIOperandNum).getIndex(); + Register FrameReg = getFrameRegister(*MF); + switch (MI->getOpcode()) { // SGPR register spill + case AMDGPU::SI_SPILL_S1024_SAVE: case AMDGPU::SI_SPILL_S512_SAVE: case AMDGPU::SI_SPILL_S256_SAVE: + case AMDGPU::SI_SPILL_S160_SAVE: case AMDGPU::SI_SPILL_S128_SAVE: + case AMDGPU::SI_SPILL_S96_SAVE: case AMDGPU::SI_SPILL_S64_SAVE: case AMDGPU::SI_SPILL_S32_SAVE: { spillSGPR(MI, Index, RS); @@ -1013,9 +1171,12 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, } // SGPR register restore + case AMDGPU::SI_SPILL_S1024_RESTORE: case AMDGPU::SI_SPILL_S512_RESTORE: case AMDGPU::SI_SPILL_S256_RESTORE: + case AMDGPU::SI_SPILL_S160_RESTORE: case AMDGPU::SI_SPILL_S128_RESTORE: + case AMDGPU::SI_SPILL_S96_RESTORE: case AMDGPU::SI_SPILL_S64_RESTORE: case AMDGPU::SI_SPILL_S32_RESTORE: { restoreSGPR(MI, Index, RS); @@ -1023,19 +1184,29 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, } // VGPR register spill + case AMDGPU::SI_SPILL_V1024_SAVE: case AMDGPU::SI_SPILL_V512_SAVE: case AMDGPU::SI_SPILL_V256_SAVE: + case AMDGPU::SI_SPILL_V160_SAVE: case AMDGPU::SI_SPILL_V128_SAVE: case AMDGPU::SI_SPILL_V96_SAVE: case AMDGPU::SI_SPILL_V64_SAVE: - case AMDGPU::SI_SPILL_V32_SAVE: { + case AMDGPU::SI_SPILL_V32_SAVE: + case AMDGPU::SI_SPILL_A1024_SAVE: + case AMDGPU::SI_SPILL_A512_SAVE: + case AMDGPU::SI_SPILL_A128_SAVE: + case AMDGPU::SI_SPILL_A64_SAVE: + case AMDGPU::SI_SPILL_A32_SAVE: { const MachineOperand *VData = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata); + assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == + MFI->getStackPtrOffsetReg()); + buildSpillLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET, Index, VData->getReg(), VData->isKill(), TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(), - TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg(), + FrameReg, TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), *MI->memoperands_begin(), RS); @@ -1047,16 +1218,25 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, case AMDGPU::SI_SPILL_V64_RESTORE: case AMDGPU::SI_SPILL_V96_RESTORE: case AMDGPU::SI_SPILL_V128_RESTORE: + case AMDGPU::SI_SPILL_V160_RESTORE: case AMDGPU::SI_SPILL_V256_RESTORE: - case AMDGPU::SI_SPILL_V512_RESTORE: { + case AMDGPU::SI_SPILL_V512_RESTORE: + case AMDGPU::SI_SPILL_V1024_RESTORE: + case AMDGPU::SI_SPILL_A32_RESTORE: + case AMDGPU::SI_SPILL_A64_RESTORE: + case AMDGPU::SI_SPILL_A128_RESTORE: + case AMDGPU::SI_SPILL_A512_RESTORE: + case AMDGPU::SI_SPILL_A1024_RESTORE: { const MachineOperand *VData = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata); + assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == + MFI->getStackPtrOffsetReg()); buildSpillLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET, Index, VData->getReg(), VData->isKill(), TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(), - TII->getNamedOperand(*MI, 
AMDGPU::OpName::soffset)->getReg(), + FrameReg, TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), *MI->memoperands_begin(), RS); @@ -1068,24 +1248,23 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, const DebugLoc &DL = MI->getDebugLoc(); bool IsMUBUF = TII->isMUBUF(*MI); - if (!IsMUBUF && - MFI->getFrameOffsetReg() != MFI->getScratchWaveOffsetReg()) { + if (!IsMUBUF && !MFI->isEntryFunction()) { // Convert to an absolute stack address by finding the offset from the // scratch wave base and scaling by the wave size. // - // In an entry function/kernel the stack address is already the - // absolute address relative to the scratch wave offset. + // In an entry function/kernel the offset is already the absolute + // address relative to the frame register. unsigned DiffReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32; - unsigned ResultReg = IsCopy ? + Register ResultReg = IsCopy ? MI->getOperand(0).getReg() : MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), DiffReg) - .addReg(MFI->getFrameOffsetReg()) + .addReg(FrameReg) .addReg(MFI->getScratchWaveOffsetReg()); int64_t Offset = FrameInfo.getObjectOffset(Index); @@ -1106,7 +1285,8 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, if (AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) { TII->getAddNoCarry(*MBB, MI, DL, ResultReg) .addImm(Offset) - .addReg(ScaledReg, RegState::Kill); + .addReg(ScaledReg, RegState::Kill) + .addImm(0); // clamp bit } else { unsigned ConstOffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); @@ -1115,7 +1295,8 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, .addImm(Offset); TII->getAddNoCarry(*MBB, MI, DL, ResultReg) .addReg(ConstOffsetReg, RegState::Kill) - .addReg(ScaledReg, RegState::Kill); + .addReg(ScaledReg, RegState::Kill) + .addImm(0); // clamp bit } } @@ -1133,8 +1314,10 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::vaddr)); - assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() - == MFI->getFrameOffsetReg()); + assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == + MFI->getStackPtrOffsetReg()); + + TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->setReg(FrameReg); int64_t Offset = FrameInfo.getObjectOffset(Index); int64_t OldImm @@ -1164,63 +1347,21 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, } StringRef SIRegisterInfo::getRegAsmName(unsigned Reg) const { - #define AMDGPU_REG_ASM_NAMES - #include "AMDGPURegAsmNames.inc.cpp" - - #define REG_RANGE(BeginReg, EndReg, RegTable) \ - if (Reg >= BeginReg && Reg <= EndReg) { \ - unsigned Index = Reg - BeginReg; \ - assert(Index < array_lengthof(RegTable)); \ - return RegTable[Index]; \ - } + const TargetRegisterClass *RC = getMinimalPhysRegClass(Reg); + unsigned Size = getRegSizeInBits(*RC); + unsigned AltName = AMDGPU::NoRegAltName; - REG_RANGE(AMDGPU::VGPR0, AMDGPU::VGPR255, VGPR32RegNames); - REG_RANGE(AMDGPU::SGPR0, AMDGPU::SGPR103, SGPR32RegNames); - REG_RANGE(AMDGPU::VGPR0_VGPR1, AMDGPU::VGPR254_VGPR255, VGPR64RegNames); - REG_RANGE(AMDGPU::SGPR0_SGPR1, AMDGPU::SGPR102_SGPR103, SGPR64RegNames); - REG_RANGE(AMDGPU::VGPR0_VGPR1_VGPR2, AMDGPU::VGPR253_VGPR254_VGPR255, - VGPR96RegNames); - - REG_RANGE(AMDGPU::VGPR0_VGPR1_VGPR2_VGPR3, - 
AMDGPU::VGPR252_VGPR253_VGPR254_VGPR255, - VGPR128RegNames); - REG_RANGE(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, - AMDGPU::SGPR100_SGPR101_SGPR102_SGPR103, - SGPR128RegNames); - - REG_RANGE(AMDGPU::VGPR0_VGPR1_VGPR2_VGPR3_VGPR4_VGPR5_VGPR6_VGPR7, - AMDGPU::VGPR248_VGPR249_VGPR250_VGPR251_VGPR252_VGPR253_VGPR254_VGPR255, - VGPR256RegNames); - - REG_RANGE( - AMDGPU::VGPR0_VGPR1_VGPR2_VGPR3_VGPR4_VGPR5_VGPR6_VGPR7_VGPR8_VGPR9_VGPR10_VGPR11_VGPR12_VGPR13_VGPR14_VGPR15, - AMDGPU::VGPR240_VGPR241_VGPR242_VGPR243_VGPR244_VGPR245_VGPR246_VGPR247_VGPR248_VGPR249_VGPR250_VGPR251_VGPR252_VGPR253_VGPR254_VGPR255, - VGPR512RegNames); - - REG_RANGE(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7, - AMDGPU::SGPR96_SGPR97_SGPR98_SGPR99_SGPR100_SGPR101_SGPR102_SGPR103, - SGPR256RegNames); - - REG_RANGE( - AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7_SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15, - AMDGPU::SGPR88_SGPR89_SGPR90_SGPR91_SGPR92_SGPR93_SGPR94_SGPR95_SGPR96_SGPR97_SGPR98_SGPR99_SGPR100_SGPR101_SGPR102_SGPR103, - SGPR512RegNames - ); - -#undef REG_RANGE - - // FIXME: Rename flat_scr so we don't need to special case this. - switch (Reg) { - case AMDGPU::FLAT_SCR: - return "flat_scratch"; - case AMDGPU::FLAT_SCR_LO: - return "flat_scratch_lo"; - case AMDGPU::FLAT_SCR_HI: - return "flat_scratch_hi"; - default: - // For the special named registers the default is fine. - return TargetRegisterInfo::getRegAsmName(Reg); + switch (Size) { + case 32: AltName = AMDGPU::Reg32; break; + case 64: AltName = AMDGPU::Reg64; break; + case 96: AltName = AMDGPU::Reg96; break; + case 128: AltName = AMDGPU::Reg128; break; + case 160: AltName = AMDGPU::Reg160; break; + case 256: AltName = AMDGPU::Reg256; break; + case 512: AltName = AMDGPU::Reg512; break; + case 1024: AltName = AMDGPU::Reg1024; break; } + return AMDGPUInstPrinter::getRegisterName(Reg, AltName); } // FIXME: This is very slow. 
It might be worth creating a map from physreg to @@ -1231,15 +1372,25 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const { static const TargetRegisterClass *const BaseClasses[] = { &AMDGPU::VGPR_32RegClass, &AMDGPU::SReg_32RegClass, + &AMDGPU::AGPR_32RegClass, &AMDGPU::VReg_64RegClass, &AMDGPU::SReg_64RegClass, + &AMDGPU::AReg_64RegClass, &AMDGPU::VReg_96RegClass, + &AMDGPU::SReg_96RegClass, &AMDGPU::VReg_128RegClass, &AMDGPU::SReg_128RegClass, + &AMDGPU::AReg_128RegClass, + &AMDGPU::VReg_160RegClass, + &AMDGPU::SReg_160RegClass, &AMDGPU::VReg_256RegClass, &AMDGPU::SReg_256RegClass, &AMDGPU::VReg_512RegClass, &AMDGPU::SReg_512RegClass, + &AMDGPU::AReg_512RegClass, + &AMDGPU::SReg_1024RegClass, + &AMDGPU::VReg_1024RegClass, + &AMDGPU::AReg_1024RegClass, &AMDGPU::SCC_CLASSRegClass, &AMDGPU::Pseudo_SReg_32RegClass, &AMDGPU::Pseudo_SReg_128RegClass, @@ -1268,10 +1419,39 @@ bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const { return getCommonSubClass(&AMDGPU::VReg_96RegClass, RC) != nullptr; case 128: return getCommonSubClass(&AMDGPU::VReg_128RegClass, RC) != nullptr; + case 160: + return getCommonSubClass(&AMDGPU::VReg_160RegClass, RC) != nullptr; case 256: return getCommonSubClass(&AMDGPU::VReg_256RegClass, RC) != nullptr; case 512: return getCommonSubClass(&AMDGPU::VReg_512RegClass, RC) != nullptr; + case 1024: + return getCommonSubClass(&AMDGPU::VReg_1024RegClass, RC) != nullptr; + default: + llvm_unreachable("Invalid register class size"); + } +} + +bool SIRegisterInfo::hasAGPRs(const TargetRegisterClass *RC) const { + unsigned Size = getRegSizeInBits(*RC); + if (Size < 32) + return false; + switch (Size) { + case 32: + return getCommonSubClass(&AMDGPU::AGPR_32RegClass, RC) != nullptr; + case 64: + return getCommonSubClass(&AMDGPU::AReg_64RegClass, RC) != nullptr; + case 96: + return false; + case 128: + return getCommonSubClass(&AMDGPU::AReg_128RegClass, RC) != nullptr; + case 160: + case 256: + return false; + case 512: + return getCommonSubClass(&AMDGPU::AReg_512RegClass, RC) != nullptr; + case 1024: + return getCommonSubClass(&AMDGPU::AReg_1024RegClass, RC) != nullptr; default: llvm_unreachable("Invalid register class size"); } @@ -1288,10 +1468,32 @@ const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass( return &AMDGPU::VReg_96RegClass; case 128: return &AMDGPU::VReg_128RegClass; + case 160: + return &AMDGPU::VReg_160RegClass; case 256: return &AMDGPU::VReg_256RegClass; case 512: return &AMDGPU::VReg_512RegClass; + case 1024: + return &AMDGPU::VReg_1024RegClass; + default: + llvm_unreachable("Invalid register class size"); + } +} + +const TargetRegisterClass *SIRegisterInfo::getEquivalentAGPRClass( + const TargetRegisterClass *SRC) const { + switch (getRegSizeInBits(*SRC)) { + case 32: + return &AMDGPU::AGPR_32RegClass; + case 64: + return &AMDGPU::AReg_64RegClass; + case 128: + return &AMDGPU::AReg_128RegClass; + case 512: + return &AMDGPU::AReg_512RegClass; + case 1024: + return &AMDGPU::AReg_1024RegClass; default: llvm_unreachable("Invalid register class size"); } @@ -1304,12 +1506,18 @@ const TargetRegisterClass *SIRegisterInfo::getEquivalentSGPRClass( return &AMDGPU::SGPR_32RegClass; case 64: return &AMDGPU::SReg_64RegClass; + case 96: + return &AMDGPU::SReg_96RegClass; case 128: return &AMDGPU::SReg_128RegClass; + case 160: + return &AMDGPU::SReg_160RegClass; case 256: return &AMDGPU::SReg_256RegClass; case 512: return &AMDGPU::SReg_512RegClass; + case 1024: + return &AMDGPU::SReg_1024RegClass; default: 
llvm_unreachable("Invalid register class size"); } @@ -1328,11 +1536,31 @@ const TargetRegisterClass *SIRegisterInfo::getSubRegClass( return &AMDGPU::SGPR_32RegClass; case 2: return &AMDGPU::SReg_64RegClass; + case 3: + return &AMDGPU::SReg_96RegClass; case 4: return &AMDGPU::SReg_128RegClass; + case 5: + return &AMDGPU::SReg_160RegClass; case 8: return &AMDGPU::SReg_256RegClass; - case 16: /* fall-through */ + case 16: + return &AMDGPU::SReg_512RegClass; + case 32: /* fall-through */ + default: + llvm_unreachable("Invalid sub-register class size"); + } + } else if (hasAGPRs(RC)) { + switch (Count) { + case 1: + return &AMDGPU::AGPR_32RegClass; + case 2: + return &AMDGPU::AReg_64RegClass; + case 4: + return &AMDGPU::AReg_128RegClass; + case 16: + return &AMDGPU::AReg_512RegClass; + case 32: /* fall-through */ default: llvm_unreachable("Invalid sub-register class size"); } @@ -1346,9 +1574,13 @@ const TargetRegisterClass *SIRegisterInfo::getSubRegClass( return &AMDGPU::VReg_96RegClass; case 4: return &AMDGPU::VReg_128RegClass; + case 5: + return &AMDGPU::VReg_160RegClass; case 8: return &AMDGPU::VReg_256RegClass; - case 16: /* fall-through */ + case 16: + return &AMDGPU::VReg_512RegClass; + case 32: /* fall-through */ default: llvm_unreachable("Invalid sub-register class size"); } @@ -1396,6 +1628,17 @@ SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI, ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const { if (EltSize == 4) { + static const int16_t Sub0_31[] = { + AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, + AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, + AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11, + AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, + AMDGPU::sub16, AMDGPU::sub17, AMDGPU::sub18, AMDGPU::sub19, + AMDGPU::sub20, AMDGPU::sub21, AMDGPU::sub22, AMDGPU::sub23, + AMDGPU::sub24, AMDGPU::sub25, AMDGPU::sub26, AMDGPU::sub27, + AMDGPU::sub28, AMDGPU::sub29, AMDGPU::sub30, AMDGPU::sub31, + }; + static const int16_t Sub0_15[] = { AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, @@ -1408,6 +1651,10 @@ ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, }; + static const int16_t Sub0_4[] = { + AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4, + }; + static const int16_t Sub0_3[] = { AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, }; @@ -1429,16 +1676,31 @@ ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC return makeArrayRef(Sub0_2); case 128: return makeArrayRef(Sub0_3); + case 160: + return makeArrayRef(Sub0_4); case 256: return makeArrayRef(Sub0_7); case 512: return makeArrayRef(Sub0_15); + case 1024: + return makeArrayRef(Sub0_31); default: llvm_unreachable("unhandled register size"); } } if (EltSize == 8) { + static const int16_t Sub0_31_64[] = { + AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, + AMDGPU::sub4_sub5, AMDGPU::sub6_sub7, + AMDGPU::sub8_sub9, AMDGPU::sub10_sub11, + AMDGPU::sub12_sub13, AMDGPU::sub14_sub15, + AMDGPU::sub16_sub17, AMDGPU::sub18_sub19, + AMDGPU::sub20_sub21, AMDGPU::sub22_sub23, + AMDGPU::sub24_sub25, AMDGPU::sub26_sub27, + AMDGPU::sub28_sub29, AMDGPU::sub30_sub31 + }; + static const int16_t Sub0_15_64[] = { AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, AMDGPU::sub4_sub5, AMDGPU::sub6_sub7, @@ -1465,32 +1727,73 @@ ArrayRef<int16_t> 
SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC return makeArrayRef(Sub0_7_64); case 512: return makeArrayRef(Sub0_15_64); + case 1024: + return makeArrayRef(Sub0_31_64); default: llvm_unreachable("unhandled register size"); } } - assert(EltSize == 16 && "unhandled register spill split size"); + if (EltSize == 16) { + + static const int16_t Sub0_31_128[] = { + AMDGPU::sub0_sub1_sub2_sub3, + AMDGPU::sub4_sub5_sub6_sub7, + AMDGPU::sub8_sub9_sub10_sub11, + AMDGPU::sub12_sub13_sub14_sub15, + AMDGPU::sub16_sub17_sub18_sub19, + AMDGPU::sub20_sub21_sub22_sub23, + AMDGPU::sub24_sub25_sub26_sub27, + AMDGPU::sub28_sub29_sub30_sub31 + }; + + static const int16_t Sub0_15_128[] = { + AMDGPU::sub0_sub1_sub2_sub3, + AMDGPU::sub4_sub5_sub6_sub7, + AMDGPU::sub8_sub9_sub10_sub11, + AMDGPU::sub12_sub13_sub14_sub15 + }; + + static const int16_t Sub0_7_128[] = { + AMDGPU::sub0_sub1_sub2_sub3, + AMDGPU::sub4_sub5_sub6_sub7 + }; - static const int16_t Sub0_15_128[] = { - AMDGPU::sub0_sub1_sub2_sub3, - AMDGPU::sub4_sub5_sub6_sub7, - AMDGPU::sub8_sub9_sub10_sub11, - AMDGPU::sub12_sub13_sub14_sub15 + switch (AMDGPU::getRegBitWidth(*RC->MC)) { + case 128: + return {}; + case 256: + return makeArrayRef(Sub0_7_128); + case 512: + return makeArrayRef(Sub0_15_128); + case 1024: + return makeArrayRef(Sub0_31_128); + default: + llvm_unreachable("unhandled register size"); + } + } + + assert(EltSize == 32 && "unhandled elt size"); + + static const int16_t Sub0_31_256[] = { + AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7, + AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15, + AMDGPU::sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23, + AMDGPU::sub24_sub25_sub26_sub27_sub28_sub29_sub30_sub31 }; - static const int16_t Sub0_7_128[] = { - AMDGPU::sub0_sub1_sub2_sub3, - AMDGPU::sub4_sub5_sub6_sub7 + static const int16_t Sub0_15_256[] = { + AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7, + AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15 }; switch (AMDGPU::getRegBitWidth(*RC->MC)) { - case 128: - return {}; case 256: - return makeArrayRef(Sub0_7_128); + return {}; case 512: - return makeArrayRef(Sub0_15_128); + return makeArrayRef(Sub0_15_256); + case 1024: + return makeArrayRef(Sub0_31_256); default: llvm_unreachable("unhandled register size"); } @@ -1512,6 +1815,13 @@ bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI, return hasVGPRs(RC); } +bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI, + unsigned Reg) const { + const TargetRegisterClass * RC = getRegClassForReg(MRI, Reg); + assert(RC && "Register class for the reg not found"); + return hasAGPRs(RC); +} + bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI, const TargetRegisterClass *SrcRC, unsigned SubReg, @@ -1553,7 +1863,7 @@ unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF, unsigned Idx) const { - if (Idx == getVGPRPressureSet()) + if (Idx == getVGPRPressureSet() || Idx == getAGPRPressureSet()) return getRegPressureLimit(&AMDGPU::VGPR_32RegClass, const_cast<MachineFunction &>(MF)); @@ -1578,28 +1888,80 @@ unsigned SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const { } const TargetRegisterClass * -SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO, +SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size, + const RegisterBank &RB, const MachineRegisterInfo &MRI) const { - unsigned Size = getRegSizeInBits(MO.getReg(), MRI); - const RegisterBank *RB = MRI.getRegBankOrNull(MO.getReg()); - if 
(!RB) - return nullptr; - switch (Size) { + case 1: { + switch (RB.getID()) { + case AMDGPU::VGPRRegBankID: + return &AMDGPU::VGPR_32RegClass; + case AMDGPU::VCCRegBankID: + return isWave32 ? + &AMDGPU::SReg_32_XM0_XEXECRegClass : &AMDGPU::SReg_64_XEXECRegClass; + case AMDGPU::SGPRRegBankID: + return &AMDGPU::SReg_32_XM0RegClass; + case AMDGPU::SCCRegBankID: + // This needs to return an allocatable class, so don't bother returning + // the dummy SCC class. + return &AMDGPU::SReg_32_XM0RegClass; + default: + llvm_unreachable("unknown register bank"); + } + } case 32: - return RB->getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VGPR_32RegClass : - &AMDGPU::SReg_32_XM0RegClass; + return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VGPR_32RegClass : + &AMDGPU::SReg_32_XM0RegClass; case 64: - return RB->getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_64RegClass : - &AMDGPU::SReg_64_XEXECRegClass; + return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_64RegClass : + &AMDGPU::SReg_64_XEXECRegClass; case 96: - return RB->getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_96RegClass : - nullptr; + return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_96RegClass : + &AMDGPU::SReg_96RegClass; case 128: - return RB->getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_128RegClass : - &AMDGPU::SReg_128RegClass; + return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_128RegClass : + &AMDGPU::SReg_128RegClass; + case 160: + return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_160RegClass : + &AMDGPU::SReg_160RegClass; + case 256: + return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_256RegClass : + &AMDGPU::SReg_256RegClass; + case 512: + return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_512RegClass : + &AMDGPU::SReg_512RegClass; + default: + if (Size < 32) + return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VGPR_32RegClass : + &AMDGPU::SReg_32_XM0RegClass; + return nullptr; + } +} + +const TargetRegisterClass * +SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO, + const MachineRegisterInfo &MRI) const { + if (const RegisterBank *RB = MRI.getRegBankOrNull(MO.getReg())) + return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB, MRI); + return nullptr; +} + +unsigned SIRegisterInfo::getVCC() const { + return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC; +} + +const TargetRegisterClass * +SIRegisterInfo::getRegClass(unsigned RCID) const { + switch ((int)RCID) { + case AMDGPU::SReg_1RegClassID: + return getBoolRC(); + case AMDGPU::SReg_1_XEXECRegClassID: + return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass + : &AMDGPU::SReg_64_XEXECRegClass; + case -1: + return nullptr; default: - llvm_unreachable("not implemented"); + return AMDGPURegisterInfo::getRegClass(RCID); } } diff --git a/lib/Target/AMDGPU/SIRegisterInfo.h b/lib/Target/AMDGPU/SIRegisterInfo.h index b82fefde47e1..34487c96e72e 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/lib/Target/AMDGPU/SIRegisterInfo.h @@ -1,9 +1,8 @@ //===-- SIRegisterInfo.h - SI Register Info Interface ----------*- C++ -*--===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -30,10 +29,13 @@ class SIRegisterInfo final : public AMDGPURegisterInfo { private: unsigned SGPRSetID; unsigned VGPRSetID; + unsigned AGPRSetID; BitVector SGPRPressureSets; BitVector VGPRPressureSets; + BitVector AGPRPressureSets; bool SpillSGPRToVGPR; bool SpillSGPRToSMEM; + bool isWave32; void classifyPressureSet(unsigned PSetID, unsigned Reg, BitVector &PressureSets) const; @@ -57,8 +59,6 @@ public: unsigned reservedPrivateSegmentWaveByteOffsetReg( const MachineFunction &MF) const; - unsigned reservedStackPtrOffsetReg(const MachineFunction &MF) const; - BitVector getReservedRegs(const MachineFunction &MF) const override; const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override; @@ -72,8 +72,9 @@ public: return 100; } - unsigned getFrameRegister(const MachineFunction &MF) const override; + Register getFrameRegister(const MachineFunction &MF) const override; + bool canRealignStack(const MachineFunction &MF) const override; bool requiresRegisterScavenging(const MachineFunction &Fn) const override; bool requiresFrameIndexScavenging(const MachineFunction &MF) const override; @@ -130,7 +131,7 @@ public: /// \returns true if this class contains only SGPR registers bool isSGPRClass(const TargetRegisterClass *RC) const { - return !hasVGPRs(RC); + return !hasVGPRs(RC) && !hasAGPRs(RC); } /// \returns true if this class ID contains only SGPR registers @@ -150,10 +151,22 @@ public: /// \returns true if this class contains VGPR registers. bool hasVGPRs(const TargetRegisterClass *RC) const; + /// \returns true if this class contains AGPR registers. + bool hasAGPRs(const TargetRegisterClass *RC) const; + + /// \returns true if this class contains any vector registers. 
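An aside before the hasVectorRegisters declaration that the comment above introduces: with AGPRs added, the header's predicates partition register classes three ways, and a class is scalar exactly when it contains neither kind of vector register. A toy model with hypothetical stand-in types (not the LLVM API):

    #include <cassert>

    // Hypothetical stand-in for what hasVGPRs()/hasAGPRs() answer for a class.
    struct RegClassKind {
      bool HasVGPRs;
      bool HasAGPRs;
    };

    // Mirrors the updated isSGPRClass definition in this header.
    bool isSGPRClass(const RegClassKind &RC) {
      return !RC.HasVGPRs && !RC.HasAGPRs;
    }

    bool hasVectorRegisters(const RegClassKind &RC) {
      return RC.HasVGPRs || RC.HasAGPRs;
    }

    int main() {
      RegClassKind AGPROnly{false, true};
      assert(!isSGPRClass(AGPROnly) && hasVectorRegisters(AGPROnly));
      return 0;
    }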
+ bool hasVectorRegisters(const TargetRegisterClass *RC) const { + return hasVGPRs(RC) || hasAGPRs(RC); + } + /// \returns A VGPR reg class with the same width as \p SRC const TargetRegisterClass *getEquivalentVGPRClass( const TargetRegisterClass *SRC) const; + /// \returns An AGPR reg class with the same width as \p SRC + const TargetRegisterClass *getEquivalentAGPRClass( + const TargetRegisterClass *SRC) const; + /// \returns A SGPR reg class with the same width as \p SRC const TargetRegisterClass *getEquivalentSGPRClass( const TargetRegisterClass *VRC) const; @@ -191,16 +204,32 @@ public: unsigned getSGPRPressureSet() const { return SGPRSetID; }; unsigned getVGPRPressureSet() const { return VGPRSetID; }; + unsigned getAGPRPressureSet() const { return AGPRSetID; }; const TargetRegisterClass *getRegClassForReg(const MachineRegisterInfo &MRI, unsigned Reg) const; bool isVGPR(const MachineRegisterInfo &MRI, unsigned Reg) const; + bool isAGPR(const MachineRegisterInfo &MRI, unsigned Reg) const; + bool isVectorRegister(const MachineRegisterInfo &MRI, unsigned Reg) const { + return isVGPR(MRI, Reg) || isAGPR(MRI, Reg); + } + + virtual bool + isDivergentRegClass(const TargetRegisterClass *RC) const override { + return !isSGPRClass(RC); + } bool isSGPRPressureSet(unsigned SetID) const { - return SGPRPressureSets.test(SetID) && !VGPRPressureSets.test(SetID); + return SGPRPressureSets.test(SetID) && !VGPRPressureSets.test(SetID) && + !AGPRPressureSets.test(SetID); } bool isVGPRPressureSet(unsigned SetID) const { - return VGPRPressureSets.test(SetID) && !SGPRPressureSets.test(SetID); + return VGPRPressureSets.test(SetID) && !SGPRPressureSets.test(SetID) && + !AGPRPressureSets.test(SetID); + } + bool isAGPRPressureSet(unsigned SetID) const { + return AGPRPressureSets.test(SetID) && !SGPRPressureSets.test(SetID) && + !VGPRPressureSets.test(SetID); } ArrayRef<int16_t> getRegSplitParts(const TargetRegisterClass *RC, @@ -225,15 +254,44 @@ public: unsigned getReturnAddressReg(const MachineFunction &MF) const; const TargetRegisterClass * + getRegClassForSizeOnBank(unsigned Size, + const RegisterBank &Bank, + const MachineRegisterInfo &MRI) const; + + const TargetRegisterClass * + getRegClassForTypeOnBank(LLT Ty, + const RegisterBank &Bank, + const MachineRegisterInfo &MRI) const { + return getRegClassForSizeOnBank(Ty.getSizeInBits(), Bank, MRI); + } + + const TargetRegisterClass * getConstrainedRegClassForOperand(const MachineOperand &MO, const MachineRegisterInfo &MRI) const override; + const TargetRegisterClass *getBoolRC() const { + return isWave32 ? &AMDGPU::SReg_32_XM0RegClass + : &AMDGPU::SReg_64RegClass; + } + + const TargetRegisterClass *getWaveMaskRegClass() const { + return isWave32 ? 
&AMDGPU::SReg_32_XM0_XEXECRegClass + : &AMDGPU::SReg_64_XEXECRegClass; + } + + unsigned getVCC() const; + + const TargetRegisterClass *getRegClass(unsigned RCID) const; + // Find reaching register definition MachineInstr *findReachingDef(unsigned Reg, unsigned SubReg, MachineInstr &Use, MachineRegisterInfo &MRI, LiveIntervals *LIS) const; + const uint32_t *getAllVGPRRegMask() const; + const uint32_t *getAllAllocatableSRegMask() const; + private: void buildSpillLoadStore(MachineBasicBlock::iterator MI, unsigned LoadStoreOp, diff --git a/lib/Target/AMDGPU/SIRegisterInfo.td b/lib/Target/AMDGPU/SIRegisterInfo.td index c625ecc9b750..d5948a7862cc 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/lib/Target/AMDGPU/SIRegisterInfo.td @@ -1,9 +1,8 @@ //===-- SIRegisterInfo.td - SI Register defs ---------------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -15,43 +14,86 @@ class getSubRegs<int size> { list<SubRegIndex> ret2 = [sub0, sub1]; list<SubRegIndex> ret3 = [sub0, sub1, sub2]; list<SubRegIndex> ret4 = [sub0, sub1, sub2, sub3]; + list<SubRegIndex> ret5 = [sub0, sub1, sub2, sub3, sub4]; list<SubRegIndex> ret8 = [sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7]; list<SubRegIndex> ret16 = [sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7, sub8, sub9, sub10, sub11, sub12, sub13, sub14, sub15]; + list<SubRegIndex> ret32 = [sub0, sub1, sub2, sub3, + sub4, sub5, sub6, sub7, + sub8, sub9, sub10, sub11, + sub12, sub13, sub14, sub15, + sub16, sub17, sub18, sub19, + sub20, sub21, sub22, sub23, + sub24, sub25, sub26, sub27, + sub28, sub29, sub30, sub31]; list<SubRegIndex> ret = !if(!eq(size, 2), ret2, !if(!eq(size, 3), ret3, !if(!eq(size, 4), ret4, - !if(!eq(size, 8), ret8, ret16)))); + !if(!eq(size, 5), ret5, + !if(!eq(size, 8), ret8, + !if(!eq(size, 16), ret16, ret32)))))); +} + +let Namespace = "AMDGPU" in { +defset list<RegAltNameIndex> AllRegAltNameIndices = { + def Reg32 : RegAltNameIndex; + def Reg64 : RegAltNameIndex; + def Reg96 : RegAltNameIndex; + def Reg128 : RegAltNameIndex; + def Reg160 : RegAltNameIndex; + def Reg256 : RegAltNameIndex; + def Reg512 : RegAltNameIndex; + def Reg1024 : RegAltNameIndex; +} } //===----------------------------------------------------------------------===// // Declarations that describe the SI registers //===----------------------------------------------------------------------===// -class SIReg <string n, bits<16> regIdx = 0> : Register<n>, +class SIReg <string n, bits<16> regIdx = 0, string prefix = "", + int regNo = !cast<int>(regIdx)> : + Register<n, !if(!eq(prefix, ""), + [ n, n, n, n, n, n, n, n ], + [ prefix # regNo, + prefix # "[" # regNo # ":" # !and(!add(regNo, 1), 255) # "]", + prefix # "[" # regNo # ":" # !and(!add(regNo, 2), 255) # "]", + prefix # "[" # regNo # ":" # !and(!add(regNo, 3), 255) # "]", + prefix # "[" # regNo # ":" # !and(!add(regNo, 4), 255) # "]", + prefix # "[" # regNo # ":" # !and(!add(regNo, 7), 255) # "]", + prefix # "[" # regNo # ":" # !and(!add(regNo, 15), 255) # "]", + prefix # "[" # regNo # ":" # !and(!add(regNo, 31), 255) # "]", + ])>, DwarfRegNum<[!cast<int>(HWEncoding)]> { let Namespace = "AMDGPU"; + let 
RegAltNameIndices = AllRegAltNameIndices;

  // This is not yet the complete register encoding. An additional
  // bit is set for VGPRs.
  let HWEncoding = regIdx;
}

+class SIRegisterWithSubRegs<string n, list<Register> subregs> :
+  RegisterWithSubRegs<n, subregs> {
+  let RegAltNameIndices = AllRegAltNameIndices;
+  let AltNames = [ n, n, n, n, n, n, n, n ];
+}
+
 // Special Registers
 def VCC_LO : SIReg<"vcc_lo", 106>;
 def VCC_HI : SIReg<"vcc_hi", 107>;

 // Pseudo-registers: Used as placeholders during isel and immediately
 // replaced, never seeing the verifier.
-def PRIVATE_RSRC_REG : SIReg<"", 0>;
-def FP_REG : SIReg<"", 0>;
-def SP_REG : SIReg<"", 0>;
-def SCRATCH_WAVE_OFFSET_REG : SIReg<"", 0>;
+def PRIVATE_RSRC_REG : SIReg<"private_rsrc", 0>;
+def FP_REG : SIReg<"fp", 0>;
+def SP_REG : SIReg<"sp", 0>;
+def SCRATCH_WAVE_OFFSET_REG : SIReg<"scratch_wave_offset", 0>;

 // VCC for 64-bit instructions
-def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]>,
+def VCC : SIRegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]>,
           DwarfRegAlias<VCC_LO> {
   let Namespace = "AMDGPU";
   let SubRegIndices = [sub0, sub1];
@@ -61,25 +103,38 @@ def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]>,
 def EXEC_LO : SIReg<"exec_lo", 126>;
 def EXEC_HI : SIReg<"exec_hi", 127>;

-def EXEC : RegisterWithSubRegs<"EXEC", [EXEC_LO, EXEC_HI]>,
+def EXEC : SIRegisterWithSubRegs<"exec", [EXEC_LO, EXEC_HI]>,
           DwarfRegAlias<EXEC_LO> {
   let Namespace = "AMDGPU";
   let SubRegIndices = [sub0, sub1];
   let HWEncoding = 126;
 }

-def SCC : SIReg<"scc", 253>;
+// 32-bit real registers, for MC only.
+// May be used with both 32-bit and 64-bit operands.
+def SRC_VCCZ : SIReg<"src_vccz", 251>;
+def SRC_EXECZ : SIReg<"src_execz", 252>;
+def SRC_SCC : SIReg<"src_scc", 253>;
+
+// 1-bit pseudo register, for codegen only.
+// Should never be emitted.
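Before the SCC definition that the comment above introduces, a short illustration of the alternate names the SIReg class at the top of this file now generates: each register records its plain name plus bracketed range names at every tuple width, wrapping at 255 via !and. A hedged C++ rendering (rangeAltName is a hypothetical helper, not part of LLVM):

    #include <cstdio>
    #include <string>

    // Builds the same strings as the TableGen expression
    //   prefix # "[" # regNo # ":" # !and(!add(regNo, N), 255) # "]"
    // for a tuple of NumRegs registers starting at RegNo.
    std::string rangeAltName(const std::string &Prefix, unsigned RegNo,
                             unsigned NumRegs) {
      if (NumRegs == 1)
        return Prefix + std::to_string(RegNo);
      unsigned Last = (RegNo + NumRegs - 1) & 255; // wrap-around, as in !and
      return Prefix + "[" + std::to_string(RegNo) + ":" +
             std::to_string(Last) + "]";
    }

    int main() {
      const unsigned Widths[] = {1, 2, 4, 8, 16, 32};
      for (unsigned W : Widths)
        std::printf("%s\n", rangeAltName("s", 4, W).c_str());
      // Prints: s4, s[4:5], s[4:7], s[4:11], s[4:19], s[4:35]
      return 0;
    }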
+def SCC : SIReg<"scc">; + def M0 : SIReg <"m0", 124>; +def SGPR_NULL : SIReg<"null", 125>; def SRC_SHARED_BASE : SIReg<"src_shared_base", 235>; def SRC_SHARED_LIMIT : SIReg<"src_shared_limit", 236>; def SRC_PRIVATE_BASE : SIReg<"src_private_base", 237>; def SRC_PRIVATE_LIMIT : SIReg<"src_private_limit", 238>; +def SRC_POPS_EXITING_WAVE_ID : SIReg<"src_pops_exiting_wave_id", 239>; + +def LDS_DIRECT : SIReg <"src_lds_direct", 254>; def XNACK_MASK_LO : SIReg<"xnack_mask_lo", 104>; def XNACK_MASK_HI : SIReg<"xnack_mask_hi", 105>; -def XNACK_MASK : RegisterWithSubRegs<"xnack_mask", [XNACK_MASK_LO, XNACK_MASK_HI]>, +def XNACK_MASK : SIRegisterWithSubRegs<"xnack_mask", [XNACK_MASK_LO, XNACK_MASK_HI]>, DwarfRegAlias<XNACK_MASK_LO> { let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; @@ -90,7 +145,7 @@ def XNACK_MASK : RegisterWithSubRegs<"xnack_mask", [XNACK_MASK_LO, XNACK_MASK_HI def TBA_LO : SIReg<"tba_lo", 108>; def TBA_HI : SIReg<"tba_hi", 109>; -def TBA : RegisterWithSubRegs<"tba", [TBA_LO, TBA_HI]>, +def TBA : SIRegisterWithSubRegs<"tba", [TBA_LO, TBA_HI]>, DwarfRegAlias<TBA_LO> { let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; @@ -100,7 +155,7 @@ def TBA : RegisterWithSubRegs<"tba", [TBA_LO, TBA_HI]>, def TMA_LO : SIReg<"tma_lo", 110>; def TMA_HI : SIReg<"tma_hi", 111>; -def TMA : RegisterWithSubRegs<"tma", [TMA_LO, TMA_HI]>, +def TMA : SIRegisterWithSubRegs<"tma", [TMA_LO, TMA_HI]>, DwarfRegAlias<TMA_LO> { let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; @@ -108,19 +163,19 @@ def TMA : RegisterWithSubRegs<"tma", [TMA_LO, TMA_HI]>, } foreach Index = 0-15 in { - def TTMP#Index#_vi : SIReg<"ttmp"#Index, !add(112, Index)>; - def TTMP#Index#_gfx9 : SIReg<"ttmp"#Index, !add(108, Index)>; - def TTMP#Index : SIReg<"", 0>; + def TTMP#Index#_vi : SIReg<"ttmp"#Index, !add(112, Index)>; + def TTMP#Index#_gfx9_gfx10 : SIReg<"ttmp"#Index, !add(108, Index)>; + def TTMP#Index : SIReg<"ttmp"#Index, 0>; } multiclass FLAT_SCR_LOHI_m <string n, bits<16> ci_e, bits<16> vi_e> { def _ci : SIReg<n, ci_e>; def _vi : SIReg<n, vi_e>; - def "" : SIReg<"", 0>; + def "" : SIReg<n, 0>; } class FlatReg <Register lo, Register hi, bits<16> encoding> : - RegisterWithSubRegs<"flat_scratch", [lo, hi]>, + SIRegisterWithSubRegs<"flat_scratch", [lo, hi]>, DwarfRegAlias<lo> { let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; @@ -135,13 +190,20 @@ def FLAT_SCR_vi : FlatReg<FLAT_SCR_LO_vi, FLAT_SCR_HI_vi, 102>; def FLAT_SCR : FlatReg<FLAT_SCR_LO, FLAT_SCR_HI, 0>; // SGPR registers -foreach Index = 0-103 in { - def SGPR#Index : SIReg <"SGPR"#Index, Index>; +foreach Index = 0-105 in { + def SGPR#Index : SIReg <"SGPR"#Index, Index, "s">; } // VGPR registers foreach Index = 0-255 in { - def VGPR#Index : SIReg <"VGPR"#Index, Index> { + def VGPR#Index : SIReg <"VGPR"#Index, Index, "v"> { + let HWEncoding{8} = 1; + } +} + +// AccVGPR registers +foreach Index = 0-255 in { + def AGPR#Index : SIReg <"AGPR"#Index, Index, "a"> { let HWEncoding{8} = 1; } } @@ -164,10 +226,10 @@ def M0_CLASS : RegisterClass<"AMDGPU", [i32], 32, (add M0)> { // SGPR 32-bit registers def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, - (add (sequence "SGPR%u", 0, 103))> { + (add (sequence "SGPR%u", 0, 105)), Reg32> { // Give all SGPR classes higher priority than VGPR classes, because // we want to spill SGPRs to VGPRs. 
- let AllocationPriority = 7; + let AllocationPriority = 9; } // SGPR 64-bit registers @@ -175,6 +237,12 @@ def SGPR_64Regs : RegisterTuples<getSubRegs<2>.ret, [(add (decimate SGPR_32, 2)), (add (decimate (shl SGPR_32, 1), 2))]>; +// SGPR 96-bit registers. No operations use these, but for symmetry with 96-bit VGPRs. +def SGPR_96Regs : RegisterTuples<getSubRegs<3>.ret, + [(add (decimate SGPR_32, 3)), + (add (decimate (shl SGPR_32, 1), 3)), + (add (decimate (shl SGPR_32, 2), 3))]>; + // SGPR 128-bit registers def SGPR_128Regs : RegisterTuples<getSubRegs<4>.ret, [(add (decimate SGPR_32, 4)), @@ -182,6 +250,14 @@ def SGPR_128Regs : RegisterTuples<getSubRegs<4>.ret, (add (decimate (shl SGPR_32, 2), 4)), (add (decimate (shl SGPR_32, 3), 4))]>; +// SGPR 160-bit registers. No operations use these, but for symmetry with 160-bit VGPRs. +def SGPR_160Regs : RegisterTuples<getSubRegs<5>.ret, + [(add (decimate SGPR_32, 4)), + (add (decimate (shl SGPR_32, 1), 4)), + (add (decimate (shl SGPR_32, 2), 4)), + (add (decimate (shl SGPR_32, 3), 4)), + (add (decimate (shl SGPR_32, 4), 4))]>; + // SGPR 256-bit registers def SGPR_256Regs : RegisterTuples<getSubRegs<8>.ret, [(add (decimate SGPR_32, 4)), @@ -212,6 +288,41 @@ def SGPR_512Regs : RegisterTuples<getSubRegs<16>.ret, (add (decimate (shl SGPR_32, 14), 4)), (add (decimate (shl SGPR_32, 15), 4))]>; +// SGPR 1024-bit registers +def SGPR_1024Regs : RegisterTuples<getSubRegs<32>.ret, + [(add (decimate SGPR_32, 4)), + (add (decimate (shl SGPR_32, 1), 4)), + (add (decimate (shl SGPR_32, 2), 4)), + (add (decimate (shl SGPR_32, 3), 4)), + (add (decimate (shl SGPR_32, 4), 4)), + (add (decimate (shl SGPR_32, 5), 4)), + (add (decimate (shl SGPR_32, 6), 4)), + (add (decimate (shl SGPR_32, 7), 4)), + (add (decimate (shl SGPR_32, 8), 4)), + (add (decimate (shl SGPR_32, 9), 4)), + (add (decimate (shl SGPR_32, 10), 4)), + (add (decimate (shl SGPR_32, 11), 4)), + (add (decimate (shl SGPR_32, 12), 4)), + (add (decimate (shl SGPR_32, 13), 4)), + (add (decimate (shl SGPR_32, 14), 4)), + (add (decimate (shl SGPR_32, 15), 4)), + (add (decimate (shl SGPR_32, 16), 4)), + (add (decimate (shl SGPR_32, 17), 4)), + (add (decimate (shl SGPR_32, 18), 4)), + (add (decimate (shl SGPR_32, 19), 4)), + (add (decimate (shl SGPR_32, 20), 4)), + (add (decimate (shl SGPR_32, 21), 4)), + (add (decimate (shl SGPR_32, 22), 4)), + (add (decimate (shl SGPR_32, 23), 4)), + (add (decimate (shl SGPR_32, 24), 4)), + (add (decimate (shl SGPR_32, 25), 4)), + (add (decimate (shl SGPR_32, 26), 4)), + (add (decimate (shl SGPR_32, 27), 4)), + (add (decimate (shl SGPR_32, 28), 4)), + (add (decimate (shl SGPR_32, 29), 4)), + (add (decimate (shl SGPR_32, 30), 4)), + (add (decimate (shl SGPR_32, 31), 4))]>; + // Trap handler TMP 32-bit registers def TTMP_32 : RegisterClass<"AMDGPU", [i32, f32, v2i16, v2f16], 32, (add (sequence "TTMP%u", 0, 15))> { @@ -263,7 +374,7 @@ class TmpRegTuplesBase<int index, int size, list<SubRegIndex> indices = getSubRegs<size>.ret, int index1 = !add(index, !add(size, -1)), string name = "ttmp["#index#":"#index1#"]"> : - RegisterWithSubRegs<name, subRegs> { + SIRegisterWithSubRegs<name, subRegs> { let HWEncoding = subRegs[0].HWEncoding; let SubRegIndices = indices; } @@ -293,8 +404,8 @@ class TmpRegTuples<string tgt, getSubRegs<size>.ret>; foreach Index = {0, 2, 4, 6, 8, 10, 12, 14} in { - def TTMP#Index#_TTMP#!add(Index,1)#_vi : TmpRegTuples<"_vi", 2, Index>; - def TTMP#Index#_TTMP#!add(Index,1)#_gfx9 : TmpRegTuples<"_gfx9", 2, Index>; + def TTMP#Index#_TTMP#!add(Index,1)#_vi : 
TmpRegTuples<"_vi", 2, Index>; + def TTMP#Index#_TTMP#!add(Index,1)#_gfx9_gfx10 : TmpRegTuples<"_gfx9_gfx10", 2, Index>; } foreach Index = {0, 4, 8, 12} in { @@ -303,7 +414,7 @@ foreach Index = {0, 4, 8, 12} in { _TTMP#!add(Index,3)#_vi : TmpRegTuples<"_vi", 4, Index>; def TTMP#Index#_TTMP#!add(Index,1)# _TTMP#!add(Index,2)# - _TTMP#!add(Index,3)#_gfx9 : TmpRegTuples<"_gfx9", 4, Index>; + _TTMP#!add(Index,3)#_gfx9_gfx10 : TmpRegTuples<"_gfx9_gfx10", 4, Index>; } foreach Index = {0, 4, 8} in { @@ -320,7 +431,7 @@ foreach Index = {0, 4, 8} in { _TTMP#!add(Index,4)# _TTMP#!add(Index,5)# _TTMP#!add(Index,6)# - _TTMP#!add(Index,7)#_gfx9 : TmpRegTuples<"_gfx9", 8, Index>; + _TTMP#!add(Index,7)#_gfx9_gfx10 : TmpRegTuples<"_gfx9_gfx10", 8, Index>; } def TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15_vi : @@ -330,18 +441,17 @@ def TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TT TTMP8_vi, TTMP9_vi, TTMP10_vi, TTMP11_vi, TTMP12_vi, TTMP13_vi, TTMP14_vi, TTMP15_vi]>; -def TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15_gfx9 : +def TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15_gfx9_gfx10 : TmpRegTuplesBase<0, 16, - [TTMP0_gfx9, TTMP1_gfx9, TTMP2_gfx9, TTMP3_gfx9, - TTMP4_gfx9, TTMP5_gfx9, TTMP6_gfx9, TTMP7_gfx9, - TTMP8_gfx9, TTMP9_gfx9, TTMP10_gfx9, TTMP11_gfx9, - TTMP12_gfx9, TTMP13_gfx9, TTMP14_gfx9, TTMP15_gfx9]>; - + [TTMP0_gfx9_gfx10, TTMP1_gfx9_gfx10, TTMP2_gfx9_gfx10, TTMP3_gfx9_gfx10, + TTMP4_gfx9_gfx10, TTMP5_gfx9_gfx10, TTMP6_gfx9_gfx10, TTMP7_gfx9_gfx10, + TTMP8_gfx9_gfx10, TTMP9_gfx9_gfx10, TTMP10_gfx9_gfx10, TTMP11_gfx9_gfx10, + TTMP12_gfx9_gfx10, TTMP13_gfx9_gfx10, TTMP14_gfx9_gfx10, TTMP15_gfx9_gfx10]>; // VGPR 32-bit registers // i16/f16 only on VI+ def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, - (add (sequence "VGPR%u", 0, 255))> { + (add (sequence "VGPR%u", 0, 255)), Reg32> { let AllocationPriority = 1; let Size = 32; } @@ -364,6 +474,14 @@ def VGPR_128 : RegisterTuples<getSubRegs<4>.ret, (add (shl VGPR_32, 2)), (add (shl VGPR_32, 3))]>; +// VGPR 160-bit registers +def VGPR_160 : RegisterTuples<getSubRegs<5>.ret, + [(add (trunc VGPR_32, 252)), + (add (shl VGPR_32, 1)), + (add (shl VGPR_32, 2)), + (add (shl VGPR_32, 3)), + (add (shl VGPR_32, 4))]>; + // VGPR 256-bit registers def VGPR_256 : RegisterTuples<getSubRegs<8>.ret, [(add (trunc VGPR_32, 249)), @@ -394,88 +512,257 @@ def VGPR_512 : RegisterTuples<getSubRegs<16>.ret, (add (shl VGPR_32, 14)), (add (shl VGPR_32, 15))]>; +// VGPR 1024-bit registers +def VGPR_1024 : RegisterTuples<getSubRegs<32>.ret, + [(add (trunc VGPR_32, 225)), + (add (shl VGPR_32, 1)), + (add (shl VGPR_32, 2)), + (add (shl VGPR_32, 3)), + (add (shl VGPR_32, 4)), + (add (shl VGPR_32, 5)), + (add (shl VGPR_32, 6)), + (add (shl VGPR_32, 7)), + (add (shl VGPR_32, 8)), + (add (shl VGPR_32, 9)), + (add (shl VGPR_32, 10)), + (add (shl VGPR_32, 11)), + (add (shl VGPR_32, 12)), + (add (shl VGPR_32, 13)), + (add (shl VGPR_32, 14)), + (add (shl VGPR_32, 15)), + (add (shl VGPR_32, 16)), + (add (shl VGPR_32, 17)), + (add (shl VGPR_32, 18)), + (add (shl VGPR_32, 19)), + (add (shl VGPR_32, 20)), + (add (shl VGPR_32, 21)), + (add (shl VGPR_32, 22)), + (add (shl VGPR_32, 23)), + (add (shl VGPR_32, 24)), + (add (shl VGPR_32, 25)), + (add (shl VGPR_32, 26)), + (add (shl VGPR_32, 27)), + (add (shl VGPR_32, 28)), + (add (shl VGPR_32, 29)), + (add (shl VGPR_32, 
30)), + (add (shl VGPR_32, 31))]>; + +// AccVGPR 32-bit registers +def AGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, + (add (sequence "AGPR%u", 0, 255)), Reg32> { + let AllocationPriority = 1; + let Size = 32; +} + +// AGPR 64-bit registers +def AGPR_64 : RegisterTuples<getSubRegs<2>.ret, + [(add (trunc AGPR_32, 255)), + (add (shl AGPR_32, 1))]>; + +// AGPR 128-bit registers +def AGPR_128 : RegisterTuples<getSubRegs<4>.ret, + [(add (trunc AGPR_32, 253)), + (add (shl AGPR_32, 1)), + (add (shl AGPR_32, 2)), + (add (shl AGPR_32, 3))]>; + +// AGPR 512-bit registers +def AGPR_512 : RegisterTuples<getSubRegs<16>.ret, + [(add (trunc AGPR_32, 241)), + (add (shl AGPR_32, 1)), + (add (shl AGPR_32, 2)), + (add (shl AGPR_32, 3)), + (add (shl AGPR_32, 4)), + (add (shl AGPR_32, 5)), + (add (shl AGPR_32, 6)), + (add (shl AGPR_32, 7)), + (add (shl AGPR_32, 8)), + (add (shl AGPR_32, 9)), + (add (shl AGPR_32, 10)), + (add (shl AGPR_32, 11)), + (add (shl AGPR_32, 12)), + (add (shl AGPR_32, 13)), + (add (shl AGPR_32, 14)), + (add (shl AGPR_32, 15))]>; + +// AGPR 1024-bit registers +def AGPR_1024 : RegisterTuples<getSubRegs<32>.ret, + [(add (trunc AGPR_32, 225)), + (add (shl AGPR_32, 1)), + (add (shl AGPR_32, 2)), + (add (shl AGPR_32, 3)), + (add (shl AGPR_32, 4)), + (add (shl AGPR_32, 5)), + (add (shl AGPR_32, 6)), + (add (shl AGPR_32, 7)), + (add (shl AGPR_32, 8)), + (add (shl AGPR_32, 9)), + (add (shl AGPR_32, 10)), + (add (shl AGPR_32, 11)), + (add (shl AGPR_32, 12)), + (add (shl AGPR_32, 13)), + (add (shl AGPR_32, 14)), + (add (shl AGPR_32, 15)), + (add (shl AGPR_32, 16)), + (add (shl AGPR_32, 17)), + (add (shl AGPR_32, 18)), + (add (shl AGPR_32, 19)), + (add (shl AGPR_32, 20)), + (add (shl AGPR_32, 21)), + (add (shl AGPR_32, 22)), + (add (shl AGPR_32, 23)), + (add (shl AGPR_32, 24)), + (add (shl AGPR_32, 25)), + (add (shl AGPR_32, 26)), + (add (shl AGPR_32, 27)), + (add (shl AGPR_32, 28)), + (add (shl AGPR_32, 29)), + (add (shl AGPR_32, 30)), + (add (shl AGPR_32, 31))]>; + //===----------------------------------------------------------------------===// // Register classes used as source and destination //===----------------------------------------------------------------------===// def Pseudo_SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, - (add FP_REG, SP_REG, SCRATCH_WAVE_OFFSET_REG)> { + (add FP_REG, SP_REG, SCRATCH_WAVE_OFFSET_REG), Reg32> { let isAllocatable = 0; let CopyCost = -1; } def Pseudo_SReg_128 : RegisterClass<"AMDGPU", [v4i32, v2i64, v2f64], 32, - (add PRIVATE_RSRC_REG)> { + (add PRIVATE_RSRC_REG), Reg128> { + let isAllocatable = 0; + let CopyCost = -1; +} + +def LDS_DIRECT_CLASS : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, + (add LDS_DIRECT), Reg32> { let isAllocatable = 0; let CopyCost = -1; } // Subset of SReg_32 without M0 for SMRD instructions and alike. // See comments in SIInstructions.td for more info. 
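Before the SReg_32_XM0_XEXEC definition that the note above introduces, a quick sketch of the counting behind the trunc operands in the RegisterTuples defs earlier in this hunk (illustrative C++, not TableGen; tupleCount is a hypothetical helper): an N-wide tuple may start at any base register that leaves room for the whole tuple, so 256 VGPRs yield 252 five-wide VGPR_160 tuples and 225 thirty-two-wide VGPR_1024 tuples. SGPR tuples additionally use decimate so that bases fall on multiples of four.

    #include <cassert>

    // Number of N-wide register tuples over a bank of NumRegs registers,
    // when a tuple may start at every base register (the VGPR/AGPR case).
    unsigned tupleCount(unsigned NumRegs, unsigned Width) {
      return NumRegs - Width + 1;
    }

    int main() {
      assert(tupleCount(256, 5)  == 252); // trunc VGPR_32, 252 (VGPR_160)
      assert(tupleCount(256, 16) == 241); // trunc AGPR_32, 241 (AGPR_512)
      assert(tupleCount(256, 32) == 225); // trunc VGPR_32, 225 (VGPR_1024)
      return 0;
    }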
-def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, +def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32, (add SGPR_32, VCC_LO, VCC_HI, FLAT_SCR_LO, FLAT_SCR_HI, XNACK_MASK_LO, XNACK_MASK_HI, - TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE, SRC_SHARED_LIMIT, - SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT)> { - let AllocationPriority = 7; + SGPR_NULL, TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE, SRC_SHARED_LIMIT, + SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT, SRC_POPS_EXITING_WAVE_ID, + SRC_VCCZ, SRC_EXECZ, SRC_SCC), Reg32> { + let AllocationPriority = 10; } -def SReg_32_XEXEC_HI : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, - (add SReg_32_XM0_XEXEC, EXEC_LO, M0_CLASS)> { - let AllocationPriority = 7; +def SReg_32_XEXEC_HI : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32, + (add SReg_32_XM0_XEXEC, EXEC_LO, M0_CLASS), Reg32> { + let AllocationPriority = 10; } -def SReg_32_XM0 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, - (add SReg_32_XM0_XEXEC, EXEC_LO, EXEC_HI)> { - let AllocationPriority = 7; +def SReg_32_XM0 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32, + (add SReg_32_XM0_XEXEC, EXEC_LO, EXEC_HI), Reg32> { + let AllocationPriority = 10; } // Register class for all scalar registers (SGPRs + Special Registers) -def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, - (add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI, SReg_32_XEXEC_HI)> { - let AllocationPriority = 7; +def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32, + (add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI, SReg_32_XEXEC_HI), Reg32> { + let AllocationPriority = 10; +} + +def SRegOrLds_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32, + (add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI, SReg_32_XEXEC_HI, LDS_DIRECT_CLASS), + Reg32> { + let isAllocatable = 0; } -def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, v4i16, v4f16], 32, (add SGPR_64Regs)> { +def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, v4i16, v4f16], 32, + (add SGPR_64Regs), Reg64> { let CopyCost = 1; - let AllocationPriority = 8; + let AllocationPriority = 11; +} + +// CCR (call clobbered registers) SGPR 64-bit registers +def CCR_SGPR_64 : RegisterClass<"AMDGPU", SGPR_64.RegTypes, 32, + (add (trunc SGPR_64, 16)), Reg64> { + let CopyCost = SGPR_64.CopyCost; + let AllocationPriority = SGPR_64.AllocationPriority; } -def TTMP_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32, (add TTMP_64Regs)> { +def TTMP_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32, + (add TTMP_64Regs)> { let isAllocatable = 0; } def SReg_64_XEXEC : RegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32, - (add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, TTMP_64, TBA, TMA)> { + (add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, TTMP_64, TBA, TMA), Reg64> { let CopyCost = 1; - let AllocationPriority = 8; + let AllocationPriority = 13; } def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32, - (add SReg_64_XEXEC, EXEC)> { + (add SReg_64_XEXEC, EXEC), Reg64> { let CopyCost = 1; - let AllocationPriority = 8; + let AllocationPriority = 13; +} + +def SReg_1_XEXEC : RegisterClass<"AMDGPU", [i1], 32, + (add SReg_64_XEXEC, SReg_32_XM0_XEXEC)> { + let CopyCost = 1; + let isAllocatable = 0; +} + +def SReg_1 : RegisterClass<"AMDGPU", [i1], 32, + (add SReg_1_XEXEC, EXEC, EXEC_LO)> { + let CopyCost 
= 1;
+  let isAllocatable = 0;
}

// Requires 2 s_mov_b64 to copy
let CopyCost = 2 in {

-def SGPR_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64], 32, (add SGPR_128Regs)> {
-  let AllocationPriority = 10;
+// There are no 3-component scalar instructions, but this is needed
+// for symmetry with VGPRs.
+def SGPR_96 : RegisterClass<"AMDGPU", [v3i32, v3f32], 32,
+  (add SGPR_96Regs), Reg96> {
+  let AllocationPriority = 14;
 }

-def TTMP_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64], 32, (add TTMP_128Regs)> {
+def SReg_96 : RegisterClass<"AMDGPU", [v3i32, v3f32], 32,
+  (add SGPR_96), Reg96> {
+  let AllocationPriority = 14;
+}
+
+def SGPR_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64], 32,
+  (add SGPR_128Regs), Reg128> {
+  let AllocationPriority = 15;
+}
+
+def TTMP_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64], 32,
+  (add TTMP_128Regs)> {
   let isAllocatable = 0;
 }

 def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32,
-  (add SGPR_128, TTMP_128)> {
-  let AllocationPriority = 10;
+  (add SGPR_128, TTMP_128), Reg128> {
+  let AllocationPriority = 15;
 }

} // End CopyCost = 2

-def SGPR_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add SGPR_256Regs)> {
-  let AllocationPriority = 11;
+// There are no 5-component scalar instructions, but this is needed
+// for symmetry with VGPRs.
+def SGPR_160 : RegisterClass<"AMDGPU", [v5i32, v5f32], 32,
+  (add SGPR_160Regs), Reg160> {
+  let AllocationPriority = 16;
+}
+
+def SReg_160 : RegisterClass<"AMDGPU", [v5i32, v5f32], 32,
+  (add SGPR_160), Reg160> {
+  let AllocationPriority = 16;
+}
+
+def SGPR_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add SGPR_256Regs),
+  Reg256> {
+  let AllocationPriority = 17;
 }

 def TTMP_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add TTMP_256Regs)> {
@@ -483,29 +770,48 @@ def TTMP_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add TTMP_256Regs)> {
 }

 def SReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32,
-  (add SGPR_256, TTMP_256)> {
+  (add SGPR_256, TTMP_256), Reg256> {
   // Requires 4 s_mov_b64 to copy
   let CopyCost = 4;
-  let AllocationPriority = 11;
+  let AllocationPriority = 17;
 }

-def SGPR_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, (add SGPR_512Regs)> {
-  let AllocationPriority = 12;
+def SGPR_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32,
+  (add SGPR_512Regs), Reg512> {
+  let AllocationPriority = 18;
 }

-def TTMP_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, (add TTMP_512Regs)> {
+def TTMP_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32,
+  (add TTMP_512Regs)> {
   let isAllocatable = 0;
 }

 def SReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32,
-  (add SGPR_512, TTMP_512)> {
+  (add SGPR_512, TTMP_512), Reg512> {
   // Requires 8 s_mov_b64 to copy
   let CopyCost = 8;
-  let AllocationPriority = 12;
+  let AllocationPriority = 18;
+}
+
+def VRegOrLds_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
+  (add VGPR_32, LDS_DIRECT_CLASS), Reg32> {
+  let isAllocatable = 0;
+}
+
+def SGPR_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32,
+  (add SGPR_1024Regs), Reg1024> {
+  let AllocationPriority = 19;
+}
+
+def SReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32,
+  (add SGPR_1024), Reg1024> {
+  let CopyCost = 16;
+  let AllocationPriority = 19;
 }

 // Register class for all vector registers (VGPRs + Interpolation Registers)
-def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32, v4f16, v4i16], 32, (add VGPR_64)> {
+def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32, v4f16, v4i16], 32,
+  (add VGPR_64), Reg64> {
   let Size = 64;

   //
Requires 2 v_mov_b32 to copy @@ -513,7 +819,7 @@ def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32, v4f16, v4i16], 32 let AllocationPriority = 2; } -def VReg_96 : RegisterClass<"AMDGPU", [untyped], 32, (add VGPR_96)> { +def VReg_96 : RegisterClass<"AMDGPU", [v3i32, v3f32], 32, (add VGPR_96), Reg96> { let Size = 96; // Requires 3 v_mov_b32 to copy @@ -521,7 +827,8 @@ def VReg_96 : RegisterClass<"AMDGPU", [untyped], 32, (add VGPR_96)> { let AllocationPriority = 3; } -def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32, (add VGPR_128)> { +def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32, + (add VGPR_128), Reg128> { let Size = 128; // Requires 4 v_mov_b32 to copy @@ -529,28 +836,88 @@ def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32, (add VG let AllocationPriority = 4; } -def VReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add VGPR_256)> { +def VReg_160 : RegisterClass<"AMDGPU", [v5i32, v5f32], 32, + (add VGPR_160), Reg160> { + let Size = 160; + + // Requires 5 v_mov_b32 to copy + let CopyCost = 5; + let AllocationPriority = 5; +} + +def VReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, + (add VGPR_256), Reg256> { let Size = 256; let CopyCost = 8; - let AllocationPriority = 5; + let AllocationPriority = 6; } -def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, (add VGPR_512)> { +def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, + (add VGPR_512), Reg512> { let Size = 512; let CopyCost = 16; - let AllocationPriority = 6; + let AllocationPriority = 7; +} + +def VReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32, + (add VGPR_1024), Reg1024> { + let Size = 1024; + let CopyCost = 32; + let AllocationPriority = 8; } -def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)> { +def AReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32, v4f16, v4i16], 32, + (add AGPR_64), Reg64> { + let Size = 64; + + let CopyCost = 5; + let AllocationPriority = 2; +} + +def AReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32, + (add AGPR_128), Reg128> { + let Size = 128; + + // Requires 4 v_accvgpr_write and 4 v_accvgpr_read to copy + burn 1 vgpr + let CopyCost = 9; + let AllocationPriority = 4; +} + +def AReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, + (add AGPR_512), Reg512> { + let Size = 512; + let CopyCost = 33; + let AllocationPriority = 7; +} + +def AReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32, + (add AGPR_1024), Reg1024> { + let Size = 1024; + let CopyCost = 65; + let AllocationPriority = 8; +} + +def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32), Reg32> { let Size = 32; } def VS_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, - (add VGPR_32, SReg_32)> { + (add VGPR_32, SReg_32, LDS_DIRECT_CLASS), Reg32> { + let isAllocatable = 0; +} + +def VS_64 : RegisterClass<"AMDGPU", [i64, f64], 32, (add VReg_64, SReg_64), + Reg64> { let isAllocatable = 0; } -def VS_64 : RegisterClass<"AMDGPU", [i64, f64], 32, (add VReg_64, SReg_64)> { +def AV_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, + (add AGPR_32, VGPR_32), Reg32> { + let isAllocatable = 0; +} + +def AV_64 : RegisterClass<"AMDGPU", [i64, f64, v4f16], 32, + (add AReg_64, VReg_64), Reg64> { let isAllocatable = 0; } @@ -563,47 +930,40 @@ class RegImmMatcher<string name> : AsmOperandClass { let RenderMethod = "addRegOrImmOperands"; } -multiclass SIRegOperand <string rc, string MatchName, string opType> { +multiclass SIRegOperand32 <string rc, string 
MatchName, string opType, + string rc_suffix = "_32"> { let OperandNamespace = "AMDGPU" in { - def _b16 : RegisterOperand<!cast<RegisterClass>(rc#"_32")> { + def _b16 : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix)> { let OperandType = opType#"_INT16"; let ParserMatchClass = RegImmMatcher<MatchName#"B16">; let DecoderMethod = "decodeOperand_VSrc16"; } - def _f16 : RegisterOperand<!cast<RegisterClass>(rc#"_32")> { + def _f16 : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix)> { let OperandType = opType#"_FP16"; let ParserMatchClass = RegImmMatcher<MatchName#"F16">; - let DecoderMethod = "decodeOperand_VSrc16"; + let DecoderMethod = "decodeOperand_" # rc # "_16"; } - def _b32 : RegisterOperand<!cast<RegisterClass>(rc#"_32")> { + def _b32 : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix)> { let OperandType = opType#"_INT32"; let ParserMatchClass = RegImmMatcher<MatchName#"B32">; + let DecoderMethod = "decodeOperand_" # rc # rc_suffix; } - def _f32 : RegisterOperand<!cast<RegisterClass>(rc#"_32")> { + def _f32 : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix)> { let OperandType = opType#"_FP32"; let ParserMatchClass = RegImmMatcher<MatchName#"F32">; + let DecoderMethod = "decodeOperand_" # rc # rc_suffix; } - def _b64 : RegisterOperand<!cast<RegisterClass>(rc#"_64")> { - let OperandType = opType#"_INT64"; - let ParserMatchClass = RegImmMatcher<MatchName#"B64">; - } - - def _f64 : RegisterOperand<!cast<RegisterClass>(rc#"_64")> { - let OperandType = opType#"_FP64"; - let ParserMatchClass = RegImmMatcher<MatchName#"F64">; - } - - def _v2b16 : RegisterOperand<!cast<RegisterClass>(rc#"_32")> { + def _v2b16 : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix)> { let OperandType = opType#"_V2INT16"; let ParserMatchClass = RegImmMatcher<MatchName#"V2B16">; let DecoderMethod = "decodeOperand_VSrcV216"; } - def _v2f16 : RegisterOperand<!cast<RegisterClass>(rc#"_32")> { + def _v2f16 : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix)> { let OperandType = opType#"_V2FP16"; let ParserMatchClass = RegImmMatcher<MatchName#"V2F16">; let DecoderMethod = "decodeOperand_VSrcV216"; @@ -611,6 +971,21 @@ multiclass SIRegOperand <string rc, string MatchName, string opType> { } } +multiclass SIRegOperand <string rc, string MatchName, string opType> : + SIRegOperand32<rc, MatchName, opType> { + let OperandNamespace = "AMDGPU" in { + def _b64 : RegisterOperand<!cast<RegisterClass>(rc#"_64")> { + let OperandType = opType#"_INT64"; + let ParserMatchClass = RegImmMatcher<MatchName#"B64">; + } + + def _f64 : RegisterOperand<!cast<RegisterClass>(rc#"_64")> { + let OperandType = opType#"_FP64"; + let ParserMatchClass = RegImmMatcher<MatchName#"F64">; + } + } +} + // FIXME: 64-bit sources can sometimes use 32-bit constants. 
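// As a rough sketch of how the split above composes (assuming nothing beyond
// the multiclasses shown here): a use such as
//   defm SSrc : RegImmOperand<"SReg", "SSrc">;
// produces SSrc_b16, SSrc_f16, SSrc_b32, SSrc_f32, SSrc_v2b16 and SSrc_v2f16
// through SIRegOperand32, plus SSrc_b64 and SSrc_f64 through SIRegOperand,
// each a RegisterOperand over SReg_32 or SReg_64 with the matching
// OPERAND_REG_IMM_* OperandType.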
multiclass RegImmOperand <string rc, string MatchName> : SIRegOperand<rc, MatchName, "OPERAND_REG_IMM">; @@ -618,20 +993,32 @@ multiclass RegImmOperand <string rc, string MatchName> multiclass RegInlineOperand <string rc, string MatchName> : SIRegOperand<rc, MatchName, "OPERAND_REG_INLINE_C">; +multiclass RegInlineOperand32 <string rc, string MatchName, + string rc_suffix = "_32"> + : SIRegOperand32<rc, MatchName, "OPERAND_REG_INLINE_C", rc_suffix>; + +multiclass RegInlineOperandAC <string rc, string MatchName, + string rc_suffix = "_32"> + : SIRegOperand32<rc, MatchName, "OPERAND_REG_INLINE_AC", rc_suffix>; + //===----------------------------------------------------------------------===// // SSrc_* Operands with an SGPR or a 32-bit immediate //===----------------------------------------------------------------------===// defm SSrc : RegImmOperand<"SReg", "SSrc">; +def SSrcOrLds_b32 : RegisterOperand<SRegOrLds_32> { + let OperandNamespace = "AMDGPU"; + let OperandType = "OPERAND_REG_IMM_INT32"; + let ParserMatchClass = RegImmMatcher<"SSrcOrLdsB32">; +} + //===----------------------------------------------------------------------===// // SCSrc_* Operands with an SGPR or an inline constant //===----------------------------------------------------------------------===// defm SCSrc : RegInlineOperand<"SReg", "SCSrc"> ; -def SCSrc_i1 : RegisterOperand<SReg_64_XEXEC>; - //===----------------------------------------------------------------------===// // VSrc_* Operands with an SGPR, VGPR or a 32-bit immediate //===----------------------------------------------------------------------===// @@ -654,7 +1041,45 @@ def VRegSrc_32 : RegisterOperand<VGPR_32> { } //===----------------------------------------------------------------------===// +// ASrc_* Operands with an AccVGPR +//===----------------------------------------------------------------------===// + +def ARegSrc_32 : RegisterOperand<AGPR_32> { + let DecoderMethod = "DecodeAGPR_32RegisterClass"; + let EncoderMethod = "getAVOperandEncoding"; +} + +//===----------------------------------------------------------------------===// // VCSrc_* Operands with an SGPR, VGPR or an inline constant //===----------------------------------------------------------------------===// defm VCSrc : RegInlineOperand<"VS", "VCSrc">; + +//===----------------------------------------------------------------------===// +// VISrc_* Operands with a VGPR or an inline constant +//===----------------------------------------------------------------------===// + +defm VISrc : RegInlineOperand32<"VGPR", "VISrc">; + +//===----------------------------------------------------------------------===// +// AVSrc_* Operands with an AGPR or VGPR +//===----------------------------------------------------------------------===// + +def AVSrc_32 : RegisterOperand<AV_32> { + let DecoderMethod = "DecodeAV_32RegisterClass"; + let EncoderMethod = "getAVOperandEncoding"; +} + +def AVSrc_64 : RegisterOperand<AV_64> { + let DecoderMethod = "DecodeAV_64RegisterClass"; + let EncoderMethod = "getAVOperandEncoding"; +} + +//===----------------------------------------------------------------------===// +// ACSrc_* Operands with an AGPR or an inline constant +//===----------------------------------------------------------------------===// + +defm AISrc : RegInlineOperandAC<"AGPR", "AISrc">; +defm AISrc_128 : RegInlineOperandAC<"AReg", "AISrc_128", "_128">; +defm AISrc_512 : RegInlineOperandAC<"AReg", "AISrc_512", "_512">; +defm AISrc_1024 : RegInlineOperandAC<"AReg", "AISrc_1024", "_1024">; diff --git
a/lib/Target/AMDGPU/SISchedule.td b/lib/Target/AMDGPU/SISchedule.td index 7af69cb6a46d..824d1aeb0df9 100644 --- a/lib/Target/AMDGPU/SISchedule.td +++ b/lib/Target/AMDGPU/SISchedule.td @@ -1,9 +1,8 @@ //===-- SISchedule.td - SI Scheduling definitions -------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -25,6 +24,9 @@ def WriteSMEM : SchedWrite; def WriteVMEM : SchedWrite; def WriteBarrier : SchedWrite; +def MIVGPRRead : SchedRead; +def MIMFMARead : SchedRead; + // Vector ALU instructions def Write32Bit : SchedWrite; def WriteQuarterRate32 : SchedWrite; @@ -38,9 +40,17 @@ def WriteDouble : SchedWrite; // half rate f64 instruction (same as v_add_f64) def WriteDoubleAdd : SchedWrite; +// Conversion to or from f64 instruction +def WriteDoubleCvt : SchedWrite; + // Half rate 64-bit instructions. def Write64Bit : SchedWrite; +// mAI multipass instructions. +def Write2PassMAI : SchedWrite; +def Write8PassMAI : SchedWrite; +def Write16PassMAI : SchedWrite; + // FIXME: Should there be a class for instructions which are VALU // instructions and have VALU rates, but write to the SALU (i.e. VOPC // instructions) @@ -62,6 +72,7 @@ class SISchedMachineModel : SchedMachineModel { def SIFullSpeedModel : SISchedMachineModel; def SIQuarterSpeedModel : SISchedMachineModel; +def GFX10SpeedModel : SISchedMachineModel; // XXX: Are the resource counts correct? def HWBranch : ProcResource<1> { @@ -82,6 +93,9 @@ def HWVMEM : ProcResource<1> { def HWVALU : ProcResource<1> { let BufferSize = 1; } +def HWRC : ProcResource<1> { // Register destination cache + let BufferSize = 1; +} class HWWriteRes<SchedWrite write, list<ProcResourceKind> resources, int latency> : WriteRes<write, resources> { @@ -91,6 +105,11 @@ class HWWriteRes<SchedWrite write, list<ProcResourceKind> resources, class HWVALUWriteRes<SchedWrite write, int latency> : HWWriteRes<write, [HWVALU], latency>; +def PredMIReadVGPR : SchedPredicate<[{TII->hasVGPRUses(*MI)}]>; + +def MIReadVGPR : SchedReadVariant<[ + SchedVar<PredMIReadVGPR, [MIVGPRRead]>, + SchedVar<NoSchedPred, [ReadDefault]>]>; // The latency numbers are taken from AMD Accelerated Parallel Processing // guide. They may not be accurate. @@ -109,6 +128,24 @@ multiclass SICommonWriteRes { def : HWVALUWriteRes<Write32Bit, 1>; def : HWVALUWriteRes<Write64Bit, 2>; def : HWVALUWriteRes<WriteQuarterRate32, 4>; + def : HWVALUWriteRes<Write2PassMAI, 2>; + def : HWVALUWriteRes<Write8PassMAI, 8>; + def : HWVALUWriteRes<Write16PassMAI, 16>; + + def : ReadAdvance<MIVGPRRead, -2>; + def : InstRW<[Write64Bit, MIReadVGPR], (instregex "^V_ACCVGPR_WRITE_B32$")>; + + // Technically mfma reads can be from 0 to 4 cycles but that does not make + // sense to model because its register setup is huge. In particular if we + // properly model read advance as -2 for a vgpr read it will result in a + // bad scheduling of acc writes before that mfma. To avoid it we would + // need to consume 2 or 4 more vgprs to be initialized before the acc + // write sequence. Just assume worst case here.
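+ // (For reference: a negative ReadAdvance means the operand must be ready
+ // that many cycles before the instruction issues, so it adds the given
+ // number of cycles to the effective producer-to-consumer latency.)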
+ def : ReadAdvance<MIMFMARead, -4>; + + def : InstRW<[Write2PassMAI, MIMFMARead], (instregex "^V_MFMA_..._4X4X")>; + def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_..._16X16X")>; + def : InstRW<[Write16PassMAI, MIMFMARead], (instregex "^V_MFMA_..._32X32X")>; } def PredIsVGPR32Copy : SchedPredicate<[{TII->isVGPRCopy(*MI) && TII->getOpSize(*MI, 0) <= 32}]>; @@ -125,6 +162,7 @@ defm : SICommonWriteRes; def : HWVALUWriteRes<WriteFloatFMA, 1>; def : HWVALUWriteRes<WriteDouble, 4>; def : HWVALUWriteRes<WriteDoubleAdd, 2>; +def : HWVALUWriteRes<WriteDoubleCvt, 4>; def : InstRW<[WriteCopy], (instrs COPY)>; @@ -137,7 +175,32 @@ defm : SICommonWriteRes; def : HWVALUWriteRes<WriteFloatFMA, 16>; def : HWVALUWriteRes<WriteDouble, 16>; def : HWVALUWriteRes<WriteDoubleAdd, 8>; +def : HWVALUWriteRes<WriteDoubleCvt, 4>; def : InstRW<[WriteCopy], (instrs COPY)>; } // End SchedModel = SIQuarterSpeedModel + +let SchedModel = GFX10SpeedModel in { + +// The latency values are 1 / (operations / cycle). +// Add 1 stall cycle for VGPR read. +def : HWWriteRes<Write32Bit, [HWVALU, HWRC], 5>; +def : HWWriteRes<Write64Bit, [HWVALU, HWRC], 9>; +def : HWWriteRes<WriteQuarterRate32, [HWVALU, HWRC], 17>; +def : HWWriteRes<WriteFloatFMA, [HWVALU, HWRC], 5>; +def : HWWriteRes<WriteDouble, [HWVALU, HWRC], 17>; +def : HWWriteRes<WriteDoubleAdd, [HWVALU, HWRC], 17>; +def : HWWriteRes<WriteDoubleCvt, [HWVALU, HWRC], 17>; + +def : HWWriteRes<WriteBranch, [HWBranch], 32>; +def : HWWriteRes<WriteExport, [HWExport, HWRC], 16>; +def : HWWriteRes<WriteLDS, [HWLGKM, HWRC], 20>; +def : HWWriteRes<WriteSALU, [HWSALU, HWRC], 5>; +def : HWWriteRes<WriteSMEM, [HWLGKM, HWRC], 20>; +def : HWWriteRes<WriteVMEM, [HWVMEM, HWRC], 320>; +def : HWWriteRes<WriteBarrier, [HWBranch], 2000>; + +def : InstRW<[WriteCopy], (instrs COPY)>; + +} // End SchedModel = GFX10SpeedModel diff --git a/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/lib/Target/AMDGPU/SIShrinkInstructions.cpp index 6ad7dd0e3a7c..7ee178149c7a 100644 --- a/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -1,9 +1,8 @@ //===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // /// The pass tries to use the 32-bit encoding for instructions when possible. //===----------------------------------------------------------------------===// @@ -39,6 +38,8 @@ class SIShrinkInstructions : public MachineFunctionPass { public: static char ID; + void shrinkMIMG(MachineInstr &MI); + public: SIShrinkInstructions() : MachineFunctionPass(ID) { } @@ -94,6 +95,10 @@ static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII, Src0.setSubReg(0); Src0.ChangeToFrameIndex(MovSrc.getIndex()); ConstantFolded = true; + } else if (MovSrc.isGlobal()) { + Src0.ChangeToGA(MovSrc.getGlobal(), MovSrc.getOffset(), + MovSrc.getTargetFlags()); + ConstantFolded = true; } if (ConstantFolded) { @@ -212,6 +217,96 @@ static void shrinkScalarCompare(const SIInstrInfo *TII, MachineInstr &MI) { } } +// Shrink NSA encoded instructions with contiguous VGPRs to non-NSA encoding. 
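+// (NSA, the GFX10 "non-sequential address" MIMG encoding, lets each address
+// dword live in an arbitrary VGPR at the cost of extra encoding dwords. The
+// routine below checks via getHWRegIndex that the address registers are in
+// fact consecutive and, if so, rewrites the instruction to the shorter
+// contiguous encoding.)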
+void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) { + const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode()); + if (Info->MIMGEncoding != AMDGPU::MIMGEncGfx10NSA) + return; + + MachineFunction *MF = MI.getParent()->getParent(); + const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); + const SIInstrInfo *TII = ST.getInstrInfo(); + const SIRegisterInfo &TRI = TII->getRegisterInfo(); + int VAddr0Idx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0); + unsigned NewAddrDwords = Info->VAddrDwords; + const TargetRegisterClass *RC; + + if (Info->VAddrDwords == 2) { + RC = &AMDGPU::VReg_64RegClass; + } else if (Info->VAddrDwords == 3) { + RC = &AMDGPU::VReg_96RegClass; + } else if (Info->VAddrDwords == 4) { + RC = &AMDGPU::VReg_128RegClass; + } else if (Info->VAddrDwords <= 8) { + RC = &AMDGPU::VReg_256RegClass; + NewAddrDwords = 8; + } else { + RC = &AMDGPU::VReg_512RegClass; + NewAddrDwords = 16; + } + + unsigned VgprBase = 0; + bool IsUndef = true; + bool IsKill = NewAddrDwords == Info->VAddrDwords; + for (unsigned i = 0; i < Info->VAddrDwords; ++i) { + const MachineOperand &Op = MI.getOperand(VAddr0Idx + i); + unsigned Vgpr = TRI.getHWRegIndex(Op.getReg()); + + if (i == 0) { + VgprBase = Vgpr; + } else if (VgprBase + i != Vgpr) + return; + + if (!Op.isUndef()) + IsUndef = false; + if (!Op.isKill()) + IsKill = false; + } + + if (VgprBase + NewAddrDwords > 256) + return; + + // Further check for implicit tied operands - this may be present if TFE is + // enabled + int TFEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::tfe); + int LWEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::lwe); + unsigned TFEVal = MI.getOperand(TFEIdx).getImm(); + unsigned LWEVal = MI.getOperand(LWEIdx).getImm(); + int ToUntie = -1; + if (TFEVal || LWEVal) { + // TFE/LWE is enabled so we need to deal with an implicit tied operand + for (unsigned i = LWEIdx + 1, e = MI.getNumOperands(); i != e; ++i) { + if (MI.getOperand(i).isReg() && MI.getOperand(i).isTied() && + MI.getOperand(i).isImplicit()) { + // This is the tied operand + assert( + ToUntie == -1 && + "found more than one tied implicit operand when expecting only 1"); + ToUntie = i; + MI.untieRegOperand(ToUntie); + } + } + } + + unsigned NewOpcode = + AMDGPU::getMIMGOpcode(Info->BaseOpcode, AMDGPU::MIMGEncGfx10Default, + Info->VDataDwords, NewAddrDwords); + MI.setDesc(TII->get(NewOpcode)); + MI.getOperand(VAddr0Idx).setReg(RC->getRegister(VgprBase)); + MI.getOperand(VAddr0Idx).setIsUndef(IsUndef); + MI.getOperand(VAddr0Idx).setIsKill(IsKill); + + for (unsigned i = 1; i < Info->VAddrDwords; ++i) + MI.RemoveOperand(VAddr0Idx + 1); + + if (ToUntie >= 0) { + MI.tieOperands( + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata), + ToUntie - (Info->VAddrDwords - 1)); + } +} + /// Attempt to shrink AND/OR/XOR operations requiring non-inlineable literals. /// For AND or OR, try using S_BITSET{0,1} to clear or set bits. /// If the inverse of the immediate is legal, use ANDN2, ORN2 or @@ -277,7 +372,9 @@ static bool shrinkScalarLogicOp(const GCNSubtarget &ST, if (Opc == AMDGPU::S_BITSET0_B32 || Opc == AMDGPU::S_BITSET1_B32) { Src0->ChangeToImmediate(NewImm); - MI.RemoveOperand(2); + // Remove the immediate and add the tied input.
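+ // (S_BITSET only rewrites a single bit, so the result also depends on the
+ // old destination value; the tied input added here matches the tied_in
+ // forms of SOP1_32/SOP1_64_32 in SOPInstructions.td and makes the
+ // read-modify-write explicit to the register allocator.)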
+ MI.getOperand(2).ChangeToRegister(Dest->getReg(), false); + MI.tieOperands(0, 2); } else { SrcImm->setImm(NewImm); } @@ -458,6 +555,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { MachineRegisterInfo &MRI = MF.getRegInfo(); const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); + unsigned VCCReg = ST.isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC; std::vector<unsigned> I1Defs; @@ -596,6 +694,14 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { continue; } + if (TII->isMIMG(MI.getOpcode()) && + ST.getGeneration() >= AMDGPUSubtarget::GFX10 && + MF.getProperties().hasProperty( + MachineFunctionProperties::Property::NoVRegs)) { + shrinkMIMG(MI); + continue; + } + if (!TII->hasVALU32BitEncoding(MI.getOpcode())) continue; @@ -625,10 +731,10 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { // So, instead of forcing the instruction to write to VCC, we provide // a hint to the register allocator to use VCC and then we will run // this pass again after RA and shrink it if it outputs to VCC. - MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, AMDGPU::VCC); + MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, VCCReg); continue; } - if (DstReg != AMDGPU::VCC) + if (DstReg != VCCReg) continue; } @@ -641,10 +747,10 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { continue; unsigned SReg = Src2->getReg(); if (TargetRegisterInfo::isVirtualRegister(SReg)) { - MRI.setRegAllocationHint(SReg, 0, AMDGPU::VCC); + MRI.setRegAllocationHint(SReg, 0, VCCReg); continue; } - if (SReg != AMDGPU::VCC) + if (SReg != VCCReg) continue; } @@ -657,20 +763,24 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { AMDGPU::OpName::src2); if (SDst) { - if (SDst->getReg() != AMDGPU::VCC) { + bool Next = false; + + if (SDst->getReg() != VCCReg) { if (TargetRegisterInfo::isVirtualRegister(SDst->getReg())) - MRI.setRegAllocationHint(SDst->getReg(), 0, AMDGPU::VCC); - continue; + MRI.setRegAllocationHint(SDst->getReg(), 0, VCCReg); + Next = true; } // All of the instructions with carry outs also have an SGPR input in // src2. - if (Src2 && Src2->getReg() != AMDGPU::VCC) { + if (Src2 && Src2->getReg() != VCCReg) { if (TargetRegisterInfo::isVirtualRegister(Src2->getReg())) - MRI.setRegAllocationHint(Src2->getReg(), 0, AMDGPU::VCC); + MRI.setRegAllocationHint(Src2->getReg(), 0, VCCReg); + Next = true; + } + if (Next) continue; - } } // We can shrink this instruction diff --git a/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/lib/Target/AMDGPU/SIWholeQuadMode.cpp index 879726b1528c..4e07efff55d8 100644 --- a/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -1,9 +1,8 @@ //===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -149,6 +148,7 @@ private: CallingConv::ID CallingConv; const SIInstrInfo *TII; const SIRegisterInfo *TRI; + const GCNSubtarget *ST; MachineRegisterInfo *MRI; LiveIntervals *LIS; @@ -201,6 +201,8 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<LiveIntervals>(); + AU.addPreserved<SlotIndexes>(); + AU.addPreserved<LiveIntervals>(); AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -277,7 +279,7 @@ void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag, // for VCC, which can appear as the (implicit) input of a uniform branch, // e.g. when a loop counter is stored in a VGPR. if (!TargetRegisterInfo::isVirtualRegister(Reg)) { - if (Reg == AMDGPU::EXEC) + if (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO) continue; for (MCRegUnitIterator RegUnit(Reg, TRI); RegUnit.isValid(); ++RegUnit) { @@ -386,7 +388,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, unsigned Reg = MO.getReg(); if (!TRI->isVirtualRegister(Reg) && - TRI->hasVGPRs(TRI->getPhysRegClass(Reg))) { + TRI->hasVectorRegisters(TRI->getPhysRegClass(Reg))) { Flags = StateWQM; break; } @@ -619,13 +621,16 @@ void SIWholeQuadMode::toExact(MachineBasicBlock &MBB, MachineInstr *MI; if (SaveWQM) { - MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_SAVEEXEC_B64), + MI = BuildMI(MBB, Before, DebugLoc(), TII->get(ST->isWave32() ? + AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64), SaveWQM) .addReg(LiveMaskReg); } else { - MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_B64), - AMDGPU::EXEC) - .addReg(AMDGPU::EXEC) + unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; + MI = BuildMI(MBB, Before, DebugLoc(), TII->get(ST->isWave32() ? + AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64), + Exec) + .addReg(Exec) .addReg(LiveMaskReg); } @@ -637,13 +642,15 @@ void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB, unsigned SavedWQM) { MachineInstr *MI; + unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; if (SavedWQM) { - MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::EXEC) + MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), Exec) .addReg(SavedWQM); } else { - MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_WQM_B64), - AMDGPU::EXEC) - .addReg(AMDGPU::EXEC); + MI = BuildMI(MBB, Before, DebugLoc(), TII->get(ST->isWave32() ? + AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64), + Exec) + .addReg(Exec); } LIS->InsertMachineInstrInMaps(*MI); @@ -655,8 +662,7 @@ void SIWholeQuadMode::toWWM(MachineBasicBlock &MBB, MachineInstr *MI; assert(SaveOrig); - MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_OR_SAVEEXEC_B64), - SaveOrig) + MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_WWM), SaveOrig) .addImm(-1); LIS->InsertMachineInstrInMaps(*MI); } @@ -667,7 +673,8 @@ void SIWholeQuadMode::fromWWM(MachineBasicBlock &MBB, MachineInstr *MI; assert(SavedOrig); - MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_WWM), AMDGPU::EXEC) + MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_WWM), + ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC) .addReg(SavedOrig); LIS->InsertMachineInstrInMaps(*MI); } @@ -693,6 +700,7 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool WQMFromExec = isEntry; char State = (isEntry || !(BI.InNeeds & StateWQM)) ? 
StateExact : StateWQM; char NonWWMState = 0; + const TargetRegisterClass *BoolRC = TRI->getBoolRC(); auto II = MBB.getFirstNonPHI(), IE = MBB.end(); if (isEntry) @@ -780,13 +788,13 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, if (Needs == StateWWM) { NonWWMState = State; - SavedNonWWMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); + SavedNonWWMReg = MRI->createVirtualRegister(BoolRC); toWWM(MBB, Before, SavedNonWWMReg); State = StateWWM; } else { if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) { if (!WQMFromExec && (OutNeeds & StateWQM)) - SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); + SavedWQMReg = MRI->createVirtualRegister(BoolRC); toExact(MBB, Before, SavedWQMReg, LiveMaskReg); State = StateExact; @@ -838,7 +846,23 @@ void SIWholeQuadMode::lowerCopyInstrs() { for (MachineInstr *MI : LowerToCopyInstrs) { for (unsigned i = MI->getNumExplicitOperands() - 1; i > 1; i--) MI->RemoveOperand(i); - MI->setDesc(TII->get(AMDGPU::COPY)); + + const unsigned Reg = MI->getOperand(0).getReg(); + + if (TRI->isVGPR(*MRI, Reg)) { + const TargetRegisterClass *regClass = + TargetRegisterInfo::isVirtualRegister(Reg) + ? MRI->getRegClass(Reg) + : TRI->getPhysRegClass(Reg); + + const unsigned MovOp = TII->getMovOpcode(regClass); + MI->setDesc(TII->get(MovOp)); + + // And make it implicitly depend on exec (like all VALU movs should do). + MI->addOperand(MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); + } else { + MI->setDesc(TII->get(AMDGPU::COPY)); + } } } @@ -849,17 +873,18 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { LowerToCopyInstrs.clear(); CallingConv = MF.getFunction().getCallingConv(); - const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + ST = &MF.getSubtarget<GCNSubtarget>(); - TII = ST.getInstrInfo(); + TII = ST->getInstrInfo(); TRI = &TII->getRegisterInfo(); MRI = &MF.getRegInfo(); LIS = &getAnalysis<LiveIntervals>(); char GlobalFlags = analyzeFunction(MF); unsigned LiveMaskReg = 0; + unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; if (!(GlobalFlags & StateWQM)) { - lowerLiveMaskQueries(AMDGPU::EXEC); + lowerLiveMaskQueries(Exec); if (!(GlobalFlags & StateWWM)) return !LiveMaskQueries.empty(); } else { @@ -868,10 +893,10 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI(); if (GlobalFlags & StateExact || !LiveMaskQueries.empty()) { - LiveMaskReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); + LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC()); MachineInstr *MI = BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg) - .addReg(AMDGPU::EXEC); + .addReg(Exec); LIS->InsertMachineInstrInMaps(*MI); } @@ -879,9 +904,10 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { if (GlobalFlags == StateWQM) { // For a shader that needs only WQM, we can just set it once. - BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::S_WQM_B64), - AMDGPU::EXEC) - .addReg(AMDGPU::EXEC); + BuildMI(Entry, EntryMI, DebugLoc(), TII->get(ST->isWave32() ? 
+ AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64), + Exec) + .addReg(Exec); lowerCopyInstrs(); // EntryMI may become invalid here diff --git a/lib/Target/AMDGPU/SMInstructions.td b/lib/Target/AMDGPU/SMInstructions.td index 8a063e1a4867..1b410b6b5912 100644 --- a/lib/Target/AMDGPU/SMInstructions.td +++ b/lib/Target/AMDGPU/SMInstructions.td @@ -1,9 +1,8 @@ //===---- SMInstructions.td - Scalar Memory Instruction Definitions --------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -34,7 +33,6 @@ class SM_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> patt let hasSideEffects = 0; let UseNamedOperandTable = 1; let SchedRW = [WriteSMEM]; - let SubtargetPredicate = isGCN; string Mnemonic = opName; string AsmOperands = asmOps; @@ -42,6 +40,7 @@ class SM_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> patt bits<1> has_sbase = 1; bits<1> has_sdst = 1; bit has_glc = 0; + bit has_dlc = 0; bits<1> has_offset = 1; bits<1> offset_is_imm = 0; } @@ -81,6 +80,7 @@ class SM_Load_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> let mayLoad = 1; let mayStore = 0; let has_glc = 1; + let has_dlc = 1; } class SM_Store_Pseudo <string opName, dag ins, string asmOps, list<dag> pattern = []> @@ -90,6 +90,7 @@ class SM_Store_Pseudo <string opName, dag ins, string asmOps, list<dag> pattern let mayLoad = 0; let mayStore = 1; let has_glc = 1; + let has_dlc = 1; let ScalarStore = 1; } @@ -110,21 +111,23 @@ multiclass SM_Pseudo_Loads<string opName, RegisterClass dstClass> { def _IMM : SM_Load_Pseudo <opName, (outs dstClass:$sdst), - (ins baseClass:$sbase, i32imm:$offset, i1imm:$glc), - " $sdst, $sbase, $offset$glc", []> { + (ins baseClass:$sbase, i32imm:$offset, i1imm:$glc, i1imm:$dlc), + " $sdst, $sbase, $offset$glc$dlc", []> { let offset_is_imm = 1; let BaseClass = baseClass; let PseudoInstr = opName # "_IMM"; let has_glc = 1; + let has_dlc = 1; } def _SGPR : SM_Load_Pseudo <opName, (outs dstClass:$sdst), - (ins baseClass:$sbase, SReg_32:$soff, i1imm:$glc), - " $sdst, $sbase, $offset$glc", []> { + (ins baseClass:$sbase, SReg_32:$soff, i1imm:$glc, i1imm:$dlc), + " $sdst, $sbase, $offset$glc$dlc", []> { let BaseClass = baseClass; let PseudoInstr = opName # "_SGPR"; let has_glc = 1; + let has_dlc = 1; } } @@ -132,8 +135,8 @@ multiclass SM_Pseudo_Stores<string opName, RegisterClass baseClass, RegisterClass srcClass> { def _IMM : SM_Store_Pseudo <opName, - (ins srcClass:$sdata, baseClass:$sbase, i32imm:$offset, i1imm:$glc), - " $sdata, $sbase, $offset$glc", []> { + (ins srcClass:$sdata, baseClass:$sbase, i32imm:$offset, i1imm:$glc, i1imm:$dlc), + " $sdata, $sbase, $offset$glc$dlc", []> { let offset_is_imm = 1; let BaseClass = baseClass; let SrcClass = srcClass; @@ -141,8 +144,8 @@ multiclass SM_Pseudo_Stores<string opName, } def _SGPR : SM_Store_Pseudo <opName, - (ins srcClass:$sdata, baseClass:$sbase, SReg_32:$soff, i1imm:$glc), - " $sdata, $sbase, $offset$glc", []> { + (ins srcClass:$sdata, baseClass:$sbase, SReg_32:$soff, i1imm:$glc, i1imm:$dlc), + " $sdata, $sbase, $offset$glc$dlc", []> { let BaseClass = baseClass; let SrcClass = srcClass; let PseudoInstr = opName # "_SGPR";
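For illustration, a minimal sketch of what one of these store pseudos expands to after this change, assuming only the SM_Pseudo_Stores multiclass above (flags such as mayStore and has_dlc are inherited from SM_Store_Pseudo, and the lets for offset_is_imm, BaseClass, SrcClass and PseudoInstr are omitted here):

  // from: defm S_STORE_DWORD : SM_Pseudo_Stores<"s_store_dword", SReg_64, SReg_32_XM0_XEXEC>;
  def S_STORE_DWORD_IMM : SM_Store_Pseudo<"s_store_dword",
    (ins SReg_32_XM0_XEXEC:$sdata, SReg_64:$sbase, i32imm:$offset,
         i1imm:$glc, i1imm:$dlc),
    " $sdata, $sbase, $offset$glc$dlc">;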
@@ -154,17 +157,25 @@ multiclass SM_Pseudo_Discards<string opName> { def _SGPR : SM_Discard_Pseudo <opName, (ins SReg_64:$sbase, SReg_32:$offset), 0>; } -class SM_Time_Pseudo<string opName, SDPatternOperator node> : SM_Pseudo< +class SM_Time_Pseudo<string opName, SDPatternOperator node = null_frag> : SM_Pseudo< opName, (outs SReg_64_XEXEC:$sdst), (ins), " $sdst", [(set i64:$sdst, (node))]> { let hasSideEffects = 1; - let mayStore = 0; + + // FIXME: This should be definitively mayStore = 0. TableGen + // brokenly tries to infer these based on the intrinsic properties + // corresponding to the IR attributes. The target intrinsics are + // considered as writing to memory for IR dependency purposes, but + // those can be modeled with hasSideEffects here. These also end up + // inferring differently for llvm.readcyclecounter and the amdgcn + // intrinsics. + let mayStore = ?; let mayLoad = 1; let has_sbase = 0; let has_offset = 0; } -class SM_Inval_Pseudo <string opName, SDPatternOperator node> : SM_Pseudo< +class SM_Inval_Pseudo <string opName, SDPatternOperator node = null_frag> : SM_Pseudo< opName, (outs), (ins), "", [(node)]> { let hasSideEffects = 1; let mayStore = 1; @@ -178,6 +189,16 @@ multiclass SM_Pseudo_Probe<string opName, RegisterClass baseClass> { def _SGPR : SM_Probe_Pseudo <opName, (ins i8imm:$sdata, baseClass:$sbase, SReg_32:$offset), 0>; } +class SM_WaveId_Pseudo<string opName, SDPatternOperator node> : SM_Pseudo< + opName, (outs SReg_32_XM0_XEXEC:$sdst), (ins), + " $sdst", [(set i32:$sdst, (node))]> { + let hasSideEffects = 1; + let mayStore = 0; + let mayLoad = 1; + let has_sbase = 0; + let has_offset = 0; +} + //===----------------------------------------------------------------------===// // Scalar Atomic Memory Classes //===----------------------------------------------------------------------===// @@ -191,6 +212,7 @@ class SM_Atomic_Pseudo <string opName, let mayLoad = 1; let mayStore = 1; let has_glc = 1; + let has_dlc = 1; // Should these be set? 
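+ // (dlc is the GFX10 "device level coherent" cache-control bit that this
+ // change threads through the SMEM definitions alongside glc.)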
let ScalarStore = 1; @@ -206,9 +228,9 @@ class SM_Pseudo_Atomic<string opName, SM_Atomic_Pseudo<opName, !if(isRet, (outs dataClass:$sdst), (outs)), !if(isImm, - (ins dataClass:$sdata, baseClass:$sbase, smrd_offset_20:$offset), - (ins dataClass:$sdata, baseClass:$sbase, SReg_32:$offset)), - !if(isRet, " $sdst", " $sdata") # ", $sbase, $offset" # !if(isRet, " glc", ""), + (ins dataClass:$sdata, baseClass:$sbase, smrd_offset_20:$offset, DLC:$dlc), + (ins dataClass:$sdata, baseClass:$sbase, SReg_32:$offset, DLC:$dlc)), + !if(isRet, " $sdst", " $sdata") # ", $sbase, $offset" # !if(isRet, " glc", "") # "$dlc", isRet> { let offset_is_imm = isImm; let PseudoInstr = opName # !if(isImm, @@ -266,6 +288,7 @@ defm S_BUFFER_LOAD_DWORDX16 : SM_Pseudo_Loads < "s_buffer_load_dwordx16", SReg_128, SReg_512 >; +let SubtargetPredicate = HasScalarStores in { defm S_STORE_DWORD : SM_Pseudo_Stores <"s_store_dword", SReg_64, SReg_32_XM0_XEXEC>; defm S_STORE_DWORDX2 : SM_Pseudo_Stores <"s_store_dwordx2", SReg_64, SReg_64_XEXEC>; defm S_STORE_DWORDX4 : SM_Pseudo_Stores <"s_store_dwordx4", SReg_64, SReg_128>; @@ -281,25 +304,32 @@ defm S_BUFFER_STORE_DWORDX2 : SM_Pseudo_Stores < defm S_BUFFER_STORE_DWORDX4 : SM_Pseudo_Stores < "s_buffer_store_dwordx4", SReg_128, SReg_128 >; - +} // End SubtargetPredicate = HasScalarStores def S_MEMTIME : SM_Time_Pseudo <"s_memtime", int_amdgcn_s_memtime>; def S_DCACHE_INV : SM_Inval_Pseudo <"s_dcache_inv", int_amdgcn_s_dcache_inv>; -let SubtargetPredicate = isCIVI in { +let SubtargetPredicate = isGFX7GFX8GFX9 in { def S_DCACHE_INV_VOL : SM_Inval_Pseudo <"s_dcache_inv_vol", int_amdgcn_s_dcache_inv_vol>; -} // let SubtargetPredicate = isCIVI +} // let SubtargetPredicate = isGFX7GFX8GFX9 -let SubtargetPredicate = isVI in { +let SubtargetPredicate = isGFX8Plus in { +let OtherPredicates = [HasScalarStores] in { def S_DCACHE_WB : SM_Inval_Pseudo <"s_dcache_wb", int_amdgcn_s_dcache_wb>; def S_DCACHE_WB_VOL : SM_Inval_Pseudo <"s_dcache_wb_vol", int_amdgcn_s_dcache_wb_vol>; +} // End OtherPredicates = [HasScalarStores] def S_MEMREALTIME : SM_Time_Pseudo <"s_memrealtime", int_amdgcn_s_memrealtime>; defm S_ATC_PROBE : SM_Pseudo_Probe <"s_atc_probe", SReg_64>; defm S_ATC_PROBE_BUFFER : SM_Pseudo_Probe <"s_atc_probe_buffer", SReg_128>; -} // SubtargetPredicate = isVI +} // SubtargetPredicate = isGFX8Plus + +let SubtargetPredicate = isGFX10Plus in { +def S_GL1_INV : SM_Inval_Pseudo<"s_gl1_inv">; +def S_GET_WAVEID_IN_WORKGROUP : SM_WaveId_Pseudo <"s_get_waveid_in_workgroup", int_amdgcn_s_get_waveid_in_workgroup>; +} // End SubtargetPredicate = isGFX10Plus -let SubtargetPredicate = HasFlatScratchInsts, Uses = [FLAT_SCR] in { +let SubtargetPredicate = HasScalarFlatScratchInsts, Uses = [FLAT_SCR] in { defm S_SCRATCH_LOAD_DWORD : SM_Pseudo_Loads <"s_scratch_load_dword", SReg_64, SReg_32_XM0_XEXEC>; defm S_SCRATCH_LOAD_DWORDX2 : SM_Pseudo_Loads <"s_scratch_load_dwordx2", SReg_64, SReg_64_XEXEC>; defm S_SCRATCH_LOAD_DWORDX4 : SM_Pseudo_Loads <"s_scratch_load_dwordx4", SReg_64, SReg_128>; @@ -307,7 +337,7 @@ defm S_SCRATCH_LOAD_DWORDX4 : SM_Pseudo_Loads <"s_scratch_load_dwordx4", SReg_6 defm S_SCRATCH_STORE_DWORD : SM_Pseudo_Stores <"s_scratch_store_dword", SReg_64, SReg_32_XM0_XEXEC>; defm S_SCRATCH_STORE_DWORDX2 : SM_Pseudo_Stores <"s_scratch_store_dwordx2", SReg_64, SReg_64_XEXEC>; defm S_SCRATCH_STORE_DWORDX4 : SM_Pseudo_Stores <"s_scratch_store_dwordx4", SReg_64, SReg_128>; -} // SubtargetPredicate = HasFlatScratchInsts +} // SubtargetPredicate = HasScalarFlatScratchInsts let SubtargetPredicate = 
HasScalarAtomics in { @@ -369,7 +399,7 @@ defm S_ATOMIC_DEC_X2 : SM_Pseudo_Atomics <"s_atomic_dec_x2", SReg_6 } // let SubtargetPredicate = HasScalarAtomics -let SubtargetPredicate = isGFX9 in { +let SubtargetPredicate = HasScalarAtomics in { defm S_DCACHE_DISCARD : SM_Pseudo_Discards <"s_dcache_discard">; defm S_DCACHE_DISCARD_X2 : SM_Pseudo_Discards <"s_dcache_discard_x2">; } @@ -387,8 +417,8 @@ class SMRD_Real_si <bits<5> op, SM_Pseudo ps> , SIMCInstr<ps.PseudoInstr, SIEncodingFamily.SI> , Enc32 { - let AssemblerPredicates = [isSICI]; - let DecoderNamespace = "SICI"; + let AssemblerPredicates = [isGFX6GFX7]; + let DecoderNamespace = "GFX6GFX7"; let Inst{7-0} = !if(ps.has_offset, offset{7-0}, ?); let Inst{8} = imm; @@ -405,13 +435,13 @@ multiclass SM_Real_Loads_si<bits<5> op, string ps, SM_Load_Pseudo sgprPs = !cast<SM_Load_Pseudo>(ps#_SGPR)> { def _IMM_si : SMRD_Real_si <op, immPs> { - let InOperandList = (ins immPs.BaseClass:$sbase, smrd_offset_8:$offset, GLC:$glc); + let InOperandList = (ins immPs.BaseClass:$sbase, smrd_offset_8:$offset, GLC:$glc, DLC:$dlc); } // FIXME: The operand name $offset is inconsistent with $soff used // in the pseudo def _SGPR_si : SMRD_Real_si <op, sgprPs> { - let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc); + let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc, DLC:$dlc); } } @@ -441,8 +471,8 @@ class SMEM_Real_vi <bits<8> op, SM_Pseudo ps> , Enc64 { bit glc; - let AssemblerPredicates = [isVI]; - let DecoderNamespace = "VI"; + let AssemblerPredicates = [isGFX8GFX9]; + let DecoderNamespace = "GFX8"; let Inst{5-0} = !if(ps.has_sbase, sbase{6-1}, ?); let Inst{12-6} = !if(ps.has_sdst, sdst{6-0}, ?); @@ -458,10 +488,10 @@ multiclass SM_Real_Loads_vi<bits<8> op, string ps, SM_Load_Pseudo immPs = !cast<SM_Load_Pseudo>(ps#_IMM), SM_Load_Pseudo sgprPs = !cast<SM_Load_Pseudo>(ps#_SGPR)> { def _IMM_vi : SMEM_Real_vi <op, immPs> { - let InOperandList = (ins immPs.BaseClass:$sbase, smrd_offset_20:$offset, GLC:$glc); + let InOperandList = (ins immPs.BaseClass:$sbase, smrd_offset_20:$offset, GLC:$glc, DLC:$dlc); } def _SGPR_vi : SMEM_Real_vi <op, sgprPs> { - let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc); + let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc, DLC:$dlc); } } @@ -479,11 +509,11 @@ multiclass SM_Real_Stores_vi<bits<8> op, string ps, // FIXME: The operand name $offset is inconsistent with $soff used // in the pseudo def _IMM_vi : SMEM_Real_Store_vi <op, immPs> { - let InOperandList = (ins immPs.SrcClass:$sdata, immPs.BaseClass:$sbase, smrd_offset_20:$offset, GLC:$glc); + let InOperandList = (ins immPs.SrcClass:$sdata, immPs.BaseClass:$sbase, smrd_offset_20:$offset, GLC:$glc, DLC:$dlc); } def _SGPR_vi : SMEM_Real_Store_vi <op, sgprPs> { - let InOperandList = (ins sgprPs.SrcClass:$sdata, sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc); + let InOperandList = (ins sgprPs.SrcClass:$sdata, sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc, DLC:$dlc); } } @@ -630,9 +660,9 @@ class SMRD_Real_Load_IMM_ci <bits<5> op, SM_Load_Pseudo ps> : SM_Real<ps>, Enc64 { - let AssemblerPredicates = [isCIOnly]; - let DecoderNamespace = "CI"; - let InOperandList = (ins ps.BaseClass:$sbase, smrd_literal_offset:$offset, GLC:$glc); + let AssemblerPredicates = [isGFX7Only]; + let DecoderNamespace = "GFX7"; + let InOperandList = (ins ps.BaseClass:$sbase, smrd_literal_offset:$offset, GLC:$glc, DLC:$dlc); let LGKM_CNT = ps.LGKM_CNT; let SMRD = ps.SMRD; @@ -667,8 +697,8 @@ class 
SMRD_Real_ci <bits<5> op, SM_Pseudo ps> , SIMCInstr<ps.PseudoInstr, SIEncodingFamily.SI> , Enc32 { - let AssemblerPredicates = [isCIOnly]; - let DecoderNamespace = "CI"; + let AssemblerPredicates = [isGFX7Only]; + let DecoderNamespace = "GFX7"; let Inst{7-0} = !if(ps.has_offset, offset{7-0}, ?); let Inst{8} = imm; @@ -684,7 +714,22 @@ def S_DCACHE_INV_VOL_ci : SMRD_Real_ci <0x1d, S_DCACHE_INV_VOL>; // Scalar Memory Patterns //===----------------------------------------------------------------------===// -def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{ return isUniformLoad(N);}]>; +def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{ return isUniformLoad(N);}]> { + let GISelPredicateCode = [{ + if (!MI.hasOneMemOperand()) + return false; + if (!isInstrUniform(MI)) + return false; + + // FIXME: We should probably be caching this. + SmallVector<GEPInfo, 4> AddrInfo; + getAddrModeInfo(MI, MRI, AddrInfo); + + if (hasVgprParts(AddrInfo)) + return false; + return true; + }]; +} def SMRDImm : ComplexPattern<i64, 2, "SelectSMRDImm">; def SMRDImm32 : ComplexPattern<i64, 2, "SelectSMRDImm32">; @@ -697,41 +742,49 @@ multiclass SMRD_Pattern <string Instr, ValueType vt> { // 1. IMM offset def : GCNPat < (smrd_load (SMRDImm i64:$sbase, i32:$offset)), - (vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset, 0)) + (vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset, 0, 0)) >; // 2. 32-bit IMM offset on CI def : GCNPat < (smrd_load (SMRDImm32 i64:$sbase, i32:$offset)), - (vt (!cast<InstSI>(Instr#"_IMM_ci") $sbase, $offset, 0))> { - let OtherPredicates = [isCIOnly]; + (vt (!cast<InstSI>(Instr#"_IMM_ci") $sbase, $offset, 0, 0))> { + let OtherPredicates = [isGFX7Only]; } // 3. SGPR offset def : GCNPat < (smrd_load (SMRDSgpr i64:$sbase, i32:$offset)), - (vt (!cast<SM_Pseudo>(Instr#"_SGPR") $sbase, $offset, 0)) + (vt (!cast<SM_Pseudo>(Instr#"_SGPR") $sbase, $offset, 0, 0)) + >; + + // 4. No offset + def : GCNPat < + (vt (smrd_load (i64 SReg_64:$sbase))), + (vt (!cast<SM_Pseudo>(Instr#"_IMM") i64:$sbase, 0, 0, 0)) >; } multiclass SMLoad_Pattern <string Instr, ValueType vt> { // 1. Offset as an immediate def : GCNPat < - (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm i32:$offset), i1:$glc), - (vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset, (as_i1imm $glc))) + (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm i32:$offset), i1:$glc, i1:$dlc), + (vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset, (as_i1imm $glc), + (as_i1imm $dlc))) >; // 2. 32-bit IMM offset on CI def : GCNPat < - (vt (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm32 i32:$offset), i1:$glc)), - (!cast<InstSI>(Instr#"_IMM_ci") $sbase, $offset, (as_i1imm $glc))> { - let OtherPredicates = [isCIOnly]; + (vt (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm32 i32:$offset), i1:$glc, i1:$dlc)), + (!cast<InstSI>(Instr#"_IMM_ci") $sbase, $offset, (as_i1imm $glc), (as_i1imm $dlc))> { + let OtherPredicates = [isGFX7Only]; } // 3. 
Offset loaded in a 32-bit SGPR def : GCNPat < - (SIsbuffer_load v4i32:$sbase, i32:$offset, i1:$glc), - (vt (!cast<SM_Pseudo>(Instr#"_SGPR") $sbase, $offset, (as_i1imm $glc))) + (SIsbuffer_load v4i32:$sbase, i32:$offset, i1:$glc, i1:$dlc), + (vt (!cast<SM_Pseudo>(Instr#"_SGPR") $sbase, $offset, (as_i1imm $glc), + (as_i1imm $dlc))) >; } @@ -759,18 +812,202 @@ defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX8", v8f32>; defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX16", v16f32>; } // End let AddedComplexity = 100 -let OtherPredicates = [isSICI] in { def : GCNPat < (i64 (readcyclecounter)), (S_MEMTIME) >; + +//===----------------------------------------------------------------------===// +// GFX10. +//===----------------------------------------------------------------------===// + +class SMEM_Real_gfx10<bits<8> op, SM_Pseudo ps> : + SM_Real<ps>, SIMCInstr<ps.PseudoInstr, SIEncodingFamily.GFX10>, Enc64 { + bit glc; + bit dlc; + + let AssemblerPredicates = [isGFX10Plus]; + let DecoderNamespace = "GFX10"; + + let Inst{5-0} = !if(ps.has_sbase, sbase{6-1}, ?); + let Inst{12-6} = !if(ps.has_sdst, sdst{6-0}, ?); + let Inst{14} = !if(ps.has_dlc, dlc, ?); + let Inst{16} = !if(ps.has_glc, glc, ?); + let Inst{25-18} = op; + let Inst{31-26} = 0x3d; + let Inst{51-32} = !if(ps.offset_is_imm, !if(ps.has_offset, offset{19-0}, ?), ?); + let Inst{63-57} = !if(ps.offset_is_imm, !cast<int>(SGPR_NULL.HWEncoding), + !if(ps.has_offset, offset{6-0}, ?)); } multiclass SM_Real_Loads_gfx10<bits<8> op, string ps, + SM_Load_Pseudo immPs = !cast<SM_Load_Pseudo>(ps#_IMM), + SM_Load_Pseudo sgprPs = !cast<SM_Load_Pseudo>(ps#_SGPR)> { + def _IMM_gfx10 : SMEM_Real_gfx10<op, immPs> { + let InOperandList = (ins immPs.BaseClass:$sbase, smrd_offset_20:$offset, GLC:$glc, DLC:$dlc); + } + def _SGPR_gfx10 : SMEM_Real_gfx10<op, sgprPs> { + let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc, DLC:$dlc); + } +} -def : GCNPat < - (i64 (readcyclecounter)), - (S_MEMREALTIME) ->; +class SMEM_Real_Store_gfx10<bits<8> op, SM_Pseudo ps> : SMEM_Real_gfx10<op, ps> { + bits<7> sdata; + + let sdst = ?; + let Inst{12-6} = !if(ps.has_sdst, sdata{6-0}, ?); +} + +multiclass SM_Real_Stores_gfx10<bits<8> op, string ps, + SM_Store_Pseudo immPs = !cast<SM_Store_Pseudo>(ps#_IMM), + SM_Store_Pseudo sgprPs = !cast<SM_Store_Pseudo>(ps#_SGPR)> { + // FIXME: The operand name $offset is inconsistent with $soff used + // in the pseudo + def _IMM_gfx10 : SMEM_Real_Store_gfx10 <op, immPs> { + let InOperandList = (ins immPs.SrcClass:$sdata, immPs.BaseClass:$sbase, smrd_offset_20:$offset, GLC:$glc, DLC:$dlc); + } + + def _SGPR_gfx10 : SMEM_Real_Store_gfx10 <op, sgprPs> { + let InOperandList = (ins sgprPs.SrcClass:$sdata, sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc, DLC:$dlc); + } +} + +defm S_LOAD_DWORD : SM_Real_Loads_gfx10<0x000, "S_LOAD_DWORD">; +defm S_LOAD_DWORDX2 : SM_Real_Loads_gfx10<0x001, "S_LOAD_DWORDX2">; +defm S_LOAD_DWORDX4 : SM_Real_Loads_gfx10<0x002, "S_LOAD_DWORDX4">; +defm S_LOAD_DWORDX8 : SM_Real_Loads_gfx10<0x003, "S_LOAD_DWORDX8">; +defm S_LOAD_DWORDX16 : SM_Real_Loads_gfx10<0x004, "S_LOAD_DWORDX16">; + +let SubtargetPredicate = HasScalarFlatScratchInsts in { +defm S_SCRATCH_LOAD_DWORD : SM_Real_Loads_gfx10<0x005, "S_SCRATCH_LOAD_DWORD">; +defm S_SCRATCH_LOAD_DWORDX2 : SM_Real_Loads_gfx10<0x006, "S_SCRATCH_LOAD_DWORDX2">; +defm S_SCRATCH_LOAD_DWORDX4 : SM_Real_Loads_gfx10<0x007, "S_SCRATCH_LOAD_DWORDX4">; +} // End SubtargetPredicate = HasScalarFlatScratchInsts + +defm S_BUFFER_LOAD_DWORD
: SM_Real_Loads_gfx10<0x008, "S_BUFFER_LOAD_DWORD">; +defm S_BUFFER_LOAD_DWORDX2 : SM_Real_Loads_gfx10<0x009, "S_BUFFER_LOAD_DWORDX2">; +defm S_BUFFER_LOAD_DWORDX4 : SM_Real_Loads_gfx10<0x00a, "S_BUFFER_LOAD_DWORDX4">; +defm S_BUFFER_LOAD_DWORDX8 : SM_Real_Loads_gfx10<0x00b, "S_BUFFER_LOAD_DWORDX8">; +defm S_BUFFER_LOAD_DWORDX16 : SM_Real_Loads_gfx10<0x00c, "S_BUFFER_LOAD_DWORDX16">; + +let SubtargetPredicate = HasScalarStores in { +defm S_STORE_DWORD : SM_Real_Stores_gfx10<0x010, "S_STORE_DWORD">; +defm S_STORE_DWORDX2 : SM_Real_Stores_gfx10<0x011, "S_STORE_DWORDX2">; +defm S_STORE_DWORDX4 : SM_Real_Stores_gfx10<0x012, "S_STORE_DWORDX4">; +let OtherPredicates = [HasScalarFlatScratchInsts] in { +defm S_SCRATCH_STORE_DWORD : SM_Real_Stores_gfx10<0x015, "S_SCRATCH_STORE_DWORD">; +defm S_SCRATCH_STORE_DWORDX2 : SM_Real_Stores_gfx10<0x016, "S_SCRATCH_STORE_DWORDX2">; +defm S_SCRATCH_STORE_DWORDX4 : SM_Real_Stores_gfx10<0x017, "S_SCRATCH_STORE_DWORDX4">; +} // End OtherPredicates = [HasScalarFlatScratchInsts] +defm S_BUFFER_STORE_DWORD : SM_Real_Stores_gfx10<0x018, "S_BUFFER_STORE_DWORD">; +defm S_BUFFER_STORE_DWORDX2 : SM_Real_Stores_gfx10<0x019, "S_BUFFER_STORE_DWORDX2">; +defm S_BUFFER_STORE_DWORDX4 : SM_Real_Stores_gfx10<0x01a, "S_BUFFER_STORE_DWORDX4">; +} // End SubtargetPredicate = HasScalarStores + +def S_MEMREALTIME_gfx10 : SMEM_Real_gfx10<0x025, S_MEMREALTIME>; +def S_MEMTIME_gfx10 : SMEM_Real_gfx10<0x024, S_MEMTIME>; +def S_GL1_INV_gfx10 : SMEM_Real_gfx10<0x01f, S_GL1_INV>; +def S_GET_WAVEID_IN_WORKGROUP_gfx10 : SMEM_Real_gfx10<0x02a, S_GET_WAVEID_IN_WORKGROUP>; +def S_DCACHE_INV_gfx10 : SMEM_Real_gfx10<0x020, S_DCACHE_INV>; + +let SubtargetPredicate = HasScalarStores in { +def S_DCACHE_WB_gfx10 : SMEM_Real_gfx10<0x021, S_DCACHE_WB>; +} // End SubtargetPredicate = HasScalarStores + +multiclass SM_Real_Probe_gfx10<bits<8> op, string ps> { + def _IMM_gfx10 : SMEM_Real_Store_gfx10 <op, !cast<SM_Pseudo>(ps#_IMM)>; + def _SGPR_gfx10 : SMEM_Real_Store_gfx10 <op, !cast<SM_Pseudo>(ps#_SGPR)>; +} + +defm S_ATC_PROBE : SM_Real_Probe_gfx10 <0x26, "S_ATC_PROBE">; +defm S_ATC_PROBE_BUFFER : SM_Real_Probe_gfx10 <0x27, "S_ATC_PROBE_BUFFER">; + +class SMEM_Atomic_Real_gfx10 <bits<8> op, SM_Atomic_Pseudo ps> + : SMEM_Real_gfx10 <op, ps> { + + bits<7> sdata; + bit dlc; + + let Constraints = ps.Constraints; + let DisableEncoding = ps.DisableEncoding; + + let glc = ps.glc; + + let Inst{14} = !if(ps.has_dlc, dlc, 0); + let Inst{12-6} = !if(glc, sdst{6-0}, sdata{6-0}); +} + +multiclass SM_Real_Atomics_gfx10<bits<8> op, string ps> { + def _IMM_gfx10 : SMEM_Atomic_Real_gfx10 <op, !cast<SM_Atomic_Pseudo>(ps#_IMM)>; + def _SGPR_gfx10 : SMEM_Atomic_Real_gfx10 <op, !cast<SM_Atomic_Pseudo>(ps#_SGPR)>; + def _IMM_RTN_gfx10 : SMEM_Atomic_Real_gfx10 <op, !cast<SM_Atomic_Pseudo>(ps#_IMM_RTN)>; + def _SGPR_RTN_gfx10 : SMEM_Atomic_Real_gfx10 <op, !cast<SM_Atomic_Pseudo>(ps#_SGPR_RTN)>; +} + +let SubtargetPredicate = HasScalarAtomics in { -} // let OtherPredicates = [isVI] +defm S_BUFFER_ATOMIC_SWAP : SM_Real_Atomics_gfx10 <0x40, "S_BUFFER_ATOMIC_SWAP">; +defm S_BUFFER_ATOMIC_CMPSWAP : SM_Real_Atomics_gfx10 <0x41, "S_BUFFER_ATOMIC_CMPSWAP">; +defm S_BUFFER_ATOMIC_ADD : SM_Real_Atomics_gfx10 <0x42, "S_BUFFER_ATOMIC_ADD">; +defm S_BUFFER_ATOMIC_SUB : SM_Real_Atomics_gfx10 <0x43, "S_BUFFER_ATOMIC_SUB">; +defm S_BUFFER_ATOMIC_SMIN : SM_Real_Atomics_gfx10 <0x44, "S_BUFFER_ATOMIC_SMIN">; +defm S_BUFFER_ATOMIC_UMIN : SM_Real_Atomics_gfx10 <0x45, "S_BUFFER_ATOMIC_UMIN">; +defm S_BUFFER_ATOMIC_SMAX : SM_Real_Atomics_gfx10 
<0x46, "S_BUFFER_ATOMIC_SMAX">; +defm S_BUFFER_ATOMIC_UMAX : SM_Real_Atomics_gfx10 <0x47, "S_BUFFER_ATOMIC_UMAX">; +defm S_BUFFER_ATOMIC_AND : SM_Real_Atomics_gfx10 <0x48, "S_BUFFER_ATOMIC_AND">; +defm S_BUFFER_ATOMIC_OR : SM_Real_Atomics_gfx10 <0x49, "S_BUFFER_ATOMIC_OR">; +defm S_BUFFER_ATOMIC_XOR : SM_Real_Atomics_gfx10 <0x4a, "S_BUFFER_ATOMIC_XOR">; +defm S_BUFFER_ATOMIC_INC : SM_Real_Atomics_gfx10 <0x4b, "S_BUFFER_ATOMIC_INC">; +defm S_BUFFER_ATOMIC_DEC : SM_Real_Atomics_gfx10 <0x4c, "S_BUFFER_ATOMIC_DEC">; + +defm S_BUFFER_ATOMIC_SWAP_X2 : SM_Real_Atomics_gfx10 <0x60, "S_BUFFER_ATOMIC_SWAP_X2">; +defm S_BUFFER_ATOMIC_CMPSWAP_X2 : SM_Real_Atomics_gfx10 <0x61, "S_BUFFER_ATOMIC_CMPSWAP_X2">; +defm S_BUFFER_ATOMIC_ADD_X2 : SM_Real_Atomics_gfx10 <0x62, "S_BUFFER_ATOMIC_ADD_X2">; +defm S_BUFFER_ATOMIC_SUB_X2 : SM_Real_Atomics_gfx10 <0x63, "S_BUFFER_ATOMIC_SUB_X2">; +defm S_BUFFER_ATOMIC_SMIN_X2 : SM_Real_Atomics_gfx10 <0x64, "S_BUFFER_ATOMIC_SMIN_X2">; +defm S_BUFFER_ATOMIC_UMIN_X2 : SM_Real_Atomics_gfx10 <0x65, "S_BUFFER_ATOMIC_UMIN_X2">; +defm S_BUFFER_ATOMIC_SMAX_X2 : SM_Real_Atomics_gfx10 <0x66, "S_BUFFER_ATOMIC_SMAX_X2">; +defm S_BUFFER_ATOMIC_UMAX_X2 : SM_Real_Atomics_gfx10 <0x67, "S_BUFFER_ATOMIC_UMAX_X2">; +defm S_BUFFER_ATOMIC_AND_X2 : SM_Real_Atomics_gfx10 <0x68, "S_BUFFER_ATOMIC_AND_X2">; +defm S_BUFFER_ATOMIC_OR_X2 : SM_Real_Atomics_gfx10 <0x69, "S_BUFFER_ATOMIC_OR_X2">; +defm S_BUFFER_ATOMIC_XOR_X2 : SM_Real_Atomics_gfx10 <0x6a, "S_BUFFER_ATOMIC_XOR_X2">; +defm S_BUFFER_ATOMIC_INC_X2 : SM_Real_Atomics_gfx10 <0x6b, "S_BUFFER_ATOMIC_INC_X2">; +defm S_BUFFER_ATOMIC_DEC_X2 : SM_Real_Atomics_gfx10 <0x6c, "S_BUFFER_ATOMIC_DEC_X2">; + +defm S_ATOMIC_SWAP : SM_Real_Atomics_gfx10 <0x80, "S_ATOMIC_SWAP">; +defm S_ATOMIC_CMPSWAP : SM_Real_Atomics_gfx10 <0x81, "S_ATOMIC_CMPSWAP">; +defm S_ATOMIC_ADD : SM_Real_Atomics_gfx10 <0x82, "S_ATOMIC_ADD">; +defm S_ATOMIC_SUB : SM_Real_Atomics_gfx10 <0x83, "S_ATOMIC_SUB">; +defm S_ATOMIC_SMIN : SM_Real_Atomics_gfx10 <0x84, "S_ATOMIC_SMIN">; +defm S_ATOMIC_UMIN : SM_Real_Atomics_gfx10 <0x85, "S_ATOMIC_UMIN">; +defm S_ATOMIC_SMAX : SM_Real_Atomics_gfx10 <0x86, "S_ATOMIC_SMAX">; +defm S_ATOMIC_UMAX : SM_Real_Atomics_gfx10 <0x87, "S_ATOMIC_UMAX">; +defm S_ATOMIC_AND : SM_Real_Atomics_gfx10 <0x88, "S_ATOMIC_AND">; +defm S_ATOMIC_OR : SM_Real_Atomics_gfx10 <0x89, "S_ATOMIC_OR">; +defm S_ATOMIC_XOR : SM_Real_Atomics_gfx10 <0x8a, "S_ATOMIC_XOR">; +defm S_ATOMIC_INC : SM_Real_Atomics_gfx10 <0x8b, "S_ATOMIC_INC">; +defm S_ATOMIC_DEC : SM_Real_Atomics_gfx10 <0x8c, "S_ATOMIC_DEC">; + +defm S_ATOMIC_SWAP_X2 : SM_Real_Atomics_gfx10 <0xa0, "S_ATOMIC_SWAP_X2">; +defm S_ATOMIC_CMPSWAP_X2 : SM_Real_Atomics_gfx10 <0xa1, "S_ATOMIC_CMPSWAP_X2">; +defm S_ATOMIC_ADD_X2 : SM_Real_Atomics_gfx10 <0xa2, "S_ATOMIC_ADD_X2">; +defm S_ATOMIC_SUB_X2 : SM_Real_Atomics_gfx10 <0xa3, "S_ATOMIC_SUB_X2">; +defm S_ATOMIC_SMIN_X2 : SM_Real_Atomics_gfx10 <0xa4, "S_ATOMIC_SMIN_X2">; +defm S_ATOMIC_UMIN_X2 : SM_Real_Atomics_gfx10 <0xa5, "S_ATOMIC_UMIN_X2">; +defm S_ATOMIC_SMAX_X2 : SM_Real_Atomics_gfx10 <0xa6, "S_ATOMIC_SMAX_X2">; +defm S_ATOMIC_UMAX_X2 : SM_Real_Atomics_gfx10 <0xa7, "S_ATOMIC_UMAX_X2">; +defm S_ATOMIC_AND_X2 : SM_Real_Atomics_gfx10 <0xa8, "S_ATOMIC_AND_X2">; +defm S_ATOMIC_OR_X2 : SM_Real_Atomics_gfx10 <0xa9, "S_ATOMIC_OR_X2">; +defm S_ATOMIC_XOR_X2 : SM_Real_Atomics_gfx10 <0xaa, "S_ATOMIC_XOR_X2">; +defm S_ATOMIC_INC_X2 : SM_Real_Atomics_gfx10 <0xab, "S_ATOMIC_INC_X2">; +defm S_ATOMIC_DEC_X2 : SM_Real_Atomics_gfx10 <0xac, "S_ATOMIC_DEC_X2">; + +multiclass 
SM_Real_Discard_gfx10<bits<8> op, string ps> { + def _IMM_gfx10 : SMEM_Real_gfx10 <op, !cast<SM_Pseudo>(ps#_IMM)>; + def _SGPR_gfx10 : SMEM_Real_gfx10 <op, !cast<SM_Pseudo>(ps#_SGPR)>; +} + +defm S_DCACHE_DISCARD : SM_Real_Discard_gfx10 <0x28, "S_DCACHE_DISCARD">; +defm S_DCACHE_DISCARD_X2 : SM_Real_Discard_gfx10 <0x29, "S_DCACHE_DISCARD_X2">; + +} // End SubtargetPredicate = HasScalarAtomics diff --git a/lib/Target/AMDGPU/SOPInstructions.td b/lib/Target/AMDGPU/SOPInstructions.td index ca5e981ac5c2..dfafdccc05a3 100644 --- a/lib/Target/AMDGPU/SOPInstructions.td +++ b/lib/Target/AMDGPU/SOPInstructions.td @@ -1,15 +1,15 @@ //===-- SOPInstructions.td - SOP Instruction Definitions -------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// def GPRIdxModeMatchClass : AsmOperandClass { let Name = "GPRIdxMode"; let PredicateMethod = "isGPRIdxMode"; + let ParserMethod = "parseGPRIdxMode"; let RenderMethod = "addImmOperands"; } @@ -26,7 +26,6 @@ class SOP_Pseudo<string opName, dag outs, dag ins, string asmOps, let isPseudo = 1; let isCodeGenOnly = 1; - let SubtargetPredicate = isGCN; string Mnemonic = opName; string AsmOperands = asmOps; @@ -78,10 +77,13 @@ class SOP1_Real<bits<8> op, SOP1_Pseudo ps> : let Inst{31-23} = 0x17d; //encoding; } -class SOP1_32 <string opName, list<dag> pattern=[]> : SOP1_Pseudo < - opName, (outs SReg_32:$sdst), (ins SSrc_b32:$src0), - "$sdst, $src0", pattern ->; +class SOP1_32 <string opName, list<dag> pattern=[], bit tied_in = 0> : SOP1_Pseudo < + opName, (outs SReg_32:$sdst), + !if(tied_in, (ins SSrc_b32:$src0, SReg_32:$sdst_in), + (ins SSrc_b32:$src0)), + "$sdst, $src0", pattern> { + let Constraints = !if(tied_in, "$sdst = $sdst_in", ""); +} // 32-bit input, no output. class SOP1_0_32 <string opName, list<dag> pattern = []> : SOP1_Pseudo < @@ -108,10 +110,13 @@ class SOP1_32_64 <string opName, list<dag> pattern=[]> : SOP1_Pseudo < >; // 32-bit input, 64-bit output. -class SOP1_64_32 <string opName, list<dag> pattern=[]> : SOP1_Pseudo < - opName, (outs SReg_64:$sdst), (ins SSrc_b32:$src0), - "$sdst, $src0", pattern ->; +class SOP1_64_32 <string opName, list<dag> pattern=[], bit tied_in = 0> : SOP1_Pseudo < + opName, (outs SReg_64:$sdst), + !if(tied_in, (ins SSrc_b32:$src0, SReg_64:$sdst_in), + (ins SSrc_b32:$src0)), + "$sdst, $src0", pattern> { + let Constraints = !if(tied_in, "$sdst = $sdst_in", ""); +} // no input, 64-bit output.
class SOP1_64_0 <string opName, list<dag> pattern=[]> : SOP1_Pseudo < @@ -120,8 +125,8 @@ class SOP1_64_0 <string opName, list<dag> pattern=[]> : SOP1_Pseudo < } // 64-bit input, no output -class SOP1_1 <string opName, list<dag> pattern=[]> : SOP1_Pseudo < - opName, (outs), (ins SReg_64:$src0), "$src0", pattern> { +class SOP1_1 <string opName, RegisterClass rc = SReg_64, list<dag> pattern=[]> : SOP1_Pseudo < + opName, (outs), (ins rc:$src0), "$src0", pattern> { let has_sdst = 0; } @@ -147,12 +152,24 @@ let Defs = [SCC] in { [(set i64:$sdst, (not i64:$src0))] >; def S_WQM_B32 : SOP1_32 <"s_wqm_b32">; - def S_WQM_B64 : SOP1_64 <"s_wqm_b64", - [(set i1:$sdst, (int_amdgcn_wqm_vote i1:$src0))] - >; + def S_WQM_B64 : SOP1_64 <"s_wqm_b64">; } // End Defs = [SCC] +let WaveSizePredicate = isWave32 in { +def : GCNPat < + (int_amdgcn_wqm_vote i1:$src0), + (S_WQM_B32 $src0) +>; +} + +let WaveSizePredicate = isWave64 in { +def : GCNPat < + (int_amdgcn_wqm_vote i1:$src0), + (S_WQM_B64 $src0) +>; +} + def S_BREV_B32 : SOP1_32 <"s_brev_b32", [(set i32:$sdst, (bitreverse i32:$src0))] >; @@ -191,10 +208,10 @@ def S_SEXT_I32_I16 : SOP1_32 <"s_sext_i32_i16", [(set i32:$sdst, (sext_inreg i32:$src0, i16))] >; -def S_BITSET0_B32 : SOP1_32 <"s_bitset0_b32">; -def S_BITSET0_B64 : SOP1_64_32 <"s_bitset0_b64">; -def S_BITSET1_B32 : SOP1_32 <"s_bitset1_b32">; -def S_BITSET1_B64 : SOP1_64_32 <"s_bitset1_b64">; +def S_BITSET0_B32 : SOP1_32 <"s_bitset0_b32", [], 1>; +def S_BITSET0_B64 : SOP1_64_32 <"s_bitset0_b64", [], 1>; +def S_BITSET1_B32 : SOP1_32 <"s_bitset1_b32", [], 1>; +def S_BITSET1_B64 : SOP1_64_32 <"s_bitset1_b64", [], 1>; def S_GETPC_B64 : SOP1_64_0 <"s_getpc_b64", [(set i64:$sdst, (int_amdgcn_s_getpc))] >; @@ -207,7 +224,7 @@ def S_SETPC_B64 : SOP1_1 <"s_setpc_b64">; let isReturn = 1 in { // Define variant marked as return rather than branch. 
-def S_SETPC_B64_return : SOP1_1<"", [(AMDGPUret_flag i64:$src0)]>; +def S_SETPC_B64_return : SOP1_1<"", CCR_SGPR_64, [(AMDGPUret_flag i64:$src0)]>; } } // End isTerminator = 1, isBarrier = 1 @@ -241,8 +258,11 @@ def S_MOVRELD_B32 : SOP1_32 <"s_movreld_b32">; def S_MOVRELD_B64 : SOP1_64 <"s_movreld_b64">; } // End Uses = [M0] +let SubtargetPredicate = isGFX6GFX7GFX8GFX9 in { def S_CBRANCH_JOIN : SOP1_0_32R <"s_cbranch_join">; def S_MOV_REGRD_B32 : SOP1_32 <"s_mov_regrd_b32">; +} // End SubtargetPredicate = isGFX6GFX7GFX8GFX9 + let Defs = [SCC] in { def S_ABS_I32 : SOP1_32 <"s_abs_i32">; } // End Defs = [SCC] @@ -255,7 +275,7 @@ def S_SET_GPR_IDX_IDX : SOP1_0_32<"s_set_gpr_idx_idx"> { } } -let SubtargetPredicate = isGFX9 in { +let SubtargetPredicate = isGFX9Plus in { let hasSideEffects = 1, Defs = [EXEC, SCC], Uses = [EXEC] in { def S_ANDN1_SAVEEXEC_B64 : SOP1_64<"s_andn1_saveexec_b64">; def S_ORN1_SAVEEXEC_B64 : SOP1_64<"s_orn1_saveexec_b64">; @@ -264,7 +284,28 @@ let SubtargetPredicate = isGFX9 in { } // End hasSideEffects = 1, Defs = [EXEC, SCC], Uses = [EXEC] def S_BITREPLICATE_B64_B32 : SOP1_64_32<"s_bitreplicate_b64_b32">; -} // End SubtargetPredicate = isGFX9 +} // End SubtargetPredicate = isGFX9Plus + +let SubtargetPredicate = isGFX10Plus in { + let hasSideEffects = 1, Defs = [EXEC, SCC], Uses = [EXEC] in { + def S_AND_SAVEEXEC_B32 : SOP1_32<"s_and_saveexec_b32">; + def S_OR_SAVEEXEC_B32 : SOP1_32<"s_or_saveexec_b32">; + def S_XOR_SAVEEXEC_B32 : SOP1_32<"s_xor_saveexec_b32">; + def S_ANDN2_SAVEEXEC_B32 : SOP1_32<"s_andn2_saveexec_b32">; + def S_ORN2_SAVEEXEC_B32 : SOP1_32<"s_orn2_saveexec_b32">; + def S_NAND_SAVEEXEC_B32 : SOP1_32<"s_nand_saveexec_b32">; + def S_NOR_SAVEEXEC_B32 : SOP1_32<"s_nor_saveexec_b32">; + def S_XNOR_SAVEEXEC_B32 : SOP1_32<"s_xnor_saveexec_b32">; + def S_ANDN1_SAVEEXEC_B32 : SOP1_32<"s_andn1_saveexec_b32">; + def S_ORN1_SAVEEXEC_B32 : SOP1_32<"s_orn1_saveexec_b32">; + def S_ANDN1_WREXEC_B32 : SOP1_32<"s_andn1_wrexec_b32">; + def S_ANDN2_WREXEC_B32 : SOP1_32<"s_andn2_wrexec_b32">; + } // End hasSideEffects = 1, Defs = [EXEC, SCC], Uses = [EXEC] + + let Uses = [M0] in { + def S_MOVRELSD_2_B32 : SOP1_32<"s_movrelsd_2_b32">; + } // End Uses = [M0] +} // End SubtargetPredicate = isGFX10Plus //===----------------------------------------------------------------------===// // SOP2 Instructions @@ -302,6 +343,8 @@ class SOP2_Real<bits<7> op, SOP_Pseudo ps> : // copy relevant pseudo op flags let SubtargetPredicate = ps.SubtargetPredicate; let AsmMatchConverter = ps.AsmMatchConverter; + let UseNamedOperandTable = ps.UseNamedOperandTable; + let TSFlags = ps.TSFlags; // encoding bits<7> sdst; @@ -468,22 +511,22 @@ let AddedComplexity = 1 in { let Defs = [SCC] in { // TODO: b64 versions require VOP3 change since v_lshlrev_b64 is VOP3 def S_LSHL_B32 : SOP2_32 <"s_lshl_b32", - [(set i32:$sdst, (UniformBinFrag<shl> i32:$src0, i32:$src1))] + [(set SReg_32:$sdst, (shl (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))] >; def S_LSHL_B64 : SOP2_64_32 <"s_lshl_b64", - [(set i64:$sdst, (UniformBinFrag<shl> i64:$src0, i32:$src1))] + [(set SReg_64:$sdst, (shl (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))] >; def S_LSHR_B32 : SOP2_32 <"s_lshr_b32", - [(set i32:$sdst, (UniformBinFrag<srl> i32:$src0, i32:$src1))] + [(set SReg_32:$sdst, (srl (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))] >; def S_LSHR_B64 : SOP2_64_32 <"s_lshr_b64", - [(set i64:$sdst, (UniformBinFrag<srl> i64:$src0, i32:$src1))] + [(set SReg_64:$sdst, (srl (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))] >; def S_ASHR_I32 : 
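// [Editor's note, illustrative only; not part of the original change:] the
// rewritten shift patterns here match the generic shl/srl/sra nodes on
// scalar-register operands directly instead of wrapping them in
// UniformBinFrag<>, e.g. S_LSHR_B64 now selects
//   (set SReg_64:$sdst, (srl (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))
// The enclosing "let AddedComplexity = 1" block (see the hunk header
// above) keeps these SALU forms preferred whenever they apply.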
SOP2_32 <"s_ashr_i32", - [(set i32:$sdst, (UniformBinFrag<sra> i32:$src0, i32:$src1))] + [(set SReg_32:$sdst, (sra (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))] >; def S_ASHR_I64 : SOP2_64_32 <"s_ashr_i64", - [(set i64:$sdst, (UniformBinFrag<sra> i64:$src0, i32:$src1))] + [(set SReg_64:$sdst, (sra (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))] >; } // End Defs = [SCC] @@ -512,13 +555,14 @@ def S_CBRANCH_G_FORK : SOP2_Pseudo < "$src0, $src1" > { let has_sdst = 0; + let SubtargetPredicate = isGFX6GFX7GFX8GFX9; } let Defs = [SCC] in { def S_ABSDIFF_I32 : SOP2_32 <"s_absdiff_i32">; } // End Defs = [SCC] -let SubtargetPredicate = isVI in { +let SubtargetPredicate = isGFX8GFX9 in { def S_RFE_RESTORE_B64 : SOP2_Pseudo < "s_rfe_restore_b64", (outs), (ins SSrc_b64:$src0, SSrc_b32:$src1), @@ -529,7 +573,7 @@ let SubtargetPredicate = isVI in { } } -let SubtargetPredicate = isGFX9 in { +let SubtargetPredicate = isGFX9Plus in { def S_PACK_LL_B32_B16 : SOP2_32<"s_pack_ll_b32_b16">; def S_PACK_LH_B32_B16 : SOP2_32<"s_pack_lh_b32_b16">; def S_PACK_HH_B32_B16 : SOP2_32<"s_pack_hh_b32_b16">; @@ -543,7 +587,7 @@ let SubtargetPredicate = isGFX9 in { def S_MUL_HI_U32 : SOP2_32<"s_mul_hi_u32">; def S_MUL_HI_I32 : SOP2_32<"s_mul_hi_i32">; -} +} // End SubtargetPredicate = isGFX9Plus //===----------------------------------------------------------------------===// // SOPK Instructions @@ -555,7 +599,6 @@ class SOPK_Pseudo <string opName, dag outs, dag ins, SIMCInstr<opName, SIEncodingFamily.NONE> { let isPseudo = 1; let isCodeGenOnly = 1; - let SubtargetPredicate = isGCN; let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -618,6 +661,19 @@ class SOPK_32 <string opName, list<dag> pattern=[]> : SOPK_Pseudo < "$sdst, $simm16", pattern>; +class SOPK_32_BR <string opName, list<dag> pattern=[]> : SOPK_Pseudo < + opName, + (outs), + (ins sopp_brtarget:$simm16, SReg_32:$sdst), + "$sdst, $simm16", + pattern> { + let Defs = [EXEC]; + let Uses = [EXEC]; + let isBranch = 1; + let isTerminator = 1; + let SchedRW = [WriteBranch]; +} + class SOPK_SCC <string opName, string base_op, bit isSignExt> : SOPK_Pseudo < opName, (outs), @@ -684,9 +740,10 @@ let Defs = [SCC], isCommutable = 1, DisableEncoding = "$src0", def S_MULK_I32 : SOPK_32TIE <"s_mulk_i32">; } +let SubtargetPredicate = isGFX6GFX7GFX8GFX9 in def S_CBRANCH_I_FORK : SOPK_Pseudo < "s_cbranch_i_fork", - (outs), (ins SReg_64:$sdst, s16imm:$simm16), + (outs), (ins SReg_64:$sdst, sopp_brtarget:$simm16), "$sdst, $simm16" >; @@ -720,15 +777,46 @@ def S_SETREG_IMM32_B32 : SOPK_Pseudo < } // End hasSideEffects = 1 -let SubtargetPredicate = isGFX9 in { +class SOPK_WAITCNT<string opName, list<dag> pat=[]> : + SOPK_Pseudo< + opName, + (outs), + (ins SReg_32:$sdst, s16imm:$simm16), + "$sdst, $simm16", + pat> { + let hasSideEffects = 1; + let mayLoad = 1; + let mayStore = 1; + let has_sdst = 1; // First source takes place of sdst in encoding +} + +let SubtargetPredicate = isGFX9Plus in { def S_CALL_B64 : SOPK_Pseudo< "s_call_b64", (outs SReg_64:$sdst), - (ins s16imm:$simm16), + (ins sopp_brtarget:$simm16), "$sdst, $simm16"> { let isCall = 1; } -} +} // End SubtargetPredicate = isGFX9Plus + +let SubtargetPredicate = isGFX10Plus in { + def S_VERSION : SOPK_Pseudo< + "s_version", + (outs), + (ins s16imm:$simm16), + "$simm16"> { + let has_sdst = 0; + } + + def S_SUBVECTOR_LOOP_BEGIN : SOPK_32_BR<"s_subvector_loop_begin">; + def S_SUBVECTOR_LOOP_END : SOPK_32_BR<"s_subvector_loop_end">; + + def S_WAITCNT_VSCNT : SOPK_WAITCNT<"s_waitcnt_vscnt">; + def S_WAITCNT_VMCNT : 
SOPK_WAITCNT<"s_waitcnt_vmcnt">; + def S_WAITCNT_EXPCNT : SOPK_WAITCNT<"s_waitcnt_expcnt">; + def S_WAITCNT_LGKMCNT : SOPK_WAITCNT<"s_waitcnt_lgkmcnt">; +} // End SubtargetPredicate = isGFX10Plus //===----------------------------------------------------------------------===// // SOPC Instructions @@ -756,7 +844,6 @@ class SOPC <bits<7> op, dag outs, dag ins, string asm, let Defs = [SCC]; let SchedRW = [WriteSALU]; let UseNamedOperandTable = 1; - let SubtargetPredicate = isGCN; } class SOPC_Base <bits<7> op, RegisterOperand rc0, RegisterOperand rc1, @@ -811,12 +898,13 @@ def S_BITCMP0_B32 : SOPC_32 <0x0c, "s_bitcmp0_b32">; def S_BITCMP1_B32 : SOPC_32 <0x0d, "s_bitcmp1_b32">; def S_BITCMP0_B64 : SOPC_64_32 <0x0e, "s_bitcmp0_b64">; def S_BITCMP1_B64 : SOPC_64_32 <0x0f, "s_bitcmp1_b64">; +let SubtargetPredicate = isGFX6GFX7GFX8GFX9 in def S_SETVSKIP : SOPC_32 <0x10, "s_setvskip">; -let SubtargetPredicate = isVI in { +let SubtargetPredicate = isGFX8Plus in { def S_CMP_EQ_U64 : SOPC_CMP_64 <0x12, "s_cmp_eq_u64", COND_EQ>; def S_CMP_LG_U64 : SOPC_CMP_64 <0x13, "s_cmp_lg_u64", COND_NE>; -} +} // End SubtargetPredicate = isGFX8Plus let SubtargetPredicate = HasVGPRIndexMode in { def S_SET_GPR_IDX_ON : SOPC <0x11, @@ -834,6 +922,10 @@ def S_SET_GPR_IDX_ON : SOPC <0x11, // SOPP Instructions //===----------------------------------------------------------------------===// +class Base_SOPP <string asm> { + string AsmString = asm; +} + class SOPPe <bits<7> op> : Enc32 { bits <16> simm16; @@ -843,7 +935,7 @@ class SOPPe <bits<7> op> : Enc32 { } class SOPP <bits<7> op, dag ins, string asm, list<dag> pattern = []> : - InstSI <(outs), ins, asm, pattern >, SOPPe <op> { + InstSI <(outs), ins, asm, pattern >, SOPPe <op>, Base_SOPP <asm> { let mayLoad = 0; let mayStore = 0; @@ -854,92 +946,124 @@ class SOPP <bits<7> op, dag ins, string asm, list<dag> pattern = []> : let SchedRW = [WriteSALU]; let UseNamedOperandTable = 1; - let SubtargetPredicate = isGCN; } - def S_NOP : SOPP <0x00000000, (ins i16imm:$simm16), "s_nop $simm16">; +class SOPP_w_nop_e <bits<7> op> : Enc64 { + bits <16> simm16; + + let Inst{15-0} = simm16; + let Inst{22-16} = op; + let Inst{31-23} = 0x17f; // encoding + let Inst{47-32} = 0x0; + let Inst{54-48} = S_NOP.Inst{22-16}; // opcode + let Inst{63-55} = S_NOP.Inst{31-23}; // encoding +} + +class SOPP_w_nop <bits<7> op, dag ins, string asm, list<dag> pattern = []> : + InstSI <(outs), ins, asm, pattern >, SOPP_w_nop_e <op>, Base_SOPP <asm> { + + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let SALU = 1; + let SOPP = 1; + let Size = 8; + let SchedRW = [WriteSALU]; + + let UseNamedOperandTable = 1; +} + +multiclass SOPP_With_Relaxation <bits<7> op, dag ins, string asm, list<dag> pattern = []> { + def "" : SOPP <op, ins, asm, pattern>; + def _pad_s_nop : SOPP_w_nop <op, ins, asm, pattern>; +} + let isTerminator = 1 in { -def S_ENDPGM : SOPP <0x00000001, (ins), "s_endpgm", - [(AMDGPUendpgm)]> { - let simm16 = 0; +def S_ENDPGM : SOPP <0x00000001, (ins EndpgmImm:$simm16), "s_endpgm$simm16"> { let isBarrier = 1; let isReturn = 1; } -let SubtargetPredicate = isVI in { def S_ENDPGM_SAVED : SOPP <0x0000001B, (ins), "s_endpgm_saved"> { + let SubtargetPredicate = isGFX8Plus; let simm16 = 0; let isBarrier = 1; let isReturn = 1; } -} -let SubtargetPredicate = isGFX9 in { +let SubtargetPredicate = isGFX9Plus in { let isBarrier = 1, isReturn = 1, simm16 = 0 in { def S_ENDPGM_ORDERED_PS_DONE : SOPP<0x01e, (ins), "s_endpgm_ordered_ps_done">; } // End isBarrier = 1, isReturn = 1, simm16 = 0 -} 
// End SubtargetPredicate = isGFX9 +} // End SubtargetPredicate = isGFX9Plus + +let SubtargetPredicate = isGFX10Plus in { + let isBarrier = 1, isReturn = 1, simm16 = 0 in { + def S_CODE_END : + SOPP<0x01f, (ins), "s_code_end">; + } // End isBarrier = 1, isReturn = 1, simm16 = 0 +} // End SubtargetPredicate = isGFX10Plus let isBranch = 1, SchedRW = [WriteBranch] in { -def S_BRANCH : SOPP < +let isBarrier = 1 in { +defm S_BRANCH : SOPP_With_Relaxation < 0x00000002, (ins sopp_brtarget:$simm16), "s_branch $simm16", - [(br bb:$simm16)]> { - let isBarrier = 1; + [(br bb:$simm16)]>; } let Uses = [SCC] in { -def S_CBRANCH_SCC0 : SOPP < +defm S_CBRANCH_SCC0 : SOPP_With_Relaxation < 0x00000004, (ins sopp_brtarget:$simm16), "s_cbranch_scc0 $simm16" >; -def S_CBRANCH_SCC1 : SOPP < +defm S_CBRANCH_SCC1 : SOPP_With_Relaxation < 0x00000005, (ins sopp_brtarget:$simm16), "s_cbranch_scc1 $simm16" >; } // End Uses = [SCC] let Uses = [VCC] in { -def S_CBRANCH_VCCZ : SOPP < +defm S_CBRANCH_VCCZ : SOPP_With_Relaxation < 0x00000006, (ins sopp_brtarget:$simm16), "s_cbranch_vccz $simm16" >; -def S_CBRANCH_VCCNZ : SOPP < +defm S_CBRANCH_VCCNZ : SOPP_With_Relaxation < 0x00000007, (ins sopp_brtarget:$simm16), "s_cbranch_vccnz $simm16" >; } // End Uses = [VCC] let Uses = [EXEC] in { -def S_CBRANCH_EXECZ : SOPP < +defm S_CBRANCH_EXECZ : SOPP_With_Relaxation < 0x00000008, (ins sopp_brtarget:$simm16), "s_cbranch_execz $simm16" >; -def S_CBRANCH_EXECNZ : SOPP < +defm S_CBRANCH_EXECNZ : SOPP_With_Relaxation < 0x00000009, (ins sopp_brtarget:$simm16), "s_cbranch_execnz $simm16" >; } // End Uses = [EXEC] -def S_CBRANCH_CDBGSYS : SOPP < +defm S_CBRANCH_CDBGSYS : SOPP_With_Relaxation < 0x00000017, (ins sopp_brtarget:$simm16), "s_cbranch_cdbgsys $simm16" >; -def S_CBRANCH_CDBGSYS_AND_USER : SOPP < +defm S_CBRANCH_CDBGSYS_AND_USER : SOPP_With_Relaxation < 0x0000001A, (ins sopp_brtarget:$simm16), "s_cbranch_cdbgsys_and_user $simm16" >; -def S_CBRANCH_CDBGSYS_OR_USER : SOPP < +defm S_CBRANCH_CDBGSYS_OR_USER : SOPP_With_Relaxation < 0x00000019, (ins sopp_brtarget:$simm16), "s_cbranch_cdbgsys_or_user $simm16" >; -def S_CBRANCH_CDBGUSER : SOPP < +defm S_CBRANCH_CDBGUSER : SOPP_With_Relaxation < 0x00000018, (ins sopp_brtarget:$simm16), "s_cbranch_cdbguser $simm16" >; @@ -957,16 +1081,16 @@ def S_BARRIER : SOPP <0x0000000a, (ins), "s_barrier", let isConvergent = 1; } -let SubtargetPredicate = isVI in { def S_WAKEUP : SOPP <0x00000003, (ins), "s_wakeup"> { + let SubtargetPredicate = isGFX8Plus; let simm16 = 0; let mayLoad = 1; let mayStore = 1; } -} let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in -def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "s_waitcnt $simm16">; +def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "s_waitcnt $simm16", + [(int_amdgcn_s_waitcnt UIMM16bit:$simm16)]>; def S_SETHALT : SOPP <0x0000000d, (ins i16imm:$simm16), "s_sethalt $simm16">; def S_SETKILL : SOPP <0x0000000b, (ins i16imm:$simm16), "s_setkill $simm16">; @@ -994,7 +1118,10 @@ def S_SENDMSGHALT : SOPP <0x00000011, (ins SendMsgImm:$simm16), "s_sendmsghalt $ >; } // End Uses = [EXEC, M0] -def S_TRAP : SOPP <0x00000012, (ins i16imm:$simm16), "s_trap $simm16">; +def S_TRAP : SOPP <0x00000012, (ins i16imm:$simm16), "s_trap $simm16"> { + let isTrap = 1; +} + def S_ICACHE_INV : SOPP <0x00000013, (ins), "s_icache_inv"> { let simm16 = 0; } @@ -1028,6 +1155,25 @@ def S_SET_GPR_IDX_MODE : SOPP<0x1d, (ins GPRIdxMode:$simm16), } } +let SubtargetPredicate = isGFX10Plus in { + def S_INST_PREFETCH : + SOPP<0x020, (ins s16imm:$simm16), 
"s_inst_prefetch $simm16">; + def S_CLAUSE : + SOPP<0x021, (ins s16imm:$simm16), "s_clause $simm16">; + def S_WAITCNT_IDLE : + SOPP <0x022, (ins), "s_wait_idle"> { + let simm16 = 0; + } + def S_WAITCNT_DEPCTR : + SOPP <0x023, (ins s16imm:$simm16), "s_waitcnt_depctr $simm16">; + def S_ROUND_MODE : + SOPP<0x024, (ins s16imm:$simm16), "s_round_mode $simm16">; + def S_DENORM_MODE : + SOPP<0x025, (ins s16imm:$simm16), "s_denorm_mode $simm16">; + def S_TTRACEDATA_IMM : + SOPP<0x028, (ins s16imm:$simm16), "s_ttracedata_imm $simm16">; +} // End SubtargetPredicate = isGFX10Plus + //===----------------------------------------------------------------------===// // S_GETREG_B32 Intrinsic Pattern. //===----------------------------------------------------------------------===// @@ -1041,6 +1187,11 @@ def : GCNPat < //===----------------------------------------------------------------------===// def : GCNPat < + (AMDGPUendpgm), + (S_ENDPGM (i16 0)) +>; + +def : GCNPat < (i64 (ctpop i64:$src)), (i64 (REG_SEQUENCE SReg_64, (i32 (COPY_TO_REGCLASS (S_BCNT1_I32_B64 $src), SReg_32)), sub0, @@ -1097,162 +1248,261 @@ def : GCNPat< >; +//===----------------------------------------------------------------------===// +// Target-specific instruction encodings. +//===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// -// SOPP Patterns +// SOP1 - GFX10. //===----------------------------------------------------------------------===// -def : GCNPat < - (int_amdgcn_s_waitcnt i32:$simm16), - (S_WAITCNT (as_i16imm $simm16)) ->; +class Select_gfx10<string opName> : SIMCInstr<opName, SIEncodingFamily.GFX10> { + Predicate AssemblerPredicate = isGFX10Plus; + string DecoderNamespace = "GFX10"; +} + +multiclass SOP1_Real_gfx10<bits<8> op> { + def _gfx10 : SOP1_Real<op, !cast<SOP1_Pseudo>(NAME)>, + Select_gfx10<!cast<SOP1_Pseudo>(NAME).Mnemonic>; +} +defm S_ANDN1_SAVEEXEC_B64 : SOP1_Real_gfx10<0x037>; +defm S_ORN1_SAVEEXEC_B64 : SOP1_Real_gfx10<0x038>; +defm S_ANDN1_WREXEC_B64 : SOP1_Real_gfx10<0x039>; +defm S_ANDN2_WREXEC_B64 : SOP1_Real_gfx10<0x03a>; +defm S_BITREPLICATE_B64_B32 : SOP1_Real_gfx10<0x03b>; +defm S_AND_SAVEEXEC_B32 : SOP1_Real_gfx10<0x03c>; +defm S_OR_SAVEEXEC_B32 : SOP1_Real_gfx10<0x03d>; +defm S_XOR_SAVEEXEC_B32 : SOP1_Real_gfx10<0x03e>; +defm S_ANDN2_SAVEEXEC_B32 : SOP1_Real_gfx10<0x03f>; +defm S_ORN2_SAVEEXEC_B32 : SOP1_Real_gfx10<0x040>; +defm S_NAND_SAVEEXEC_B32 : SOP1_Real_gfx10<0x041>; +defm S_NOR_SAVEEXEC_B32 : SOP1_Real_gfx10<0x042>; +defm S_XNOR_SAVEEXEC_B32 : SOP1_Real_gfx10<0x043>; +defm S_ANDN1_SAVEEXEC_B32 : SOP1_Real_gfx10<0x044>; +defm S_ORN1_SAVEEXEC_B32 : SOP1_Real_gfx10<0x045>; +defm S_ANDN1_WREXEC_B32 : SOP1_Real_gfx10<0x046>; +defm S_ANDN2_WREXEC_B32 : SOP1_Real_gfx10<0x047>; +defm S_MOVRELSD_2_B32 : SOP1_Real_gfx10<0x049>; //===----------------------------------------------------------------------===// -// Real target instructions, move this to the appropriate subtarget TD file +// SOP1 - GFX6, GFX7. 
//===----------------------------------------------------------------------===// -class Select_si<string opName> : - SIMCInstr<opName, SIEncodingFamily.SI> { - list<Predicate> AssemblerPredicates = [isSICI]; - string DecoderNamespace = "SICI"; +class Select_gfx6_gfx7<string opName> : SIMCInstr<opName, SIEncodingFamily.SI> { + Predicate AssemblerPredicate = isGFX6GFX7; + string DecoderNamespace = "GFX6GFX7"; } -class SOP1_Real_si<bits<8> op, SOP1_Pseudo ps> : - SOP1_Real<op, ps>, - Select_si<ps.Mnemonic>; +multiclass SOP1_Real_gfx6_gfx7<bits<8> op> { + def _gfx6_gfx7 : SOP1_Real<op, !cast<SOP1_Pseudo>(NAME)>, + Select_gfx6_gfx7<!cast<SOP1_Pseudo>(NAME).Mnemonic>; +} -class SOP2_Real_si<bits<7> op, SOP2_Pseudo ps> : - SOP2_Real<op, ps>, - Select_si<ps.Mnemonic>; +multiclass SOP1_Real_gfx6_gfx7_gfx10<bits<8> op> : + SOP1_Real_gfx6_gfx7<op>, SOP1_Real_gfx10<op>; + +defm S_CBRANCH_JOIN : SOP1_Real_gfx6_gfx7<0x032>; +defm S_MOV_REGRD_B32 : SOP1_Real_gfx6_gfx7<0x033>; + +defm S_MOV_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x003>; +defm S_MOV_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x004>; +defm S_CMOV_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x005>; +defm S_CMOV_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x006>; +defm S_NOT_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x007>; +defm S_NOT_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x008>; +defm S_WQM_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x009>; +defm S_WQM_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x00a>; +defm S_BREV_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x00b>; +defm S_BREV_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x00c>; +defm S_BCNT0_I32_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x00d>; +defm S_BCNT0_I32_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x00e>; +defm S_BCNT1_I32_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x00f>; +defm S_BCNT1_I32_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x010>; +defm S_FF0_I32_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x011>; +defm S_FF0_I32_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x012>; +defm S_FF1_I32_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x013>; +defm S_FF1_I32_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x014>; +defm S_FLBIT_I32_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x015>; +defm S_FLBIT_I32_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x016>; +defm S_FLBIT_I32 : SOP1_Real_gfx6_gfx7_gfx10<0x017>; +defm S_FLBIT_I32_I64 : SOP1_Real_gfx6_gfx7_gfx10<0x018>; +defm S_SEXT_I32_I8 : SOP1_Real_gfx6_gfx7_gfx10<0x019>; +defm S_SEXT_I32_I16 : SOP1_Real_gfx6_gfx7_gfx10<0x01a>; +defm S_BITSET0_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x01b>; +defm S_BITSET0_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x01c>; +defm S_BITSET1_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x01d>; +defm S_BITSET1_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x01e>; +defm S_GETPC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x01f>; +defm S_SETPC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x020>; +defm S_SWAPPC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x021>; +defm S_RFE_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x022>; +defm S_AND_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x024>; +defm S_OR_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x025>; +defm S_XOR_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x026>; +defm S_ANDN2_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x027>; +defm S_ORN2_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x028>; +defm S_NAND_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x029>; +defm S_NOR_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x02a>; +defm S_XNOR_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x02b>; +defm S_QUADMASK_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x02c>; +defm S_QUADMASK_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x02d>; +defm S_MOVRELS_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x02e>; +defm S_MOVRELS_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x02f>; +defm S_MOVRELD_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x030>; +defm S_MOVRELD_B64 : 
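// [Editor's note, illustrative only; not part of the original change:]
// each of these defm lines stamps out one "real" MC instruction per
// encoding family from a single pseudo; e.g.
//   defm S_MOV_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x003>;
// expands, via the multiclasses above, to S_MOV_B32_gfx6_gfx7 (decoder
// namespace "GFX6GFX7") and S_MOV_B32_gfx10 (decoder namespace "GFX10"),
// replacing the hand-written *_si records deleted further down.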
SOP1_Real_gfx6_gfx7_gfx10<0x031>; +defm S_ABS_I32 : SOP1_Real_gfx6_gfx7_gfx10<0x034>; +defm S_MOV_FED_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x035>; -class SOPK_Real_si<bits<5> op, SOPK_Pseudo ps> : - SOPK_Real32<op, ps>, - Select_si<ps.Mnemonic>; - -def S_MOV_B32_si : SOP1_Real_si <0x03, S_MOV_B32>; -def S_MOV_B64_si : SOP1_Real_si <0x04, S_MOV_B64>; -def S_CMOV_B32_si : SOP1_Real_si <0x05, S_CMOV_B32>; -def S_CMOV_B64_si : SOP1_Real_si <0x06, S_CMOV_B64>; -def S_NOT_B32_si : SOP1_Real_si <0x07, S_NOT_B32>; -def S_NOT_B64_si : SOP1_Real_si <0x08, S_NOT_B64>; -def S_WQM_B32_si : SOP1_Real_si <0x09, S_WQM_B32>; -def S_WQM_B64_si : SOP1_Real_si <0x0a, S_WQM_B64>; -def S_BREV_B32_si : SOP1_Real_si <0x0b, S_BREV_B32>; -def S_BREV_B64_si : SOP1_Real_si <0x0c, S_BREV_B64>; -def S_BCNT0_I32_B32_si : SOP1_Real_si <0x0d, S_BCNT0_I32_B32>; -def S_BCNT0_I32_B64_si : SOP1_Real_si <0x0e, S_BCNT0_I32_B64>; -def S_BCNT1_I32_B32_si : SOP1_Real_si <0x0f, S_BCNT1_I32_B32>; -def S_BCNT1_I32_B64_si : SOP1_Real_si <0x10, S_BCNT1_I32_B64>; -def S_FF0_I32_B32_si : SOP1_Real_si <0x11, S_FF0_I32_B32>; -def S_FF0_I32_B64_si : SOP1_Real_si <0x12, S_FF0_I32_B64>; -def S_FF1_I32_B32_si : SOP1_Real_si <0x13, S_FF1_I32_B32>; -def S_FF1_I32_B64_si : SOP1_Real_si <0x14, S_FF1_I32_B64>; -def S_FLBIT_I32_B32_si : SOP1_Real_si <0x15, S_FLBIT_I32_B32>; -def S_FLBIT_I32_B64_si : SOP1_Real_si <0x16, S_FLBIT_I32_B64>; -def S_FLBIT_I32_si : SOP1_Real_si <0x17, S_FLBIT_I32>; -def S_FLBIT_I32_I64_si : SOP1_Real_si <0x18, S_FLBIT_I32_I64>; -def S_SEXT_I32_I8_si : SOP1_Real_si <0x19, S_SEXT_I32_I8>; -def S_SEXT_I32_I16_si : SOP1_Real_si <0x1a, S_SEXT_I32_I16>; -def S_BITSET0_B32_si : SOP1_Real_si <0x1b, S_BITSET0_B32>; -def S_BITSET0_B64_si : SOP1_Real_si <0x1c, S_BITSET0_B64>; -def S_BITSET1_B32_si : SOP1_Real_si <0x1d, S_BITSET1_B32>; -def S_BITSET1_B64_si : SOP1_Real_si <0x1e, S_BITSET1_B64>; -def S_GETPC_B64_si : SOP1_Real_si <0x1f, S_GETPC_B64>; -def S_SETPC_B64_si : SOP1_Real_si <0x20, S_SETPC_B64>; -def S_SWAPPC_B64_si : SOP1_Real_si <0x21, S_SWAPPC_B64>; -def S_RFE_B64_si : SOP1_Real_si <0x22, S_RFE_B64>; -def S_AND_SAVEEXEC_B64_si : SOP1_Real_si <0x24, S_AND_SAVEEXEC_B64>; -def S_OR_SAVEEXEC_B64_si : SOP1_Real_si <0x25, S_OR_SAVEEXEC_B64>; -def S_XOR_SAVEEXEC_B64_si : SOP1_Real_si <0x26, S_XOR_SAVEEXEC_B64>; -def S_ANDN2_SAVEEXEC_B64_si: SOP1_Real_si <0x27, S_ANDN2_SAVEEXEC_B64>; -def S_ORN2_SAVEEXEC_B64_si : SOP1_Real_si <0x28, S_ORN2_SAVEEXEC_B64>; -def S_NAND_SAVEEXEC_B64_si : SOP1_Real_si <0x29, S_NAND_SAVEEXEC_B64>; -def S_NOR_SAVEEXEC_B64_si : SOP1_Real_si <0x2a, S_NOR_SAVEEXEC_B64>; -def S_XNOR_SAVEEXEC_B64_si : SOP1_Real_si <0x2b, S_XNOR_SAVEEXEC_B64>; -def S_QUADMASK_B32_si : SOP1_Real_si <0x2c, S_QUADMASK_B32>; -def S_QUADMASK_B64_si : SOP1_Real_si <0x2d, S_QUADMASK_B64>; -def S_MOVRELS_B32_si : SOP1_Real_si <0x2e, S_MOVRELS_B32>; -def S_MOVRELS_B64_si : SOP1_Real_si <0x2f, S_MOVRELS_B64>; -def S_MOVRELD_B32_si : SOP1_Real_si <0x30, S_MOVRELD_B32>; -def S_MOVRELD_B64_si : SOP1_Real_si <0x31, S_MOVRELD_B64>; -def S_CBRANCH_JOIN_si : SOP1_Real_si <0x32, S_CBRANCH_JOIN>; -def S_MOV_REGRD_B32_si : SOP1_Real_si <0x33, S_MOV_REGRD_B32>; -def S_ABS_I32_si : SOP1_Real_si <0x34, S_ABS_I32>; -def S_MOV_FED_B32_si : SOP1_Real_si <0x35, S_MOV_FED_B32>; - -def S_ADD_U32_si : SOP2_Real_si <0x00, S_ADD_U32>; -def S_ADD_I32_si : SOP2_Real_si <0x02, S_ADD_I32>; -def S_SUB_U32_si : SOP2_Real_si <0x01, S_SUB_U32>; -def S_SUB_I32_si : SOP2_Real_si <0x03, S_SUB_I32>; -def S_ADDC_U32_si : SOP2_Real_si <0x04, S_ADDC_U32>; -def S_SUBB_U32_si : 
SOP2_Real_si <0x05, S_SUBB_U32>; -def S_MIN_I32_si : SOP2_Real_si <0x06, S_MIN_I32>; -def S_MIN_U32_si : SOP2_Real_si <0x07, S_MIN_U32>; -def S_MAX_I32_si : SOP2_Real_si <0x08, S_MAX_I32>; -def S_MAX_U32_si : SOP2_Real_si <0x09, S_MAX_U32>; -def S_CSELECT_B32_si : SOP2_Real_si <0x0a, S_CSELECT_B32>; -def S_CSELECT_B64_si : SOP2_Real_si <0x0b, S_CSELECT_B64>; -def S_AND_B32_si : SOP2_Real_si <0x0e, S_AND_B32>; -def S_AND_B64_si : SOP2_Real_si <0x0f, S_AND_B64>; -def S_OR_B32_si : SOP2_Real_si <0x10, S_OR_B32>; -def S_OR_B64_si : SOP2_Real_si <0x11, S_OR_B64>; -def S_XOR_B32_si : SOP2_Real_si <0x12, S_XOR_B32>; -def S_XOR_B64_si : SOP2_Real_si <0x13, S_XOR_B64>; -def S_ANDN2_B32_si : SOP2_Real_si <0x14, S_ANDN2_B32>; -def S_ANDN2_B64_si : SOP2_Real_si <0x15, S_ANDN2_B64>; -def S_ORN2_B32_si : SOP2_Real_si <0x16, S_ORN2_B32>; -def S_ORN2_B64_si : SOP2_Real_si <0x17, S_ORN2_B64>; -def S_NAND_B32_si : SOP2_Real_si <0x18, S_NAND_B32>; -def S_NAND_B64_si : SOP2_Real_si <0x19, S_NAND_B64>; -def S_NOR_B32_si : SOP2_Real_si <0x1a, S_NOR_B32>; -def S_NOR_B64_si : SOP2_Real_si <0x1b, S_NOR_B64>; -def S_XNOR_B32_si : SOP2_Real_si <0x1c, S_XNOR_B32>; -def S_XNOR_B64_si : SOP2_Real_si <0x1d, S_XNOR_B64>; -def S_LSHL_B32_si : SOP2_Real_si <0x1e, S_LSHL_B32>; -def S_LSHL_B64_si : SOP2_Real_si <0x1f, S_LSHL_B64>; -def S_LSHR_B32_si : SOP2_Real_si <0x20, S_LSHR_B32>; -def S_LSHR_B64_si : SOP2_Real_si <0x21, S_LSHR_B64>; -def S_ASHR_I32_si : SOP2_Real_si <0x22, S_ASHR_I32>; -def S_ASHR_I64_si : SOP2_Real_si <0x23, S_ASHR_I64>; -def S_BFM_B32_si : SOP2_Real_si <0x24, S_BFM_B32>; -def S_BFM_B64_si : SOP2_Real_si <0x25, S_BFM_B64>; -def S_MUL_I32_si : SOP2_Real_si <0x26, S_MUL_I32>; -def S_BFE_U32_si : SOP2_Real_si <0x27, S_BFE_U32>; -def S_BFE_I32_si : SOP2_Real_si <0x28, S_BFE_I32>; -def S_BFE_U64_si : SOP2_Real_si <0x29, S_BFE_U64>; -def S_BFE_I64_si : SOP2_Real_si <0x2a, S_BFE_I64>; -def S_CBRANCH_G_FORK_si : SOP2_Real_si <0x2b, S_CBRANCH_G_FORK>; -def S_ABSDIFF_I32_si : SOP2_Real_si <0x2c, S_ABSDIFF_I32>; - -def S_MOVK_I32_si : SOPK_Real_si <0x00, S_MOVK_I32>; -def S_CMOVK_I32_si : SOPK_Real_si <0x02, S_CMOVK_I32>; -def S_CMPK_EQ_I32_si : SOPK_Real_si <0x03, S_CMPK_EQ_I32>; -def S_CMPK_LG_I32_si : SOPK_Real_si <0x04, S_CMPK_LG_I32>; -def S_CMPK_GT_I32_si : SOPK_Real_si <0x05, S_CMPK_GT_I32>; -def S_CMPK_GE_I32_si : SOPK_Real_si <0x06, S_CMPK_GE_I32>; -def S_CMPK_LT_I32_si : SOPK_Real_si <0x07, S_CMPK_LT_I32>; -def S_CMPK_LE_I32_si : SOPK_Real_si <0x08, S_CMPK_LE_I32>; -def S_CMPK_EQ_U32_si : SOPK_Real_si <0x09, S_CMPK_EQ_U32>; -def S_CMPK_LG_U32_si : SOPK_Real_si <0x0a, S_CMPK_LG_U32>; -def S_CMPK_GT_U32_si : SOPK_Real_si <0x0b, S_CMPK_GT_U32>; -def S_CMPK_GE_U32_si : SOPK_Real_si <0x0c, S_CMPK_GE_U32>; -def S_CMPK_LT_U32_si : SOPK_Real_si <0x0d, S_CMPK_LT_U32>; -def S_CMPK_LE_U32_si : SOPK_Real_si <0x0e, S_CMPK_LE_U32>; -def S_ADDK_I32_si : SOPK_Real_si <0x0f, S_ADDK_I32>; -def S_MULK_I32_si : SOPK_Real_si <0x10, S_MULK_I32>; -def S_CBRANCH_I_FORK_si : SOPK_Real_si <0x11, S_CBRANCH_I_FORK>; -def S_GETREG_B32_si : SOPK_Real_si <0x12, S_GETREG_B32>; -def S_SETREG_B32_si : SOPK_Real_si <0x13, S_SETREG_B32>; -//def S_GETREG_REGRD_B32_si : SOPK_Real_si <0x14, S_GETREG_REGRD_B32>; // see pseudo for comments -def S_SETREG_IMM32_B32_si : SOPK_Real64<0x15, S_SETREG_IMM32_B32>, - Select_si<S_SETREG_IMM32_B32.Mnemonic>; +//===----------------------------------------------------------------------===// +// SOP2 - GFX10. 
+//===----------------------------------------------------------------------===// + +multiclass SOP2_Real_gfx10<bits<7> op> { + def _gfx10 : SOP2_Real<op, !cast<SOP2_Pseudo>(NAME)>, + Select_gfx10<!cast<SOP2_Pseudo>(NAME).Mnemonic>; +} + +defm S_LSHL1_ADD_U32 : SOP2_Real_gfx10<0x02e>; +defm S_LSHL2_ADD_U32 : SOP2_Real_gfx10<0x02f>; +defm S_LSHL3_ADD_U32 : SOP2_Real_gfx10<0x030>; +defm S_LSHL4_ADD_U32 : SOP2_Real_gfx10<0x031>; +defm S_PACK_LL_B32_B16 : SOP2_Real_gfx10<0x032>; +defm S_PACK_LH_B32_B16 : SOP2_Real_gfx10<0x033>; +defm S_PACK_HH_B32_B16 : SOP2_Real_gfx10<0x034>; +defm S_MUL_HI_U32 : SOP2_Real_gfx10<0x035>; +defm S_MUL_HI_I32 : SOP2_Real_gfx10<0x036>; + +//===----------------------------------------------------------------------===// +// SOP2 - GFX6, GFX7. +//===----------------------------------------------------------------------===// +multiclass SOP2_Real_gfx6_gfx7<bits<7> op> { + def _gfx6_gfx7 : SOP2_Real<op, !cast<SOP_Pseudo>(NAME)>, + Select_gfx6_gfx7<!cast<SOP_Pseudo>(NAME).Mnemonic>; +} + +multiclass SOP2_Real_gfx6_gfx7_gfx10<bits<7> op> : + SOP2_Real_gfx6_gfx7<op>, SOP2_Real_gfx10<op>; + +defm S_CBRANCH_G_FORK : SOP2_Real_gfx6_gfx7<0x02b>; + +defm S_ADD_U32 : SOP2_Real_gfx6_gfx7_gfx10<0x000>; +defm S_SUB_U32 : SOP2_Real_gfx6_gfx7_gfx10<0x001>; +defm S_ADD_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x002>; +defm S_SUB_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x003>; +defm S_ADDC_U32 : SOP2_Real_gfx6_gfx7_gfx10<0x004>; +defm S_SUBB_U32 : SOP2_Real_gfx6_gfx7_gfx10<0x005>; +defm S_MIN_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x006>; +defm S_MIN_U32 : SOP2_Real_gfx6_gfx7_gfx10<0x007>; +defm S_MAX_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x008>; +defm S_MAX_U32 : SOP2_Real_gfx6_gfx7_gfx10<0x009>; +defm S_CSELECT_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x00a>; +defm S_CSELECT_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x00b>; +defm S_AND_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x00e>; +defm S_AND_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x00f>; +defm S_OR_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x010>; +defm S_OR_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x011>; +defm S_XOR_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x012>; +defm S_XOR_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x013>; +defm S_ANDN2_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x014>; +defm S_ANDN2_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x015>; +defm S_ORN2_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x016>; +defm S_ORN2_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x017>; +defm S_NAND_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x018>; +defm S_NAND_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x019>; +defm S_NOR_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x01a>; +defm S_NOR_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x01b>; +defm S_XNOR_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x01c>; +defm S_XNOR_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x01d>; +defm S_LSHL_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x01e>; +defm S_LSHL_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x01f>; +defm S_LSHR_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x020>; +defm S_LSHR_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x021>; +defm S_ASHR_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x022>; +defm S_ASHR_I64 : SOP2_Real_gfx6_gfx7_gfx10<0x023>; +defm S_BFM_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x024>; +defm S_BFM_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x025>; +defm S_MUL_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x026>; +defm S_BFE_U32 : SOP2_Real_gfx6_gfx7_gfx10<0x027>; +defm S_BFE_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x028>; +defm S_BFE_U64 : SOP2_Real_gfx6_gfx7_gfx10<0x029>; +defm S_BFE_I64 : SOP2_Real_gfx6_gfx7_gfx10<0x02a>; +defm S_ABSDIFF_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x02c>; + +//===----------------------------------------------------------------------===// +// SOPK - GFX10. 
+//===----------------------------------------------------------------------===// + +multiclass SOPK_Real32_gfx10<bits<5> op> { + def _gfx10 : SOPK_Real32<op, !cast<SOPK_Pseudo>(NAME)>, + Select_gfx10<!cast<SOPK_Pseudo>(NAME).Mnemonic>; +} + +multiclass SOPK_Real64_gfx10<bits<5> op> { + def _gfx10 : SOPK_Real64<op, !cast<SOPK_Pseudo>(NAME)>, + Select_gfx10<!cast<SOPK_Pseudo>(NAME).Mnemonic>; +} + +defm S_VERSION : SOPK_Real32_gfx10<0x001>; +defm S_CALL_B64 : SOPK_Real32_gfx10<0x016>; +defm S_WAITCNT_VSCNT : SOPK_Real32_gfx10<0x017>; +defm S_WAITCNT_VMCNT : SOPK_Real32_gfx10<0x018>; +defm S_WAITCNT_EXPCNT : SOPK_Real32_gfx10<0x019>; +defm S_WAITCNT_LGKMCNT : SOPK_Real32_gfx10<0x01a>; +defm S_SUBVECTOR_LOOP_BEGIN : SOPK_Real32_gfx10<0x01b>; +defm S_SUBVECTOR_LOOP_END : SOPK_Real32_gfx10<0x01c>; + +//===----------------------------------------------------------------------===// +// SOPK - GFX6, GFX7. +//===----------------------------------------------------------------------===// + +multiclass SOPK_Real32_gfx6_gfx7<bits<5> op> { + def _gfx6_gfx7 : SOPK_Real32<op, !cast<SOPK_Pseudo>(NAME)>, + Select_gfx6_gfx7<!cast<SOPK_Pseudo>(NAME).Mnemonic>; +} + +multiclass SOPK_Real64_gfx6_gfx7<bits<5> op> { + def _gfx6_gfx7 : SOPK_Real64<op, !cast<SOPK_Pseudo>(NAME)>, + Select_gfx6_gfx7<!cast<SOPK_Pseudo>(NAME).Mnemonic>; +} + +multiclass SOPK_Real32_gfx6_gfx7_gfx10<bits<5> op> : + SOPK_Real32_gfx6_gfx7<op>, SOPK_Real32_gfx10<op>; + +multiclass SOPK_Real64_gfx6_gfx7_gfx10<bits<5> op> : + SOPK_Real64_gfx6_gfx7<op>, SOPK_Real64_gfx10<op>; + +defm S_CBRANCH_I_FORK : SOPK_Real32_gfx6_gfx7<0x011>; + +defm S_MOVK_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x000>; +defm S_CMOVK_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x002>; +defm S_CMPK_EQ_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x003>; +defm S_CMPK_LG_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x004>; +defm S_CMPK_GT_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x005>; +defm S_CMPK_GE_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x006>; +defm S_CMPK_LT_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x007>; +defm S_CMPK_LE_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x008>; +defm S_CMPK_EQ_U32 : SOPK_Real32_gfx6_gfx7_gfx10<0x009>; +defm S_CMPK_LG_U32 : SOPK_Real32_gfx6_gfx7_gfx10<0x00a>; +defm S_CMPK_GT_U32 : SOPK_Real32_gfx6_gfx7_gfx10<0x00b>; +defm S_CMPK_GE_U32 : SOPK_Real32_gfx6_gfx7_gfx10<0x00c>; +defm S_CMPK_LT_U32 : SOPK_Real32_gfx6_gfx7_gfx10<0x00d>; +defm S_CMPK_LE_U32 : SOPK_Real32_gfx6_gfx7_gfx10<0x00e>; +defm S_ADDK_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x00f>; +defm S_MULK_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x010>; +defm S_GETREG_B32 : SOPK_Real32_gfx6_gfx7_gfx10<0x012>; +defm S_SETREG_B32 : SOPK_Real32_gfx6_gfx7_gfx10<0x013>; +defm S_SETREG_IMM32_B32 : SOPK_Real64_gfx6_gfx7_gfx10<0x015>; + +//===----------------------------------------------------------------------===// +// GFX8, GFX9 (VI). 
+//===----------------------------------------------------------------------===// class Select_vi<string opName> : SIMCInstr<opName, SIEncodingFamily.VI> { - list<Predicate> AssemblerPredicates = [isVI]; - string DecoderNamespace = "VI"; + list<Predicate> AssemblerPredicates = [isGFX8GFX9]; + string DecoderNamespace = "GFX8"; } class SOP1_Real_vi<bits<8> op, SOP1_Pseudo ps> : diff --git a/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp b/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp index e4c442db3016..30cf12337c6e 100644 --- a/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp +++ b/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp @@ -1,9 +1,8 @@ //===-- TargetInfo/AMDGPUTargetInfo.cpp - TargetInfo for AMDGPU -----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -11,7 +10,7 @@ // //===----------------------------------------------------------------------===// -#include "AMDGPUTargetMachine.h" +#include "TargetInfo/AMDGPUTargetInfo.h" #include "llvm/Support/TargetRegistry.h" using namespace llvm; diff --git a/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.h b/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.h new file mode 100644 index 000000000000..1e6dbd90b0c1 --- /dev/null +++ b/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.h @@ -0,0 +1,29 @@ +//===-- TargetInfo/AMDGPUTargetInfo.h - TargetInfo for AMDGPU ---*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_TARGETINFO_AMDGPUTARGETINFO_H +#define LLVM_LIB_TARGET_AMDGPU_TARGETINFO_AMDGPUTARGETINFO_H + +namespace llvm { + +class Target; + +/// The target which supports all AMD GPUs. This will eventually +/// be deprecated and there will be a R600 target and a GCN target. +Target &getTheAMDGPUTarget(); + +/// The target for GCN GPUs +Target &getTheGCNTarget(); + +} + +#endif // LLVM_LIB_TARGET_AMDGPU_TARGETINFO_AMDGPUTARGETINFO_H diff --git a/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp index 9eb4c6513cce..075e08986c0c 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp +++ b/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp @@ -1,9 +1,8 @@ //===-- AMDGPUAsmUtils.cpp - AsmParser/InstPrinter common -----------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #include "AMDGPUAsmUtils.h" @@ -23,8 +22,8 @@ const char* const IdSymbolic[] = { nullptr, nullptr, nullptr, - nullptr, - nullptr, + "MSG_GS_ALLOC_REQ", + "MSG_GET_DOORBELL", nullptr, nullptr, nullptr, @@ -69,7 +68,17 @@ const char* const IdSymbolic[] = { nullptr, nullptr, nullptr, - "HW_REG_SH_MEM_BASES" + "HW_REG_SH_MEM_BASES", + "HW_REG_TBA_LO", + "HW_REG_TBA_HI", + "HW_REG_TMA_LO", + "HW_REG_TMA_HI", + "HW_REG_FLAT_SCR_LO", + "HW_REG_FLAT_SCR_HI", + "HW_REG_XNACK_MASK", + nullptr, // HW_ID1, no predictable values + nullptr, // HW_ID2, no predictable values + "HW_REG_POPS_PACKER" }; } // namespace Hwreg @@ -86,5 +95,18 @@ const char* const IdSymbolic[] = { }; } // namespace Swizzle + +namespace VGPRIndexMode { + +// This must be in sync with llvm::AMDGPU::VGPRIndexMode::Id enum members, see SIDefines.h. +const char* const IdSymbolic[] = { + "SRC0", + "SRC1", + "SRC2", + "DST", +}; + +} // namespace VGPRIndexMode + } // namespace AMDGPU } // namespace llvm diff --git a/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h b/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h index ebb2be22b487..cd91c5f6edd5 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h +++ b/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h @@ -1,9 +1,8 @@ //===-- AMDGPUAsmUtils.h - AsmParser/InstPrinter common ---------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -31,6 +30,13 @@ namespace Swizzle { // Symbolic names for the swizzle(...) syntax. extern const char* const IdSymbolic[]; } // namespace Swizzle + +namespace VGPRIndexMode { // Symbolic names for the gpr_idx(...) syntax. + +extern const char* const IdSymbolic[]; + +} // namespace VGPRIndexMode + } // namespace AMDGPU } // namespace llvm diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 54c866bdc63c..e90f40e6abea 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -1,9 +1,8 @@ //===- AMDGPUBaseInfo.cpp - AMDGPU Base encoding information --------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -11,6 +10,7 @@ #include "AMDGPUTargetTransformInfo.h" #include "AMDGPU.h" #include "SIDefines.h" +#include "AMDGPUAsmUtils.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" #include "llvm/BinaryFormat/ELF.h" @@ -85,7 +85,9 @@ unsigned getExpcntBitWidth() { return 3; } unsigned getLgkmcntBitShift() { return 8; } /// \returns Lgkmcnt bit width. -unsigned getLgkmcntBitWidth() { return 4; } +unsigned getLgkmcntBitWidth(unsigned VersionMajor) { + return (VersionMajor >= 10) ? 
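// [Editor's note, illustrative only; not part of the original change:]
// gfx10 widens the lgkmcnt field of the s_waitcnt immediate from 4 to 6
// bits while the shift stays at 8 (getLgkmcntBitShift above), so with
// getLgkmcntBitMask further down:
//   pre-gfx10: mask = (1 << 4) - 1 = 0xf,  field = simm16[11:8]
//   gfx10+:    mask = (1 << 6) - 1 = 0x3f, field = simm16[13:8]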
6 : 4; +} /// \returns Vmcnt bit shift (higher bits). unsigned getVmcntBitShiftHi() { return 14; } @@ -99,18 +101,11 @@ namespace llvm { namespace AMDGPU { -struct MIMGInfo { - uint16_t Opcode; - uint16_t BaseOpcode; - uint8_t MIMGEncoding; - uint8_t VDataDwords; - uint8_t VAddrDwords; -}; - #define GET_MIMGBaseOpcodesTable_IMPL #define GET_MIMGDimInfoTable_IMPL #define GET_MIMGInfoTable_IMPL #define GET_MIMGLZMappingTable_IMPL +#define GET_MIMGMIPMappingTable_IMPL #include "AMDGPUGenSearchableTables.inc" int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, @@ -120,6 +115,11 @@ int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, return Info ? Info->Opcode : -1; } +const MIMGBaseOpcodeInfo *getMIMGBaseOpcode(unsigned Opc) { + const MIMGInfo *Info = getMIMGInfo(Opc); + return Info ? getMIMGBaseOpcodeInfo(Info->BaseOpcode) : nullptr; +} + int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels) { const MIMGInfo *OrigInfo = getMIMGInfo(Opc); const MIMGInfo *NewInfo = @@ -230,7 +230,8 @@ unsigned getEUsPerCU(const MCSubtargetInfo *STI) { unsigned getMaxWorkGroupsPerCU(const MCSubtargetInfo *STI, unsigned FlatWorkGroupSize) { - if (!STI->getFeatureBits().test(FeatureGCN)) + assert(FlatWorkGroupSize != 0); + if (STI->getTargetTriple().getArch() != Triple::amdgcn) return 8; unsigned N = getWavesPerWorkGroup(STI, FlatWorkGroupSize); if (N == 1) @@ -279,6 +280,8 @@ unsigned getWavesPerWorkGroup(const MCSubtargetInfo *STI, unsigned getSGPRAllocGranule(const MCSubtargetInfo *STI) { IsaVersion Version = getIsaVersion(STI->getCPU()); + if (Version.Major >= 10) + return getAddressableNumSGPRs(STI); if (Version.Major >= 8) return 16; return 8; @@ -300,6 +303,8 @@ unsigned getAddressableNumSGPRs(const MCSubtargetInfo *STI) { return FIXED_NUM_SGPRS_FOR_INIT_BUG; IsaVersion Version = getIsaVersion(STI->getCPU()); + if (Version.Major >= 10) + return 106; if (Version.Major >= 8) return 102; return 104; @@ -308,6 +313,10 @@ unsigned getAddressableNumSGPRs(const MCSubtargetInfo *STI) { unsigned getMinNumSGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) { assert(WavesPerEU != 0); + IsaVersion Version = getIsaVersion(STI->getCPU()); + if (Version.Major >= 10) + return 0; + if (WavesPerEU >= getMaxWavesPerEU()) return 0; @@ -322,8 +331,10 @@ unsigned getMaxNumSGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU, bool Addressable) { assert(WavesPerEU != 0); - IsaVersion Version = getIsaVersion(STI->getCPU()); unsigned AddressableNumSGPRs = getAddressableNumSGPRs(STI); + IsaVersion Version = getIsaVersion(STI->getCPU()); + if (Version.Major >= 10) + return Addressable ? AddressableNumSGPRs : 108; if (Version.Major >= 8 && !Addressable) AddressableNumSGPRs = 112; unsigned MaxNumSGPRs = getTotalNumSGPRs(STI) / WavesPerEU; @@ -340,6 +351,9 @@ unsigned getNumExtraSGPRs(const MCSubtargetInfo *STI, bool VCCUsed, ExtraSGPRs = 2; IsaVersion Version = getIsaVersion(STI->getCPU()); + if (Version.Major >= 10) + return ExtraSGPRs; + if (Version.Major < 8) { if (FlatScrUsed) ExtraSGPRs = 4; @@ -366,12 +380,17 @@ unsigned getNumSGPRBlocks(const MCSubtargetInfo *STI, unsigned NumSGPRs) { return NumSGPRs / getSGPREncodingGranule(STI) - 1; } -unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI) { - return 4; +unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI, + Optional<bool> EnableWavefrontSize32) { + bool IsWave32 = EnableWavefrontSize32 ? + *EnableWavefrontSize32 : + STI->getFeatureBits().test(FeatureWavefrontSize32); + return IsWave32 ? 
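// [Editor's note, illustrative only; not part of the original change:]
// wave32 subtargets allocate VGPRs in granules of 8, wave64 in granules
// of 4. Feeding this into getNumVGPRBlocks below, a kernel using 17 VGPRs
// gets:
//   wave64: alignTo(17, 4) / 4 - 1 = 20 / 4 - 1 = 4
//   wave32: alignTo(17, 8) / 8 - 1 = 24 / 8 - 1 = 2
// (the encoded VGPRBlocks value is the block count minus one).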
8 : 4; } -unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI) { - return getVGPRAllocGranule(STI); +unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI, + Optional<bool> EnableWavefrontSize32) { + return getVGPRAllocGranule(STI, EnableWavefrontSize32); } unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI) { @@ -402,10 +421,12 @@ unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) { return std::min(MaxNumVGPRs, AddressableNumVGPRs); } -unsigned getNumVGPRBlocks(const MCSubtargetInfo *STI, unsigned NumVGPRs) { - NumVGPRs = alignTo(std::max(1u, NumVGPRs), getVGPREncodingGranule(STI)); +unsigned getNumVGPRBlocks(const MCSubtargetInfo *STI, unsigned NumVGPRs, + Optional<bool> EnableWavefrontSize32) { + NumVGPRs = alignTo(std::max(1u, NumVGPRs), + getVGPREncodingGranule(STI, EnableWavefrontSize32)); // VGPRBlocks is actual number of VGPR blocks minus 1. - return NumVGPRs / getVGPREncodingGranule(STI) - 1; + return NumVGPRs / getVGPREncodingGranule(STI, EnableWavefrontSize32) - 1; } } // end namespace IsaInfo @@ -423,7 +444,6 @@ void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header, Header.amd_machine_version_minor = Version.Minor; Header.amd_machine_version_stepping = Version.Stepping; Header.kernel_code_entry_byte_offset = sizeof(Header); - // wavefront_size is specified as a power of 2: 2^6 = 64 threads. Header.wavefront_size = 6; // If the code object does not support indirect functions, then the value must @@ -435,11 +455,25 @@ void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header, Header.kernarg_segment_alignment = 4; Header.group_segment_alignment = 4; Header.private_segment_alignment = 4; + + if (Version.Major >= 10) { + if (STI->getFeatureBits().test(FeatureWavefrontSize32)) { + Header.wavefront_size = 5; + Header.code_properties |= AMD_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32; + } + Header.compute_pgm_resource_registers |= + S_00B848_WGP_MODE(STI->getFeatureBits().test(FeatureCuMode) ? 0 : 1) | + S_00B848_MEM_ORDERED(1); + } } -amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor() { +amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor( + const MCSubtargetInfo *STI) { + IsaVersion Version = getIsaVersion(STI->getCPU()); + amdhsa::kernel_descriptor_t KD; memset(&KD, 0, sizeof(KD)); + AMDHSA_BITS_SET(KD.compute_pgm_rsrc1, amdhsa::COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64, amdhsa::FLOAT_DENORM_MODE_FLUSH_NONE); @@ -449,6 +483,16 @@ amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor() { amdhsa::COMPUTE_PGM_RSRC1_ENABLE_IEEE_MODE, 1); AMDHSA_BITS_SET(KD.compute_pgm_rsrc2, amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X, 1); + if (Version.Major >= 10) { + AMDHSA_BITS_SET(KD.kernel_code_properties, + amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32, + STI->getFeatureBits().test(FeatureWavefrontSize32) ? 1 : 0); + AMDHSA_BITS_SET(KD.compute_pgm_rsrc1, + amdhsa::COMPUTE_PGM_RSRC1_WGP_MODE, + STI->getFeatureBits().test(FeatureCuMode) ? 
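// [Editor's note, illustrative only; not part of the original change:]
// this mirrors the amd_kernel_code_t setup in initDefaultAMDKernelCodeT
// above: WGP_MODE is set exactly when the subtarget is *not* in CU mode,
// and MEM_ORDERED is unconditionally set for gfx10. E.g. a wave32,
// WGP-mode gfx10 target gets ENABLE_WAVEFRONT_SIZE32 = 1, WGP_MODE = 1,
// MEM_ORDERED = 1 in its kernel descriptor.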
0 : 1); + AMDHSA_BITS_SET(KD.compute_pgm_rsrc1, + amdhsa::COMPUTE_PGM_RSRC1_MEM_ORDERED, 1); + } return KD; } @@ -523,13 +567,14 @@ unsigned getExpcntBitMask(const IsaVersion &Version) { } unsigned getLgkmcntBitMask(const IsaVersion &Version) { - return (1 << getLgkmcntBitWidth()) - 1; + return (1 << getLgkmcntBitWidth(Version.Major)) - 1; } unsigned getWaitcntBitMask(const IsaVersion &Version) { unsigned VmcntLo = getBitMask(getVmcntBitShiftLo(), getVmcntBitWidthLo()); unsigned Expcnt = getBitMask(getExpcntBitShift(), getExpcntBitWidth()); - unsigned Lgkmcnt = getBitMask(getLgkmcntBitShift(), getLgkmcntBitWidth()); + unsigned Lgkmcnt = getBitMask(getLgkmcntBitShift(), + getLgkmcntBitWidth(Version.Major)); unsigned Waitcnt = VmcntLo | Expcnt | Lgkmcnt; if (Version.Major < 9) return Waitcnt; @@ -555,7 +600,8 @@ unsigned decodeExpcnt(const IsaVersion &Version, unsigned Waitcnt) { } unsigned decodeLgkmcnt(const IsaVersion &Version, unsigned Waitcnt) { - return unpackBits(Waitcnt, getLgkmcntBitShift(), getLgkmcntBitWidth()); + return unpackBits(Waitcnt, getLgkmcntBitShift(), + getLgkmcntBitWidth(Version.Major)); } void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt, @@ -591,7 +637,8 @@ unsigned encodeExpcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned encodeLgkmcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned Lgkmcnt) { - return packBits(Lgkmcnt, Waitcnt, getLgkmcntBitShift(), getLgkmcntBitWidth()); + return packBits(Lgkmcnt, Waitcnt, getLgkmcntBitShift(), + getLgkmcntBitWidth(Version.Major)); } unsigned encodeWaitcnt(const IsaVersion &Version, @@ -607,6 +654,181 @@ unsigned encodeWaitcnt(const IsaVersion &Version, const Waitcnt &Decoded) { return encodeWaitcnt(Version, Decoded.VmCnt, Decoded.ExpCnt, Decoded.LgkmCnt); } +//===----------------------------------------------------------------------===// +// hwreg +//===----------------------------------------------------------------------===// + +namespace Hwreg { + +int64_t getHwregId(const StringRef Name) { + for (int Id = ID_SYMBOLIC_FIRST_; Id < ID_SYMBOLIC_LAST_; ++Id) { + if (IdSymbolic[Id] && Name == IdSymbolic[Id]) + return Id; + } + return ID_UNKNOWN_; +} + +static unsigned getLastSymbolicHwreg(const MCSubtargetInfo &STI) { + if (isSI(STI) || isCI(STI) || isVI(STI)) + return ID_SYMBOLIC_FIRST_GFX9_; + else if (isGFX9(STI)) + return ID_SYMBOLIC_FIRST_GFX10_; + else + return ID_SYMBOLIC_LAST_; +} + +bool isValidHwreg(int64_t Id, const MCSubtargetInfo &STI) { + return ID_SYMBOLIC_FIRST_ <= Id && Id < getLastSymbolicHwreg(STI) && + IdSymbolic[Id]; +} + +bool isValidHwreg(int64_t Id) { + return 0 <= Id && isUInt<ID_WIDTH_>(Id); +} + +bool isValidHwregOffset(int64_t Offset) { + return 0 <= Offset && isUInt<OFFSET_WIDTH_>(Offset); +} + +bool isValidHwregWidth(int64_t Width) { + return 0 <= (Width - 1) && isUInt<WIDTH_M1_WIDTH_>(Width - 1); +} + +uint64_t encodeHwreg(uint64_t Id, uint64_t Offset, uint64_t Width) { + return (Id << ID_SHIFT_) | + (Offset << OFFSET_SHIFT_) | + ((Width - 1) << WIDTH_M1_SHIFT_); +} + +StringRef getHwreg(unsigned Id, const MCSubtargetInfo &STI) { + return isValidHwreg(Id, STI) ? 
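// [Editor's note, illustrative only; not part of the original change:]
// encodeHwreg above packs the s_getreg/s_setreg immediate as
//   id | (offset << OFFSET_SHIFT_) | ((width - 1) << WIDTH_M1_SHIFT_).
// Assuming the usual SIDefines.h layout (6-bit id at bit 0, 5-bit offset
// at bit 6, 5-bit width-1 at bit 11; the constants are not shown in this
// diff), a full 32-bit read of the hwreg with id 15 would encode as
//   15 | (0 << 6) | (31 << 11) = 0xf80f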
IdSymbolic[Id] : ""; +} + +void decodeHwreg(unsigned Val, unsigned &Id, unsigned &Offset, unsigned &Width) { + Id = (Val & ID_MASK_) >> ID_SHIFT_; + Offset = (Val & OFFSET_MASK_) >> OFFSET_SHIFT_; + Width = ((Val & WIDTH_M1_MASK_) >> WIDTH_M1_SHIFT_) + 1; +} + +} // namespace Hwreg + +//===----------------------------------------------------------------------===// +// SendMsg +//===----------------------------------------------------------------------===// + +namespace SendMsg { + +int64_t getMsgId(const StringRef Name) { + for (int i = ID_GAPS_FIRST_; i < ID_GAPS_LAST_; ++i) { + if (IdSymbolic[i] && Name == IdSymbolic[i]) + return i; + } + return ID_UNKNOWN_; +} + +static bool isValidMsgId(int64_t MsgId) { + return (ID_GAPS_FIRST_ <= MsgId && MsgId < ID_GAPS_LAST_) && IdSymbolic[MsgId]; +} + +bool isValidMsgId(int64_t MsgId, const MCSubtargetInfo &STI, bool Strict) { + if (Strict) { + if (MsgId == ID_GS_ALLOC_REQ || MsgId == ID_GET_DOORBELL) + return isGFX9(STI) || isGFX10(STI); + else + return isValidMsgId(MsgId); + } else { + return 0 <= MsgId && isUInt<ID_WIDTH_>(MsgId); + } +} + +StringRef getMsgName(int64_t MsgId) { + return isValidMsgId(MsgId)? IdSymbolic[MsgId] : ""; +} + +int64_t getMsgOpId(int64_t MsgId, const StringRef Name) { + const char* const *S = (MsgId == ID_SYSMSG) ? OpSysSymbolic : OpGsSymbolic; + const int F = (MsgId == ID_SYSMSG) ? OP_SYS_FIRST_ : OP_GS_FIRST_; + const int L = (MsgId == ID_SYSMSG) ? OP_SYS_LAST_ : OP_GS_LAST_; + for (int i = F; i < L; ++i) { + if (Name == S[i]) { + return i; + } + } + return OP_UNKNOWN_; +} + +bool isValidMsgOp(int64_t MsgId, int64_t OpId, bool Strict) { + + if (!Strict) + return 0 <= OpId && isUInt<OP_WIDTH_>(OpId); + + switch(MsgId) + { + case ID_GS: + return (OP_GS_FIRST_ <= OpId && OpId < OP_GS_LAST_) && OpId != OP_GS_NOP; + case ID_GS_DONE: + return OP_GS_FIRST_ <= OpId && OpId < OP_GS_LAST_; + case ID_SYSMSG: + return OP_SYS_FIRST_ <= OpId && OpId < OP_SYS_LAST_; + default: + return OpId == OP_NONE_; + } +} + +StringRef getMsgOpName(int64_t MsgId, int64_t OpId) { + assert(msgRequiresOp(MsgId)); + return (MsgId == ID_SYSMSG)? OpSysSymbolic[OpId] : OpGsSymbolic[OpId]; +} + +bool isValidMsgStream(int64_t MsgId, int64_t OpId, int64_t StreamId, bool Strict) { + + if (!Strict) + return 0 <= StreamId && isUInt<STREAM_ID_WIDTH_>(StreamId); + + switch(MsgId) + { + case ID_GS: + return STREAM_ID_FIRST_ <= StreamId && StreamId < STREAM_ID_LAST_; + case ID_GS_DONE: + return (OpId == OP_GS_NOP)? 
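// [Editor's note, illustrative only; not part of the original change:]
// encodeMsg/decodeMsg below pack the s_sendmsg immediate as
//   msgId | (opId << OP_SHIFT_) | (streamId << STREAM_ID_SHIFT_).
// Assuming the SIDefines.h values (msg id at bit 0, op at bit 4, stream
// id at bit 8, with ID_GS = 2 and OP_GS_EMIT = 2; none of these constants
// appear in this diff), sendmsg(MSG_GS, GS_OP_EMIT, 0) encodes as
//   2 | (2 << 4) | (0 << 8) = 0x22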
+ (StreamId == STREAM_ID_NONE_) : + (STREAM_ID_FIRST_ <= StreamId && StreamId < STREAM_ID_LAST_); + default: + return StreamId == STREAM_ID_NONE_; + } +} + +bool msgRequiresOp(int64_t MsgId) { + return MsgId == ID_GS || MsgId == ID_GS_DONE || MsgId == ID_SYSMSG; +} + +bool msgSupportsStream(int64_t MsgId, int64_t OpId) { + return (MsgId == ID_GS || MsgId == ID_GS_DONE) && OpId != OP_GS_NOP; +} + +void decodeMsg(unsigned Val, + uint16_t &MsgId, + uint16_t &OpId, + uint16_t &StreamId) { + MsgId = Val & ID_MASK_; + OpId = (Val & OP_MASK_) >> OP_SHIFT_; + StreamId = (Val & STREAM_ID_MASK_) >> STREAM_ID_SHIFT_; +} + +uint64_t encodeMsg(uint64_t MsgId, + uint64_t OpId, + uint64_t StreamId) { + return (MsgId << ID_SHIFT_) | + (OpId << OP_SHIFT_) | + (StreamId << STREAM_ID_SHIFT_); +} + +} // namespace SendMsg + +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + unsigned getInitialPSInputAddr(const Function &F) { return getIntegerAttribute(F, "InitialPSInputAddr", 0); } @@ -679,6 +901,10 @@ bool isGFX9(const MCSubtargetInfo &STI) { return STI.getFeatureBits()[AMDGPU::FeatureGFX9]; } +bool isGFX10(const MCSubtargetInfo &STI) { + return STI.getFeatureBits()[AMDGPU::FeatureGFX10]; +} + bool isGCN3Encoding(const MCSubtargetInfo &STI) { return STI.getFeatureBits()[AMDGPU::FeatureGCN3Encoding]; } @@ -704,46 +930,46 @@ bool isRegIntersect(unsigned Reg0, unsigned Reg1, const MCRegisterInfo* TRI) { CASE_CI_VI(FLAT_SCR) \ CASE_CI_VI(FLAT_SCR_LO) \ CASE_CI_VI(FLAT_SCR_HI) \ - CASE_VI_GFX9(TTMP0) \ - CASE_VI_GFX9(TTMP1) \ - CASE_VI_GFX9(TTMP2) \ - CASE_VI_GFX9(TTMP3) \ - CASE_VI_GFX9(TTMP4) \ - CASE_VI_GFX9(TTMP5) \ - CASE_VI_GFX9(TTMP6) \ - CASE_VI_GFX9(TTMP7) \ - CASE_VI_GFX9(TTMP8) \ - CASE_VI_GFX9(TTMP9) \ - CASE_VI_GFX9(TTMP10) \ - CASE_VI_GFX9(TTMP11) \ - CASE_VI_GFX9(TTMP12) \ - CASE_VI_GFX9(TTMP13) \ - CASE_VI_GFX9(TTMP14) \ - CASE_VI_GFX9(TTMP15) \ - CASE_VI_GFX9(TTMP0_TTMP1) \ - CASE_VI_GFX9(TTMP2_TTMP3) \ - CASE_VI_GFX9(TTMP4_TTMP5) \ - CASE_VI_GFX9(TTMP6_TTMP7) \ - CASE_VI_GFX9(TTMP8_TTMP9) \ - CASE_VI_GFX9(TTMP10_TTMP11) \ - CASE_VI_GFX9(TTMP12_TTMP13) \ - CASE_VI_GFX9(TTMP14_TTMP15) \ - CASE_VI_GFX9(TTMP0_TTMP1_TTMP2_TTMP3) \ - CASE_VI_GFX9(TTMP4_TTMP5_TTMP6_TTMP7) \ - CASE_VI_GFX9(TTMP8_TTMP9_TTMP10_TTMP11) \ - CASE_VI_GFX9(TTMP12_TTMP13_TTMP14_TTMP15) \ - CASE_VI_GFX9(TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7) \ - CASE_VI_GFX9(TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11) \ - CASE_VI_GFX9(TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15) \ - CASE_VI_GFX9(TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15) \ + CASE_VI_GFX9_GFX10(TTMP0) \ + CASE_VI_GFX9_GFX10(TTMP1) \ + CASE_VI_GFX9_GFX10(TTMP2) \ + CASE_VI_GFX9_GFX10(TTMP3) \ + CASE_VI_GFX9_GFX10(TTMP4) \ + CASE_VI_GFX9_GFX10(TTMP5) \ + CASE_VI_GFX9_GFX10(TTMP6) \ + CASE_VI_GFX9_GFX10(TTMP7) \ + CASE_VI_GFX9_GFX10(TTMP8) \ + CASE_VI_GFX9_GFX10(TTMP9) \ + CASE_VI_GFX9_GFX10(TTMP10) \ + CASE_VI_GFX9_GFX10(TTMP11) \ + CASE_VI_GFX9_GFX10(TTMP12) \ + CASE_VI_GFX9_GFX10(TTMP13) \ + CASE_VI_GFX9_GFX10(TTMP14) \ + CASE_VI_GFX9_GFX10(TTMP15) \ + CASE_VI_GFX9_GFX10(TTMP0_TTMP1) \ + CASE_VI_GFX9_GFX10(TTMP2_TTMP3) \ + CASE_VI_GFX9_GFX10(TTMP4_TTMP5) \ + CASE_VI_GFX9_GFX10(TTMP6_TTMP7) \ + CASE_VI_GFX9_GFX10(TTMP8_TTMP9) \ + CASE_VI_GFX9_GFX10(TTMP10_TTMP11) \ + CASE_VI_GFX9_GFX10(TTMP12_TTMP13) \ + CASE_VI_GFX9_GFX10(TTMP14_TTMP15) \ + 
CASE_VI_GFX9_GFX10(TTMP0_TTMP1_TTMP2_TTMP3) \ + CASE_VI_GFX9_GFX10(TTMP4_TTMP5_TTMP6_TTMP7) \ + CASE_VI_GFX9_GFX10(TTMP8_TTMP9_TTMP10_TTMP11) \ + CASE_VI_GFX9_GFX10(TTMP12_TTMP13_TTMP14_TTMP15) \ + CASE_VI_GFX9_GFX10(TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7) \ + CASE_VI_GFX9_GFX10(TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11) \ + CASE_VI_GFX9_GFX10(TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15) \ + CASE_VI_GFX9_GFX10(TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15) \ } #define CASE_CI_VI(node) \ assert(!isSI(STI)); \ case node: return isCI(STI) ? node##_ci : node##_vi; -#define CASE_VI_GFX9(node) \ - case node: return isGFX9(STI) ? node##_gfx9 : node##_vi; +#define CASE_VI_GFX9_GFX10(node) \ + case node: return (isGFX9(STI) || isGFX10(STI)) ? node##_gfx9_gfx10 : node##_vi; unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) { if (STI.getTargetTriple().getArch() == Triple::r600) @@ -752,17 +978,17 @@ unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) { } #undef CASE_CI_VI -#undef CASE_VI_GFX9 +#undef CASE_VI_GFX9_GFX10 #define CASE_CI_VI(node) case node##_ci: case node##_vi: return node; -#define CASE_VI_GFX9(node) case node##_vi: case node##_gfx9: return node; +#define CASE_VI_GFX9_GFX10(node) case node##_vi: case node##_gfx9_gfx10: return node; unsigned mc2PseudoReg(unsigned Reg) { MAP_REG2REG } #undef CASE_CI_VI -#undef CASE_VI_GFX9 +#undef CASE_VI_GFX9_GFX10 #undef MAP_REG2REG bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo) { @@ -779,10 +1005,17 @@ bool isSISrcFPOperand(const MCInstrDesc &Desc, unsigned OpNo) { case AMDGPU::OPERAND_REG_IMM_FP32: case AMDGPU::OPERAND_REG_IMM_FP64: case AMDGPU::OPERAND_REG_IMM_FP16: + case AMDGPU::OPERAND_REG_IMM_V2FP16: + case AMDGPU::OPERAND_REG_IMM_V2INT16: case AMDGPU::OPERAND_REG_INLINE_C_FP32: case AMDGPU::OPERAND_REG_INLINE_C_FP64: case AMDGPU::OPERAND_REG_INLINE_C_FP16: case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: + case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_AC_FP32: + case AMDGPU::OPERAND_REG_INLINE_AC_FP16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: return true; default: return false; @@ -802,28 +1035,46 @@ unsigned getRegBitWidth(unsigned RCID) { switch (RCID) { case AMDGPU::SGPR_32RegClassID: case AMDGPU::VGPR_32RegClassID: + case AMDGPU::VRegOrLds_32RegClassID: + case AMDGPU::AGPR_32RegClassID: case AMDGPU::VS_32RegClassID: + case AMDGPU::AV_32RegClassID: case AMDGPU::SReg_32RegClassID: case AMDGPU::SReg_32_XM0RegClassID: + case AMDGPU::SRegOrLds_32RegClassID: return 32; case AMDGPU::SGPR_64RegClassID: case AMDGPU::VS_64RegClassID: + case AMDGPU::AV_64RegClassID: case AMDGPU::SReg_64RegClassID: case AMDGPU::VReg_64RegClassID: + case AMDGPU::AReg_64RegClassID: case AMDGPU::SReg_64_XEXECRegClassID: return 64; + case AMDGPU::SGPR_96RegClassID: + case AMDGPU::SReg_96RegClassID: case AMDGPU::VReg_96RegClassID: return 96; case AMDGPU::SGPR_128RegClassID: case AMDGPU::SReg_128RegClassID: case AMDGPU::VReg_128RegClassID: + case AMDGPU::AReg_128RegClassID: return 128; + case AMDGPU::SGPR_160RegClassID: + case AMDGPU::SReg_160RegClassID: + case AMDGPU::VReg_160RegClassID: + return 160; case AMDGPU::SReg_256RegClassID: case AMDGPU::VReg_256RegClassID: return 256; case AMDGPU::SReg_512RegClassID: case AMDGPU::VReg_512RegClassID: + case AMDGPU::AReg_512RegClassID: return 512; + case AMDGPU::SReg_1024RegClassID: + case AMDGPU::VReg_1024RegClassID: + case 
AMDGPU::AReg_1024RegClassID: + return 1024; default: llvm_unreachable("Unexpected register class"); } @@ -905,6 +1156,13 @@ bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi) { bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi) { assert(HasInv2Pi); + if (isInt<16>(Literal) || isUInt<16>(Literal)) { + int16_t Trunc = static_cast<int16_t>(Literal); + return AMDGPU::isInlinableLiteral16(Trunc, HasInv2Pi); + } + if (!(Literal & 0xffff)) + return AMDGPU::isInlinableLiteral16(Literal >> 16, HasInv2Pi); + int16_t Lo16 = static_cast<int16_t>(Literal); int16_t Hi16 = static_cast<int16_t>(Literal >> 16); return Lo16 == Hi16 && isInlinableLiteral16(Lo16, HasInv2Pi); @@ -936,15 +1194,19 @@ bool isArgPassedInSGPR(const Argument *A) { } } +static bool hasSMEMByteOffset(const MCSubtargetInfo &ST) { + return isGCN3Encoding(ST) || isGFX10(ST); +} + int64_t getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset) { - if (isGCN3Encoding(ST)) + if (hasSMEMByteOffset(ST)) return ByteOffset; return ByteOffset >> 2; } bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset) { int64_t EncodedOffset = getSMRDEncodedOffset(ST, ByteOffset); - return isGCN3Encoding(ST) ? + return (hasSMEMByteOffset(ST)) ? isUInt<20>(EncodedOffset) : isUInt<8>(EncodedOffset); } @@ -994,6 +1256,19 @@ bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, return true; } +SIModeRegisterDefaults::SIModeRegisterDefaults(const Function &F) { + *this = getDefaultForCallingConv(F.getCallingConv()); + + StringRef IEEEAttr = F.getFnAttribute("amdgpu-ieee").getValueAsString(); + if (!IEEEAttr.empty()) + IEEE = IEEEAttr == "true"; + + StringRef DX10ClampAttr + = F.getFnAttribute("amdgpu-dx10-clamp").getValueAsString(); + if (!DX10ClampAttr.empty()) + DX10Clamp = DX10ClampAttr == "true"; +} + namespace { struct SourceOfDivergence { @@ -1009,5 +1284,6 @@ const SourceOfDivergence *lookupSourceOfDivergence(unsigned Intr); bool isIntrinsicSourceOfDivergence(unsigned IntrID) { return lookupSourceOfDivergence(IntrID); } + } // namespace AMDGPU } // namespace llvm diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 20123ed4ac81..209ef7eef749 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -1,9 +1,8 @@ //===- AMDGPUBaseInfo.h - Top level definitions for AMDGPU ------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -46,6 +45,7 @@ namespace AMDGPU { #define GET_MIMGDim_DECL #define GET_MIMGEncoding_DECL #define GET_MIMGLZMapping_DECL +#define GET_MIMGMIPMapping_DECL #include "AMDGPUGenSearchableTables.inc" namespace IsaInfo { @@ -150,10 +150,18 @@ unsigned getNumExtraSGPRs(const MCSubtargetInfo *STI, bool VCCUsed, unsigned getNumSGPRBlocks(const MCSubtargetInfo *STI, unsigned NumSGPRs); /// \returns VGPR allocation granularity for given subtarget \p STI. -unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI); +/// +/// For subtargets which support it, \p EnableWavefrontSize32 should match +/// the ENABLE_WAVEFRONT_SIZE32 kernel descriptor field. 
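A minimal standalone sketch of how a consumer of these granule queries typically
rounds a raw VGPR count up to an allocatable amount may be useful here (hedged:
roundUpToGranule is an illustrative helper, not part of this patch, and the
example granule value is arbitrary):

  // Round a raw VGPR count up to the allocation granule, as a caller of
  // getVGPRAllocGranule() would; e.g. 37 VGPRs with a granule of 4 -> 40.
  unsigned roundUpToGranule(unsigned NumVGPRs, unsigned Granule) {
    return ((NumVGPRs + Granule - 1) / Granule) * Granule;
  }

The updated declarations follow: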
+unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI,
+                             Optional<bool> EnableWavefrontSize32 = None);
 
 /// \returns VGPR encoding granularity for given subtarget \p STI.
-unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI);
+///
+/// For subtargets which support it, \p EnableWavefrontSize32 should match
+/// the ENABLE_WAVEFRONT_SIZE32 kernel descriptor field.
+unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI,
+                                Optional<bool> EnableWavefrontSize32 = None);
 
 /// \returns Total number of VGPRs for given subtarget \p STI.
 unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI);
@@ -171,13 +179,20 @@ unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU);
 
 /// \returns Number of VGPR blocks needed for given subtarget \p STI when
 /// \p NumVGPRs are used.
-unsigned getNumVGPRBlocks(const MCSubtargetInfo *STI, unsigned NumSGPRs);
+///
+/// For subtargets which support it, \p EnableWavefrontSize32 should match the
+/// ENABLE_WAVEFRONT_SIZE32 kernel descriptor field.
+unsigned getNumVGPRBlocks(const MCSubtargetInfo *STI, unsigned NumVGPRs,
+                          Optional<bool> EnableWavefrontSize32 = None);
 
 } // end namespace IsaInfo
 
 LLVM_READONLY
 int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx);
 
+LLVM_READONLY
+int getSOPPWithRelaxation(uint16_t Opcode);
+
 struct MIMGBaseOpcodeInfo {
   MIMGBaseOpcode BaseOpcode;
   bool Store;
@@ -201,26 +216,53 @@ struct MIMGDimInfo {
   uint8_t NumCoords;
   uint8_t NumGradients;
   bool DA;
+  uint8_t Encoding;
+  const char *AsmSuffix;
 };
 
 LLVM_READONLY
-const MIMGDimInfo *getMIMGDimInfo(unsigned Dim);
+const MIMGDimInfo *getMIMGDimInfo(unsigned DimEnum);
+
+LLVM_READONLY
+const MIMGDimInfo *getMIMGDimInfoByEncoding(uint8_t DimEnc);
+
+LLVM_READONLY
+const MIMGDimInfo *getMIMGDimInfoByAsmSuffix(StringRef AsmSuffix);
 
 struct MIMGLZMappingInfo {
   MIMGBaseOpcode L;
   MIMGBaseOpcode LZ;
 };
 
+struct MIMGMIPMappingInfo {
+  MIMGBaseOpcode MIP;
+  MIMGBaseOpcode NONMIP;
+};
+
 LLVM_READONLY
 const MIMGLZMappingInfo *getMIMGLZMappingInfo(unsigned L);
 
 LLVM_READONLY
+const MIMGMIPMappingInfo *getMIMGMIPMappingInfo(unsigned L);
+
+LLVM_READONLY
 int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding,
                   unsigned VDataDwords, unsigned VAddrDwords);
 
 LLVM_READONLY
 int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels);
 
+struct MIMGInfo {
+  uint16_t Opcode;
+  uint16_t BaseOpcode;
+  uint8_t MIMGEncoding;
+  uint8_t VDataDwords;
+  uint8_t VAddrDwords;
+};
+
+LLVM_READONLY
+const MIMGInfo *getMIMGInfo(unsigned Opc);
+
 LLVM_READONLY
 int getMUBUFBaseOpcode(unsigned Opc);
 
@@ -245,7 +287,8 @@ int getMCOpcode(uint16_t Opcode, unsigned Gen);
 void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
                                const MCSubtargetInfo *STI);
 
-amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor();
+amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor(
+    const MCSubtargetInfo *STI);
 
 bool isGroupSegment(const GlobalValue *GV);
 bool isGlobalSegment(const GlobalValue *GV);
@@ -285,21 +328,30 @@ struct Waitcnt {
   unsigned VmCnt = ~0u;
   unsigned ExpCnt = ~0u;
   unsigned LgkmCnt = ~0u;
+  unsigned VsCnt = ~0u;
 
   Waitcnt() {}
-  Waitcnt(unsigned VmCnt, unsigned ExpCnt, unsigned LgkmCnt)
-      : VmCnt(VmCnt), ExpCnt(ExpCnt), LgkmCnt(LgkmCnt) {}
+  Waitcnt(unsigned VmCnt, unsigned ExpCnt, unsigned LgkmCnt, unsigned VsCnt)
+      : VmCnt(VmCnt), ExpCnt(ExpCnt), LgkmCnt(LgkmCnt), VsCnt(VsCnt) {}
+
+  static Waitcnt allZero(const IsaVersion &Version) {
+    return Waitcnt(0, 0, 0, Version.Major >= 10 ?
0 : ~0u); + } + static Waitcnt allZeroExceptVsCnt() { return Waitcnt(0, 0, 0, ~0u); } - static Waitcnt allZero() { return Waitcnt(0, 0, 0); } + bool hasWait() const { + return VmCnt != ~0u || ExpCnt != ~0u || LgkmCnt != ~0u || VsCnt != ~0u; + } bool dominates(const Waitcnt &Other) const { return VmCnt <= Other.VmCnt && ExpCnt <= Other.ExpCnt && - LgkmCnt <= Other.LgkmCnt; + LgkmCnt <= Other.LgkmCnt && VsCnt <= Other.VsCnt; } Waitcnt combined(const Waitcnt &Other) const { return Waitcnt(std::min(VmCnt, Other.VmCnt), std::min(ExpCnt, Other.ExpCnt), - std::min(LgkmCnt, Other.LgkmCnt)); + std::min(LgkmCnt, Other.LgkmCnt), + std::min(VsCnt, Other.VsCnt)); } }; @@ -332,7 +384,8 @@ unsigned decodeLgkmcnt(const IsaVersion &Version, unsigned Waitcnt); /// \p Vmcnt = \p Waitcnt[3:0] (pre-gfx9 only) /// \p Vmcnt = \p Waitcnt[3:0] | \p Waitcnt[15:14] (gfx9+ only) /// \p Expcnt = \p Waitcnt[6:4] -/// \p Lgkmcnt = \p Waitcnt[11:8] +/// \p Lgkmcnt = \p Waitcnt[11:8] (pre-gfx10 only) +/// \p Lgkmcnt = \p Waitcnt[13:8] (gfx10+ only) void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt); @@ -357,7 +410,8 @@ unsigned encodeLgkmcnt(const IsaVersion &Version, unsigned Waitcnt, /// Waitcnt[3:0] = \p Vmcnt (pre-gfx9 only) /// Waitcnt[3:0] = \p Vmcnt[3:0] (gfx9+ only) /// Waitcnt[6:4] = \p Expcnt -/// Waitcnt[11:8] = \p Lgkmcnt +/// Waitcnt[11:8] = \p Lgkmcnt (pre-gfx10 only) +/// Waitcnt[13:8] = \p Lgkmcnt (gfx10+ only) /// Waitcnt[15:14] = \p Vmcnt[5:4] (gfx9+ only) /// /// \returns Waitcnt with encoded \p Vmcnt, \p Expcnt and \p Lgkmcnt for given @@ -367,6 +421,75 @@ unsigned encodeWaitcnt(const IsaVersion &Version, unsigned encodeWaitcnt(const IsaVersion &Version, const Waitcnt &Decoded); +namespace Hwreg { + +LLVM_READONLY +int64_t getHwregId(const StringRef Name); + +LLVM_READNONE +bool isValidHwreg(int64_t Id, const MCSubtargetInfo &STI); + +LLVM_READNONE +bool isValidHwreg(int64_t Id); + +LLVM_READNONE +bool isValidHwregOffset(int64_t Offset); + +LLVM_READNONE +bool isValidHwregWidth(int64_t Width); + +LLVM_READNONE +uint64_t encodeHwreg(uint64_t Id, uint64_t Offset, uint64_t Width); + +LLVM_READNONE +StringRef getHwreg(unsigned Id, const MCSubtargetInfo &STI); + +void decodeHwreg(unsigned Val, unsigned &Id, unsigned &Offset, unsigned &Width); + +} // namespace Hwreg + +namespace SendMsg { + +LLVM_READONLY +int64_t getMsgId(const StringRef Name); + +LLVM_READONLY +int64_t getMsgOpId(int64_t MsgId, const StringRef Name); + +LLVM_READNONE +StringRef getMsgName(int64_t MsgId); + +LLVM_READNONE +StringRef getMsgOpName(int64_t MsgId, int64_t OpId); + +LLVM_READNONE +bool isValidMsgId(int64_t MsgId, const MCSubtargetInfo &STI, bool Strict = true); + +LLVM_READNONE +bool isValidMsgOp(int64_t MsgId, int64_t OpId, bool Strict = true); + +LLVM_READNONE +bool isValidMsgStream(int64_t MsgId, int64_t OpId, int64_t StreamId, bool Strict = true); + +LLVM_READNONE +bool msgRequiresOp(int64_t MsgId); + +LLVM_READNONE +bool msgSupportsStream(int64_t MsgId, int64_t OpId); + +void decodeMsg(unsigned Val, + uint16_t &MsgId, + uint16_t &OpId, + uint16_t &StreamId); + +LLVM_READNONE +uint64_t encodeMsg(uint64_t MsgId, + uint64_t OpId, + uint64_t StreamId); + +} // namespace SendMsg + + unsigned getInitialPSInputAddr(const Function &F); LLVM_READNONE @@ -399,6 +522,7 @@ bool isSI(const MCSubtargetInfo &STI); bool isCI(const MCSubtargetInfo &STI); bool isVI(const MCSubtargetInfo &STI); bool isGFX9(const MCSubtargetInfo &STI); +bool isGFX10(const MCSubtargetInfo 
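&STI);

Given the waitcnt bit layout documented above (Vmcnt in bits [3:0], widened by
[15:14] on gfx9+; Expcnt in [6:4]; Lgkmcnt in [11:8] before gfx10 and [13:8]
from gfx10 on), a standalone sketch of the simplest, pre-gfx9 packing may help;
it mirrors what encodeWaitcnt does for those ISA versions and is illustrative
only, not a drop-in replacement:

  #include <cstdint>

  // Pack an s_waitcnt immediate for pre-gfx9 targets, per the field layout
  // documented in the declarations above:
  //   Vmcnt   -> bits [3:0]
  //   Expcnt  -> bits [6:4]
  //   Lgkmcnt -> bits [11:8]
  // Later ISAs widen or move these fields, which is why the real
  // encodeWaitcnt() is parameterized by IsaVersion.
  uint32_t encodeWaitcntPreGFX9(uint32_t Vmcnt, uint32_t Expcnt,
                                uint32_t Lgkmcnt) {
    return (Vmcnt & 0xf) | ((Expcnt & 0x7) << 4) | ((Lgkmcnt & 0xf) << 8);
  }

The header then continues: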
 
 /// Is Reg - scalar register
 bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI);
@@ -440,6 +564,8 @@ inline unsigned getOperandSize(const MCOperandInfo &OpInfo) {
   case AMDGPU::OPERAND_REG_IMM_FP32:
   case AMDGPU::OPERAND_REG_INLINE_C_INT32:
   case AMDGPU::OPERAND_REG_INLINE_C_FP32:
+  case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
+  case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
     return 4;
 
   case AMDGPU::OPERAND_REG_IMM_INT64:
@@ -454,6 +580,12 @@ inline unsigned getOperandSize(const MCOperandInfo &OpInfo) {
   case AMDGPU::OPERAND_REG_INLINE_C_FP16:
   case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
   case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
+  case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
+  case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
+  case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
+  case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16:
+  case AMDGPU::OPERAND_REG_IMM_V2INT16:
+  case AMDGPU::OPERAND_REG_IMM_V2FP16:
     return 2;
 
   default:
@@ -496,6 +628,45 @@ bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset,
 /// \returns true if the intrinsic is divergent
 bool isIntrinsicSourceOfDivergence(unsigned IntrID);
 
+
+// Track defaults for fields in the MODE register.
+struct SIModeRegisterDefaults {
+  /// Floating point opcodes that support exception flag gathering quiet and
+  /// propagate signaling NaN inputs per IEEE 754-2008. Min_dx10 and max_dx10
+  /// become IEEE 754-2008 compliant due to signaling NaN propagation and
+  /// quieting.
+  bool IEEE : 1;
+
+  /// Used by the vector ALU to force DX10-style treatment of NaNs: when set,
+  /// clamp NaN to zero; otherwise, pass NaN through.
+  bool DX10Clamp : 1;
+
+  // TODO: FP mode fields
+
+  SIModeRegisterDefaults() :
+    IEEE(true),
+    DX10Clamp(true) {}
+
+  SIModeRegisterDefaults(const Function &F);
+
+  static SIModeRegisterDefaults getDefaultForCallingConv(CallingConv::ID CC) {
+    SIModeRegisterDefaults Mode;
+    Mode.DX10Clamp = true;
+    Mode.IEEE = AMDGPU::isCompute(CC);
+    return Mode;
+  }
+
+  bool operator ==(const SIModeRegisterDefaults Other) const {
+    return IEEE == Other.IEEE && DX10Clamp == Other.DX10Clamp;
+  }
+
+  // FIXME: Inlining should be OK for dx10-clamp, since the caller's mode
+  // should be able to override.
+  bool isInlineCompatible(SIModeRegisterDefaults CalleeMode) const {
+    return *this == CalleeMode;
+  }
+};
+
 } // end namespace AMDGPU
 } // end namespace llvm
 
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
new file mode 100644
index 000000000000..db20d5ccf5f9
--- /dev/null
+++ b/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
@@ -0,0 +1,723 @@
+//===-- AMDGPUPALMetadata.cpp - Accumulate and print AMDGPU PAL metadata -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+///
+/// This class has methods called by AMDGPUAsmPrinter to accumulate and print
+/// the PAL metadata.
+// +//===----------------------------------------------------------------------===// +// + +#include "AMDGPUPALMetadata.h" +#include "AMDGPU.h" +#include "AMDGPUAsmPrinter.h" +#include "MCTargetDesc/AMDGPUTargetStreamer.h" +#include "SIDefines.h" +#include "llvm/BinaryFormat/ELF.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/Support/AMDGPUMetadata.h" +#include "llvm/Support/EndianStream.h" + +using namespace llvm; +using namespace llvm::AMDGPU; + +// Read the PAL metadata from IR metadata, where it was put by the frontend. +void AMDGPUPALMetadata::readFromIR(Module &M) { + auto NamedMD = M.getNamedMetadata("amdgpu.pal.metadata.msgpack"); + if (NamedMD && NamedMD->getNumOperands()) { + // This is the new msgpack format for metadata. It is a NamedMD containing + // an MDTuple containing an MDString containing the msgpack data. + BlobType = ELF::NT_AMDGPU_METADATA; + auto MDN = dyn_cast<MDTuple>(NamedMD->getOperand(0)); + if (MDN && MDN->getNumOperands()) { + if (auto MDS = dyn_cast<MDString>(MDN->getOperand(0))) + setFromMsgPackBlob(MDS->getString()); + } + return; + } + BlobType = ELF::NT_AMD_AMDGPU_PAL_METADATA; + NamedMD = M.getNamedMetadata("amdgpu.pal.metadata"); + if (!NamedMD || !NamedMD->getNumOperands()) + return; + // This is the old reg=value pair format for metadata. It is a NamedMD + // containing an MDTuple containing a number of MDNodes each of which is an + // integer value, and each two integer values forms a key=value pair that we + // store as Registers[key]=value in the map. + auto Tuple = dyn_cast<MDTuple>(NamedMD->getOperand(0)); + if (!Tuple) + return; + for (unsigned I = 0, E = Tuple->getNumOperands() & -2; I != E; I += 2) { + auto Key = mdconst::dyn_extract<ConstantInt>(Tuple->getOperand(I)); + auto Val = mdconst::dyn_extract<ConstantInt>(Tuple->getOperand(I + 1)); + if (!Key || !Val) + continue; + setRegister(Key->getZExtValue(), Val->getZExtValue()); + } +} + +// Set PAL metadata from a binary blob from the applicable .note record. +// Returns false if bad format. Blob must remain valid for the lifetime of the +// Metadata. +bool AMDGPUPALMetadata::setFromBlob(unsigned Type, StringRef Blob) { + BlobType = Type; + if (Type == ELF::NT_AMD_AMDGPU_PAL_METADATA) + return setFromLegacyBlob(Blob); + return setFromMsgPackBlob(Blob); +} + +// Set PAL metadata from legacy (array of key=value pairs) blob. +bool AMDGPUPALMetadata::setFromLegacyBlob(StringRef Blob) { + auto Data = reinterpret_cast<const uint32_t *>(Blob.data()); + for (unsigned I = 0; I != Blob.size() / sizeof(uint32_t) / 2; ++I) + setRegister(Data[I * 2], Data[I * 2 + 1]); + return true; +} + +// Set PAL metadata from msgpack blob. +bool AMDGPUPALMetadata::setFromMsgPackBlob(StringRef Blob) { + msgpack::Reader Reader(Blob); + return MsgPackDoc.readFromBlob(Blob, /*Multi=*/false); +} + +// Given the calling convention, calculate the register number for rsrc1. In +// principle the register number could change in future hardware, but we know +// it is the same for gfx6-9 (except that LS and ES don't exist on gfx9), so +// we can use fixed values. 
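Since each stage's RSRC2 register immediately follows its RSRC1 register, the
code below only ever maps a calling convention to the rsrc1 number and derives
the rsrc2 key by adding one; likewise, the non-register keys for a stage sit at
fixed offsets from that stage's *_SCRATCH_SIZE key. A trivial sketch of the
first relationship (it mirrors setRsrc2 below and assumes nothing beyond that
adjacency):

  // The rsrc2 metadata key is the rsrc1 key plus one; this is exactly how
  // setRsrc2() derives its register number from getRsrc1Reg().
  unsigned rsrc2KeyFromRsrc1(unsigned Rsrc1Key) { return Rsrc1Key + 1; }

The calling-convention mapping itself: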
+static unsigned getRsrc1Reg(CallingConv::ID CC) {
+  switch (CC) {
+  default:
+    return PALMD::R_2E12_COMPUTE_PGM_RSRC1;
+  case CallingConv::AMDGPU_LS:
+    return PALMD::R_2D4A_SPI_SHADER_PGM_RSRC1_LS;
+  case CallingConv::AMDGPU_HS:
+    return PALMD::R_2D0A_SPI_SHADER_PGM_RSRC1_HS;
+  case CallingConv::AMDGPU_ES:
+    return PALMD::R_2CCA_SPI_SHADER_PGM_RSRC1_ES;
+  case CallingConv::AMDGPU_GS:
+    return PALMD::R_2C8A_SPI_SHADER_PGM_RSRC1_GS;
+  case CallingConv::AMDGPU_VS:
+    return PALMD::R_2C4A_SPI_SHADER_PGM_RSRC1_VS;
+  case CallingConv::AMDGPU_PS:
+    return PALMD::R_2C0A_SPI_SHADER_PGM_RSRC1_PS;
+  }
+}
+
+// Calculate the PAL metadata key for *S_SCRATCH_SIZE. It can be used
+// with a constant offset to access any non-register shader-specific PAL
+// metadata key.
+static unsigned getScratchSizeKey(CallingConv::ID CC) {
+  switch (CC) {
+  case CallingConv::AMDGPU_PS:
+    return PALMD::Key::PS_SCRATCH_SIZE;
+  case CallingConv::AMDGPU_VS:
+    return PALMD::Key::VS_SCRATCH_SIZE;
+  case CallingConv::AMDGPU_GS:
+    return PALMD::Key::GS_SCRATCH_SIZE;
+  case CallingConv::AMDGPU_ES:
+    return PALMD::Key::ES_SCRATCH_SIZE;
+  case CallingConv::AMDGPU_HS:
+    return PALMD::Key::HS_SCRATCH_SIZE;
+  case CallingConv::AMDGPU_LS:
+    return PALMD::Key::LS_SCRATCH_SIZE;
+  default:
+    return PALMD::Key::CS_SCRATCH_SIZE;
+  }
+}
+
+// Set the rsrc1 register in the metadata for a particular shader stage.
+// In fact this ORs the value into any previous setting of the register.
+void AMDGPUPALMetadata::setRsrc1(CallingConv::ID CC, unsigned Val) {
+  setRegister(getRsrc1Reg(CC), Val);
+}
+
+// Set the rsrc2 register in the metadata for a particular shader stage.
+// In fact this ORs the value into any previous setting of the register.
+void AMDGPUPALMetadata::setRsrc2(CallingConv::ID CC, unsigned Val) {
+  setRegister(getRsrc1Reg(CC) + 1, Val);
+}
+
+// Set the SPI_PS_INPUT_ENA register in the metadata.
+// In fact this ORs the value into any previous setting of the register.
+void AMDGPUPALMetadata::setSpiPsInputEna(unsigned Val) {
+  setRegister(PALMD::R_A1B3_SPI_PS_INPUT_ENA, Val);
+}
+
+// Set the SPI_PS_INPUT_ADDR register in the metadata.
+// In fact this ORs the value into any previous setting of the register.
+void AMDGPUPALMetadata::setSpiPsInputAddr(unsigned Val) {
+  setRegister(PALMD::R_A1B4_SPI_PS_INPUT_ADDR, Val);
+}
+
+// Get a register from the metadata, or 0 if not currently set.
+unsigned AMDGPUPALMetadata::getRegister(unsigned Reg) {
+  auto Regs = getRegisters();
+  auto It = Regs.find(MsgPackDoc.getNode(Reg));
+  if (It == Regs.end())
+    return 0;
+  auto N = It->second;
+  if (N.getKind() != msgpack::Type::UInt)
+    return 0;
+  return N.getUInt();
+}
+
+// Set a register in the metadata.
+// In fact this ORs the value into any previous setting of the register.
+void AMDGPUPALMetadata::setRegister(unsigned Reg, unsigned Val) {
+  if (!isLegacy()) {
+    // In the new MsgPack format, ignore registers numbered >= 0x10000000.
+    // They are PAL ABI pseudo-registers in the old non-MsgPack format.
+    if (Reg >= 0x10000000)
+      return;
+  }
+  auto &N = getRegisters()[MsgPackDoc.getNode(Reg)];
+  if (N.getKind() == msgpack::Type::UInt)
+    Val |= N.getUInt();
+  N = N.getDocument()->getNode(Val);
+}
+
+// Set the entry point name for one shader.
+void AMDGPUPALMetadata::setEntryPoint(unsigned CC, StringRef Name) {
+  if (isLegacy())
+    return;
+  // Msgpack format.
+  getHwStage(CC)[".entry_point"] = MsgPackDoc.getNode(Name, /*Copy=*/true);
+}
+
+// Set the number of used vgprs in the metadata.
This is an optional +// advisory record for logging etc; wave dispatch actually uses the rsrc1 +// register for the shader stage to determine the number of vgprs to +// allocate. +void AMDGPUPALMetadata::setNumUsedVgprs(CallingConv::ID CC, unsigned Val) { + if (isLegacy()) { + // Old non-msgpack format. + unsigned NumUsedVgprsKey = getScratchSizeKey(CC) + + PALMD::Key::VS_NUM_USED_VGPRS - + PALMD::Key::VS_SCRATCH_SIZE; + setRegister(NumUsedVgprsKey, Val); + return; + } + // Msgpack format. + getHwStage(CC)[".vgpr_count"] = MsgPackDoc.getNode(Val); +} + +// Set the number of used sgprs in the metadata. This is an optional advisory +// record for logging etc; wave dispatch actually uses the rsrc1 register for +// the shader stage to determine the number of sgprs to allocate. +void AMDGPUPALMetadata::setNumUsedSgprs(CallingConv::ID CC, unsigned Val) { + if (isLegacy()) { + // Old non-msgpack format. + unsigned NumUsedSgprsKey = getScratchSizeKey(CC) + + PALMD::Key::VS_NUM_USED_SGPRS - + PALMD::Key::VS_SCRATCH_SIZE; + setRegister(NumUsedSgprsKey, Val); + return; + } + // Msgpack format. + getHwStage(CC)[".sgpr_count"] = MsgPackDoc.getNode(Val); +} + +// Set the scratch size in the metadata. +void AMDGPUPALMetadata::setScratchSize(CallingConv::ID CC, unsigned Val) { + if (isLegacy()) { + // Old non-msgpack format. + setRegister(getScratchSizeKey(CC), Val); + return; + } + // Msgpack format. + getHwStage(CC)[".scratch_memory_size"] = MsgPackDoc.getNode(Val); +} + +// Set the hardware register bit in PAL metadata to enable wave32 on the +// shader of the given calling convention. +void AMDGPUPALMetadata::setWave32(unsigned CC) { + switch (CC) { + case CallingConv::AMDGPU_HS: + setRegister(PALMD::R_A2D5_VGT_SHADER_STAGES_EN, S_028B54_HS_W32_EN(1)); + break; + case CallingConv::AMDGPU_GS: + setRegister(PALMD::R_A2D5_VGT_SHADER_STAGES_EN, S_028B54_GS_W32_EN(1)); + break; + case CallingConv::AMDGPU_VS: + setRegister(PALMD::R_A2D5_VGT_SHADER_STAGES_EN, S_028B54_VS_W32_EN(1)); + break; + case CallingConv::AMDGPU_PS: + setRegister(PALMD::R_A1B6_SPI_PS_IN_CONTROL, S_0286D8_PS_W32_EN(1)); + break; + case CallingConv::AMDGPU_CS: + setRegister(PALMD::R_2E00_COMPUTE_DISPATCH_INITIATOR, + S_00B800_CS_W32_EN(1)); + break; + } +} + +// Convert a register number to name, for display by toString(). +// Returns nullptr if none. +static const char *getRegisterName(unsigned RegNum) { + // Table of registers. + static const struct RegInfo { + unsigned Num; + const char *Name; + } RegInfoTable[] = { + // Registers that code generation sets/modifies metadata for. 
+ {PALMD::R_2C4A_SPI_SHADER_PGM_RSRC1_VS, "SPI_SHADER_PGM_RSRC1_VS"}, + {PALMD::R_2C4A_SPI_SHADER_PGM_RSRC1_VS + 1, "SPI_SHADER_PGM_RSRC2_VS"}, + {PALMD::R_2D4A_SPI_SHADER_PGM_RSRC1_LS, "SPI_SHADER_PGM_RSRC1_LS"}, + {PALMD::R_2D4A_SPI_SHADER_PGM_RSRC1_LS + 1, "SPI_SHADER_PGM_RSRC2_LS"}, + {PALMD::R_2D0A_SPI_SHADER_PGM_RSRC1_HS, "SPI_SHADER_PGM_RSRC1_HS"}, + {PALMD::R_2D0A_SPI_SHADER_PGM_RSRC1_HS + 1, "SPI_SHADER_PGM_RSRC2_HS"}, + {PALMD::R_2CCA_SPI_SHADER_PGM_RSRC1_ES, "SPI_SHADER_PGM_RSRC1_ES"}, + {PALMD::R_2CCA_SPI_SHADER_PGM_RSRC1_ES + 1, "SPI_SHADER_PGM_RSRC2_ES"}, + {PALMD::R_2C8A_SPI_SHADER_PGM_RSRC1_GS, "SPI_SHADER_PGM_RSRC1_GS"}, + {PALMD::R_2C8A_SPI_SHADER_PGM_RSRC1_GS + 1, "SPI_SHADER_PGM_RSRC2_GS"}, + {PALMD::R_2E00_COMPUTE_DISPATCH_INITIATOR, "COMPUTE_DISPATCH_INITIATOR"}, + {PALMD::R_2E12_COMPUTE_PGM_RSRC1, "COMPUTE_PGM_RSRC1"}, + {PALMD::R_2E12_COMPUTE_PGM_RSRC1 + 1, "COMPUTE_PGM_RSRC2"}, + {PALMD::R_2C0A_SPI_SHADER_PGM_RSRC1_PS, "SPI_SHADER_PGM_RSRC1_PS"}, + {PALMD::R_2C0A_SPI_SHADER_PGM_RSRC1_PS + 1, "SPI_SHADER_PGM_RSRC2_PS"}, + {PALMD::R_A1B3_SPI_PS_INPUT_ENA, "SPI_PS_INPUT_ENA"}, + {PALMD::R_A1B4_SPI_PS_INPUT_ADDR, "SPI_PS_INPUT_ADDR"}, + {PALMD::R_A1B6_SPI_PS_IN_CONTROL, "SPI_PS_IN_CONTROL"}, + {PALMD::R_A2D5_VGT_SHADER_STAGES_EN, "VGT_SHADER_STAGES_EN"}, + + // Registers not known to code generation. + {0x2c07, "SPI_SHADER_PGM_RSRC3_PS"}, + {0x2c46, "SPI_SHADER_PGM_RSRC3_VS"}, + {0x2c87, "SPI_SHADER_PGM_RSRC3_GS"}, + {0x2cc7, "SPI_SHADER_PGM_RSRC3_ES"}, + {0x2d07, "SPI_SHADER_PGM_RSRC3_HS"}, + {0x2d47, "SPI_SHADER_PGM_RSRC3_LS"}, + + {0xa1c3, "SPI_SHADER_POS_FORMAT"}, + {0xa1b1, "SPI_VS_OUT_CONFIG"}, + {0xa207, "PA_CL_VS_OUT_CNTL"}, + {0xa204, "PA_CL_CLIP_CNTL"}, + {0xa206, "PA_CL_VTE_CNTL"}, + {0xa2f9, "PA_SU_VTX_CNTL"}, + {0xa293, "PA_SC_MODE_CNTL_1"}, + {0xa2a1, "VGT_PRIMITIVEID_EN"}, + {0x2c81, "SPI_SHADER_PGM_RSRC4_GS"}, + {0x2e18, "COMPUTE_TMPRING_SIZE"}, + {0xa1b5, "SPI_INTERP_CONTROL_0"}, + {0xa1ba, "SPI_TMPRING_SIZE"}, + {0xa1c4, "SPI_SHADER_Z_FORMAT"}, + {0xa1c5, "SPI_SHADER_COL_FORMAT"}, + {0xa203, "DB_SHADER_CONTROL"}, + {0xa08f, "CB_SHADER_MASK"}, + {0xa191, "SPI_PS_INPUT_CNTL_0"}, + {0xa192, "SPI_PS_INPUT_CNTL_1"}, + {0xa193, "SPI_PS_INPUT_CNTL_2"}, + {0xa194, "SPI_PS_INPUT_CNTL_3"}, + {0xa195, "SPI_PS_INPUT_CNTL_4"}, + {0xa196, "SPI_PS_INPUT_CNTL_5"}, + {0xa197, "SPI_PS_INPUT_CNTL_6"}, + {0xa198, "SPI_PS_INPUT_CNTL_7"}, + {0xa199, "SPI_PS_INPUT_CNTL_8"}, + {0xa19a, "SPI_PS_INPUT_CNTL_9"}, + {0xa19b, "SPI_PS_INPUT_CNTL_10"}, + {0xa19c, "SPI_PS_INPUT_CNTL_11"}, + {0xa19d, "SPI_PS_INPUT_CNTL_12"}, + {0xa19e, "SPI_PS_INPUT_CNTL_13"}, + {0xa19f, "SPI_PS_INPUT_CNTL_14"}, + {0xa1a0, "SPI_PS_INPUT_CNTL_15"}, + {0xa1a1, "SPI_PS_INPUT_CNTL_16"}, + {0xa1a2, "SPI_PS_INPUT_CNTL_17"}, + {0xa1a3, "SPI_PS_INPUT_CNTL_18"}, + {0xa1a4, "SPI_PS_INPUT_CNTL_19"}, + {0xa1a5, "SPI_PS_INPUT_CNTL_20"}, + {0xa1a6, "SPI_PS_INPUT_CNTL_21"}, + {0xa1a7, "SPI_PS_INPUT_CNTL_22"}, + {0xa1a8, "SPI_PS_INPUT_CNTL_23"}, + {0xa1a9, "SPI_PS_INPUT_CNTL_24"}, + {0xa1aa, "SPI_PS_INPUT_CNTL_25"}, + {0xa1ab, "SPI_PS_INPUT_CNTL_26"}, + {0xa1ac, "SPI_PS_INPUT_CNTL_27"}, + {0xa1ad, "SPI_PS_INPUT_CNTL_28"}, + {0xa1ae, "SPI_PS_INPUT_CNTL_29"}, + {0xa1af, "SPI_PS_INPUT_CNTL_30"}, + {0xa1b0, "SPI_PS_INPUT_CNTL_31"}, + + {0xa2ce, "VGT_GS_MAX_VERT_OUT"}, + {0xa2ab, "VGT_ESGS_RING_ITEMSIZE"}, + {0xa290, "VGT_GS_MODE"}, + {0xa291, "VGT_GS_ONCHIP_CNTL"}, + {0xa2d7, "VGT_GS_VERT_ITEMSIZE"}, + {0xa2d8, "VGT_GS_VERT_ITEMSIZE_1"}, + {0xa2d9, "VGT_GS_VERT_ITEMSIZE_2"}, + {0xa2da, "VGT_GS_VERT_ITEMSIZE_3"}, + 
{0xa298, "VGT_GSVS_RING_OFFSET_1"}, + {0xa299, "VGT_GSVS_RING_OFFSET_2"}, + {0xa29a, "VGT_GSVS_RING_OFFSET_3"}, + + {0xa2e4, "VGT_GS_INSTANCE_CNT"}, + {0xa297, "VGT_GS_PER_VS"}, + {0xa29b, "VGT_GS_OUT_PRIM_TYPE"}, + {0xa2ac, "VGT_GSVS_RING_ITEMSIZE"}, + + {0xa2ad, "VGT_REUSE_OFF"}, + {0xa1b8, "SPI_BARYC_CNTL"}, + + {0x2c4c, "SPI_SHADER_USER_DATA_VS_0"}, + {0x2c4d, "SPI_SHADER_USER_DATA_VS_1"}, + {0x2c4e, "SPI_SHADER_USER_DATA_VS_2"}, + {0x2c4f, "SPI_SHADER_USER_DATA_VS_3"}, + {0x2c50, "SPI_SHADER_USER_DATA_VS_4"}, + {0x2c51, "SPI_SHADER_USER_DATA_VS_5"}, + {0x2c52, "SPI_SHADER_USER_DATA_VS_6"}, + {0x2c53, "SPI_SHADER_USER_DATA_VS_7"}, + {0x2c54, "SPI_SHADER_USER_DATA_VS_8"}, + {0x2c55, "SPI_SHADER_USER_DATA_VS_9"}, + {0x2c56, "SPI_SHADER_USER_DATA_VS_10"}, + {0x2c57, "SPI_SHADER_USER_DATA_VS_11"}, + {0x2c58, "SPI_SHADER_USER_DATA_VS_12"}, + {0x2c59, "SPI_SHADER_USER_DATA_VS_13"}, + {0x2c5a, "SPI_SHADER_USER_DATA_VS_14"}, + {0x2c5b, "SPI_SHADER_USER_DATA_VS_15"}, + {0x2c5c, "SPI_SHADER_USER_DATA_VS_16"}, + {0x2c5d, "SPI_SHADER_USER_DATA_VS_17"}, + {0x2c5e, "SPI_SHADER_USER_DATA_VS_18"}, + {0x2c5f, "SPI_SHADER_USER_DATA_VS_19"}, + {0x2c60, "SPI_SHADER_USER_DATA_VS_20"}, + {0x2c61, "SPI_SHADER_USER_DATA_VS_21"}, + {0x2c62, "SPI_SHADER_USER_DATA_VS_22"}, + {0x2c63, "SPI_SHADER_USER_DATA_VS_23"}, + {0x2c64, "SPI_SHADER_USER_DATA_VS_24"}, + {0x2c65, "SPI_SHADER_USER_DATA_VS_25"}, + {0x2c66, "SPI_SHADER_USER_DATA_VS_26"}, + {0x2c67, "SPI_SHADER_USER_DATA_VS_27"}, + {0x2c68, "SPI_SHADER_USER_DATA_VS_28"}, + {0x2c69, "SPI_SHADER_USER_DATA_VS_29"}, + {0x2c6a, "SPI_SHADER_USER_DATA_VS_30"}, + {0x2c6b, "SPI_SHADER_USER_DATA_VS_31"}, + + {0x2ccc, "SPI_SHADER_USER_DATA_ES_0"}, + {0x2ccd, "SPI_SHADER_USER_DATA_ES_1"}, + {0x2cce, "SPI_SHADER_USER_DATA_ES_2"}, + {0x2ccf, "SPI_SHADER_USER_DATA_ES_3"}, + {0x2cd0, "SPI_SHADER_USER_DATA_ES_4"}, + {0x2cd1, "SPI_SHADER_USER_DATA_ES_5"}, + {0x2cd2, "SPI_SHADER_USER_DATA_ES_6"}, + {0x2cd3, "SPI_SHADER_USER_DATA_ES_7"}, + {0x2cd4, "SPI_SHADER_USER_DATA_ES_8"}, + {0x2cd5, "SPI_SHADER_USER_DATA_ES_9"}, + {0x2cd6, "SPI_SHADER_USER_DATA_ES_10"}, + {0x2cd7, "SPI_SHADER_USER_DATA_ES_11"}, + {0x2cd8, "SPI_SHADER_USER_DATA_ES_12"}, + {0x2cd9, "SPI_SHADER_USER_DATA_ES_13"}, + {0x2cda, "SPI_SHADER_USER_DATA_ES_14"}, + {0x2cdb, "SPI_SHADER_USER_DATA_ES_15"}, + {0x2cdc, "SPI_SHADER_USER_DATA_ES_16"}, + {0x2cdd, "SPI_SHADER_USER_DATA_ES_17"}, + {0x2cde, "SPI_SHADER_USER_DATA_ES_18"}, + {0x2cdf, "SPI_SHADER_USER_DATA_ES_19"}, + {0x2ce0, "SPI_SHADER_USER_DATA_ES_20"}, + {0x2ce1, "SPI_SHADER_USER_DATA_ES_21"}, + {0x2ce2, "SPI_SHADER_USER_DATA_ES_22"}, + {0x2ce3, "SPI_SHADER_USER_DATA_ES_23"}, + {0x2ce4, "SPI_SHADER_USER_DATA_ES_24"}, + {0x2ce5, "SPI_SHADER_USER_DATA_ES_25"}, + {0x2ce6, "SPI_SHADER_USER_DATA_ES_26"}, + {0x2ce7, "SPI_SHADER_USER_DATA_ES_27"}, + {0x2ce8, "SPI_SHADER_USER_DATA_ES_28"}, + {0x2ce9, "SPI_SHADER_USER_DATA_ES_29"}, + {0x2cea, "SPI_SHADER_USER_DATA_ES_30"}, + {0x2ceb, "SPI_SHADER_USER_DATA_ES_31"}, + + {0x2c0c, "SPI_SHADER_USER_DATA_PS_0"}, + {0x2c0d, "SPI_SHADER_USER_DATA_PS_1"}, + {0x2c0e, "SPI_SHADER_USER_DATA_PS_2"}, + {0x2c0f, "SPI_SHADER_USER_DATA_PS_3"}, + {0x2c10, "SPI_SHADER_USER_DATA_PS_4"}, + {0x2c11, "SPI_SHADER_USER_DATA_PS_5"}, + {0x2c12, "SPI_SHADER_USER_DATA_PS_6"}, + {0x2c13, "SPI_SHADER_USER_DATA_PS_7"}, + {0x2c14, "SPI_SHADER_USER_DATA_PS_8"}, + {0x2c15, "SPI_SHADER_USER_DATA_PS_9"}, + {0x2c16, "SPI_SHADER_USER_DATA_PS_10"}, + {0x2c17, "SPI_SHADER_USER_DATA_PS_11"}, + {0x2c18, "SPI_SHADER_USER_DATA_PS_12"}, + {0x2c19, 
"SPI_SHADER_USER_DATA_PS_13"}, + {0x2c1a, "SPI_SHADER_USER_DATA_PS_14"}, + {0x2c1b, "SPI_SHADER_USER_DATA_PS_15"}, + {0x2c1c, "SPI_SHADER_USER_DATA_PS_16"}, + {0x2c1d, "SPI_SHADER_USER_DATA_PS_17"}, + {0x2c1e, "SPI_SHADER_USER_DATA_PS_18"}, + {0x2c1f, "SPI_SHADER_USER_DATA_PS_19"}, + {0x2c20, "SPI_SHADER_USER_DATA_PS_20"}, + {0x2c21, "SPI_SHADER_USER_DATA_PS_21"}, + {0x2c22, "SPI_SHADER_USER_DATA_PS_22"}, + {0x2c23, "SPI_SHADER_USER_DATA_PS_23"}, + {0x2c24, "SPI_SHADER_USER_DATA_PS_24"}, + {0x2c25, "SPI_SHADER_USER_DATA_PS_25"}, + {0x2c26, "SPI_SHADER_USER_DATA_PS_26"}, + {0x2c27, "SPI_SHADER_USER_DATA_PS_27"}, + {0x2c28, "SPI_SHADER_USER_DATA_PS_28"}, + {0x2c29, "SPI_SHADER_USER_DATA_PS_29"}, + {0x2c2a, "SPI_SHADER_USER_DATA_PS_30"}, + {0x2c2b, "SPI_SHADER_USER_DATA_PS_31"}, + + {0x2e40, "COMPUTE_USER_DATA_0"}, + {0x2e41, "COMPUTE_USER_DATA_1"}, + {0x2e42, "COMPUTE_USER_DATA_2"}, + {0x2e43, "COMPUTE_USER_DATA_3"}, + {0x2e44, "COMPUTE_USER_DATA_4"}, + {0x2e45, "COMPUTE_USER_DATA_5"}, + {0x2e46, "COMPUTE_USER_DATA_6"}, + {0x2e47, "COMPUTE_USER_DATA_7"}, + {0x2e48, "COMPUTE_USER_DATA_8"}, + {0x2e49, "COMPUTE_USER_DATA_9"}, + {0x2e4a, "COMPUTE_USER_DATA_10"}, + {0x2e4b, "COMPUTE_USER_DATA_11"}, + {0x2e4c, "COMPUTE_USER_DATA_12"}, + {0x2e4d, "COMPUTE_USER_DATA_13"}, + {0x2e4e, "COMPUTE_USER_DATA_14"}, + {0x2e4f, "COMPUTE_USER_DATA_15"}, + + {0x2e07, "COMPUTE_NUM_THREAD_X"}, + {0x2e08, "COMPUTE_NUM_THREAD_Y"}, + {0x2e09, "COMPUTE_NUM_THREAD_Z"}, + {0xa2db, "VGT_TF_PARAM"}, + {0xa2d6, "VGT_LS_HS_CONFIG"}, + {0xa287, "VGT_HOS_MIN_TESS_LEVEL"}, + {0xa286, "VGT_HOS_MAX_TESS_LEVEL"}, + {0xa2f8, "PA_SC_AA_CONFIG"}, + {0xa310, "PA_SC_SHADER_CONTROL"}, + {0xa313, "PA_SC_CONSERVATIVE_RASTERIZATION_CNTL"}, + + {0x2d0c, "SPI_SHADER_USER_DATA_LS_0"}, + {0x2d0d, "SPI_SHADER_USER_DATA_LS_1"}, + {0x2d0e, "SPI_SHADER_USER_DATA_LS_2"}, + {0x2d0f, "SPI_SHADER_USER_DATA_LS_3"}, + {0x2d10, "SPI_SHADER_USER_DATA_LS_4"}, + {0x2d11, "SPI_SHADER_USER_DATA_LS_5"}, + {0x2d12, "SPI_SHADER_USER_DATA_LS_6"}, + {0x2d13, "SPI_SHADER_USER_DATA_LS_7"}, + {0x2d14, "SPI_SHADER_USER_DATA_LS_8"}, + {0x2d15, "SPI_SHADER_USER_DATA_LS_9"}, + {0x2d16, "SPI_SHADER_USER_DATA_LS_10"}, + {0x2d17, "SPI_SHADER_USER_DATA_LS_11"}, + {0x2d18, "SPI_SHADER_USER_DATA_LS_12"}, + {0x2d19, "SPI_SHADER_USER_DATA_LS_13"}, + {0x2d1a, "SPI_SHADER_USER_DATA_LS_14"}, + {0x2d1b, "SPI_SHADER_USER_DATA_LS_15"}, + {0x2d1c, "SPI_SHADER_USER_DATA_LS_16"}, + {0x2d1d, "SPI_SHADER_USER_DATA_LS_17"}, + {0x2d1e, "SPI_SHADER_USER_DATA_LS_18"}, + {0x2d1f, "SPI_SHADER_USER_DATA_LS_19"}, + {0x2d20, "SPI_SHADER_USER_DATA_LS_20"}, + {0x2d21, "SPI_SHADER_USER_DATA_LS_21"}, + {0x2d22, "SPI_SHADER_USER_DATA_LS_22"}, + {0x2d23, "SPI_SHADER_USER_DATA_LS_23"}, + {0x2d24, "SPI_SHADER_USER_DATA_LS_24"}, + {0x2d25, "SPI_SHADER_USER_DATA_LS_25"}, + {0x2d26, "SPI_SHADER_USER_DATA_LS_26"}, + {0x2d27, "SPI_SHADER_USER_DATA_LS_27"}, + {0x2d28, "SPI_SHADER_USER_DATA_LS_28"}, + {0x2d29, "SPI_SHADER_USER_DATA_LS_29"}, + {0x2d2a, "SPI_SHADER_USER_DATA_LS_30"}, + {0x2d2b, "SPI_SHADER_USER_DATA_LS_31"}, + + {0xa2aa, "IA_MULTI_VGT_PARAM"}, + {0xa2a5, "VGT_GS_MAX_PRIMS_PER_SUBGROUP"}, + {0xa2e6, "VGT_STRMOUT_BUFFER_CONFIG"}, + {0xa2e5, "VGT_STRMOUT_CONFIG"}, + {0xa2b5, "VGT_STRMOUT_VTX_STRIDE_0"}, + {0xa2b9, "VGT_STRMOUT_VTX_STRIDE_1"}, + {0xa2bd, "VGT_STRMOUT_VTX_STRIDE_2"}, + {0xa2c1, "VGT_STRMOUT_VTX_STRIDE_3"}, + {0xa316, "VGT_VERTEX_REUSE_BLOCK_CNTL"}, + + {0, nullptr}}; + auto Entry = RegInfoTable; + for (; Entry->Num && Entry->Num != RegNum; ++Entry) + ; + return Entry->Name; +} + +// 
Convert the accumulated PAL metadata into an asm directive.
+void AMDGPUPALMetadata::toString(std::string &String) {
+  String.clear();
+  if (!BlobType)
+    return;
+  raw_string_ostream Stream(String);
+  if (isLegacy()) {
+    if (MsgPackDoc.getRoot().getKind() == msgpack::Type::Nil)
+      return;
+    // Old linear reg=val format.
+    Stream << '\t' << AMDGPU::PALMD::AssemblerDirective << ' ';
+    auto Regs = getRegisters();
+    for (auto I = Regs.begin(), E = Regs.end(); I != E; ++I) {
+      if (I != Regs.begin())
+        Stream << ',';
+      unsigned Reg = I->first.getUInt();
+      unsigned Val = I->second.getUInt();
+      Stream << "0x" << Twine::utohexstr(Reg) << ",0x" << Twine::utohexstr(Val);
+    }
+    Stream << '\n';
+    return;
+  }
+
+  // New msgpack-based format -- output as YAML (with unsigned numbers in hex),
+  // but first change the registers map to use names.
+  MsgPackDoc.setHexMode();
+  auto &RegsObj = refRegisters();
+  auto OrigRegs = RegsObj.getMap();
+  RegsObj = MsgPackDoc.getMapNode();
+  for (auto I : OrigRegs) {
+    auto Key = I.first;
+    if (const char *RegName = getRegisterName(Key.getUInt())) {
+      std::string KeyName = Key.toString();
+      KeyName += " (";
+      KeyName += RegName;
+      KeyName += ')';
+      Key = MsgPackDoc.getNode(KeyName, /*Copy=*/true);
+    }
+    RegsObj.getMap()[Key] = I.second;
+  }
+
+  // Output as YAML.
+  Stream << '\t' << AMDGPU::PALMD::AssemblerDirectiveBegin << '\n';
+  MsgPackDoc.toYAML(Stream);
+  Stream << '\t' << AMDGPU::PALMD::AssemblerDirectiveEnd << '\n';
+
+  // Restore original registers map.
+  RegsObj = OrigRegs;
+}
+
+// Convert the accumulated PAL metadata into a binary blob for writing as
+// a .note record of the specified AMD type. Returns an empty blob if
+// there is no PAL metadata.
+void AMDGPUPALMetadata::toBlob(unsigned Type, std::string &Blob) {
+  if (Type == ELF::NT_AMD_AMDGPU_PAL_METADATA)
+    toLegacyBlob(Blob);
+  else if (Type)
+    toMsgPackBlob(Blob);
+}
+
+void AMDGPUPALMetadata::toLegacyBlob(std::string &Blob) {
+  Blob.clear();
+  auto Registers = getRegisters();
+  if (Registers.getMap().empty())
+    return;
+  raw_string_ostream OS(Blob);
+  support::endian::Writer EW(OS, support::endianness::little);
+  for (auto I : Registers.getMap()) {
+    EW.write(uint32_t(I.first.getUInt()));
+    EW.write(uint32_t(I.second.getUInt()));
+  }
+}
+
+void AMDGPUPALMetadata::toMsgPackBlob(std::string &Blob) {
+  Blob.clear();
+  MsgPackDoc.writeToBlob(Blob);
+}
+
+// Set PAL metadata from YAML text. Returns false on failure.
+bool AMDGPUPALMetadata::setFromString(StringRef S) {
+  BlobType = ELF::NT_AMDGPU_METADATA;
+  if (!MsgPackDoc.fromYAML(S))
+    return false;
+
+  // In the registers map, some keys may be of the form "0xa191
+  // (SPI_PS_INPUT_CNTL_0)", in which case the YAML input code made it a
+  // string. We need to turn it into a number.
+  auto &RegsObj = refRegisters();
+  auto OrigRegs = RegsObj;
+  RegsObj = MsgPackDoc.getMapNode();
+  Registers = RegsObj.getMap();
+  bool Ok = true;
+  for (auto I : OrigRegs.getMap()) {
+    auto Key = I.first;
+    if (Key.getKind() == msgpack::Type::String) {
+      StringRef S = Key.getString();
+      uint64_t Val;
+      if (S.consumeInteger(0, Val)) {
+        Ok = false;
+        errs() << "Unrecognized PAL metadata register key '" << S << "'\n";
+        continue;
+      }
+      Key = MsgPackDoc.getNode(uint64_t(Val));
+    }
+    Registers.getMap()[Key] = I.second;
+  }
+  return Ok;
+}
+
+// Reference (create if necessary) the node for the registers map.
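The key normalization in setFromString above leans on
StringRef::consumeInteger with radix 0, which honors the 0x prefix and stops at
the first non-digit. The same reduction in freestanding C++, as a sketch (using
strtoull rather than LLVM's StringRef API, and eliding the error reporting the
real code performs):

  #include <cstdint>
  #include <cstdlib>

  // Reduce a YAML map key such as "0xa191 (SPI_PS_INPUT_CNTL_0)" to the
  // register number 0xa191; base 0 makes strtoull accept the 0x prefix, and
  // parsing stops at the trailing " (NAME)" annotation.
  uint64_t keyToRegister(const char *Key) {
    return std::strtoull(Key, nullptr, 0);
  }

The registers map these keys index lives at amdpal.pipelines[0][".registers"]
in the msgpack document, reached as follows: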
+msgpack::DocNode &AMDGPUPALMetadata::refRegisters() { + auto &N = + MsgPackDoc.getRoot() + .getMap(/*Convert=*/true)[MsgPackDoc.getNode("amdpal.pipelines")] + .getArray(/*Convert=*/true)[0] + .getMap(/*Convert=*/true)[MsgPackDoc.getNode(".registers")]; + N.getMap(/*Convert=*/true); + return N; +} + +// Get (create if necessary) the registers map. +msgpack::MapDocNode AMDGPUPALMetadata::getRegisters() { + if (Registers.isEmpty()) + Registers = refRegisters(); + return Registers.getMap(); +} + +// Return the PAL metadata hardware shader stage name. +static const char *getStageName(CallingConv::ID CC) { + switch (CC) { + case CallingConv::AMDGPU_PS: + return ".ps"; + case CallingConv::AMDGPU_VS: + return ".vs"; + case CallingConv::AMDGPU_GS: + return ".gs"; + case CallingConv::AMDGPU_ES: + return ".es"; + case CallingConv::AMDGPU_HS: + return ".hs"; + case CallingConv::AMDGPU_LS: + return ".ls"; + default: + return ".cs"; + } +} + +// Get (create if necessary) the .hardware_stages entry for the given calling +// convention. +msgpack::MapDocNode AMDGPUPALMetadata::getHwStage(unsigned CC) { + if (HwStages.isEmpty()) + HwStages = MsgPackDoc.getRoot() + .getMap(/*Convert=*/true)["amdpal.pipelines"] + .getArray(/*Convert=*/true)[0] + .getMap(/*Convert=*/true)[".hardware_stages"] + .getMap(/*Convert=*/true); + return HwStages.getMap()[getStageName(CC)].getMap(/*Convert=*/true); +} + +// Get .note record vendor name of metadata blob to be emitted. +const char *AMDGPUPALMetadata::getVendor() const { + return isLegacy() ? ElfNote::NoteNameV2 : ElfNote::NoteNameV3; +} + +// Get .note record type of metadata blob to be emitted: +// ELF::NT_AMD_AMDGPU_PAL_METADATA (legacy key=val format), or +// ELF::NT_AMDGPU_METADATA (MsgPack format), or +// 0 (no PAL metadata). +unsigned AMDGPUPALMetadata::getType() const { + return BlobType; +} + +// Return whether the blob type is legacy PAL metadata. +bool AMDGPUPALMetadata::isLegacy() const { + return BlobType == ELF::NT_AMD_AMDGPU_PAL_METADATA; +} + +// Set legacy PAL metadata format. +void AMDGPUPALMetadata::setLegacy() { + BlobType = ELF::NT_AMD_AMDGPU_PAL_METADATA; +} + diff --git a/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h b/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h new file mode 100644 index 000000000000..0f17c157b206 --- /dev/null +++ b/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h @@ -0,0 +1,135 @@ +//===-- AMDGPUPALMetadata.h - PAL metadata handling -------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// PAL metadata handling +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUPALMETADATA_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUPALMETADATA_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/BinaryFormat/MsgPackDocument.h" +#include <map> + +namespace llvm { + +class AMDGPUTargetStreamer; +class formatted_raw_ostream; +class MCStreamer; +class Module; + +class AMDGPUPALMetadata { + unsigned BlobType = 0; + msgpack::Document MsgPackDoc; + msgpack::DocNode Registers; + msgpack::DocNode HwStages; + +public: + // Read the amdgpu.pal.metadata supplied by the frontend, ready for + // per-function modification. 
+ void readFromIR(Module &M); + + // Set PAL metadata from a binary blob from the applicable .note record. + // Returns false if bad format. Blob must remain valid for the lifetime of + // the Metadata. + bool setFromBlob(unsigned Type, StringRef Blob); + + // Set the rsrc1 register in the metadata for a particular shader stage. + // In fact this ORs the value into any previous setting of the register. + void setRsrc1(unsigned CC, unsigned Val); + + // Set the rsrc2 register in the metadata for a particular shader stage. + // In fact this ORs the value into any previous setting of the register. + void setRsrc2(unsigned CC, unsigned Val); + + // Set the SPI_PS_INPUT_ENA register in the metadata. + // In fact this ORs the value into any previous setting of the register. + void setSpiPsInputEna(unsigned Val); + + // Set the SPI_PS_INPUT_ADDR register in the metadata. + // In fact this ORs the value into any previous setting of the register. + void setSpiPsInputAddr(unsigned Val); + + // Get a register from the metadata, or 0 if not currently set. + unsigned getRegister(unsigned Reg); + + // Set a register in the metadata. + // In fact this ORs the value into any previous setting of the register. + void setRegister(unsigned Reg, unsigned Val); + + // Set the entry point name for one shader. + void setEntryPoint(unsigned CC, StringRef Name); + + // Set the number of used vgprs in the metadata. This is an optional advisory + // record for logging etc; wave dispatch actually uses the rsrc1 register for + // the shader stage to determine the number of vgprs to allocate. + void setNumUsedVgprs(unsigned CC, unsigned Val); + + // Set the number of used sgprs in the metadata. This is an optional advisory + // record for logging etc; wave dispatch actually uses the rsrc1 register for + // the shader stage to determine the number of sgprs to allocate. + void setNumUsedSgprs(unsigned CC, unsigned Val); + + // Set the scratch size in the metadata. + void setScratchSize(unsigned CC, unsigned Val); + + // Set the hardware register bit in PAL metadata to enable wave32 on the + // shader of the given calling convention. + void setWave32(unsigned CC); + + // Emit the accumulated PAL metadata as asm directives. + // This is called from AMDGPUTargetAsmStreamer::Finish(). + void toString(std::string &S); + + // Set PAL metadata from YAML text. + bool setFromString(StringRef S); + + // Get .note record vendor name of metadata blob to be emitted. + const char *getVendor() const; + + // Get .note record type of metadata blob to be emitted: + // ELF::NT_AMD_AMDGPU_PAL_METADATA (legacy key=val format), or + // ELF::NT_AMDGPU_METADATA (MsgPack format), or + // 0 (no PAL metadata). + unsigned getType() const; + + // Emit the accumulated PAL metadata as a binary blob. + // This is called from AMDGPUTargetELFStreamer::Finish(). + void toBlob(unsigned Type, std::string &S); + + // Get the msgpack::Document for the PAL metadata. + msgpack::Document *getMsgPackDoc() { return &MsgPackDoc; } + + // Set legacy PAL metadata format. + void setLegacy(); + +private: + // Return whether the blob type is legacy PAL metadata. + bool isLegacy() const; + + // Reference (create if necessary) the node for the registers map. + msgpack::DocNode &refRegisters(); + + // Get (create if necessary) the registers map. + msgpack::MapDocNode getRegisters(); + + // Get (create if necessary) the .hardware_stages entry for the given calling + // convention. 
+ msgpack::MapDocNode getHwStage(unsigned CC); + + bool setFromLegacyBlob(StringRef Blob); + bool setFromMsgPackBlob(StringRef Blob); + void toLegacyBlob(std::string &Blob); + void toMsgPackBlob(std::string &Blob); +}; + +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUPALMETADATA_H diff --git a/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h b/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h index 82ffdef8e674..95ad3f35d18f 100644 --- a/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h +++ b/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h @@ -1,9 +1,8 @@ //===--------------------- AMDKernelCodeTInfo.h ---------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -83,6 +82,9 @@ COMPPGM1(priv, compute_pgm_rsrc1_priv, PRIV COMPPGM1(enable_dx10_clamp, compute_pgm_rsrc1_dx10_clamp, DX10_CLAMP), COMPPGM1(debug_mode, compute_pgm_rsrc1_debug_mode, DEBUG_MODE), COMPPGM1(enable_ieee_mode, compute_pgm_rsrc1_ieee_mode, IEEE_MODE), +COMPPGM1(enable_wgp_mode, compute_pgm_rsrc1_wgp_mode, WGP_MODE), +COMPPGM1(enable_mem_ordered, compute_pgm_rsrc1_mem_ordered, MEM_ORDERED), +COMPPGM1(enable_fwd_progress, compute_pgm_rsrc1_fwd_progress, FWD_PROGRESS), // TODO: bulky // TODO: cdbg_user COMPPGM2(enable_sgpr_private_segment_wave_byte_offset, compute_pgm_rsrc2_scratch_en, SCRATCH_EN), @@ -107,6 +109,7 @@ CODEPROP(enable_sgpr_private_segment_size, ENABLE_SGPR_PRIVATE_SEGMENT_SIZE), CODEPROP(enable_sgpr_grid_workgroup_count_x, ENABLE_SGPR_GRID_WORKGROUP_COUNT_X), CODEPROP(enable_sgpr_grid_workgroup_count_y, ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y), CODEPROP(enable_sgpr_grid_workgroup_count_z, ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z), +CODEPROP(enable_wavefront_size32, ENABLE_WAVEFRONT_SIZE32), CODEPROP(enable_ordered_append_gds, ENABLE_ORDERED_APPEND_GDS), CODEPROP(private_element_size, PRIVATE_ELEMENT_SIZE), CODEPROP(is_ptr64, IS_PTR64), diff --git a/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp b/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp index 20059f4a1ed7..443e2cc45ac0 100644 --- a/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp +++ b/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp @@ -1,9 +1,8 @@ //===- AMDKernelCodeTUtils.cpp --------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // diff --git a/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h b/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h index ef9f9bdb6bcb..a87325a78df3 100644 --- a/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h +++ b/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h @@ -1,9 +1,8 @@ //===- AMDGPUKernelCodeTUtils.h - helpers for amd_kernel_code_t -*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. 
See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
diff --git a/lib/Target/AMDGPU/VIInstrFormats.td b/lib/Target/AMDGPU/VIInstrFormats.td
index 1fd1c1e21527..bd65a495fa72 100644
--- a/lib/Target/AMDGPU/VIInstrFormats.td
+++ b/lib/Target/AMDGPU/VIInstrFormats.td
@@ -1,9 +1,8 @@
 //===-- VIInstrFormats.td - VI Instruction Encodings ----------------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
diff --git a/lib/Target/AMDGPU/VIInstructions.td b/lib/Target/AMDGPU/VIInstructions.td
index b45c8fc9c7d5..ec7d8875a746 100644
--- a/lib/Target/AMDGPU/VIInstructions.td
+++ b/lib/Target/AMDGPU/VIInstructions.td
@@ -1,9 +1,8 @@
 //===-- VIInstructions.td - VI Instruction Definitions --------------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 // Instruction definitions for VI and newer.
 
diff --git a/lib/Target/AMDGPU/VOP1Instructions.td b/lib/Target/AMDGPU/VOP1Instructions.td
index 68446ab79720..6bc416ed7d4b 100644
--- a/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/lib/Target/AMDGPU/VOP1Instructions.td
@@ -1,9 +1,8 @@
 //===-- VOP1Instructions.td - Vector Instruction Definitions --------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -15,7 +14,7 @@ class VOP1e <bits<8> op, VOPProfile P> : Enc32 { bits<8> vdst; bits<9> src0; - let Inst{8-0} = !if(P.HasSrc0, src0{8-0}, 0); + let Inst{8-0} = !if(P.HasSrc0, src0{8-0}, ?); let Inst{16-9} = op; let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0); let Inst{31-25} = 0x3f; //encoding @@ -48,7 +47,6 @@ class VOP1_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], bit VOP1On let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; - let SubtargetPredicate = isGCN; let VOP1 = 1; let VALU = 1; @@ -144,7 +142,7 @@ defm V_MOV_B32 : VOP1Inst <"v_mov_b32", VOP_I32_I32>; // TODO: Make profile for this, there is VOP3 encoding also def V_READFIRSTLANE_B32 : InstSI <(outs SReg_32:$vdst), - (ins VGPR_32:$src0), + (ins VRegOrLds_32:$src0), "v_readfirstlane_b32 $vdst, $src0", [(set i32:$vdst, (int_amdgcn_readfirstlane i32:$src0))]>, Enc32 { @@ -156,7 +154,6 @@ def V_READFIRSTLANE_B32 : let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; - let SubtargetPredicate = isGCN; let VOP1 = 1; let VALU = 1; @@ -172,9 +169,16 @@ def V_READFIRSTLANE_B32 : let Inst{31-25} = 0x3f; //encoding } -let SchedRW = [WriteQuarterRate32] in { -defm V_CVT_I32_F64 : VOP1Inst <"v_cvt_i32_f64", VOP_I32_F64, fp_to_sint>; +let SchedRW = [WriteDoubleCvt] in { +defm V_CVT_I32_F64 : VOP1Inst <"v_cvt_i32_f64", VOP_I32_F64, fp_to_sint>; defm V_CVT_F64_I32 : VOP1Inst <"v_cvt_f64_i32", VOP1_F64_I32, sint_to_fp>; +defm V_CVT_F32_F64 : VOP1Inst <"v_cvt_f32_f64", VOP_F32_F64, fpround>; +defm V_CVT_F64_F32 : VOP1Inst <"v_cvt_f64_f32", VOP_F64_F32, fpextend>; +defm V_CVT_U32_F64 : VOP1Inst <"v_cvt_u32_f64", VOP_I32_F64, fp_to_uint>; +defm V_CVT_F64_U32 : VOP1Inst <"v_cvt_f64_u32", VOP1_F64_I32, uint_to_fp>; +} // End SchedRW = [WriteDoubleCvt] + +let SchedRW = [WriteQuarterRate32] in { defm V_CVT_F32_I32 : VOP1Inst <"v_cvt_f32_i32", VOP1_F32_I32, sint_to_fp>; defm V_CVT_F32_U32 : VOP1Inst <"v_cvt_f32_u32", VOP1_F32_I32, uint_to_fp>; defm V_CVT_U32_F32 : VOP1Inst <"v_cvt_u32_f32", VOP_I32_F32, fp_to_uint>; @@ -186,15 +190,12 @@ defm V_CVT_F32_F16 : VOP1Inst <"v_cvt_f32_f16", VOP_F32_F16, fpextend>; defm V_CVT_RPI_I32_F32 : VOP1Inst <"v_cvt_rpi_i32_f32", VOP_I32_F32, cvt_rpi_i32_f32>; defm V_CVT_FLR_I32_F32 : VOP1Inst <"v_cvt_flr_i32_f32", VOP_I32_F32, cvt_flr_i32_f32>; defm V_CVT_OFF_F32_I4 : VOP1Inst <"v_cvt_off_f32_i4", VOP1_F32_I32>; -defm V_CVT_F32_F64 : VOP1Inst <"v_cvt_f32_f64", VOP_F32_F64, fpround>; -defm V_CVT_F64_F32 : VOP1Inst <"v_cvt_f64_f32", VOP_F64_F32, fpextend>; +} // End SchedRW = [WriteQuarterRate32] + defm V_CVT_F32_UBYTE0 : VOP1Inst <"v_cvt_f32_ubyte0", VOP1_F32_I32, AMDGPUcvt_f32_ubyte0>; defm V_CVT_F32_UBYTE1 : VOP1Inst <"v_cvt_f32_ubyte1", VOP1_F32_I32, AMDGPUcvt_f32_ubyte1>; defm V_CVT_F32_UBYTE2 : VOP1Inst <"v_cvt_f32_ubyte2", VOP1_F32_I32, AMDGPUcvt_f32_ubyte2>; defm V_CVT_F32_UBYTE3 : VOP1Inst <"v_cvt_f32_ubyte3", VOP1_F32_I32, AMDGPUcvt_f32_ubyte3>; -defm V_CVT_U32_F64 : VOP1Inst <"v_cvt_u32_f64", VOP_I32_F64, fp_to_uint>; -defm V_CVT_F64_U32 : VOP1Inst <"v_cvt_f64_u32", VOP1_F64_I32, uint_to_fp>; -} // End SchedRW = [WriteQuarterRate32] defm V_FRACT_F32 : VOP1Inst <"v_fract_f32", VOP_F32_F32, AMDGPUfract>; defm V_TRUNC_F32 : VOP1Inst <"v_trunc_f32", VOP_F32_F32, ftrunc>; @@ -271,6 +272,7 @@ def VOP_MOVRELD : VOPProfile<[untyped, i32, untyped, untyped]> { let InsDPP = (ins DstRC:$vdst, DstRC:$old, Src0RC32:$src0, dpp_ctrl:$dpp_ctrl, 
row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); + let InsDPP16 = !con(InsDPP, (ins FI:$fi)); let InsSDWA = (ins Src0RC32:$vdst, Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0, clampmod:$clamp, omod:$omod, dst_sel:$dst_sel, dst_unused:$dst_unused, @@ -279,6 +281,7 @@ def VOP_MOVRELD : VOPProfile<[untyped, i32, untyped, untyped]> { let Asm32 = getAsm32<1, 1>.ret; let Asm64 = getAsm64<1, 1, 0, 0, 1>.ret; let AsmDPP = getAsmDPP<1, 1, 0>.ret; + let AsmDPP16 = getAsmDPP16<1, 1, 0>.ret; let AsmSDWA = getAsmSDWA<1, 1>.ret; let AsmSDWA9 = getAsmSDWA9<1, 0, 1>.ret; @@ -305,41 +308,43 @@ defm V_MOVRELSD_B32 : VOP1Inst <"v_movrelsd_b32", VOP_NO_EXT<VOP_I32_I32>>; defm V_MOV_FED_B32 : VOP1Inst <"v_mov_fed_b32", VOP_I32_I32>; -// These instruction only exist on SI and CI -let SubtargetPredicate = isSICI in { - -let SchedRW = [WriteQuarterRate32] in { -defm V_LOG_CLAMP_F32 : VOP1Inst <"v_log_clamp_f32", VOP_F32_F32, int_amdgcn_log_clamp>; -defm V_RCP_CLAMP_F32 : VOP1Inst <"v_rcp_clamp_f32", VOP_F32_F32>; -defm V_RCP_LEGACY_F32 : VOP1Inst <"v_rcp_legacy_f32", VOP_F32_F32, AMDGPUrcp_legacy>; -defm V_RSQ_CLAMP_F32 : VOP1Inst <"v_rsq_clamp_f32", VOP_F32_F32, AMDGPUrsq_clamp>; -defm V_RSQ_LEGACY_F32 : VOP1Inst <"v_rsq_legacy_f32", VOP_F32_F32, AMDGPUrsq_legacy>; -} // End SchedRW = [WriteQuarterRate32] - -let SchedRW = [WriteDouble] in { -defm V_RCP_CLAMP_F64 : VOP1Inst <"v_rcp_clamp_f64", VOP_F64_F64>; -defm V_RSQ_CLAMP_F64 : VOP1Inst <"v_rsq_clamp_f64", VOP_F64_F64, AMDGPUrsq_clamp>; -} // End SchedRW = [WriteDouble] - -} // End SubtargetPredicate = isSICI - - -let SubtargetPredicate = isCIVI in { - -let SchedRW = [WriteDoubleAdd] in { -defm V_TRUNC_F64 : VOP1Inst <"v_trunc_f64", VOP_F64_F64, ftrunc>; -defm V_CEIL_F64 : VOP1Inst <"v_ceil_f64", VOP_F64_F64, fceil>; -defm V_FLOOR_F64 : VOP1Inst <"v_floor_f64", VOP_F64_F64, ffloor>; -defm V_RNDNE_F64 : VOP1Inst <"v_rndne_f64", VOP_F64_F64, frint>; -} // End SchedRW = [WriteDoubleAdd] - -let SchedRW = [WriteQuarterRate32] in { -defm V_LOG_LEGACY_F32 : VOP1Inst <"v_log_legacy_f32", VOP_F32_F32>; -defm V_EXP_LEGACY_F32 : VOP1Inst <"v_exp_legacy_f32", VOP_F32_F32>; -} // End SchedRW = [WriteQuarterRate32] - -} // End SubtargetPredicate = isCIVI - +let SubtargetPredicate = isGFX6GFX7 in { + let SchedRW = [WriteQuarterRate32] in { + defm V_LOG_CLAMP_F32 : + VOP1Inst<"v_log_clamp_f32", VOP_F32_F32, int_amdgcn_log_clamp>; + defm V_RCP_CLAMP_F32 : + VOP1Inst<"v_rcp_clamp_f32", VOP_F32_F32>; + defm V_RCP_LEGACY_F32 : + VOP1Inst<"v_rcp_legacy_f32", VOP_F32_F32, AMDGPUrcp_legacy>; + defm V_RSQ_CLAMP_F32 : + VOP1Inst<"v_rsq_clamp_f32", VOP_F32_F32, AMDGPUrsq_clamp>; + defm V_RSQ_LEGACY_F32 : + VOP1Inst<"v_rsq_legacy_f32", VOP_F32_F32, AMDGPUrsq_legacy>; + } // End SchedRW = [WriteQuarterRate32] + + let SchedRW = [WriteDouble] in { + defm V_RCP_CLAMP_F64 : + VOP1Inst<"v_rcp_clamp_f64", VOP_F64_F64>; + defm V_RSQ_CLAMP_F64 : + VOP1Inst<"v_rsq_clamp_f64", VOP_F64_F64, AMDGPUrsq_clamp>; + } // End SchedRW = [WriteDouble] +} // End SubtargetPredicate = isGFX6GFX7 + +let SubtargetPredicate = isGFX7GFX8GFX9 in { + let SchedRW = [WriteQuarterRate32] in { + defm V_LOG_LEGACY_F32 : VOP1Inst<"v_log_legacy_f32", VOP_F32_F32>; + defm V_EXP_LEGACY_F32 : VOP1Inst<"v_exp_legacy_f32", VOP_F32_F32>; + } // End SchedRW = [WriteQuarterRate32] +} // End SubtargetPredicate = isGFX7GFX8GFX9 + +let SubtargetPredicate = isGFX7Plus in { + let SchedRW = [WriteDoubleAdd] in { + defm V_TRUNC_F64 : VOP1Inst<"v_trunc_f64", VOP_F64_F64, ftrunc>; + defm V_CEIL_F64 : VOP1Inst<"v_ceil_f64", 
VOP_F64_F64, fceil>; + defm V_RNDNE_F64 : VOP1Inst<"v_rndne_f64", VOP_F64_F64, frint>; + defm V_FLOOR_F64 : VOP1Inst<"v_floor_f64", VOP_F64_F64, ffloor>; + } // End SchedRW = [WriteDoubleAdd] +} // End SubtargetPredicate = isGFX7Plus let SubtargetPredicate = Has16BitInsts in { @@ -393,125 +398,279 @@ def VOP_SWAP_I32 : VOPProfile<[i32, i32, i32, untyped]> { let Ins64 = (ins); } -let SubtargetPredicate = isGFX9 in { - let Constraints = "$vdst = $src1, $vdst1 = $src0", - DisableEncoding="$vdst1,$src1", - SchedRW = [Write64Bit, Write64Bit] in { -// Never VOP3. Takes as long as 2 v_mov_b32s -def V_SWAP_B32 : VOP1_Pseudo <"v_swap_b32", VOP_SWAP_I32, [], 1>; +let SubtargetPredicate = isGFX9Plus in { + def V_SWAP_B32 : VOP1_Pseudo<"v_swap_b32", VOP_SWAP_I32, [], 1> { + let Constraints = "$vdst = $src1, $vdst1 = $src0"; + let DisableEncoding = "$vdst1,$src1"; + let SchedRW = [Write64Bit, Write64Bit]; + } + + defm V_SAT_PK_U8_I16 : VOP1Inst<"v_sat_pk_u8_i16", VOP_I32_I32>; + defm V_CVT_NORM_I16_F16 : VOP1Inst<"v_cvt_norm_i16_f16", VOP_I16_F16>; + defm V_CVT_NORM_U16_F16 : VOP1Inst<"v_cvt_norm_u16_f16", VOP_I16_F16>; +} // End SubtargetPredicate = isGFX9Plus + +let SubtargetPredicate = isGFX9Only in { + defm V_SCREEN_PARTITION_4SE_B32 : VOP1Inst <"v_screen_partition_4se_b32", VOP_I32_I32>; +} // End SubtargetPredicate = isGFX9Only + +let SubtargetPredicate = isGFX10Plus in { + defm V_PIPEFLUSH : VOP1Inst<"v_pipeflush", VOP_NONE>; + + let Uses = [M0] in { + // FIXME-GFX10: Should V_MOVRELSD_2_B32 be VOP_NO_EXT? + defm V_MOVRELSD_2_B32 : + VOP1Inst<"v_movrelsd_2_b32", VOP_NO_EXT<VOP_I32_I32>>; + + def V_SWAPREL_B32 : VOP1_Pseudo<"v_swaprel_b32", VOP_SWAP_I32, [], 1> { + let Constraints = "$vdst = $src1, $vdst1 = $src0"; + let DisableEncoding = "$vdst1,$src1"; + let SchedRW = [Write64Bit, Write64Bit]; + } + } // End Uses = [M0] +} // End SubtargetPredicate = isGFX10Plus + +//===----------------------------------------------------------------------===// +// Target-specific instruction encodings. 
+//===----------------------------------------------------------------------===// + +class VOP1_DPP<bits<8> op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl, bit isDPP16 = 0> : + VOP_DPP<ps.OpName, p, isDPP16> { + let hasSideEffects = ps.hasSideEffects; + let Defs = ps.Defs; + let SchedRW = ps.SchedRW; + let Uses = ps.Uses; + + bits<8> vdst; + let Inst{8-0} = 0xfa; + let Inst{16-9} = op; + let Inst{24-17} = !if(p.EmitDst, vdst{7-0}, 0); + let Inst{31-25} = 0x3f; } -defm V_SCREEN_PARTITION_4SE_B32 : VOP1Inst <"v_screen_partition_4se_b32", VOP_I32_I32>; +class VOP1_DPP16<bits<8> op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl> : + VOP1_DPP<op, ps, p, 1> { + let AssemblerPredicate = !if(p.HasExt, HasDPP16, DisableInst); + let SubtargetPredicate = HasDPP16; +} -defm V_SAT_PK_U8_I16 : VOP1Inst<"v_sat_pk_u8_i16", VOP_I32_I32>; -defm V_CVT_NORM_I16_F16 : VOP1Inst<"v_cvt_norm_i16_f16", VOP_I16_F16>; -defm V_CVT_NORM_U16_F16 : VOP1Inst<"v_cvt_norm_u16_f16", VOP_I16_F16>; +class VOP1_DPP8<bits<8> op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl> : + VOP_DPP8<ps.OpName, p> { + let hasSideEffects = ps.hasSideEffects; + let Defs = ps.Defs; + let SchedRW = ps.SchedRW; + let Uses = ps.Uses; -} // End SubtargetPredicate = isGFX9 + bits<8> vdst; + let Inst{8-0} = fi; + let Inst{16-9} = op; + let Inst{24-17} = !if(p.EmitDst, vdst{7-0}, 0); + let Inst{31-25} = 0x3f; + + let AssemblerPredicate = !if(p.HasExt, HasDPP8, DisableInst); + let SubtargetPredicate = HasDPP8; +} //===----------------------------------------------------------------------===// -// Target +// GFX10. //===----------------------------------------------------------------------===// +let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { + multiclass VOP1Only_Real_gfx10<bits<9> op> { + def _gfx10 : + VOP1_Real<!cast<VOP1_Pseudo>(NAME), SIEncodingFamily.GFX10>, + VOP1e<op{7-0}, !cast<VOP1_Pseudo>(NAME).Pfl>; + } + multiclass VOP1_Real_e32_gfx10<bits<9> op> { + def _e32_gfx10 : + VOP1_Real<!cast<VOP1_Pseudo>(NAME#"_e32"), SIEncodingFamily.GFX10>, + VOP1e<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32").Pfl>; + } + multiclass VOP1_Real_e64_gfx10<bits<9> op> { + def _e64_gfx10 : + VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX10>, + VOP3e_gfx10<{0, 1, 1, op{6-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>; + } + multiclass VOP1_Real_sdwa_gfx10<bits<9> op> { + def _sdwa_gfx10 : + VOP_SDWA10_Real<!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>, + VOP1_SDWA9Ae<op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl> { + let DecoderNamespace = "SDWA10"; + } + } + multiclass VOP1_Real_dpp_gfx10<bits<9> op> { + def _dpp_gfx10 : VOP1_DPP16<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")> { + let DecoderNamespace = "SDWA10"; + } + } + multiclass VOP1_Real_dpp8_gfx10<bits<9> op> { + def _dpp8_gfx10 : VOP1_DPP8<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")> { + let DecoderNamespace = "DPP8"; + } + } +} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" + +multiclass VOP1_Real_gfx10_no_dpp<bits<9> op> : + VOP1_Real_e32_gfx10<op>, VOP1_Real_e64_gfx10<op>, + VOP1_Real_sdwa_gfx10<op>; + +multiclass VOP1_Real_gfx10_no_dpp8<bits<9> op> : + VOP1_Real_e32_gfx10<op>, VOP1_Real_e64_gfx10<op>, + VOP1_Real_sdwa_gfx10<op>, VOP1_Real_dpp_gfx10<op>; + +multiclass VOP1_Real_gfx10<bits<9> op> : + VOP1_Real_gfx10_no_dpp8<op>, VOP1_Real_dpp8_gfx10<op>; + +defm V_PIPEFLUSH : VOP1_Real_gfx10<0x01b>; +defm V_MOVRELSD_2_B32 : VOP1_Real_gfx10<0x048>; +defm V_CVT_F16_U16 : VOP1_Real_gfx10<0x050>; +defm V_CVT_F16_I16 : VOP1_Real_gfx10<0x051>; +defm V_CVT_U16_F16 : VOP1_Real_gfx10<0x052>; 
+defm V_CVT_I16_F16 : VOP1_Real_gfx10<0x053>; +defm V_RCP_F16 : VOP1_Real_gfx10<0x054>; +defm V_SQRT_F16 : VOP1_Real_gfx10<0x055>; +defm V_RSQ_F16 : VOP1_Real_gfx10<0x056>; +defm V_LOG_F16 : VOP1_Real_gfx10<0x057>; +defm V_EXP_F16 : VOP1_Real_gfx10<0x058>; +defm V_FREXP_MANT_F16 : VOP1_Real_gfx10<0x059>; +defm V_FREXP_EXP_I16_F16 : VOP1_Real_gfx10<0x05a>; +defm V_FLOOR_F16 : VOP1_Real_gfx10<0x05b>; +defm V_CEIL_F16 : VOP1_Real_gfx10<0x05c>; +defm V_TRUNC_F16 : VOP1_Real_gfx10<0x05d>; +defm V_RNDNE_F16 : VOP1_Real_gfx10<0x05e>; +defm V_FRACT_F16 : VOP1_Real_gfx10<0x05f>; +defm V_SIN_F16 : VOP1_Real_gfx10<0x060>; +defm V_COS_F16 : VOP1_Real_gfx10<0x061>; +defm V_SAT_PK_U8_I16 : VOP1_Real_gfx10<0x062>; +defm V_CVT_NORM_I16_F16 : VOP1_Real_gfx10<0x063>; +defm V_CVT_NORM_U16_F16 : VOP1_Real_gfx10<0x064>; + +defm V_SWAP_B32 : VOP1Only_Real_gfx10<0x065>; +defm V_SWAPREL_B32 : VOP1Only_Real_gfx10<0x068>; + //===----------------------------------------------------------------------===// -// SI +// GFX7, GFX10. //===----------------------------------------------------------------------===// -multiclass VOP1_Real_si <bits<9> op> { - let AssemblerPredicates = [isSICI], DecoderNamespace = "SICI" in { - def _e32_si : +let AssemblerPredicate = isGFX7Only, DecoderNamespace = "GFX7" in { + multiclass VOP1_Real_e32_gfx7<bits<9> op> { + def _e32_gfx7 : VOP1_Real<!cast<VOP1_Pseudo>(NAME#"_e32"), SIEncodingFamily.SI>, VOP1e<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32").Pfl>; - def _e64_si : + } + multiclass VOP1_Real_e64_gfx7<bits<9> op> { + def _e64_gfx7 : VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>, - VOP3e_si <{1, 1, op{6-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>; + VOP3e_gfx6_gfx7<{1, 1, op{6-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>; } -} +} // End AssemblerPredicate = isGFX7Only, DecoderNamespace = "GFX7" -defm V_NOP : VOP1_Real_si <0x0>; -defm V_MOV_B32 : VOP1_Real_si <0x1>; -defm V_CVT_I32_F64 : VOP1_Real_si <0x3>; -defm V_CVT_F64_I32 : VOP1_Real_si <0x4>; -defm V_CVT_F32_I32 : VOP1_Real_si <0x5>; -defm V_CVT_F32_U32 : VOP1_Real_si <0x6>; -defm V_CVT_U32_F32 : VOP1_Real_si <0x7>; -defm V_CVT_I32_F32 : VOP1_Real_si <0x8>; -defm V_MOV_FED_B32 : VOP1_Real_si <0x9>; -defm V_CVT_F16_F32 : VOP1_Real_si <0xa>; -defm V_CVT_F32_F16 : VOP1_Real_si <0xb>; -defm V_CVT_RPI_I32_F32 : VOP1_Real_si <0xc>; -defm V_CVT_FLR_I32_F32 : VOP1_Real_si <0xd>; -defm V_CVT_OFF_F32_I4 : VOP1_Real_si <0xe>; -defm V_CVT_F32_F64 : VOP1_Real_si <0xf>; -defm V_CVT_F64_F32 : VOP1_Real_si <0x10>; -defm V_CVT_F32_UBYTE0 : VOP1_Real_si <0x11>; -defm V_CVT_F32_UBYTE1 : VOP1_Real_si <0x12>; -defm V_CVT_F32_UBYTE2 : VOP1_Real_si <0x13>; -defm V_CVT_F32_UBYTE3 : VOP1_Real_si <0x14>; -defm V_CVT_U32_F64 : VOP1_Real_si <0x15>; -defm V_CVT_F64_U32 : VOP1_Real_si <0x16>; -defm V_FRACT_F32 : VOP1_Real_si <0x20>; -defm V_TRUNC_F32 : VOP1_Real_si <0x21>; -defm V_CEIL_F32 : VOP1_Real_si <0x22>; -defm V_RNDNE_F32 : VOP1_Real_si <0x23>; -defm V_FLOOR_F32 : VOP1_Real_si <0x24>; -defm V_EXP_F32 : VOP1_Real_si <0x25>; -defm V_LOG_CLAMP_F32 : VOP1_Real_si <0x26>; -defm V_LOG_F32 : VOP1_Real_si <0x27>; -defm V_RCP_CLAMP_F32 : VOP1_Real_si <0x28>; -defm V_RCP_LEGACY_F32 : VOP1_Real_si <0x29>; -defm V_RCP_F32 : VOP1_Real_si <0x2a>; -defm V_RCP_IFLAG_F32 : VOP1_Real_si <0x2b>; -defm V_RSQ_CLAMP_F32 : VOP1_Real_si <0x2c>; -defm V_RSQ_LEGACY_F32 : VOP1_Real_si <0x2d>; -defm V_RSQ_F32 : VOP1_Real_si <0x2e>; -defm V_RCP_F64 : VOP1_Real_si <0x2f>; -defm V_RCP_CLAMP_F64 : VOP1_Real_si <0x30>; -defm V_RSQ_F64 : VOP1_Real_si <0x31>; -defm 
V_RSQ_CLAMP_F64 : VOP1_Real_si <0x32>; -defm V_SQRT_F32 : VOP1_Real_si <0x33>; -defm V_SQRT_F64 : VOP1_Real_si <0x34>; -defm V_SIN_F32 : VOP1_Real_si <0x35>; -defm V_COS_F32 : VOP1_Real_si <0x36>; -defm V_NOT_B32 : VOP1_Real_si <0x37>; -defm V_BFREV_B32 : VOP1_Real_si <0x38>; -defm V_FFBH_U32 : VOP1_Real_si <0x39>; -defm V_FFBL_B32 : VOP1_Real_si <0x3a>; -defm V_FFBH_I32 : VOP1_Real_si <0x3b>; -defm V_FREXP_EXP_I32_F64 : VOP1_Real_si <0x3c>; -defm V_FREXP_MANT_F64 : VOP1_Real_si <0x3d>; -defm V_FRACT_F64 : VOP1_Real_si <0x3e>; -defm V_FREXP_EXP_I32_F32 : VOP1_Real_si <0x3f>; -defm V_FREXP_MANT_F32 : VOP1_Real_si <0x40>; -defm V_CLREXCP : VOP1_Real_si <0x41>; -defm V_MOVRELD_B32 : VOP1_Real_si <0x42>; -defm V_MOVRELS_B32 : VOP1_Real_si <0x43>; -defm V_MOVRELSD_B32 : VOP1_Real_si <0x44>; +multiclass VOP1_Real_gfx7<bits<9> op> : + VOP1_Real_e32_gfx7<op>, VOP1_Real_e64_gfx7<op>; + +multiclass VOP1_Real_gfx7_gfx10<bits<9> op> : + VOP1_Real_gfx7<op>, VOP1_Real_gfx10<op>; + +defm V_LOG_LEGACY_F32 : VOP1_Real_gfx7<0x045>; +defm V_EXP_LEGACY_F32 : VOP1_Real_gfx7<0x046>; + +defm V_TRUNC_F64 : VOP1_Real_gfx7_gfx10<0x017>; +defm V_CEIL_F64 : VOP1_Real_gfx7_gfx10<0x018>; +defm V_RNDNE_F64 : VOP1_Real_gfx7_gfx10<0x019>; +defm V_FLOOR_F64 : VOP1_Real_gfx7_gfx10<0x01a>; //===----------------------------------------------------------------------===// -// CI +// GFX6, GFX7, GFX10. //===----------------------------------------------------------------------===// -multiclass VOP1_Real_ci <bits<9> op> { - let AssemblerPredicates = [isCIOnly], DecoderNamespace = "CI" in { - def _e32_ci : +let AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" in { + multiclass VOP1_Real_e32_gfx6_gfx7<bits<9> op> { + def _e32_gfx6_gfx7 : VOP1_Real<!cast<VOP1_Pseudo>(NAME#"_e32"), SIEncodingFamily.SI>, VOP1e<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32").Pfl>; - def _e64_ci : + } + multiclass VOP1_Real_e64_gfx6_gfx7<bits<9> op> { + def _e64_gfx6_gfx7 : VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>, - VOP3e_si <{1, 1, op{6-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>; + VOP3e_gfx6_gfx7<{1, 1, op{6-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>; } -} - -defm V_TRUNC_F64 : VOP1_Real_ci <0x17>; -defm V_CEIL_F64 : VOP1_Real_ci <0x18>; -defm V_FLOOR_F64 : VOP1_Real_ci <0x1A>; -defm V_RNDNE_F64 : VOP1_Real_ci <0x19>; -defm V_LOG_LEGACY_F32 : VOP1_Real_ci <0x45>; -defm V_EXP_LEGACY_F32 : VOP1_Real_ci <0x46>; +} // End AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" + +multiclass VOP1_Real_gfx6_gfx7<bits<9> op> : + VOP1_Real_e32_gfx6_gfx7<op>, VOP1_Real_e64_gfx6_gfx7<op>; + +multiclass VOP1_Real_gfx6_gfx7_gfx10<bits<9> op> : + VOP1_Real_gfx6_gfx7<op>, VOP1_Real_gfx10<op>; + +multiclass VOP1_Real_gfx6_gfx7_gfx10_no_dpp8<bits<9> op> : + VOP1_Real_gfx6_gfx7<op>, VOP1_Real_gfx10_no_dpp8<op>; + +multiclass VOP1_Real_gfx6_gfx7_gfx10_no_dpp<bits<9> op> : + VOP1_Real_gfx6_gfx7<op>, VOP1_Real_gfx10_no_dpp<op>; + +defm V_LOG_CLAMP_F32 : VOP1_Real_gfx6_gfx7<0x026>; +defm V_RCP_CLAMP_F32 : VOP1_Real_gfx6_gfx7<0x028>; +defm V_RCP_LEGACY_F32 : VOP1_Real_gfx6_gfx7<0x029>; +defm V_RSQ_CLAMP_F32 : VOP1_Real_gfx6_gfx7<0x02c>; +defm V_RSQ_LEGACY_F32 : VOP1_Real_gfx6_gfx7<0x02d>; +defm V_RCP_CLAMP_F64 : VOP1_Real_gfx6_gfx7<0x030>; +defm V_RSQ_CLAMP_F64 : VOP1_Real_gfx6_gfx7<0x032>; + +defm V_NOP : VOP1_Real_gfx6_gfx7_gfx10<0x000>; +defm V_MOV_B32 : VOP1_Real_gfx6_gfx7_gfx10<0x001>; +defm V_CVT_I32_F64 : VOP1_Real_gfx6_gfx7_gfx10<0x003>; +defm V_CVT_F64_I32 : VOP1_Real_gfx6_gfx7_gfx10<0x004>; +defm V_CVT_F32_I32 : 
VOP1_Real_gfx6_gfx7_gfx10<0x005>; +defm V_CVT_F32_U32 : VOP1_Real_gfx6_gfx7_gfx10<0x006>; +defm V_CVT_U32_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x007>; +defm V_CVT_I32_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x008>; +defm V_MOV_FED_B32 : VOP1_Real_gfx6_gfx7_gfx10<0x009>; +defm V_CVT_F16_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x00a>; +defm V_CVT_F32_F16 : VOP1_Real_gfx6_gfx7_gfx10<0x00b>; +defm V_CVT_RPI_I32_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x00c>; +defm V_CVT_FLR_I32_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x00d>; +defm V_CVT_OFF_F32_I4 : VOP1_Real_gfx6_gfx7_gfx10<0x00e>; +defm V_CVT_F32_F64 : VOP1_Real_gfx6_gfx7_gfx10<0x00f>; +defm V_CVT_F64_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x010>; +defm V_CVT_F32_UBYTE0 : VOP1_Real_gfx6_gfx7_gfx10<0x011>; +defm V_CVT_F32_UBYTE1 : VOP1_Real_gfx6_gfx7_gfx10<0x012>; +defm V_CVT_F32_UBYTE2 : VOP1_Real_gfx6_gfx7_gfx10<0x013>; +defm V_CVT_F32_UBYTE3 : VOP1_Real_gfx6_gfx7_gfx10<0x014>; +defm V_CVT_U32_F64 : VOP1_Real_gfx6_gfx7_gfx10<0x015>; +defm V_CVT_F64_U32 : VOP1_Real_gfx6_gfx7_gfx10<0x016>; +defm V_FRACT_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x020>; +defm V_TRUNC_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x021>; +defm V_CEIL_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x022>; +defm V_RNDNE_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x023>; +defm V_FLOOR_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x024>; +defm V_EXP_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x025>; +defm V_LOG_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x027>; +defm V_RCP_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x02a>; +defm V_RCP_IFLAG_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x02b>; +defm V_RSQ_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x02e>; +defm V_RCP_F64 : VOP1_Real_gfx6_gfx7_gfx10<0x02f>; +defm V_RSQ_F64 : VOP1_Real_gfx6_gfx7_gfx10<0x031>; +defm V_SQRT_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x033>; +defm V_SQRT_F64 : VOP1_Real_gfx6_gfx7_gfx10<0x034>; +defm V_SIN_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x035>; +defm V_COS_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x036>; +defm V_NOT_B32 : VOP1_Real_gfx6_gfx7_gfx10<0x037>; +defm V_BFREV_B32 : VOP1_Real_gfx6_gfx7_gfx10<0x038>; +defm V_FFBH_U32 : VOP1_Real_gfx6_gfx7_gfx10<0x039>; +defm V_FFBL_B32 : VOP1_Real_gfx6_gfx7_gfx10<0x03a>; +defm V_FFBH_I32 : VOP1_Real_gfx6_gfx7_gfx10<0x03b>; +defm V_FREXP_EXP_I32_F64 : VOP1_Real_gfx6_gfx7_gfx10<0x03c>; +defm V_FREXP_MANT_F64 : VOP1_Real_gfx6_gfx7_gfx10<0x03d>; +defm V_FRACT_F64 : VOP1_Real_gfx6_gfx7_gfx10<0x03e>; +defm V_FREXP_EXP_I32_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x03f>; +defm V_FREXP_MANT_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x040>; +defm V_CLREXCP : VOP1_Real_gfx6_gfx7_gfx10<0x041>; +defm V_MOVRELD_B32 : VOP1_Real_gfx6_gfx7_gfx10_no_dpp<0x042>; +defm V_MOVRELS_B32 : VOP1_Real_gfx6_gfx7_gfx10_no_dpp8<0x043>; +defm V_MOVRELSD_B32 : VOP1_Real_gfx6_gfx7_gfx10_no_dpp8<0x044>; //===----------------------------------------------------------------------===// -// VI +// GFX8, GFX9 (VI). 
//===----------------------------------------------------------------------===// class VOP1_DPPe <bits<8> op, VOP1_DPP_Pseudo ps, VOPProfile P = ps.Pfl> : @@ -524,7 +683,7 @@ class VOP1_DPPe <bits<8> op, VOP1_DPP_Pseudo ps, VOPProfile P = ps.Pfl> : } multiclass VOP1Only_Real_vi <bits<10> op> { - let AssemblerPredicates = [isVI], DecoderNamespace = "VI" in { + let AssemblerPredicates = [isGFX8GFX9], DecoderNamespace = "GFX8" in { def _vi : VOP1_Real<!cast<VOP1_Pseudo>(NAME), SIEncodingFamily.VI>, VOP1e<op{7-0}, !cast<VOP1_Pseudo>(NAME).Pfl>; @@ -532,7 +691,7 @@ multiclass VOP1Only_Real_vi <bits<10> op> { } multiclass VOP1_Real_e32e64_vi <bits<10> op> { - let AssemblerPredicates = [isVI], DecoderNamespace = "VI" in { + let AssemblerPredicates = [isGFX8GFX9], DecoderNamespace = "GFX8" in { def _e32_vi : VOP1_Real<!cast<VOP1_Pseudo>(NAME#"_e32"), SIEncodingFamily.VI>, VOP1e<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32").Pfl>; @@ -649,7 +808,7 @@ def V_MOV_B32_indirect : VPseudoInstSI<(outs), PseudoInstExpansion<(V_MOV_B32_e32_vi getVALUDstForVT<i32>.ret:$vdst, getVOPSrc0ForVT<i32>.ret:$src0)> { let VOP1 = 1; - let SubtargetPredicate = isVI; + let SubtargetPredicate = isGFX8GFX9; } // This is a pseudo variant of the v_movreld_b32 instruction in which the @@ -672,7 +831,7 @@ def V_MOVRELD_B32_V4 : V_MOVRELD_B32_pseudo<VReg_128>; def V_MOVRELD_B32_V8 : V_MOVRELD_B32_pseudo<VReg_256>; def V_MOVRELD_B32_V16 : V_MOVRELD_B32_pseudo<VReg_512>; -let OtherPredicates = [isVI] in { +let OtherPredicates = [isGFX8GFX9] in { def : GCNPat < (i32 (int_amdgcn_mov_dpp i32:$src, imm:$dpp_ctrl, imm:$row_mask, imm:$bank_mask, @@ -690,6 +849,9 @@ def : GCNPat < (as_i1imm $bound_ctrl)) >; +} // End OtherPredicates = [isGFX8GFX9] + +let OtherPredicates = [isGFX8Plus] in { def : GCNPat< (i32 (anyext i16:$src)), (COPY $src) @@ -712,14 +874,14 @@ def : GCNPat < (EXTRACT_SUBREG $src, sub0) >; -} // End OtherPredicates = [isVI] +} // End OtherPredicates = [isGFX8Plus] //===----------------------------------------------------------------------===// // GFX9 //===----------------------------------------------------------------------===// multiclass VOP1_Real_gfx9 <bits<10> op> { - let AssemblerPredicates = [isGFX9], DecoderNamespace = "GFX9" in { + let AssemblerPredicates = [isGFX9Only], DecoderNamespace = "GFX9" in { defm NAME : VOP1_Real_e32e64_vi <op>; } @@ -735,3 +897,30 @@ multiclass VOP1_Real_gfx9 <bits<10> op> { } defm V_SCREEN_PARTITION_4SE_B32 : VOP1_Real_gfx9 <0x37>; + +//===----------------------------------------------------------------------===// +// GFX10 +//===----------------------------------------------------------------------===// + +let OtherPredicates = [isGFX10Plus] in { +def : GCNPat < + (i32 (int_amdgcn_mov_dpp8 i32:$src, imm:$dpp8)), + (V_MOV_B32_dpp8_gfx10 $src, $src, (as_i32imm $dpp8), (i32 DPP8Mode.FI_0)) +>; + +def : GCNPat < + (i32 (int_amdgcn_mov_dpp i32:$src, imm:$dpp_ctrl, imm:$row_mask, imm:$bank_mask, + imm:$bound_ctrl)), + (V_MOV_B32_dpp_gfx10 $src, $src, (as_i32imm $dpp_ctrl), + (as_i32imm $row_mask), (as_i32imm $bank_mask), + (as_i1imm $bound_ctrl), (i32 0)) +>; + +def : GCNPat < + (i32 (int_amdgcn_update_dpp i32:$old, i32:$src, imm:$dpp_ctrl, imm:$row_mask, + imm:$bank_mask, imm:$bound_ctrl)), + (V_MOV_B32_dpp_gfx10 $old, $src, (as_i32imm $dpp_ctrl), + (as_i32imm $row_mask), (as_i32imm $bank_mask), + (as_i1imm $bound_ctrl), (i32 0)) +>; +} // End OtherPredicates = [isGFX10Plus] diff --git a/lib/Target/AMDGPU/VOP2Instructions.td b/lib/Target/AMDGPU/VOP2Instructions.td index 
e3fd7b5f9fad..1b30cd2ed516 100644 --- a/lib/Target/AMDGPU/VOP2Instructions.td +++ b/lib/Target/AMDGPU/VOP2Instructions.td @@ -1,9 +1,8 @@ //===-- VOP2Instructions.td - Vector Instruction Defintions ---------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -69,7 +68,6 @@ class VOP2_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], string suf let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; - let SubtargetPredicate = isGCN; let VOP2 = 1; let VALU = 1; @@ -177,7 +175,9 @@ multiclass VOP2bInst <string opName, let SchedRW = [Write32Bit, WriteSALU] in { let Uses = !if(useSGPRInput, [VCC, EXEC], [EXEC]), Defs = [VCC] in { def _e32 : VOP2_Pseudo <opName, P, VOPPatOrNull<node,P>.ret>, - Commutable_REV<revOp#"_e32", !eq(revOp, opName)>; + Commutable_REV<revOp#"_e32", !eq(revOp, opName)> { + let usesCustomInserter = !eq(P.NumSrcArgs, 2); + } def _sdwa : VOP2_SDWA_Pseudo <opName, P> { let AsmMatchConverter = "cvtSdwaVOP2b"; @@ -192,6 +192,23 @@ multiclass VOP2bInst <string opName, } } +class VOP2bInstAlias <VOP2_Pseudo ps, Instruction inst, + string OpName, string opnd> : + InstAlias <OpName#" "#!subst("vcc", opnd, ps.Pfl.Asm32), + (inst ps.Pfl.DstRC:$vdst, ps.Pfl.Src0RC32:$src0, + ps.Pfl.Src1RC32:$src1)>, + PredicateControl { +} + +multiclass VOP2bInstAliases<VOP2_Pseudo ps, VOP2_Real inst, string OpName> { + let WaveSizePredicate = isWave32 in { + def : VOP2bInstAlias<ps, inst, OpName, "vcc_lo">; + } + let WaveSizePredicate = isWave64 in { + def : VOP2bInstAlias<ps, inst, OpName, "vcc">; + } +} + multiclass VOP2eInst <string opName, VOPProfile P, SDPatternOperator node = null_frag, @@ -216,6 +233,22 @@ multiclass VOP2eInst <string opName, } } +class VOP2eInstAlias <VOP2_Pseudo ps, Instruction inst, string opnd> : + InstAlias <ps.OpName#" "#ps.Pfl.Asm32#", "#opnd, + (inst ps.Pfl.DstRC:$vdst, ps.Pfl.Src0RC32:$src0, + ps.Pfl.Src1RC32:$src1)>, + PredicateControl { +} + +multiclass VOP2eInstAliases<VOP2_Pseudo ps, VOP2_Real inst> { + let WaveSizePredicate = isWave32 in { + def : VOP2eInstAlias<ps, inst, "vcc_lo">; + } + let WaveSizePredicate = isWave64 in { + def : VOP2eInstAlias<ps, inst, "vcc">; + } +} + class VOP_MADAK <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> { field Operand ImmOpType = !if(!eq(vt.Size, 32), f32kimm, f16kimm); field dag Ins32 = (ins VCSrc_f32:$src0, VGPR_32:$src1, ImmOpType:$imm); @@ -244,15 +277,22 @@ def VOP_MADMK_F32 : VOP_MADMK <f32>; // FIXME: Remove src2_modifiers. It isn't used, so is wasting memory // and processing time but it makes it easier to convert to mad. 
-class VOP_MAC <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> { +class VOP_MAC <ValueType vt0, ValueType vt1=vt0> : VOPProfile <[vt0, vt1, vt1, vt0]> { let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VGPR_32:$src2); let Ins64 = getIns64<Src0RC64, Src1RC64, RegisterOperand<VGPR_32>, 3, - 0, HasModifiers, HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret; + 0, HasModifiers, HasModifiers, HasOMod, + Src0Mod, Src1Mod, Src2Mod>.ret; let InsDPP = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0, Src1ModDPP:$src1_modifiers, Src1DPP:$src1, VGPR_32:$src2, // stub argument dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); + let InsDPP16 = !con(InsDPP, (ins FI:$fi)); + + let InsDPP8 = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0, + Src1ModDPP:$src1_modifiers, Src1DPP:$src1, + VGPR_32:$src2, // stub argument + dpp8:$dpp8, FI:$fi); let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0, Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1, @@ -260,11 +300,13 @@ class VOP_MAC <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> { clampmod:$clamp, omod:$omod, dst_sel:$dst_sel, dst_unused:$dst_unused, src0_sel:$src0_sel, src1_sel:$src1_sel); - let Asm32 = getAsm32<1, 2, vt>.ret; - let Asm64 = getAsm64<1, 2, 0, HasModifiers, HasOMod, vt>.ret; - let AsmDPP = getAsmDPP<1, 2, HasModifiers, vt>.ret; - let AsmSDWA = getAsmSDWA<1, 2, vt>.ret; - let AsmSDWA9 = getAsmSDWA9<1, 1, 2, vt>.ret; + let Asm32 = getAsm32<1, 2, vt0>.ret; + let Asm64 = getAsm64<1, 2, 0, HasModifiers, HasOMod, vt0>.ret; + let AsmDPP = getAsmDPP<1, 2, HasModifiers, vt0>.ret; + let AsmDPP16 = getAsmDPP16<1, 2, HasModifiers, vt0>.ret; + let AsmDPP8 = getAsmDPP8<1, 2, 0, vt0>.ret; + let AsmSDWA = getAsmSDWA<1, 2, vt0>.ret; + let AsmSDWA9 = getAsmSDWA9<1, 1, 2, vt0>.ret; let HasSrc2 = 0; let HasSrc2Mods = 0; @@ -272,38 +314,51 @@ class VOP_MAC <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> { let HasExtDPP = 1; let HasExtSDWA = 1; let HasExtSDWA9 = 0; + let TieRegDPP = "$src2"; } def VOP_MAC_F16 : VOP_MAC <f16>; def VOP_MAC_F32 : VOP_MAC <f32>; +class VOP_DOT_ACC<ValueType vt0, ValueType vt1> : VOP_MAC<vt0, vt1> { + let HasClamp = 0; + let HasExtSDWA = 0; + let HasModifiers = 1; + let HasOpSel = 0; + let IsPacked = 0; +} + +def VOP_DOT_ACC_F32_V2F16 : VOP_DOT_ACC<f32, v2f16> { + let Src0ModDPP = FPVRegInputMods; + let Src1ModDPP = FPVRegInputMods; +} +def VOP_DOT_ACC_I32_I32 : VOP_DOT_ACC<i32, i32>; + // Write out to vcc or arbitrary SGPR. -def VOP2b_I32_I1_I32_I32 : VOPProfile<[i32, i32, i32, untyped]> { +def VOP2b_I32_I1_I32_I32 : VOPProfile<[i32, i32, i32, untyped], 0, /*EnableClamp=*/1> { let Asm32 = "$vdst, vcc, $src0, $src1"; - let Asm64 = "$vdst, $sdst, $src0, $src1"; + let Asm64 = "$vdst, $sdst, $src0, $src1$clamp"; let AsmSDWA = "$vdst, vcc, $src0_modifiers, $src1_modifiers$clamp $dst_sel $dst_unused $src0_sel $src1_sel"; let AsmSDWA9 = "$vdst, vcc, $src0_modifiers, $src1_modifiers$clamp $dst_sel $dst_unused $src0_sel $src1_sel"; let AsmDPP = "$vdst, vcc, $src0, $src1 $dpp_ctrl$row_mask$bank_mask$bound_ctrl"; + let AsmDPP8 = "$vdst, vcc, $src0, $src1 $dpp8$fi"; + let AsmDPP16 = AsmDPP#"$fi"; let Outs32 = (outs DstRC:$vdst); - let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst); + let Outs64 = (outs DstRC:$vdst, VOPDstS64orS32:$sdst); } // Write out to vcc or arbitrary SGPR and read in from vcc or // arbitrary SGPR. 
-def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> { - // We use VCSrc_b32 to exclude literal constants, even though the - // encoding normally allows them since the implicit VCC use means - // using one would always violate the constant bus - // restriction. SGPRs are still allowed because it should - // technically be possible to use VCC again as src0. - let Src0RC32 = VCSrc_b32; +def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1], 0, /*EnableClamp=*/1> { let Asm32 = "$vdst, vcc, $src0, $src1, vcc"; - let Asm64 = "$vdst, $sdst, $src0, $src1, $src2"; + let Asm64 = "$vdst, $sdst, $src0, $src1, $src2$clamp"; let AsmSDWA = "$vdst, vcc, $src0_modifiers, $src1_modifiers, vcc $clamp $dst_sel $dst_unused $src0_sel $src1_sel"; let AsmSDWA9 = "$vdst, vcc, $src0_modifiers, $src1_modifiers, vcc $clamp $dst_sel $dst_unused $src0_sel $src1_sel"; let AsmDPP = "$vdst, vcc, $src0, $src1, vcc $dpp_ctrl$row_mask$bank_mask$bound_ctrl"; + let AsmDPP8 = "$vdst, vcc, $src0, $src1, vcc $dpp8$fi"; + let AsmDPP16 = AsmDPP#"$fi"; let Outs32 = (outs DstRC:$vdst); - let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst); + let Outs64 = (outs DstRC:$vdst, VOPDstS64orS32:$sdst); // Suppress src2 implied by type since the 32-bit encoding uses an // implicit VCC use. @@ -320,20 +375,23 @@ def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> { Src1DPP:$src1, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); + let InsDPP16 = !con(InsDPP, (ins FI:$fi)); + let HasExt = 1; let HasExtDPP = 1; let HasExtSDWA = 1; let HasExtSDWA9 = 1; } -// Read in from vcc or arbitrary SGPR -def VOP2e_I32_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> { - let Src0RC32 = VCSrc_b32; // See comment in def VOP2b_I32_I1_I32_I32_I1 above. - let Asm32 = "$vdst, $src0, $src1, vcc"; - let Asm64 = "$vdst, $src0, $src1, $src2"; +// Read in from vcc or arbitrary SGPR. 
+def VOP2e_I32_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1], /*EnableF32SrcMods=*/1> { + let Asm32 = "$vdst, $src0, $src1"; + let Asm64 = "$vdst, $src0_modifiers, $src1_modifiers, $src2"; let AsmSDWA = "$vdst, $src0_modifiers, $src1_modifiers, vcc $clamp $dst_sel $dst_unused $src0_sel $src1_sel"; let AsmSDWA9 = "$vdst, $src0_modifiers, $src1_modifiers, vcc $clamp $dst_sel $dst_unused $src0_sel $src1_sel"; let AsmDPP = "$vdst, $src0, $src1, vcc $dpp_ctrl$row_mask$bank_mask$bound_ctrl"; + let AsmDPP8 = "$vdst, $src0, $src1, vcc $dpp8$fi"; + let AsmDPP16 = AsmDPP#"$fi"; let Outs32 = (outs DstRC:$vdst); let Outs64 = (outs DstRC:$vdst); @@ -349,10 +407,12 @@ def VOP2e_I32_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> { src0_sel:$src0_sel, src1_sel:$src1_sel); let InsDPP = (ins DstRCDPP:$old, - Src0DPP:$src0, - Src1DPP:$src1, + Src0ModDPP:$src0_modifiers, Src0DPP:$src0, + Src1ModDPP:$src1_modifiers, Src1DPP:$src1, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); + let InsDPP16 = !con(InsDPP, (ins FI:$fi)); + let HasExt = 1; let HasExtDPP = 1; let HasExtSDWA = 1; @@ -362,7 +422,7 @@ def VOP2e_I32_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> { def VOP_READLANE : VOPProfile<[i32, i32, i32]> { let Outs32 = (outs SReg_32:$vdst); let Outs64 = Outs32; - let Ins32 = (ins VGPR_32:$src0, SCSrc_b32:$src1); + let Ins32 = (ins VRegOrLds_32:$src0, SCSrc_b32:$src1); let Ins64 = Ins32; let Asm32 = " $vdst, $src0, $src1"; let Asm64 = Asm32; @@ -393,8 +453,6 @@ def VOP_WRITELANE : VOPProfile<[i32, i32, i32, i32]> { // VOP2 Instructions //===----------------------------------------------------------------------===// -let SubtargetPredicate = isGCN, Predicates = [isGCN] in { - defm V_CNDMASK_B32 : VOP2eInst <"v_cndmask_b32", VOP2e_I32_I32_I32_I1>; def V_MADMK_F32 : VOP2_Pseudo <"v_madmk_f32", VOP_MADMK_F32, []>; @@ -414,9 +472,9 @@ defm V_MIN_I32 : VOP2Inst <"v_min_i32", VOP_PAT_GEN<VOP_I32_I32_I32>, smin>; defm V_MAX_I32 : VOP2Inst <"v_max_i32", VOP_PAT_GEN<VOP_I32_I32_I32>, smax>; defm V_MIN_U32 : VOP2Inst <"v_min_u32", VOP_PAT_GEN<VOP_I32_I32_I32>, umin>; defm V_MAX_U32 : VOP2Inst <"v_max_u32", VOP_PAT_GEN<VOP_I32_I32_I32>, umax>; -defm V_LSHRREV_B32 : VOP2Inst <"v_lshrrev_b32", VOP_I32_I32_I32, null_frag, "v_lshr_b32">; -defm V_ASHRREV_I32 : VOP2Inst <"v_ashrrev_i32", VOP_I32_I32_I32, null_frag, "v_ashr_i32">; -defm V_LSHLREV_B32 : VOP2Inst <"v_lshlrev_b32", VOP_I32_I32_I32, null_frag, "v_lshl_b32">; +defm V_LSHRREV_B32 : VOP2Inst <"v_lshrrev_b32", VOP_I32_I32_I32, lshr_rev, "v_lshr_b32">; +defm V_ASHRREV_I32 : VOP2Inst <"v_ashrrev_i32", VOP_I32_I32_I32, ashr_rev, "v_ashr_i32">; +defm V_LSHLREV_B32 : VOP2Inst <"v_lshlrev_b32", VOP_I32_I32_I32, lshl_rev, "v_lshl_b32">; defm V_AND_B32 : VOP2Inst <"v_and_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, and>; defm V_OR_B32 : VOP2Inst <"v_or_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, or>; defm V_XOR_B32 : VOP2Inst <"v_xor_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, xor>; @@ -442,9 +500,9 @@ defm V_SUBBREV_U32 : VOP2bInst <"v_subbrev_u32", VOP2b_I32_I1_I32_I32_I1, null_f let SubtargetPredicate = HasAddNoCarryInsts in { -defm V_ADD_U32 : VOP2Inst <"v_add_u32", VOP_I32_I32_I32, null_frag, "v_add_u32", 1>; -defm V_SUB_U32 : VOP2Inst <"v_sub_u32", VOP_I32_I32_I32, null_frag, "v_sub_u32", 1>; -defm V_SUBREV_U32 : VOP2Inst <"v_subrev_u32", VOP_I32_I32_I32, null_frag, "v_sub_u32", 1>; +defm V_ADD_U32 : VOP2Inst <"v_add_u32", VOP_I32_I32_I32_ARITH, null_frag, "v_add_u32", 1>; +defm V_SUB_U32 : VOP2Inst <"v_sub_u32", VOP_I32_I32_I32_ARITH, null_frag, "v_sub_u32", 1>; 
+defm V_SUBREV_U32 : VOP2Inst <"v_subrev_u32", VOP_I32_I32_I32_ARITH, null_frag, "v_sub_u32", 1>; } } // End isCommutable = 1 @@ -472,32 +530,20 @@ defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <"v_cvt_pkrtz_f16_f32", VOP_NO_EXT<VOP_V2F16 defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_NO_EXT<VOP_V2I16_I32_I32>, AMDGPUpk_u16_u32>; defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_NO_EXT<VOP_V2I16_I32_I32>, AMDGPUpk_i16_i32>; -} // End SubtargetPredicate = isGCN, Predicates = [isGCN] - -def : GCNPat< - (AMDGPUadde i32:$src0, i32:$src1, i1:$src2), - (V_ADDC_U32_e64 $src0, $src1, $src2) ->; - -def : GCNPat< - (AMDGPUsube i32:$src0, i32:$src1, i1:$src2), - (V_SUBB_U32_e64 $src0, $src1, $src2) ->; - -// These instructions only exist on SI and CI -let SubtargetPredicate = isSICI, Predicates = [isSICI] in { +let SubtargetPredicate = isGFX6GFX7 in { defm V_MIN_LEGACY_F32 : VOP2Inst <"v_min_legacy_f32", VOP_F32_F32_F32, AMDGPUfmin_legacy>; defm V_MAX_LEGACY_F32 : VOP2Inst <"v_max_legacy_f32", VOP_F32_F32_F32, AMDGPUfmax_legacy>; +} // End SubtargetPredicate = isGFX6GFX7 +let SubtargetPredicate = isGFX6GFX7GFX10 in { let isCommutable = 1 in { defm V_MAC_LEGACY_F32 : VOP2Inst <"v_mac_legacy_f32", VOP_F32_F32_F32>; -defm V_LSHR_B32 : VOP2Inst <"v_lshr_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, srl>; -defm V_ASHR_I32 : VOP2Inst <"v_ashr_i32", VOP_PAT_GEN<VOP_I32_I32_I32>, sra>; -defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, shl>; +defm V_LSHR_B32 : VOP2Inst <"v_lshr_b32", VOP_I32_I32_I32>; +defm V_ASHR_I32 : VOP2Inst <"v_ashr_i32", VOP_I32_I32_I32>; +defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_I32_I32_I32>; } // End isCommutable = 1 - -} // End let SubtargetPredicate = SICI, Predicates = [isSICI] +} // End SubtargetPredicate = isGFX6GFX7GFX10 class DivergentBinOp<SDPatternOperator Op, VOP_Pseudo Inst> : GCNPat< @@ -508,29 +554,29 @@ class DivergentBinOp<SDPatternOperator Op, VOP_Pseudo Inst> : ) >; -let AddedComplexity = 1 in { - def : DivergentBinOp<srl, V_LSHRREV_B32_e64>; - def : DivergentBinOp<sra, V_ASHRREV_I32_e64>; - def : DivergentBinOp<shl, V_LSHLREV_B32_e64>; -} +class DivergentClampingBinOp<SDPatternOperator Op, VOP_Pseudo Inst> : + GCNPat< + (getDivergentFrag<Op>.ret Inst.Pfl.Src0VT:$src0, Inst.Pfl.Src1VT:$src1), + !if(!cast<Commutable_REV>(Inst).IsOrig, + (Inst $src0, $src1, 0), + (Inst $src1, $src0, 0) + ) + >; + +def : DivergentBinOp<srl, V_LSHRREV_B32_e64>; +def : DivergentBinOp<sra, V_ASHRREV_I32_e64>; +def : DivergentBinOp<shl, V_LSHLREV_B32_e64>; let SubtargetPredicate = HasAddNoCarryInsts in { - def : DivergentBinOp<add, V_ADD_U32_e32>; - def : DivergentBinOp<sub, V_SUB_U32_e32>; - def : DivergentBinOp<sub, V_SUBREV_U32_e32>; + def : DivergentClampingBinOp<add, V_ADD_U32_e64>; + def : DivergentClampingBinOp<sub, V_SUB_U32_e64>; } +let SubtargetPredicate = isGFX6GFX7GFX8GFX9, Predicates = [isGFX6GFX7GFX8GFX9] in { +def : DivergentClampingBinOp<add, V_ADD_I32_e64>; +def : DivergentClampingBinOp<sub, V_SUB_I32_e64>; +} -def : DivergentBinOp<add, V_ADD_I32_e32>; - -def : DivergentBinOp<add, V_ADD_I32_e64>; -def : DivergentBinOp<sub, V_SUB_I32_e32>; - -def : DivergentBinOp<sub, V_SUBREV_I32_e32>; - -def : DivergentBinOp<srl, V_LSHRREV_B32_e32>; -def : DivergentBinOp<sra, V_ASHRREV_I32_e32>; -def : DivergentBinOp<shl, V_LSHLREV_B32_e32>; def : DivergentBinOp<adde, V_ADDC_U32_e32>; def : DivergentBinOp<sube, V_SUBB_U32_e32>; @@ -604,56 +650,133 @@ defm V_FMAC_F32 : VOP2Inst <"v_fmac_f32", VOP_MAC_F32>; } // End SubtargetPredicate = HasDLInsts -// Note: 16-bit 
instructions produce a 0 result in the high 16-bits. -multiclass Arithmetic_i16_Pats <SDPatternOperator op, Instruction inst> { +let Constraints = "$vdst = $src2", + DisableEncoding="$src2", + isConvertibleToThreeAddress = 1, + isCommutable = 1 in { + let SubtargetPredicate = HasDot5Insts in + defm V_DOT2C_F32_F16 : VOP2Inst_e32<"v_dot2c_f32_f16", VOP_DOT_ACC_F32_V2F16>; + let SubtargetPredicate = HasDot6Insts in + defm V_DOT4C_I32_I8 : VOP2Inst_e32<"v_dot4c_i32_i8", VOP_DOT_ACC_I32_I32>; + + let SubtargetPredicate = HasDot4Insts in + defm V_DOT2C_I32_I16 : VOP2Inst_e32<"v_dot2c_i32_i16", VOP_DOT_ACC_I32_I32>; + let SubtargetPredicate = HasDot3Insts in + defm V_DOT8C_I32_I4 : VOP2Inst_e32<"v_dot8c_i32_i4", VOP_DOT_ACC_I32_I32>; +} + +let AddedComplexity = 30 in { + def : GCNPat< + (f32 (AMDGPUfdot2 v2f16:$src0, v2f16:$src1, f32:$src2, (i1 DSTCLAMP.NONE))), + (f32 (V_DOT2C_F32_F16_e32 $src0, $src1, $src2)) + > { + let SubtargetPredicate = HasDot5Insts; + } + def : GCNPat< + (i32 (int_amdgcn_sdot4 i32:$src0, i32:$src1, i32:$src2, (i1 DSTCLAMP.NONE))), + (i32 (V_DOT4C_I32_I8_e32 $src0, $src1, $src2)) + > { + let SubtargetPredicate = HasDot6Insts; + } + def : GCNPat< + (i32 (int_amdgcn_sdot2 v2i16:$src0, v2i16:$src1, i32:$src2, (i1 DSTCLAMP.NONE))), + (i32 (V_DOT2C_I32_I16_e32 $src0, $src1, $src2)) + > { + let SubtargetPredicate = HasDot4Insts; + } + def : GCNPat< + (i32 (int_amdgcn_sdot8 i32:$src0, i32:$src1, i32:$src2, (i1 DSTCLAMP.NONE))), + (i32 (V_DOT8C_I32_I4_e32 $src0, $src1, $src2)) + > { + let SubtargetPredicate = HasDot3Insts; + } +} // End AddedComplexity = 30 + +let SubtargetPredicate = isGFX10Plus in { + +def V_FMAMK_F32 : VOP2_Pseudo<"v_fmamk_f32", VOP_MADMK_F32, [], "">; +let FPDPRounding = 1 in +def V_FMAMK_F16 : VOP2_Pseudo <"v_fmamk_f16", VOP_MADMK_F16, [], "">; + +let isCommutable = 1 in { +def V_FMAAK_F32 : VOP2_Pseudo<"v_fmaak_f32", VOP_MADAK_F32, [], "">; +let FPDPRounding = 1 in +def V_FMAAK_F16 : VOP2_Pseudo <"v_fmaak_f16", VOP_MADAK_F16, [], "">; +} // End isCommutable = 1 + +let Constraints = "$vdst = $src2", + DisableEncoding="$src2", + isConvertibleToThreeAddress = 1, + isCommutable = 1 in { +defm V_FMAC_F16 : VOP2Inst <"v_fmac_f16", VOP_MAC_F16>; +} + +} // End SubtargetPredicate = isGFX10Plus + +let SubtargetPredicate = HasPkFmacF16Inst in { +defm V_PK_FMAC_F16 : VOP2Inst<"v_pk_fmac_f16", VOP_V2F16_V2F16_V2F16>; +} // End SubtargetPredicate = HasPkFmacF16Inst + +// Note: 16-bit instructions produce a 0 result in the high 16-bits +// on GFX8 and GFX9 and preserve high 16 bits on GFX10+ +def ClearHI16 : OutPatFrag<(ops node:$op), + (V_AND_B32_e64 $op, (V_MOV_B32_e32 (i32 0xffff)))>; + +multiclass Arithmetic_i16_Pats <SDPatternOperator op, Instruction inst, + bit PreservesHI16 = 0> { def : GCNPat< (op i16:$src0, i16:$src1), - (inst $src0, $src1) + !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src0, $src1)), (inst $src0, $src1)) >; def : GCNPat< (i32 (zext (op i16:$src0, i16:$src1))), - (inst $src0, $src1) + !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src0, $src1)), (inst $src0, $src1)) >; def : GCNPat< (i64 (zext (op i16:$src0, i16:$src1))), (REG_SEQUENCE VReg_64, - (inst $src0, $src1), sub0, + !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src0, $src1)), (inst $src0, $src1)), + sub0, (V_MOV_B32_e32 (i32 0)), sub1) >; - } -multiclass Bits_OpsRev_i16_Pats <SDPatternOperator op, Instruction inst> { +multiclass Bits_OpsRev_i16_Pats <SDPatternOperator op, Instruction inst, + bit PreservesHI16 = 0> { def : GCNPat< (op i16:$src0, i16:$src1), - (inst $src1, $src0) + 
!if(!eq(PreservesHI16,1), (ClearHI16 (inst $src1, $src0)), (inst $src1, $src0)) >; def : GCNPat< (i32 (zext (op i16:$src0, i16:$src1))), - (inst $src1, $src0) + !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src1, $src0)), (inst $src1, $src0)) >; def : GCNPat< (i64 (zext (op i16:$src0, i16:$src1))), (REG_SEQUENCE VReg_64, - (inst $src1, $src0), sub0, + !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src1, $src0)), (inst $src1, $src0)), + sub0, (V_MOV_B32_e32 (i32 0)), sub1) >; } class ZExt_i16_i1_Pat <SDNode ext> : GCNPat < (i16 (ext i1:$src)), - (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src) + (V_CNDMASK_B32_e64 (i32 0/*src0mod*/), (i32 0/*src0*/), + (i32 0/*src1mod*/), (i32 1/*src1*/), + $src) >; let Predicates = [Has16BitInsts] in { +let Predicates = [Has16BitInsts, isGFX7GFX8GFX9] in { defm : Arithmetic_i16_Pats<add, V_ADD_U16_e64>; defm : Arithmetic_i16_Pats<mul, V_MUL_LO_U16_e64>; defm : Arithmetic_i16_Pats<sub, V_SUB_U16_e64>; @@ -661,6 +784,17 @@ defm : Arithmetic_i16_Pats<smin, V_MIN_I16_e64>; defm : Arithmetic_i16_Pats<smax, V_MAX_I16_e64>; defm : Arithmetic_i16_Pats<umin, V_MIN_U16_e64>; defm : Arithmetic_i16_Pats<umax, V_MAX_U16_e64>; +} + +let Predicates = [Has16BitInsts, isGFX10Plus] in { +defm : Arithmetic_i16_Pats<add, V_ADD_U16_e64, 1>; +defm : Arithmetic_i16_Pats<mul, V_MUL_LO_U16_e64, 1>; +defm : Arithmetic_i16_Pats<sub, V_SUB_U16_e64, 1>; +defm : Arithmetic_i16_Pats<smin, V_MIN_I16_e64, 1>; +defm : Arithmetic_i16_Pats<smax, V_MAX_I16_e64, 1>; +defm : Arithmetic_i16_Pats<umin, V_MIN_U16_e64, 1>; +defm : Arithmetic_i16_Pats<umax, V_MAX_U16_e64, 1>; +} def : GCNPat < (and i16:$src0, i16:$src1), @@ -677,16 +811,25 @@ def : GCNPat < (V_XOR_B32_e64 $src0, $src1) >; +let Predicates = [Has16BitInsts, isGFX7GFX8GFX9] in { defm : Bits_OpsRev_i16_Pats<shl, V_LSHLREV_B16_e64>; defm : Bits_OpsRev_i16_Pats<srl, V_LSHRREV_B16_e64>; defm : Bits_OpsRev_i16_Pats<sra, V_ASHRREV_I16_e64>; +} + +let Predicates = [Has16BitInsts, isGFX10Plus] in { +defm : Bits_OpsRev_i16_Pats<shl, V_LSHLREV_B16_e64, 1>; +defm : Bits_OpsRev_i16_Pats<srl, V_LSHRREV_B16_e64, 1>; +defm : Bits_OpsRev_i16_Pats<sra, V_ASHRREV_I16_e64, 1>; +} def : ZExt_i16_i1_Pat<zext>; def : ZExt_i16_i1_Pat<anyext>; def : GCNPat < (i16 (sext i1:$src)), - (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src) + (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), + /*src1mod*/(i32 0), /*src1*/(i32 -1), $src) >; // Undo sub x, c -> add x, -c canonicalization since c is more likely @@ -697,105 +840,334 @@ def : GCNPat< (V_SUB_U16_e64 $src0, NegSubInlineConst16:$src1) >; -} // End Predicates = [Has16BitInsts] +} // End Predicates = [Has16BitInsts, isGFX7GFX8GFX9] + //===----------------------------------------------------------------------===// -// SI +// Target-specific instruction encodings. 
//===----------------------------------------------------------------------===// -let AssemblerPredicates = [isSICI], DecoderNamespace = "SICI" in { +class VOP2_DPP<bits<6> op, VOP2_Pseudo ps, + string opName = ps.OpName, VOPProfile p = ps.Pfl, + bit IsDPP16 = 0> : + VOP_DPP<opName, p, IsDPP16> { + let hasSideEffects = ps.hasSideEffects; + let Defs = ps.Defs; + let SchedRW = ps.SchedRW; + let Uses = ps.Uses; -multiclass VOP2_Real_si <bits<6> op> { - def _si : - VOP2_Real<!cast<VOP2_Pseudo>(NAME), SIEncodingFamily.SI>, - VOP2e<op{5-0}, !cast<VOP2_Pseudo>(NAME).Pfl>; + bits<8> vdst; + bits<8> src1; + let Inst{8-0} = 0xfa; + let Inst{16-9} = !if(p.HasSrc1, src1{7-0}, 0); + let Inst{24-17} = !if(p.EmitDst, vdst{7-0}, 0); + let Inst{30-25} = op; + let Inst{31} = 0x0; } -multiclass VOP2_Real_MADK_si <bits<6> op> { - def _si : VOP2_Real<!cast<VOP2_Pseudo>(NAME), SIEncodingFamily.SI>, - VOP2_MADKe<op{5-0}, !cast<VOP2_Pseudo>(NAME).Pfl>; +class VOP2_DPP16<bits<6> op, VOP2_Pseudo ps, + string opName = ps.OpName, VOPProfile p = ps.Pfl> : + VOP2_DPP<op, ps, opName, p, 1> { + let AssemblerPredicate = !if(p.HasExt, HasDPP16, DisableInst); + let SubtargetPredicate = HasDPP16; } -multiclass VOP2_Real_e32_si <bits<6> op> { - def _e32_si : - VOP2_Real<!cast<VOP2_Pseudo>(NAME#"_e32"), SIEncodingFamily.SI>, - VOP2e<op{5-0}, !cast<VOP2_Pseudo>(NAME#"_e32").Pfl>; +class VOP2_DPP8<bits<6> op, VOP2_Pseudo ps, + string opName = ps.OpName, VOPProfile p = ps.Pfl> : + VOP_DPP8<ps.OpName, p> { + let hasSideEffects = ps.hasSideEffects; + let Defs = ps.Defs; + let SchedRW = ps.SchedRW; + let Uses = ps.Uses; + + bits<8> vdst; + bits<8> src1; + + let Inst{8-0} = fi; + let Inst{16-9} = !if(p.HasSrc1, src1{7-0}, 0); + let Inst{24-17} = !if(p.EmitDst, vdst{7-0}, 0); + let Inst{30-25} = op; + let Inst{31} = 0x0; + + let AssemblerPredicate = !if(p.HasExt, HasDPP8, DisableInst); + let SubtargetPredicate = HasDPP8; } -multiclass VOP2_Real_e32e64_si <bits<6> op> : VOP2_Real_e32_si<op> { - def _e64_si : - VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>, - VOP3e_si <{1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>; -} - -multiclass VOP2be_Real_e32e64_si <bits<6> op> : VOP2_Real_e32_si<op> { - def _e64_si : - VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>, - VOP3be_si <{1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>; -} - -} // End AssemblerPredicates = [isSICI], DecoderNamespace = "SICI" - -defm V_CNDMASK_B32 : VOP2_Real_e32e64_si <0x0>; -defm V_ADD_F32 : VOP2_Real_e32e64_si <0x3>; -defm V_SUB_F32 : VOP2_Real_e32e64_si <0x4>; -defm V_SUBREV_F32 : VOP2_Real_e32e64_si <0x5>; -defm V_MUL_LEGACY_F32 : VOP2_Real_e32e64_si <0x7>; -defm V_MUL_F32 : VOP2_Real_e32e64_si <0x8>; -defm V_MUL_I32_I24 : VOP2_Real_e32e64_si <0x9>; -defm V_MUL_HI_I32_I24 : VOP2_Real_e32e64_si <0xa>; -defm V_MUL_U32_U24 : VOP2_Real_e32e64_si <0xb>; -defm V_MUL_HI_U32_U24 : VOP2_Real_e32e64_si <0xc>; -defm V_MIN_F32 : VOP2_Real_e32e64_si <0xf>; -defm V_MAX_F32 : VOP2_Real_e32e64_si <0x10>; -defm V_MIN_I32 : VOP2_Real_e32e64_si <0x11>; -defm V_MAX_I32 : VOP2_Real_e32e64_si <0x12>; -defm V_MIN_U32 : VOP2_Real_e32e64_si <0x13>; -defm V_MAX_U32 : VOP2_Real_e32e64_si <0x14>; -defm V_LSHRREV_B32 : VOP2_Real_e32e64_si <0x16>; -defm V_ASHRREV_I32 : VOP2_Real_e32e64_si <0x18>; -defm V_LSHLREV_B32 : VOP2_Real_e32e64_si <0x1a>; -defm V_AND_B32 : VOP2_Real_e32e64_si <0x1b>; -defm V_OR_B32 : VOP2_Real_e32e64_si <0x1c>; -defm V_XOR_B32 : VOP2_Real_e32e64_si <0x1d>; -defm V_MAC_F32 : VOP2_Real_e32e64_si <0x1f>; -defm V_MADMK_F32 : 
VOP2_Real_MADK_si <0x20>; -defm V_MADAK_F32 : VOP2_Real_MADK_si <0x21>; -defm V_ADD_I32 : VOP2be_Real_e32e64_si <0x25>; -defm V_SUB_I32 : VOP2be_Real_e32e64_si <0x26>; -defm V_SUBREV_I32 : VOP2be_Real_e32e64_si <0x27>; -defm V_ADDC_U32 : VOP2be_Real_e32e64_si <0x28>; -defm V_SUBB_U32 : VOP2be_Real_e32e64_si <0x29>; -defm V_SUBBREV_U32 : VOP2be_Real_e32e64_si <0x2a>; - -defm V_READLANE_B32 : VOP2_Real_si <0x01>; - -let InOperandList = (ins SSrc_b32:$src0, SCSrc_b32:$src1, VSrc_b32:$vdst_in) in { -defm V_WRITELANE_B32 : VOP2_Real_si <0x02>; -} - -defm V_MAC_LEGACY_F32 : VOP2_Real_e32e64_si <0x6>; -defm V_MIN_LEGACY_F32 : VOP2_Real_e32e64_si <0xd>; -defm V_MAX_LEGACY_F32 : VOP2_Real_e32e64_si <0xe>; -defm V_LSHR_B32 : VOP2_Real_e32e64_si <0x15>; -defm V_ASHR_I32 : VOP2_Real_e32e64_si <0x17>; -defm V_LSHL_B32 : VOP2_Real_e32e64_si <0x19>; - -defm V_BFM_B32 : VOP2_Real_e32e64_si <0x1e>; -defm V_BCNT_U32_B32 : VOP2_Real_e32e64_si <0x22>; -defm V_MBCNT_LO_U32_B32 : VOP2_Real_e32e64_si <0x23>; -defm V_MBCNT_HI_U32_B32 : VOP2_Real_e32e64_si <0x24>; -defm V_LDEXP_F32 : VOP2_Real_e32e64_si <0x2b>; -defm V_CVT_PKACCUM_U8_F32 : VOP2_Real_e32e64_si <0x2c>; -defm V_CVT_PKNORM_I16_F32 : VOP2_Real_e32e64_si <0x2d>; -defm V_CVT_PKNORM_U16_F32 : VOP2_Real_e32e64_si <0x2e>; -defm V_CVT_PKRTZ_F16_F32 : VOP2_Real_e32e64_si <0x2f>; -defm V_CVT_PK_U16_U32 : VOP2_Real_e32e64_si <0x30>; -defm V_CVT_PK_I16_I32 : VOP2_Real_e32e64_si <0x31>; +//===----------------------------------------------------------------------===// +// GFX10. +//===----------------------------------------------------------------------===// + +let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { + //===------------------------------- VOP2 -------------------------------===// + multiclass VOP2Only_Real_MADK_gfx10<bits<6> op> { + def _gfx10 : + VOP2_Real<!cast<VOP2_Pseudo>(NAME), SIEncodingFamily.GFX10>, + VOP2_MADKe<op{5-0}, !cast<VOP2_Pseudo>(NAME).Pfl>; + } + multiclass VOP2Only_Real_MADK_gfx10_with_name<bits<6> op, string opName, + string asmName> { + def _gfx10 : + VOP2_Real<!cast<VOP2_Pseudo>(opName), SIEncodingFamily.GFX10>, + VOP2_MADKe<op{5-0}, !cast<VOP2_Pseudo>(opName).Pfl> { + VOP2_Pseudo ps = !cast<VOP2_Pseudo>(opName); + let AsmString = asmName # ps.AsmOperands; + } + } + multiclass VOP2_Real_e32_gfx10<bits<6> op> { + def _e32_gfx10 : + VOP2_Real<!cast<VOP2_Pseudo>(NAME#"_e32"), SIEncodingFamily.GFX10>, + VOP2e<op{5-0}, !cast<VOP2_Pseudo>(NAME#"_e32").Pfl>; + } + multiclass VOP2_Real_e64_gfx10<bits<6> op> { + def _e64_gfx10 : + VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX10>, + VOP3e_gfx10<{0, 1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>; + } + multiclass VOP2_Real_sdwa_gfx10<bits<6> op> { + def _sdwa_gfx10 : + VOP_SDWA10_Real<!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>, + VOP2_SDWA9Ae<op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl> { + let DecoderNamespace = "SDWA10"; + } + } + multiclass VOP2_Real_dpp_gfx10<bits<6> op> { + def _dpp_gfx10 : VOP2_DPP16<op, !cast<VOP2_Pseudo>(NAME#"_e32")> { + let DecoderNamespace = "SDWA10"; + } + } + multiclass VOP2_Real_dpp8_gfx10<bits<6> op> { + def _dpp8_gfx10 : VOP2_DPP8<op, !cast<VOP2_Pseudo>(NAME#"_e32")> { + let DecoderNamespace = "DPP8"; + } + } + + //===------------------------- VOP2 (with name) -------------------------===// + multiclass VOP2_Real_e32_gfx10_with_name<bits<6> op, string opName, + string asmName> { + def _e32_gfx10 : + VOP2_Real<!cast<VOP2_Pseudo>(opName#"_e32"), SIEncodingFamily.GFX10>, + VOP2e<op{5-0}, 
!cast<VOP2_Pseudo>(opName#"_e32").Pfl> { + VOP2_Pseudo ps = !cast<VOP2_Pseudo>(opName#"_e32"); + let AsmString = asmName # ps.AsmOperands; + } + } + multiclass VOP2_Real_e64_gfx10_with_name<bits<6> op, string opName, + string asmName> { + def _e64_gfx10 : + VOP3_Real<!cast<VOP3_Pseudo>(opName#"_e64"), SIEncodingFamily.GFX10>, + VOP3e_gfx10<{0, 1, 0, 0, op{5-0}}, + !cast<VOP3_Pseudo>(opName#"_e64").Pfl> { + VOP3_Pseudo ps = !cast<VOP3_Pseudo>(opName#"_e64"); + let AsmString = asmName # ps.AsmOperands; + } + } + let DecoderNamespace = "SDWA10" in { + multiclass VOP2_Real_sdwa_gfx10_with_name<bits<6> op, string opName, + string asmName> { + def _sdwa_gfx10 : + VOP_SDWA10_Real<!cast<VOP2_SDWA_Pseudo>(opName#"_sdwa")>, + VOP2_SDWA9Ae<op{5-0}, !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa").Pfl> { + VOP2_SDWA_Pseudo ps = !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa"); + let AsmString = asmName # ps.AsmOperands; + } + } + multiclass VOP2_Real_dpp_gfx10_with_name<bits<6> op, string opName, + string asmName> { + def _dpp_gfx10 : VOP2_DPP16<op, !cast<VOP2_Pseudo>(opName#"_e32")> { + VOP2_Pseudo ps = !cast<VOP2_Pseudo>(opName#"_e32"); + let AsmString = asmName # ps.Pfl.AsmDPP16; + } + } + multiclass VOP2_Real_dpp8_gfx10_with_name<bits<6> op, string opName, + string asmName> { + def _dpp8_gfx10 : VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32")> { + VOP2_Pseudo ps = !cast<VOP2_Pseudo>(opName#"_e32"); + let AsmString = asmName # ps.Pfl.AsmDPP8; + let DecoderNamespace = "DPP8"; + } + } + } // End DecoderNamespace = "SDWA10" + + //===------------------------------ VOP2be ------------------------------===// + multiclass VOP2be_Real_gfx10<bits<6> op, string opName, string asmName> { + def _e32_gfx10 : + VOP2_Real<!cast<VOP2_Pseudo>(opName#"_e32"), SIEncodingFamily.GFX10>, + VOP2e<op{5-0}, !cast<VOP2_Pseudo>(opName#"_e32").Pfl> { + VOP2_Pseudo Ps = !cast<VOP2_Pseudo>(opName#"_e32"); + let AsmString = asmName # !subst(", vcc", "", Ps.AsmOperands); + } + def _e64_gfx10 : + VOP3_Real<!cast<VOP3_Pseudo>(opName#"_e64"), SIEncodingFamily.GFX10>, + VOP3be_gfx10<{0, 1, 0, 0, op{5-0}}, + !cast<VOP3_Pseudo>(opName#"_e64").Pfl> { + VOP3_Pseudo Ps = !cast<VOP3_Pseudo>(opName#"_e64"); + let AsmString = asmName # Ps.AsmOperands; + } + def _sdwa_gfx10 : + VOP_SDWA10_Real<!cast<VOP2_SDWA_Pseudo>(opName#"_sdwa")>, + VOP2_SDWA9Ae<op{5-0}, !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa").Pfl> { + VOP2_SDWA_Pseudo Ps = !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa"); + let AsmString = asmName # !subst(", vcc", "", Ps.AsmOperands); + let DecoderNamespace = "SDWA10"; + } + def _dpp_gfx10 : + VOP2_DPP16<op, !cast<VOP2_Pseudo>(opName#"_e32"), asmName> { + string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16; + let AsmString = asmName # !subst(", vcc", "", AsmDPP); + let DecoderNamespace = "SDWA10"; + } + def _dpp8_gfx10 : + VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32"), asmName> { + string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8; + let AsmString = asmName # !subst(", vcc", "", AsmDPP8); + let DecoderNamespace = "DPP8"; + } + + let WaveSizePredicate = isWave32 in { + def _sdwa_w32_gfx10 : + Base_VOP_SDWA10_Real<!cast<VOP2_SDWA_Pseudo>(opName#"_sdwa")>, + VOP2_SDWA9Ae<op{5-0}, !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa").Pfl> { + VOP2_SDWA_Pseudo Ps = !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa"); + let AsmString = asmName # !subst("vcc", "vcc_lo", Ps.AsmOperands); + let isAsmParserOnly = 1; + let DecoderNamespace = "SDWA10"; + } + def _dpp_w32_gfx10 : + VOP2_DPP16<op, !cast<VOP2_Pseudo>(opName#"_e32"), asmName> { + string AsmDPP = 
!cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16; + let AsmString = asmName # !subst("vcc", "vcc_lo", AsmDPP); + let isAsmParserOnly = 1; + } + def _dpp8_w32_gfx10 : + VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32"), asmName> { + string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8; + let AsmString = asmName # !subst("vcc", "vcc_lo", AsmDPP8); + let isAsmParserOnly = 1; + } + } // End WaveSizePredicate = isWave32 + + let WaveSizePredicate = isWave64 in { + def _sdwa_w64_gfx10 : + Base_VOP_SDWA10_Real<!cast<VOP2_SDWA_Pseudo>(opName#"_sdwa")>, + VOP2_SDWA9Ae<op{5-0}, !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa").Pfl> { + VOP2_SDWA_Pseudo Ps = !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa"); + let AsmString = asmName # Ps.AsmOperands; + let isAsmParserOnly = 1; + let DecoderNamespace = "SDWA10"; + } + def _dpp_w64_gfx10 : + VOP2_DPP16<op, !cast<VOP2_Pseudo>(opName#"_e32"), asmName> { + string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16; + let AsmString = asmName # AsmDPP; + let isAsmParserOnly = 1; + } + def _dpp8_w64_gfx10 : + VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32"), asmName> { + string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8; + let AsmString = asmName # AsmDPP8; + let isAsmParserOnly = 1; + } + } // End WaveSizePredicate = isWave64 + } + //===----------------------------- VOP3Only -----------------------------===// + multiclass VOP3Only_Real_gfx10<bits<10> op> { + def _e64_gfx10 : + VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX10>, + VOP3e_gfx10<op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>; + } + + //===---------------------------- VOP3beOnly ----------------------------===// + multiclass VOP3beOnly_Real_gfx10<bits<10> op, string opName, string asmName> { + def _e64_gfx10 : + VOP3_Real<!cast<VOP3_Pseudo>(opName#"_e64"), SIEncodingFamily.GFX10>, + VOP3be_gfx10<op, !cast<VOP3_Pseudo>(opName#"_e64").Pfl> { + VOP3_Pseudo Ps = !cast<VOP3_Pseudo>(opName#"_e64"); + let AsmString = asmName # Ps.AsmOperands; + } + } +} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" + +multiclass Base_VOP2_Real_gfx10<bits<6> op> : + VOP2_Real_e32_gfx10<op>, VOP2_Real_e64_gfx10<op>; + +multiclass VOP2_Real_gfx10<bits<6> op> : + VOP2_Real_e32_gfx10<op>, VOP2_Real_e64_gfx10<op>, + VOP2_Real_sdwa_gfx10<op>, VOP2_Real_dpp_gfx10<op>, VOP2_Real_dpp8_gfx10<op>; + +multiclass VOP2_Real_gfx10_with_name<bits<6> op, string opName, + string asmName> : + VOP2_Real_e32_gfx10_with_name<op, opName, asmName>, + VOP2_Real_e64_gfx10_with_name<op, opName, asmName>, + VOP2_Real_sdwa_gfx10_with_name<op, opName, asmName>, + VOP2_Real_dpp_gfx10_with_name<op, opName, asmName>, + VOP2_Real_dpp8_gfx10_with_name<op, opName, asmName>; + +defm V_CNDMASK_B32 : Base_VOP2_Real_gfx10<0x001>; +defm V_XNOR_B32 : VOP2_Real_gfx10<0x01e>; +defm V_FMAC_F32 : VOP2_Real_gfx10<0x02b>; +defm V_FMAMK_F32 : VOP2Only_Real_MADK_gfx10<0x02c>; +defm V_FMAAK_F32 : VOP2Only_Real_MADK_gfx10<0x02d>; +defm V_ADD_F16 : VOP2_Real_gfx10<0x032>; +defm V_SUB_F16 : VOP2_Real_gfx10<0x033>; +defm V_SUBREV_F16 : VOP2_Real_gfx10<0x034>; +defm V_MUL_F16 : VOP2_Real_gfx10<0x035>; +defm V_FMAC_F16 : VOP2_Real_gfx10<0x036>; +defm V_FMAMK_F16 : VOP2Only_Real_MADK_gfx10<0x037>; +defm V_FMAAK_F16 : VOP2Only_Real_MADK_gfx10<0x038>; +defm V_MAX_F16 : VOP2_Real_gfx10<0x039>; +defm V_MIN_F16 : VOP2_Real_gfx10<0x03a>; +defm V_LDEXP_F16 : VOP2_Real_gfx10<0x03b>; +defm V_PK_FMAC_F16 : VOP2_Real_e32_gfx10<0x03c>; + +// VOP2 no carry-in, carry-out. 
+defm V_ADD_NC_U32 : + VOP2_Real_gfx10_with_name<0x025, "V_ADD_U32", "v_add_nc_u32">; +defm V_SUB_NC_U32 : + VOP2_Real_gfx10_with_name<0x026, "V_SUB_U32", "v_sub_nc_u32">; +defm V_SUBREV_NC_U32 : + VOP2_Real_gfx10_with_name<0x027, "V_SUBREV_U32", "v_subrev_nc_u32">; + +// VOP2 carry-in, carry-out. +defm V_ADD_CO_CI_U32 : + VOP2be_Real_gfx10<0x028, "V_ADDC_U32", "v_add_co_ci_u32">; +defm V_SUB_CO_CI_U32 : + VOP2be_Real_gfx10<0x029, "V_SUBB_U32", "v_sub_co_ci_u32">; +defm V_SUBREV_CO_CI_U32 : + VOP2be_Real_gfx10<0x02a, "V_SUBBREV_U32", "v_subrev_co_ci_u32">; + +// VOP3 only. +defm V_BFM_B32 : VOP3Only_Real_gfx10<0x363>; +defm V_BCNT_U32_B32 : VOP3Only_Real_gfx10<0x364>; +defm V_MBCNT_LO_U32_B32 : VOP3Only_Real_gfx10<0x365>; +defm V_MBCNT_HI_U32_B32 : VOP3Only_Real_gfx10<0x366>; +defm V_LDEXP_F32 : VOP3Only_Real_gfx10<0x362>; +defm V_CVT_PKNORM_I16_F32 : VOP3Only_Real_gfx10<0x368>; +defm V_CVT_PKNORM_U16_F32 : VOP3Only_Real_gfx10<0x369>; +defm V_CVT_PK_U16_U32 : VOP3Only_Real_gfx10<0x36a>; +defm V_CVT_PK_I16_I32 : VOP3Only_Real_gfx10<0x36b>; + +// VOP3 carry-in, carry-out. +defm V_ADD_CO_U32 : + VOP3beOnly_Real_gfx10<0x30f, "V_ADD_I32", "v_add_co_u32">; +defm V_SUB_CO_U32 : + VOP3beOnly_Real_gfx10<0x310, "V_SUB_I32", "v_sub_co_u32">; +defm V_SUBREV_CO_U32 : + VOP3beOnly_Real_gfx10<0x319, "V_SUBREV_I32", "v_subrev_co_u32">; + +let SubtargetPredicate = isGFX10Plus in { + defm : VOP2eInstAliases<V_CNDMASK_B32_e32, V_CNDMASK_B32_e32_gfx10>; + + defm : VOP2bInstAliases< + V_ADDC_U32_e32, V_ADD_CO_CI_U32_e32_gfx10, "v_add_co_ci_u32">; + defm : VOP2bInstAliases< + V_SUBB_U32_e32, V_SUB_CO_CI_U32_e32_gfx10, "v_sub_co_ci_u32">; + defm : VOP2bInstAliases< + V_SUBBREV_U32_e32, V_SUBREV_CO_CI_U32_e32_gfx10, "v_subrev_co_ci_u32">; +} // End SubtargetPredicate = isGFX10Plus //===----------------------------------------------------------------------===// -// VI +// GFX6, GFX7, GFX10. 
//===----------------------------------------------------------------------===//
class VOP2_DPPe <bits<6> op, VOP2_DPP_Pseudo ps, VOPProfile P = ps.Pfl> :
@@ -809,7 +1181,111 @@ class VOP2_DPPe <bits<6> op, VOP2_DPP_Pseudo ps, VOPProfile P = ps.Pfl> :
  let Inst{31} = 0x0; //encoding
}
-let AssemblerPredicates = [isVI], DecoderNamespace = "VI" in {
+let AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" in {
+  multiclass VOP2Only_Real_gfx6_gfx7<bits<6> op> {
+    def _gfx6_gfx7 :
+      VOP2_Real<!cast<VOP2_Pseudo>(NAME), SIEncodingFamily.SI>,
+      VOP2e<op{5-0}, !cast<VOP2_Pseudo>(NAME).Pfl>;
+  }
+  multiclass VOP2Only_Real_MADK_gfx6_gfx7<bits<6> op> {
+    def _gfx6_gfx7 :
+      VOP2_Real<!cast<VOP2_Pseudo>(NAME), SIEncodingFamily.SI>,
+      VOP2_MADKe<op{5-0}, !cast<VOP2_Pseudo>(NAME).Pfl>;
+  }
+  multiclass VOP2_Real_e32_gfx6_gfx7<bits<6> op> {
+    def _e32_gfx6_gfx7 :
+      VOP2_Real<!cast<VOP2_Pseudo>(NAME#"_e32"), SIEncodingFamily.SI>,
+      VOP2e<op{5-0}, !cast<VOP2_Pseudo>(NAME#"_e32").Pfl>;
+  }
+  multiclass VOP2_Real_e64_gfx6_gfx7<bits<6> op> {
+    def _e64_gfx6_gfx7 :
+      VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>,
+      VOP3e_gfx6_gfx7<{1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
+  }
+  multiclass VOP2be_Real_e64_gfx6_gfx7<bits<6> op> {
+    def _e64_gfx6_gfx7 :
+      VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>,
+      VOP3be_gfx6_gfx7<{1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
+  }
+} // End AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7"
+
+multiclass VOP2Only_Real_MADK_gfx6_gfx7_gfx10<bits<6> op> :
+  VOP2Only_Real_MADK_gfx6_gfx7<op>, VOP2Only_Real_MADK_gfx10<op>;
+
+multiclass VOP2_Real_gfx6_gfx7<bits<6> op> :
+  VOP2_Real_e32_gfx6_gfx7<op>, VOP2_Real_e64_gfx6_gfx7<op>;
+
+multiclass VOP2_Real_gfx6_gfx7_gfx10<bits<6> op> :
+  VOP2_Real_gfx6_gfx7<op>, VOP2_Real_gfx10<op>;
+
+multiclass VOP2be_Real_gfx6_gfx7<bits<6> op> :
+  VOP2_Real_e32_gfx6_gfx7<op>, VOP2be_Real_e64_gfx6_gfx7<op>;
+
+defm V_CNDMASK_B32 : VOP2_Real_gfx6_gfx7<0x000>;
+defm V_MIN_LEGACY_F32 : VOP2_Real_gfx6_gfx7<0x00d>;
+defm V_MAX_LEGACY_F32 : VOP2_Real_gfx6_gfx7<0x00e>;
+defm V_LSHR_B32 : VOP2_Real_gfx6_gfx7<0x015>;
+defm V_ASHR_I32 : VOP2_Real_gfx6_gfx7<0x017>;
+defm V_LSHL_B32 : VOP2_Real_gfx6_gfx7<0x019>;
+defm V_BFM_B32 : VOP2_Real_gfx6_gfx7<0x01e>;
+defm V_BCNT_U32_B32 : VOP2_Real_gfx6_gfx7<0x022>;
+defm V_MBCNT_LO_U32_B32 : VOP2_Real_gfx6_gfx7<0x023>;
+defm V_MBCNT_HI_U32_B32 : VOP2_Real_gfx6_gfx7<0x024>;
+defm V_LDEXP_F32 : VOP2_Real_gfx6_gfx7<0x02b>;
+defm V_CVT_PKACCUM_U8_F32 : VOP2_Real_gfx6_gfx7<0x02c>;
+defm V_CVT_PKNORM_I16_F32 : VOP2_Real_gfx6_gfx7<0x02d>;
+defm V_CVT_PKNORM_U16_F32 : VOP2_Real_gfx6_gfx7<0x02e>;
+defm V_CVT_PK_U16_U32 : VOP2_Real_gfx6_gfx7<0x030>;
+defm V_CVT_PK_I16_I32 : VOP2_Real_gfx6_gfx7<0x031>;
+defm V_ADD_I32 : VOP2be_Real_gfx6_gfx7<0x025>;
+defm V_SUB_I32 : VOP2be_Real_gfx6_gfx7<0x026>;
+defm V_SUBREV_I32 : VOP2be_Real_gfx6_gfx7<0x027>;
+defm V_ADDC_U32 : VOP2be_Real_gfx6_gfx7<0x028>;
+defm V_SUBB_U32 : VOP2be_Real_gfx6_gfx7<0x029>;
+defm V_SUBBREV_U32 : VOP2be_Real_gfx6_gfx7<0x02a>;
+
+defm V_READLANE_B32 : VOP2Only_Real_gfx6_gfx7<0x001>;
+
+let InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VSrc_b32:$vdst_in) in {
+  defm V_WRITELANE_B32 : VOP2Only_Real_gfx6_gfx7<0x002>;
+} // End InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VSrc_b32:$vdst_in)
+
+let SubtargetPredicate = isGFX6GFX7 in {
+  defm : VOP2eInstAliases<V_CNDMASK_B32_e32, V_CNDMASK_B32_e32_gfx6_gfx7>;
+} // End SubtargetPredicate = isGFX6GFX7
+
+defm V_ADD_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x003>;
+defm V_SUB_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x004>;
+defm V_SUBREV_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x005>;
+defm V_MAC_LEGACY_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x006>;
+defm V_MUL_LEGACY_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x007>;
+defm V_MUL_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x008>;
+defm V_MUL_I32_I24 : VOP2_Real_gfx6_gfx7_gfx10<0x009>;
+defm V_MUL_HI_I32_I24 : VOP2_Real_gfx6_gfx7_gfx10<0x00a>;
+defm V_MUL_U32_U24 : VOP2_Real_gfx6_gfx7_gfx10<0x00b>;
+defm V_MUL_HI_U32_U24 : VOP2_Real_gfx6_gfx7_gfx10<0x00c>;
+defm V_MIN_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x00f>;
+defm V_MAX_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x010>;
+defm V_MIN_I32 : VOP2_Real_gfx6_gfx7_gfx10<0x011>;
+defm V_MAX_I32 : VOP2_Real_gfx6_gfx7_gfx10<0x012>;
+defm V_MIN_U32 : VOP2_Real_gfx6_gfx7_gfx10<0x013>;
+defm V_MAX_U32 : VOP2_Real_gfx6_gfx7_gfx10<0x014>;
+defm V_LSHRREV_B32 : VOP2_Real_gfx6_gfx7_gfx10<0x016>;
+defm V_ASHRREV_I32 : VOP2_Real_gfx6_gfx7_gfx10<0x018>;
+defm V_LSHLREV_B32 : VOP2_Real_gfx6_gfx7_gfx10<0x01a>;
+defm V_AND_B32 : VOP2_Real_gfx6_gfx7_gfx10<0x01b>;
+defm V_OR_B32 : VOP2_Real_gfx6_gfx7_gfx10<0x01c>;
+defm V_XOR_B32 : VOP2_Real_gfx6_gfx7_gfx10<0x01d>;
+defm V_MAC_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x01f>;
+defm V_CVT_PKRTZ_F16_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x02f>;
+defm V_MADMK_F32 : VOP2Only_Real_MADK_gfx6_gfx7_gfx10<0x020>;
+defm V_MADAK_F32 : VOP2Only_Real_MADK_gfx6_gfx7_gfx10<0x021>;
+
+//===----------------------------------------------------------------------===//
+// GFX8, GFX9 (VI).
+//===----------------------------------------------------------------------===//
+
+let AssemblerPredicates = [isGFX8GFX9], DecoderNamespace = "GFX8" in {
multiclass VOP2_Real_MADK_vi <bits<6> op> {
  def _vi : VOP2_Real<!cast<VOP2_Pseudo>(NAME), SIEncodingFamily.VI>,
@@ -843,7 +1319,7 @@ multiclass Base_VOP2_Real_e32e64_vi <bits<6> op> :
  VOP2_Real_e32_vi<op>, VOP2_Real_e64_vi<{0, 1, 0, 0, op{5-0}}>;
-} // End AssemblerPredicates = [isVI], DecoderNamespace = "VI"
+} // End AssemblerPredicates = [isGFX8GFX9], DecoderNamespace = "GFX8"
multiclass VOP2_SDWA_Real <bits<6> op> {
  def _sdwa_vi :
@@ -857,7 +1333,7 @@ multiclass VOP2_SDWA9_Real <bits<6> op> {
    VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
}
-let AssemblerPredicates = [isVIOnly] in {
+let AssemblerPredicates = [isGFX8Only] in {
multiclass VOP2be_Real_e32e64_vi_only <bits<6> op, string OpName, string AsmName> {
  def _e32_vi :
@@ -865,14 +1341,14 @@ multiclass VOP2be_Real_e32e64_vi_only <bits<6> op, string OpName, string AsmName
    VOP2e<op{5-0}, !cast<VOP2_Pseudo>(OpName#"_e32").Pfl> {
      VOP2_Pseudo ps = !cast<VOP2_Pseudo>(OpName#"_e32");
      let AsmString = AsmName # ps.AsmOperands;
-      let DecoderNamespace = "VI";
+      let DecoderNamespace = "GFX8";
    }
  def _e64_vi :
    VOP3_Real<!cast<VOP3_Pseudo>(OpName#"_e64"), SIEncodingFamily.VI>,
    VOP3be_vi <{0, 1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(OpName#"_e64").Pfl> {
      VOP3_Pseudo ps = !cast<VOP3_Pseudo>(OpName#"_e64");
      let AsmString = AsmName # ps.AsmOperands;
-      let DecoderNamespace = "VI";
+      let DecoderNamespace = "GFX8";
    }
  def _sdwa_vi :
    VOP_SDWA_Real <!cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa")>,
@@ -890,7 +1366,7 @@ multiclass VOP2be_Real_e32e64_vi_only <bits<6> op, string OpName, string AsmName
  }
}
-let AssemblerPredicates = [isGFX9] in {
+let AssemblerPredicates = [isGFX9Only] in {
multiclass VOP2be_Real_e32e64_gfx9 <bits<6> op, string OpName, string AsmName> {
  def _e32_gfx9 :
@@ -946,7 +1422,7 @@ multiclass VOP2_Real_e32e64_gfx9 <bits<6> op> {
  }
}
-} // AssemblerPredicates = [isGFX9]
+} // AssemblerPredicates = [isGFX9Only]
multiclass VOP2_Real_e32e64_vi <bits<6> op> :
  Base_VOP2_Real_e32e64_vi<op>, VOP2_SDWA_Real<op>, VOP2_SDWA9_Real<op> {
@@ -1035,7 +1511,7 @@ defm V_MIN_U16 : VOP2_Real_e32e64_vi <0x31>;
defm V_MIN_I16 : VOP2_Real_e32e64_vi <0x32>;
defm V_LDEXP_F16 : VOP2_Real_e32e64_vi <0x33>;
-let SubtargetPredicate = isVI in {
+let SubtargetPredicate = isGFX8GFX9 in {
// Aliases to simplify matching of floating-point instructions that
// are VOP2 on SI and VOP3 on VI.
@@ -1055,7 +1531,20 @@ def : SI2_VI3Alias <"v_cvt_pknorm_i16_f32", V_CVT_PKNORM_I16_F32_e64_vi>;
def : SI2_VI3Alias <"v_cvt_pknorm_u16_f32", V_CVT_PKNORM_U16_F32_e64_vi>;
def : SI2_VI3Alias <"v_cvt_pkrtz_f16_f32", V_CVT_PKRTZ_F16_F32_e64_vi>;
-} // End SubtargetPredicate = isVI
+defm : VOP2eInstAliases<V_CNDMASK_B32_e32, V_CNDMASK_B32_e32_vi>;
+
+} // End SubtargetPredicate = isGFX8GFX9
+
+let SubtargetPredicate = isGFX9Only in {
+
+defm : VOP2bInstAliases<V_ADD_I32_e32, V_ADD_CO_U32_e32_gfx9, "v_add_co_u32">;
+defm : VOP2bInstAliases<V_ADDC_U32_e32, V_ADDC_CO_U32_e32_gfx9, "v_addc_co_u32">;
+defm : VOP2bInstAliases<V_SUB_I32_e32, V_SUB_CO_U32_e32_gfx9, "v_sub_co_u32">;
+defm : VOP2bInstAliases<V_SUBB_U32_e32, V_SUBB_CO_U32_e32_gfx9, "v_subb_co_u32">;
+defm : VOP2bInstAliases<V_SUBREV_I32_e32, V_SUBREV_CO_U32_e32_gfx9, "v_subrev_co_u32">;
+defm : VOP2bInstAliases<V_SUBBREV_U32_e32, V_SUBBREV_CO_U32_e32_gfx9, "v_subbrev_co_u32">;
+
+} // End SubtargetPredicate = isGFX9Only
let SubtargetPredicate = HasDLInsts in {
@@ -1063,3 +1552,35 @@ defm V_FMAC_F32 : VOP2_Real_e32e64_vi <0x3b>;
defm V_XNOR_B32 : VOP2_Real_e32e64_vi <0x3d>;
} // End SubtargetPredicate = HasDLInsts
+
+multiclass VOP2_Real_DOT_ACC_gfx9<bits<6> op> : VOP2_Real_e32_vi<op> {
+  def _dpp : VOP2_DPP<op, !cast<VOP2_Pseudo>(NAME#"_e32")>;
+}
+
+multiclass VOP2_Real_DOT_ACC_gfx10<bits<6> op> :
+  VOP2_Real_e32_gfx10<op>,
+  VOP2_Real_dpp_gfx10<op>,
+  VOP2_Real_dpp8_gfx10<op>;
+
+let SubtargetPredicate = HasDot5Insts in {
+  defm V_DOT2C_F32_F16 : VOP2_Real_DOT_ACC_gfx9<0x37>;
+  // NB: Opcode conflicts with V_DOT8C_I32_I4
+  // This opcode exists in gfx 10.1* only
+  defm V_DOT2C_F32_F16 : VOP2_Real_DOT_ACC_gfx10<0x02>;
+}
+
+let SubtargetPredicate = HasDot6Insts in {
+  defm V_DOT4C_I32_I8 : VOP2_Real_DOT_ACC_gfx9<0x39>;
+  defm V_DOT4C_I32_I8 : VOP2_Real_DOT_ACC_gfx10<0x0d>;
+}
+
+let SubtargetPredicate = HasDot4Insts in {
+  defm V_DOT2C_I32_I16 : VOP2_Real_DOT_ACC_gfx9<0x38>;
+}
+let SubtargetPredicate = HasDot3Insts in {
+  defm V_DOT8C_I32_I4 : VOP2_Real_DOT_ACC_gfx9<0x3a>;
+}
+
+let SubtargetPredicate = HasPkFmacF16Inst in {
+defm V_PK_FMAC_F16 : VOP2_Real_e32_vi<0x3c>;
+} // End SubtargetPredicate = HasPkFmacF16Inst
diff --git a/lib/Target/AMDGPU/VOP3Instructions.td b/lib/Target/AMDGPU/VOP3Instructions.td
index 4b8c1f208a0e..21dbef9240e1 100644
--- a/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/lib/Target/AMDGPU/VOP3Instructions.td
@@ -1,9 +1,8 @@
//===-- VOP3Instructions.td - Vector Instruction Defintions ---------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -111,6 +110,11 @@ class getVOP3ClampPat<VOPProfile P, SDPatternOperator node> {
      ret1));
}
+class getVOP3MAIPat<VOPProfile P, SDPatternOperator node> {
+  list<dag> ret = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2,
+                                        imm:$cbsz, imm:$abid, imm:$blgp))];
+}
+
class VOP3Inst<string OpName, VOPProfile P, SDPatternOperator node = null_frag,
               bit VOP3Only = 0> :
  VOP3_Pseudo<OpName, P,
    !if(P.HasOpSel,
@@ -121,7 +125,9 @@ class VOP3Inst<string OpName, VOPProfile P, SDPatternOperator node = null_frag,
          getVOP3ModPat<P, node>.ret,
          !if(P.HasIntClamp,
            getVOP3ClampPat<P, node>.ret,
-            getVOP3Pat<P, node>.ret))),
+            !if (P.IsMAI,
+              getVOP3MAIPat<P, node>.ret,
+              getVOP3Pat<P, node>.ret)))),
  VOP3Only, 0, P.HasOpSel> {
  let IntClamp = P.HasIntClamp;
@@ -144,33 +150,27 @@ def VOP_F64_F64_F64_F64_VCC : VOPProfile<[f64, f64, f64, f64]> {
  }
}
-class getVOP3VCC<VOPProfile P, SDPatternOperator node> {
-  list<dag> ret =
-    [(set P.DstVT:$vdst,
-      (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)),
-            (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)),
-            (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers)),
-            (i1 VCC)))];
-}
-
-class VOP3Features<bit Clamp, bit OpSel, bit Packed> {
+class VOP3Features<bit Clamp, bit OpSel, bit Packed, bit MAI> {
  bit HasClamp = Clamp;
  bit HasOpSel = OpSel;
  bit IsPacked = Packed;
+  bit IsMAI = MAI;
}
-def VOP3_REGULAR : VOP3Features<0, 0, 0>;
-def VOP3_CLAMP : VOP3Features<1, 0, 0>;
-def VOP3_OPSEL : VOP3Features<1, 1, 0>;
-def VOP3_PACKED : VOP3Features<1, 1, 1>;
+def VOP3_REGULAR : VOP3Features<0, 0, 0, 0>;
+def VOP3_CLAMP : VOP3Features<1, 0, 0, 0>;
+def VOP3_OPSEL : VOP3Features<1, 1, 0, 0>;
+def VOP3_PACKED : VOP3Features<1, 1, 1, 0>;
+def VOP3_MAI : VOP3Features<0, 0, 0, 1>;
class VOP3_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VOPProfile<P.ArgVT> {
  let HasClamp = !if(Features.HasClamp, 1, P.HasClamp);
  let HasOpSel = !if(Features.HasOpSel, 1, P.HasOpSel);
+  let IsMAI = !if(Features.IsMAI, 1, P.IsMAI);
  let IsPacked = !if(Features.IsPacked, 1, P.IsPacked);
-  let HasModifiers = !if(Features.IsPacked, 1, P.HasModifiers);
+  let HasModifiers = !if(Features.IsPacked, !if(Features.IsMAI, 0, 1), P.HasModifiers);
  // FIXME: Hack to stop printing _e64
  let Outs64 = (outs DstRC.RegClass:$vdst);
@@ -191,8 +191,9 @@ class VOP3_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VOPProf
class VOP3b_Profile<ValueType vt> : VOPProfile<[vt, vt, vt, vt]> {
  // v_div_scale_{f32|f64} do not support input modifiers.
  let HasModifiers = 0;
+  let HasClamp = 0;
  let HasOMod = 0;
-  let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst);
+  let Outs64 = (outs DstRC:$vdst, VOPDstS64orS32:$sdst);
  let Asm64 = " $vdst, $sdst, $src0, $src1, $src2";
}
@@ -212,7 +213,7 @@ def VOP3b_I64_I1_I32_I32_I64 : VOPProfile<[i64, i32, i32, i64]> {
  // FIXME: Hack to stop printing _e64
  let DstRC = RegisterOperand<VReg_64>;
-  let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst);
+  let Outs64 = (outs DstRC:$vdst, VOPDstS64orS32:$sdst);
  let Asm64 = " $vdst, $sdst, $src0, $src1, $src2$clamp";
}
@@ -303,7 +304,7 @@ def V_MAX_F64 : VOP3Inst <"v_max_f64", VOP3_Profile<VOP_F64_F64_F64>, fmaxnum_li
} // End SchedRW = [WriteDoubleAdd]
let SchedRW = [WriteQuarterRate32] in {
-def V_MUL_LO_U32 : VOP3Inst <"v_mul_lo_u32", VOP3_Profile<VOP_I32_I32_I32>>;
+def V_MUL_LO_U32 : VOP3Inst <"v_mul_lo_u32", VOP3_Profile<VOP_I32_I32_I32>, mul>;
def V_MUL_HI_U32 : VOP3Inst <"v_mul_hi_u32", VOP3_Profile<VOP_I32_I32_I32>, mulhu>;
def V_MUL_LO_I32 : VOP3Inst <"v_mul_lo_i32", VOP3_Profile<VOP_I32_I32_I32>>;
def V_MUL_HI_I32 : VOP3Inst <"v_mul_hi_i32", VOP3_Profile<VOP_I32_I32_I32>, mulhs>;
@@ -315,8 +316,7 @@ let Uses = [VCC, EXEC] in {
// if (vcc)
//   result *= 2^32
//
-def V_DIV_FMAS_F32 : VOP3_Pseudo <"v_div_fmas_f32", VOP_F32_F32_F32_F32_VCC,
-  getVOP3VCC<VOP_F32_F32_F32_F32_VCC, AMDGPUdiv_fmas>.ret> {
+def V_DIV_FMAS_F32 : VOP3_Pseudo <"v_div_fmas_f32", VOP_F32_F32_F32_F32_VCC, []> {
  let SchedRW = [WriteFloatFMA];
}
// v_div_fmas_f64:
@@ -324,8 +324,7 @@ def V_DIV_FMAS_F32 : VOP3_Pseudo <"v_div_fmas_f32", VOP_F32_F32_F32_F32_VCC,
// if (vcc)
//   result *= 2^64
//
-def V_DIV_FMAS_F64 : VOP3_Pseudo <"v_div_fmas_f64", VOP_F64_F64_F64_F64_VCC,
-  getVOP3VCC<VOP_F64_F64_F64_F64_VCC, AMDGPUdiv_fmas>.ret> {
+def V_DIV_FMAS_F64 : VOP3_Pseudo <"v_div_fmas_f64", VOP_F64_F64_F64_F64_VCC, []> {
  let SchedRW = [WriteDouble];
  let FPDPRounding = 1;
}
@@ -386,22 +385,21 @@ def V_TRIG_PREOP_F64 : VOP3Inst <"v_trig_preop_f64", VOP3_Profile<VOP_F64_F64_I3
}
let SchedRW = [Write64Bit] in {
-// These instructions only exist on SI and CI
-let SubtargetPredicate = isSICI, Predicates = [isSICI] in {
+let SubtargetPredicate = isGFX6GFX7GFX10, Predicates = [isGFX6GFX7GFX10] in {
def V_LSHL_B64 : VOP3Inst <"v_lshl_b64", VOP3_Profile<VOP_PAT_GEN<VOP_I64_I64_I32>>, shl>;
def V_LSHR_B64 : VOP3Inst <"v_lshr_b64", VOP3_Profile<VOP_PAT_GEN<VOP_I64_I64_I32>>, srl>;
def V_ASHR_I64 : VOP3Inst <"v_ashr_i64", VOP3_Profile<VOP_PAT_GEN<VOP_I64_I64_I32>>, sra>;
def V_MULLIT_F32 : VOP3Inst <"v_mullit_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
-} // End SubtargetPredicate = isSICI, Predicates = [isSICI]
+} // End SubtargetPredicate = isGFX6GFX7GFX10, Predicates = [isGFX6GFX7GFX10]
-let SubtargetPredicate = isVI in {
-def V_LSHLREV_B64 : VOP3Inst <"v_lshlrev_b64", VOP3_Profile<VOP_I64_I32_I64>>;
-def V_LSHRREV_B64 : VOP3Inst <"v_lshrrev_b64", VOP3_Profile<VOP_I64_I32_I64>>;
-def V_ASHRREV_I64 : VOP3Inst <"v_ashrrev_i64", VOP3_Profile<VOP_I64_I32_I64>>;
-} // End SubtargetPredicate = isVI
+let SubtargetPredicate = isGFX8Plus in {
+def V_LSHLREV_B64 : VOP3Inst <"v_lshlrev_b64", VOP3_Profile<VOP_I64_I32_I64>, lshl_rev>;
+def V_LSHRREV_B64 : VOP3Inst <"v_lshrrev_b64", VOP3_Profile<VOP_I64_I32_I64>, lshr_rev>;
+def V_ASHRREV_I64 : VOP3Inst <"v_ashrrev_i64", VOP3_Profile<VOP_I64_I32_I64>, ashr_rev>;
+} // End SubtargetPredicate = isGFX8Plus
} // End SchedRW = [Write64Bit]
-let Predicates = [isVI] in {
+let Predicates = [isGFX8Plus] in {
def : GCNPat <
  (getDivergentFrag<shl>.ret i64:$x, i32:$y),
  (V_LSHLREV_B64 $y, $x)
@@ -417,7 +415,13 @@ def : AMDGPUPat <
}
-let SubtargetPredicate = isCIVI in {
+let SchedRW = [Write32Bit] in {
+let SubtargetPredicate = isGFX8Plus in {
+def V_PERM_B32 : VOP3Inst <"v_perm_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUperm>;
+} // End SubtargetPredicate = isGFX8Plus
+} // End SchedRW = [Write32Bit]
+
+let SubtargetPredicate = isGFX7Plus in {
let Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32] in {
def V_QSAD_PK_U16_U8 : VOP3Inst <"v_qsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64, VOP3_CLAMP>>;
@@ -431,27 +435,27 @@ def V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>;
} // End SchedRW = [WriteDouble, WriteSALU]
} // End isCommutable = 1
-} // End SubtargetPredicate = isCIVI
+} // End SubtargetPredicate = isGFX7Plus
def V_DIV_FIXUP_F16 : VOP3Inst <"v_div_fixup_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUdiv_fixup> {
-  let Predicates = [Has16BitInsts, isVIOnly];
+  let Predicates = [Has16BitInsts, isGFX8Only];
  let FPDPRounding = 1;
}
def V_DIV_FIXUP_F16_gfx9 : VOP3Inst <"v_div_fixup_f16_gfx9", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUdiv_fixup> {
  let renamedInGFX9 = 1;
-  let Predicates = [Has16BitInsts, isGFX9];
+  let Predicates = [Has16BitInsts, isGFX9Plus];
  let FPDPRounding = 1;
}
def V_FMA_F16 : VOP3Inst <"v_fma_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fma> {
-  let Predicates = [Has16BitInsts, isVIOnly];
+  let Predicates = [Has16BitInsts, isGFX8Only];
  let FPDPRounding = 1;
}
def V_FMA_F16_gfx9 : VOP3Inst <"v_fma_f16_gfx9", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, fma> {
  let renamedInGFX9 = 1;
-  let Predicates = [Has16BitInsts, isGFX9];
+  let Predicates = [Has16BitInsts, isGFX9Plus];
  let FPDPRounding = 1;
}
@@ -463,36 +467,58 @@ def V_MAD_I16 : VOP3Inst <"v_mad_i16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_CL
let FPDPRounding = 1 in {
def V_MAD_F16 : VOP3Inst <"v_mad_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fmad>;
let Uses = [M0, EXEC] in {
-def V_INTERP_P2_F16 : VOP3Interp <"v_interp_p2_f16", VOP3_INTERP16<[f16, f32, i32, f32]>>;
+def V_INTERP_P2_F16 : VOP3Interp <"v_interp_p2_f16", VOP3_INTERP16<[f16, f32, i32, f32]>,
+  [(set f16:$vdst, (AMDGPUinterp_p2_f16 f32:$src0, (i32 imm:$attrchan),
+                                        (i32 imm:$attr),
+                                        (i32 imm:$src0_modifiers),
+                                        (f32 VRegSrc_32:$src2),
+                                        (i32 imm:$src2_modifiers),
+                                        (i1 imm:$high),
+                                        (i1 imm:$clamp)))]>;
} // End Uses = [M0, EXEC]
} // End FPDPRounding = 1
} // End renamedInGFX9 = 1
-let SubtargetPredicate = isGFX9 in {
+let SubtargetPredicate = isGFX9Only in {
def V_MAD_F16_gfx9 : VOP3Inst <"v_mad_f16_gfx9", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>> {
  let FPDPRounding = 1;
}
+} // End SubtargetPredicate = isGFX9Only
+
+let SubtargetPredicate = isGFX9Plus in {
def V_MAD_U16_gfx9 : VOP3Inst <"v_mad_u16_gfx9", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>>;
def V_MAD_I16_gfx9 : VOP3Inst <"v_mad_i16_gfx9", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>>;
def V_INTERP_P2_F16_gfx9 : VOP3Interp <"v_interp_p2_f16_gfx9", VOP3_INTERP16<[f16, f32, i32, f32]>>;
-} // End SubtargetPredicate = isGFX9
+} // End SubtargetPredicate = isGFX9Plus
let Uses = [M0, EXEC], FPDPRounding = 1 in {
-def V_INTERP_P1LL_F16 : VOP3Interp <"v_interp_p1ll_f16", VOP3_INTERP16<[f32, f32, i32, untyped]>>;
-def V_INTERP_P1LV_F16 : VOP3Interp <"v_interp_p1lv_f16", VOP3_INTERP16<[f32, f32, i32, f16]>>;
+def V_INTERP_P1LL_F16 : VOP3Interp <"v_interp_p1ll_f16", VOP3_INTERP16<[f32, f32, i32, untyped]>,
+  [(set f32:$vdst, (AMDGPUinterp_p1ll_f16 f32:$src0, (i32 imm:$attrchan),
+                                          (i32 imm:$attr),
+                                          (i32 imm:$src0_modifiers),
+                                          (i1 imm:$high),
+                                          (i1 imm:$clamp),
+                                          (i32 imm:$omod)))]>;
+def V_INTERP_P1LV_F16 : VOP3Interp <"v_interp_p1lv_f16", VOP3_INTERP16<[f32, f32, i32, f16]>,
+  [(set f32:$vdst, (AMDGPUinterp_p1lv_f16 f32:$src0, (i32 imm:$attrchan),
+                                          (i32 imm:$attr),
+                                          (i32 imm:$src0_modifiers),
+                                          (f32 VRegSrc_32:$src2),
+                                          (i32 imm:$src2_modifiers),
+                                          (i1 imm:$high),
+                                          (i1 imm:$clamp),
+                                          (i32 imm:$omod)))]>;
} // End Uses = [M0, EXEC], FPDPRounding = 1
} // End SubtargetPredicate = Has16BitInsts, isCommutable = 1
-let SubtargetPredicate = isVI in {
+let SubtargetPredicate = isGFX8GFX9 in {
def V_INTERP_P1_F32_e64 : VOP3Interp <"v_interp_p1_f32", VOP3_INTERP>;
def V_INTERP_P2_F32_e64 : VOP3Interp <"v_interp_p2_f32", VOP3_INTERP>;
def V_INTERP_MOV_F32_e64 : VOP3Interp <"v_interp_mov_f32", VOP3_INTERP_MOV>;
+} // End SubtargetPredicate = isGFX8GFX9
-def V_PERM_B32 : VOP3Inst <"v_perm_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUperm>;
-} // End SubtargetPredicate = isVI
-
-let Predicates = [Has16BitInsts] in {
+let Predicates = [Has16BitInsts, isGFX6GFX7GFX8GFX9] in {
multiclass Ternary_i16_Pats <SDPatternOperator op1, SDPatternOperator op2,
                             Instruction inst, SDPatternOperator op3> {
@@ -506,7 +532,23 @@ def : GCNPat <
defm: Ternary_i16_Pats<mul, add, V_MAD_U16, zext>;
defm: Ternary_i16_Pats<mul, add, V_MAD_I16, sext>;
-} // End Predicates = [Has16BitInsts]
+} // End Predicates = [Has16BitInsts, isGFX6GFX7GFX8GFX9]
+
+let Predicates = [Has16BitInsts, isGFX10Plus] in {
+
+multiclass Ternary_i16_Pats_gfx9<SDPatternOperator op1, SDPatternOperator op2,
+                                 Instruction inst, SDPatternOperator op3> {
+def : GCNPat <
+  (op2 (op1 i16:$src0, i16:$src1), i16:$src2),
+  (inst SRCMODS.NONE, $src0, SRCMODS.NONE, $src1, SRCMODS.NONE, $src2, DSTCLAMP.NONE)
+>;
+
+}
+
+defm: Ternary_i16_Pats_gfx9<mul, add, V_MAD_U16_gfx9, zext>;
+defm: Ternary_i16_Pats_gfx9<mul, add, V_MAD_I16_gfx9, sext>;
+
+} // End Predicates = [Has16BitInsts, isGFX10Plus]
class ThreeOpFrag<SDPatternOperator op1, SDPatternOperator op2> : PatFrag<
  (ops node:$x, node:$y, node:$z),
@@ -528,7 +570,9 @@ class ThreeOpFrag<SDPatternOperator op1, SDPatternOperator op2> : PatFrag<
      if (!Operands[i]->isDivergent() &&
          !isInlineImmediate(Operands[i].getNode())) {
        ConstantBusUses++;
-        if (ConstantBusUses >= 2)
+        // This uses AMDGPU::V_ADD3_U32, but all three operand instructions
+        // have the same constant bus limit.
+        if (ConstantBusUses > Subtarget->getConstantBusLimit(AMDGPU::V_ADD3_U32))
          return false;
      }
    }
@@ -539,7 +583,7 @@ class ThreeOpFrag<SDPatternOperator op1, SDPatternOperator op2> : PatFrag<
  let PredicateCodeUsesOperands = 1;
}
-let SubtargetPredicate = isGFX9 in {
+let SubtargetPredicate = isGFX9Plus in {
def V_PACK_B32_F16 : VOP3Inst <"v_pack_b32_f16", VOP3_Profile<VOP_B32_F16_F16, VOP3_OPSEL>>;
def V_LSHL_ADD_U32 : VOP3Inst <"v_lshl_add_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
def V_ADD_LSHL_U32 : VOP3Inst <"v_add_lshl_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
@@ -589,7 +633,38 @@ def : ThreeOp_i32_Pats<and, or, V_AND_OR_B32>;
def : ThreeOp_i32_Pats<or, or, V_OR3_B32>;
def : ThreeOp_i32_Pats<xor, add, V_XAD_U32>;
-} // End SubtargetPredicate = isGFX9
+} // End SubtargetPredicate = isGFX9Plus
+
+def VOP3_PERMLANE_Profile : VOP3_Profile<VOPProfile <[i32, i32, i32, i32]>, VOP3_OPSEL> {
+  let Src0RC64 = VRegSrc_32;
+  let Src1RC64 = SCSrc_b32;
+  let Src2RC64 = SCSrc_b32;
+  let InsVOP3OpSel = (ins IntOpSelMods:$src0_modifiers, VRegSrc_32:$src0,
+                          IntOpSelMods:$src1_modifiers, SCSrc_b32:$src1,
+                          IntOpSelMods:$src2_modifiers, SCSrc_b32:$src2,
+                          VGPR_32:$vdst_in, op_sel:$op_sel);
+  let HasClamp = 0;
+  let HasOMod = 0;
+}
+
+let SubtargetPredicate = isGFX10Plus in {
+  def V_XOR3_B32 : VOP3Inst <"v_xor3_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+  def : ThreeOp_i32_Pats<xor, xor, V_XOR3_B32>;
+
+  let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
+    def V_PERMLANE16_B32 : VOP3Inst <"v_permlane16_b32", VOP3_PERMLANE_Profile>;
+    def V_PERMLANEX16_B32 : VOP3Inst <"v_permlanex16_b32", VOP3_PERMLANE_Profile>;
+  } // End $vdst = $vdst_in, DisableEncoding $vdst_in
+
+  def : GCNPat<
+    (int_amdgcn_permlane16 i32:$vdst_in, i32:$src0, i32:$src1, i32:$src2, imm:$fi, imm:$bc),
+    (V_PERMLANE16_B32 (as_i1imm $fi), $src0, (as_i1imm $bc), $src1, 0, $src2, $vdst_in)
+  >;
+  def : GCNPat<
+    (int_amdgcn_permlanex16 i32:$vdst_in, i32:$src0, i32:$src1, i32:$src2, imm:$fi, imm:$bc),
+    (V_PERMLANEX16_B32 (as_i1imm $fi), $src0, (as_i1imm $bc), $src1, 0, $src2, $vdst_in)
+  >;
+} // End SubtargetPredicate = isGFX10Plus
//===----------------------------------------------------------------------===//
// Integer Clamp Patterns
@@ -631,111 +706,239 @@ def : IntClampPat<V_MQSAD_PK_U16_U8, int_amdgcn_mqsad_pk_u16_u8>;
def : IntClampPat<V_QSAD_PK_U16_U8, int_amdgcn_qsad_pk_u16_u8>;
def : IntClampPat<V_MQSAD_U32_U8, int_amdgcn_mqsad_u32_u8>;
+
//===----------------------------------------------------------------------===//
-// Target
+// Target-specific instruction encodings.
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
-// SI
+// GFX10.
//===----------------------------------------------------------------------===//
-let AssemblerPredicates = [isSICI], DecoderNamespace = "SICI" in {
-
-multiclass VOP3_Real_si<bits<9> op> {
-  def _si : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.SI>,
-            VOP3e_si <op, !cast<VOP3_Pseudo>(NAME).Pfl>;
-}
-
-multiclass VOP3be_Real_si<bits<9> op> {
-  def _si : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.SI>,
-            VOP3be_si <op, !cast<VOP3_Pseudo>(NAME).Pfl>;
-}
-
-} // End AssemblerPredicates = [isSICI], DecoderNamespace = "SICI"
-
-defm V_MAD_LEGACY_F32 : VOP3_Real_si <0x140>;
-defm V_MAD_F32 : VOP3_Real_si <0x141>;
-defm V_MAD_I32_I24 : VOP3_Real_si <0x142>;
-defm V_MAD_U32_U24 : VOP3_Real_si <0x143>;
-defm V_CUBEID_F32 : VOP3_Real_si <0x144>;
-defm V_CUBESC_F32 : VOP3_Real_si <0x145>;
-defm V_CUBETC_F32 : VOP3_Real_si <0x146>;
-defm V_CUBEMA_F32 : VOP3_Real_si <0x147>;
-defm V_BFE_U32 : VOP3_Real_si <0x148>;
-defm V_BFE_I32 : VOP3_Real_si <0x149>;
-defm V_BFI_B32 : VOP3_Real_si <0x14a>;
-defm V_FMA_F32 : VOP3_Real_si <0x14b>;
-defm V_FMA_F64 : VOP3_Real_si <0x14c>;
-defm V_LERP_U8 : VOP3_Real_si <0x14d>;
-defm V_ALIGNBIT_B32 : VOP3_Real_si <0x14e>;
-defm V_ALIGNBYTE_B32 : VOP3_Real_si <0x14f>;
-defm V_MULLIT_F32 : VOP3_Real_si <0x150>;
-defm V_MIN3_F32 : VOP3_Real_si <0x151>;
-defm V_MIN3_I32 : VOP3_Real_si <0x152>;
-defm V_MIN3_U32 : VOP3_Real_si <0x153>;
-defm V_MAX3_F32 : VOP3_Real_si <0x154>;
-defm V_MAX3_I32 : VOP3_Real_si <0x155>;
-defm V_MAX3_U32 : VOP3_Real_si <0x156>;
-defm V_MED3_F32 : VOP3_Real_si <0x157>;
-defm V_MED3_I32 : VOP3_Real_si <0x158>;
-defm V_MED3_U32 : VOP3_Real_si <0x159>;
-defm V_SAD_U8 : VOP3_Real_si <0x15a>;
-defm V_SAD_HI_U8 : VOP3_Real_si <0x15b>;
-defm V_SAD_U16 : VOP3_Real_si <0x15c>;
-defm V_SAD_U32 : VOP3_Real_si <0x15d>;
-defm V_CVT_PK_U8_F32 : VOP3_Real_si <0x15e>;
-defm V_DIV_FIXUP_F32 : VOP3_Real_si <0x15f>;
-defm V_DIV_FIXUP_F64 : VOP3_Real_si <0x160>;
-defm V_LSHL_B64 : VOP3_Real_si <0x161>;
-defm V_LSHR_B64 : VOP3_Real_si <0x162>;
-defm V_ASHR_I64 : VOP3_Real_si <0x163>;
-defm V_ADD_F64 : VOP3_Real_si <0x164>;
-defm V_MUL_F64 : VOP3_Real_si <0x165>;
-defm V_MIN_F64 : VOP3_Real_si <0x166>;
-defm V_MAX_F64 : VOP3_Real_si <0x167>;
-defm V_LDEXP_F64 : VOP3_Real_si <0x168>;
-defm V_MUL_LO_U32 : VOP3_Real_si <0x169>;
-defm V_MUL_HI_U32 : VOP3_Real_si <0x16a>;
-defm V_MUL_LO_I32 : VOP3_Real_si <0x16b>;
-defm V_MUL_HI_I32 : VOP3_Real_si <0x16c>;
-defm V_DIV_SCALE_F32 : VOP3be_Real_si <0x16d>;
-defm V_DIV_SCALE_F64 : VOP3be_Real_si <0x16e>;
-defm V_DIV_FMAS_F32 : VOP3_Real_si <0x16f>;
-defm V_DIV_FMAS_F64 : VOP3_Real_si <0x170>;
-defm V_MSAD_U8 : VOP3_Real_si <0x171>;
-defm V_MQSAD_PK_U16_U8 : VOP3_Real_si <0x173>;
-defm V_TRIG_PREOP_F64 : VOP3_Real_si <0x174>;
+let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
+  multiclass VOP3_Real_gfx10<bits<10> op> {
+    def _gfx10 :
+      VOP3_Real<!cast<VOP_Pseudo>(NAME), SIEncodingFamily.GFX10>,
+      VOP3e_gfx10<op, !cast<VOP_Pseudo>(NAME).Pfl>;
+  }
+  multiclass VOP3_Real_gfx10_with_name<bits<10> op, string opName,
+                                       string asmName> {
+    def _gfx10 :
+      VOP3_Real<!cast<VOP3_Pseudo>(opName), SIEncodingFamily.GFX10>,
+      VOP3e_gfx10<op, !cast<VOP3_Pseudo>(opName).Pfl> {
+        VOP3_Pseudo ps = !cast<VOP3_Pseudo>(opName);
+        let AsmString = asmName # ps.AsmOperands;
+      }
+  }
+  multiclass VOP3be_Real_gfx10<bits<10> op> {
+    def _gfx10 :
+      VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.GFX10>,
+      VOP3be_gfx10<op, !cast<VOP3_Pseudo>(NAME).Pfl>;
+  }
+  multiclass VOP3Interp_Real_gfx10<bits<10> op> {
+    def _gfx10 :
+      VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.GFX10>,
+      VOP3Interp_gfx10<op, !cast<VOP3_Pseudo>(NAME).Pfl>;
+  }
+  multiclass VOP3OpSel_Real_gfx10<bits<10> op> {
+    def _gfx10 :
+      VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.GFX10>,
+      VOP3OpSel_gfx10<op, !cast<VOP3_Pseudo>(NAME).Pfl>;
+  }
+  multiclass VOP3OpSel_Real_gfx10_with_name<bits<10> op, string opName,
+                                            string asmName> {
+    def _gfx10 :
+      VOP3_Real<!cast<VOP3_Pseudo>(opName), SIEncodingFamily.GFX10>,
+      VOP3OpSel_gfx10<op, !cast<VOP3_Pseudo>(opName).Pfl> {
+        VOP3_Pseudo ps = !cast<VOP3_Pseudo>(opName);
+        let AsmString = asmName # ps.AsmOperands;
+      }
+  }
+} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10"
+
+defm V_READLANE_B32 : VOP3_Real_gfx10<0x360>;
+
+let InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VSrc_b32:$vdst_in) in {
+  defm V_WRITELANE_B32 : VOP3_Real_gfx10<0x361>;
+} // End InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VSrc_b32:$vdst_in)
+
+defm V_XOR3_B32 : VOP3_Real_gfx10<0x178>;
+defm V_LSHLREV_B64 : VOP3_Real_gfx10<0x2ff>;
+defm V_LSHRREV_B64 : VOP3_Real_gfx10<0x300>;
+defm V_ASHRREV_I64 : VOP3_Real_gfx10<0x301>;
+defm V_PERM_B32 : VOP3_Real_gfx10<0x344>;
+defm V_XAD_U32 : VOP3_Real_gfx10<0x345>;
+defm V_LSHL_ADD_U32 : VOP3_Real_gfx10<0x346>;
+defm V_ADD_LSHL_U32 : VOP3_Real_gfx10<0x347>;
+defm V_ADD3_U32 : VOP3_Real_gfx10<0x36d>;
+defm V_LSHL_OR_B32 : VOP3_Real_gfx10<0x36f>;
+defm V_AND_OR_B32 : VOP3_Real_gfx10<0x371>;
+defm V_OR3_B32 : VOP3_Real_gfx10<0x372>;
+
+// TODO-GFX10: add MC tests for v_add/sub_nc_i16
+defm V_ADD_NC_I16 :
+  VOP3OpSel_Real_gfx10_with_name<0x30d, "V_ADD_I16", "v_add_nc_i16">;
+defm V_SUB_NC_I16 :
+  VOP3OpSel_Real_gfx10_with_name<0x30e, "V_SUB_I16", "v_sub_nc_i16">;
+defm V_SUB_NC_I32 :
+  VOP3_Real_gfx10_with_name<0x376, "V_SUB_I32_gfx9", "v_sub_nc_i32">;
+defm V_ADD_NC_I32 :
+  VOP3_Real_gfx10_with_name<0x37f, "V_ADD_I32_gfx9", "v_add_nc_i32">;
+
+defm V_INTERP_P1LL_F16 : VOP3Interp_Real_gfx10<0x342>;
+defm V_INTERP_P1LV_F16 : VOP3Interp_Real_gfx10<0x343>;
+defm V_INTERP_P2_F16 : VOP3Interp_Real_gfx10<0x35a>;
+
+defm V_PACK_B32_F16 : VOP3OpSel_Real_gfx10<0x311>;
+defm V_CVT_PKNORM_I16_F16 : VOP3OpSel_Real_gfx10<0x312>;
+defm V_CVT_PKNORM_U16_F16 : VOP3OpSel_Real_gfx10<0x313>;
+
+defm V_MIN3_F16 : VOP3OpSel_Real_gfx10<0x351>;
+defm V_MIN3_I16 : VOP3OpSel_Real_gfx10<0x352>;
+defm V_MIN3_U16 : VOP3OpSel_Real_gfx10<0x353>;
+defm V_MAX3_F16 : VOP3OpSel_Real_gfx10<0x354>;
+defm V_MAX3_I16 : VOP3OpSel_Real_gfx10<0x355>;
+defm V_MAX3_U16 : VOP3OpSel_Real_gfx10<0x356>;
+defm V_MED3_F16 : VOP3OpSel_Real_gfx10<0x357>;
+defm V_MED3_I16 : VOP3OpSel_Real_gfx10<0x358>;
+defm V_MED3_U16 : VOP3OpSel_Real_gfx10<0x359>;
+defm V_MAD_U32_U16 : VOP3OpSel_Real_gfx10<0x373>;
+defm V_MAD_I32_I16 : VOP3OpSel_Real_gfx10<0x375>;
+
+defm V_MAD_U16 :
+  VOP3OpSel_Real_gfx10_with_name<0x340, "V_MAD_U16_gfx9", "v_mad_u16">;
+defm V_FMA_F16 :
+  VOP3OpSel_Real_gfx10_with_name<0x34b, "V_FMA_F16_gfx9", "v_fma_f16">;
+defm V_MAD_I16 :
+  VOP3OpSel_Real_gfx10_with_name<0x35e, "V_MAD_I16_gfx9", "v_mad_i16">;
+defm V_DIV_FIXUP_F16 :
+  VOP3OpSel_Real_gfx10_with_name<0x35f, "V_DIV_FIXUP_F16_gfx9", "v_div_fixup_f16">;
+
+// FIXME-GFX10-OPSEL: Need to add "selective" opsel support to some of these
+// (they do not support SDWA or DPP).
+defm V_ADD_NC_U16 : VOP3_Real_gfx10_with_name<0x303, "V_ADD_U16_e64", "v_add_nc_u16">;
+defm V_SUB_NC_U16 : VOP3_Real_gfx10_with_name<0x304, "V_SUB_U16_e64", "v_sub_nc_u16">;
+defm V_MUL_LO_U16 : VOP3_Real_gfx10_with_name<0x305, "V_MUL_LO_U16_e64", "v_mul_lo_u16">;
+defm V_LSHRREV_B16 : VOP3_Real_gfx10_with_name<0x307, "V_LSHRREV_B16_e64", "v_lshrrev_b16">;
+defm V_ASHRREV_I16 : VOP3_Real_gfx10_with_name<0x308, "V_ASHRREV_I16_e64", "v_ashrrev_i16">;
+defm V_MAX_U16 : VOP3_Real_gfx10_with_name<0x309, "V_MAX_U16_e64", "v_max_u16">;
+defm V_MAX_I16 : VOP3_Real_gfx10_with_name<0x30a, "V_MAX_I16_e64", "v_max_i16">;
+defm V_MIN_U16 : VOP3_Real_gfx10_with_name<0x30b, "V_MIN_U16_e64", "v_min_u16">;
+defm V_MIN_I16 : VOP3_Real_gfx10_with_name<0x30c, "V_MIN_I16_e64", "v_min_i16">;
+defm V_LSHLREV_B16 : VOP3_Real_gfx10_with_name<0x314, "V_LSHLREV_B16_e64", "v_lshlrev_b16">;
+defm V_PERMLANE16_B32 : VOP3OpSel_Real_gfx10<0x377>;
+defm V_PERMLANEX16_B32 : VOP3OpSel_Real_gfx10<0x378>;
//===----------------------------------------------------------------------===//
-// CI
+// GFX7, GFX10.
//===----------------------------------------------------------------------===//
-multiclass VOP3_Real_ci<bits<9> op> {
-  def _ci : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.SI>,
-            VOP3e_si <op, !cast<VOP3_Pseudo>(NAME).Pfl> {
-    let AssemblerPredicates = [isCIOnly];
-    let DecoderNamespace = "CI";
+let AssemblerPredicate = isGFX7Only, DecoderNamespace = "GFX7" in {
+  multiclass VOP3_Real_gfx7<bits<10> op> {
+    def _gfx7 :
+      VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.SI>,
+      VOP3e_gfx6_gfx7<op{8-0}, !cast<VOP3_Pseudo>(NAME).Pfl>;
  }
-}
-
-multiclass VOP3be_Real_ci<bits<9> op> {
-  def _ci : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.SI>,
-            VOP3be_si <op, !cast<VOP3_Pseudo>(NAME).Pfl> {
-    let AssemblerPredicates = [isCIOnly];
-    let DecoderNamespace = "CI";
+  multiclass VOP3be_Real_gfx7<bits<10> op> {
+    def _gfx7 :
+      VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.SI>,
+      VOP3be_gfx6_gfx7<op{8-0}, !cast<VOP3_Pseudo>(NAME).Pfl>;
  }
-}
+} // End AssemblerPredicate = isGFX7Only, DecoderNamespace = "GFX7"
+
+multiclass VOP3_Real_gfx7_gfx10<bits<10> op> :
+  VOP3_Real_gfx7<op>, VOP3_Real_gfx10<op>;
+
+multiclass VOP3be_Real_gfx7_gfx10<bits<10> op> :
+  VOP3be_Real_gfx7<op>, VOP3be_Real_gfx10<op>;
+
+defm V_QSAD_PK_U16_U8 : VOP3_Real_gfx7_gfx10<0x172>;
+defm V_MQSAD_U32_U8 : VOP3_Real_gfx7_gfx10<0x175>;
+defm V_MAD_U64_U32 : VOP3be_Real_gfx7_gfx10<0x176>;
+defm V_MAD_I64_I32 : VOP3be_Real_gfx7_gfx10<0x177>;
-defm V_QSAD_PK_U16_U8 : VOP3_Real_ci <0x172>;
-defm V_MQSAD_U32_U8 : VOP3_Real_ci <0x175>;
-defm V_MAD_U64_U32 : VOP3be_Real_ci <0x176>;
-defm V_MAD_I64_I32 : VOP3be_Real_ci <0x177>;
+//===----------------------------------------------------------------------===//
+// GFX6, GFX7, GFX10.
+//===----------------------------------------------------------------------===//
+
+let AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" in {
+  multiclass VOP3_Real_gfx6_gfx7<bits<10> op> {
+    def _gfx6_gfx7 :
+      VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.SI>,
+      VOP3e_gfx6_gfx7<op{8-0}, !cast<VOP3_Pseudo>(NAME).Pfl>;
+  }
+  multiclass VOP3be_Real_gfx6_gfx7<bits<10> op> {
+    def _gfx6_gfx7 :
+      VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.SI>,
+      VOP3be_gfx6_gfx7<op{8-0}, !cast<VOP3_Pseudo>(NAME).Pfl>;
  }
+} // End AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7"
+
+multiclass VOP3_Real_gfx6_gfx7_gfx10<bits<10> op> :
+  VOP3_Real_gfx6_gfx7<op>, VOP3_Real_gfx10<op>;
+
+multiclass VOP3be_Real_gfx6_gfx7_gfx10<bits<10> op> :
+  VOP3be_Real_gfx6_gfx7<op>, VOP3be_Real_gfx10<op>;
+
+defm V_LSHL_B64 : VOP3_Real_gfx6_gfx7<0x161>;
+defm V_LSHR_B64 : VOP3_Real_gfx6_gfx7<0x162>;
+defm V_ASHR_I64 : VOP3_Real_gfx6_gfx7<0x163>;
+
+defm V_MAD_LEGACY_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x140>;
+defm V_MAD_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x141>;
+defm V_MAD_I32_I24 : VOP3_Real_gfx6_gfx7_gfx10<0x142>;
+defm V_MAD_U32_U24 : VOP3_Real_gfx6_gfx7_gfx10<0x143>;
+defm V_CUBEID_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x144>;
+defm V_CUBESC_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x145>;
+defm V_CUBETC_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x146>;
+defm V_CUBEMA_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x147>;
+defm V_BFE_U32 : VOP3_Real_gfx6_gfx7_gfx10<0x148>;
+defm V_BFE_I32 : VOP3_Real_gfx6_gfx7_gfx10<0x149>;
+defm V_BFI_B32 : VOP3_Real_gfx6_gfx7_gfx10<0x14a>;
+defm V_FMA_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x14b>;
+defm V_FMA_F64 : VOP3_Real_gfx6_gfx7_gfx10<0x14c>;
+defm V_LERP_U8 : VOP3_Real_gfx6_gfx7_gfx10<0x14d>;
+defm V_ALIGNBIT_B32 : VOP3_Real_gfx6_gfx7_gfx10<0x14e>;
+defm V_ALIGNBYTE_B32 : VOP3_Real_gfx6_gfx7_gfx10<0x14f>;
+defm V_MULLIT_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x150>;
+defm V_MIN3_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x151>;
+defm V_MIN3_I32 : VOP3_Real_gfx6_gfx7_gfx10<0x152>;
+defm V_MIN3_U32 : VOP3_Real_gfx6_gfx7_gfx10<0x153>;
+defm V_MAX3_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x154>;
+defm V_MAX3_I32 : VOP3_Real_gfx6_gfx7_gfx10<0x155>;
+defm V_MAX3_U32 : VOP3_Real_gfx6_gfx7_gfx10<0x156>;
+defm V_MED3_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x157>;
+defm V_MED3_I32 : VOP3_Real_gfx6_gfx7_gfx10<0x158>;
+defm V_MED3_U32 : VOP3_Real_gfx6_gfx7_gfx10<0x159>;
+defm V_SAD_U8 : VOP3_Real_gfx6_gfx7_gfx10<0x15a>;
+defm V_SAD_HI_U8 : VOP3_Real_gfx6_gfx7_gfx10<0x15b>;
+defm V_SAD_U16 : VOP3_Real_gfx6_gfx7_gfx10<0x15c>;
+defm V_SAD_U32 : VOP3_Real_gfx6_gfx7_gfx10<0x15d>;
+defm V_CVT_PK_U8_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x15e>;
+defm V_DIV_FIXUP_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x15f>;
+defm V_DIV_FIXUP_F64 : VOP3_Real_gfx6_gfx7_gfx10<0x160>;
+defm V_ADD_F64 : VOP3_Real_gfx6_gfx7_gfx10<0x164>;
+defm V_MUL_F64 : VOP3_Real_gfx6_gfx7_gfx10<0x165>;
+defm V_MIN_F64 : VOP3_Real_gfx6_gfx7_gfx10<0x166>;
+defm V_MAX_F64 : VOP3_Real_gfx6_gfx7_gfx10<0x167>;
+defm V_LDEXP_F64 : VOP3_Real_gfx6_gfx7_gfx10<0x168>;
+defm V_MUL_LO_U32 : VOP3_Real_gfx6_gfx7_gfx10<0x169>;
+defm V_MUL_HI_U32 : VOP3_Real_gfx6_gfx7_gfx10<0x16a>;
+defm V_MUL_LO_I32 : VOP3_Real_gfx6_gfx7_gfx10<0x16b>;
+defm V_MUL_HI_I32 : VOP3_Real_gfx6_gfx7_gfx10<0x16c>;
+defm V_DIV_FMAS_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x16f>;
+defm V_DIV_FMAS_F64 : VOP3_Real_gfx6_gfx7_gfx10<0x170>;
+defm V_MSAD_U8 : VOP3_Real_gfx6_gfx7_gfx10<0x171>;
+defm V_MQSAD_PK_U16_U8 : VOP3_Real_gfx6_gfx7_gfx10<0x173>;
+defm V_TRIG_PREOP_F64 : VOP3_Real_gfx6_gfx7_gfx10<0x174>;
+defm V_DIV_SCALE_F32 : VOP3be_Real_gfx6_gfx7_gfx10<0x16d>;
+defm V_DIV_SCALE_F64 : VOP3be_Real_gfx6_gfx7_gfx10<0x16e>;
//===----------------------------------------------------------------------===//
-// VI
+// GFX8, GFX9 (VI).
//===----------------------------------------------------------------------===//
-let AssemblerPredicates = [isVI], DecoderNamespace = "VI" in {
multiclass VOP3_Real_vi<bits<10> op> {
  def _vi : VOP3_Real<!cast<VOP_Pseudo>(NAME), SIEncodingFamily.VI>,
@@ -757,9 +960,9 @@ multiclass VOP3Interp_Real_vi<bits<10> op> {
            VOP3Interp_vi <op, !cast<VOP_Pseudo>(NAME).Pfl>;
}
-} // End AssemblerPredicates = [isVI], DecoderNamespace = "VI"
+} // End AssemblerPredicates = [isGFX8GFX9], DecoderNamespace = "GFX8"
-let AssemblerPredicates = [isVIOnly], DecoderNamespace = "VI" in {
+let AssemblerPredicates = [isGFX8Only], DecoderNamespace = "GFX8" in {
multiclass VOP3_F16_Real_vi<bits<10> op> {
  def _vi : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.VI>,
@@ -771,9 +974,9 @@ multiclass VOP3Interp_F16_Real_vi<bits<10> op> {
            VOP3Interp_vi <op, !cast<VOP3_Pseudo>(NAME).Pfl>;
}
-} // End AssemblerPredicates = [isVIOnly], DecoderNamespace = "VI"
+} // End AssemblerPredicates = [isGFX8Only], DecoderNamespace = "GFX8"
-let AssemblerPredicates = [isGFX9], DecoderNamespace = "GFX9" in {
+let AssemblerPredicates = [isGFX9Only], DecoderNamespace = "GFX9" in {
multiclass VOP3_F16_Real_gfx9<bits<10> op, string OpName, string AsmName> {
  def _gfx9 : VOP3_Real<!cast<VOP3_Pseudo>(OpName), SIEncodingFamily.GFX9>,
@@ -807,7 +1010,7 @@ multiclass VOP3_Real_gfx9<bits<10> op, string AsmName> {
  }
}
-} // End AssemblerPredicates = [isGFX9], DecoderNamespace = "GFX9"
+} // End AssemblerPredicates = [isGFX9Only], DecoderNamespace = "GFX9"
defm V_MAD_U64_U32 : VOP3be_Real_vi <0x1E8>;
defm V_MAD_I64_I32 : VOP3be_Real_vi <0x1E9>;
diff --git a/lib/Target/AMDGPU/VOP3PInstructions.td b/lib/Target/AMDGPU/VOP3PInstructions.td
index 91b45583c848..55ee5f6577cf 100644
--- a/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -1,9 +1,8 @@
//===-- VOP3PInstructions.td - Vector Instruction Defintions --------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -70,6 +69,16 @@ def V_PK_LSHLREV_B16 : VOP3PInst<"v_pk_lshlrev_b16", VOP3_Profile<VOP_V2I16_V2I1
def V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, ashr_rev>;
def V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, lshr_rev>;
+
+// Undo sub x, c -> add x, -c canonicalization since c is more likely
+// an inline immediate than -c.
+// The constant will be emitted as a mov, and folded later.
+// TODO: We could directly encode the immediate now
+def : GCNPat<
+  (add (v2i16 (VOP3PMods0 v2i16:$src0, i32:$src0_modifiers, i1:$clamp)), NegSubInlineConstV216:$src1),
+  (V_PK_SUB_U16 $src0_modifiers, $src0, SRCMODS.OP_SEL_1, NegSubInlineConstV216:$src1, $clamp)
+>;
+
multiclass MadFmaMixPats<SDPatternOperator fma_like,
                         Instruction mix_inst,
                         Instruction mixlo_inst,
@@ -239,29 +248,39 @@ class UDot2Pat<Instruction Inst> : GCNPat <
        (AMDGPUmul_u24_oneuse (and i32:$src0, (i32 65535)),
                              (and i32:$src1, (i32 65535)))
  ),
-  (Inst (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))
->;
+  (Inst (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))> {
+  let SubtargetPredicate = !cast<VOP_Pseudo>(Inst).SubtargetPredicate;
+}
class SDot2Pat<Instruction Inst> : GCNPat <
  (add (add_oneuse (AMDGPUmul_i24_oneuse (sra i32:$src0, (i32 16)),
                                         (sra i32:$src1, (i32 16))), i32:$src2),
       (AMDGPUmul_i24_oneuse (sext_inreg i32:$src0, i16),
                             (sext_inreg i32:$src1, i16))),
-  (Inst (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))
->;
+  (Inst (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))> {
+  let SubtargetPredicate = !cast<VOP_Pseudo>(Inst).SubtargetPredicate;
+}
-let SubtargetPredicate = HasDotInsts in {
+let SubtargetPredicate = HasDot2Insts in {
def V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", VOP3_Profile<VOP_F32_V2F16_V2F16_F32>>;
def V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>>;
def V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>>;
-def V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>;
def V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>;
-def V_DOT8_I32_I4 : VOP3PInst<"v_dot8_i32_i4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>;
def V_DOT8_U32_U4 : VOP3PInst<"v_dot8_u32_u4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>;
+} // End SubtargetPredicate = HasDot2Insts
+
+let SubtargetPredicate = HasDot1Insts in {
+
+def V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>;
+def V_DOT8_I32_I4 : VOP3PInst<"v_dot8_i32_i4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>;
+
+} // End SubtargetPredicate = HasDot1Insts
+
multiclass DotPats<SDPatternOperator dot_op,
                   VOP3PInst dot_inst> {
+  let SubtargetPredicate = dot_inst.SubtargetPredicate in
  def : GCNPat <
    (dot_op (dot_inst.Pfl.Src0VT (VOP3PMods0 dot_inst.Pfl.Src0VT:$src0, i32:$src0_modifiers)),
            (dot_inst.Pfl.Src1VT (VOP3PMods dot_inst.Pfl.Src1VT:$src1, i32:$src1_modifiers)),
@@ -281,12 +300,14 @@ def : UDot2Pat<V_DOT2_U32_U16>;
def : SDot2Pat<V_DOT2_I32_I16>;
foreach Type = ["U", "I"] in
+  let SubtargetPredicate = !cast<VOP_Pseudo>("V_DOT4_"#Type#"32_"#Type#8).SubtargetPredicate in
  def : GCNPat <
    !cast<dag>(!foldl((i32 i32:$src2), [0, 1, 2, 3], lhs, y,
                      (add_oneuse lhs, (!cast<PatFrag>("Mul"#Type#"_Elt"#y) i32:$src0, i32:$src1)))),
    (!cast<VOP3PInst>("V_DOT4_"#Type#"32_"#Type#8) (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>;
foreach Type = ["U", "I"] in
+  let SubtargetPredicate = !cast<VOP_Pseudo>("V_DOT8_"#Type#"32_"#Type#4).SubtargetPredicate in
  def : GCNPat <
    !cast<dag>(!foldl((add_oneuse i32:$src2, (!cast<PatFrag>("Mul"#Type#"0_4bit") i32:$src0, i32:$src1)),
                      [1, 2, 3, 4, 5, 6, 7], lhs, y,
@@ -296,19 +317,101 @@ foreach Type = ["U", "I"] in
// Different variants of dot8 code-gen dag patterns are not generated through table-gen due to a huge increase
// in the compile time. Directly handle the pattern generated by the FE here.
foreach Type = ["U", "I"] in
+  let SubtargetPredicate = !cast<VOP_Pseudo>("V_DOT8_"#Type#"32_"#Type#4).SubtargetPredicate in
  def : GCNPat <
    !cast<dag>(!foldl((add_oneuse i32:$src2, (!cast<PatFrag>("Mul"#Type#"0_4bit") i32:$src0, i32:$src1)),
                      [7, 1, 2, 3, 4, 5, 6], lhs, y,
                      (NonACAdd_oneuse lhs, (!cast<PatFrag>("Mul"#Type#y#"_4bit") i32:$src0, i32:$src1)))),
    (!cast<VOP3PInst>("V_DOT8_"#Type#"32_"#Type#4) (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>;
-} // End SubtargetPredicate = HasDotInsts
+def ADst_32 : VOPDstOperand<AGPR_32>;
+def ADst_128 : VOPDstOperand<AReg_128>;
+def ADst_512 : VOPDstOperand<AReg_512>;
+def ADst_1024 : VOPDstOperand<AReg_1024>;
+
+def VOPProfileAccRead : VOP3_Profile<VOP_I32_I32, VOP3_MAI> {
+  let Src0RC64 = ARegSrc_32;
+}
+
+def VOPProfileAccWrite : VOP3_Profile<VOP_I32_I32, VOP3_MAI> {
+  let DstRC = ADst_32;
+  let Src0RC64 = VISrc_b32;
+}
+
+class VOPProfileMAI<VOPProfile P, RegisterOperand _SrcRC, RegisterOperand _DstRC,
+                    RegisterOperand SrcABRC = AVSrc_32>
+  : VOP3_Profile<P, VOP3_MAI> {
+  let DstRC = _DstRC;
+  let Src0RC64 = SrcABRC;
+  let Src1RC64 = SrcABRC;
+  let Src2RC64 = _SrcRC;
+  let HasOpSel = 0;
+  let HasClamp = 0;
+  let HasModifiers = 0;
+  let Asm64 = " $vdst, $src0, $src1, $src2$cbsz$abid$blgp";
+  let Ins64 = (ins Src0RC64:$src0, Src1RC64:$src1, Src2RC64:$src2, cbsz:$cbsz, abid:$abid, blgp:$blgp);
+}
+
+def VOPProfileMAI_F32_F32_X4 : VOPProfileMAI<VOP_V4F32_F32_F32_V4F32, AISrc_128_f32, ADst_128>;
+def VOPProfileMAI_F32_F32_X16 : VOPProfileMAI<VOP_V16F32_F32_F32_V16F32, AISrc_512_f32, ADst_512>;
+def VOPProfileMAI_F32_F32_X32 : VOPProfileMAI<VOP_V32F32_F32_F32_V32F32, AISrc_1024_f32, ADst_1024>;
+def VOPProfileMAI_I32_I32_X4 : VOPProfileMAI<VOP_V4I32_I32_I32_V4I32, AISrc_128_b32, ADst_128>;
+def VOPProfileMAI_I32_I32_X16 : VOPProfileMAI<VOP_V16I32_I32_I32_V16I32, AISrc_512_b32, ADst_512>;
+def VOPProfileMAI_I32_I32_X32 : VOPProfileMAI<VOP_V32I32_I32_I32_V32I32, AISrc_1024_b32, ADst_1024>;
+def VOPProfileMAI_F32_V2I16_X4 : VOPProfileMAI<VOP_V4F32_V2I16_V2I16_V4F32, AISrc_128_b32, ADst_128>;
+def VOPProfileMAI_F32_V2I16_X16 : VOPProfileMAI<VOP_V16F32_V2I16_V2I16_V16F32, AISrc_512_b32, ADst_512>;
+def VOPProfileMAI_F32_V2I16_X32 : VOPProfileMAI<VOP_V32F32_V2I16_V2I16_V32F32, AISrc_1024_b32, ADst_1024>;
+def VOPProfileMAI_F32_V4F16_X4 : VOPProfileMAI<VOP_V4F32_V4F16_V4F16_V4F32, AISrc_128_b32, ADst_128, AVSrc_64>;
+def VOPProfileMAI_F32_V4F16_X16 : VOPProfileMAI<VOP_V16F32_V4F16_V4F16_V16F32, AISrc_512_b32, ADst_512, AVSrc_64>;
+def VOPProfileMAI_F32_V4F16_X32 : VOPProfileMAI<VOP_V32F32_V4F16_V4F16_V32F32, AISrc_1024_b32, ADst_1024, AVSrc_64>;
+
+let Predicates = [HasMAIInsts] in {
+def V_ACCVGPR_READ_B32 : VOP3Inst<"v_accvgpr_read_b32", VOPProfileAccRead>;
+def V_ACCVGPR_WRITE_B32 : VOP3Inst<"v_accvgpr_write_b32", VOPProfileAccWrite> {
+  let isMoveImm = 1;
+}
+
+let isConvergent = 1 in {
+def V_MFMA_F32_4X4X1F32 : VOP3Inst<"v_mfma_f32_4x4x1f32", VOPProfileMAI_F32_F32_X4, int_amdgcn_mfma_f32_4x4x1f32>;
+def V_MFMA_F32_4X4X4F16 : VOP3Inst<"v_mfma_f32_4x4x4f16", VOPProfileMAI_F32_V4F16_X4, int_amdgcn_mfma_f32_4x4x4f16>;
+def V_MFMA_I32_4X4X4I8 : VOP3Inst<"v_mfma_i32_4x4x4i8", VOPProfileMAI_I32_I32_X4, int_amdgcn_mfma_i32_4x4x4i8>;
+def V_MFMA_F32_4X4X2BF16 : VOP3Inst<"v_mfma_f32_4x4x2bf16", VOPProfileMAI_F32_V2I16_X4, int_amdgcn_mfma_f32_4x4x2bf16>;
+def V_MFMA_F32_16X16X1F32 : VOP3Inst<"v_mfma_f32_16x16x1f32", VOPProfileMAI_F32_F32_X16, int_amdgcn_mfma_f32_16x16x1f32>;
+def V_MFMA_F32_16X16X4F32 : VOP3Inst<"v_mfma_f32_16x16x4f32", VOPProfileMAI_F32_F32_X4, int_amdgcn_mfma_f32_16x16x4f32>;
+def V_MFMA_F32_16X16X4F16 : VOP3Inst<"v_mfma_f32_16x16x4f16", VOPProfileMAI_F32_V4F16_X16, int_amdgcn_mfma_f32_16x16x4f16>;
+def V_MFMA_F32_16X16X16F16 : VOP3Inst<"v_mfma_f32_16x16x16f16", VOPProfileMAI_F32_V4F16_X4, int_amdgcn_mfma_f32_16x16x16f16>;
+def V_MFMA_I32_16X16X4I8 : VOP3Inst<"v_mfma_i32_16x16x4i8", VOPProfileMAI_I32_I32_X16, int_amdgcn_mfma_i32_16x16x4i8>;
+def V_MFMA_I32_16X16X16I8 : VOP3Inst<"v_mfma_i32_16x16x16i8", VOPProfileMAI_I32_I32_X4, int_amdgcn_mfma_i32_16x16x16i8>;
+def V_MFMA_F32_16X16X2BF16 : VOP3Inst<"v_mfma_f32_16x16x2bf16", VOPProfileMAI_F32_V2I16_X16, int_amdgcn_mfma_f32_16x16x2bf16>;
+def V_MFMA_F32_16X16X8BF16 : VOP3Inst<"v_mfma_f32_16x16x8bf16", VOPProfileMAI_F32_V2I16_X4, int_amdgcn_mfma_f32_16x16x8bf16>;
+def V_MFMA_F32_32X32X1F32 : VOP3Inst<"v_mfma_f32_32x32x1f32", VOPProfileMAI_F32_F32_X32, int_amdgcn_mfma_f32_32x32x1f32>;
+def V_MFMA_F32_32X32X2F32 : VOP3Inst<"v_mfma_f32_32x32x2f32", VOPProfileMAI_F32_F32_X16, int_amdgcn_mfma_f32_32x32x2f32>;
+def V_MFMA_F32_32X32X4F16 : VOP3Inst<"v_mfma_f32_32x32x4f16", VOPProfileMAI_F32_V4F16_X32, int_amdgcn_mfma_f32_32x32x4f16>;
+def V_MFMA_F32_32X32X8F16 : VOP3Inst<"v_mfma_f32_32x32x8f16", VOPProfileMAI_F32_V4F16_X16, int_amdgcn_mfma_f32_32x32x8f16>;
+def V_MFMA_I32_32X32X4I8 : VOP3Inst<"v_mfma_i32_32x32x4i8", VOPProfileMAI_I32_I32_X32, int_amdgcn_mfma_i32_32x32x4i8>;
+def V_MFMA_I32_32X32X8I8 : VOP3Inst<"v_mfma_i32_32x32x8i8", VOPProfileMAI_I32_I32_X16, int_amdgcn_mfma_i32_32x32x8i8>;
+def V_MFMA_F32_32X32X2BF16 : VOP3Inst<"v_mfma_f32_32x32x2bf16", VOPProfileMAI_F32_V2I16_X32, int_amdgcn_mfma_f32_32x32x2bf16>;
+def V_MFMA_F32_32X32X4BF16 : VOP3Inst<"v_mfma_f32_32x32x4bf16", VOPProfileMAI_F32_V2I16_X16, int_amdgcn_mfma_f32_32x32x4bf16>;
+} // End isConvergent = 1
+
+} // End SubtargetPredicate = HasMAIInsts
+
+def : MnemonicAlias<"v_accvgpr_read", "v_accvgpr_read_b32">;
+def : MnemonicAlias<"v_accvgpr_write", "v_accvgpr_write_b32">;
multiclass VOP3P_Real_vi<bits<10> op> {
  def _vi : VOP3P_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.VI>,
            VOP3Pe <op, !cast<VOP3_Pseudo>(NAME).Pfl> {
    let AssemblerPredicates = [HasVOP3PInsts];
-    let DecoderNamespace = "VI";
+    let DecoderNamespace = "GFX8";
+  }
+}
+
+multiclass VOP3P_Real_MAI<bits<10> op> {
+  def _vi : VOP3P_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.VI>,
+            VOP3Pe_MAI <op, !cast<VOP3_Pseudo>(NAME).Pfl> {
+    let AssemblerPredicates = [HasMAIInsts];
+    let DecoderNamespace = "GFX8";
  }
}
@@ -352,14 +455,97 @@ defm V_FMA_MIXHI_F16 : VOP3P_Real_vi <0x3a2>;
}
-let SubtargetPredicate = HasDotInsts in {
+let SubtargetPredicate = HasDot2Insts in {
defm V_DOT2_F32_F16 : VOP3P_Real_vi <0x3a3>;
defm V_DOT2_I32_I16 : VOP3P_Real_vi <0x3a6>;
defm V_DOT2_U32_U16 : VOP3P_Real_vi <0x3a7>;
-defm V_DOT4_I32_I8 : VOP3P_Real_vi <0x3a8>;
defm V_DOT4_U32_U8 : VOP3P_Real_vi <0x3a9>;
-defm V_DOT8_I32_I4 : VOP3P_Real_vi <0x3aa>;
defm V_DOT8_U32_U4 : VOP3P_Real_vi <0x3ab>;
-} // End SubtargetPredicate = HasDotInsts
+} // End SubtargetPredicate = HasDot2Insts
+
+let SubtargetPredicate = HasDot1Insts in {
+
+defm V_DOT4_I32_I8 : VOP3P_Real_vi <0x3a8>;
+defm V_DOT8_I32_I4 : VOP3P_Real_vi <0x3aa>;
+
+} // End SubtargetPredicate = HasDot1Insts
+
+let SubtargetPredicate = HasMAIInsts in {
+
+defm V_ACCVGPR_READ_B32 : VOP3P_Real_MAI <0x3d8>;
+defm V_ACCVGPR_WRITE_B32 : VOP3P_Real_MAI <0x3d9>;
+defm V_MFMA_F32_32X32X1F32 : VOP3P_Real_MAI <0x3c0>;
+defm V_MFMA_F32_16X16X1F32 : VOP3P_Real_MAI <0x3c1>;
+defm V_MFMA_F32_4X4X1F32 : VOP3P_Real_MAI <0x3c2>;
+defm V_MFMA_F32_32X32X2F32 : VOP3P_Real_MAI <0x3c4>;
+defm V_MFMA_F32_16X16X4F32 : VOP3P_Real_MAI <0x3c5>;
+defm V_MFMA_F32_32X32X4F16 : VOP3P_Real_MAI <0x3c8>;
+defm V_MFMA_F32_16X16X4F16 : VOP3P_Real_MAI <0x3c9>;
+defm V_MFMA_F32_4X4X4F16 : VOP3P_Real_MAI <0x3ca>;
+defm V_MFMA_F32_32X32X8F16 : VOP3P_Real_MAI <0x3cc>;
+defm V_MFMA_F32_16X16X16F16 : VOP3P_Real_MAI <0x3cd>;
+defm V_MFMA_I32_32X32X4I8 : VOP3P_Real_MAI <0x3d0>;
+defm V_MFMA_I32_16X16X4I8 : VOP3P_Real_MAI <0x3d1>;
+defm V_MFMA_I32_4X4X4I8 : VOP3P_Real_MAI <0x3d2>;
+defm V_MFMA_I32_32X32X8I8 : VOP3P_Real_MAI <0x3d4>;
+defm V_MFMA_I32_16X16X16I8 : VOP3P_Real_MAI <0x3d5>;
+defm V_MFMA_F32_32X32X2BF16 : VOP3P_Real_MAI <0x3e8>;
+defm V_MFMA_F32_16X16X2BF16 : VOP3P_Real_MAI <0x3e9>;
+defm V_MFMA_F32_4X4X2BF16 : VOP3P_Real_MAI <0x3eb>;
+defm V_MFMA_F32_32X32X4BF16 : VOP3P_Real_MAI <0x3ec>;
+defm V_MFMA_F32_16X16X8BF16 : VOP3P_Real_MAI <0x3ed>;
+
+} // End SubtargetPredicate = HasMAIInsts
+
+//===----------------------------------------------------------------------===//
+// GFX10.
+//===----------------------------------------------------------------------===//
+
+let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
+  multiclass VOP3P_Real_gfx10<bits<10> op> {
+    def _gfx10 : VOP3P_Real<!cast<VOP3P_Pseudo>(NAME), SIEncodingFamily.GFX10>,
+                 VOP3Pe_gfx10 <op, !cast<VOP3P_Pseudo>(NAME).Pfl>;
+  }
+} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10"
+
+defm V_PK_MAD_I16 : VOP3P_Real_gfx10<0x000>;
+defm V_PK_MUL_LO_U16 : VOP3P_Real_gfx10<0x001>;
+defm V_PK_ADD_I16 : VOP3P_Real_gfx10<0x002>;
+defm V_PK_SUB_I16 : VOP3P_Real_gfx10<0x003>;
+defm V_PK_LSHLREV_B16 : VOP3P_Real_gfx10<0x004>;
+defm V_PK_LSHRREV_B16 : VOP3P_Real_gfx10<0x005>;
+defm V_PK_ASHRREV_I16 : VOP3P_Real_gfx10<0x006>;
+defm V_PK_MAX_I16 : VOP3P_Real_gfx10<0x007>;
+defm V_PK_MIN_I16 : VOP3P_Real_gfx10<0x008>;
+defm V_PK_MAD_U16 : VOP3P_Real_gfx10<0x009>;
+defm V_PK_ADD_U16 : VOP3P_Real_gfx10<0x00a>;
+defm V_PK_SUB_U16 : VOP3P_Real_gfx10<0x00b>;
+defm V_PK_MAX_U16 : VOP3P_Real_gfx10<0x00c>;
+defm V_PK_MIN_U16 : VOP3P_Real_gfx10<0x00d>;
+defm V_PK_FMA_F16 : VOP3P_Real_gfx10<0x00e>;
+defm V_PK_ADD_F16 : VOP3P_Real_gfx10<0x00f>;
+defm V_PK_MUL_F16 : VOP3P_Real_gfx10<0x010>;
+defm V_PK_MIN_F16 : VOP3P_Real_gfx10<0x011>;
+defm V_PK_MAX_F16 : VOP3P_Real_gfx10<0x012>;
+defm V_FMA_MIX_F32 : VOP3P_Real_gfx10<0x020>;
+defm V_FMA_MIXLO_F16 : VOP3P_Real_gfx10<0x021>;
+defm V_FMA_MIXHI_F16 : VOP3P_Real_gfx10<0x022>;
+
+let SubtargetPredicate = HasDot2Insts in {
+
+defm V_DOT2_F32_F16 : VOP3P_Real_gfx10 <0x013>;
+defm V_DOT2_I32_I16 : VOP3P_Real_gfx10 <0x014>;
+defm V_DOT2_U32_U16 : VOP3P_Real_gfx10 <0x015>;
+defm V_DOT4_U32_U8 : VOP3P_Real_gfx10 <0x017>;
+defm V_DOT8_U32_U4 : VOP3P_Real_gfx10 <0x019>;
+
+} // End SubtargetPredicate = HasDot2Insts
+
+let SubtargetPredicate = HasDot1Insts in {
+
+defm V_DOT4_I32_I8 : VOP3P_Real_gfx10 <0x016>;
+defm V_DOT8_I32_I4 : VOP3P_Real_gfx10 <0x018>;
+
+} // End SubtargetPredicate = HasDot1Insts
diff --git a/lib/Target/AMDGPU/VOPCInstructions.td b/lib/Target/AMDGPU/VOPCInstructions.td
index 091cac8cd35c..b3513e383d10 100644
--- a/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/lib/Target/AMDGPU/VOPCInstructions.td
@@ -1,9 +1,8 @@
//===-- VOPCInstructions.td - Vector Instruction Defintions ---------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -54,14 +53,29 @@ class VOPC_SDWA9e <bits<8> op, VOPProfile P> : VOP_SDWA9Be <P> {
// an explicit $dst.
class VOPC_Profile<list<SchedReadWrite> sched, ValueType vt0, ValueType vt1 = vt0> :
  VOPProfile <[i1, vt0, vt1, untyped]> {
-  let Asm32 = "vcc, $src0, $src1";
+  let Asm32 = "$src0, $src1"; // The destination for 32-bit encoding is implicit.
  let HasDst32 = 0;
-  let Outs64 = (outs VOPDstS64:$sdst);
+  let Outs64 = (outs VOPDstS64orS32:$sdst);
  list<SchedReadWrite> Schedule = sched;
}
-class VOPC_Pseudo <string opName, VOPC_Profile P, list<dag> pattern=[]> :
+class VOPC_NoSdst_Profile<list<SchedReadWrite> sched, ValueType vt0,
+                          ValueType vt1 = vt0> :
+  VOPC_Profile<sched, vt0, vt1> {
+  let Outs64 = (outs );
+  let OutsSDWA = (outs );
+  let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0,
+                     Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1,
+                     src0_sel:$src0_sel, src1_sel:$src1_sel);
+  let Asm64 = !if(isFloatType<Src0VT>.ret, "$src0_modifiers, $src1_modifiers$clamp",
+                                           "$src0, $src1");
+  let AsmSDWA9 = "$src0_modifiers, $src1_modifiers $src0_sel $src1_sel";
+  let EmitDst = 0;
+}
+
+class VOPC_Pseudo <string opName, VOPC_Profile P, list<dag> pattern=[],
+                   bit DefVcc = 1> :
  InstSI<(outs), P.Ins32, "", pattern>,
  VOP <opName>,
  SIMCInstr<opName#"_e32", SIEncodingFamily.NONE> {
@@ -81,9 +95,7 @@ class VOPC_Pseudo <string opName, VOPC_Profile P, list<dag> pattern=[]> :
  let VALU = 1;
  let VOPC = 1;
  let Uses = [EXEC];
-  let Defs = [VCC];
-
-  let SubtargetPredicate = isGCN;
+  let Defs = !if(DefVcc, [VCC], []);
  VOPProfile Pfl = P;
}
@@ -115,8 +127,9 @@ class VOPC_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
}
// This class is used only with VOPC instructions. Use $sdst for out operand
-class VOPCInstAlias <VOP3_Pseudo ps, Instruction inst, VOPProfile p = ps.Pfl> :
-  InstAlias <ps.OpName#" "#p.Asm32, (inst)>, PredicateControl {
+class VOPCInstAlias <VOP3_Pseudo ps, Instruction inst,
+                     string Asm32 = ps.Pfl.Asm32, VOPProfile p = ps.Pfl> :
+  InstAlias <ps.OpName#" "#Asm32, (inst)>, PredicateControl {
field bit isCompare;
field bit isCommutable;
@@ -149,6 +162,27 @@ class VOPCInstAlias <VOP3_Pseudo ps, Instruction inst, VOPProfile p = ps.Pfl> :
  let SubtargetPredicate = AssemblerPredicate;
}
+multiclass VOPCInstAliases <string OpName, string Arch> {
+  def : VOPCInstAlias <!cast<VOP3_Pseudo>(OpName#"_e64"),
+                       !cast<Instruction>(OpName#"_e32_"#Arch)>;
+  let WaveSizePredicate = isWave32 in {
+    def : VOPCInstAlias <!cast<VOP3_Pseudo>(OpName#"_e64"),
+                         !cast<Instruction>(OpName#"_e32_"#Arch),
+                         "vcc_lo, "#!cast<VOP3_Pseudo>(OpName#"_e64").Pfl.Asm32>;
+  }
+  let WaveSizePredicate = isWave64 in {
+    def : VOPCInstAlias <!cast<VOP3_Pseudo>(OpName#"_e64"),
+                         !cast<Instruction>(OpName#"_e32_"#Arch),
+                         "vcc, "#!cast<VOP3_Pseudo>(OpName#"_e64").Pfl.Asm32>;
+  }
+}
+
+multiclass VOPCXInstAliases <string OpName, string Arch> {
+  def : VOPCInstAlias <!cast<VOP3_Pseudo>(OpName#"_e64"),
+                       !cast<Instruction>(OpName#"_e32_"#Arch)>;
+}
+
+
class getVOPCPat64 <PatLeaf cond, VOPProfile P> : LetDummies {
  list<dag> ret = !if(P.HasModifiers,
      [(set i1:$sdst,
@@ -161,6 +195,10 @@ class getVOPCPat64 <PatLeaf cond, VOPProfile P> : LetDummies {
      [(set i1:$sdst, (setcc P.Src0VT:$src0, P.Src1VT:$src1, cond))]);
}
+class VCMPXNoSDstTable <bit has_sdst, string Name> {
+  bit HasSDst = has_sdst;
+  string NoSDstOp = Name;
+}
multiclass VOPC_Pseudos <string opName,
                         VOPC_Profile P,
@@ -169,7 +207,8 @@ multiclass VOPC_Pseudos <string opName,
                         bit DefExec = 0> {
  def _e32 : VOPC_Pseudo <opName, P>,
-             Commutable_REV<revOp#"_e32", !eq(revOp, opName)> {
+             Commutable_REV<revOp#"_e32", !eq(revOp, opName)>,
+             VCMPXNoSDstTable<1, opName#"_e32"> {
    let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
    let SchedRW = P.Schedule;
    let isConvergent = DefExec;
@@ -178,7 +217,8 @@ multiclass VOPC_Pseudos <string opName,
  }
  def _e64 : VOP3_Pseudo<opName, P, getVOPCPat64<cond, P>.ret>,
-             Commutable_REV<revOp#"_e64", !eq(revOp, opName)> {
+             Commutable_REV<revOp#"_e64", !eq(revOp, opName)>,
+             VCMPXNoSDstTable<1, opName#"_e64"> {
    let Defs = !if(DefExec, [EXEC], []);
    let SchedRW = P.Schedule;
    let isCompare = 1;
@@ -193,6 +233,44 @@ multiclass VOPC_Pseudos <string opName,
  }
}
+let SubtargetPredicate = HasSdstCMPX in {
+multiclass VOPCX_Pseudos <string opName,
+                          VOPC_Profile P, VOPC_Profile P_NoSDst,
+                          PatLeaf cond = COND_NULL,
+                          string revOp = opName> :
+  VOPC_Pseudos <opName, P, cond, revOp, 1> {
+
+  def _nosdst_e32 : VOPC_Pseudo <opName#"_nosdst", P_NoSDst, [], 0>,
+                    Commutable_REV<revOp#"_nosdst_e32", !eq(revOp, opName)>,
+                    VCMPXNoSDstTable<0, opName#"_e32"> {
+    let Defs = [EXEC];
+    let SchedRW = P_NoSDst.Schedule;
+    let isConvergent = 1;
+    let isCompare = 1;
+    let isCommutable = 1;
+    let SubtargetPredicate = HasNoSdstCMPX;
+  }
+
+  def _nosdst_e64 : VOP3_Pseudo<opName#"_nosdst", P_NoSDst>,
+                    Commutable_REV<revOp#"_nosdst_e64", !eq(revOp, opName)>,
+                    VCMPXNoSDstTable<0, opName#"_e64"> {
+    let Defs = [EXEC];
+    let SchedRW = P_NoSDst.Schedule;
+    let isCompare = 1;
+    let isCommutable = 1;
+    let SubtargetPredicate = HasNoSdstCMPX;
+  }
+
+  def _nosdst_sdwa : VOPC_SDWA_Pseudo <opName#"_nosdst", P_NoSDst> {
+    let Defs = [EXEC];
+    let SchedRW = P_NoSDst.Schedule;
+    let isConvergent = 1;
+    let isCompare = 1;
+    let SubtargetPredicate = HasNoSdstCMPX;
+  }
+}
+} // End SubtargetPredicate = HasSdstCMPX
+
def VOPC_I1_F16_F16 : VOPC_Profile<[Write32Bit], f16>;
def VOPC_I1_F32_F32 : VOPC_Profile<[Write32Bit], f32>;
def VOPC_I1_F64_F64 : VOPC_Profile<[WriteDoubleAdd], f64>;
@@ -200,6 +278,13 @@ def VOPC_I1_I16_I16 : VOPC_Profile<[Write32Bit], i16>;
def VOPC_I1_I32_I32 : VOPC_Profile<[Write32Bit], i32>;
def VOPC_I1_I64_I64 : VOPC_Profile<[Write64Bit], i64>;
+def VOPC_F16_F16 : VOPC_NoSdst_Profile<[Write32Bit], f16>;
+def VOPC_F32_F32 : VOPC_NoSdst_Profile<[Write32Bit], f32>;
+def VOPC_F64_F64 : VOPC_NoSdst_Profile<[Write64Bit], f64>;
+def VOPC_I16_I16 : VOPC_NoSdst_Profile<[Write32Bit], i16>;
+def VOPC_I32_I32 : VOPC_NoSdst_Profile<[Write32Bit], i32>;
+def VOPC_I64_I64 : VOPC_NoSdst_Profile<[Write64Bit], i64>;
+
multiclass VOPC_F16 <string opName, PatLeaf cond = COND_NULL, string revOp = opName> :
  VOPC_Pseudos <opName, VOPC_I1_F16_F16, cond, revOp, 0>;
@@ -219,22 +304,22 @@ multiclass VOPC_I64 <string opName, PatLeaf cond = COND_NULL, string revOp = opN
  VOPC_Pseudos <opName, VOPC_I1_I64_I64, cond, revOp, 0>;
multiclass VOPCX_F16 <string opName, string revOp = opName> :
-  VOPC_Pseudos <opName, VOPC_I1_F16_F16, COND_NULL, revOp, 1>;
+  VOPCX_Pseudos <opName, VOPC_I1_F16_F16, VOPC_F16_F16, COND_NULL, revOp>;
multiclass VOPCX_F32 <string opName, string revOp = opName> :
-  VOPC_Pseudos <opName, VOPC_I1_F32_F32, COND_NULL, revOp, 1>;
+  VOPCX_Pseudos <opName, VOPC_I1_F32_F32, VOPC_F32_F32, COND_NULL, revOp>;
multiclass VOPCX_F64 <string opName, string revOp = opName> :
-  VOPC_Pseudos <opName, VOPC_I1_F64_F64, COND_NULL, revOp, 1>;
+  VOPCX_Pseudos <opName, VOPC_I1_F64_F64, VOPC_F64_F64, COND_NULL, revOp>;
multiclass VOPCX_I16 <string opName, string revOp = opName> :
-  VOPC_Pseudos <opName, VOPC_I1_I16_I16, COND_NULL, revOp, 1>;
+  VOPCX_Pseudos <opName, VOPC_I1_I16_I16, VOPC_I16_I16, COND_NULL, revOp>;
multiclass VOPCX_I32 <string opName, string revOp = opName> :
-  VOPC_Pseudos <opName, VOPC_I1_I32_I32, COND_NULL, revOp, 1>;
+  VOPCX_Pseudos <opName, VOPC_I1_I32_I32, VOPC_I32_I32, COND_NULL, revOp>;
multiclass VOPCX_I64 <string opName, string revOp = opName> :
-  VOPC_Pseudos <opName, VOPC_I1_I64_I64, COND_NULL, revOp, 1>;
+  VOPCX_Pseudos <opName, VOPC_I1_I64_I64, VOPC_I64_I64, COND_NULL, revOp>;
//===----------------------------------------------------------------------===//
@@ -309,7 +394,7 @@ defm V_CMPX_NEQ_F64 : VOPCX_F64 <"v_cmpx_neq_f64">;
defm V_CMPX_NLT_F64 : VOPCX_F64 <"v_cmpx_nlt_f64">;
defm V_CMPX_TRU_F64 : VOPCX_F64 <"v_cmpx_tru_f64">;
-let SubtargetPredicate = isSICI in {
+let SubtargetPredicate = isGFX6GFX7 in {
defm V_CMPS_F_F32 : VOPC_F32 <"v_cmps_f_f32">;
defm V_CMPS_LT_F32 : VOPC_F32 <"v_cmps_lt_f32", COND_NULL, "v_cmps_gt_f32">;
@@ -379,7 +464,7 @@ defm V_CMPSX_NEQ_F64 : VOPCX_F64 <"v_cmpsx_neq_f64">;
defm V_CMPSX_NLT_F64 : VOPCX_F64 <"v_cmpsx_nlt_f64">;
defm V_CMPSX_TRU_F64 : VOPCX_F64 <"v_cmpsx_tru_f64">;
-} // End SubtargetPredicate = isSICI
+} // End SubtargetPredicate = isGFX6GFX7
let SubtargetPredicate = Has16BitInsts in {
@@ -546,6 +631,18 @@ class VOPC_Class_Profile<list<SchedReadWrite> sched, ValueType vt> :
  let HasOMod = 0;
}
+class VOPC_Class_NoSdst_Profile<list<SchedReadWrite> sched, ValueType vt> :
+  VOPC_Class_Profile<sched, vt> {
+  let Outs64 = (outs );
+  let OutsSDWA = (outs );
+  let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0,
+                     Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1,
+                     src0_sel:$src0_sel, src1_sel:$src1_sel);
+  let Asm64 = "$src0_modifiers, $src1";
+  let AsmSDWA9 = "$src0_modifiers, 
$src1_modifiers $src0_sel $src1_sel"; + let EmitDst = 0; +} + class getVOPCClassPat64 <VOPProfile P> { list<dag> ret = [(set i1:$sdst, @@ -556,46 +653,85 @@ class getVOPCClassPat64 <VOPProfile P> { // Special case for class instructions which only have modifiers on // the 1st source operand. -multiclass VOPC_Class_Pseudos <string opName, VOPC_Profile p, bit DefExec> { - def _e32 : VOPC_Pseudo <opName, p> { - let Defs = !if(DefExec, [VCC, EXEC], [VCC]); +multiclass VOPC_Class_Pseudos <string opName, VOPC_Profile p, bit DefExec, + bit DefVcc = 1> { + def _e32 : VOPC_Pseudo <opName, p>, + VCMPXNoSDstTable<1, opName#"_e32"> { + let Defs = !if(DefExec, !if(DefVcc, [VCC, EXEC], [EXEC]), + !if(DefVcc, [VCC], [])); let SchedRW = p.Schedule; let isConvergent = DefExec; } - def _e64 : VOP3_Pseudo<opName, p, getVOPCClassPat64<p>.ret> { + def _e64 : VOP3_Pseudo<opName, p, getVOPCClassPat64<p>.ret>, + VCMPXNoSDstTable<1, opName#"_e64"> { let Defs = !if(DefExec, [EXEC], []); let SchedRW = p.Schedule; } def _sdwa : VOPC_SDWA_Pseudo <opName, p> { - let Defs = !if(DefExec, [VCC, EXEC], [VCC]); + let Defs = !if(DefExec, !if(DefVcc, [VCC, EXEC], [EXEC]), + !if(DefVcc, [VCC], [])); let SchedRW = p.Schedule; let isConvergent = DefExec; } } +let SubtargetPredicate = HasSdstCMPX in { +multiclass VOPCX_Class_Pseudos <string opName, + VOPC_Profile P, + VOPC_Profile P_NoSDst> : + VOPC_Class_Pseudos <opName, P, 1, 1> { + + def _nosdst_e32 : VOPC_Pseudo <opName#"_nosdst", P_NoSDst, [], 0>, + VCMPXNoSDstTable<0, opName#"_e32"> { + let Defs = [EXEC]; + let SchedRW = P_NoSDst.Schedule; + let isConvergent = 1; + let SubtargetPredicate = HasNoSdstCMPX; + } + + def _nosdst_e64 : VOP3_Pseudo<opName#"_nosdst", P_NoSDst>, + VCMPXNoSDstTable<0, opName#"_e64"> { + let Defs = [EXEC]; + let SchedRW = P_NoSDst.Schedule; + let SubtargetPredicate = HasNoSdstCMPX; + } + + def _nosdst_sdwa : VOPC_SDWA_Pseudo <opName#"_nosdst", P_NoSDst> { + let Defs = [EXEC]; + let SchedRW = P_NoSDst.Schedule; + let isConvergent = 1; + let SubtargetPredicate = HasNoSdstCMPX; + } +} +} // End SubtargetPredicate = HasSdstCMPX + def VOPC_I1_F16_I32 : VOPC_Class_Profile<[Write32Bit], f16>; def VOPC_I1_F32_I32 : VOPC_Class_Profile<[Write32Bit], f32>; def VOPC_I1_F64_I32 : VOPC_Class_Profile<[WriteDoubleAdd], f64>; +def VOPC_F16_I32 : VOPC_Class_NoSdst_Profile<[Write32Bit], f16>; +def VOPC_F32_I32 : VOPC_Class_NoSdst_Profile<[Write32Bit], f32>; +def VOPC_F64_I32 : VOPC_Class_NoSdst_Profile<[Write64Bit], f64>; + multiclass VOPC_CLASS_F16 <string opName> : VOPC_Class_Pseudos <opName, VOPC_I1_F16_I32, 0>; multiclass VOPCX_CLASS_F16 <string opName> : - VOPC_Class_Pseudos <opName, VOPC_I1_F16_I32, 1>; + VOPCX_Class_Pseudos <opName, VOPC_I1_F16_I32, VOPC_F16_I32>; multiclass VOPC_CLASS_F32 <string opName> : VOPC_Class_Pseudos <opName, VOPC_I1_F32_I32, 0>; multiclass VOPCX_CLASS_F32 <string opName> : - VOPC_Class_Pseudos <opName, VOPC_I1_F32_I32, 1>; + VOPCX_Class_Pseudos <opName, VOPC_I1_F32_I32, VOPC_F32_I32>; multiclass VOPC_CLASS_F64 <string opName> : VOPC_Class_Pseudos <opName, VOPC_I1_F64_I32, 0>; multiclass VOPCX_CLASS_F64 <string opName> : - VOPC_Class_Pseudos <opName, VOPC_I1_F64_I32, 1>; + VOPCX_Class_Pseudos <opName, VOPC_I1_F64_I32, VOPC_F64_I32>; defm V_CMP_CLASS_F32 : VOPC_CLASS_F32 <"v_cmp_class_f32">; defm V_CMPX_CLASS_F32 : VOPCX_CLASS_F32 <"v_cmpx_class_f32">; @@ -608,342 +744,471 @@ defm V_CMPX_CLASS_F16 : VOPCX_CLASS_F16 <"v_cmpx_class_f16">; // V_ICMPIntrinsic Pattern. 
//===----------------------------------------------------------------------===// -class ICMP_Pattern <PatLeaf cond, Instruction inst, ValueType vt> : GCNPat < - (AMDGPUsetcc vt:$src0, vt:$src1, cond), - (inst $src0, $src1) ->; - -def : ICMP_Pattern <COND_EQ, V_CMP_EQ_U32_e64, i32>; -def : ICMP_Pattern <COND_NE, V_CMP_NE_U32_e64, i32>; -def : ICMP_Pattern <COND_UGT, V_CMP_GT_U32_e64, i32>; -def : ICMP_Pattern <COND_UGE, V_CMP_GE_U32_e64, i32>; -def : ICMP_Pattern <COND_ULT, V_CMP_LT_U32_e64, i32>; -def : ICMP_Pattern <COND_ULE, V_CMP_LE_U32_e64, i32>; -def : ICMP_Pattern <COND_SGT, V_CMP_GT_I32_e64, i32>; -def : ICMP_Pattern <COND_SGE, V_CMP_GE_I32_e64, i32>; -def : ICMP_Pattern <COND_SLT, V_CMP_LT_I32_e64, i32>; -def : ICMP_Pattern <COND_SLE, V_CMP_LE_I32_e64, i32>; - -def : ICMP_Pattern <COND_EQ, V_CMP_EQ_U64_e64, i64>; -def : ICMP_Pattern <COND_NE, V_CMP_NE_U64_e64, i64>; -def : ICMP_Pattern <COND_UGT, V_CMP_GT_U64_e64, i64>; -def : ICMP_Pattern <COND_UGE, V_CMP_GE_U64_e64, i64>; -def : ICMP_Pattern <COND_ULT, V_CMP_LT_U64_e64, i64>; -def : ICMP_Pattern <COND_ULE, V_CMP_LE_U64_e64, i64>; -def : ICMP_Pattern <COND_SGT, V_CMP_GT_I64_e64, i64>; -def : ICMP_Pattern <COND_SGE, V_CMP_GE_I64_e64, i64>; -def : ICMP_Pattern <COND_SLT, V_CMP_LT_I64_e64, i64>; -def : ICMP_Pattern <COND_SLE, V_CMP_LE_I64_e64, i64>; - -def : ICMP_Pattern <COND_EQ, V_CMP_EQ_U16_e64, i16>; -def : ICMP_Pattern <COND_NE, V_CMP_NE_U16_e64, i16>; -def : ICMP_Pattern <COND_UGT, V_CMP_GT_U16_e64, i16>; -def : ICMP_Pattern <COND_UGE, V_CMP_GE_U16_e64, i16>; -def : ICMP_Pattern <COND_ULT, V_CMP_LT_U16_e64, i16>; -def : ICMP_Pattern <COND_ULE, V_CMP_LE_U16_e64, i16>; -def : ICMP_Pattern <COND_SGT, V_CMP_GT_I16_e64, i16>; -def : ICMP_Pattern <COND_SGE, V_CMP_GE_I16_e64, i16>; -def : ICMP_Pattern <COND_SLT, V_CMP_LT_I16_e64, i16>; -def : ICMP_Pattern <COND_SLE, V_CMP_LE_I16_e64, i16>; - -class FCMP_Pattern <PatLeaf cond, Instruction inst, ValueType vt> : GCNPat < - (i64 (AMDGPUsetcc (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)), - (vt (VOP3Mods vt:$src1, i32:$src1_modifiers)), cond)), - (inst $src0_modifiers, $src0, $src1_modifiers, $src1, - DSTCLAMP.NONE) ->; - -def : FCMP_Pattern <COND_OEQ, V_CMP_EQ_F32_e64, f32>; -def : FCMP_Pattern <COND_ONE, V_CMP_NEQ_F32_e64, f32>; -def : FCMP_Pattern <COND_OGT, V_CMP_GT_F32_e64, f32>; -def : FCMP_Pattern <COND_OGE, V_CMP_GE_F32_e64, f32>; -def : FCMP_Pattern <COND_OLT, V_CMP_LT_F32_e64, f32>; -def : FCMP_Pattern <COND_OLE, V_CMP_LE_F32_e64, f32>; - -def : FCMP_Pattern <COND_OEQ, V_CMP_EQ_F64_e64, f64>; -def : FCMP_Pattern <COND_ONE, V_CMP_NEQ_F64_e64, f64>; -def : FCMP_Pattern <COND_OGT, V_CMP_GT_F64_e64, f64>; -def : FCMP_Pattern <COND_OGE, V_CMP_GE_F64_e64, f64>; -def : FCMP_Pattern <COND_OLT, V_CMP_LT_F64_e64, f64>; -def : FCMP_Pattern <COND_OLE, V_CMP_LE_F64_e64, f64>; - -def : FCMP_Pattern <COND_OEQ, V_CMP_EQ_F16_e64, f16>; -def : FCMP_Pattern <COND_ONE, V_CMP_NEQ_F16_e64, f16>; -def : FCMP_Pattern <COND_OGT, V_CMP_GT_F16_e64, f16>; -def : FCMP_Pattern <COND_OGE, V_CMP_GE_F16_e64, f16>; -def : FCMP_Pattern <COND_OLT, V_CMP_LT_F16_e64, f16>; -def : FCMP_Pattern <COND_OLE, V_CMP_LE_F16_e64, f16>; - - -def : FCMP_Pattern <COND_UEQ, V_CMP_NLG_F32_e64, f32>; -def : FCMP_Pattern <COND_UNE, V_CMP_NEQ_F32_e64, f32>; -def : FCMP_Pattern <COND_UGT, V_CMP_NLE_F32_e64, f32>; -def : FCMP_Pattern <COND_UGE, V_CMP_NLT_F32_e64, f32>; -def : FCMP_Pattern <COND_ULT, V_CMP_NGE_F32_e64, f32>; -def : FCMP_Pattern <COND_ULE, V_CMP_NGT_F32_e64, f32>; - -def : FCMP_Pattern <COND_UEQ, V_CMP_NLG_F64_e64, f64>; 
-def : FCMP_Pattern <COND_UNE, V_CMP_NEQ_F64_e64, f64>;
-def : FCMP_Pattern <COND_UGT, V_CMP_NLE_F64_e64, f64>;
-def : FCMP_Pattern <COND_UGE, V_CMP_NLT_F64_e64, f64>;
-def : FCMP_Pattern <COND_ULT, V_CMP_NGE_F64_e64, f64>;
-def : FCMP_Pattern <COND_ULE, V_CMP_NGT_F64_e64, f64>;
-
-def : FCMP_Pattern <COND_UEQ, V_CMP_NLG_F16_e64, f16>;
-def : FCMP_Pattern <COND_UNE, V_CMP_NEQ_F16_e64, f16>;
-def : FCMP_Pattern <COND_UGT, V_CMP_NLE_F16_e64, f16>;
-def : FCMP_Pattern <COND_UGE, V_CMP_NLT_F16_e64, f16>;
-def : FCMP_Pattern <COND_ULT, V_CMP_NGE_F16_e64, f16>;
-def : FCMP_Pattern <COND_ULE, V_CMP_NGT_F16_e64, f16>;
+// We need to use COPY_TO_REGCLASS to work around the problem where
+// ReplaceAllUsesWith() complains that it cannot replace i1 <-> i64/i32
+// if the node was not morphed in place.
+multiclass ICMP_Pattern <PatLeaf cond, Instruction inst, ValueType vt> {
+  let WaveSizePredicate = isWave64 in
+  def : GCNPat <
+    (i64 (AMDGPUsetcc vt:$src0, vt:$src1, cond)),
+    (i64 (COPY_TO_REGCLASS (inst $src0, $src1), SReg_64))
+  >;
+
+  let WaveSizePredicate = isWave32 in
+  def : GCNPat <
+    (i32 (AMDGPUsetcc vt:$src0, vt:$src1, cond)),
+    (i32 (COPY_TO_REGCLASS (inst $src0, $src1), SReg_32))
+  >;
+}
+
+defm : ICMP_Pattern <COND_EQ, V_CMP_EQ_U32_e64, i32>;
+defm : ICMP_Pattern <COND_NE, V_CMP_NE_U32_e64, i32>;
+defm : ICMP_Pattern <COND_UGT, V_CMP_GT_U32_e64, i32>;
+defm : ICMP_Pattern <COND_UGE, V_CMP_GE_U32_e64, i32>;
+defm : ICMP_Pattern <COND_ULT, V_CMP_LT_U32_e64, i32>;
+defm : ICMP_Pattern <COND_ULE, V_CMP_LE_U32_e64, i32>;
+defm : ICMP_Pattern <COND_SGT, V_CMP_GT_I32_e64, i32>;
+defm : ICMP_Pattern <COND_SGE, V_CMP_GE_I32_e64, i32>;
+defm : ICMP_Pattern <COND_SLT, V_CMP_LT_I32_e64, i32>;
+defm : ICMP_Pattern <COND_SLE, V_CMP_LE_I32_e64, i32>;
+
+defm : ICMP_Pattern <COND_EQ, V_CMP_EQ_U64_e64, i64>;
+defm : ICMP_Pattern <COND_NE, V_CMP_NE_U64_e64, i64>;
+defm : ICMP_Pattern <COND_UGT, V_CMP_GT_U64_e64, i64>;
+defm : ICMP_Pattern <COND_UGE, V_CMP_GE_U64_e64, i64>;
+defm : ICMP_Pattern <COND_ULT, V_CMP_LT_U64_e64, i64>;
+defm : ICMP_Pattern <COND_ULE, V_CMP_LE_U64_e64, i64>;
+defm : ICMP_Pattern <COND_SGT, V_CMP_GT_I64_e64, i64>;
+defm : ICMP_Pattern <COND_SGE, V_CMP_GE_I64_e64, i64>;
+defm : ICMP_Pattern <COND_SLT, V_CMP_LT_I64_e64, i64>;
+defm : ICMP_Pattern <COND_SLE, V_CMP_LE_I64_e64, i64>;
+
+defm : ICMP_Pattern <COND_EQ, V_CMP_EQ_U16_e64, i16>;
+defm : ICMP_Pattern <COND_NE, V_CMP_NE_U16_e64, i16>;
+defm : ICMP_Pattern <COND_UGT, V_CMP_GT_U16_e64, i16>;
+defm : ICMP_Pattern <COND_UGE, V_CMP_GE_U16_e64, i16>;
+defm : ICMP_Pattern <COND_ULT, V_CMP_LT_U16_e64, i16>;
+defm : ICMP_Pattern <COND_ULE, V_CMP_LE_U16_e64, i16>;
+defm : ICMP_Pattern <COND_SGT, V_CMP_GT_I16_e64, i16>;
+defm : ICMP_Pattern <COND_SGE, V_CMP_GE_I16_e64, i16>;
+defm : ICMP_Pattern <COND_SLT, V_CMP_LT_I16_e64, i16>;
+defm : ICMP_Pattern <COND_SLE, V_CMP_LE_I16_e64, i16>;
+
+multiclass FCMP_Pattern <PatLeaf cond, Instruction inst, ValueType vt> {
+  let WaveSizePredicate = isWave64 in
+  def : GCNPat <
+    (i64 (AMDGPUsetcc (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)),
+                      (vt (VOP3Mods vt:$src1, i32:$src1_modifiers)), cond)),
+    (i64 (COPY_TO_REGCLASS (inst $src0_modifiers, $src0, $src1_modifiers, $src1,
+                            DSTCLAMP.NONE), SReg_64))
+  >;
+
+  let WaveSizePredicate = isWave32 in
+  def : GCNPat <
+    (i32 (AMDGPUsetcc (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)),
+                      (vt (VOP3Mods vt:$src1, i32:$src1_modifiers)), cond)),
+    (i32 (COPY_TO_REGCLASS (inst $src0_modifiers, $src0, $src1_modifiers, $src1,
+                            DSTCLAMP.NONE), SReg_32))
+  >;
+}
+
+defm : FCMP_Pattern 
<COND_OEQ, V_CMP_EQ_F32_e64, f32>; +defm : FCMP_Pattern <COND_ONE, V_CMP_NEQ_F32_e64, f32>; +defm : FCMP_Pattern <COND_OGT, V_CMP_GT_F32_e64, f32>; +defm : FCMP_Pattern <COND_OGE, V_CMP_GE_F32_e64, f32>; +defm : FCMP_Pattern <COND_OLT, V_CMP_LT_F32_e64, f32>; +defm : FCMP_Pattern <COND_OLE, V_CMP_LE_F32_e64, f32>; + +defm : FCMP_Pattern <COND_OEQ, V_CMP_EQ_F64_e64, f64>; +defm : FCMP_Pattern <COND_ONE, V_CMP_NEQ_F64_e64, f64>; +defm : FCMP_Pattern <COND_OGT, V_CMP_GT_F64_e64, f64>; +defm : FCMP_Pattern <COND_OGE, V_CMP_GE_F64_e64, f64>; +defm : FCMP_Pattern <COND_OLT, V_CMP_LT_F64_e64, f64>; +defm : FCMP_Pattern <COND_OLE, V_CMP_LE_F64_e64, f64>; + +defm : FCMP_Pattern <COND_OEQ, V_CMP_EQ_F16_e64, f16>; +defm : FCMP_Pattern <COND_ONE, V_CMP_NEQ_F16_e64, f16>; +defm : FCMP_Pattern <COND_OGT, V_CMP_GT_F16_e64, f16>; +defm : FCMP_Pattern <COND_OGE, V_CMP_GE_F16_e64, f16>; +defm : FCMP_Pattern <COND_OLT, V_CMP_LT_F16_e64, f16>; +defm : FCMP_Pattern <COND_OLE, V_CMP_LE_F16_e64, f16>; + + +defm : FCMP_Pattern <COND_UEQ, V_CMP_NLG_F32_e64, f32>; +defm : FCMP_Pattern <COND_UNE, V_CMP_NEQ_F32_e64, f32>; +defm : FCMP_Pattern <COND_UGT, V_CMP_NLE_F32_e64, f32>; +defm : FCMP_Pattern <COND_UGE, V_CMP_NLT_F32_e64, f32>; +defm : FCMP_Pattern <COND_ULT, V_CMP_NGE_F32_e64, f32>; +defm : FCMP_Pattern <COND_ULE, V_CMP_NGT_F32_e64, f32>; + +defm : FCMP_Pattern <COND_UEQ, V_CMP_NLG_F64_e64, f64>; +defm : FCMP_Pattern <COND_UNE, V_CMP_NEQ_F64_e64, f64>; +defm : FCMP_Pattern <COND_UGT, V_CMP_NLE_F64_e64, f64>; +defm : FCMP_Pattern <COND_UGE, V_CMP_NLT_F64_e64, f64>; +defm : FCMP_Pattern <COND_ULT, V_CMP_NGE_F64_e64, f64>; +defm : FCMP_Pattern <COND_ULE, V_CMP_NGT_F64_e64, f64>; + +defm : FCMP_Pattern <COND_UEQ, V_CMP_NLG_F16_e64, f16>; +defm : FCMP_Pattern <COND_UNE, V_CMP_NEQ_F16_e64, f16>; +defm : FCMP_Pattern <COND_UGT, V_CMP_NLE_F16_e64, f16>; +defm : FCMP_Pattern <COND_UGE, V_CMP_NLT_F16_e64, f16>; +defm : FCMP_Pattern <COND_ULT, V_CMP_NGE_F16_e64, f16>; +defm : FCMP_Pattern <COND_ULE, V_CMP_NGT_F16_e64, f16>; //===----------------------------------------------------------------------===// -// Target +// Target-specific instruction encodings. //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// -// SI +// GFX10. //===----------------------------------------------------------------------===// -multiclass VOPC_Real_si <bits<9> op> { - let AssemblerPredicates = [isSICI], DecoderNamespace = "SICI" in { - def _e32_si : - VOPC_Real<!cast<VOPC_Pseudo>(NAME#"_e32"), SIEncodingFamily.SI>, - VOPCe<op{7-0}>; - - def _e64_si : - VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>, - VOP3a_si <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> { - // Encoding used for VOPC instructions encoded as VOP3 - // Differs from VOP3e by destination name (sdst) as VOPC doesn't have vector dst - bits<8> sdst; - let Inst{7-0} = sdst; - } +let AssemblerPredicate = isGFX10Plus in { + multiclass VOPC_Real_gfx10<bits<9> op> { + let DecoderNamespace = "GFX10" in { + def _e32_gfx10 : + VOPC_Real<!cast<VOPC_Pseudo>(NAME#"_e32"), SIEncodingFamily.GFX10>, + VOPCe<op{7-0}>; + def _e64_gfx10 : + VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX10>, + VOP3a_gfx10<{0, op}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> { + // Encoding used for VOPC instructions encoded as VOP3 differs from + // VOP3e by destination name (sdst) as VOPC doesn't have vector dst. 
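+        // For example, in wave32 mode "v_cmp_lt_f32_e64 s0, v0, v1" writes
+        // the per-lane comparison mask to the scalar register s0, so bits
+        // 7-0 of the VOP3 word hold that scalar destination rather than a
+        // VGPR.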
+ bits<8> sdst; + let Inst{7-0} = sdst; + } + } // End DecoderNamespace = "GFX10" + + def _sdwa_gfx10 : + VOP_SDWA10_Real<!cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa")>, + VOPC_SDWA9e<op{7-0}, !cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa").Pfl>; + + defm : VOPCInstAliases<NAME, "gfx10">; } - def : VOPCInstAlias <!cast<VOP3_Pseudo>(NAME#"_e64"), - !cast<Instruction>(NAME#"_e32_si")> { - let AssemblerPredicate = isSICI; + + multiclass VOPCX_Real_gfx10<bits<9> op> { + let DecoderNamespace = "GFX10" in { + def _e32_gfx10 : + VOPC_Real<!cast<VOPC_Pseudo>(NAME#"_nosdst_e32"), SIEncodingFamily.GFX10>, + VOPCe<op{7-0}> { + let AsmString = !subst("_nosdst", "", !cast<VOPC_Pseudo>(NAME#"_nosdst_e32").PseudoInstr) + # " " # !cast<VOPC_Pseudo>(NAME#"_nosdst_e32").AsmOperands; + } + + def _e64_gfx10 : + VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_nosdst_e64"), SIEncodingFamily.GFX10>, + VOP3a_gfx10<{0, op}, !cast<VOP3_Pseudo>(NAME#"_nosdst_e64").Pfl> { + let Inst{7-0} = ?; // sdst + let AsmString = !subst("_nosdst", "", !cast<VOP3_Pseudo>(NAME#"_nosdst_e64").Mnemonic) + # "{_e64} " # !cast<VOP3_Pseudo>(NAME#"_nosdst_e64").AsmOperands; + } + } // End DecoderNamespace = "GFX10" + + def _sdwa_gfx10 : + VOP_SDWA10_Real<!cast<VOPC_SDWA_Pseudo>(NAME#"_nosdst_sdwa")>, + VOPC_SDWA9e<op{7-0}, !cast<VOPC_SDWA_Pseudo>(NAME#"_nosdst_sdwa").Pfl> { + let AsmString = !subst("_nosdst", "", !cast<VOPC_SDWA_Pseudo>(NAME#"_nosdst_sdwa").Mnemonic) + # "{_sdwa} " # !cast<VOPC_SDWA_Pseudo>(NAME#"_nosdst_sdwa").AsmOperands9; + } + + defm : VOPCXInstAliases<NAME, "gfx10">; } -} +} // End AssemblerPredicate = isGFX10Plus + +defm V_CMP_LT_I16 : VOPC_Real_gfx10<0x089>; +defm V_CMP_EQ_I16 : VOPC_Real_gfx10<0x08a>; +defm V_CMP_LE_I16 : VOPC_Real_gfx10<0x08b>; +defm V_CMP_GT_I16 : VOPC_Real_gfx10<0x08c>; +defm V_CMP_NE_I16 : VOPC_Real_gfx10<0x08d>; +defm V_CMP_GE_I16 : VOPC_Real_gfx10<0x08e>; +defm V_CMP_CLASS_F16 : VOPC_Real_gfx10<0x08f>; +defm V_CMPX_LT_I16 : VOPCX_Real_gfx10<0x099>; +defm V_CMPX_EQ_I16 : VOPCX_Real_gfx10<0x09a>; +defm V_CMPX_LE_I16 : VOPCX_Real_gfx10<0x09b>; +defm V_CMPX_GT_I16 : VOPCX_Real_gfx10<0x09c>; +defm V_CMPX_NE_I16 : VOPCX_Real_gfx10<0x09d>; +defm V_CMPX_GE_I16 : VOPCX_Real_gfx10<0x09e>; +defm V_CMPX_CLASS_F16 : VOPCX_Real_gfx10<0x09f>; +defm V_CMP_LT_U16 : VOPC_Real_gfx10<0x0a9>; +defm V_CMP_EQ_U16 : VOPC_Real_gfx10<0x0aa>; +defm V_CMP_LE_U16 : VOPC_Real_gfx10<0x0ab>; +defm V_CMP_GT_U16 : VOPC_Real_gfx10<0x0ac>; +defm V_CMP_NE_U16 : VOPC_Real_gfx10<0x0ad>; +defm V_CMP_GE_U16 : VOPC_Real_gfx10<0x0ae>; +defm V_CMPX_LT_U16 : VOPCX_Real_gfx10<0x0b9>; +defm V_CMPX_EQ_U16 : VOPCX_Real_gfx10<0x0ba>; +defm V_CMPX_LE_U16 : VOPCX_Real_gfx10<0x0bb>; +defm V_CMPX_GT_U16 : VOPCX_Real_gfx10<0x0bc>; +defm V_CMPX_NE_U16 : VOPCX_Real_gfx10<0x0bd>; +defm V_CMPX_GE_U16 : VOPCX_Real_gfx10<0x0be>; +defm V_CMP_F_F16 : VOPC_Real_gfx10<0x0c8>; +defm V_CMP_LT_F16 : VOPC_Real_gfx10<0x0c9>; +defm V_CMP_EQ_F16 : VOPC_Real_gfx10<0x0ca>; +defm V_CMP_LE_F16 : VOPC_Real_gfx10<0x0cb>; +defm V_CMP_GT_F16 : VOPC_Real_gfx10<0x0cc>; +defm V_CMP_LG_F16 : VOPC_Real_gfx10<0x0cd>; +defm V_CMP_GE_F16 : VOPC_Real_gfx10<0x0ce>; +defm V_CMP_O_F16 : VOPC_Real_gfx10<0x0cf>; +defm V_CMPX_F_F16 : VOPCX_Real_gfx10<0x0d8>; +defm V_CMPX_LT_F16 : VOPCX_Real_gfx10<0x0d9>; +defm V_CMPX_EQ_F16 : VOPCX_Real_gfx10<0x0da>; +defm V_CMPX_LE_F16 : VOPCX_Real_gfx10<0x0db>; +defm V_CMPX_GT_F16 : VOPCX_Real_gfx10<0x0dc>; +defm V_CMPX_LG_F16 : VOPCX_Real_gfx10<0x0dd>; +defm V_CMPX_GE_F16 : VOPCX_Real_gfx10<0x0de>; +defm V_CMPX_O_F16 : VOPCX_Real_gfx10<0x0df>; +defm V_CMP_U_F16 : 
VOPC_Real_gfx10<0x0e8>; +defm V_CMP_NGE_F16 : VOPC_Real_gfx10<0x0e9>; +defm V_CMP_NLG_F16 : VOPC_Real_gfx10<0x0ea>; +defm V_CMP_NGT_F16 : VOPC_Real_gfx10<0x0eb>; +defm V_CMP_NLE_F16 : VOPC_Real_gfx10<0x0ec>; +defm V_CMP_NEQ_F16 : VOPC_Real_gfx10<0x0ed>; +defm V_CMP_NLT_F16 : VOPC_Real_gfx10<0x0ee>; +defm V_CMP_TRU_F16 : VOPC_Real_gfx10<0x0ef>; +defm V_CMPX_U_F16 : VOPCX_Real_gfx10<0x0f8>; +defm V_CMPX_NGE_F16 : VOPCX_Real_gfx10<0x0f9>; +defm V_CMPX_NLG_F16 : VOPCX_Real_gfx10<0x0fa>; +defm V_CMPX_NGT_F16 : VOPCX_Real_gfx10<0x0fb>; +defm V_CMPX_NLE_F16 : VOPCX_Real_gfx10<0x0fc>; +defm V_CMPX_NEQ_F16 : VOPCX_Real_gfx10<0x0fd>; +defm V_CMPX_NLT_F16 : VOPCX_Real_gfx10<0x0fe>; +defm V_CMPX_TRU_F16 : VOPCX_Real_gfx10<0x0ff>; -defm V_CMP_F_F32 : VOPC_Real_si <0x0>; -defm V_CMP_LT_F32 : VOPC_Real_si <0x1>; -defm V_CMP_EQ_F32 : VOPC_Real_si <0x2>; -defm V_CMP_LE_F32 : VOPC_Real_si <0x3>; -defm V_CMP_GT_F32 : VOPC_Real_si <0x4>; -defm V_CMP_LG_F32 : VOPC_Real_si <0x5>; -defm V_CMP_GE_F32 : VOPC_Real_si <0x6>; -defm V_CMP_O_F32 : VOPC_Real_si <0x7>; -defm V_CMP_U_F32 : VOPC_Real_si <0x8>; -defm V_CMP_NGE_F32 : VOPC_Real_si <0x9>; -defm V_CMP_NLG_F32 : VOPC_Real_si <0xa>; -defm V_CMP_NGT_F32 : VOPC_Real_si <0xb>; -defm V_CMP_NLE_F32 : VOPC_Real_si <0xc>; -defm V_CMP_NEQ_F32 : VOPC_Real_si <0xd>; -defm V_CMP_NLT_F32 : VOPC_Real_si <0xe>; -defm V_CMP_TRU_F32 : VOPC_Real_si <0xf>; - -defm V_CMPX_F_F32 : VOPC_Real_si <0x10>; -defm V_CMPX_LT_F32 : VOPC_Real_si <0x11>; -defm V_CMPX_EQ_F32 : VOPC_Real_si <0x12>; -defm V_CMPX_LE_F32 : VOPC_Real_si <0x13>; -defm V_CMPX_GT_F32 : VOPC_Real_si <0x14>; -defm V_CMPX_LG_F32 : VOPC_Real_si <0x15>; -defm V_CMPX_GE_F32 : VOPC_Real_si <0x16>; -defm V_CMPX_O_F32 : VOPC_Real_si <0x17>; -defm V_CMPX_U_F32 : VOPC_Real_si <0x18>; -defm V_CMPX_NGE_F32 : VOPC_Real_si <0x19>; -defm V_CMPX_NLG_F32 : VOPC_Real_si <0x1a>; -defm V_CMPX_NGT_F32 : VOPC_Real_si <0x1b>; -defm V_CMPX_NLE_F32 : VOPC_Real_si <0x1c>; -defm V_CMPX_NEQ_F32 : VOPC_Real_si <0x1d>; -defm V_CMPX_NLT_F32 : VOPC_Real_si <0x1e>; -defm V_CMPX_TRU_F32 : VOPC_Real_si <0x1f>; - -defm V_CMP_F_F64 : VOPC_Real_si <0x20>; -defm V_CMP_LT_F64 : VOPC_Real_si <0x21>; -defm V_CMP_EQ_F64 : VOPC_Real_si <0x22>; -defm V_CMP_LE_F64 : VOPC_Real_si <0x23>; -defm V_CMP_GT_F64 : VOPC_Real_si <0x24>; -defm V_CMP_LG_F64 : VOPC_Real_si <0x25>; -defm V_CMP_GE_F64 : VOPC_Real_si <0x26>; -defm V_CMP_O_F64 : VOPC_Real_si <0x27>; -defm V_CMP_U_F64 : VOPC_Real_si <0x28>; -defm V_CMP_NGE_F64 : VOPC_Real_si <0x29>; -defm V_CMP_NLG_F64 : VOPC_Real_si <0x2a>; -defm V_CMP_NGT_F64 : VOPC_Real_si <0x2b>; -defm V_CMP_NLE_F64 : VOPC_Real_si <0x2c>; -defm V_CMP_NEQ_F64 : VOPC_Real_si <0x2d>; -defm V_CMP_NLT_F64 : VOPC_Real_si <0x2e>; -defm V_CMP_TRU_F64 : VOPC_Real_si <0x2f>; - -defm V_CMPX_F_F64 : VOPC_Real_si <0x30>; -defm V_CMPX_LT_F64 : VOPC_Real_si <0x31>; -defm V_CMPX_EQ_F64 : VOPC_Real_si <0x32>; -defm V_CMPX_LE_F64 : VOPC_Real_si <0x33>; -defm V_CMPX_GT_F64 : VOPC_Real_si <0x34>; -defm V_CMPX_LG_F64 : VOPC_Real_si <0x35>; -defm V_CMPX_GE_F64 : VOPC_Real_si <0x36>; -defm V_CMPX_O_F64 : VOPC_Real_si <0x37>; -defm V_CMPX_U_F64 : VOPC_Real_si <0x38>; -defm V_CMPX_NGE_F64 : VOPC_Real_si <0x39>; -defm V_CMPX_NLG_F64 : VOPC_Real_si <0x3a>; -defm V_CMPX_NGT_F64 : VOPC_Real_si <0x3b>; -defm V_CMPX_NLE_F64 : VOPC_Real_si <0x3c>; -defm V_CMPX_NEQ_F64 : VOPC_Real_si <0x3d>; -defm V_CMPX_NLT_F64 : VOPC_Real_si <0x3e>; -defm V_CMPX_TRU_F64 : VOPC_Real_si <0x3f>; - -defm V_CMPS_F_F32 : VOPC_Real_si <0x40>; -defm V_CMPS_LT_F32 : VOPC_Real_si <0x41>; -defm 
V_CMPS_EQ_F32 : VOPC_Real_si <0x42>; -defm V_CMPS_LE_F32 : VOPC_Real_si <0x43>; -defm V_CMPS_GT_F32 : VOPC_Real_si <0x44>; -defm V_CMPS_LG_F32 : VOPC_Real_si <0x45>; -defm V_CMPS_GE_F32 : VOPC_Real_si <0x46>; -defm V_CMPS_O_F32 : VOPC_Real_si <0x47>; -defm V_CMPS_U_F32 : VOPC_Real_si <0x48>; -defm V_CMPS_NGE_F32 : VOPC_Real_si <0x49>; -defm V_CMPS_NLG_F32 : VOPC_Real_si <0x4a>; -defm V_CMPS_NGT_F32 : VOPC_Real_si <0x4b>; -defm V_CMPS_NLE_F32 : VOPC_Real_si <0x4c>; -defm V_CMPS_NEQ_F32 : VOPC_Real_si <0x4d>; -defm V_CMPS_NLT_F32 : VOPC_Real_si <0x4e>; -defm V_CMPS_TRU_F32 : VOPC_Real_si <0x4f>; - -defm V_CMPSX_F_F32 : VOPC_Real_si <0x50>; -defm V_CMPSX_LT_F32 : VOPC_Real_si <0x51>; -defm V_CMPSX_EQ_F32 : VOPC_Real_si <0x52>; -defm V_CMPSX_LE_F32 : VOPC_Real_si <0x53>; -defm V_CMPSX_GT_F32 : VOPC_Real_si <0x54>; -defm V_CMPSX_LG_F32 : VOPC_Real_si <0x55>; -defm V_CMPSX_GE_F32 : VOPC_Real_si <0x56>; -defm V_CMPSX_O_F32 : VOPC_Real_si <0x57>; -defm V_CMPSX_U_F32 : VOPC_Real_si <0x58>; -defm V_CMPSX_NGE_F32 : VOPC_Real_si <0x59>; -defm V_CMPSX_NLG_F32 : VOPC_Real_si <0x5a>; -defm V_CMPSX_NGT_F32 : VOPC_Real_si <0x5b>; -defm V_CMPSX_NLE_F32 : VOPC_Real_si <0x5c>; -defm V_CMPSX_NEQ_F32 : VOPC_Real_si <0x5d>; -defm V_CMPSX_NLT_F32 : VOPC_Real_si <0x5e>; -defm V_CMPSX_TRU_F32 : VOPC_Real_si <0x5f>; - -defm V_CMPS_F_F64 : VOPC_Real_si <0x60>; -defm V_CMPS_LT_F64 : VOPC_Real_si <0x61>; -defm V_CMPS_EQ_F64 : VOPC_Real_si <0x62>; -defm V_CMPS_LE_F64 : VOPC_Real_si <0x63>; -defm V_CMPS_GT_F64 : VOPC_Real_si <0x64>; -defm V_CMPS_LG_F64 : VOPC_Real_si <0x65>; -defm V_CMPS_GE_F64 : VOPC_Real_si <0x66>; -defm V_CMPS_O_F64 : VOPC_Real_si <0x67>; -defm V_CMPS_U_F64 : VOPC_Real_si <0x68>; -defm V_CMPS_NGE_F64 : VOPC_Real_si <0x69>; -defm V_CMPS_NLG_F64 : VOPC_Real_si <0x6a>; -defm V_CMPS_NGT_F64 : VOPC_Real_si <0x6b>; -defm V_CMPS_NLE_F64 : VOPC_Real_si <0x6c>; -defm V_CMPS_NEQ_F64 : VOPC_Real_si <0x6d>; -defm V_CMPS_NLT_F64 : VOPC_Real_si <0x6e>; -defm V_CMPS_TRU_F64 : VOPC_Real_si <0x6f>; - -defm V_CMPSX_F_F64 : VOPC_Real_si <0x70>; -defm V_CMPSX_LT_F64 : VOPC_Real_si <0x71>; -defm V_CMPSX_EQ_F64 : VOPC_Real_si <0x72>; -defm V_CMPSX_LE_F64 : VOPC_Real_si <0x73>; -defm V_CMPSX_GT_F64 : VOPC_Real_si <0x74>; -defm V_CMPSX_LG_F64 : VOPC_Real_si <0x75>; -defm V_CMPSX_GE_F64 : VOPC_Real_si <0x76>; -defm V_CMPSX_O_F64 : VOPC_Real_si <0x77>; -defm V_CMPSX_U_F64 : VOPC_Real_si <0x78>; -defm V_CMPSX_NGE_F64 : VOPC_Real_si <0x79>; -defm V_CMPSX_NLG_F64 : VOPC_Real_si <0x7a>; -defm V_CMPSX_NGT_F64 : VOPC_Real_si <0x7b>; -defm V_CMPSX_NLE_F64 : VOPC_Real_si <0x7c>; -defm V_CMPSX_NEQ_F64 : VOPC_Real_si <0x7d>; -defm V_CMPSX_NLT_F64 : VOPC_Real_si <0x7e>; -defm V_CMPSX_TRU_F64 : VOPC_Real_si <0x7f>; - -defm V_CMP_F_I32 : VOPC_Real_si <0x80>; -defm V_CMP_LT_I32 : VOPC_Real_si <0x81>; -defm V_CMP_EQ_I32 : VOPC_Real_si <0x82>; -defm V_CMP_LE_I32 : VOPC_Real_si <0x83>; -defm V_CMP_GT_I32 : VOPC_Real_si <0x84>; -defm V_CMP_NE_I32 : VOPC_Real_si <0x85>; -defm V_CMP_GE_I32 : VOPC_Real_si <0x86>; -defm V_CMP_T_I32 : VOPC_Real_si <0x87>; - -defm V_CMPX_F_I32 : VOPC_Real_si <0x90>; -defm V_CMPX_LT_I32 : VOPC_Real_si <0x91>; -defm V_CMPX_EQ_I32 : VOPC_Real_si <0x92>; -defm V_CMPX_LE_I32 : VOPC_Real_si <0x93>; -defm V_CMPX_GT_I32 : VOPC_Real_si <0x94>; -defm V_CMPX_NE_I32 : VOPC_Real_si <0x95>; -defm V_CMPX_GE_I32 : VOPC_Real_si <0x96>; -defm V_CMPX_T_I32 : VOPC_Real_si <0x97>; - -defm V_CMP_F_I64 : VOPC_Real_si <0xa0>; -defm V_CMP_LT_I64 : VOPC_Real_si <0xa1>; -defm V_CMP_EQ_I64 : VOPC_Real_si <0xa2>; -defm V_CMP_LE_I64 : 
VOPC_Real_si <0xa3>; -defm V_CMP_GT_I64 : VOPC_Real_si <0xa4>; -defm V_CMP_NE_I64 : VOPC_Real_si <0xa5>; -defm V_CMP_GE_I64 : VOPC_Real_si <0xa6>; -defm V_CMP_T_I64 : VOPC_Real_si <0xa7>; - -defm V_CMPX_F_I64 : VOPC_Real_si <0xb0>; -defm V_CMPX_LT_I64 : VOPC_Real_si <0xb1>; -defm V_CMPX_EQ_I64 : VOPC_Real_si <0xb2>; -defm V_CMPX_LE_I64 : VOPC_Real_si <0xb3>; -defm V_CMPX_GT_I64 : VOPC_Real_si <0xb4>; -defm V_CMPX_NE_I64 : VOPC_Real_si <0xb5>; -defm V_CMPX_GE_I64 : VOPC_Real_si <0xb6>; -defm V_CMPX_T_I64 : VOPC_Real_si <0xb7>; - -defm V_CMP_F_U32 : VOPC_Real_si <0xc0>; -defm V_CMP_LT_U32 : VOPC_Real_si <0xc1>; -defm V_CMP_EQ_U32 : VOPC_Real_si <0xc2>; -defm V_CMP_LE_U32 : VOPC_Real_si <0xc3>; -defm V_CMP_GT_U32 : VOPC_Real_si <0xc4>; -defm V_CMP_NE_U32 : VOPC_Real_si <0xc5>; -defm V_CMP_GE_U32 : VOPC_Real_si <0xc6>; -defm V_CMP_T_U32 : VOPC_Real_si <0xc7>; - -defm V_CMPX_F_U32 : VOPC_Real_si <0xd0>; -defm V_CMPX_LT_U32 : VOPC_Real_si <0xd1>; -defm V_CMPX_EQ_U32 : VOPC_Real_si <0xd2>; -defm V_CMPX_LE_U32 : VOPC_Real_si <0xd3>; -defm V_CMPX_GT_U32 : VOPC_Real_si <0xd4>; -defm V_CMPX_NE_U32 : VOPC_Real_si <0xd5>; -defm V_CMPX_GE_U32 : VOPC_Real_si <0xd6>; -defm V_CMPX_T_U32 : VOPC_Real_si <0xd7>; - -defm V_CMP_F_U64 : VOPC_Real_si <0xe0>; -defm V_CMP_LT_U64 : VOPC_Real_si <0xe1>; -defm V_CMP_EQ_U64 : VOPC_Real_si <0xe2>; -defm V_CMP_LE_U64 : VOPC_Real_si <0xe3>; -defm V_CMP_GT_U64 : VOPC_Real_si <0xe4>; -defm V_CMP_NE_U64 : VOPC_Real_si <0xe5>; -defm V_CMP_GE_U64 : VOPC_Real_si <0xe6>; -defm V_CMP_T_U64 : VOPC_Real_si <0xe7>; - -defm V_CMPX_F_U64 : VOPC_Real_si <0xf0>; -defm V_CMPX_LT_U64 : VOPC_Real_si <0xf1>; -defm V_CMPX_EQ_U64 : VOPC_Real_si <0xf2>; -defm V_CMPX_LE_U64 : VOPC_Real_si <0xf3>; -defm V_CMPX_GT_U64 : VOPC_Real_si <0xf4>; -defm V_CMPX_NE_U64 : VOPC_Real_si <0xf5>; -defm V_CMPX_GE_U64 : VOPC_Real_si <0xf6>; -defm V_CMPX_T_U64 : VOPC_Real_si <0xf7>; - -defm V_CMP_CLASS_F32 : VOPC_Real_si <0x88>; -defm V_CMPX_CLASS_F32 : VOPC_Real_si <0x98>; -defm V_CMP_CLASS_F64 : VOPC_Real_si <0xa8>; -defm V_CMPX_CLASS_F64 : VOPC_Real_si <0xb8>; +//===----------------------------------------------------------------------===// +// GFX6, GFX7, GFX10. +//===----------------------------------------------------------------------===// + +let AssemblerPredicate = isGFX6GFX7 in { + multiclass VOPC_Real_gfx6_gfx7<bits<9> op> { + let DecoderNamespace = "GFX6GFX7" in { + def _e32_gfx6_gfx7 : + VOPC_Real<!cast<VOPC_Pseudo>(NAME#"_e32"), SIEncodingFamily.SI>, + VOPCe<op{7-0}>; + def _e64_gfx6_gfx7 : + VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>, + VOP3a_gfx6_gfx7<op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> { + // Encoding used for VOPC instructions encoded as VOP3 differs from + // VOP3e by destination name (sdst) as VOPC doesn't have vector dst. 
+ bits<8> sdst; + let Inst{7-0} = sdst; + } + } // End DecoderNamespace = "GFX6GFX7" + + defm : VOPCInstAliases<NAME, "gfx6_gfx7">; + } +} // End AssemblerPredicate = isGFX6GFX7 + +multiclass VOPC_Real_gfx6_gfx7_gfx10<bits<9> op> : + VOPC_Real_gfx6_gfx7<op>, VOPC_Real_gfx10<op>; + +multiclass VOPCX_Real_gfx6_gfx7<bits<9> op> : + VOPC_Real_gfx6_gfx7<op>; + +multiclass VOPCX_Real_gfx6_gfx7_gfx10 <bits<9> op> : + VOPC_Real_gfx6_gfx7<op>, VOPCX_Real_gfx10<op>; + +defm V_CMP_F_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x000>; +defm V_CMP_LT_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x001>; +defm V_CMP_EQ_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x002>; +defm V_CMP_LE_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x003>; +defm V_CMP_GT_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x004>; +defm V_CMP_LG_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x005>; +defm V_CMP_GE_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x006>; +defm V_CMP_O_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x007>; +defm V_CMP_U_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x008>; +defm V_CMP_NGE_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x009>; +defm V_CMP_NLG_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x00a>; +defm V_CMP_NGT_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x00b>; +defm V_CMP_NLE_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x00c>; +defm V_CMP_NEQ_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x00d>; +defm V_CMP_NLT_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x00e>; +defm V_CMP_TRU_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x00f>; +defm V_CMPX_F_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x010>; +defm V_CMPX_LT_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x011>; +defm V_CMPX_EQ_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x012>; +defm V_CMPX_LE_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x013>; +defm V_CMPX_GT_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x014>; +defm V_CMPX_LG_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x015>; +defm V_CMPX_GE_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x016>; +defm V_CMPX_O_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x017>; +defm V_CMPX_U_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x018>; +defm V_CMPX_NGE_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x019>; +defm V_CMPX_NLG_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x01a>; +defm V_CMPX_NGT_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x01b>; +defm V_CMPX_NLE_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x01c>; +defm V_CMPX_NEQ_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x01d>; +defm V_CMPX_NLT_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x01e>; +defm V_CMPX_TRU_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x01f>; +defm V_CMP_F_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x020>; +defm V_CMP_LT_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x021>; +defm V_CMP_EQ_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x022>; +defm V_CMP_LE_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x023>; +defm V_CMP_GT_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x024>; +defm V_CMP_LG_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x025>; +defm V_CMP_GE_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x026>; +defm V_CMP_O_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x027>; +defm V_CMP_U_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x028>; +defm V_CMP_NGE_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x029>; +defm V_CMP_NLG_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x02a>; +defm V_CMP_NGT_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x02b>; +defm V_CMP_NLE_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x02c>; +defm V_CMP_NEQ_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x02d>; +defm V_CMP_NLT_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x02e>; +defm V_CMP_TRU_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x02f>; +defm V_CMPX_F_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x030>; +defm V_CMPX_LT_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x031>; +defm V_CMPX_EQ_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x032>; +defm V_CMPX_LE_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x033>; +defm V_CMPX_GT_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x034>; +defm V_CMPX_LG_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x035>; +defm V_CMPX_GE_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x036>; +defm 
V_CMPX_O_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x037>; +defm V_CMPX_U_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x038>; +defm V_CMPX_NGE_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x039>; +defm V_CMPX_NLG_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x03a>; +defm V_CMPX_NGT_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x03b>; +defm V_CMPX_NLE_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x03c>; +defm V_CMPX_NEQ_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x03d>; +defm V_CMPX_NLT_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x03e>; +defm V_CMPX_TRU_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x03f>; +defm V_CMPS_F_F32 : VOPC_Real_gfx6_gfx7<0x040>; +defm V_CMPS_LT_F32 : VOPC_Real_gfx6_gfx7<0x041>; +defm V_CMPS_EQ_F32 : VOPC_Real_gfx6_gfx7<0x042>; +defm V_CMPS_LE_F32 : VOPC_Real_gfx6_gfx7<0x043>; +defm V_CMPS_GT_F32 : VOPC_Real_gfx6_gfx7<0x044>; +defm V_CMPS_LG_F32 : VOPC_Real_gfx6_gfx7<0x045>; +defm V_CMPS_GE_F32 : VOPC_Real_gfx6_gfx7<0x046>; +defm V_CMPS_O_F32 : VOPC_Real_gfx6_gfx7<0x047>; +defm V_CMPS_U_F32 : VOPC_Real_gfx6_gfx7<0x048>; +defm V_CMPS_NGE_F32 : VOPC_Real_gfx6_gfx7<0x049>; +defm V_CMPS_NLG_F32 : VOPC_Real_gfx6_gfx7<0x04a>; +defm V_CMPS_NGT_F32 : VOPC_Real_gfx6_gfx7<0x04b>; +defm V_CMPS_NLE_F32 : VOPC_Real_gfx6_gfx7<0x04c>; +defm V_CMPS_NEQ_F32 : VOPC_Real_gfx6_gfx7<0x04d>; +defm V_CMPS_NLT_F32 : VOPC_Real_gfx6_gfx7<0x04e>; +defm V_CMPS_TRU_F32 : VOPC_Real_gfx6_gfx7<0x04f>; +defm V_CMPSX_F_F32 : VOPCX_Real_gfx6_gfx7<0x050>; +defm V_CMPSX_LT_F32 : VOPCX_Real_gfx6_gfx7<0x051>; +defm V_CMPSX_EQ_F32 : VOPCX_Real_gfx6_gfx7<0x052>; +defm V_CMPSX_LE_F32 : VOPCX_Real_gfx6_gfx7<0x053>; +defm V_CMPSX_GT_F32 : VOPCX_Real_gfx6_gfx7<0x054>; +defm V_CMPSX_LG_F32 : VOPCX_Real_gfx6_gfx7<0x055>; +defm V_CMPSX_GE_F32 : VOPCX_Real_gfx6_gfx7<0x056>; +defm V_CMPSX_O_F32 : VOPCX_Real_gfx6_gfx7<0x057>; +defm V_CMPSX_U_F32 : VOPCX_Real_gfx6_gfx7<0x058>; +defm V_CMPSX_NGE_F32 : VOPCX_Real_gfx6_gfx7<0x059>; +defm V_CMPSX_NLG_F32 : VOPCX_Real_gfx6_gfx7<0x05a>; +defm V_CMPSX_NGT_F32 : VOPCX_Real_gfx6_gfx7<0x05b>; +defm V_CMPSX_NLE_F32 : VOPCX_Real_gfx6_gfx7<0x05c>; +defm V_CMPSX_NEQ_F32 : VOPCX_Real_gfx6_gfx7<0x05d>; +defm V_CMPSX_NLT_F32 : VOPCX_Real_gfx6_gfx7<0x05e>; +defm V_CMPSX_TRU_F32 : VOPCX_Real_gfx6_gfx7<0x05f>; +defm V_CMPS_F_F64 : VOPC_Real_gfx6_gfx7<0x060>; +defm V_CMPS_LT_F64 : VOPC_Real_gfx6_gfx7<0x061>; +defm V_CMPS_EQ_F64 : VOPC_Real_gfx6_gfx7<0x062>; +defm V_CMPS_LE_F64 : VOPC_Real_gfx6_gfx7<0x063>; +defm V_CMPS_GT_F64 : VOPC_Real_gfx6_gfx7<0x064>; +defm V_CMPS_LG_F64 : VOPC_Real_gfx6_gfx7<0x065>; +defm V_CMPS_GE_F64 : VOPC_Real_gfx6_gfx7<0x066>; +defm V_CMPS_O_F64 : VOPC_Real_gfx6_gfx7<0x067>; +defm V_CMPS_U_F64 : VOPC_Real_gfx6_gfx7<0x068>; +defm V_CMPS_NGE_F64 : VOPC_Real_gfx6_gfx7<0x069>; +defm V_CMPS_NLG_F64 : VOPC_Real_gfx6_gfx7<0x06a>; +defm V_CMPS_NGT_F64 : VOPC_Real_gfx6_gfx7<0x06b>; +defm V_CMPS_NLE_F64 : VOPC_Real_gfx6_gfx7<0x06c>; +defm V_CMPS_NEQ_F64 : VOPC_Real_gfx6_gfx7<0x06d>; +defm V_CMPS_NLT_F64 : VOPC_Real_gfx6_gfx7<0x06e>; +defm V_CMPS_TRU_F64 : VOPC_Real_gfx6_gfx7<0x06f>; +defm V_CMPSX_F_F64 : VOPCX_Real_gfx6_gfx7<0x070>; +defm V_CMPSX_LT_F64 : VOPCX_Real_gfx6_gfx7<0x071>; +defm V_CMPSX_EQ_F64 : VOPCX_Real_gfx6_gfx7<0x072>; +defm V_CMPSX_LE_F64 : VOPCX_Real_gfx6_gfx7<0x073>; +defm V_CMPSX_GT_F64 : VOPCX_Real_gfx6_gfx7<0x074>; +defm V_CMPSX_LG_F64 : VOPCX_Real_gfx6_gfx7<0x075>; +defm V_CMPSX_GE_F64 : VOPCX_Real_gfx6_gfx7<0x076>; +defm V_CMPSX_O_F64 : VOPCX_Real_gfx6_gfx7<0x077>; +defm V_CMPSX_U_F64 : VOPCX_Real_gfx6_gfx7<0x078>; +defm V_CMPSX_NGE_F64 : VOPCX_Real_gfx6_gfx7<0x079>; +defm V_CMPSX_NLG_F64 : VOPCX_Real_gfx6_gfx7<0x07a>; +defm V_CMPSX_NGT_F64 : 
VOPCX_Real_gfx6_gfx7<0x07b>; +defm V_CMPSX_NLE_F64 : VOPCX_Real_gfx6_gfx7<0x07c>; +defm V_CMPSX_NEQ_F64 : VOPCX_Real_gfx6_gfx7<0x07d>; +defm V_CMPSX_NLT_F64 : VOPCX_Real_gfx6_gfx7<0x07e>; +defm V_CMPSX_TRU_F64 : VOPCX_Real_gfx6_gfx7<0x07f>; +defm V_CMP_F_I32 : VOPC_Real_gfx6_gfx7_gfx10<0x080>; +defm V_CMP_LT_I32 : VOPC_Real_gfx6_gfx7_gfx10<0x081>; +defm V_CMP_EQ_I32 : VOPC_Real_gfx6_gfx7_gfx10<0x082>; +defm V_CMP_LE_I32 : VOPC_Real_gfx6_gfx7_gfx10<0x083>; +defm V_CMP_GT_I32 : VOPC_Real_gfx6_gfx7_gfx10<0x084>; +defm V_CMP_NE_I32 : VOPC_Real_gfx6_gfx7_gfx10<0x085>; +defm V_CMP_GE_I32 : VOPC_Real_gfx6_gfx7_gfx10<0x086>; +defm V_CMP_T_I32 : VOPC_Real_gfx6_gfx7_gfx10<0x087>; +defm V_CMP_CLASS_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x088>; +defm V_CMPX_F_I32 : VOPCX_Real_gfx6_gfx7_gfx10<0x090>; +defm V_CMPX_LT_I32 : VOPCX_Real_gfx6_gfx7_gfx10<0x091>; +defm V_CMPX_EQ_I32 : VOPCX_Real_gfx6_gfx7_gfx10<0x092>; +defm V_CMPX_LE_I32 : VOPCX_Real_gfx6_gfx7_gfx10<0x093>; +defm V_CMPX_GT_I32 : VOPCX_Real_gfx6_gfx7_gfx10<0x094>; +defm V_CMPX_NE_I32 : VOPCX_Real_gfx6_gfx7_gfx10<0x095>; +defm V_CMPX_GE_I32 : VOPCX_Real_gfx6_gfx7_gfx10<0x096>; +defm V_CMPX_T_I32 : VOPCX_Real_gfx6_gfx7_gfx10<0x097>; +defm V_CMPX_CLASS_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x098>; +defm V_CMP_F_I64 : VOPC_Real_gfx6_gfx7_gfx10<0x0a0>; +defm V_CMP_LT_I64 : VOPC_Real_gfx6_gfx7_gfx10<0x0a1>; +defm V_CMP_EQ_I64 : VOPC_Real_gfx6_gfx7_gfx10<0x0a2>; +defm V_CMP_LE_I64 : VOPC_Real_gfx6_gfx7_gfx10<0x0a3>; +defm V_CMP_GT_I64 : VOPC_Real_gfx6_gfx7_gfx10<0x0a4>; +defm V_CMP_NE_I64 : VOPC_Real_gfx6_gfx7_gfx10<0x0a5>; +defm V_CMP_GE_I64 : VOPC_Real_gfx6_gfx7_gfx10<0x0a6>; +defm V_CMP_T_I64 : VOPC_Real_gfx6_gfx7_gfx10<0x0a7>; +defm V_CMP_CLASS_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x0a8>; +defm V_CMPX_F_I64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0b0>; +defm V_CMPX_LT_I64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0b1>; +defm V_CMPX_EQ_I64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0b2>; +defm V_CMPX_LE_I64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0b3>; +defm V_CMPX_GT_I64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0b4>; +defm V_CMPX_NE_I64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0b5>; +defm V_CMPX_GE_I64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0b6>; +defm V_CMPX_T_I64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0b7>; +defm V_CMPX_CLASS_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0b8>; +defm V_CMP_F_U32 : VOPC_Real_gfx6_gfx7_gfx10<0x0c0>; +defm V_CMP_LT_U32 : VOPC_Real_gfx6_gfx7_gfx10<0x0c1>; +defm V_CMP_EQ_U32 : VOPC_Real_gfx6_gfx7_gfx10<0x0c2>; +defm V_CMP_LE_U32 : VOPC_Real_gfx6_gfx7_gfx10<0x0c3>; +defm V_CMP_GT_U32 : VOPC_Real_gfx6_gfx7_gfx10<0x0c4>; +defm V_CMP_NE_U32 : VOPC_Real_gfx6_gfx7_gfx10<0x0c5>; +defm V_CMP_GE_U32 : VOPC_Real_gfx6_gfx7_gfx10<0x0c6>; +defm V_CMP_T_U32 : VOPC_Real_gfx6_gfx7_gfx10<0x0c7>; +defm V_CMPX_F_U32 : VOPCX_Real_gfx6_gfx7_gfx10<0x0d0>; +defm V_CMPX_LT_U32 : VOPCX_Real_gfx6_gfx7_gfx10<0x0d1>; +defm V_CMPX_EQ_U32 : VOPCX_Real_gfx6_gfx7_gfx10<0x0d2>; +defm V_CMPX_LE_U32 : VOPCX_Real_gfx6_gfx7_gfx10<0x0d3>; +defm V_CMPX_GT_U32 : VOPCX_Real_gfx6_gfx7_gfx10<0x0d4>; +defm V_CMPX_NE_U32 : VOPCX_Real_gfx6_gfx7_gfx10<0x0d5>; +defm V_CMPX_GE_U32 : VOPCX_Real_gfx6_gfx7_gfx10<0x0d6>; +defm V_CMPX_T_U32 : VOPCX_Real_gfx6_gfx7_gfx10<0x0d7>; +defm V_CMP_F_U64 : VOPC_Real_gfx6_gfx7_gfx10<0x0e0>; +defm V_CMP_LT_U64 : VOPC_Real_gfx6_gfx7_gfx10<0x0e1>; +defm V_CMP_EQ_U64 : VOPC_Real_gfx6_gfx7_gfx10<0x0e2>; +defm V_CMP_LE_U64 : VOPC_Real_gfx6_gfx7_gfx10<0x0e3>; +defm V_CMP_GT_U64 : VOPC_Real_gfx6_gfx7_gfx10<0x0e4>; +defm V_CMP_NE_U64 : VOPC_Real_gfx6_gfx7_gfx10<0x0e5>; +defm V_CMP_GE_U64 : VOPC_Real_gfx6_gfx7_gfx10<0x0e6>; +defm V_CMP_T_U64 
: VOPC_Real_gfx6_gfx7_gfx10<0x0e7>;
+defm V_CMPX_F_U64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0f0>;
+defm V_CMPX_LT_U64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0f1>;
+defm V_CMPX_EQ_U64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0f2>;
+defm V_CMPX_LE_U64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0f3>;
+defm V_CMPX_GT_U64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0f4>;
+defm V_CMPX_NE_U64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0f5>;
+defm V_CMPX_GE_U64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0f6>;
+defm V_CMPX_T_U64 : VOPCX_Real_gfx6_gfx7_gfx10<0x0f7>;

//===----------------------------------------------------------------------===//
-// VI
+// GFX8, GFX9 (VI).
//===----------------------------------------------------------------------===//

multiclass VOPC_Real_vi <bits<10> op> {
-  let AssemblerPredicates = [isVI], DecoderNamespace = "VI" in {
+  let AssemblerPredicates = [isGFX8GFX9], DecoderNamespace = "GFX8" in {
    def _e32_vi :
      VOPC_Real<!cast<VOPC_Pseudo>(NAME#"_e32"), SIEncodingFamily.VI>,
      VOPCe<op{7-0}>;
@@ -966,9 +1231,8 @@ multiclass VOPC_Real_vi <bits<10> op> {
    VOP_SDWA9_Real <!cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa")>,
    VOPC_SDWA9e <op{7-0}, !cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;

-  def : VOPCInstAlias <!cast<VOP3_Pseudo>(NAME#"_e64"),
-                       !cast<Instruction>(NAME#"_e32_vi")> {
-    let AssemblerPredicate = isVI;
+  let AssemblerPredicate = isGFX8GFX9 in {
+    defm : VOPCInstAliases<NAME, "vi">;
  }
}
diff --git a/lib/Target/AMDGPU/VOPInstructions.td b/lib/Target/AMDGPU/VOPInstructions.td
index 7de7d90d27b3..677095a354be 100644
--- a/lib/Target/AMDGPU/VOPInstructions.td
+++ b/lib/Target/AMDGPU/VOPInstructions.td
@@ -1,9 +1,8 @@
//===-- VOPInstructions.td - Vector Instruction Definitions ---------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -91,6 +90,7 @@ class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [], let VOP3_OPSEL = isVop3OpSel; let IsPacked = P.IsPacked; + let IsMAI = P.IsMAI; let AsmOperands = !if(isVop3OpSel, P.AsmVOP3OpSel, @@ -100,7 +100,6 @@ class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [], let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; - let SubtargetPredicate = isGCN; // Because SGPRs may be allowed if there are multiple operands, we // need a post-isel hook to insert copies in order to avoid @@ -190,9 +189,15 @@ class VOP3a<VOPProfile P> : Enc64 { let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0); } -class VOP3a_si <bits<9> op, VOPProfile P> : VOP3a<P> { +class VOP3a_gfx6_gfx7<bits<9> op, VOPProfile p> : VOP3a<p> { + let Inst{11} = !if(p.HasClamp, clamp{0}, 0); let Inst{25-17} = op; - let Inst{11} = !if(P.HasClamp, clamp{0}, 0); +} + +class VOP3a_gfx10<bits<10> op, VOPProfile p> : VOP3a<p> { + let Inst{15} = !if(p.HasClamp, clamp{0}, 0); + let Inst{25-16} = op; + let Inst{31-26} = 0x35; } class VOP3a_vi <bits<10> op, VOPProfile P> : VOP3a<P> { @@ -200,9 +205,14 @@ class VOP3a_vi <bits<10> op, VOPProfile P> : VOP3a<P> { let Inst{15} = !if(P.HasClamp, clamp{0}, 0); } -class VOP3e_si <bits<9> op, VOPProfile P> : VOP3a_si <op, P> { +class VOP3e_gfx6_gfx7<bits<9> op, VOPProfile p> : VOP3a_gfx6_gfx7<op, p> { bits<8> vdst; - let Inst{7-0} = !if(P.EmitDst, vdst{7-0}, 0); + let Inst{7-0} = !if(p.EmitDst, vdst{7-0}, 0); +} + +class VOP3e_gfx10<bits<10> op, VOPProfile p> : VOP3a_gfx10<op, p> { + bits<8> vdst; + let Inst{7-0} = !if(p.EmitDst, vdst{7-0}, 0); } class VOP3e_vi <bits<10> op, VOPProfile P> : VOP3a_vi <op, P> { @@ -217,6 +227,13 @@ class VOP3OpSel_gfx9 <bits<10> op, VOPProfile P> : VOP3e_vi <op, P> { let Inst{14} = !if(P.HasDst, src0_modifiers{3}, 0); } +class VOP3OpSel_gfx10<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> { + let Inst{11} = !if(p.HasSrc0, src0_modifiers{2}, 0); + let Inst{12} = !if(p.HasSrc1, src1_modifiers{2}, 0); + let Inst{13} = !if(p.HasSrc2, src2_modifiers{2}, 0); + let Inst{14} = !if(p.HasDst, src0_modifiers{3}, 0); +} + // NB: For V_INTERP* opcodes, src0 is encoded as src1 and vice versa class VOP3Interp_vi <bits<10> op, VOPProfile P> : VOP3e_vi <op, P> { bits<2> attrchan; @@ -236,6 +253,21 @@ class VOP3Interp_vi <bits<10> op, VOPProfile P> : VOP3e_vi <op, P> { let Inst{49-41} = src0; } +class VOP3Interp_gfx10<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> { + bits<6> attr; + bits<2> attrchan; + bits<1> high; + + let Inst{8} = 0; + let Inst{9} = !if(p.HasSrc0Mods, src0_modifiers{1}, 0); + let Inst{37-32} = attr; + let Inst{39-38} = attrchan; + let Inst{40} = !if(p.HasHigh, high, 0); + let Inst{49-41} = src0; + let Inst{61} = 0; + let Inst{62} = !if(p.HasSrc0Mods, src0_modifiers{0}, 0); +} + class VOP3be <VOPProfile P> : Enc64 { bits<8> vdst; bits<2> src0_modifiers; @@ -295,10 +327,51 @@ class VOP3Pe <bits<10> op, VOPProfile P> : Enc64 { let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0); // neg (lo) } -class VOP3be_si <bits<9> op, VOPProfile P> : VOP3be<P> { +class VOP3Pe_MAI <bits<10> op, VOPProfile P> : Enc64 { + bits<8> vdst; + bits<10> src0; + bits<10> src1; + bits<9> src2; + bits<3> blgp; + bits<3> cbsz; + bits<4> abid; + bits<1> clamp; + + let Inst{7-0} = vdst; + + let Inst{10-8} = !if(P.HasSrc1, cbsz, 0); + let Inst{14-11} = !if(P.HasSrc1, abid, 0); + + let Inst{15} = 
!if(P.HasClamp, clamp{0}, 0); + + let Inst{25-16} = op; + let Inst{31-26} = 0x34; //encoding + let Inst{40-32} = !if(P.HasSrc0, src0{8-0}, 0); + let Inst{49-41} = !if(P.HasSrc1, src1{8-0}, 0); + let Inst{58-50} = !if(P.HasSrc2, src2, 0); + + let Inst{59} = !if(P.HasSrc0, src0{9}, 0); // acc(0) + let Inst{60} = !if(P.HasSrc1, src1{9}, 0); // acc(1) + + let Inst{63-61} = !if(P.HasSrc1, blgp, 0); +} + + +class VOP3Pe_gfx10 <bits<10> op, VOPProfile P> : VOP3Pe<op, P> { + let Inst{31-26} = 0x33; //encoding +} + +class VOP3be_gfx6_gfx7<bits<9> op, VOPProfile p> : VOP3be<p> { let Inst{25-17} = op; } +class VOP3be_gfx10<bits<10> op, VOPProfile p> : VOP3be<p> { + bits<1> clamp; + let Inst{15} = !if(p.HasClamp, clamp{0}, 0); + let Inst{25-16} = op; + let Inst{31-26} = 0x35; +} + class VOP3be_vi <bits<10> op, VOPProfile P> : VOP3be<P> { bits<1> clamp; let Inst{25-16} = op; @@ -393,7 +466,7 @@ class VOP_SDWA9Ae<VOPProfile P> : VOP_SDWA9e<P> { class VOP_SDWA9Be<VOPProfile P> : VOP_SDWA9e<P> { bits<8> sdst; // {vcc_sdst{0}, sdst{6-0}} - let Inst{46-40} = !if(P.EmitDst, sdst{6-0}, 0); + let Inst{46-40} = !if(P.EmitDst, sdst{6-0}, ?); let Inst{47} = !if(P.EmitDst, sdst{7}, 0); } @@ -456,9 +529,8 @@ class VOP_SDWA_Real <VOP_SDWA_Pseudo ps> : let TSFlags = ps.TSFlags; } -class VOP_SDWA9_Real <VOP_SDWA_Pseudo ps> : - InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands9, []>, - SIMCInstr <ps.PseudoInstr, SIEncodingFamily.SDWA9> { +class Base_VOP_SDWA9_Real <VOP_SDWA_Pseudo ps> : + InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands9, []> { let isPseudo = 0; let isCodeGenOnly = 0; @@ -485,7 +557,20 @@ class VOP_SDWA9_Real <VOP_SDWA_Pseudo ps> : let TSFlags = ps.TSFlags; } -class VOP_DPPe<VOPProfile P> : Enc64 { +class VOP_SDWA9_Real <VOP_SDWA_Pseudo ps> : + Base_VOP_SDWA9_Real <ps >, + SIMCInstr <ps.PseudoInstr, SIEncodingFamily.SDWA9>; + +class Base_VOP_SDWA10_Real<VOP_SDWA_Pseudo ps> : Base_VOP_SDWA9_Real<ps> { + let SubtargetPredicate = !if(ps.Pfl.HasExtSDWA9, HasSDWA10, DisableInst); + let AssemblerPredicate = !if(ps.Pfl.HasExtSDWA9, HasSDWA10, DisableInst); + let DecoderNamespace = "SDWA10"; +} + +class VOP_SDWA10_Real<VOP_SDWA_Pseudo ps> : + Base_VOP_SDWA10_Real<ps>, SIMCInstr<ps.PseudoInstr, SIEncodingFamily.SDWA10>; + +class VOP_DPPe<VOPProfile P, bit IsDPP16=0> : Enc64 { bits<2> src0_modifiers; bits<8> src0; bits<2> src1_modifiers; @@ -493,9 +578,11 @@ class VOP_DPPe<VOPProfile P> : Enc64 { bits<1> bound_ctrl; bits<4> bank_mask; bits<4> row_mask; + bit fi; let Inst{39-32} = !if(P.HasSrc0, src0{7-0}, 0); let Inst{48-40} = dpp_ctrl; + let Inst{50} = !if(IsDPP16, fi, ?); let Inst{51} = bound_ctrl; let Inst{52} = !if(P.HasSrc0Mods, src0_modifiers{0}, 0); // src0_neg let Inst{53} = !if(P.HasSrc0Mods, src0_modifiers{1}, 0); // src0_abs @@ -533,8 +620,8 @@ class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : let AssemblerPredicate = !if(P.HasExtDPP, HasDPP, DisableInst); let AsmVariantName = !if(P.HasExtDPP, AMDGPUAsmVariants.DPP, AMDGPUAsmVariants.Disable); - let Constraints = !if(P.NumSrcArgs, "$old = $vdst", ""); - let DisableEncoding = !if(P.NumSrcArgs, "$old", ""); + let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $vdst", ""); + let DisableEncoding = !if(P.NumSrcArgs, P.TieRegDPP, ""); let DecoderNamespace = "DPP"; VOPProfile Pfl = P; @@ -568,6 +655,67 @@ class VOP_DPP_Real <VOP_DPP_Pseudo ps, int EncodingFamily> : let TSFlags = ps.TSFlags; } +class VOP_DPP <string OpName, VOPProfile P, bit IsDPP16, + dag InsDPP = !if(IsDPP16, 
P.InsDPP16, P.InsDPP),
+                string AsmDPP = !if(IsDPP16, P.AsmDPP16, P.AsmDPP)> :
+    InstSI <P.OutsDPP, InsDPP, OpName#AsmDPP, []>,
+    VOP_DPPe<P, IsDPP16> {
+
+  let mayLoad = 0;
+  let mayStore = 0;
+  let hasSideEffects = 0;
+  let UseNamedOperandTable = 1;
+
+  let VALU = 1;
+  let DPP = 1;
+  let Size = 8;
+
+  let AsmMatchConverter = !if(!eq(P.HasModifiers,1), "cvtDPP", "");
+  let SubtargetPredicate = HasDPP;
+  let AssemblerPredicate = !if(P.HasExtDPP, HasDPP, DisableInst);
+  let AsmVariantName = !if(P.HasExtDPP, AMDGPUAsmVariants.DPP,
+                                        AMDGPUAsmVariants.Disable);
+  let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $vdst", "");
+  let DisableEncoding = !if(P.NumSrcArgs, P.TieRegDPP, "");
+  let DecoderNamespace = "DPP";
+}
+
+class VOP_DPP8e<VOPProfile P> : Enc64 {
+  bits<8> src0;
+  bits<24> dpp8;
+  bits<9> fi;
+
+  let Inst{39-32} = !if(P.HasSrc0, src0{7-0}, 0);
+  let Inst{63-40} = dpp8{23-0};
+}
+
+class VOP_DPP8<string OpName, VOPProfile P> :
+    InstSI<P.OutsDPP8, P.InsDPP8, OpName#P.AsmDPP8, []>,
+    VOP_DPP8e<P> {
+
+  let mayLoad = 0;
+  let mayStore = 0;
+  let hasSideEffects = 0;
+  let UseNamedOperandTable = 1;
+
+  let VALU = 1;
+  let DPP = 1;
+  let Size = 8;
+
+  let AsmMatchConverter = "cvtDPP8";
+  let SubtargetPredicate = HasDPP8;
+  let AssemblerPredicate = !if(P.HasExt, HasDPP8, DisableInst);
+  let AsmVariantName = !if(P.HasExt, AMDGPUAsmVariants.DPP,
+                                     AMDGPUAsmVariants.Disable);
+  let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $vdst", "");
+  let DisableEncoding = !if(P.NumSrcArgs, P.TieRegDPP, "");
+}
+
+def DPP8Mode {
+  int FI_0 = 0xE9;
+  int FI_1 = 0xEA;
+}
+
class getNumNodeArgs<SDPatternOperator Op> {
  SDNode N = !cast<SDNode>(Op);
  SDTypeProfile TP = N.TypeProfile;
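A note on the DPP8 encoding introduced above: the bits<24> dpp8 field holds
eight 3-bit lane selectors, one per lane of each group of eight, stored in
Inst{63-40}. The sketch below illustrates how such a selector list could be
packed, assuming lane 0's selector occupies the lowest three bits; the helper
name packDpp8 is hypothetical, not an LLVM API.

    #include <cstdint>

    // Pack eight 3-bit DPP8 lane selectors into the 24-bit dpp8 field
    // (Inst{63-40} above). Sel[I] is the source lane (0-7) that lane I
    // of each group of eight reads from.
    static uint32_t packDpp8(const uint8_t Sel[8]) {
      uint32_t Dpp8 = 0;
      for (int I = 0; I != 8; ++I)
        Dpp8 |= uint32_t(Sel[I] & 0x7) << (3 * I);
      return Dpp8; // occupies dpp8{23-0}
    }

For instance, the identity swizzle {0, 1, 2, 3, 4, 5, 6, 7} packs to 0xFAC688.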