aboutsummaryrefslogtreecommitdiff
path: root/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIISelLowering.cpp')
-rw-r--r--llvm/lib/Target/AMDGPU/SIISelLowering.cpp1899
1 files changed, 1111 insertions, 788 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index e2f4a0896bc3..094d5cd58673 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -17,6 +17,7 @@
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
+#include "llvm/ADT/FloatingPointMode.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
@@ -25,6 +26,7 @@
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
@@ -136,6 +138,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
+ addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
+ addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
}
addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
@@ -151,27 +155,17 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setBooleanVectorContents(ZeroOrOneBooleanContent);
// We need to custom lower vector stores from local memory
- setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
- setOperationAction(ISD::LOAD, MVT::v3i32, Custom);
- setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
- setOperationAction(ISD::LOAD, MVT::v5i32, Custom);
- setOperationAction(ISD::LOAD, MVT::v6i32, Custom);
- setOperationAction(ISD::LOAD, MVT::v7i32, Custom);
- setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
- setOperationAction(ISD::LOAD, MVT::v16i32, Custom);
- setOperationAction(ISD::LOAD, MVT::i1, Custom);
- setOperationAction(ISD::LOAD, MVT::v32i32, Custom);
+ setOperationAction(ISD::LOAD,
+ {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
+ MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v16i32, MVT::i1,
+ MVT::v32i32},
+ Custom);
- setOperationAction(ISD::STORE, MVT::v2i32, Custom);
- setOperationAction(ISD::STORE, MVT::v3i32, Custom);
- setOperationAction(ISD::STORE, MVT::v4i32, Custom);
- setOperationAction(ISD::STORE, MVT::v5i32, Custom);
- setOperationAction(ISD::STORE, MVT::v6i32, Custom);
- setOperationAction(ISD::STORE, MVT::v7i32, Custom);
- setOperationAction(ISD::STORE, MVT::v8i32, Custom);
- setOperationAction(ISD::STORE, MVT::v16i32, Custom);
- setOperationAction(ISD::STORE, MVT::i1, Custom);
- setOperationAction(ISD::STORE, MVT::v32i32, Custom);
+ setOperationAction(ISD::STORE,
+ {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
+ MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v16i32, MVT::i1,
+ MVT::v32i32},
+ Custom);
setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
@@ -198,81 +192,57 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
- setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
- setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
+ setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
setOperationAction(ISD::SELECT, MVT::i1, Promote);
setOperationAction(ISD::SELECT, MVT::i64, Custom);
setOperationAction(ISD::SELECT, MVT::f64, Promote);
AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
- setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
- setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
- setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
- setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
- setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
+ setOperationAction(ISD::SELECT_CC,
+ {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
setOperationAction(ISD::SETCC, MVT::i1, Promote);
- setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
- setOperationAction(ISD::SETCC, MVT::v4i1, Expand);
+ setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
- setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand);
- setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
- setOperationAction(ISD::TRUNCATE, MVT::v3i32, Expand);
- setOperationAction(ISD::FP_ROUND, MVT::v3f32, Expand);
- setOperationAction(ISD::TRUNCATE, MVT::v4i32, Expand);
- setOperationAction(ISD::FP_ROUND, MVT::v4f32, Expand);
- setOperationAction(ISD::TRUNCATE, MVT::v5i32, Expand);
- setOperationAction(ISD::FP_ROUND, MVT::v5f32, Expand);
- setOperationAction(ISD::TRUNCATE, MVT::v6i32, Expand);
- setOperationAction(ISD::FP_ROUND, MVT::v6f32, Expand);
- setOperationAction(ISD::TRUNCATE, MVT::v7i32, Expand);
- setOperationAction(ISD::FP_ROUND, MVT::v7f32, Expand);
- setOperationAction(ISD::TRUNCATE, MVT::v8i32, Expand);
- setOperationAction(ISD::FP_ROUND, MVT::v8f32, Expand);
- setOperationAction(ISD::TRUNCATE, MVT::v16i32, Expand);
- setOperationAction(ISD::FP_ROUND, MVT::v16f32, Expand);
+ setOperationAction(ISD::TRUNCATE,
+ {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
+ MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v16i32},
+ Expand);
+ setOperationAction(ISD::FP_ROUND,
+ {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
+ MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32},
+ Expand);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v3i16, Custom);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);
+ setOperationAction(ISD::SIGN_EXTEND_INREG,
+ {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
+ MVT::v3i16, MVT::v4i16, MVT::Other},
+ Custom);
setOperationAction(ISD::BRCOND, MVT::Other, Custom);
- setOperationAction(ISD::BR_CC, MVT::i1, Expand);
- setOperationAction(ISD::BR_CC, MVT::i32, Expand);
- setOperationAction(ISD::BR_CC, MVT::i64, Expand);
- setOperationAction(ISD::BR_CC, MVT::f32, Expand);
- setOperationAction(ISD::BR_CC, MVT::f64, Expand);
+ setOperationAction(ISD::BR_CC,
+ {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
- setOperationAction(ISD::UADDO, MVT::i32, Legal);
- setOperationAction(ISD::USUBO, MVT::i32, Legal);
+ setOperationAction({ISD::UADDO, ISD::USUBO}, MVT::i32, Legal);
- setOperationAction(ISD::ADDCARRY, MVT::i32, Legal);
- setOperationAction(ISD::SUBCARRY, MVT::i32, Legal);
+ setOperationAction({ISD::ADDCARRY, ISD::SUBCARRY}, MVT::i32, Legal);
- setOperationAction(ISD::SHL_PARTS, MVT::i64, Expand);
- setOperationAction(ISD::SRA_PARTS, MVT::i64, Expand);
- setOperationAction(ISD::SRL_PARTS, MVT::i64, Expand);
+ setOperationAction({ISD::SHL_PARTS, ISD::SRA_PARTS, ISD::SRL_PARTS}, MVT::i64,
+ Expand);
#if 0
- setOperationAction(ISD::ADDCARRY, MVT::i64, Legal);
- setOperationAction(ISD::SUBCARRY, MVT::i64, Legal);
+ setOperationAction({ISD::ADDCARRY, ISD::SUBCARRY}, MVT::i64, Legal);
#endif
// We only support LOAD/STORE and vector manipulation ops for vectors
// with > 4 elements.
- for (MVT VT : { MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
- MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16,
- MVT::v3i64, MVT::v3f64, MVT::v6i32, MVT::v6f32,
- MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
- MVT::v8i16, MVT::v8f16, MVT::v16i64, MVT::v16f64,
- MVT::v32i32, MVT::v32f32 }) {
+ for (MVT VT :
+ {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, MVT::v2i64,
+ MVT::v2f64, MVT::v4i16, MVT::v4f16, MVT::v3i64, MVT::v3f64,
+ MVT::v6i32, MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64,
+ MVT::v8f64, MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16,
+ MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32}) {
for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
switch (Op) {
case ISD::LOAD:
@@ -372,94 +342,63 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
}
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);
+ setOperationAction(ISD::VECTOR_SHUFFLE,
+ {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32},
+ Expand);
- setOperationAction(ISD::BUILD_VECTOR, MVT::v4f16, Custom);
- setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16}, Custom);
// Avoid stack access for these.
// TODO: Generalize to more vector types.
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Custom);
-
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i8, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i8, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i8, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i8, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i8, Custom);
-
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i16, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f16, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom);
+ setOperationAction({ISD::EXTRACT_VECTOR_ELT, ISD::INSERT_VECTOR_ELT},
+ {MVT::v2i16, MVT::v2f16, MVT::v2i8, MVT::v4i8, MVT::v8i8,
+ MVT::v4i16, MVT::v4f16, MVT::v16i16, MVT::v16f16},
+ Custom);
// Deal with vec3 vector operations when widened to vec4.
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v3i32, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v3f32, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i32, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4f32, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR,
+ {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
// Deal with vec5/6/7 vector operations when widened to vec8.
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v5i32, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v5f32, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v6i32, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v6f32, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v7i32, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v7f32, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i32, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8f32, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR,
+ {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
+ MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32},
+ Custom);
// BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
// and output demarshalling
- setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
- setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);
+ setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);
// We can't return success/failure, only the old value,
// let LLVM add the comparison
- setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand);
- setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand);
+ setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, {MVT::i32, MVT::i64},
+ Expand);
- if (Subtarget->hasFlatAddressSpace()) {
- setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
- setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
- }
+ if (Subtarget->hasFlatAddressSpace())
+ setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
- setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
- setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
+ setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);
// FIXME: This should be narrowed to i32, but that only happens if i64 is
// illegal.
// FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
- setOperationAction(ISD::BSWAP, MVT::i64, Legal);
- setOperationAction(ISD::BSWAP, MVT::i32, Legal);
+ setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);
// On SI this is s_memtime and s_memrealtime on VI.
setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
- setOperationAction(ISD::TRAP, MVT::Other, Custom);
- setOperationAction(ISD::DEBUGTRAP, MVT::Other, Custom);
+ setOperationAction({ISD::TRAP, ISD::DEBUGTRAP}, MVT::Other, Custom);
if (Subtarget->has16BitInsts()) {
- setOperationAction(ISD::FPOW, MVT::f16, Promote);
- setOperationAction(ISD::FPOWI, MVT::f16, Promote);
- setOperationAction(ISD::FLOG, MVT::f16, Custom);
- setOperationAction(ISD::FEXP, MVT::f16, Custom);
- setOperationAction(ISD::FLOG10, MVT::f16, Custom);
+ setOperationAction({ISD::FPOW, ISD::FPOWI}, MVT::f16, Promote);
+ setOperationAction({ISD::FLOG, ISD::FEXP, ISD::FLOG10}, MVT::f16, Custom);
}
if (Subtarget->hasMadMacF32Insts())
setOperationAction(ISD::FMAD, MVT::f32, Legal);
- if (!Subtarget->hasBFI()) {
+ if (!Subtarget->hasBFI())
// fcopysign can be done in a single instruction with BFI.
- setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
- setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
- }
+ setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand);
if (!Subtarget->hasBCNT(32))
setOperationAction(ISD::CTPOP, MVT::i32, Expand);
@@ -467,15 +406,11 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
if (!Subtarget->hasBCNT(64))
setOperationAction(ISD::CTPOP, MVT::i64, Expand);
- if (Subtarget->hasFFBH()) {
- setOperationAction(ISD::CTLZ, MVT::i32, Custom);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
- }
+ if (Subtarget->hasFFBH())
+ setOperationAction({ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, MVT::i32, Custom);
- if (Subtarget->hasFFBL()) {
- setOperationAction(ISD::CTTZ, MVT::i32, Custom);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom);
- }
+ if (Subtarget->hasFFBL())
+ setOperationAction({ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF}, MVT::i32, Custom);
// We only really have 32-bit BFE instructions (and 16-bit on VI).
//
@@ -489,84 +424,48 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setHasExtractBitsInsn(true);
// Clamp modifier on add/sub
- if (Subtarget->hasIntClamp()) {
- setOperationAction(ISD::UADDSAT, MVT::i32, Legal);
- setOperationAction(ISD::USUBSAT, MVT::i32, Legal);
- }
-
- if (Subtarget->hasAddNoCarry()) {
- setOperationAction(ISD::SADDSAT, MVT::i16, Legal);
- setOperationAction(ISD::SSUBSAT, MVT::i16, Legal);
- setOperationAction(ISD::SADDSAT, MVT::i32, Legal);
- setOperationAction(ISD::SSUBSAT, MVT::i32, Legal);
- }
+ if (Subtarget->hasIntClamp())
+ setOperationAction({ISD::UADDSAT, ISD::USUBSAT}, MVT::i32, Legal);
- setOperationAction(ISD::FMINNUM, MVT::f32, Custom);
- setOperationAction(ISD::FMAXNUM, MVT::f32, Custom);
- setOperationAction(ISD::FMINNUM, MVT::f64, Custom);
- setOperationAction(ISD::FMAXNUM, MVT::f64, Custom);
+ if (Subtarget->hasAddNoCarry())
+ setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
+ Legal);
+ setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, {MVT::f32, MVT::f64},
+ Custom);
// These are really only legal for ieee_mode functions. We should be avoiding
// them for functions that don't have ieee_mode enabled, so just say they are
// legal.
- setOperationAction(ISD::FMINNUM_IEEE, MVT::f32, Legal);
- setOperationAction(ISD::FMAXNUM_IEEE, MVT::f32, Legal);
- setOperationAction(ISD::FMINNUM_IEEE, MVT::f64, Legal);
- setOperationAction(ISD::FMAXNUM_IEEE, MVT::f64, Legal);
+ setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE},
+ {MVT::f32, MVT::f64}, Legal);
-
- if (Subtarget->haveRoundOpsF64()) {
- setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
- setOperationAction(ISD::FCEIL, MVT::f64, Legal);
- setOperationAction(ISD::FRINT, MVT::f64, Legal);
- } else {
- setOperationAction(ISD::FCEIL, MVT::f64, Custom);
- setOperationAction(ISD::FTRUNC, MVT::f64, Custom);
- setOperationAction(ISD::FRINT, MVT::f64, Custom);
- setOperationAction(ISD::FFLOOR, MVT::f64, Custom);
- }
+ if (Subtarget->haveRoundOpsF64())
+ setOperationAction({ISD::FTRUNC, ISD::FCEIL, ISD::FRINT}, MVT::f64, Legal);
+ else
+ setOperationAction({ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FFLOOR},
+ MVT::f64, Custom);
setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
- setOperationAction(ISD::FSIN, MVT::f32, Custom);
- setOperationAction(ISD::FCOS, MVT::f32, Custom);
- setOperationAction(ISD::FDIV, MVT::f32, Custom);
+ setOperationAction({ISD::FSIN, ISD::FCOS, ISD::FDIV}, MVT::f32, Custom);
setOperationAction(ISD::FDIV, MVT::f64, Custom);
if (Subtarget->has16BitInsts()) {
- setOperationAction(ISD::Constant, MVT::i16, Legal);
-
- setOperationAction(ISD::SMIN, MVT::i16, Legal);
- setOperationAction(ISD::SMAX, MVT::i16, Legal);
-
- setOperationAction(ISD::UMIN, MVT::i16, Legal);
- setOperationAction(ISD::UMAX, MVT::i16, Legal);
+ setOperationAction({ISD::Constant, ISD::SMIN, ISD::SMAX, ISD::UMIN,
+ ISD::UMAX, ISD::UADDSAT, ISD::USUBSAT},
+ MVT::i16, Legal);
- setOperationAction(ISD::SIGN_EXTEND, MVT::i16, Promote);
AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
- setOperationAction(ISD::ROTR, MVT::i16, Expand);
- setOperationAction(ISD::ROTL, MVT::i16, Expand);
-
- setOperationAction(ISD::SDIV, MVT::i16, Promote);
- setOperationAction(ISD::UDIV, MVT::i16, Promote);
- setOperationAction(ISD::SREM, MVT::i16, Promote);
- setOperationAction(ISD::UREM, MVT::i16, Promote);
- setOperationAction(ISD::UADDSAT, MVT::i16, Legal);
- setOperationAction(ISD::USUBSAT, MVT::i16, Legal);
-
- setOperationAction(ISD::BITREVERSE, MVT::i16, Promote);
-
- setOperationAction(ISD::CTTZ, MVT::i16, Promote);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Promote);
- setOperationAction(ISD::CTLZ, MVT::i16, Promote);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Promote);
- setOperationAction(ISD::CTPOP, MVT::i16, Promote);
+ setOperationAction({ISD::ROTR, ISD::ROTL, ISD::SELECT_CC, ISD::BR_CC},
+ MVT::i16, Expand);
- setOperationAction(ISD::SELECT_CC, MVT::i16, Expand);
-
- setOperationAction(ISD::BR_CC, MVT::i16, Expand);
+ setOperationAction({ISD::SIGN_EXTEND, ISD::SDIV, ISD::UDIV, ISD::SREM,
+ ISD::UREM, ISD::BITREVERSE, ISD::CTTZ,
+ ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF,
+ ISD::CTPOP},
+ MVT::i16, Promote);
setOperationAction(ISD::LOAD, MVT::i16, Custom);
@@ -577,8 +476,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
- setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
- setOperationAction(ISD::FP_TO_UINT, MVT::i16, Custom);
+ setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::i16, Custom);
// F16 - Constant Actions.
setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
@@ -590,22 +488,18 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
// F16 - VOP1 Actions.
- setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
- setOperationAction(ISD::FCOS, MVT::f16, Custom);
- setOperationAction(ISD::FSIN, MVT::f16, Custom);
+ setOperationAction(
+ {ISD::FP_ROUND, ISD::FCOS, ISD::FSIN, ISD::FROUND, ISD::FPTRUNC_ROUND},
+ MVT::f16, Custom);
- setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
- setOperationAction(ISD::UINT_TO_FP, MVT::i16, Custom);
+ setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i16, Custom);
- setOperationAction(ISD::FP_TO_SINT, MVT::f16, Promote);
- setOperationAction(ISD::FP_TO_UINT, MVT::f16, Promote);
- setOperationAction(ISD::SINT_TO_FP, MVT::f16, Promote);
- setOperationAction(ISD::UINT_TO_FP, MVT::f16, Promote);
- setOperationAction(ISD::FROUND, MVT::f16, Custom);
+ setOperationAction(
+ {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::SINT_TO_FP, ISD::UINT_TO_FP},
+ MVT::f16, Promote);
// F16 - VOP2 Actions.
- setOperationAction(ISD::BR_CC, MVT::f16, Expand);
- setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
+ setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, MVT::f16, Expand);
setOperationAction(ISD::FDIV, MVT::f16, Custom);
@@ -615,7 +509,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FMAD, MVT::f16, Legal);
for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16, MVT::v8i16,
- MVT::v8f16}) {
+ MVT::v8f16, MVT::v16i16, MVT::v16f16}) {
for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
switch (Op) {
case ISD::LOAD:
@@ -639,16 +533,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
}
// v_perm_b32 can handle either of these.
- setOperationAction(ISD::BSWAP, MVT::i16, Legal);
- setOperationAction(ISD::BSWAP, MVT::v2i16, Legal);
+ setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
setOperationAction(ISD::BSWAP, MVT::v4i16, Custom);
// XXX - Do these do anything? Vector constants turn into build_vector.
- setOperationAction(ISD::Constant, MVT::v2i16, Legal);
- setOperationAction(ISD::ConstantFP, MVT::v2f16, Legal);
+ setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
- setOperationAction(ISD::UNDEF, MVT::v2i16, Legal);
- setOperationAction(ISD::UNDEF, MVT::v2f16, Legal);
+ setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16}, Legal);
setOperationAction(ISD::STORE, MVT::v2i16, Promote);
AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
@@ -692,140 +583,98 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::STORE, MVT::v8f16, Promote);
AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
- setOperationAction(ISD::ANY_EXTEND, MVT::v2i32, Expand);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand);
- setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand);
+ setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
+ setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
+
+ setOperationAction(ISD::STORE, MVT::v16i16, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
+ setOperationAction(ISD::STORE, MVT::v16f16, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
+
+ setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
+ MVT::v2i32, Expand);
setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
- setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Expand);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Expand);
- setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Expand);
+ setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
+ MVT::v4i32, Expand);
- setOperationAction(ISD::ANY_EXTEND, MVT::v8i32, Expand);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Expand);
- setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Expand);
+ setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
+ MVT::v8i32, Expand);
- if (!Subtarget->hasVOP3PInsts()) {
- setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom);
- setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
- }
+ if (!Subtarget->hasVOP3PInsts())
+ setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16}, Custom);
setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
// This isn't really legal, but this avoids the legalizer unrolling it (and
// allows matching fneg (fabs x) patterns)
setOperationAction(ISD::FABS, MVT::v2f16, Legal);
- setOperationAction(ISD::FMAXNUM, MVT::f16, Custom);
- setOperationAction(ISD::FMINNUM, MVT::f16, Custom);
- setOperationAction(ISD::FMAXNUM_IEEE, MVT::f16, Legal);
- setOperationAction(ISD::FMINNUM_IEEE, MVT::f16, Legal);
+ setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, MVT::f16, Custom);
+ setOperationAction({ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, MVT::f16, Legal);
- setOperationAction(ISD::FMINNUM_IEEE, MVT::v4f16, Custom);
- setOperationAction(ISD::FMAXNUM_IEEE, MVT::v4f16, Custom);
- setOperationAction(ISD::FMINNUM_IEEE, MVT::v8f16, Custom);
- setOperationAction(ISD::FMAXNUM_IEEE, MVT::v8f16, Custom);
+ setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE},
+ {MVT::v4f16, MVT::v8f16, MVT::v16f16}, Custom);
- setOperationAction(ISD::FMINNUM, MVT::v4f16, Expand);
- setOperationAction(ISD::FMAXNUM, MVT::v4f16, Expand);
- setOperationAction(ISD::FMINNUM, MVT::v8f16, Expand);
- setOperationAction(ISD::FMAXNUM, MVT::v8f16, Expand);
+ setOperationAction({ISD::FMINNUM, ISD::FMAXNUM},
+ {MVT::v4f16, MVT::v8f16, MVT::v16f16}, Expand);
- for (MVT Vec16 : { MVT::v8i16, MVT::v8f16 }) {
- setOperationAction(ISD::BUILD_VECTOR, Vec16, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec16, Custom);
+ for (MVT Vec16 : {MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16}) {
+ setOperationAction(
+ {ISD::BUILD_VECTOR, ISD::EXTRACT_VECTOR_ELT, ISD::SCALAR_TO_VECTOR},
+ Vec16, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, Vec16, Expand);
- setOperationAction(ISD::SCALAR_TO_VECTOR, Vec16, Expand);
}
}
if (Subtarget->hasVOP3PInsts()) {
- setOperationAction(ISD::ADD, MVT::v2i16, Legal);
- setOperationAction(ISD::SUB, MVT::v2i16, Legal);
- setOperationAction(ISD::MUL, MVT::v2i16, Legal);
- setOperationAction(ISD::SHL, MVT::v2i16, Legal);
- setOperationAction(ISD::SRL, MVT::v2i16, Legal);
- setOperationAction(ISD::SRA, MVT::v2i16, Legal);
- setOperationAction(ISD::SMIN, MVT::v2i16, Legal);
- setOperationAction(ISD::UMIN, MVT::v2i16, Legal);
- setOperationAction(ISD::SMAX, MVT::v2i16, Legal);
- setOperationAction(ISD::UMAX, MVT::v2i16, Legal);
-
- setOperationAction(ISD::UADDSAT, MVT::v2i16, Legal);
- setOperationAction(ISD::USUBSAT, MVT::v2i16, Legal);
- setOperationAction(ISD::SADDSAT, MVT::v2i16, Legal);
- setOperationAction(ISD::SSUBSAT, MVT::v2i16, Legal);
-
- setOperationAction(ISD::FADD, MVT::v2f16, Legal);
- setOperationAction(ISD::FMUL, MVT::v2f16, Legal);
- setOperationAction(ISD::FMA, MVT::v2f16, Legal);
+ setOperationAction({ISD::ADD, ISD::SUB, ISD::MUL, ISD::SHL, ISD::SRL,
+ ISD::SRA, ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX,
+ ISD::UADDSAT, ISD::USUBSAT, ISD::SADDSAT, ISD::SSUBSAT},
+ MVT::v2i16, Legal);
- setOperationAction(ISD::FMINNUM_IEEE, MVT::v2f16, Legal);
- setOperationAction(ISD::FMAXNUM_IEEE, MVT::v2f16, Legal);
+ setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FMINNUM_IEEE,
+ ISD::FMAXNUM_IEEE, ISD::FCANONICALIZE},
+ MVT::v2f16, Legal);
- setOperationAction(ISD::FCANONICALIZE, MVT::v2f16, Legal);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, {MVT::v2i16, MVT::v2f16},
+ Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE,
+ {MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::v8i16,
+ MVT::v16f16, MVT::v16i16},
+ Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f16, Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f16, Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i16, Custom);
-
- for (MVT VT : { MVT::v4i16, MVT::v8i16 }) {
+ for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16})
// Split vector operations.
- setOperationAction(ISD::SHL, VT, Custom);
- setOperationAction(ISD::SRA, VT, Custom);
- setOperationAction(ISD::SRL, VT, Custom);
- setOperationAction(ISD::ADD, VT, Custom);
- setOperationAction(ISD::SUB, VT, Custom);
- setOperationAction(ISD::MUL, VT, Custom);
-
- setOperationAction(ISD::SMIN, VT, Custom);
- setOperationAction(ISD::SMAX, VT, Custom);
- setOperationAction(ISD::UMIN, VT, Custom);
- setOperationAction(ISD::UMAX, VT, Custom);
-
- setOperationAction(ISD::UADDSAT, VT, Custom);
- setOperationAction(ISD::SADDSAT, VT, Custom);
- setOperationAction(ISD::USUBSAT, VT, Custom);
- setOperationAction(ISD::SSUBSAT, VT, Custom);
- }
+ setOperationAction({ISD::SHL, ISD::SRA, ISD::SRL, ISD::ADD, ISD::SUB,
+ ISD::MUL, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX,
+ ISD::UADDSAT, ISD::SADDSAT, ISD::USUBSAT,
+ ISD::SSUBSAT},
+ VT, Custom);
- for (MVT VT : { MVT::v4f16, MVT::v8f16 }) {
+ for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16})
// Split vector operations.
- setOperationAction(ISD::FADD, VT, Custom);
- setOperationAction(ISD::FMUL, VT, Custom);
- setOperationAction(ISD::FMA, VT, Custom);
- setOperationAction(ISD::FCANONICALIZE, VT, Custom);
- }
-
- setOperationAction(ISD::FMAXNUM, MVT::v2f16, Custom);
- setOperationAction(ISD::FMINNUM, MVT::v2f16, Custom);
+ setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FCANONICALIZE},
+ VT, Custom);
- setOperationAction(ISD::FMINNUM, MVT::v4f16, Custom);
- setOperationAction(ISD::FMAXNUM, MVT::v4f16, Custom);
+ setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, {MVT::v2f16, MVT::v4f16},
+ Custom);
setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
- setOperationAction(ISD::SELECT, MVT::v4i16, Custom);
- setOperationAction(ISD::SELECT, MVT::v4f16, Custom);
+ setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16}, Custom);
if (Subtarget->hasPackedFP32Ops()) {
- setOperationAction(ISD::FADD, MVT::v2f32, Legal);
- setOperationAction(ISD::FMUL, MVT::v2f32, Legal);
- setOperationAction(ISD::FMA, MVT::v2f32, Legal);
- setOperationAction(ISD::FNEG, MVT::v2f32, Legal);
-
- for (MVT VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32 }) {
- setOperationAction(ISD::FADD, VT, Custom);
- setOperationAction(ISD::FMUL, VT, Custom);
- setOperationAction(ISD::FMA, VT, Custom);
- }
+ setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FNEG},
+ MVT::v2f32, Legal);
+ setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA},
+ {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
+ Custom);
}
}
- setOperationAction(ISD::FNEG, MVT::v4f16, Custom);
- setOperationAction(ISD::FABS, MVT::v4f16, Custom);
+ setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v4f16, Custom);
if (Subtarget->has16BitInsts()) {
setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
@@ -834,107 +683,88 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
} else {
// Legalization hack.
- setOperationAction(ISD::SELECT, MVT::v2i16, Custom);
- setOperationAction(ISD::SELECT, MVT::v2f16, Custom);
+ setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);
- setOperationAction(ISD::FNEG, MVT::v2f16, Custom);
- setOperationAction(ISD::FABS, MVT::v2f16, Custom);
+ setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v2f16, Custom);
}
- for (MVT VT : { MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8,
- MVT::v8i16, MVT::v8f16 }) {
- setOperationAction(ISD::SELECT, VT, Custom);
- }
+ setOperationAction(ISD::SELECT,
+ {MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8,
+ MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16},
+ Custom);
- setOperationAction(ISD::SMULO, MVT::i64, Custom);
- setOperationAction(ISD::UMULO, MVT::i64, Custom);
+ setOperationAction({ISD::SMULO, ISD::UMULO}, MVT::i64, Custom);
- if (Subtarget->hasMad64_32()) {
- setOperationAction(ISD::SMUL_LOHI, MVT::i32, Custom);
- setOperationAction(ISD::UMUL_LOHI, MVT::i32, Custom);
- }
+ if (Subtarget->hasMad64_32())
+ setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, MVT::i32, Custom);
- setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
- setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
- setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
- setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);
- setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f16, Custom);
- setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom);
- setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom);
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN,
+ {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
+ MVT::v2i16, MVT::v2f16},
+ Custom);
- setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom);
- setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2i16, Custom);
- setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v3f16, Custom);
- setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v3i16, Custom);
- setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom);
- setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4i16, Custom);
- setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v8f16, Custom);
- setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
- setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::f16, Custom);
- setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i16, Custom);
- setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN,
+ {MVT::v2f16, MVT::v2i16, MVT::v3f16, MVT::v3i16,
+ MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::Other, MVT::f16,
+ MVT::i16, MVT::i8},
+ Custom);
- setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
- setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom);
- setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom);
- setOperationAction(ISD::INTRINSIC_VOID, MVT::v3i16, Custom);
- setOperationAction(ISD::INTRINSIC_VOID, MVT::v3f16, Custom);
- setOperationAction(ISD::INTRINSIC_VOID, MVT::v4f16, Custom);
- setOperationAction(ISD::INTRINSIC_VOID, MVT::v4i16, Custom);
- setOperationAction(ISD::INTRINSIC_VOID, MVT::f16, Custom);
- setOperationAction(ISD::INTRINSIC_VOID, MVT::i16, Custom);
- setOperationAction(ISD::INTRINSIC_VOID, MVT::i8, Custom);
+ setOperationAction(ISD::INTRINSIC_VOID,
+ {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v3i16,
+ MVT::v3f16, MVT::v4f16, MVT::v4i16, MVT::f16, MVT::i16,
+ MVT::i8},
+ Custom);
- setTargetDAGCombine(ISD::ADD);
- setTargetDAGCombine(ISD::ADDCARRY);
- setTargetDAGCombine(ISD::SUB);
- setTargetDAGCombine(ISD::SUBCARRY);
- setTargetDAGCombine(ISD::FADD);
- setTargetDAGCombine(ISD::FSUB);
- setTargetDAGCombine(ISD::FMINNUM);
- setTargetDAGCombine(ISD::FMAXNUM);
- setTargetDAGCombine(ISD::FMINNUM_IEEE);
- setTargetDAGCombine(ISD::FMAXNUM_IEEE);
- setTargetDAGCombine(ISD::FMA);
- setTargetDAGCombine(ISD::SMIN);
- setTargetDAGCombine(ISD::SMAX);
- setTargetDAGCombine(ISD::UMIN);
- setTargetDAGCombine(ISD::UMAX);
- setTargetDAGCombine(ISD::SETCC);
- setTargetDAGCombine(ISD::AND);
- setTargetDAGCombine(ISD::OR);
- setTargetDAGCombine(ISD::XOR);
- setTargetDAGCombine(ISD::SINT_TO_FP);
- setTargetDAGCombine(ISD::UINT_TO_FP);
- setTargetDAGCombine(ISD::FCANONICALIZE);
- setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
- setTargetDAGCombine(ISD::ZERO_EXTEND);
- setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
- setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
- setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
+ setTargetDAGCombine({ISD::ADD,
+ ISD::ADDCARRY,
+ ISD::SUB,
+ ISD::SUBCARRY,
+ ISD::FADD,
+ ISD::FSUB,
+ ISD::FMINNUM,
+ ISD::FMAXNUM,
+ ISD::FMINNUM_IEEE,
+ ISD::FMAXNUM_IEEE,
+ ISD::FMA,
+ ISD::SMIN,
+ ISD::SMAX,
+ ISD::UMIN,
+ ISD::UMAX,
+ ISD::SETCC,
+ ISD::AND,
+ ISD::OR,
+ ISD::XOR,
+ ISD::SINT_TO_FP,
+ ISD::UINT_TO_FP,
+ ISD::FCANONICALIZE,
+ ISD::SCALAR_TO_VECTOR,
+ ISD::ZERO_EXTEND,
+ ISD::SIGN_EXTEND_INREG,
+ ISD::EXTRACT_VECTOR_ELT,
+ ISD::INSERT_VECTOR_ELT});
// All memory operations. Some folding on the pointer operand is done to help
// matching the constant offsets in the addressing modes.
- setTargetDAGCombine(ISD::LOAD);
- setTargetDAGCombine(ISD::STORE);
- setTargetDAGCombine(ISD::ATOMIC_LOAD);
- setTargetDAGCombine(ISD::ATOMIC_STORE);
- setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP);
- setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS);
- setTargetDAGCombine(ISD::ATOMIC_SWAP);
- setTargetDAGCombine(ISD::ATOMIC_LOAD_ADD);
- setTargetDAGCombine(ISD::ATOMIC_LOAD_SUB);
- setTargetDAGCombine(ISD::ATOMIC_LOAD_AND);
- setTargetDAGCombine(ISD::ATOMIC_LOAD_OR);
- setTargetDAGCombine(ISD::ATOMIC_LOAD_XOR);
- setTargetDAGCombine(ISD::ATOMIC_LOAD_NAND);
- setTargetDAGCombine(ISD::ATOMIC_LOAD_MIN);
- setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX);
- setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN);
- setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX);
- setTargetDAGCombine(ISD::ATOMIC_LOAD_FADD);
- setTargetDAGCombine(ISD::INTRINSIC_VOID);
- setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
+ setTargetDAGCombine({ISD::LOAD,
+ ISD::STORE,
+ ISD::ATOMIC_LOAD,
+ ISD::ATOMIC_STORE,
+ ISD::ATOMIC_CMP_SWAP,
+ ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS,
+ ISD::ATOMIC_SWAP,
+ ISD::ATOMIC_LOAD_ADD,
+ ISD::ATOMIC_LOAD_SUB,
+ ISD::ATOMIC_LOAD_AND,
+ ISD::ATOMIC_LOAD_OR,
+ ISD::ATOMIC_LOAD_XOR,
+ ISD::ATOMIC_LOAD_NAND,
+ ISD::ATOMIC_LOAD_MIN,
+ ISD::ATOMIC_LOAD_MAX,
+ ISD::ATOMIC_LOAD_UMIN,
+ ISD::ATOMIC_LOAD_UMAX,
+ ISD::ATOMIC_LOAD_FADD,
+ ISD::INTRINSIC_VOID,
+ ISD::INTRINSIC_W_CHAIN});
// FIXME: In other contexts we pretend this is a per-function property.
setStackPointerRegisterToSaveRestore(AMDGPU::SGPR32);
@@ -1118,6 +948,10 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &CI,
MachineFunction &MF,
unsigned IntrID) const {
+ Info.flags = MachineMemOperand::MONone;
+ if (CI.hasMetadata(LLVMContext::MD_invariant_load))
+ Info.flags |= MachineMemOperand::MOInvariant;
+
if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
AMDGPU::lookupRsrcIntrinsic(IntrID)) {
AttributeList Attr = Intrinsic::getAttributes(CI.getContext(),
@@ -1127,16 +961,17 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ const GCNTargetMachine &TM =
+ static_cast<const GCNTargetMachine &>(getTargetMachine());
+
if (RsrcIntr->IsImage) {
- Info.ptrVal =
- MFI->getImagePSV(*MF.getSubtarget<GCNSubtarget>().getInstrInfo());
+ Info.ptrVal = MFI->getImagePSV(TM);
Info.align.reset();
} else {
- Info.ptrVal =
- MFI->getBufferPSV(*MF.getSubtarget<GCNSubtarget>().getInstrInfo());
+ Info.ptrVal = MFI->getBufferPSV(TM);
}
- Info.flags = MachineMemOperand::MODereferenceable;
+ Info.flags |= MachineMemOperand::MODereferenceable;
if (Attr.hasFnAttr(Attribute::ReadOnly)) {
unsigned DMaskLanes = 4;
@@ -1178,12 +1013,23 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID :
ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
- Info.flags = MachineMemOperand::MOLoad |
- MachineMemOperand::MOStore |
- MachineMemOperand::MODereferenceable;
+ Info.flags |= MachineMemOperand::MOLoad |
+ MachineMemOperand::MOStore |
+ MachineMemOperand::MODereferenceable;
// XXX - Should this be volatile without known ordering?
Info.flags |= MachineMemOperand::MOVolatile;
+
+ switch (IntrID) {
+ default:
+ break;
+ case Intrinsic::amdgcn_raw_buffer_load_lds:
+ case Intrinsic::amdgcn_struct_buffer_load_lds: {
+ unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
+ Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
+ return true;
+ }
+ }
}
return true;
}
@@ -1200,7 +1046,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.memVT = MVT::getVT(CI.getType());
Info.ptrVal = CI.getOperand(0);
Info.align.reset();
- Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+ Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
if (!Vol->isZero())
@@ -1211,12 +1057,14 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
case Intrinsic::amdgcn_buffer_atomic_fadd: {
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ const GCNTargetMachine &TM =
+ static_cast<const GCNTargetMachine &>(getTargetMachine());
+
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
- Info.ptrVal =
- MFI->getBufferPSV(*MF.getSubtarget<GCNSubtarget>().getInstrInfo());
+ Info.ptrVal = MFI->getBufferPSV(TM);
Info.align.reset();
- Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+ Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4));
if (!Vol || !Vol->isZero())
@@ -1230,7 +1078,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.memVT = MVT::getVT(CI.getType());
Info.ptrVal = CI.getOperand(0);
Info.align.reset();
- Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+ Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
if (!Vol->isZero())
@@ -1243,20 +1091,23 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.memVT = MVT::getVT(CI.getType());
Info.ptrVal = CI.getOperand(0);
Info.align.reset();
- Info.flags = MachineMemOperand::MOLoad |
- MachineMemOperand::MOStore |
- MachineMemOperand::MOVolatile;
+ Info.flags |= MachineMemOperand::MOLoad |
+ MachineMemOperand::MOStore |
+ MachineMemOperand::MOVolatile;
return true;
}
case Intrinsic::amdgcn_image_bvh_intersect_ray: {
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(CI.getType()); // XXX: what is correct VT?
- Info.ptrVal =
- MFI->getImagePSV(*MF.getSubtarget<GCNSubtarget>().getInstrInfo());
+
+ const GCNTargetMachine &TM =
+ static_cast<const GCNTargetMachine &>(getTargetMachine());
+
+ Info.ptrVal = MFI->getImagePSV(TM);
Info.align.reset();
- Info.flags = MachineMemOperand::MOLoad |
- MachineMemOperand::MODereferenceable;
+ Info.flags |= MachineMemOperand::MOLoad |
+ MachineMemOperand::MODereferenceable;
return true;
}
case Intrinsic::amdgcn_global_atomic_fadd:
@@ -1264,15 +1115,17 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
case Intrinsic::amdgcn_global_atomic_fmax:
case Intrinsic::amdgcn_flat_atomic_fadd:
case Intrinsic::amdgcn_flat_atomic_fmin:
- case Intrinsic::amdgcn_flat_atomic_fmax: {
+ case Intrinsic::amdgcn_flat_atomic_fmax:
+ case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
+ case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(CI.getType());
Info.ptrVal = CI.getOperand(0);
Info.align.reset();
- Info.flags = MachineMemOperand::MOLoad |
- MachineMemOperand::MOStore |
- MachineMemOperand::MODereferenceable |
- MachineMemOperand::MOVolatile;
+ Info.flags |= MachineMemOperand::MOLoad |
+ MachineMemOperand::MOStore |
+ MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOVolatile;
return true;
}
case Intrinsic::amdgcn_ds_gws_init:
@@ -1283,18 +1136,29 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
case Intrinsic::amdgcn_ds_gws_sema_release_all: {
Info.opc = ISD::INTRINSIC_VOID;
+ const GCNTargetMachine &TM =
+ static_cast<const GCNTargetMachine &>(getTargetMachine());
+
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- Info.ptrVal =
- MFI->getGWSPSV(*MF.getSubtarget<GCNSubtarget>().getInstrInfo());
+ Info.ptrVal = MFI->getGWSPSV(TM);
// This is an abstract access, but we need to specify a type and size.
Info.memVT = MVT::i32;
Info.size = 4;
Info.align = Align(4);
- Info.flags = MachineMemOperand::MOStore;
if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
- Info.flags = MachineMemOperand::MOLoad;
+ Info.flags |= MachineMemOperand::MOLoad;
+ else
+ Info.flags |= MachineMemOperand::MOStore;
+ return true;
+ }
+ case Intrinsic::amdgcn_global_load_lds: {
+ Info.opc = ISD::INTRINSIC_VOID;
+ unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
+ Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
+ Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
+ MachineMemOperand::MOVolatile;
return true;
}
default:
@@ -1319,6 +1183,8 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
case Intrinsic::amdgcn_flat_atomic_fadd:
case Intrinsic::amdgcn_flat_atomic_fmin:
case Intrinsic::amdgcn_flat_atomic_fmax:
+ case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
+ case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
case Intrinsic::amdgcn_global_atomic_csub: {
Value *Ptr = II->getArgOperand(0);
AccessTy = II->getType();
@@ -1506,47 +1372,96 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
AddrSpace == AMDGPUAS::REGION_ADDRESS) {
// Check if alignment requirements for ds_read/write instructions are
// disabled.
- if (Subtarget->hasUnalignedDSAccessEnabled() &&
- !Subtarget->hasLDSMisalignedBug()) {
- if (IsFast)
- *IsFast = Alignment != Align(2);
- return true;
- }
+ if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
+ return false;
+
+ Align RequiredAlignment(PowerOf2Ceil(Size/8)); // Natural alignment.
+ if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
+ Alignment < RequiredAlignment)
+ return false;
// Either, the alignment requirements are "enabled", or there is an
// unaligned LDS access related hardware bug though alignment requirements
// are "disabled". In either case, we need to check for proper alignment
// requirements.
//
- if (Size == 64) {
+ switch (Size) {
+ case 64:
+ // SI has a hardware bug in the LDS / GDS bounds checking: if the base
+ // address is negative, then the instruction is incorrectly treated as
+ // out-of-bounds even if base + offsets is in bounds. Split vectorized
+ // loads here to avoid emitting ds_read2_b32. We may re-combine the
+ // load later in the SILoadStoreOptimizer.
+ if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
+ return false;
+
// 8 byte accessing via ds_read/write_b64 require 8-byte alignment, but we
// can do a 4 byte aligned, 8 byte access in a single operation using
// ds_read2/write2_b32 with adjacent offsets.
- bool AlignedBy4 = Alignment >= Align(4);
- if (IsFast)
- *IsFast = AlignedBy4;
+ RequiredAlignment = Align(4);
+
+ if (Subtarget->hasUnalignedDSAccessEnabled()) {
+ // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
+ // ds_write2_b32 depending on the alignment. In either case with either
+ // alignment there is no faster way of doing this.
+ if (IsFast)
+ *IsFast = true;
+ return true;
+ }
+
+ break;
+ case 96:
+ if (!Subtarget->hasDS96AndDS128())
+ return false;
- return AlignedBy4;
- }
- if (Size == 96) {
// 12 byte accessing via ds_read/write_b96 require 16-byte alignment on
// gfx8 and older.
- bool AlignedBy16 = Alignment >= Align(16);
- if (IsFast)
- *IsFast = AlignedBy16;
- return AlignedBy16;
- }
- if (Size == 128) {
+ if (Subtarget->hasUnalignedDSAccessEnabled()) {
+ // Naturally aligned access is fastest. However, also report it is Fast
+ // if memory is aligned less than DWORD. A narrow load or store will be
+ // be equally slow as a single ds_read_b96/ds_write_b96, but there will
+ // be more of them, so overall we will pay less penalty issuing a single
+ // instruction.
+ if (IsFast)
+ *IsFast = Alignment >= RequiredAlignment || Alignment < Align(4);
+ return true;
+ }
+
+ break;
+ case 128:
+ if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
+ return false;
+
// 16 byte accessing via ds_read/write_b128 require 16-byte alignment on
// gfx8 and older, but we can do a 8 byte aligned, 16 byte access in a
// single operation using ds_read2/write2_b64.
- bool AlignedBy8 = Alignment >= Align(8);
- if (IsFast)
- *IsFast = AlignedBy8;
+ RequiredAlignment = Align(8);
+
+ if (Subtarget->hasUnalignedDSAccessEnabled()) {
+ // Naturally aligned access is fastest. However, also report it is Fast
+ // if memory is aligned less than DWORD. A narrow load or store will be
+ // be equally slow as a single ds_read_b128/ds_write_b128, but there
+ // will be more of them, so overall we will pay less penalty issuing a
+ // single instruction.
+ if (IsFast)
+ *IsFast = Alignment >= RequiredAlignment || Alignment < Align(4);
+ return true;
+ }
+
+ break;
+ default:
+ if (Size > 32)
+ return false;
- return AlignedBy8;
+ break;
}
+
+ if (IsFast)
+ *IsFast = Alignment >= RequiredAlignment;
+
+ return Alignment >= RequiredAlignment ||
+ Subtarget->hasUnalignedDSAccessEnabled();
}
if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
@@ -1571,14 +1486,12 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
return AlignedBy4;
}
- if (Subtarget->hasUnalignedBufferAccessEnabled() &&
- !(AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
- AddrSpace == AMDGPUAS::REGION_ADDRESS)) {
- // If we have an uniform constant load, it still requires using a slow
+ if (Subtarget->hasUnalignedBufferAccessEnabled()) {
+ // If we have a uniform constant load, it still requires using a slow
// buffer instruction if unaligned.
if (IsFast) {
// Accesses can really be issued as 1-byte aligned or 4-byte aligned, so
- // 2-byte alignment is worse than 1 unless doing a 2-byte accesss.
+ // 2-byte alignment is worse than 1 unless doing a 2-byte access.
*IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ?
Alignment >= Align(4) : Alignment != Align(2);
@@ -1603,20 +1516,22 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
bool SITargetLowering::allowsMisalignedMemoryAccesses(
EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
bool *IsFast) const {
- if (IsFast)
- *IsFast = false;
+ bool Allow = allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace,
+ Alignment, Flags, IsFast);
- // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
- // which isn't a simple VT.
- // Until MVT is extended to handle this, simply check for the size and
- // rely on the condition below: allow accesses if the size is a multiple of 4.
- if (VT == MVT::Other || (VT != MVT::Other && VT.getSizeInBits() > 1024 &&
- VT.getStoreSize() > 16)) {
- return false;
+ if (Allow && IsFast && Subtarget->hasUnalignedDSAccessEnabled() &&
+ (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
+ AddrSpace == AMDGPUAS::REGION_ADDRESS)) {
+ // Lie it is fast if +unaligned-access-mode is passed so that DS accesses
+ // get vectorized. We could use ds_read2_b*/ds_write2_b* instructions on a
+ // misaligned data which is faster than a pair of ds_read_b*/ds_write_b*
+ // which would be equally misaligned.
+ // This is only used by the common passes, selection always calls the
+ // allowsMisalignedMemoryAccessesImpl version.
+ *IsFast = true;
}
- return allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace,
- Alignment, Flags, IsFast);
+ return Allow;
}
EVT SITargetLowering::getOptimalMemOpType(
@@ -1639,9 +1554,7 @@ EVT SITargetLowering::getOptimalMemOpType(
bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
const MemSDNode *MemNode = cast<MemSDNode>(N);
- const Value *Ptr = MemNode->getMemOperand()->getValue();
- const Instruction *I = dyn_cast_or_null<Instruction>(Ptr);
- return I && I->getMetadata("amdgpu.noclobber");
+ return MemNode->getMemOperand()->getFlags() & MONoClobber;
}
bool SITargetLowering::isNonGlobalAddrSpace(unsigned AS) {
@@ -1681,6 +1594,15 @@ bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
return true;
}
+bool SITargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
+ unsigned Index) const {
+ if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
+ return false;
+
+ // TODO: Add more cases that are cheap.
+ return Index == 0;
+}
+
bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
if (Subtarget->has16BitInsts() && VT == MVT::i16) {
switch (Op) {
@@ -2106,7 +2028,7 @@ void SITargetLowering::allocateSpecialInputSGPRs(
if (Info.hasDispatchPtr())
allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
- if (Info.hasQueuePtr())
+ if (Info.hasQueuePtr() && AMDGPU::getAmdhsaCodeObjectVersion() < 5)
allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
// Implicit arg ptr takes the place of the kernarg segment pointer. This is a
@@ -2153,7 +2075,7 @@ void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
CCInfo.AllocateReg(DispatchPtrReg);
}
- if (Info.hasQueuePtr()) {
+ if (Info.hasQueuePtr() && AMDGPU::getAmdhsaCodeObjectVersion() < 5) {
Register QueuePtrReg = Info.addQueuePtr(TRI);
MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(QueuePtrReg);
@@ -2190,6 +2112,24 @@ void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo,
SIMachineFunctionInfo &Info,
CallingConv::ID CallConv,
bool IsShader) const {
+ if (Subtarget->hasUserSGPRInit16Bug()) {
+ // Pad up the used user SGPRs with dead inputs.
+ unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
+
+ // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
+ // rely on it to reach 16 since if we end up having no stack usage, it will
+ // not really be added.
+ unsigned NumRequiredSystemSGPRs = Info.hasWorkGroupIDX() +
+ Info.hasWorkGroupIDY() +
+ Info.hasWorkGroupIDZ() +
+ Info.hasWorkGroupInfo();
+ for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
+ Register Reg = Info.addReservedUserSGPR();
+ MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
+ CCInfo.AllocateReg(Reg);
+ }
+ }
+
if (Info.hasWorkGroupIDX()) {
Register Reg = Info.addWorkGroupIDX();
MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
@@ -2234,6 +2174,8 @@ void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo,
MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
}
+
+ assert(!Subtarget->hasUserSGPRInit16Bug() || Info.getNumPreloadedSGPRs() >= 16);
}
static void reservePrivateMemoryRegs(const TargetMachine &TM,
@@ -2388,7 +2330,7 @@ SDValue SITargetLowering::LowerFormalArguments(
return DAG.getEntryNode();
}
- Info->allocateModuleLDSGlobal(Fn.getParent());
+ Info->allocateModuleLDSGlobal(Fn);
SmallVector<ISD::InputArg, 16> Splits;
SmallVector<CCValAssign, 16> ArgLocs;
@@ -2538,7 +2480,13 @@ SDValue SITargetLowering::LowerFormalArguments(
assert(VA.isRegLoc() && "Parameter must be in a register!");
Register Reg = VA.getLocReg();
- const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
+ const TargetRegisterClass *RC = nullptr;
+ if (AMDGPU::VGPR_32RegClass.contains(Reg))
+ RC = &AMDGPU::VGPR_32RegClass;
+ else if (AMDGPU::SGPR_32RegClass.contains(Reg))
+ RC = &AMDGPU::SGPR_32RegClass;
+ else
+ llvm_unreachable("Unexpected register class in LowerFormalArguments!");
EVT ValVT = VA.getValVT();
Reg = MF.addLiveIn(Reg, RC);
@@ -2657,24 +2605,6 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
SmallVector<SDValue, 48> RetOps;
RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
- // Add return address for callable functions.
- if (!Info->isEntryFunction()) {
- const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
- SDValue ReturnAddrReg = CreateLiveInRegister(
- DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
-
- SDValue ReturnAddrVirtualReg =
- DAG.getRegister(MF.getRegInfo().createVirtualRegister(
- CallConv != CallingConv::AMDGPU_Gfx
- ? &AMDGPU::CCR_SGPR_64RegClass
- : &AMDGPU::Gfx_CCR_SGPR_64RegClass),
- MVT::i64);
- Chain =
- DAG.getCopyToReg(Chain, DL, ReturnAddrVirtualReg, ReturnAddrReg, Flag);
- Flag = Chain.getValue(1);
- RetOps.push_back(ReturnAddrVirtualReg);
- }
-
// Copy the result values into the output registers.
for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
++I, ++RealRVLocIdx) {
@@ -2731,15 +2661,8 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
RetOps.push_back(Flag);
unsigned Opc = AMDGPUISD::ENDPGM;
- if (!IsWaveEnd) {
- if (IsShader)
- Opc = AMDGPUISD::RETURN_TO_EPILOG;
- else if (CallConv == CallingConv::AMDGPU_Gfx)
- Opc = AMDGPUISD::RET_GFX_FLAG;
- else
- Opc = AMDGPUISD::RET_FLAG;
- }
-
+ if (!IsWaveEnd)
+ Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_FLAG;
return DAG.getNode(Opc, DL, MVT::Other, RetOps);
}
@@ -3321,21 +3244,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
}
- SDValue PhysReturnAddrReg;
- if (IsTailCall) {
- // Since the return is being combined with the call, we need to pass on the
- // return address.
-
- const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
- SDValue ReturnAddrReg = CreateLiveInRegister(
- DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
-
- PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
- MVT::i64);
- Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, InFlag);
- InFlag = Chain.getValue(1);
- }
-
// We don't usually want to end the call-sequence here because we would tidy
// the frame up *after* the call, however in the ABI-changing tail-call case
// we've carefully laid out the parameters so that when sp is reset they'll be
@@ -3365,8 +3273,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
// this information must travel along with the operation for eventual
// consumption by emitEpilogue.
Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
-
- Ops.push_back(PhysReturnAddrReg);
}
// Add argument registers to the end of the list so that they are known live
@@ -4104,6 +4010,21 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
+ MachineOperand &Dest = MI.getOperand(0);
+ MachineOperand &Src0 = MI.getOperand(1);
+ MachineOperand &Src1 = MI.getOperand(2);
+
+ if (IsAdd && ST.hasLshlAddB64()) {
+ auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
+ Dest.getReg())
+ .add(Src0)
+ .addImm(0)
+ .add(Src1);
+ TII->legalizeOperands(*Add);
+ MI.eraseFromParent();
+ return BB;
+ }
+
const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
@@ -4112,10 +4033,6 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
Register CarryReg = MRI.createVirtualRegister(CarryRC);
Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
- MachineOperand &Dest = MI.getOperand(0);
- MachineOperand &Src0 = MI.getOperand(1);
- MachineOperand &Src1 = MI.getOperand(2);
-
const TargetRegisterClass *Src0RC = Src0.isReg()
? MRI.getRegClass(Src0.getReg())
: &AMDGPU::VReg_64RegClass;
@@ -4390,29 +4307,7 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
case AMDGPU::DS_GWS_INIT:
case AMDGPU::DS_GWS_SEMA_BR:
case AMDGPU::DS_GWS_BARRIER:
- if (Subtarget->needsAlignedVGPRs()) {
- // Add implicit aligned super-reg to force alignment on the data operand.
- const DebugLoc &DL = MI.getDebugLoc();
- MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
- const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
- MachineOperand *Op = TII->getNamedOperand(MI, AMDGPU::OpName::data0);
- Register DataReg = Op->getReg();
- bool IsAGPR = TRI->isAGPR(MRI, DataReg);
- Register Undef = MRI.createVirtualRegister(
- IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass);
- BuildMI(*BB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), Undef);
- Register NewVR =
- MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass
- : &AMDGPU::VReg_64_Align2RegClass);
- BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), NewVR)
- .addReg(DataReg, 0, Op->getSubReg())
- .addImm(AMDGPU::sub0)
- .addReg(Undef)
- .addImm(AMDGPU::sub1);
- Op->setReg(NewVR);
- Op->setSubReg(AMDGPU::sub0);
- MI.addOperand(MachineOperand::CreateReg(NewVR, false, true));
- }
+ TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
LLVM_FALLTHROUGH;
case AMDGPU::DS_GWS_SEMA_V:
case AMDGPU::DS_GWS_SEMA_P:
@@ -4500,6 +4395,18 @@ bool SITargetLowering::hasBitPreservingFPLogic(EVT VT) const {
return isTypeLegal(VT.getScalarType());
}
+bool SITargetLowering::hasAtomicFaddRtnForTy(SDValue &Op) const {
+ switch (Op.getValue(0).getSimpleValueType().SimpleTy) {
+ case MVT::f32:
+ return Subtarget->hasAtomicFaddRtnInsts();
+ case MVT::v2f16:
+ case MVT::f64:
+ return Subtarget->hasGFX90AInsts();
+ default:
+ return false;
+ }
+}
+
bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
// This currently forces unfolding various combinations of fsub into fma with
// free fneg'd operands. As long as we have fast FMA (controlled by
@@ -4560,7 +4467,7 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
// Otherwise f32 mad is always full rate and returns the same result as
// the separate operations so should be preferred over fma.
- // However does not support denomals.
+ // However does not support denormals.
if (hasFP32Denormals(MF))
return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
@@ -4653,8 +4560,9 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
unsigned Opc = Op.getOpcode();
EVT VT = Op.getValueType();
assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
- VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8f32 ||
- VT == MVT::v16f32 || VT == MVT::v32f32);
+ VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
+ VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
+ VT == MVT::v32f32);
SDValue Lo0, Hi0;
std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
@@ -4676,8 +4584,9 @@ SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op,
unsigned Opc = Op.getOpcode();
EVT VT = Op.getValueType();
assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
- VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v8f32 ||
- VT == MVT::v16f32 || VT == MVT::v32f32);
+ VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
+ VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
+ VT == MVT::v32f32);
SDValue Lo0, Hi0;
SDValue Op0 = Op.getOperand(0);
@@ -4738,10 +4647,30 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return lowerEXTRACT_VECTOR_ELT(Op, DAG);
case ISD::VECTOR_SHUFFLE:
return lowerVECTOR_SHUFFLE(Op, DAG);
+ case ISD::SCALAR_TO_VECTOR:
+ return lowerSCALAR_TO_VECTOR(Op, DAG);
case ISD::BUILD_VECTOR:
return lowerBUILD_VECTOR(Op, DAG);
case ISD::FP_ROUND:
return lowerFP_ROUND(Op, DAG);
+ case ISD::FPTRUNC_ROUND: {
+ unsigned Opc;
+ SDLoc DL(Op);
+
+ if (Op.getOperand(0)->getValueType(0) != MVT::f32)
+ return SDValue();
+
+ // Get the rounding mode from the last operand
+ int RoundMode = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+ if (RoundMode == (int)RoundingMode::TowardPositive)
+ Opc = AMDGPUISD::FPTRUNC_ROUND_UPWARD;
+ else if (RoundMode == (int)RoundingMode::TowardNegative)
+ Opc = AMDGPUISD::FPTRUNC_ROUND_DOWNWARD;
+ else
+ return SDValue();
+
+ return DAG.getNode(Opc, DL, Op.getNode()->getVTList(), Op->getOperand(0));
+ }
case ISD::TRAP:
return lowerTRAP(Op, DAG);
case ISD::DEBUGTRAP:
@@ -5356,7 +5285,7 @@ SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
if (IsIEEEMode)
return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
- if (VT == MVT::v4f16 || VT == MVT::v8f16)
+ if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16)
return splitBinaryVectorOp(Op, DAG);
return Op;
}
@@ -5439,24 +5368,41 @@ SDValue SITargetLowering::lowerTrapEndpgm(
return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain);
}
+SDValue SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
+ const SDLoc &DL, Align Alignment, ImplicitParameter Param) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ uint64_t Offset = getImplicitParameterOffset(MF, Param);
+ SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
+ MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
+ return DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, PtrInfo, Alignment,
+ MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOInvariant);
+}
+
SDValue SITargetLowering::lowerTrapHsaQueuePtr(
SDValue Op, SelectionDAG &DAG) const {
SDLoc SL(Op);
SDValue Chain = Op.getOperand(0);
- MachineFunction &MF = DAG.getMachineFunction();
- SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
- Register UserSGPR = Info->getQueuePtrUserSGPR();
-
SDValue QueuePtr;
- if (UserSGPR == AMDGPU::NoRegister) {
- // We probably are in a function incorrectly marked with
- // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the trap,
- // so just use a null pointer.
- QueuePtr = DAG.getConstant(0, SL, MVT::i64);
+ // For code object version 5, QueuePtr is passed through implicit kernarg.
+ if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) {
+ QueuePtr =
+ loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
} else {
- QueuePtr = CreateLiveInRegister(
- DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
+ MachineFunction &MF = DAG.getMachineFunction();
+ SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ Register UserSGPR = Info->getQueuePtrUserSGPR();
+
+ if (UserSGPR == AMDGPU::NoRegister) {
+ // We probably are in a function incorrectly marked with
+ // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
+ // trap, so just use a null pointer.
+ QueuePtr = DAG.getConstant(0, SL, MVT::i64);
+ } else {
+ QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR,
+ MVT::i64);
+ }
}
SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
@@ -5532,6 +5478,14 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
return DAG.getNode(ISD::SHL, DL, MVT::i32, ApertureReg, ShiftAmount);
}
+ // For code object version 5, private_base and shared_base are passed through
+ // implicit kernargs.
+ if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) {
+ ImplicitParameter Param =
+ (AS == AMDGPUAS::LOCAL_ADDRESS) ? SHARED_BASE : PRIVATE_BASE;
+ return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
+ }
+
MachineFunction &MF = DAG.getMachineFunction();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
Register UserSGPR = Info->getQueuePtrUserSGPR();
@@ -5691,14 +5645,11 @@ SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
EVT EltVT = VecVT.getVectorElementType();
unsigned VecSize = VecVT.getSizeInBits();
unsigned EltSize = EltVT.getSizeInBits();
+ SDLoc SL(Op);
-
- assert(VecSize <= 64);
-
+ // Specially handle the case of v4i16 with static indexing.
unsigned NumElts = VecVT.getVectorNumElements();
- SDLoc SL(Op);
auto KIdx = dyn_cast<ConstantSDNode>(Idx);
-
if (NumElts == 4 && EltSize == 16 && KIdx) {
SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
@@ -5726,35 +5677,41 @@ SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
}
+ // Static indexing does not lower to stack access, and hence there is no need
+ // for special custom lowering to avoid stack access.
if (isa<ConstantSDNode>(Idx))
return SDValue();
- MVT IntVT = MVT::getIntegerVT(VecSize);
-
- // Avoid stack access for dynamic indexing.
+ // Avoid stack access for dynamic indexing by custom lowering to
// v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
- // Create a congruent vector with the target value in each element so that
- // the required element can be masked and ORed into the target vector.
- SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
- DAG.getSplatBuildVector(VecVT, SL, InsVal));
+ assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
+
+ MVT IntVT = MVT::getIntegerVT(VecSize);
+ // Convert vector index to bit-index and get the required bit mask.
assert(isPowerOf2_32(EltSize));
SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
-
- // Convert vector index to bit-index.
SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
-
- SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
DAG.getConstant(0xffff, SL, IntVT),
ScaledIdx);
+ // 1. Create a congruent vector with the target value in each element.
+ SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
+ DAG.getSplatBuildVector(VecVT, SL, InsVal));
+
+  // 2. Mask off all other indices except the required index within (1).
SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
+
+ // 3. Mask off the required index within the target vector.
+ SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
SDValue RHS = DAG.getNode(ISD::AND, SL, IntVT,
DAG.getNOT(SL, BFM, IntVT), BCVec);
+ // 4. Get (2) and (3) ORed into the target vector.
SDValue BFI = DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS);
+
return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
}
@@ -5778,17 +5735,35 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
return Combined;
- if (VecSize == 128) {
+ if (VecSize == 128 || VecSize == 256) {
SDValue Lo, Hi;
EVT LoVT, HiVT;
- SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VecVT);
- Lo =
- DAG.getBitcast(LoVT, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64,
- V2, DAG.getConstant(0, SL, MVT::i32)));
- Hi =
- DAG.getBitcast(HiVT, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64,
- V2, DAG.getConstant(1, SL, MVT::i32)));
+
+ if (VecSize == 128) {
+ SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
+ Lo = DAG.getBitcast(LoVT,
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
+ DAG.getConstant(0, SL, MVT::i32)));
+ Hi = DAG.getBitcast(HiVT,
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
+ DAG.getConstant(1, SL, MVT::i32)));
+ } else {
+ assert(VecSize == 256);
+
+ SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
+ SDValue Parts[4];
+ for (unsigned P = 0; P < 4; ++P) {
+ Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
+ DAG.getConstant(P, SL, MVT::i32));
+ }
+
+ Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
+ Parts[0], Parts[1]));
+ Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
+ Parts[2], Parts[3]));
+ }
+
EVT IdxVT = Idx.getValueType();
unsigned NElem = VecVT.getVectorNumElements();
assert(isPowerOf2_32(NElem));
@@ -5800,10 +5775,19 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
assert(VecSize <= 64);
+ MVT IntVT = MVT::getIntegerVT(VecSize);
+
+ // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
+ SDValue VecBC = peekThroughBitcasts(Vec);
+ if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+ SDValue Src = VecBC.getOperand(0);
+ Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
+ Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT);
+ }
+
unsigned EltSize = EltVT.getSizeInBits();
assert(isPowerOf2_32(EltSize));
- MVT IntVT = MVT::getIntegerVT(VecSize);
SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
// Convert vector index to bit-index (* EltSize)
@@ -5877,6 +5861,22 @@ SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
}
+SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDValue SVal = Op.getOperand(0);
+ EVT ResultVT = Op.getValueType();
+ EVT SValVT = SVal.getValueType();
+ SDValue UndefVal = DAG.getUNDEF(SValVT);
+ SDLoc SL(Op);
+
+ SmallVector<SDValue, 8> VElts;
+ VElts.push_back(SVal);
+ for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
+ VElts.push_back(UndefVal);
+
+ return DAG.getBuildVector(ResultVT, SL, VElts);
+}
+
SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
SelectionDAG &DAG) const {
SDLoc SL(Op);
@@ -5906,6 +5906,27 @@ SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
}
+ if (VT == MVT::v16i16 || VT == MVT::v16f16) {
+ EVT QuarterVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(),
+ VT.getVectorNumElements() / 4);
+ MVT QuarterIntVT = MVT::getIntegerVT(QuarterVT.getSizeInBits());
+
+ SmallVector<SDValue, 4> Parts[4];
+ for (unsigned I = 0, E = VT.getVectorNumElements() / 4; I != E; ++I) {
+ for (unsigned P = 0; P < 4; ++P)
+ Parts[P].push_back(Op.getOperand(I + P * E));
+ }
+ SDValue Casts[4];
+ for (unsigned P = 0; P < 4; ++P) {
+ SDValue Vec = DAG.getBuildVector(QuarterVT, SL, Parts[P]);
+ Casts[P] = DAG.getNode(ISD::BITCAST, SL, QuarterIntVT, Vec);
+ }
+
+ SDValue Blend =
+ DAG.getBuildVector(MVT::getVectorVT(QuarterIntVT, 4), SL, Casts);
+ return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
+ }
+
assert(VT == MVT::v2f16 || VT == MVT::v2i16);
assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
@@ -6277,6 +6298,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
unsigned IntrOpcode = Intr->BaseOpcode;
bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
+ bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
SmallVector<EVT, 3> ResultTypes(Op->values());
SmallVector<EVT, 3> OrigResultTypes(Op->values());
@@ -6455,6 +6477,10 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
//
// SIShrinkInstructions will convert NSA encodings to non-NSA after register
// allocation when possible.
+ //
+ // TODO: we can actually allow partial NSA where the final register is a
+ // contiguous set of the remaining addresses.
+ // This could help where there are more addresses than supported.
bool UseNSA = ST->hasFeature(AMDGPU::FeatureNSAEncoding) &&
VAddrs.size() >= 3 &&
VAddrs.size() <= (unsigned)ST->getNSAMaxSize();
@@ -6561,7 +6587,12 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
int Opcode = -1;
- if (IsGFX10Plus) {
+ if (IsGFX11Plus) {
+ Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
+ UseNSA ? AMDGPU::MIMGEncGfx11NSA
+ : AMDGPU::MIMGEncGfx11Default,
+ NumVDataDwords, NumVAddrDwords);
+ } else if (IsGFX10Plus) {
Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
UseNSA ? AMDGPU::MIMGEncGfx10NSA
: AMDGPU::MIMGEncGfx10Default,
@@ -6685,6 +6716,32 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
return Loads[0];
}
+SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
+ unsigned Dim,
+ const ArgDescriptor &Arg) const {
+ SDLoc SL(Op);
+ MachineFunction &MF = DAG.getMachineFunction();
+ unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim);
+ if (MaxID == 0)
+ return DAG.getConstant(0, SL, MVT::i32);
+
+ SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
+ SDLoc(DAG.getEntryNode()), Arg);
+
+ // Don't bother inserting AssertZext for packed IDs since we're emitting the
+ // masking operations anyway.
+ //
+ // TODO: We could assert the top bit is 0 for the source copy.
+ if (Arg.isMasked())
+ return Val;
+
+ // Preserve the known bits after expansion to a copy.
+ EVT SmallVT =
+ EVT::getIntegerVT(*DAG.getContext(), 32 - countLeadingZeros(MaxID));
+ return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val,
+ DAG.getValueType(SmallVT));
+}
+
SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
@@ -6831,26 +6888,11 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return getPreloadedValue(DAG, *MFI, VT,
AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
case Intrinsic::amdgcn_workitem_id_x:
- if (Subtarget->getMaxWorkitemID(MF.getFunction(), 0) == 0)
- return DAG.getConstant(0, DL, MVT::i32);
-
- return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
- SDLoc(DAG.getEntryNode()),
- MFI->getArgInfo().WorkItemIDX);
+ return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
case Intrinsic::amdgcn_workitem_id_y:
- if (Subtarget->getMaxWorkitemID(MF.getFunction(), 1) == 0)
- return DAG.getConstant(0, DL, MVT::i32);
-
- return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
- SDLoc(DAG.getEntryNode()),
- MFI->getArgInfo().WorkItemIDY);
+ return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
case Intrinsic::amdgcn_workitem_id_z:
- if (Subtarget->getMaxWorkitemID(MF.getFunction(), 2) == 0)
- return DAG.getConstant(0, DL, MVT::i32);
-
- return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
- SDLoc(DAG.getEntryNode()),
- MFI->getArgInfo().WorkItemIDZ);
+ return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
case Intrinsic::amdgcn_wavefrontsize:
return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
SDLoc(Op), MVT::i32);
@@ -7157,12 +7199,14 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
unsigned ShaderType =
SIInstrInfo::getDSShaderTypeValue(DAG.getMachineFunction());
unsigned Offset0 = OrderedCountIndex << 2;
- unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) |
- (Instruction << 4);
+ unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
Offset1 |= (CountDw - 1) << 6;
+ if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
+ Offset1 |= ShaderType << 2;
+
unsigned Offset = Offset0 | (Offset1 << 8);
SDValue Ops[] = {
@@ -7441,7 +7485,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
break;
case Intrinsic::amdgcn_buffer_atomic_fadd:
- if (!Op.getValue(0).use_empty() && !Subtarget->hasGFX90AInsts()) {
+ if (!Op.getValue(0).use_empty() && !hasAtomicFaddRtnForTy(Op)) {
DiagnosticInfoUnsupported
NoFpRet(DAG.getMachineFunction().getFunction(),
"return versions of fp atomics not supported",
@@ -7609,12 +7653,14 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
return SDValue();
}
+ const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
const bool Is64 = NodePtr.getValueType() == MVT::i64;
const unsigned NumVDataDwords = 4;
const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
- const bool UseNSA = Subtarget->hasNSAEncoding() &&
- NumVAddrDwords <= Subtarget->getNSAMaxSize();
+ const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
+ const bool UseNSA =
+ Subtarget->hasNSAEncoding() && NumVAddrs <= Subtarget->getNSAMaxSize();
const unsigned BaseOpcodes[2][2] = {
{AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
{AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
@@ -7622,12 +7668,15 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
int Opcode;
if (UseNSA) {
Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
- AMDGPU::MIMGEncGfx10NSA, NumVDataDwords,
- NumVAddrDwords);
+ IsGFX11Plus ? AMDGPU::MIMGEncGfx11NSA
+ : AMDGPU::MIMGEncGfx10NSA,
+ NumVDataDwords, NumVAddrDwords);
} else {
- Opcode = AMDGPU::getMIMGOpcode(
- BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10Default, NumVDataDwords,
- PowerOf2Ceil(NumVAddrDwords));
+ Opcode =
+ AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
+ IsGFX11Plus ? AMDGPU::MIMGEncGfx11Default
+ : AMDGPU::MIMGEncGfx10Default,
+ NumVDataDwords, PowerOf2Ceil(NumVAddrDwords));
}
assert(Opcode != -1);
@@ -7660,15 +7709,36 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
}
};
- if (Is64)
- DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0, 2);
- else
+ if (UseNSA && IsGFX11Plus) {
Ops.push_back(NodePtr);
+ Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
+ Ops.push_back(RayOrigin);
+ if (IsA16) {
+ SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
+ DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3);
+ DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3);
+ for (unsigned I = 0; I < 3; ++I) {
+ MergedLanes.push_back(DAG.getBitcast(
+ MVT::i32, DAG.getBuildVector(MVT::v2f16, DL,
+ {DirLanes[I], InvDirLanes[I]})));
+ }
+ Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes));
+ } else {
+ Ops.push_back(RayDir);
+ Ops.push_back(RayInvDir);
+ }
+ } else {
+ if (Is64)
+ DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0,
+ 2);
+ else
+ Ops.push_back(NodePtr);
- Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
- packLanes(RayOrigin, true);
- packLanes(RayDir, true);
- packLanes(RayInvDir, false);
+ Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
+ packLanes(RayOrigin, true);
+ packLanes(RayDir, true);
+ packLanes(RayInvDir, false);
+ }
if (!UseNSA) {
// Build a single vector containing all the operands so far prepared.
@@ -7868,6 +7938,12 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
switch (IntrinsicID) {
case Intrinsic::amdgcn_exp_compr: {
+ if (!Subtarget->hasCompressedExport()) {
+ DiagnosticInfoUnsupported BadIntrin(
+ DAG.getMachineFunction().getFunction(),
+ "intrinsic not supported on subtarget", DL.getDebugLoc());
+ DAG.getContext()->diagnose(BadIntrin);
+ }
SDValue Src0 = Op.getOperand(4);
SDValue Src1 = Op.getOperand(5);
// Hack around illegal type on SI by directly selecting it.
@@ -8110,6 +8186,160 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
M->getMemoryVT(), M->getMemOperand());
}
+ case Intrinsic::amdgcn_raw_buffer_load_lds:
+ case Intrinsic::amdgcn_struct_buffer_load_lds: {
+ unsigned Opc;
+ bool HasVIndex = IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds;
+ unsigned OpOffset = HasVIndex ? 1 : 0;
+ SDValue VOffset = Op.getOperand(5 + OpOffset);
+ auto CVOffset = dyn_cast<ConstantSDNode>(VOffset);
+ bool HasVOffset = !CVOffset || !CVOffset->isZero();
+ unsigned Size = Op->getConstantOperandVal(4);
+
+ switch (Size) {
+ default:
+ return SDValue();
+ case 1:
+ Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
+ : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
+ : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
+ : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
+ break;
+ case 2:
+ Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
+ : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
+ : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
+ : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
+ break;
+ case 4:
+ Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
+ : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
+ : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
+ : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
+ break;
+ }
+
+ SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
+
+ SmallVector<SDValue, 8> Ops;
+
+ if (HasVIndex && HasVOffset)
+ Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
+ { Op.getOperand(5), // VIndex
+ VOffset }));
+ else if (HasVIndex)
+ Ops.push_back(Op.getOperand(5));
+ else if (HasVOffset)
+ Ops.push_back(VOffset);
+
+ Ops.push_back(Op.getOperand(2)); // rsrc
+ Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
+ Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
+ unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
+ Ops.push_back(
+ DAG.getTargetConstant(Aux & AMDGPU::CPol::ALL, DL, MVT::i8)); // cpol
+ Ops.push_back(
+ DAG.getTargetConstant((Aux >> 3) & 1, DL, MVT::i8)); // swz
+ Ops.push_back(M0Val.getValue(0)); // Chain
+ Ops.push_back(M0Val.getValue(1)); // Glue
+
+ auto *M = cast<MemSDNode>(Op);
+ MachineMemOperand *LoadMMO = M->getMemOperand();
+ MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
+ LoadPtrI.Offset = Op->getConstantOperandVal(7 + OpOffset);
+ MachinePointerInfo StorePtrI = LoadPtrI;
+ StorePtrI.V = nullptr;
+ StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
+
+ auto F = LoadMMO->getFlags() &
+ ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
+ LoadMMO = MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
+ Size, LoadMMO->getBaseAlign());
+
+ MachineMemOperand *StoreMMO =
+ MF.getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
+ sizeof(int32_t), LoadMMO->getBaseAlign());
+
+ auto Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
+ DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
+
+ return SDValue(Load, 0);
+ }
+ case Intrinsic::amdgcn_global_load_lds: {
+ unsigned Opc;
+ unsigned Size = Op->getConstantOperandVal(4);
+ switch (Size) {
+ default:
+ return SDValue();
+ case 1:
+ Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
+ break;
+ case 2:
+ Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
+ break;
+ case 4:
+ Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
+ break;
+ }
+
+ auto *M = cast<MemSDNode>(Op);
+ SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
+
+ SmallVector<SDValue, 6> Ops;
+
+ SDValue Addr = Op.getOperand(2); // Global ptr
+ SDValue VOffset;
+ // Try to split SAddr and VOffset. Global and LDS pointers share the same
+ // immediate offset, so we cannot use a regular SelectGlobalSAddr().
+ if (Addr->isDivergent() && Addr.getOpcode() == ISD::ADD) {
+ SDValue LHS = Addr.getOperand(0);
+ SDValue RHS = Addr.getOperand(1);
+
+ if (LHS->isDivergent())
+ std::swap(LHS, RHS);
+
+ if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
+ RHS.getOperand(0).getValueType() == MVT::i32) {
+ // add (i64 sgpr), (zero_extend (i32 vgpr))
+ Addr = LHS;
+ VOffset = RHS.getOperand(0);
+ }
+ }
+
+ Ops.push_back(Addr);
+ if (!Addr->isDivergent()) {
+ Opc = AMDGPU::getGlobalSaddrOp(Opc);
+ if (!VOffset)
+ VOffset = SDValue(
+ DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
+ DAG.getTargetConstant(0, DL, MVT::i32)), 0);
+ Ops.push_back(VOffset);
+ }
+
+ Ops.push_back(Op.getOperand(5)); // Offset
+ Ops.push_back(Op.getOperand(6)); // CPol
+ Ops.push_back(M0Val.getValue(0)); // Chain
+ Ops.push_back(M0Val.getValue(1)); // Glue
+
+ MachineMemOperand *LoadMMO = M->getMemOperand();
+ MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
+ LoadPtrI.Offset = Op->getConstantOperandVal(5);
+ MachinePointerInfo StorePtrI = LoadPtrI;
+ LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
+ StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
+ auto F = LoadMMO->getFlags() &
+ ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
+ LoadMMO = MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
+ Size, LoadMMO->getBaseAlign());
+ MachineMemOperand *StoreMMO =
+ MF.getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
+ sizeof(int32_t), Align(4));
+
+ auto Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
+ DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
+
+ return SDValue(Load, 0);
+ }
case Intrinsic::amdgcn_end_cf:
return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
Op->getOperand(2), Chain), 0);
@@ -8271,7 +8501,7 @@ static SDValue getLoadExtOrTrunc(SelectionDAG &DAG,
SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
- if (Ld->getAlignment() < 4 || Ld->isDivergent())
+ if (Ld->getAlign() < Align(4) || Ld->isDivergent())
return SDValue();
// FIXME: Constant loads should all be marked invariant.
@@ -8296,14 +8526,11 @@ SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const
// TODO: Drop only high part of range.
SDValue Ptr = Ld->getBasePtr();
- SDValue NewLoad = DAG.getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD,
- MVT::i32, SL, Ld->getChain(), Ptr,
- Ld->getOffset(),
- Ld->getPointerInfo(), MVT::i32,
- Ld->getAlignment(),
- Ld->getMemOperand()->getFlags(),
- Ld->getAAInfo(),
- nullptr); // Drop ranges
+ SDValue NewLoad = DAG.getLoad(
+ ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr,
+ Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(),
+ Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
+ nullptr); // Drop ranges
EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
if (MemVT.isFloatingPoint()) {
@@ -8392,17 +8619,16 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
"Custom lowering for non-i32 vectors hasn't been implemented.");
- unsigned Alignment = Load->getAlignment();
+ Align Alignment = Load->getAlign();
unsigned AS = Load->getAddressSpace();
- if (Subtarget->hasLDSMisalignedBug() &&
- AS == AMDGPUAS::FLAT_ADDRESS &&
- Alignment < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
+ if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
+ Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
return SplitVectorLoad(Op, DAG);
}
MachineFunction &MF = DAG.getMachineFunction();
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- // If there is a possibilty that flat instruction access scratch memory
+ // If there is a possibility that flat instruction access scratch memory
// then we need to use the same legalization rules we use for private.
if (AS == AMDGPUAS::FLAT_ADDRESS &&
!Subtarget->hasMultiDwordFlatScratchAddressing())
@@ -8413,7 +8639,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
- if (!Op->isDivergent() && Alignment >= 4 && NumElements < 32) {
+ if (!Op->isDivergent() && Alignment >= Align(4) && NumElements < 32) {
if (MemVT.isPow2VectorType())
return SDValue();
return WidenOrSplitVectorLoad(Op, DAG);
@@ -8429,7 +8655,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
AS == AMDGPUAS::GLOBAL_ADDRESS) {
if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&
Load->isSimple() && isMemOpHasNoClobberedMemOperand(Load) &&
- Alignment >= 4 && NumElements < 32) {
+ Alignment >= Align(4) && NumElements < 32) {
if (MemVT.isPow2VectorType())
return SDValue();
return WidenOrSplitVectorLoad(Op, DAG);
@@ -8479,27 +8705,15 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
llvm_unreachable("unsupported private_element_size");
}
} else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
- // Use ds_read_b128 or ds_read_b96 when possible.
- if (Subtarget->hasDS96AndDS128() &&
- ((Subtarget->useDS128() && MemVT.getStoreSize() == 16) ||
- MemVT.getStoreSize() == 12) &&
- allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS,
- Load->getAlign()))
+ bool Fast = false;
+ auto Flags = Load->getMemOperand()->getFlags();
+ if (allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS,
+ Load->getAlign(), Flags, &Fast) &&
+ Fast)
return SDValue();
- if (NumElements > 2)
+ if (MemVT.isVector())
return SplitVectorLoad(Op, DAG);
-
- // SI has a hardware bug in the LDS / GDS boounds checking: if the base
- // address is negative, then the instruction is incorrectly treated as
- // out-of-bounds even if base + offsets is in bounds. Split vectorized
- // loads here to avoid emitting ds_read2_b32. We may re-combine the
- // load later in the SILoadStoreOptimizer.
- if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
- NumElements == 2 && MemVT.getStoreSize() == 8 &&
- Load->getAlignment() < 8) {
- return SplitVectorLoad(Op, DAG);
- }
}
if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
@@ -8514,7 +8728,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
- if (VT.getSizeInBits() == 128)
+ if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256)
return splitTernaryVectorOp(Op, DAG);
assert(VT.getSizeInBits() == 64);
@@ -8946,13 +9160,13 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
unsigned AS = Store->getAddressSpace();
if (Subtarget->hasLDSMisalignedBug() &&
AS == AMDGPUAS::FLAT_ADDRESS &&
- Store->getAlignment() < VT.getStoreSize() && VT.getSizeInBits() > 32) {
+ Store->getAlign().value() < VT.getStoreSize() && VT.getSizeInBits() > 32) {
return SplitVectorStore(Op, DAG);
}
MachineFunction &MF = DAG.getMachineFunction();
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- // If there is a possibilty that flat instruction access scratch memory
+ // If there is a possibility that flat instruction access scratch memory
// then we need to use the same legalization rules we use for private.
if (AS == AMDGPUAS::FLAT_ADDRESS &&
!Subtarget->hasMultiDwordFlatScratchAddressing())
@@ -8990,39 +9204,21 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
llvm_unreachable("unsupported private_element_size");
}
} else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
- // Use ds_write_b128 or ds_write_b96 when possible.
- if (Subtarget->hasDS96AndDS128() &&
- ((Subtarget->useDS128() && VT.getStoreSize() == 16) ||
- (VT.getStoreSize() == 12)) &&
- allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS,
- Store->getAlign()))
+ bool Fast = false;
+ auto Flags = Store->getMemOperand()->getFlags();
+ if (allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS,
+ Store->getAlign(), Flags, &Fast) &&
+ Fast)
return SDValue();
- if (NumElements > 2)
+ if (VT.isVector())
return SplitVectorStore(Op, DAG);
- // SI has a hardware bug in the LDS / GDS boounds checking: if the base
- // address is negative, then the instruction is incorrectly treated as
- // out-of-bounds even if base + offsets is in bounds. Split vectorized
- // stores here to avoid emitting ds_write2_b32. We may re-combine the
- // store later in the SILoadStoreOptimizer.
- if (!Subtarget->hasUsableDSOffset() &&
- NumElements == 2 && VT.getStoreSize() == 8 &&
- Store->getAlignment() < 8) {
- return SplitVectorStore(Op, DAG);
- }
-
- if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
- VT, *Store->getMemOperand())) {
- if (VT.isVector())
- return SplitVectorStore(Op, DAG);
- return expandUnalignedStore(Store, DAG);
- }
-
- return SDValue();
- } else {
- llvm_unreachable("unhandled address space");
+ return expandUnalignedStore(Store, DAG);
}
+
+ // Probably an invalid store. If so we'll end up emitting a selection error.
+ return SDValue();
}
SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
@@ -10041,7 +10237,7 @@ SDValue SITargetLowering::performFCanonicalizeCombine(
}
}
- // If one half is undef, and one is constant, perfer a splat vector rather
+ // If one half is undef, and one is constant, prefer a splat vector rather
// than the normal qNaN. If it's a register, prefer 0.0 since that's
// cheaper to use and may be free with a packed operation.
if (NewElts[0].isUndef()) {
@@ -10349,7 +10545,8 @@ SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
// expanded into a set of cmp/select instructions.
bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize,
unsigned NumElem,
- bool IsDivergentIdx) {
+ bool IsDivergentIdx,
+ const GCNSubtarget *Subtarget) {
if (UseDivergentRegisterIndexing)
return false;
@@ -10371,10 +10568,18 @@ bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize,
// Large vectors would yield too many compares and v_cndmask_b32 instructions.
unsigned NumInsts = NumElem /* Number of compares */ +
((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
- return NumInsts <= 16;
+
+ // On some architectures (GFX9) movrel is not available and it's better
+ // to expand.
+ if (!Subtarget->hasMovrel())
+ return NumInsts <= 16;
+
+ // If movrel is available, use it instead of expanding for vector of 8
+ // elements.
+ return NumInsts <= 15;
}
-static bool shouldExpandVectorDynExt(SDNode *N) {
+bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const {
SDValue Idx = N->getOperand(N->getNumOperands() - 1);
if (isa<ConstantSDNode>(Idx))
return false;
@@ -10385,8 +10590,8 @@ static bool shouldExpandVectorDynExt(SDNode *N) {
unsigned EltSize = EltVT.getSizeInBits();
unsigned NumElem = VecVT.getVectorNumElements();
- return SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
- Idx->isDivergent());
+ return SITargetLowering::shouldExpandVectorDynExt(
+ EltSize, NumElem, Idx->isDivergent(), getSubtarget());
}
SDValue SITargetLowering::performExtractVectorEltCombine(
@@ -10450,7 +10655,7 @@ SDValue SITargetLowering::performExtractVectorEltCombine(
unsigned EltSize = EltVT.getSizeInBits();
// EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
- if (::shouldExpandVectorDynExt(N)) {
+ if (shouldExpandVectorDynExt(N)) {
SDLoc SL(N);
SDValue Idx = N->getOperand(1);
SDValue V;
@@ -10513,7 +10718,7 @@ SITargetLowering::performInsertVectorEltCombine(SDNode *N,
// INSERT_VECTOR_ELT (<n x e>, var-idx)
// => BUILD_VECTOR n x select (e, const-idx)
- if (!::shouldExpandVectorDynExt(N))
+ if (!shouldExpandVectorDynExt(N))
return SDValue();
SelectionDAG &DAG = DCI.DAG;
@@ -10603,39 +10808,145 @@ static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL,
return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
}
-SDValue SITargetLowering::performAddCombine(SDNode *N,
+// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
+// multiplies, if any.
+//
+// Full 64-bit multiplies that feed into an addition are lowered here instead
+// of using the generic expansion. The generic expansion ends up with
+// a tree of ADD nodes that prevents us from using the "add" part of the
+// MAD instruction. The expansion produced here results in a chain of ADDs
+// instead of a tree.
+SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
DAGCombinerInfo &DCI) const {
+ assert(N->getOpcode() == ISD::ADD);
+
SelectionDAG &DAG = DCI.DAG;
EVT VT = N->getValueType(0);
SDLoc SL(N);
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
- if ((LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL)
- && Subtarget->hasMad64_32() &&
- !VT.isVector() && VT.getScalarSizeInBits() > 32 &&
- VT.getScalarSizeInBits() <= 64) {
- if (LHS.getOpcode() != ISD::MUL)
- std::swap(LHS, RHS);
+ if (VT.isVector())
+ return SDValue();
+
+ // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
+ // result in scalar registers for uniform values.
+ if (!N->isDivergent() && Subtarget->hasSMulHi())
+ return SDValue();
+
+ unsigned NumBits = VT.getScalarSizeInBits();
+ if (NumBits <= 32 || NumBits > 64)
+ return SDValue();
+
+ if (LHS.getOpcode() != ISD::MUL) {
+ assert(RHS.getOpcode() == ISD::MUL);
+ std::swap(LHS, RHS);
+ }
+
+ // Avoid the fold if it would unduly increase the number of multiplies due to
+ // multiple uses, except on hardware with full-rate multiply-add (which is
+ // part of full-rate 64-bit ops).
+ if (!Subtarget->hasFullRate64Ops()) {
+ unsigned NumUsers = 0;
+ for (SDNode *Use : LHS->uses()) {
+ // There is a use that does not feed into addition, so the multiply can't
+ // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
+ if (Use->getOpcode() != ISD::ADD)
+ return SDValue();
+
+ // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
+ // MUL + 3xADD + 3xADDC over 3xMAD.
+ ++NumUsers;
+ if (NumUsers >= 3)
+ return SDValue();
+ }
+ }
+
+ SDValue MulLHS = LHS.getOperand(0);
+ SDValue MulRHS = LHS.getOperand(1);
+ SDValue AddRHS = RHS;
+
+ // Always check whether operands are small unsigned values, since that
+ // knowledge is useful in more cases. Check for small signed values only if
+ // doing so can unlock a shorter code sequence.
+ bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32;
+ bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32;
+
+ bool MulSignedLo = false;
+ if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
+ MulSignedLo = numBitsSigned(MulLHS, DAG) <= 32 &&
+ numBitsSigned(MulRHS, DAG) <= 32;
+ }
+
+ // The operands and final result all have the same number of bits. If
+ // operands need to be extended, they can be extended with garbage. The
+ // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
+ // truncated away in the end.
+ if (VT != MVT::i64) {
+ MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS);
+ MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS);
+ AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS);
+ }
+
+ // The basic code generated is conceptually straightforward. Pseudo code:
+ //
+ // accum = mad_64_32 lhs.lo, rhs.lo, accum
+ // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
+ // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
+ //
+ // The second and third lines are optional, depending on whether the factors
+ // are {sign,zero}-extended or not.
+ //
+ // The actual DAG is noisier than the pseudo code, but only due to
+ // instructions that disassemble values into low and high parts, and
+ // assemble the final result.
+ SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+ SDValue One = DAG.getConstant(1, SL, MVT::i32);
+
+ auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
+ auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS);
+ SDValue Accum =
+ getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
- SDValue MulLHS = LHS.getOperand(0);
- SDValue MulRHS = LHS.getOperand(1);
- SDValue AddRHS = RHS;
+ if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
+ auto AccumLo = DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, Accum, Zero);
+ auto AccumHi = DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, Accum, One);
- // TODO: Maybe restrict if SGPR inputs.
- if (numBitsUnsigned(MulLHS, DAG) <= 32 &&
- numBitsUnsigned(MulRHS, DAG) <= 32) {
- MulLHS = DAG.getZExtOrTrunc(MulLHS, SL, MVT::i32);
- MulRHS = DAG.getZExtOrTrunc(MulRHS, SL, MVT::i32);
- AddRHS = DAG.getZExtOrTrunc(AddRHS, SL, MVT::i64);
- return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, false);
+ if (!MulLHSUnsigned32) {
+ auto MulLHSHi =
+ DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One);
+ SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo);
+ AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
}
- if (numBitsSigned(MulLHS, DAG) <= 32 && numBitsSigned(MulRHS, DAG) <= 32) {
- MulLHS = DAG.getSExtOrTrunc(MulLHS, SL, MVT::i32);
- MulRHS = DAG.getSExtOrTrunc(MulRHS, SL, MVT::i32);
- AddRHS = DAG.getSExtOrTrunc(AddRHS, SL, MVT::i64);
- return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, true);
+ if (!MulRHSUnsigned32) {
+ auto MulRHSHi =
+ DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One);
+ SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi);
+ AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
+ }
+
+ Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi});
+ Accum = DAG.getBitcast(MVT::i64, Accum);
+ }
+
+ if (VT != MVT::i64)
+ Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum);
+ return Accum;
+}
+
+SDValue SITargetLowering::performAddCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ EVT VT = N->getValueType(0);
+ SDLoc SL(N);
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+
+ if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
+ if (Subtarget->hasMad64_32()) {
+ if (SDValue Folded = tryFoldToMad64_32(N, DCI))
+ return Folded;
}
return SDValue();
@@ -10763,7 +11074,7 @@ SDValue SITargetLowering::performFAddCombine(SDNode *N,
SDValue RHS = N->getOperand(1);
// These should really be instruction patterns, but writing patterns with
- // source modiifiers is a pain.
+ // source modifiers is a pain.
// fadd (fadd (a, a), b) -> mad 2.0, a, b
if (LHS.getOpcode() == ISD::FADD) {
@@ -10860,8 +11171,8 @@ SDValue SITargetLowering::performFMACombine(SDNode *N,
return SDValue();
// fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
- // regardless of the denorm mode setting. Therefore, unsafe-fp-math/fp-contract
- // is sufficient to allow generaing fdot2.
+ // regardless of the denorm mode setting. Therefore,
+ // unsafe-fp-math/fp-contract is sufficient to allow generating fdot2.
const TargetOptions &Options = DAG.getTarget().Options;
if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
(N->getFlags().hasAllowContract() &&
@@ -11562,7 +11873,7 @@ void SITargetLowering::AddIMGInit(MachineInstr &MI) const {
if (DstSize < InitIdx)
return;
- // Create a register for the intialization value.
+ // Create a register for the initialization value.
Register PrevDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
unsigned NewDst = 0; // Final initialized value will be in here
@@ -11608,7 +11919,7 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
TII->legalizeOperandsVOP3(MRI, MI);
// Prefer VGPRs over AGPRs in mAI instructions where possible.
- // This saves a chain-copy of registers and better ballance register
+  // This saves a chain-copy of registers and better balances register
// use between vgpr and agpr as agpr tuples tend to be big.
if (MI.getDesc().OpInfo) {
unsigned Opc = MI.getOpcode();
@@ -11633,54 +11944,29 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
// so no use checks are needed.
MRI.setRegClass(Op.getReg(), NewRC);
}
- }
-
- return;
- }
- // Replace unused atomics with the no return version.
- int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI.getOpcode());
- if (NoRetAtomicOp != -1) {
- if (!Node->hasAnyUseOfValue(0)) {
- int CPolIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
- AMDGPU::OpName::cpol);
- if (CPolIdx != -1) {
- MachineOperand &CPol = MI.getOperand(CPolIdx);
- CPol.setImm(CPol.getImm() & ~AMDGPU::CPol::GLC);
+ // Resolve the rest of AV operands to AGPRs.
+ if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
+ if (Src2->isReg() && Src2->getReg().isVirtual()) {
+ auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
+ if (TRI->isVectorSuperClass(RC)) {
+ auto *NewRC = TRI->getEquivalentAGPRClass(RC);
+ MRI.setRegClass(Src2->getReg(), NewRC);
+ if (Src2->isTied())
+ MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
+ }
+ }
}
- MI.RemoveOperand(0);
- MI.setDesc(TII->get(NoRetAtomicOp));
- return;
}
- // For mubuf_atomic_cmpswap, we need to have tablegen use an extract_subreg
- // instruction, because the return type of these instructions is a vec2 of
- // the memory type, so it can be tied to the input operand.
- // This means these instructions always have a use, so we need to add a
- // special case to check if the atomic has only one extract_subreg use,
- // which itself has no uses.
- if ((Node->hasNUsesOfValue(1, 0) &&
- Node->use_begin()->isMachineOpcode() &&
- Node->use_begin()->getMachineOpcode() == AMDGPU::EXTRACT_SUBREG &&
- !Node->use_begin()->hasAnyUseOfValue(0))) {
- Register Def = MI.getOperand(0).getReg();
-
- // Change this into a noret atomic.
- MI.setDesc(TII->get(NoRetAtomicOp));
- MI.RemoveOperand(0);
-
- // If we only remove the def operand from the atomic instruction, the
- // extract_subreg will be left with a use of a vreg without a def.
- // So we need to insert an implicit_def to avoid machine verifier
- // errors.
- BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
- TII->get(AMDGPU::IMPLICIT_DEF), Def);
- }
return;
}
- if (TII->isMIMG(MI) && !MI.mayStore())
- AddIMGInit(MI);
+ if (TII->isMIMG(MI)) {
+ if (!MI.mayStore())
+ AddIMGInit(MI);
+ TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
+ }
}
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
@@ -12243,13 +12529,17 @@ Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
MachineBasicBlock *Exit = ML->getExitBlock();
if (Pre && Exit) {
- BuildMI(*Pre, Pre->getFirstTerminator(), DebugLoc(),
- TII->get(AMDGPU::S_INST_PREFETCH))
- .addImm(1); // prefetch 2 lines behind PC
+ auto PreTerm = Pre->getFirstTerminator();
+ if (PreTerm == Pre->begin() ||
+ std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
+ BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
+ .addImm(1); // prefetch 2 lines behind PC
- BuildMI(*Exit, Exit->getFirstNonDebugInstr(), DebugLoc(),
- TII->get(AMDGPU::S_INST_PREFETCH))
- .addImm(2); // prefetch 1 line behind PC
+ auto ExitHead = Exit->getFirstNonDebugInstr();
+ if (ExitHead == Exit->end() ||
+ ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
+ BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
+ .addImm(2); // prefetch 1 line behind PC
}
return CacheLineAlign;
@@ -12390,6 +12680,9 @@ static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW) {
TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
+ unsigned AS = RMW->getPointerAddressSpace();
+ if (AS == AMDGPUAS::PRIVATE_ADDRESS)
+ return AtomicExpansionKind::NotAtomic;
auto ReportUnsafeHWInst = [&](TargetLowering::AtomicExpansionKind Kind) {
OptimizationRemarkEmitter ORE(RMW->getFunction());
@@ -12421,10 +12714,11 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
if (!Ty->isFloatTy() && (!Subtarget->hasGFX90AInsts() || !Ty->isDoubleTy()))
return AtomicExpansionKind::CmpXChg;
- unsigned AS = RMW->getPointerAddressSpace();
-
if ((AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) &&
- Subtarget->hasAtomicFaddInsts()) {
+ Subtarget->hasAtomicFaddNoRtnInsts()) {
+ if (Subtarget->hasGFX940Insts())
+ return AtomicExpansionKind::None;
+
// The amdgpu-unsafe-fp-atomics attribute enables generation of unsafe
// floating point atomic instructions. May generate more efficient code,
// but may not respect rounding and denormal modes, and may give incorrect
@@ -12453,8 +12747,8 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
: AtomicExpansionKind::CmpXChg;
}
- // DS FP atomics do repect the denormal mode, but the rounding mode is fixed
- // to round-to-nearest-even.
+ // DS FP atomics do respect the denormal mode, but the rounding mode is
+ // fixed to round-to-nearest-even.
// The only exception is DS_ADD_F64 which never flushes regardless of mode.
if (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomicAdd()) {
if (!Ty->isDoubleTy())
@@ -12479,6 +12773,27 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
return AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(RMW);
}
+TargetLowering::AtomicExpansionKind
+SITargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
+ return LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
+ ? AtomicExpansionKind::NotAtomic
+ : AtomicExpansionKind::None;
+}
+
+TargetLowering::AtomicExpansionKind
+SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
+ return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
+ ? AtomicExpansionKind::NotAtomic
+ : AtomicExpansionKind::None;
+}
+
+TargetLowering::AtomicExpansionKind
+SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
+ return CmpX->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
+ ? AtomicExpansionKind::NotAtomic
+ : AtomicExpansionKind::None;
+}
+
const TargetRegisterClass *
SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, false);
@@ -12500,7 +12815,7 @@ SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
// always uniform.
static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
unsigned WaveSize) {
- // FIXME: We asssume we never cast the mask results of a control flow
+ // FIXME: We assume we never cast the mask results of a control flow
// intrinsic.
// Early exit if the type won't be consistent as a compile time hack.
IntegerType *IT = dyn_cast<IntegerType>(V->getType());
@@ -12604,7 +12919,7 @@ bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
SDValue N1) const {
if (!N0.hasOneUse())
return false;
- // Take care of the oportunity to keep N0 uniform
+ // Take care of the opportunity to keep N0 uniform
if (N0->isDivergent() || !N1->isDivergent())
return true;
// Check if we have a good chance to form the memory access pattern with the
@@ -12612,3 +12927,11 @@ bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
return (DAG.isBaseWithConstantOffset(N0) &&
hasMemSDNodeUser(*N0->use_begin()));
}
+
+MachineMemOperand::Flags
+SITargetLowering::getTargetMMOFlags(const Instruction &I) const {
+ // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
+ if (I.getMetadata("amdgpu.noclobber"))
+ return MONoClobber;
+ return MachineMemOperand::MONone;
+}