Diffstat (limited to 'llvm/lib/Target/AArch64/AArch64ISelLowering.cpp')
-rw-r--r--  llvm/lib/Target/AArch64/AArch64ISelLowering.cpp  3005
1 file changed, 2425 insertions(+), 580 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index d45a80057564a..85db14ab66feb 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -99,11 +99,6 @@ STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumShiftInserts, "Number of vector shift inserts");
STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
-static cl::opt<bool>
-EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden,
- cl::desc("Allow AArch64 SLI/SRI formation"),
- cl::init(false));
-
// FIXME: The necessary dtprel relocations don't seem to be supported
// well in the GNU bfd and gold linkers at the moment. Therefore, by
// default, for now, fall back to GeneralDynamic code generation.
@@ -121,6 +116,18 @@ EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
/// Value type used for condition codes.
static const MVT MVT_CC = MVT::i32;
+/// Returns true if VT's elements occupy the lowest bit positions of its
+/// associated register class without any intervening space.
+///
+/// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
+/// same register class, but only nxv8f16 can be treated as a packed vector.
+static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
+ assert(VT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
+ "Expected legal vector type!");
+ return VT.isFixedLengthVector() ||
+ VT.getSizeInBits().getKnownMinSize() == AArch64::SVEBitsPerBlock;
+}
+
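The packed/unpacked distinction above is easiest to see with concrete sizes. A minimal standalone model, not LLVM code (the struct and names are invented for illustration, assuming SVEBitsPerBlock is 128):

```cpp
#include <cassert>
#include <cstdio>

constexpr unsigned SVEBitsPerBlock = 128;

struct ScalableVT {
  unsigned MinNumElts; // e.g. 8 for nxv8f16
  unsigned EltBits;    // e.g. 16 for f16
  unsigned knownMinSizeInBits() const { return MinNumElts * EltBits; }
};

// Mirrors the predicate above for scalable types: "packed" means the
// elements fill a full 128-bit SVE block with no gaps between them.
bool isPacked(const ScalableVT &VT) {
  return VT.knownMinSizeInBits() == SVEBitsPerBlock;
}

int main() {
  assert(isPacked({8, 16}));  // nxv8f16: 8 x 16 = 128 bits, packed
  assert(!isPacked({4, 16})); // nxv4f16: elements occupy every other slot
  assert(!isPacked({2, 16})); // nxv2f16: one element per 64-bit chunk
  std::puts("packed-type model OK");
}
```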
AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
const AArch64Subtarget &STI)
: TargetLowering(TM), Subtarget(&STI) {
@@ -137,6 +144,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
if (Subtarget->hasFPARMv8()) {
addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
+ addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
@@ -153,6 +161,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
addDRTypeForNEON(MVT::v1i64);
addDRTypeForNEON(MVT::v1f64);
addDRTypeForNEON(MVT::v4f16);
+ addDRTypeForNEON(MVT::v4bf16);
addQRTypeForNEON(MVT::v4f32);
addQRTypeForNEON(MVT::v2f64);
@@ -161,6 +170,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
addQRTypeForNEON(MVT::v4i32);
addQRTypeForNEON(MVT::v2i64);
addQRTypeForNEON(MVT::v8f16);
+ addQRTypeForNEON(MVT::v8bf16);
}
if (Subtarget->hasSVE()) {
@@ -183,21 +193,51 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);
+ if (Subtarget->hasBF16()) {
+ addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
+ addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
+ addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
+ }
+
+ if (useSVEForFixedLengthVectors()) {
+ for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
+ if (useSVEForFixedLengthVectorVT(VT))
+ addRegisterClass(VT, &AArch64::ZPRRegClass);
+
+ for (MVT VT : MVT::fp_fixedlen_vector_valuetypes())
+ if (useSVEForFixedLengthVectorVT(VT))
+ addRegisterClass(VT, &AArch64::ZPRRegClass);
+ }
+
for (auto VT : { MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64 }) {
setOperationAction(ISD::SADDSAT, VT, Legal);
setOperationAction(ISD::UADDSAT, VT, Legal);
setOperationAction(ISD::SSUBSAT, VT, Legal);
setOperationAction(ISD::USUBSAT, VT, Legal);
- setOperationAction(ISD::SMAX, VT, Legal);
- setOperationAction(ISD::UMAX, VT, Legal);
- setOperationAction(ISD::SMIN, VT, Legal);
- setOperationAction(ISD::UMIN, VT, Legal);
+ setOperationAction(ISD::UREM, VT, Expand);
+ setOperationAction(ISD::SREM, VT, Expand);
+ setOperationAction(ISD::SDIVREM, VT, Expand);
+ setOperationAction(ISD::UDIVREM, VT, Expand);
}
for (auto VT :
{ MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 })
setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Legal);
+
+ for (auto VT :
+ { MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32, MVT::nxv4f32,
+ MVT::nxv2f64 }) {
+ setCondCodeAction(ISD::SETO, VT, Expand);
+ setCondCodeAction(ISD::SETOLT, VT, Expand);
+ setCondCodeAction(ISD::SETOLE, VT, Expand);
+ setCondCodeAction(ISD::SETULT, VT, Expand);
+ setCondCodeAction(ISD::SETULE, VT, Expand);
+ setCondCodeAction(ISD::SETUGE, VT, Expand);
+ setCondCodeAction(ISD::SETUGT, VT, Expand);
+ setCondCodeAction(ISD::SETUEQ, VT, Expand);
+ setCondCodeAction(ISD::SETUNE, VT, Expand);
+ }
}
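The condition codes marked Expand here are ones legalization can synthesize from the remaining compares; for example, unordered-or-equal (SETUEQ) decomposes into ordered-equal plus an explicit unordered (NaN) test. A standalone sketch of that identity (plain C++, not DAG code):

```cpp
#include <cassert>
#include <cmath>

// Ordered-equal and the explicit unordered check that legalization pairs
// with it once SETUEQ is marked Expand.
bool setoeq(double a, double b) { return a == b; }
bool setuo(double a, double b) { return std::isnan(a) || std::isnan(b); }
bool setueq(double a, double b) { return setoeq(a, b) || setuo(a, b); }

int main() {
  double qnan = std::nan("");
  assert(setueq(1.0, 1.0));  // equal       -> true
  assert(setueq(qnan, 1.0)); // unordered   -> true
  assert(!setueq(1.0, 2.0)); // ordered, != -> false
}
```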
// Compute derived properties from the register classes
@@ -211,6 +251,12 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SETCC, MVT::f16, Custom);
setOperationAction(ISD::SETCC, MVT::f32, Custom);
setOperationAction(ISD::SETCC, MVT::f64, Custom);
+ setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom);
+ setOperationAction(ISD::STRICT_FSETCC, MVT::f32, Custom);
+ setOperationAction(ISD::STRICT_FSETCC, MVT::f64, Custom);
+ setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom);
+ setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Custom);
+ setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom);
setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
setOperationAction(ISD::BRCOND, MVT::Other, Expand);
@@ -266,6 +312,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FSUB, MVT::f128, Custom);
setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
setOperationAction(ISD::SETCC, MVT::f128, Custom);
+ setOperationAction(ISD::STRICT_FSETCC, MVT::f128, Custom);
+ setOperationAction(ISD::STRICT_FSETCCS, MVT::f128, Custom);
setOperationAction(ISD::BR_CC, MVT::f128, Custom);
setOperationAction(ISD::SELECT, MVT::f128, Custom);
setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
@@ -276,17 +324,31 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i128, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i128, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i128, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom);
setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);
// Variable arguments.
setOperationAction(ISD::VASTART, MVT::Other, Custom);
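The STRICT_ conversion nodes registered above differ from their plain counterparts by carrying a chain, so they cannot be reordered past other FP-environment accesses and their exception behaviour is preserved. A small illustration of the observable effect, assuming a toolchain that honours strict FP semantics (e.g. a strict FP model enabled at compile time):

```cpp
#include <cfenv>
#include <cmath>
#include <cstdio>

int main() {
  std::feclearexcept(FE_ALL_EXCEPT);
  long l = std::lrint(1e30); // unrepresentable as long: raises FE_INVALID
  std::printf("FE_INVALID raised: %d (result unspecified: %ld)\n",
              std::fetestexcept(FE_INVALID) != 0, l);
}
```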
@@ -327,12 +389,17 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::ROTR, VT, Expand);
}
+ // AArch64 doesn't have i32 MULH{S|U}.
+ setOperationAction(ISD::MULHU, MVT::i32, Expand);
+ setOperationAction(ISD::MULHS, MVT::i32, Expand);
+
// AArch64 doesn't have {U|S}MUL_LOHI.
setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
setOperationAction(ISD::CTPOP, MVT::i32, Custom);
setOperationAction(ISD::CTPOP, MVT::i64, Custom);
+ setOperationAction(ISD::CTPOP, MVT::i128, Custom);
setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
@@ -525,6 +592,17 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::LOAD, MVT::i128, Custom);
setOperationAction(ISD::STORE, MVT::i128, Custom);
+ // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
+ // custom lowering, as there are no un-paired non-temporal stores and
+ // legalization will break up 256 bit inputs.
+ setOperationAction(ISD::STORE, MVT::v32i8, Custom);
+ setOperationAction(ISD::STORE, MVT::v16i16, Custom);
+ setOperationAction(ISD::STORE, MVT::v16f16, Custom);
+ setOperationAction(ISD::STORE, MVT::v8i32, Custom);
+ setOperationAction(ISD::STORE, MVT::v8f32, Custom);
+ setOperationAction(ISD::STORE, MVT::v4f64, Custom);
+ setOperationAction(ISD::STORE, MVT::v4i64, Custom);
+
// Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
// This requires the Performance Monitors extension.
if (Subtarget->hasPerfMon())
@@ -574,6 +652,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BITCAST, MVT::i16, Custom);
setOperationAction(ISD::BITCAST, MVT::f16, Custom);
+ setOperationAction(ISD::BITCAST, MVT::bf16, Custom);
// Indexed loads and stores are supported.
for (unsigned im = (unsigned)ISD::PRE_INC;
@@ -585,6 +664,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setIndexedLoadAction(im, MVT::f64, Legal);
setIndexedLoadAction(im, MVT::f32, Legal);
setIndexedLoadAction(im, MVT::f16, Legal);
+ setIndexedLoadAction(im, MVT::bf16, Legal);
setIndexedStoreAction(im, MVT::i8, Legal);
setIndexedStoreAction(im, MVT::i16, Legal);
setIndexedStoreAction(im, MVT::i32, Legal);
@@ -592,6 +672,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setIndexedStoreAction(im, MVT::f64, Legal);
setIndexedStoreAction(im, MVT::f32, Legal);
setIndexedStoreAction(im, MVT::f16, Legal);
+ setIndexedStoreAction(im, MVT::bf16, Legal);
}
// Trap.
@@ -769,6 +850,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::UADDSAT, VT, Legal);
setOperationAction(ISD::SSUBSAT, VT, Legal);
setOperationAction(ISD::USUBSAT, VT, Legal);
+
+ setOperationAction(ISD::TRUNCATE, VT, Custom);
}
for (MVT VT : { MVT::v4f16, MVT::v2f32,
MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
@@ -825,6 +908,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
}
}
+ if (Subtarget->hasSVE())
+ setOperationAction(ISD::VSCALE, MVT::i32, Custom);
+
setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
}
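ISD::VSCALE exposes the runtime multiple of the 128-bit minimum vector length, so scalable sizes like nxv4i32 can be reasoned about as vscale * 4 lanes. A tiny standalone model of the quantity involved:

```cpp
#include <cassert>

// vscale is the runtime multiple of the 128-bit minimum: an nxv4i32 holds
// vscale * 4 lanes. ISD::VSCALE makes that quantity available as an i32.
unsigned vscale(unsigned sveRegisterBits) { return sveRegisterBits / 128; }

int main() {
  assert(vscale(128) == 1); // minimum legal SVE implementation
  assert(vscale(256) == 2);
  assert(vscale(512) == 4); // e.g. 512-bit hardware
}
```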
@@ -833,11 +919,60 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
// splat of 0 or undef) once vector selects supported in SVE codegen. See
// D68877 for more details.
for (MVT VT : MVT::integer_scalable_vector_valuetypes()) {
- if (isTypeLegal(VT))
+ if (isTypeLegal(VT)) {
+ setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
+ setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::SDIV, VT, Custom);
+ setOperationAction(ISD::UDIV, VT, Custom);
+ setOperationAction(ISD::SMIN, VT, Custom);
+ setOperationAction(ISD::UMIN, VT, Custom);
+ setOperationAction(ISD::SMAX, VT, Custom);
+ setOperationAction(ISD::UMAX, VT, Custom);
+ setOperationAction(ISD::SHL, VT, Custom);
+ setOperationAction(ISD::SRL, VT, Custom);
+ setOperationAction(ISD::SRA, VT, Custom);
+ if (VT.getScalarType() == MVT::i1)
+ setOperationAction(ISD::SETCC, VT, Custom);
+ }
}
+
+ for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32})
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
+
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i8, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);
+
+ for (MVT VT : MVT::fp_scalable_vector_valuetypes()) {
+ if (isTypeLegal(VT)) {
+ setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
+ setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
+ setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::FMA, VT, Custom);
+ }
+ }
+
+ // NOTE: Currently this has to happen after computeRegisterProperties rather
+ // than the preferred option of combining it with the addRegisterClass call.
+ if (useSVEForFixedLengthVectors()) {
+ for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
+ if (useSVEForFixedLengthVectorVT(VT))
+ addTypeForFixedLengthSVE(VT);
+ for (MVT VT : MVT::fp_fixedlen_vector_valuetypes())
+ if (useSVEForFixedLengthVectorVT(VT))
+ addTypeForFixedLengthSVE(VT);
+
+ // 64bit results can mean a bigger than NEON input.
+ for (auto VT : {MVT::v8i8, MVT::v4i16})
+ setOperationAction(ISD::TRUNCATE, VT, Custom);
+ setOperationAction(ISD::FP_ROUND, MVT::v4f16, Custom);
+
+ // 128bit results imply a bigger than NEON input.
+ for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
+ setOperationAction(ISD::TRUNCATE, VT, Custom);
+ for (auto VT : {MVT::v8f16, MVT::v4f32})
+ setOperationAction(ISD::FP_ROUND, VT, Expand);
+ }
}
PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
@@ -922,6 +1057,24 @@ void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) {
}
}
+void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
+ assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
+
+ // By default everything must be expanded.
+ for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
+ setOperationAction(Op, VT, Expand);
+
+ // We use EXTRACT_SUBVECTOR to "cast" a scalable vector to a fixed length one.
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
+
+ // Lower fixed length vector operations to scalable equivalents.
+ setOperationAction(ISD::ADD, VT, Custom);
+ setOperationAction(ISD::FADD, VT, Custom);
+ setOperationAction(ISD::LOAD, VT, Custom);
+ setOperationAction(ISD::STORE, VT, Custom);
+ setOperationAction(ISD::TRUNCATE, VT, Custom);
+}
+
void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
addRegisterClass(VT, &AArch64::FPR64RegClass);
addTypeForNEON(VT, MVT::v2i32);
@@ -932,10 +1085,12 @@ void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
addTypeForNEON(VT, MVT::v4i32);
}
-EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &,
- EVT VT) const {
+EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &,
+ LLVMContext &C, EVT VT) const {
if (!VT.isVector())
return MVT::i32;
+ if (VT.isScalableVector())
+ return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount());
return VT.changeVectorElementTypeToInteger();
}
@@ -1035,7 +1190,8 @@ static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
}
bool AArch64TargetLowering::targetShrinkDemandedConstant(
- SDValue Op, const APInt &Demanded, TargetLoweringOpt &TLO) const {
+ SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
+ TargetLoweringOpt &TLO) const {
// Delay this optimization to as late as possible.
if (!TLO.LegalOps)
return false;
@@ -1052,7 +1208,7 @@ bool AArch64TargetLowering::targetShrinkDemandedConstant(
"i32 or i64 is expected after legalization.");
// Exit early if we demand all bits.
- if (Demanded.countPopulation() == Size)
+ if (DemandedBits.countPopulation() == Size)
return false;
unsigned NewOpc;
@@ -1073,7 +1229,7 @@ bool AArch64TargetLowering::targetShrinkDemandedConstant(
if (!C)
return false;
uint64_t Imm = C->getZExtValue();
- return optimizeLogicalImm(Op, Size, Imm, Demanded, TLO, NewOpc);
+ return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc);
}
/// computeKnownBitsForTargetNode - Determine which of the bits specified in
@@ -1177,7 +1333,7 @@ bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
// Same as above but handling LLTs instead.
bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
- LLT Ty, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags,
+ LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
bool *Fast) const {
if (Subtarget->requiresStrictAlign())
return false;
@@ -1192,7 +1348,7 @@ bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
// Code that uses clang vector extensions can mark that it
// wants unaligned accesses to be treated as fast by
// underspecifying alignment to be 1 or 2.
- Align <= 2 ||
+ Alignment <= 2 ||
// Disregard v2i64. Memcpy lowering produces those and splitting
// them regresses performance on micro-benchmarks and olden/bh.
@@ -1208,181 +1364,246 @@ AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
}
const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
+#define MAKE_CASE(V) \
+ case V: \
+ return #V;
switch ((AArch64ISD::NodeType)Opcode) {
- case AArch64ISD::FIRST_NUMBER: break;
- case AArch64ISD::CALL: return "AArch64ISD::CALL";
- case AArch64ISD::ADRP: return "AArch64ISD::ADRP";
- case AArch64ISD::ADR: return "AArch64ISD::ADR";
- case AArch64ISD::ADDlow: return "AArch64ISD::ADDlow";
- case AArch64ISD::LOADgot: return "AArch64ISD::LOADgot";
- case AArch64ISD::RET_FLAG: return "AArch64ISD::RET_FLAG";
- case AArch64ISD::BRCOND: return "AArch64ISD::BRCOND";
- case AArch64ISD::CSEL: return "AArch64ISD::CSEL";
- case AArch64ISD::FCSEL: return "AArch64ISD::FCSEL";
- case AArch64ISD::CSINV: return "AArch64ISD::CSINV";
- case AArch64ISD::CSNEG: return "AArch64ISD::CSNEG";
- case AArch64ISD::CSINC: return "AArch64ISD::CSINC";
- case AArch64ISD::THREAD_POINTER: return "AArch64ISD::THREAD_POINTER";
- case AArch64ISD::TLSDESC_CALLSEQ: return "AArch64ISD::TLSDESC_CALLSEQ";
- case AArch64ISD::ADC: return "AArch64ISD::ADC";
- case AArch64ISD::SBC: return "AArch64ISD::SBC";
- case AArch64ISD::ADDS: return "AArch64ISD::ADDS";
- case AArch64ISD::SUBS: return "AArch64ISD::SUBS";
- case AArch64ISD::ADCS: return "AArch64ISD::ADCS";
- case AArch64ISD::SBCS: return "AArch64ISD::SBCS";
- case AArch64ISD::ANDS: return "AArch64ISD::ANDS";
- case AArch64ISD::CCMP: return "AArch64ISD::CCMP";
- case AArch64ISD::CCMN: return "AArch64ISD::CCMN";
- case AArch64ISD::FCCMP: return "AArch64ISD::FCCMP";
- case AArch64ISD::FCMP: return "AArch64ISD::FCMP";
- case AArch64ISD::DUP: return "AArch64ISD::DUP";
- case AArch64ISD::DUPLANE8: return "AArch64ISD::DUPLANE8";
- case AArch64ISD::DUPLANE16: return "AArch64ISD::DUPLANE16";
- case AArch64ISD::DUPLANE32: return "AArch64ISD::DUPLANE32";
- case AArch64ISD::DUPLANE64: return "AArch64ISD::DUPLANE64";
- case AArch64ISD::MOVI: return "AArch64ISD::MOVI";
- case AArch64ISD::MOVIshift: return "AArch64ISD::MOVIshift";
- case AArch64ISD::MOVIedit: return "AArch64ISD::MOVIedit";
- case AArch64ISD::MOVImsl: return "AArch64ISD::MOVImsl";
- case AArch64ISD::FMOV: return "AArch64ISD::FMOV";
- case AArch64ISD::MVNIshift: return "AArch64ISD::MVNIshift";
- case AArch64ISD::MVNImsl: return "AArch64ISD::MVNImsl";
- case AArch64ISD::BICi: return "AArch64ISD::BICi";
- case AArch64ISD::ORRi: return "AArch64ISD::ORRi";
- case AArch64ISD::BSL: return "AArch64ISD::BSL";
- case AArch64ISD::NEG: return "AArch64ISD::NEG";
- case AArch64ISD::EXTR: return "AArch64ISD::EXTR";
- case AArch64ISD::ZIP1: return "AArch64ISD::ZIP1";
- case AArch64ISD::ZIP2: return "AArch64ISD::ZIP2";
- case AArch64ISD::UZP1: return "AArch64ISD::UZP1";
- case AArch64ISD::UZP2: return "AArch64ISD::UZP2";
- case AArch64ISD::TRN1: return "AArch64ISD::TRN1";
- case AArch64ISD::TRN2: return "AArch64ISD::TRN2";
- case AArch64ISD::REV16: return "AArch64ISD::REV16";
- case AArch64ISD::REV32: return "AArch64ISD::REV32";
- case AArch64ISD::REV64: return "AArch64ISD::REV64";
- case AArch64ISD::EXT: return "AArch64ISD::EXT";
- case AArch64ISD::VSHL: return "AArch64ISD::VSHL";
- case AArch64ISD::VLSHR: return "AArch64ISD::VLSHR";
- case AArch64ISD::VASHR: return "AArch64ISD::VASHR";
- case AArch64ISD::CMEQ: return "AArch64ISD::CMEQ";
- case AArch64ISD::CMGE: return "AArch64ISD::CMGE";
- case AArch64ISD::CMGT: return "AArch64ISD::CMGT";
- case AArch64ISD::CMHI: return "AArch64ISD::CMHI";
- case AArch64ISD::CMHS: return "AArch64ISD::CMHS";
- case AArch64ISD::FCMEQ: return "AArch64ISD::FCMEQ";
- case AArch64ISD::FCMGE: return "AArch64ISD::FCMGE";
- case AArch64ISD::FCMGT: return "AArch64ISD::FCMGT";
- case AArch64ISD::CMEQz: return "AArch64ISD::CMEQz";
- case AArch64ISD::CMGEz: return "AArch64ISD::CMGEz";
- case AArch64ISD::CMGTz: return "AArch64ISD::CMGTz";
- case AArch64ISD::CMLEz: return "AArch64ISD::CMLEz";
- case AArch64ISD::CMLTz: return "AArch64ISD::CMLTz";
- case AArch64ISD::FCMEQz: return "AArch64ISD::FCMEQz";
- case AArch64ISD::FCMGEz: return "AArch64ISD::FCMGEz";
- case AArch64ISD::FCMGTz: return "AArch64ISD::FCMGTz";
- case AArch64ISD::FCMLEz: return "AArch64ISD::FCMLEz";
- case AArch64ISD::FCMLTz: return "AArch64ISD::FCMLTz";
- case AArch64ISD::SADDV: return "AArch64ISD::SADDV";
- case AArch64ISD::UADDV: return "AArch64ISD::UADDV";
- case AArch64ISD::SMINV: return "AArch64ISD::SMINV";
- case AArch64ISD::UMINV: return "AArch64ISD::UMINV";
- case AArch64ISD::SMAXV: return "AArch64ISD::SMAXV";
- case AArch64ISD::UMAXV: return "AArch64ISD::UMAXV";
- case AArch64ISD::SMAXV_PRED: return "AArch64ISD::SMAXV_PRED";
- case AArch64ISD::UMAXV_PRED: return "AArch64ISD::UMAXV_PRED";
- case AArch64ISD::SMINV_PRED: return "AArch64ISD::SMINV_PRED";
- case AArch64ISD::UMINV_PRED: return "AArch64ISD::UMINV_PRED";
- case AArch64ISD::ORV_PRED: return "AArch64ISD::ORV_PRED";
- case AArch64ISD::EORV_PRED: return "AArch64ISD::EORV_PRED";
- case AArch64ISD::ANDV_PRED: return "AArch64ISD::ANDV_PRED";
- case AArch64ISD::CLASTA_N: return "AArch64ISD::CLASTA_N";
- case AArch64ISD::CLASTB_N: return "AArch64ISD::CLASTB_N";
- case AArch64ISD::LASTA: return "AArch64ISD::LASTA";
- case AArch64ISD::LASTB: return "AArch64ISD::LASTB";
- case AArch64ISD::REV: return "AArch64ISD::REV";
- case AArch64ISD::TBL: return "AArch64ISD::TBL";
- case AArch64ISD::NOT: return "AArch64ISD::NOT";
- case AArch64ISD::BIT: return "AArch64ISD::BIT";
- case AArch64ISD::CBZ: return "AArch64ISD::CBZ";
- case AArch64ISD::CBNZ: return "AArch64ISD::CBNZ";
- case AArch64ISD::TBZ: return "AArch64ISD::TBZ";
- case AArch64ISD::TBNZ: return "AArch64ISD::TBNZ";
- case AArch64ISD::TC_RETURN: return "AArch64ISD::TC_RETURN";
- case AArch64ISD::PREFETCH: return "AArch64ISD::PREFETCH";
- case AArch64ISD::SITOF: return "AArch64ISD::SITOF";
- case AArch64ISD::UITOF: return "AArch64ISD::UITOF";
- case AArch64ISD::NVCAST: return "AArch64ISD::NVCAST";
- case AArch64ISD::SQSHL_I: return "AArch64ISD::SQSHL_I";
- case AArch64ISD::UQSHL_I: return "AArch64ISD::UQSHL_I";
- case AArch64ISD::SRSHR_I: return "AArch64ISD::SRSHR_I";
- case AArch64ISD::URSHR_I: return "AArch64ISD::URSHR_I";
- case AArch64ISD::SQSHLU_I: return "AArch64ISD::SQSHLU_I";
- case AArch64ISD::WrapperLarge: return "AArch64ISD::WrapperLarge";
- case AArch64ISD::LD2post: return "AArch64ISD::LD2post";
- case AArch64ISD::LD3post: return "AArch64ISD::LD3post";
- case AArch64ISD::LD4post: return "AArch64ISD::LD4post";
- case AArch64ISD::ST2post: return "AArch64ISD::ST2post";
- case AArch64ISD::ST3post: return "AArch64ISD::ST3post";
- case AArch64ISD::ST4post: return "AArch64ISD::ST4post";
- case AArch64ISD::LD1x2post: return "AArch64ISD::LD1x2post";
- case AArch64ISD::LD1x3post: return "AArch64ISD::LD1x3post";
- case AArch64ISD::LD1x4post: return "AArch64ISD::LD1x4post";
- case AArch64ISD::ST1x2post: return "AArch64ISD::ST1x2post";
- case AArch64ISD::ST1x3post: return "AArch64ISD::ST1x3post";
- case AArch64ISD::ST1x4post: return "AArch64ISD::ST1x4post";
- case AArch64ISD::LD1DUPpost: return "AArch64ISD::LD1DUPpost";
- case AArch64ISD::LD2DUPpost: return "AArch64ISD::LD2DUPpost";
- case AArch64ISD::LD3DUPpost: return "AArch64ISD::LD3DUPpost";
- case AArch64ISD::LD4DUPpost: return "AArch64ISD::LD4DUPpost";
- case AArch64ISD::LD1LANEpost: return "AArch64ISD::LD1LANEpost";
- case AArch64ISD::LD2LANEpost: return "AArch64ISD::LD2LANEpost";
- case AArch64ISD::LD3LANEpost: return "AArch64ISD::LD3LANEpost";
- case AArch64ISD::LD4LANEpost: return "AArch64ISD::LD4LANEpost";
- case AArch64ISD::ST2LANEpost: return "AArch64ISD::ST2LANEpost";
- case AArch64ISD::ST3LANEpost: return "AArch64ISD::ST3LANEpost";
- case AArch64ISD::ST4LANEpost: return "AArch64ISD::ST4LANEpost";
- case AArch64ISD::SMULL: return "AArch64ISD::SMULL";
- case AArch64ISD::UMULL: return "AArch64ISD::UMULL";
- case AArch64ISD::FRECPE: return "AArch64ISD::FRECPE";
- case AArch64ISD::FRECPS: return "AArch64ISD::FRECPS";
- case AArch64ISD::FRSQRTE: return "AArch64ISD::FRSQRTE";
- case AArch64ISD::FRSQRTS: return "AArch64ISD::FRSQRTS";
- case AArch64ISD::STG: return "AArch64ISD::STG";
- case AArch64ISD::STZG: return "AArch64ISD::STZG";
- case AArch64ISD::ST2G: return "AArch64ISD::ST2G";
- case AArch64ISD::STZ2G: return "AArch64ISD::STZ2G";
- case AArch64ISD::SUNPKHI: return "AArch64ISD::SUNPKHI";
- case AArch64ISD::SUNPKLO: return "AArch64ISD::SUNPKLO";
- case AArch64ISD::UUNPKHI: return "AArch64ISD::UUNPKHI";
- case AArch64ISD::UUNPKLO: return "AArch64ISD::UUNPKLO";
- case AArch64ISD::INSR: return "AArch64ISD::INSR";
- case AArch64ISD::PTEST: return "AArch64ISD::PTEST";
- case AArch64ISD::PTRUE: return "AArch64ISD::PTRUE";
- case AArch64ISD::GLD1: return "AArch64ISD::GLD1";
- case AArch64ISD::GLD1_SCALED: return "AArch64ISD::GLD1_SCALED";
- case AArch64ISD::GLD1_SXTW: return "AArch64ISD::GLD1_SXTW";
- case AArch64ISD::GLD1_UXTW: return "AArch64ISD::GLD1_UXTW";
- case AArch64ISD::GLD1_SXTW_SCALED: return "AArch64ISD::GLD1_SXTW_SCALED";
- case AArch64ISD::GLD1_UXTW_SCALED: return "AArch64ISD::GLD1_UXTW_SCALED";
- case AArch64ISD::GLD1_IMM: return "AArch64ISD::GLD1_IMM";
- case AArch64ISD::GLD1S: return "AArch64ISD::GLD1S";
- case AArch64ISD::GLD1S_SCALED: return "AArch64ISD::GLD1S_SCALED";
- case AArch64ISD::GLD1S_SXTW: return "AArch64ISD::GLD1S_SXTW";
- case AArch64ISD::GLD1S_UXTW: return "AArch64ISD::GLD1S_UXTW";
- case AArch64ISD::GLD1S_SXTW_SCALED: return "AArch64ISD::GLD1S_SXTW_SCALED";
- case AArch64ISD::GLD1S_UXTW_SCALED: return "AArch64ISD::GLD1S_UXTW_SCALED";
- case AArch64ISD::GLD1S_IMM: return "AArch64ISD::GLD1S_IMM";
- case AArch64ISD::SST1: return "AArch64ISD::SST1";
- case AArch64ISD::SST1_SCALED: return "AArch64ISD::SST1_SCALED";
- case AArch64ISD::SST1_SXTW: return "AArch64ISD::SST1_SXTW";
- case AArch64ISD::SST1_UXTW: return "AArch64ISD::SST1_UXTW";
- case AArch64ISD::SST1_SXTW_SCALED: return "AArch64ISD::SST1_SXTW_SCALED";
- case AArch64ISD::SST1_UXTW_SCALED: return "AArch64ISD::SST1_UXTW_SCALED";
- case AArch64ISD::SST1_IMM: return "AArch64ISD::SST1_IMM";
- case AArch64ISD::LDP: return "AArch64ISD::LDP";
- case AArch64ISD::STP: return "AArch64ISD::STP";
- }
+ case AArch64ISD::FIRST_NUMBER:
+ break;
+ MAKE_CASE(AArch64ISD::CALL)
+ MAKE_CASE(AArch64ISD::ADRP)
+ MAKE_CASE(AArch64ISD::ADR)
+ MAKE_CASE(AArch64ISD::ADDlow)
+ MAKE_CASE(AArch64ISD::LOADgot)
+ MAKE_CASE(AArch64ISD::RET_FLAG)
+ MAKE_CASE(AArch64ISD::BRCOND)
+ MAKE_CASE(AArch64ISD::CSEL)
+ MAKE_CASE(AArch64ISD::FCSEL)
+ MAKE_CASE(AArch64ISD::CSINV)
+ MAKE_CASE(AArch64ISD::CSNEG)
+ MAKE_CASE(AArch64ISD::CSINC)
+ MAKE_CASE(AArch64ISD::THREAD_POINTER)
+ MAKE_CASE(AArch64ISD::TLSDESC_CALLSEQ)
+ MAKE_CASE(AArch64ISD::ADD_PRED)
+ MAKE_CASE(AArch64ISD::SDIV_PRED)
+ MAKE_CASE(AArch64ISD::UDIV_PRED)
+ MAKE_CASE(AArch64ISD::SMIN_MERGE_OP1)
+ MAKE_CASE(AArch64ISD::UMIN_MERGE_OP1)
+ MAKE_CASE(AArch64ISD::SMAX_MERGE_OP1)
+ MAKE_CASE(AArch64ISD::UMAX_MERGE_OP1)
+ MAKE_CASE(AArch64ISD::SHL_MERGE_OP1)
+ MAKE_CASE(AArch64ISD::SRL_MERGE_OP1)
+ MAKE_CASE(AArch64ISD::SRA_MERGE_OP1)
+ MAKE_CASE(AArch64ISD::SETCC_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::ADC)
+ MAKE_CASE(AArch64ISD::SBC)
+ MAKE_CASE(AArch64ISD::ADDS)
+ MAKE_CASE(AArch64ISD::SUBS)
+ MAKE_CASE(AArch64ISD::ADCS)
+ MAKE_CASE(AArch64ISD::SBCS)
+ MAKE_CASE(AArch64ISD::ANDS)
+ MAKE_CASE(AArch64ISD::CCMP)
+ MAKE_CASE(AArch64ISD::CCMN)
+ MAKE_CASE(AArch64ISD::FCCMP)
+ MAKE_CASE(AArch64ISD::FCMP)
+ MAKE_CASE(AArch64ISD::STRICT_FCMP)
+ MAKE_CASE(AArch64ISD::STRICT_FCMPE)
+ MAKE_CASE(AArch64ISD::DUP)
+ MAKE_CASE(AArch64ISD::DUPLANE8)
+ MAKE_CASE(AArch64ISD::DUPLANE16)
+ MAKE_CASE(AArch64ISD::DUPLANE32)
+ MAKE_CASE(AArch64ISD::DUPLANE64)
+ MAKE_CASE(AArch64ISD::MOVI)
+ MAKE_CASE(AArch64ISD::MOVIshift)
+ MAKE_CASE(AArch64ISD::MOVIedit)
+ MAKE_CASE(AArch64ISD::MOVImsl)
+ MAKE_CASE(AArch64ISD::FMOV)
+ MAKE_CASE(AArch64ISD::MVNIshift)
+ MAKE_CASE(AArch64ISD::MVNImsl)
+ MAKE_CASE(AArch64ISD::BICi)
+ MAKE_CASE(AArch64ISD::ORRi)
+ MAKE_CASE(AArch64ISD::BSP)
+ MAKE_CASE(AArch64ISD::NEG)
+ MAKE_CASE(AArch64ISD::EXTR)
+ MAKE_CASE(AArch64ISD::ZIP1)
+ MAKE_CASE(AArch64ISD::ZIP2)
+ MAKE_CASE(AArch64ISD::UZP1)
+ MAKE_CASE(AArch64ISD::UZP2)
+ MAKE_CASE(AArch64ISD::TRN1)
+ MAKE_CASE(AArch64ISD::TRN2)
+ MAKE_CASE(AArch64ISD::REV16)
+ MAKE_CASE(AArch64ISD::REV32)
+ MAKE_CASE(AArch64ISD::REV64)
+ MAKE_CASE(AArch64ISD::EXT)
+ MAKE_CASE(AArch64ISD::VSHL)
+ MAKE_CASE(AArch64ISD::VLSHR)
+ MAKE_CASE(AArch64ISD::VASHR)
+ MAKE_CASE(AArch64ISD::VSLI)
+ MAKE_CASE(AArch64ISD::VSRI)
+ MAKE_CASE(AArch64ISD::CMEQ)
+ MAKE_CASE(AArch64ISD::CMGE)
+ MAKE_CASE(AArch64ISD::CMGT)
+ MAKE_CASE(AArch64ISD::CMHI)
+ MAKE_CASE(AArch64ISD::CMHS)
+ MAKE_CASE(AArch64ISD::FCMEQ)
+ MAKE_CASE(AArch64ISD::FCMGE)
+ MAKE_CASE(AArch64ISD::FCMGT)
+ MAKE_CASE(AArch64ISD::CMEQz)
+ MAKE_CASE(AArch64ISD::CMGEz)
+ MAKE_CASE(AArch64ISD::CMGTz)
+ MAKE_CASE(AArch64ISD::CMLEz)
+ MAKE_CASE(AArch64ISD::CMLTz)
+ MAKE_CASE(AArch64ISD::FCMEQz)
+ MAKE_CASE(AArch64ISD::FCMGEz)
+ MAKE_CASE(AArch64ISD::FCMGTz)
+ MAKE_CASE(AArch64ISD::FCMLEz)
+ MAKE_CASE(AArch64ISD::FCMLTz)
+ MAKE_CASE(AArch64ISD::SADDV)
+ MAKE_CASE(AArch64ISD::UADDV)
+ MAKE_CASE(AArch64ISD::SRHADD)
+ MAKE_CASE(AArch64ISD::URHADD)
+ MAKE_CASE(AArch64ISD::SMINV)
+ MAKE_CASE(AArch64ISD::UMINV)
+ MAKE_CASE(AArch64ISD::SMAXV)
+ MAKE_CASE(AArch64ISD::UMAXV)
+ MAKE_CASE(AArch64ISD::SMAXV_PRED)
+ MAKE_CASE(AArch64ISD::UMAXV_PRED)
+ MAKE_CASE(AArch64ISD::SMINV_PRED)
+ MAKE_CASE(AArch64ISD::UMINV_PRED)
+ MAKE_CASE(AArch64ISD::ORV_PRED)
+ MAKE_CASE(AArch64ISD::EORV_PRED)
+ MAKE_CASE(AArch64ISD::ANDV_PRED)
+ MAKE_CASE(AArch64ISD::CLASTA_N)
+ MAKE_CASE(AArch64ISD::CLASTB_N)
+ MAKE_CASE(AArch64ISD::LASTA)
+ MAKE_CASE(AArch64ISD::LASTB)
+ MAKE_CASE(AArch64ISD::REV)
+ MAKE_CASE(AArch64ISD::REINTERPRET_CAST)
+ MAKE_CASE(AArch64ISD::TBL)
+ MAKE_CASE(AArch64ISD::FADD_PRED)
+ MAKE_CASE(AArch64ISD::FADDA_PRED)
+ MAKE_CASE(AArch64ISD::FADDV_PRED)
+ MAKE_CASE(AArch64ISD::FMA_PRED)
+ MAKE_CASE(AArch64ISD::FMAXV_PRED)
+ MAKE_CASE(AArch64ISD::FMAXNMV_PRED)
+ MAKE_CASE(AArch64ISD::FMINV_PRED)
+ MAKE_CASE(AArch64ISD::FMINNMV_PRED)
+ MAKE_CASE(AArch64ISD::NOT)
+ MAKE_CASE(AArch64ISD::BIT)
+ MAKE_CASE(AArch64ISD::CBZ)
+ MAKE_CASE(AArch64ISD::CBNZ)
+ MAKE_CASE(AArch64ISD::TBZ)
+ MAKE_CASE(AArch64ISD::TBNZ)
+ MAKE_CASE(AArch64ISD::TC_RETURN)
+ MAKE_CASE(AArch64ISD::PREFETCH)
+ MAKE_CASE(AArch64ISD::SITOF)
+ MAKE_CASE(AArch64ISD::UITOF)
+ MAKE_CASE(AArch64ISD::NVCAST)
+ MAKE_CASE(AArch64ISD::SQSHL_I)
+ MAKE_CASE(AArch64ISD::UQSHL_I)
+ MAKE_CASE(AArch64ISD::SRSHR_I)
+ MAKE_CASE(AArch64ISD::URSHR_I)
+ MAKE_CASE(AArch64ISD::SQSHLU_I)
+ MAKE_CASE(AArch64ISD::WrapperLarge)
+ MAKE_CASE(AArch64ISD::LD2post)
+ MAKE_CASE(AArch64ISD::LD3post)
+ MAKE_CASE(AArch64ISD::LD4post)
+ MAKE_CASE(AArch64ISD::ST2post)
+ MAKE_CASE(AArch64ISD::ST3post)
+ MAKE_CASE(AArch64ISD::ST4post)
+ MAKE_CASE(AArch64ISD::LD1x2post)
+ MAKE_CASE(AArch64ISD::LD1x3post)
+ MAKE_CASE(AArch64ISD::LD1x4post)
+ MAKE_CASE(AArch64ISD::ST1x2post)
+ MAKE_CASE(AArch64ISD::ST1x3post)
+ MAKE_CASE(AArch64ISD::ST1x4post)
+ MAKE_CASE(AArch64ISD::LD1DUPpost)
+ MAKE_CASE(AArch64ISD::LD2DUPpost)
+ MAKE_CASE(AArch64ISD::LD3DUPpost)
+ MAKE_CASE(AArch64ISD::LD4DUPpost)
+ MAKE_CASE(AArch64ISD::LD1LANEpost)
+ MAKE_CASE(AArch64ISD::LD2LANEpost)
+ MAKE_CASE(AArch64ISD::LD3LANEpost)
+ MAKE_CASE(AArch64ISD::LD4LANEpost)
+ MAKE_CASE(AArch64ISD::ST2LANEpost)
+ MAKE_CASE(AArch64ISD::ST3LANEpost)
+ MAKE_CASE(AArch64ISD::ST4LANEpost)
+ MAKE_CASE(AArch64ISD::SMULL)
+ MAKE_CASE(AArch64ISD::UMULL)
+ MAKE_CASE(AArch64ISD::FRECPE)
+ MAKE_CASE(AArch64ISD::FRECPS)
+ MAKE_CASE(AArch64ISD::FRSQRTE)
+ MAKE_CASE(AArch64ISD::FRSQRTS)
+ MAKE_CASE(AArch64ISD::STG)
+ MAKE_CASE(AArch64ISD::STZG)
+ MAKE_CASE(AArch64ISD::ST2G)
+ MAKE_CASE(AArch64ISD::STZ2G)
+ MAKE_CASE(AArch64ISD::SUNPKHI)
+ MAKE_CASE(AArch64ISD::SUNPKLO)
+ MAKE_CASE(AArch64ISD::UUNPKHI)
+ MAKE_CASE(AArch64ISD::UUNPKLO)
+ MAKE_CASE(AArch64ISD::INSR)
+ MAKE_CASE(AArch64ISD::PTEST)
+ MAKE_CASE(AArch64ISD::PTRUE)
+ MAKE_CASE(AArch64ISD::LD1_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::LD1S_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::LDNF1_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::LDNF1S_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::LDFF1_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::LDFF1S_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::LD1RQ_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::LD1RO_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::SVE_LD2_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::SVE_LD3_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::SVE_LD4_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLD1_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLD1_SCALED_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLD1_SXTW_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLD1_UXTW_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLD1_IMM_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLD1S_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLD1S_SCALED_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLD1S_SXTW_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLD1S_UXTW_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLD1S_IMM_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLDFF1_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLDFF1_SCALED_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLDFF1_SXTW_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLDFF1_UXTW_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLDFF1_IMM_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLDFF1S_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLDFF1S_IMM_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLDNT1_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLDNT1_INDEX_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLDNT1S_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::ST1_PRED)
+ MAKE_CASE(AArch64ISD::SST1_PRED)
+ MAKE_CASE(AArch64ISD::SST1_SCALED_PRED)
+ MAKE_CASE(AArch64ISD::SST1_SXTW_PRED)
+ MAKE_CASE(AArch64ISD::SST1_UXTW_PRED)
+ MAKE_CASE(AArch64ISD::SST1_SXTW_SCALED_PRED)
+ MAKE_CASE(AArch64ISD::SST1_UXTW_SCALED_PRED)
+ MAKE_CASE(AArch64ISD::SST1_IMM_PRED)
+ MAKE_CASE(AArch64ISD::SSTNT1_PRED)
+ MAKE_CASE(AArch64ISD::SSTNT1_INDEX_PRED)
+ MAKE_CASE(AArch64ISD::LDP)
+ MAKE_CASE(AArch64ISD::STP)
+ MAKE_CASE(AArch64ISD::STNP)
+ MAKE_CASE(AArch64ISD::DUP_MERGE_PASSTHRU)
+ MAKE_CASE(AArch64ISD::INDEX_VECTOR)
+ }
+#undef MAKE_CASE
return nullptr;
}
@@ -1454,12 +1675,6 @@ MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet(
return BB;
}
-MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchPad(
- MachineInstr &MI, MachineBasicBlock *BB) const {
- MI.eraseFromParent();
- return BB;
-}
-
MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
MachineInstr &MI, MachineBasicBlock *BB) const {
switch (MI.getOpcode()) {
@@ -1478,8 +1693,6 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
case AArch64::CATCHRET:
return EmitLoweredCatchRet(MI, BB);
- case AArch64::CATCHPAD:
- return EmitLoweredCatchPad(MI, BB);
}
}
@@ -1668,6 +1881,17 @@ static bool isCMN(SDValue Op, ISD::CondCode CC) {
(CC == ISD::SETEQ || CC == ISD::SETNE);
}
+static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl,
+ SelectionDAG &DAG, SDValue Chain,
+ bool IsSignaling) {
+ EVT VT = LHS.getValueType();
+ assert(VT != MVT::f128);
+ assert(VT != MVT::f16 && "Lowering of strict fp16 not yet implemented");
+ unsigned Opcode =
+ IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP;
+ return DAG.getNode(Opcode, dl, {VT, MVT::Other}, {Chain, LHS, RHS});
+}
+
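The two strict opcodes map onto AArch64's quiet and signaling compares: FCMP raises Invalid only for signaling NaNs, while FCMPE (used for STRICT_FSETCCS) raises it for any NaN. A sketch of the observable difference, assuming IEEE-conforming codegen with FP exception state observable (e.g. compiled under a strict FP model):

```cpp
#include <cfenv>
#include <cmath>
#include <cstdio>

int main() {
  double qnan = std::nan("");
  std::feclearexcept(FE_ALL_EXCEPT);
  bool eq = (qnan == 1.0);                 // quiet compare: FCMP, no trap
  int quiet = std::fetestexcept(FE_INVALID);
  std::feclearexcept(FE_ALL_EXCEPT);
  bool lt = (qnan < 1.0);                  // ordering compare: FCMPE
  int signaling = std::fetestexcept(FE_INVALID);
  std::printf("==: %d (raised %d), <: %d (raised %d)\n",
              eq, quiet != 0, lt, signaling != 0);
}
```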
static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
const SDLoc &dl, SelectionDAG &DAG) {
EVT VT = LHS.getValueType();
@@ -1699,14 +1923,22 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
// we combine a (CMP (sub 0, op1), op2) into a CMN instruction ?
Opcode = AArch64ISD::ADDS;
LHS = LHS.getOperand(1);
- } else if (LHS.getOpcode() == ISD::AND && isNullConstant(RHS) &&
- !isUnsignedIntSetCC(CC)) {
- // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
- // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
- // of the signed comparisons.
- Opcode = AArch64ISD::ANDS;
- RHS = LHS.getOperand(1);
- LHS = LHS.getOperand(0);
+ } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
+ if (LHS.getOpcode() == ISD::AND) {
+ // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
+ // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
+ // of the signed comparisons.
+ const SDValue ANDSNode = DAG.getNode(AArch64ISD::ANDS, dl,
+ DAG.getVTList(VT, MVT_CC),
+ LHS.getOperand(0),
+ LHS.getOperand(1));
+ // Replace all users of (and X, Y) with newly generated (ands X, Y)
+ DAG.ReplaceAllUsesWith(LHS, ANDSNode);
+ return ANDSNode.getValue(1);
+ } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
+ // Use result of ANDS
+ return LHS.getValue(1);
+ }
}
return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
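The rewrite above emits a single ANDS and points every user of the original AND at it, so the value and the flags come from one instruction. ANDS only produces meaningful N and Z flags (C and V are cleared), which is why the fold stays limited to equality and signed predicates. A rough flag model, not LLVM code:

```cpp
#include <cassert>
#include <cstdint>

// The flags an ANDS would set: only N and Z are meaningful here.
struct NZ {
  bool N, Z;
};

NZ ands(uint64_t x, uint64_t y, uint64_t &result) {
  result = x & y;
  return {static_cast<bool>(result >> 63), result == 0};
}

int main() {
  uint64_t r;
  // One ANDS yields both the value (reused by the other users of the AND)
  // and the flags consumed by the compare-with-zero.
  NZ flags = ands(0xF0, 0x0F, r);
  assert(r == 0 && flags.Z && !flags.N);
  flags = ands(~0ull, 1ull << 63, r);
  assert(!flags.Z && flags.N);
}
```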
@@ -2284,18 +2516,16 @@ getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
RTLIB::Libcall Call) const {
- SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end());
+ bool IsStrict = Op->isStrictFPOpcode();
+ unsigned Offset = IsStrict ? 1 : 0;
+ SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
+ SmallVector<SDValue, 2> Ops(Op->op_begin() + Offset, Op->op_end());
MakeLibCallOptions CallOptions;
- return makeLibCall(DAG, Call, MVT::f128, Ops, CallOptions, SDLoc(Op)).first;
-}
-
-// Returns true if the given Op is the overflow flag result of an overflow
-// intrinsic operation.
-static bool isOverflowIntrOpRes(SDValue Op) {
- unsigned Opc = Op.getOpcode();
- return (Op.getResNo() == 1 &&
- (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
- Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO));
+ SDValue Result;
+ SDLoc dl(Op);
+ std::tie(Result, Chain) = makeLibCall(DAG, Call, Op.getValueType(), Ops,
+ CallOptions, dl, Chain);
+ return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
}
static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) {
@@ -2310,7 +2540,7 @@ static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) {
// (csel 1, 0, invert(cc), overflow_op_bool)
// ... which later gets transformed to just a cset instruction with an
// inverted condition code, rather than a cset + eor sequence.
- if (isOneConstant(Other) && isOverflowIntrOpRes(Sel)) {
+ if (isOneConstant(Other) && ISD::isOverflowIntrOpRes(Sel)) {
// Only lower legal XALUO ops.
if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0)))
return SDValue();
@@ -2483,21 +2713,32 @@ SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
SelectionDAG &DAG) const {
- if (Op.getOperand(0).getValueType() != MVT::f128) {
+ bool IsStrict = Op->isStrictFPOpcode();
+ SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
+ EVT SrcVT = SrcVal.getValueType();
+
+ if (SrcVT != MVT::f128) {
+ // Expand cases where the input is a vector bigger than NEON.
+ if (useSVEForFixedLengthVectorVT(SrcVT))
+ return SDValue();
+
// It's legal except when f128 is involved
return Op;
}
RTLIB::Libcall LC;
- LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType());
+ LC = RTLIB::getFPROUND(SrcVT, Op.getValueType());
// FP_ROUND node has a second operand indicating whether it is known to be
// precise. That doesn't take part in the LibCall so we can't directly use
// LowerF128Call.
- SDValue SrcVal = Op.getOperand(0);
MakeLibCallOptions CallOptions;
- return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, CallOptions,
- SDLoc(Op)).first;
+ SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
+ SDValue Result;
+ SDLoc dl(Op);
+ std::tie(Result, Chain) = makeLibCall(DAG, LC, Op.getValueType(), SrcVal,
+ CallOptions, dl, Chain);
+ return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
}
SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
@@ -2542,32 +2783,34 @@ SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
SelectionDAG &DAG) const {
- if (Op.getOperand(0).getValueType().isVector())
+ bool IsStrict = Op->isStrictFPOpcode();
+ SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
+
+ if (SrcVal.getValueType().isVector())
return LowerVectorFP_TO_INT(Op, DAG);
// f16 conversions are promoted to f32 when full fp16 is not supported.
- if (Op.getOperand(0).getValueType() == MVT::f16 &&
- !Subtarget->hasFullFP16()) {
+ if (SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
+ assert(!IsStrict && "Lowering of strict fp16 not yet implemented");
SDLoc dl(Op);
return DAG.getNode(
Op.getOpcode(), dl, Op.getValueType(),
- DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, Op.getOperand(0)));
+ DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, SrcVal));
}
- if (Op.getOperand(0).getValueType() != MVT::f128) {
+ if (SrcVal.getValueType() != MVT::f128) {
// It's legal except when f128 is involved
return Op;
}
RTLIB::Libcall LC;
- if (Op.getOpcode() == ISD::FP_TO_SINT)
- LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType());
+ if (Op.getOpcode() == ISD::FP_TO_SINT ||
+ Op.getOpcode() == ISD::STRICT_FP_TO_SINT)
+ LC = RTLIB::getFPTOSINT(SrcVal.getValueType(), Op.getValueType());
else
- LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType());
+ LC = RTLIB::getFPTOUINT(SrcVal.getValueType(), Op.getValueType());
- SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end());
- MakeLibCallOptions CallOptions;
- return makeLibCall(DAG, LC, Op.getValueType(), Ops, CallOptions, SDLoc(Op)).first;
+ return LowerF128Call(Op, DAG, LC);
}
static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
@@ -2603,18 +2846,22 @@ SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
if (Op.getValueType().isVector())
return LowerVectorINT_TO_FP(Op, DAG);
+ bool IsStrict = Op->isStrictFPOpcode();
+ SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
+
// f16 conversions are promoted to f32 when full fp16 is not supported.
if (Op.getValueType() == MVT::f16 &&
!Subtarget->hasFullFP16()) {
+ assert(!IsStrict && "Lowering of strict fp16 not yet implemented");
SDLoc dl(Op);
return DAG.getNode(
ISD::FP_ROUND, dl, MVT::f16,
- DAG.getNode(Op.getOpcode(), dl, MVT::f32, Op.getOperand(0)),
+ DAG.getNode(Op.getOpcode(), dl, MVT::f32, SrcVal),
DAG.getIntPtrConstant(0, dl));
}
// i128 conversions are libcalls.
- if (Op.getOperand(0).getValueType() == MVT::i128)
+ if (SrcVal.getValueType() == MVT::i128)
return SDValue();
// Other conversions are legal, unless it's to the completely software-based
@@ -2623,10 +2870,11 @@ SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
return Op;
RTLIB::Libcall LC;
- if (Op.getOpcode() == ISD::SINT_TO_FP)
- LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
+ if (Op.getOpcode() == ISD::SINT_TO_FP ||
+ Op.getOpcode() == ISD::STRICT_SINT_TO_FP)
+ LC = RTLIB::getSINTTOFP(SrcVal.getValueType(), Op.getValueType());
else
- LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
+ LC = RTLIB::getUINTTOFP(SrcVal.getValueType(), Op.getValueType());
return LowerF128Call(Op, DAG, LC);
}
@@ -2666,7 +2914,8 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
}
static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) {
- if (Op.getValueType() != MVT::f16)
+ EVT OpVT = Op.getValueType();
+ if (OpVT != MVT::f16 && OpVT != MVT::bf16)
return SDValue();
assert(Op.getOperand(0).getValueType() == MVT::i16);
@@ -2675,7 +2924,7 @@ static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) {
Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
return SDValue(
- DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::f16, Op,
+ DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, OpVT, Op,
DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
0);
}
@@ -2804,16 +3053,19 @@ SDValue AArch64TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
// so that the shift + and get folded into a bitfield extract.
SDLoc dl(Op);
- SDValue FPCR_64 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i64,
- DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl,
- MVT::i64));
+ SDValue Chain = Op.getOperand(0);
+ SDValue FPCR_64 = DAG.getNode(
+ ISD::INTRINSIC_W_CHAIN, dl, {MVT::i64, MVT::Other},
+ {Chain, DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl, MVT::i64)});
+ Chain = FPCR_64.getValue(1);
SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, FPCR_64);
SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPCR_32,
DAG.getConstant(1U << 22, dl, MVT::i32));
SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
DAG.getConstant(22, dl, MVT::i32));
- return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
- DAG.getConstant(3, dl, MVT::i32));
+ SDValue AND = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
+ DAG.getConstant(3, dl, MVT::i32));
+ return DAG.getMergeValues({AND, Chain}, dl);
}
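The constants in this sequence encode a small table lookup: FPCR.RMode (bits 23:22) uses 0=RN, 1=RP, 2=RM, 3=RZ, while FLT_ROUNDS wants 1, 2, 3, 0 for those modes, and adding 1 << 22 before the shift rotates one encoding into the other. A worked standalone check:

```cpp
#include <cassert>
#include <cstdint>

unsigned fltRoundsFromFPCR(uint64_t fpcr) {
  uint32_t w = static_cast<uint32_t>(fpcr); // TRUNCATE
  return ((w + (1u << 22)) >> 22) & 3;      // ADD, SRL, AND from above
}

int main() {
  assert(fltRoundsFromFPCR(0b00u << 22) == 1); // RN -> 1 (to nearest)
  assert(fltRoundsFromFPCR(0b01u << 22) == 2); // RP -> 2 (upward)
  assert(fltRoundsFromFPCR(0b10u << 22) == 3); // RM -> 3 (downward)
  assert(fltRoundsFromFPCR(0b11u << 22) == 0); // RZ -> 0 (toward zero)
}
```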
static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
@@ -2885,6 +3137,12 @@ static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
}
+static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT,
+ int Pattern) {
+ return DAG.getNode(AArch64ISD::PTRUE, DL, VT,
+ DAG.getTargetConstant(Pattern, DL, MVT::i32));
+}
+
SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
@@ -2972,6 +3230,26 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::aarch64_sve_ptrue:
return DAG.getNode(AArch64ISD::PTRUE, dl, Op.getValueType(),
Op.getOperand(1));
+ case Intrinsic::aarch64_sve_dupq_lane:
+ return LowerDUPQLane(Op, DAG);
+ case Intrinsic::aarch64_sve_convert_from_svbool:
+ return DAG.getNode(AArch64ISD::REINTERPRET_CAST, dl, Op.getValueType(),
+ Op.getOperand(1));
+ case Intrinsic::aarch64_sve_convert_to_svbool: {
+ EVT OutVT = Op.getValueType();
+ EVT InVT = Op.getOperand(1).getValueType();
+ // Return the operand if the cast isn't changing type,
+ // i.e. <n x 16 x i1> -> <n x 16 x i1>
+ if (InVT == OutVT)
+ return Op.getOperand(1);
+ // Otherwise, zero the newly introduced lanes.
+ SDValue Reinterpret =
+ DAG.getNode(AArch64ISD::REINTERPRET_CAST, dl, OutVT, Op.getOperand(1));
+ SDValue Mask = getPTrue(DAG, dl, InVT, AArch64SVEPredPattern::all);
+ SDValue MaskReinterpret =
+ DAG.getNode(AArch64ISD::REINTERPRET_CAST, dl, OutVT, Mask);
+ return DAG.getNode(ISD::AND, dl, OutVT, Reinterpret, MaskReinterpret);
+ }
case Intrinsic::aarch64_sve_insr: {
SDValue Scalar = Op.getOperand(2);
@@ -3004,6 +3282,29 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
"llvm.eh.recoverfp must take a function as the first argument");
return IncomingFPOp;
}
+
+ case Intrinsic::aarch64_neon_vsri:
+ case Intrinsic::aarch64_neon_vsli: {
+ EVT Ty = Op.getValueType();
+
+ if (!Ty.isVector())
+ report_fatal_error("Unexpected type for aarch64_neon_vsli");
+
+ assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits());
+
+ bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri;
+ unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
+ return DAG.getNode(Opcode, dl, Ty, Op.getOperand(1), Op.getOperand(2),
+ Op.getOperand(3));
+ }
+
+ case Intrinsic::aarch64_neon_srhadd:
+ case Intrinsic::aarch64_neon_urhadd: {
+ bool IsSignedAdd = IntNo == Intrinsic::aarch64_neon_srhadd;
+ unsigned Opcode = IsSignedAdd ? AArch64ISD::SRHADD : AArch64ISD::URHADD;
+ return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
+ Op.getOperand(2));
+ }
}
}
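For convert_to_svbool, the AND with the reinterpreted all-true mask is what guarantees the widened predicate reads as zero in lanes the source type never had. A simplified standalone model (SVE predicates hold one bit per byte of the vector, so wider element types populate every 2nd/4th/8th bit; the constants below are illustrative only):

```cpp
#include <cassert>
#include <cstdint>

// Model a 16-lane predicate register as a bitmask.
uint16_t ptrueMask(unsigned stride) {
  uint16_t m = 0;
  for (unsigned i = 0; i < 16; i += stride)
    m |= uint16_t(1u << i);
  return m;
}

// convert_to_svbool: reinterpret, then AND with the source type's all-true
// mask so lanes that didn't exist in the source read as zero.
uint16_t toSvbool(uint16_t raw, unsigned srcStride) {
  return raw & ptrueMask(srcStride);
}

int main() {
  // An nxv4i1 predicate (stride 4) with stale bits elsewhere: only the
  // stride-4 positions survive the widening.
  assert(toSvbool(0xFFFF, 4) == 0x1111);
  assert(toSvbool(0x0010, 4) == 0x0010);
}
```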
@@ -3058,10 +3359,13 @@ SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
EVT MemVT = StoreNode->getMemoryVT();
if (VT.isVector()) {
+ if (useSVEForFixedLengthVectorVT(VT))
+ return LowerFixedLengthVectorStoreToSVE(Op, DAG);
+
unsigned AS = StoreNode->getAddressSpace();
- unsigned Align = StoreNode->getAlignment();
- if (Align < MemVT.getStoreSize() &&
- !allowsMisalignedMemoryAccesses(MemVT, AS, Align,
+ Align Alignment = StoreNode->getAlign();
+ if (Alignment < MemVT.getStoreSize() &&
+ !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment.value(),
StoreNode->getMemOperand()->getFlags(),
nullptr)) {
return scalarizeVectorStore(StoreNode, DAG);
@@ -3070,6 +3374,30 @@ SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
if (StoreNode->isTruncatingStore()) {
return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
}
+ // 256 bit non-temporal stores can be lowered to STNP. Do this as part of
+ // the custom lowering, as there are no un-paired non-temporal stores and
+ // legalization will break up 256 bit inputs.
+ if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u &&
+ MemVT.getVectorElementCount().Min % 2u == 0 &&
+ ((MemVT.getScalarSizeInBits() == 8u ||
+ MemVT.getScalarSizeInBits() == 16u ||
+ MemVT.getScalarSizeInBits() == 32u ||
+ MemVT.getScalarSizeInBits() == 64u))) {
+ SDValue Lo =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
+ MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
+ StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64));
+ SDValue Hi = DAG.getNode(
+ ISD::EXTRACT_SUBVECTOR, Dl,
+ MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
+ StoreNode->getValue(),
+ DAG.getConstant(MemVT.getVectorElementCount().Min / 2, Dl, MVT::i64));
+ SDValue Result = DAG.getMemIntrinsicNode(
+ AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other),
+ {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
+ StoreNode->getMemoryVT(), StoreNode->getMemOperand());
+ return Result;
+ }
} else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
assert(StoreNode->getValue()->getValueType(0) == MVT::i128);
SDValue Lo =
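The split above carves the 256-bit value into two 128-bit subvectors and hands both to a single STNP. A layout-only standalone sketch (the non-temporal hint itself has no portable C++ spelling):

```cpp
#include <cassert>
#include <cstdint>
#include <cstring>

struct V8i32 {
  uint32_t e[8]; // a 256-bit value, bigger than any NEON register
};

// The value splits into two 128-bit halves that one STNP writes as a pair.
void stnpModel(uint8_t *dst, const V8i32 &v) {
  std::memcpy(dst, &v.e[0], 16);      // Lo: elements 0..3
  std::memcpy(dst + 16, &v.e[4], 16); // Hi: elements 4..7
}

int main() {
  V8i32 v{{0, 1, 2, 3, 4, 5, 6, 7}};
  uint8_t buf[32];
  stnpModel(buf, v);
  V8i32 rt;
  std::memcpy(rt.e, buf, sizeof buf);
  for (int i = 0; i < 8; ++i)
    assert(rt.e[i] == v.e[i]);
}
```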
@@ -3104,6 +3432,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
case ISD::GlobalTLSAddress:
return LowerGlobalTLSAddress(Op, DAG);
case ISD::SETCC:
+ case ISD::STRICT_FSETCC:
+ case ISD::STRICT_FSETCCS:
return LowerSETCC(Op, DAG);
case ISD::BR_CC:
return LowerBR_CC(Op, DAG);
@@ -3138,14 +3468,19 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
case ISD::UMULO:
return LowerXALUO(Op, DAG);
case ISD::FADD:
+ if (useSVEForFixedLengthVectorVT(Op.getValueType()))
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED);
return LowerF128Call(Op, DAG, RTLIB::ADD_F128);
case ISD::FSUB:
return LowerF128Call(Op, DAG, RTLIB::SUB_F128);
case ISD::FMUL:
return LowerF128Call(Op, DAG, RTLIB::MUL_F128);
+ case ISD::FMA:
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED);
case ISD::FDIV:
return LowerF128Call(Op, DAG, RTLIB::DIV_F128);
case ISD::FP_ROUND:
+ case ISD::STRICT_FP_ROUND:
return LowerFP_ROUND(Op, DAG);
case ISD::FP_EXTEND:
return LowerFP_EXTEND(Op, DAG);
@@ -3169,6 +3504,20 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
return LowerSPLAT_VECTOR(Op, DAG);
case ISD::EXTRACT_SUBVECTOR:
return LowerEXTRACT_SUBVECTOR(Op, DAG);
+ case ISD::INSERT_SUBVECTOR:
+ return LowerINSERT_SUBVECTOR(Op, DAG);
+ case ISD::SDIV:
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::SDIV_PRED);
+ case ISD::UDIV:
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::UDIV_PRED);
+ case ISD::SMIN:
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_MERGE_OP1);
+ case ISD::UMIN:
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_MERGE_OP1);
+ case ISD::SMAX:
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_MERGE_OP1);
+ case ISD::UMAX:
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_MERGE_OP1);
case ISD::SRA:
case ISD::SRL:
case ISD::SHL:
@@ -3190,9 +3539,13 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
return LowerPREFETCH(Op, DAG);
case ISD::SINT_TO_FP:
case ISD::UINT_TO_FP:
+ case ISD::STRICT_SINT_TO_FP:
+ case ISD::STRICT_UINT_TO_FP:
return LowerINT_TO_FP(Op, DAG);
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
+ case ISD::STRICT_FP_TO_SINT:
+ case ISD::STRICT_FP_TO_UINT:
return LowerFP_TO_INT(Op, DAG);
case ISD::FSINCOS:
return LowerFSINCOS(Op, DAG);
@@ -3218,9 +3571,68 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
return LowerATOMIC_LOAD_AND(Op, DAG);
case ISD::DYNAMIC_STACKALLOC:
return LowerDYNAMIC_STACKALLOC(Op, DAG);
+ case ISD::VSCALE:
+ return LowerVSCALE(Op, DAG);
+ case ISD::TRUNCATE:
+ return LowerTRUNCATE(Op, DAG);
+ case ISD::LOAD:
+ if (useSVEForFixedLengthVectorVT(Op.getValueType()))
+ return LowerFixedLengthVectorLoadToSVE(Op, DAG);
+ llvm_unreachable("Unexpected request to lower ISD::LOAD");
+ case ISD::ADD:
+ if (useSVEForFixedLengthVectorVT(Op.getValueType()))
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::ADD_PRED);
+ llvm_unreachable("Unexpected request to lower ISD::ADD");
}
}
+bool AArch64TargetLowering::useSVEForFixedLengthVectors() const {
+ // Prefer NEON unless larger SVE registers are available.
+ return Subtarget->hasSVE() && Subtarget->getMinSVEVectorSizeInBits() >= 256;
+}
+
+bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(EVT VT) const {
+ if (!useSVEForFixedLengthVectors())
+ return false;
+
+ if (!VT.isFixedLengthVector())
+ return false;
+
+ // Fixed length predicates should be promoted to i8.
+ // NOTE: This is consistent with how NEON (and thus 64/128bit vectors) work.
+ if (VT.getVectorElementType() == MVT::i1)
+ return false;
+
+ // Don't use SVE for vectors we cannot scalarize if required.
+ switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
+ default:
+ return false;
+ case MVT::i8:
+ case MVT::i16:
+ case MVT::i32:
+ case MVT::i64:
+ case MVT::f16:
+ case MVT::f32:
+ case MVT::f64:
+ break;
+ }
+
+ // Ensure NEON MVTs only belong to a single register class.
+ if (VT.getSizeInBits() <= 128)
+ return false;
+
+ // Don't use SVE for types that don't fit.
+ if (VT.getSizeInBits() > Subtarget->getMinSVEVectorSizeInBits())
+ return false;
+
+ // TODO: Perhaps an artificial restriction, but worth having whilst getting
+ // the base fixed length SVE support in place.
+ if (!VT.isPow2VectorType())
+ return false;
+
+ return true;
+}
+
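Condensing the checks above into a standalone model (assuming i1 predicates and unsupported element types are already filtered out; the names are invented for illustration):

```cpp
#include <cassert>

struct FixedVT {
  unsigned NumElts;
  unsigned EltBits; // assume a supported element type (i8..i64, f16..f64)
  unsigned sizeInBits() const { return NumElts * EltBits; }
  bool isPow2() const { return NumElts && !(NumElts & (NumElts - 1)); }
};

bool useSVEForFixedLengthVectorVT(const FixedVT &VT, unsigned MinSVEBits) {
  if (MinSVEBits < 256)             // prefer NEON on 128-bit-only SVE
    return false;
  if (VT.sizeInBits() <= 128)       // NEON-sized types stay NEON
    return false;
  if (VT.sizeInBits() > MinSVEBits) // must fit the guaranteed register size
    return false;
  return VT.isPow2();               // current artificial restriction
}

int main() {
  assert(useSVEForFixedLengthVectorVT({8, 32}, 256));   // v8i32 -> SVE
  assert(!useSVEForFixedLengthVectorVT({4, 32}, 256));  // v4i32 -> NEON
  assert(!useSVEForFixedLengthVectorVT({16, 32}, 256)); // 512b: doesn't fit
  assert(useSVEForFixedLengthVectorVT({16, 32}, 512));  // unless VL >= 512
}
```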
//===----------------------------------------------------------------------===//
// Calling Convention Implementation
//===----------------------------------------------------------------------===//
@@ -3231,9 +3643,6 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
switch (CC) {
default:
report_fatal_error("Unsupported calling convention.");
- case CallingConv::AArch64_SVE_VectorCall:
- // Calling SVE functions is currently not yet supported.
- report_fatal_error("Unsupported calling convention.");
case CallingConv::WebKit_JS:
return CC_AArch64_WebKit_JS;
case CallingConv::GHC:
@@ -3256,6 +3665,7 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
case CallingConv::CFGuard_Check:
return CC_AArch64_Win64_CFGuard_Check;
case CallingConv::AArch64_VectorCall:
+ case CallingConv::AArch64_SVE_VectorCall:
return CC_AArch64_AAPCS;
}
}
@@ -3343,7 +3753,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
RC = &AArch64::GPR32RegClass;
else if (RegVT == MVT::i64)
RC = &AArch64::GPR64RegClass;
- else if (RegVT == MVT::f16)
+ else if (RegVT == MVT::f16 || RegVT == MVT::bf16)
RC = &AArch64::FPR16RegClass;
else if (RegVT == MVT::f32)
RC = &AArch64::FPR32RegClass;
@@ -3374,7 +3784,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
case CCValAssign::Indirect:
assert(VA.getValVT().isScalableVector() &&
"Only scalable vectors can be passed indirectly");
- llvm_unreachable("Spilling of SVE vectors not yet implemented");
+ break;
case CCValAssign::BCvt:
ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
break;
@@ -3391,7 +3801,9 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
} else { // VA.isRegLoc()
assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
unsigned ArgOffset = VA.getLocMemOffset();
- unsigned ArgSize = VA.getValVT().getSizeInBits() / 8;
+ unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect
+ ? VA.getLocVT().getSizeInBits()
+ : VA.getValVT().getSizeInBits()) / 8;
uint32_t BEAlign = 0;
if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
@@ -3417,7 +3829,8 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
case CCValAssign::Indirect:
assert(VA.getValVT().isScalableVector() &&
"Only scalable vectors can be passed indirectly");
- llvm_unreachable("Spilling of SVE vectors not yet implemented");
+ MemVT = VA.getLocVT();
+ break;
case CCValAssign::SExt:
ExtType = ISD::SEXTLOAD;
break;
@@ -3435,6 +3848,15 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
MemVT);
}
+
+ if (VA.getLocInfo() == CCValAssign::Indirect) {
+ assert(VA.getValVT().isScalableVector() &&
+ "Only scalable vectors can be passed indirectly");
+      // If the value is passed via a pointer, load it.
+ ArgValue =
+ DAG.getLoad(VA.getValVT(), DL, Chain, ArgValue, MachinePointerInfo());
+ }
+
if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
ArgValue, DAG.getValueType(MVT::i32));
@@ -3550,7 +3972,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
// The extra size here, if triggered, will always be 8.
MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
} else
- GPRIdx = MFI.CreateStackObject(GPRSaveSize, 8, false);
+ GPRIdx = MFI.CreateStackObject(GPRSaveSize, Align(8), false);
SDValue FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
@@ -3582,7 +4004,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
int FPRIdx = 0;
if (FPRSaveSize != 0) {
- FPRIdx = MFI.CreateStackObject(FPRSaveSize, 16, false);
+ FPRIdx = MFI.CreateStackObject(FPRSaveSize, Align(16), false);
SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
@@ -3703,6 +4125,13 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
CallingConv::ID CallerCC = CallerF.getCallingConv();
bool CCMatch = CallerCC == CalleeCC;
+  // When using the Windows calling convention on a non-Windows OS, we want
+ // to back up and restore X18 in such functions; we can't do a tail call
+ // from those functions.
+ if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() &&
+ CalleeCC != CallingConv::Win64)
+ return false;
+
// Byval parameters hand the function a pointer directly into the stack area
// we want to reuse during a tail call. Working around this *is* possible (see
// X86) but less efficient and uglier in LowerCall.
@@ -3795,6 +4224,18 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+  // If any of the arguments is passed indirectly, it must be SVE, so the
+  // 'getBytesInStackArgArea' is not sufficient to determine whether we need to
+  // allocate space on the stack. That is why we check this explicitly here:
+  // if any argument is passed indirectly, the call cannot be a tailcall.
+ if (llvm::any_of(ArgLocs, [](CCValAssign &A) {
+ assert((A.getLocInfo() != CCValAssign::Indirect ||
+ A.getValVT().isScalableVector()) &&
+ "Expected value to be scalable");
+ return A.getLocInfo() == CCValAssign::Indirect;
+ }))
+ return false;
+
// If the stack arguments for this call do not fit into our own save area then
// the call cannot be made tail.
if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
@@ -3873,7 +4314,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
// Check if it's really possible to do a tail call.
IsTailCall = isEligibleForTailCallOptimization(
Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
- if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall())
+ if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall())
report_fatal_error("failed to perform tail call elimination on a call "
"site marked musttail");
@@ -3983,7 +4424,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
SmallVector<SDValue, 8> MemOpChains;
auto PtrVT = getPointerTy(DAG.getDataLayout());
- if (IsVarArg && CLI.CS && CLI.CS.isMustTailCall()) {
+ if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) {
const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
for (const auto &F : Forwards) {
SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
@@ -4035,7 +4476,20 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
case CCValAssign::Indirect:
assert(VA.getValVT().isScalableVector() &&
"Only scalable vectors can be passed indirectly");
- llvm_unreachable("Spilling of SVE vectors not yet implemented");
+ MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+ Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext());
+ Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty);
+ int FI = MFI.CreateStackObject(
+ VA.getValVT().getStoreSize().getKnownMinSize(), Alignment, false);
+ MFI.setStackID(FI, TargetStackID::SVEVector);
+
+ SDValue SpillSlot = DAG.getFrameIndex(
+ FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
+ Chain = DAG.getStore(
+ Chain, DL, Arg, SpillSlot,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
+ Arg = SpillSlot;
+ break;
}
if (VA.isRegLoc()) {
@@ -4071,7 +4525,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
RegsToPass.emplace_back(VA.getLocReg(), Arg);
RegsUsed.insert(VA.getLocReg());
const TargetOptions &Options = DAG.getTarget().Options;
- if (Options.EnableDebugEntryValues)
+ if (Options.EmitCallSiteInfo)
CSInfo.emplace_back(VA.getLocReg(), i);
}
} else {
@@ -4083,8 +4537,12 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
// FIXME: This works on big-endian for composite byvals, which are the
// common case. It should also work for fundamental types too.
uint32_t BEAlign = 0;
- unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
- : VA.getValVT().getSizeInBits();
+ unsigned OpSize;
+ if (VA.getLocInfo() == CCValAssign::Indirect)
+ OpSize = VA.getLocVT().getSizeInBits();
+ else
+ OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
+ : VA.getValVT().getSizeInBits();
OpSize = (OpSize + 7) / 8;
if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
!Flags.isInConsecutiveRegs()) {
@@ -4120,10 +4578,10 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
SDValue SizeNode =
DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
SDValue Cpy = DAG.getMemcpy(
- Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
+ Chain, DL, DstAddr, Arg, SizeNode,
+ Outs[i].Flags.getNonZeroByValAlign(),
/*isVol = */ false, /*AlwaysInline = */ false,
- /*isTailCall = */ false,
- DstInfo, MachinePointerInfo());
+ /*isTailCall = */ false, DstInfo, MachinePointerInfo());
MemOpChains.push_back(Cpy);
} else {
@@ -4257,6 +4715,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
// Returns a chain and a flag for retval copy to use.
Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops);
+ DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
InFlag = Chain.getValue(1);
DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
@@ -4422,7 +4881,7 @@ SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
SelectionDAG &DAG,
unsigned Flag) const {
- return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlignment(),
+ return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(),
N->getOffset(), Flag);
}
@@ -4913,7 +5372,7 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
// Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
// instruction.
- if (isOverflowIntrOpRes(LHS) && isOneConstant(RHS) &&
+ if (ISD::isOverflowIntrOpRes(LHS) && isOneConstant(RHS) &&
(CC == ISD::SETEQ || CC == ISD::SETNE)) {
// Only lower legal XALUO ops.
if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
@@ -4997,8 +5456,8 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
Cmp);
}
- assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
- LHS.getValueType() == MVT::f64);
+ assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 ||
+ LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
// Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
// clean. Some of them require two branches to implement.
@@ -5124,6 +5583,15 @@ SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
if (VT == MVT::i64)
UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV);
return UaddLV;
+ } else if (VT == MVT::i128) {
+ Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val);
+
+ SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val);
+ SDValue UaddLV = DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
+ DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop);
+
+ return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i128, UaddLV);
}
assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
@@ -5154,9 +5622,15 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
if (Op.getValueType().isVector())
return LowerVSETCC(Op, DAG);
- SDValue LHS = Op.getOperand(0);
- SDValue RHS = Op.getOperand(1);
- ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+ bool IsStrict = Op->isStrictFPOpcode();
+ bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
+ unsigned OpNo = IsStrict ? 1 : 0;
+ SDValue Chain;
+ if (IsStrict)
+ Chain = Op.getOperand(0);
+ SDValue LHS = Op.getOperand(OpNo + 0);
+ SDValue RHS = Op.getOperand(OpNo + 1);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(OpNo + 2))->get();
SDLoc dl(Op);
// We chose ZeroOrOneBooleanContents, so use zero and one.
@@ -5167,13 +5641,14 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
// Handle f128 first, since one possible outcome is a normal integer
// comparison which gets picked up by the next if statement.
if (LHS.getValueType() == MVT::f128) {
- softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
+ softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS, Chain,
+ IsSignaling);
// If softenSetCCOperands returned a scalar, use it.
if (!RHS.getNode()) {
assert(LHS.getValueType() == Op.getValueType() &&
"Unexpected setcc expansion!");
- return LHS;
+ return IsStrict ? DAG.getMergeValues({LHS, Chain}, dl) : LHS;
}
}
@@ -5185,7 +5660,8 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
// Note that we inverted the condition above, so we reverse the order of
// the true and false operands here. This will allow the setcc to be
// matched to a single CSINC instruction.
- return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp);
+ SDValue Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp);
+ return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
}
// Now we know we're dealing with FP values.
@@ -5194,10 +5670,15 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
// If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead
// and do the comparison.
- SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
+ SDValue Cmp;
+ if (IsStrict)
+ Cmp = emitStrictFPComparison(LHS, RHS, dl, DAG, Chain, IsSignaling);
+ else
+ Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
AArch64CC::CondCode CC1, CC2;
changeFPCCToAArch64CC(CC, CC1, CC2);
+ SDValue Res;
if (CC2 == AArch64CC::AL) {
changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, LHS.getValueType()), CC1,
CC2);
@@ -5206,7 +5687,7 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
// Note that we inverted the condition above, so we reverse the order of
// the true and false operands here. This will allow the setcc to be
// matched to a single CSINC instruction.
- return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp);
+ Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp);
} else {
// Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
// totally clean. Some of them require two CSELs to implement. As is in
@@ -5219,8 +5700,9 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
- return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
+ Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
}
+ return IsStrict ? DAG.getMergeValues({Res, Cmp.getValue(1)}, dl) : Res;
}
SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
@@ -5429,9 +5911,17 @@ SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
SDValue FVal = Op->getOperand(2);
SDLoc DL(Op);
+ EVT Ty = Op.getValueType();
+ if (Ty.isScalableVector()) {
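+    // E.g. (select i1 %cc, nxv4i32 %t, nxv4i32 %f) is lowered to
+    // (vselect (splat_vector (trunc %cc)), %t, %f).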
+ SDValue TruncCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, CCVal);
+ MVT PredVT = MVT::getVectorVT(MVT::i1, Ty.getVectorElementCount());
+ SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, TruncCC);
+ return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
+ }
+
// Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
// instruction.
- if (isOverflowIntrOpRes(CCVal)) {
+ if (ISD::isOverflowIntrOpRes(CCVal)) {
// Only lower legal XALUO ops.
if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0)))
return SDValue();
@@ -5642,9 +6132,9 @@ SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2),
- DAG.getConstant(VaListSize, DL, MVT::i32), PtrSize,
- false, false, false, MachinePointerInfo(DestSV),
- MachinePointerInfo(SrcSV));
+ DAG.getConstant(VaListSize, DL, MVT::i32),
+ Align(PtrSize), false, false, false,
+ MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV));
}
SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
@@ -5656,7 +6146,7 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
SDValue Chain = Op.getOperand(0);
SDValue Addr = Op.getOperand(1);
- unsigned Align = Op.getConstantOperandVal(3);
+ MaybeAlign Align(Op.getConstantOperandVal(3));
unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8;
auto PtrVT = getPointerTy(DAG.getDataLayout());
auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
@@ -5665,12 +6155,11 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
Chain = VAList.getValue(1);
VAList = DAG.getZExtOrTrunc(VAList, DL, PtrVT);
- if (Align > MinSlotSize) {
- assert(((Align & (Align - 1)) == 0) && "Expected Align to be a power of 2");
+ if (Align && *Align > MinSlotSize) {
VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
- DAG.getConstant(Align - 1, DL, PtrVT));
+ DAG.getConstant(Align->value() - 1, DL, PtrVT));
VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList,
- DAG.getConstant(-(int64_t)Align, DL, PtrVT));
+ DAG.getConstant(-(int64_t)Align->value(), DL, PtrVT));
}
Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
@@ -7001,7 +7490,8 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS);
// vrev <4 x i16> -> REV32
if (VT.getVectorElementType() == MVT::i16 ||
- VT.getVectorElementType() == MVT::f16)
+ VT.getVectorElementType() == MVT::f16 ||
+ VT.getVectorElementType() == MVT::bf16)
return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS);
// vrev <4 x i8> -> REV16
assert(VT.getVectorElementType() == MVT::i8);
@@ -7014,7 +7504,7 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
unsigned Opcode;
if (EltTy == MVT::i8)
Opcode = AArch64ISD::DUPLANE8;
- else if (EltTy == MVT::i16 || EltTy == MVT::f16)
+ else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16)
Opcode = AArch64ISD::DUPLANE16;
else if (EltTy == MVT::i32 || EltTy == MVT::f32)
Opcode = AArch64ISD::DUPLANE32;
@@ -7121,7 +7611,7 @@ static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
static unsigned getDUPLANEOp(EVT EltType) {
if (EltType == MVT::i8)
return AArch64ISD::DUPLANE8;
- if (EltType == MVT::i16 || EltType == MVT::f16)
+ if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16)
return AArch64ISD::DUPLANE16;
if (EltType == MVT::i32 || EltType == MVT::f32)
return AArch64ISD::DUPLANE32;
@@ -7330,18 +7820,16 @@ SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
// Extend input splat value where needed to fit into a GPR (32b or 64b only)
// FPRs don't have this restriction.
switch (ElemVT.getSimpleVT().SimpleTy) {
- case MVT::i8:
- case MVT::i16:
- case MVT::i32:
- SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i32);
- return DAG.getNode(AArch64ISD::DUP, dl, VT, SplatVal);
- case MVT::i64:
- SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i64);
- return DAG.getNode(AArch64ISD::DUP, dl, VT, SplatVal);
case MVT::i1: {
+ // The only legal i1 vectors are SVE vectors, so we can use SVE-specific
+ // lowering code.
+ if (auto *ConstVal = dyn_cast<ConstantSDNode>(SplatVal)) {
+ if (ConstVal->isOne())
+ return getPTrue(DAG, dl, VT, AArch64SVEPredPattern::all);
+ // TODO: Add special case for constant false
+ }
// The general case of i1. There isn't any natural way to do this,
// so we use some trickery with whilelo.
- // TODO: Add special cases for splat of constant true/false.
SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i64);
SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i64, SplatVal,
DAG.getValueType(MVT::i1));
@@ -7350,15 +7838,76 @@ SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, ID,
DAG.getConstant(0, dl, MVT::i64), SplatVal);
}
- // TODO: we can support float types, but haven't added patterns yet.
+ case MVT::i8:
+ case MVT::i16:
+ case MVT::i32:
+ SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i32);
+ break;
+ case MVT::i64:
+ SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i64);
+ break;
case MVT::f16:
+ case MVT::bf16:
case MVT::f32:
case MVT::f64:
+ // Fine as is
+ break;
default:
report_fatal_error("Unsupported SPLAT_VECTOR input operand type");
}
+
+ return DAG.getNode(AArch64ISD::DUP, dl, VT, SplatVal);
+}
+
+SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+
+ EVT VT = Op.getValueType();
+ if (!isTypeLegal(VT) || !VT.isScalableVector())
+ return SDValue();
+
+ // Current lowering only supports the SVE-ACLE types.
+ if (VT.getSizeInBits().getKnownMinSize() != AArch64::SVEBitsPerBlock)
+ return SDValue();
+
+  // The DUPQ operation is independent of the element type, so normalise to i64s.
+ SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1));
+ SDValue Idx128 = Op.getOperand(2);
+
+ // DUPQ can be used when idx is in range.
+ auto *CIdx = dyn_cast<ConstantSDNode>(Idx128);
+ if (CIdx && (CIdx->getZExtValue() <= 3)) {
+ SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64);
+ SDNode *DUPQ =
+ DAG.getMachineNode(AArch64::DUP_ZZI_Q, DL, MVT::nxv2i64, V, CI);
+ return DAG.getNode(ISD::BITCAST, DL, VT, SDValue(DUPQ, 0));
+ }
+
+ // The ACLE says this must produce the same result as:
+ // svtbl(data, svadd_x(svptrue_b64(),
+ // svand_x(svptrue_b64(), svindex_u64(0, 1), 1),
+ // index * 2))
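+  // E.g. for Idx128 == 1 the shuffle mask is <2,3,2,3,...>, selecting 64-bit
+  // elements 2 and 3 (the second 128-bit quadword) in every position.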
+ SDValue One = DAG.getConstant(1, DL, MVT::i64);
+ SDValue SplatOne = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, One);
+
+  // Create the vector 0,1,0,1,...
+ SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
+ SDValue SV = DAG.getNode(AArch64ISD::INDEX_VECTOR,
+ DL, MVT::nxv2i64, Zero, One);
+ SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne);
+
+  // Create the vector idx64,idx64+1,idx64,idx64+1,...
+ SDValue Idx64 = DAG.getNode(ISD::ADD, DL, MVT::i64, Idx128, Idx128);
+ SDValue SplatIdx64 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Idx64);
+ SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64);
+
+  // Create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],...
+ SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask);
+ return DAG.getNode(ISD::BITCAST, DL, VT, TBL);
}
+
static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
APInt &UndefBits) {
EVT VT = BVN->getValueType(0);
@@ -7609,8 +8158,10 @@ static unsigned getIntrinsicID(const SDNode *N) {
// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
// to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
-// BUILD_VECTORs with constant element C1, C2 is a constant, and C1 == ~C2.
-// Also, logical shift right -> sri, with the same structure.
+// BUILD_VECTORs with constant element C1, C2 is a constant, and:
+// - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2)
+// - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2)
+// The (or (lsl Y, C2), (and X, BvecC1)) case is also handled.
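+//
+// For example, with 8-bit elements and C2 == 3, the SLI case requires
+// C1 == ~(0xFF << 3) == 0x07, i.e. the AND keeps exactly the low bits that
+// the shifted-in value cannot touch:
+//   (or (and X, splat(0x07)), (shl Y, 3)) --> (SLI X, Y, 3)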
static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
@@ -7619,49 +8170,70 @@ static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
SDLoc DL(N);
- // Is the first op an AND?
- const SDValue And = N->getOperand(0);
- if (And.getOpcode() != ISD::AND)
+ SDValue And;
+ SDValue Shift;
+
+ SDValue FirstOp = N->getOperand(0);
+ unsigned FirstOpc = FirstOp.getOpcode();
+ SDValue SecondOp = N->getOperand(1);
+ unsigned SecondOpc = SecondOp.getOpcode();
+
+ // Is one of the operands an AND or a BICi? The AND may have been optimised to
+ // a BICi in order to use an immediate instead of a register.
+  // Is the other operand a shl or lshr? This will have been turned into:
+ // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift.
+ if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) &&
+ (SecondOpc == AArch64ISD::VSHL || SecondOpc == AArch64ISD::VLSHR)) {
+ And = FirstOp;
+ Shift = SecondOp;
+
+ } else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) &&
+ (FirstOpc == AArch64ISD::VSHL || FirstOpc == AArch64ISD::VLSHR)) {
+ And = SecondOp;
+ Shift = FirstOp;
+ } else
return SDValue();
- // Is the second op an shl or lshr?
- SDValue Shift = N->getOperand(1);
- // This will have been turned into: AArch64ISD::VSHL vector, #shift
- // or AArch64ISD::VLSHR vector, #shift
- unsigned ShiftOpc = Shift.getOpcode();
- if ((ShiftOpc != AArch64ISD::VSHL && ShiftOpc != AArch64ISD::VLSHR))
- return SDValue();
- bool IsShiftRight = ShiftOpc == AArch64ISD::VLSHR;
+ bool IsAnd = And.getOpcode() == ISD::AND;
+ bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR;
// Is the shift amount constant?
ConstantSDNode *C2node = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
if (!C2node)
return SDValue();
- // Is the and mask vector all constant?
uint64_t C1;
- if (!isAllConstantBuildVector(And.getOperand(1), C1))
- return SDValue();
+ if (IsAnd) {
+ // Is the and mask vector all constant?
+ if (!isAllConstantBuildVector(And.getOperand(1), C1))
+ return SDValue();
+ } else {
+ // Reconstruct the corresponding AND immediate from the two BICi immediates.
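+    // E.g. a BICi with immediate 0xFF and shift 8 clears bits 15:8, which is
+    // equivalent to an AND with mask ~(0xFF << 8).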
+ ConstantSDNode *C1nodeImm = dyn_cast<ConstantSDNode>(And.getOperand(1));
+ ConstantSDNode *C1nodeShift = dyn_cast<ConstantSDNode>(And.getOperand(2));
+ assert(C1nodeImm && C1nodeShift);
+ C1 = ~(C1nodeImm->getZExtValue() << C1nodeShift->getZExtValue());
+ }
- // Is C1 == ~C2, taking into account how much one can shift elements of a
- // particular size?
+ // Is C1 == ~(Ones(ElemSizeInBits) << C2) or
+ // C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account
+ // how much one can shift elements of a particular size?
uint64_t C2 = C2node->getZExtValue();
unsigned ElemSizeInBits = VT.getScalarSizeInBits();
if (C2 > ElemSizeInBits)
return SDValue();
- unsigned ElemMask = (1 << ElemSizeInBits) - 1;
- if ((C1 & ElemMask) != (~C2 & ElemMask))
+
+ APInt C1AsAPInt(ElemSizeInBits, C1);
+ APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(ElemSizeInBits, C2)
+ : APInt::getLowBitsSet(ElemSizeInBits, C2);
+ if (C1AsAPInt != RequiredC1)
return SDValue();
SDValue X = And.getOperand(0);
SDValue Y = Shift.getOperand(0);
- unsigned Intrin =
- IsShiftRight ? Intrinsic::aarch64_neon_vsri : Intrinsic::aarch64_neon_vsli;
- SDValue ResultSLI =
- DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
- DAG.getConstant(Intrin, DL, MVT::i32), X, Y,
- Shift.getOperand(1));
+ unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
+ SDValue ResultSLI = DAG.getNode(Inst, DL, VT, X, Y, Shift.getOperand(1));
LLVM_DEBUG(dbgs() << "aarch64-lower: transformed: \n");
LLVM_DEBUG(N->dump(&DAG));
@@ -7675,10 +8247,8 @@ static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
SelectionDAG &DAG) const {
// Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
- if (EnableAArch64SlrGeneration) {
- if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
- return Res;
- }
+ if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
+ return Res;
EVT VT = Op.getValueType();
@@ -7966,8 +8536,8 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
if (VT.getVectorElementType().isFloatingPoint()) {
SmallVector<SDValue, 8> Ops;
EVT EltTy = VT.getVectorElementType();
- assert ((EltTy == MVT::f16 || EltTy == MVT::f32 || EltTy == MVT::f64) &&
- "Unsupported floating-point vector type");
+ assert ((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 ||
+ EltTy == MVT::f64) && "Unsupported floating-point vector type");
LLVM_DEBUG(
dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
"BITCASTS, and try again\n");
@@ -8086,11 +8656,12 @@ SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
// Insertion/extraction are legal for V128 types.
if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
- VT == MVT::v8f16)
+ VT == MVT::v8f16 || VT == MVT::v8bf16)
return Op;
if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
- VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16)
+ VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
+ VT != MVT::v4bf16)
return SDValue();
// For V64 types, we perform insertion by expanding the value
@@ -8120,11 +8691,12 @@ AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
// Insertion/extraction are legal for V128 types.
if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
- VT == MVT::v8f16)
+ VT == MVT::v8f16 || VT == MVT::v8bf16)
return Op;
if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
- VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16)
+ VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
+ VT != MVT::v4bf16)
return SDValue();
// For V64 types, we perform extraction by expanding the value
@@ -8144,32 +8716,57 @@ AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
SelectionDAG &DAG) const {
- EVT VT = Op.getOperand(0).getValueType();
- SDLoc dl(Op);
- // Just in case...
- if (!VT.isVector())
- return SDValue();
-
- ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1));
- if (!Cst)
- return SDValue();
- unsigned Val = Cst->getZExtValue();
+ assert(Op.getValueType().isFixedLengthVector() &&
+ "Only cases that extract a fixed length vector are supported!");
+ EVT InVT = Op.getOperand(0).getValueType();
+ unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
unsigned Size = Op.getValueSizeInBits();
+ if (InVT.isScalableVector()) {
+ // This will be matched by custom code during ISelDAGToDAG.
+ if (Idx == 0 && isPackedVectorType(InVT, DAG))
+ return Op;
+
+ return SDValue();
+ }
+
// This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
- if (Val == 0)
+ if (Idx == 0 && InVT.getSizeInBits() <= 128)
return Op;
// If this is extracting the upper 64-bits of a 128-bit vector, we match
// that directly.
- if (Size == 64 && Val * VT.getScalarSizeInBits() == 64)
+ if (Size == 64 && Idx * InVT.getScalarSizeInBits() == 64)
+ return Op;
+
+ return SDValue();
+}
+
+SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
+ SelectionDAG &DAG) const {
+ assert(Op.getValueType().isScalableVector() &&
+ "Only expect to lower inserts into scalable vectors!");
+
+ EVT InVT = Op.getOperand(1).getValueType();
+ unsigned Idx = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+
+  // We don't have any patterns for scalable vectors yet.
+ if (InVT.isScalableVector() || !useSVEForFixedLengthVectorVT(InVT))
+ return SDValue();
+
+ // This will be matched by custom code during ISelDAGToDAG.
+ if (Idx == 0 && isPackedVectorType(InVT, DAG) && Op.getOperand(0).isUndef())
return Op;
return SDValue();
}
bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
+ // Currently no fixed length shuffles that require SVE are legal.
+ if (useSVEForFixedLengthVectorVT(VT))
+ return false;
+
if (VT.getVectorNumElements() == 4 &&
(VT.is128BitVector() || VT.is64BitVector())) {
unsigned PFIndexes[4];
@@ -8249,6 +8846,81 @@ static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
}
+// Attempt to form urhadd(OpA, OpB) from
+// truncate(vlshr(sub(zext(OpB), xor(zext(OpA), Ones(ElemSizeInBits))), 1)).
+// The original form of this expression is
+// truncate(srl(add(zext(OpB), add(zext(OpA), 1)), 1)) and before this function
+// is called the srl will have been lowered to AArch64ISD::VLSHR and the
+// ((OpA + OpB + 1) >> 1) expression will have been changed to (OpB - (~OpA)).
+// This function can also recognize a variant of this pattern that uses sign
+// extension instead of zero extension, forming a srhadd(OpA, OpB) from it.
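+//
+// For example, with illustrative i8 inputs: urhadd(250, 251) ==
+// (250 + 251 + 1) >> 1 == 251, computed without wrapping in the wider type.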
+SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op,
+ SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+
+ if (!VT.isVector() || VT.isScalableVector())
+ return Op;
+
+ if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType()))
+ return LowerFixedLengthVectorTruncateToSVE(Op, DAG);
+
+ // Since we are looking for a right shift by a constant value of 1 and we are
+ // operating on types at least 16 bits in length (sign/zero extended OpA and
+ // OpB, which are at least 8 bits), it follows that the truncate will always
+ // discard the shifted-in bit and therefore the right shift will be logical
+ // regardless of the signedness of OpA and OpB.
+ SDValue Shift = Op.getOperand(0);
+ if (Shift.getOpcode() != AArch64ISD::VLSHR)
+ return Op;
+
+ // Is the right shift using an immediate value of 1?
+ uint64_t ShiftAmount = Shift.getConstantOperandVal(1);
+ if (ShiftAmount != 1)
+ return Op;
+
+ SDValue Sub = Shift->getOperand(0);
+ if (Sub.getOpcode() != ISD::SUB)
+ return Op;
+
+ SDValue Xor = Sub.getOperand(1);
+ if (Xor.getOpcode() != ISD::XOR)
+ return Op;
+
+ SDValue ExtendOpA = Xor.getOperand(0);
+ SDValue ExtendOpB = Sub.getOperand(0);
+ unsigned ExtendOpAOpc = ExtendOpA.getOpcode();
+ unsigned ExtendOpBOpc = ExtendOpB.getOpcode();
+ if (!(ExtendOpAOpc == ExtendOpBOpc &&
+ (ExtendOpAOpc == ISD::ZERO_EXTEND || ExtendOpAOpc == ISD::SIGN_EXTEND)))
+ return Op;
+
+ // Is the result of the right shift being truncated to the same value type as
+ // the original operands, OpA and OpB?
+ SDValue OpA = ExtendOpA.getOperand(0);
+ SDValue OpB = ExtendOpB.getOperand(0);
+ EVT OpAVT = OpA.getValueType();
+ assert(ExtendOpA.getValueType() == ExtendOpB.getValueType());
+ if (!(VT == OpAVT && OpAVT == OpB.getValueType()))
+ return Op;
+
+  // Is the XOR using an all-ones constant on its right-hand side?
+ uint64_t C;
+ if (!isAllConstantBuildVector(Xor.getOperand(1), C))
+ return Op;
+
+ unsigned ElemSizeInBits = VT.getScalarSizeInBits();
+ APInt CAsAPInt(ElemSizeInBits, C);
+ if (CAsAPInt != APInt::getAllOnesValue(ElemSizeInBits))
+ return Op;
+
+ SDLoc DL(Op);
+ bool IsSignExtend = ExtendOpAOpc == ISD::SIGN_EXTEND;
+ unsigned RHADDOpc = IsSignExtend ? AArch64ISD::SRHADD : AArch64ISD::URHADD;
+ SDValue ResultURHADD = DAG.getNode(RHADDOpc, DL, VT, OpA, OpB);
+
+ return ResultURHADD;
+}
+
SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
@@ -8264,6 +8936,9 @@ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
llvm_unreachable("unexpected shift opcode");
case ISD::SHL:
+ if (VT.isScalableVector())
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_MERGE_OP1);
+
if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0),
DAG.getConstant(Cnt, DL, MVT::i32));
@@ -8273,6 +8948,12 @@ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
Op.getOperand(0), Op.getOperand(1));
case ISD::SRA:
case ISD::SRL:
+ if (VT.isScalableVector()) {
+ unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_MERGE_OP1
+ : AArch64ISD::SRL_MERGE_OP1;
+ return LowerToPredicatedOp(Op, DAG, Opc);
+ }
+
// Right shift immediate
if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) {
unsigned Opc =
@@ -8395,6 +9076,12 @@ static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
SelectionDAG &DAG) const {
+ if (Op.getValueType().isScalableVector()) {
+ if (Op.getOperand(0).getValueType().isFloatingPoint())
+ return Op;
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO);
+ }
+
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
@@ -8570,7 +9257,8 @@ AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SDNode *Node = Op.getNode();
SDValue Chain = Op.getOperand(0);
SDValue Size = Op.getOperand(1);
- unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+ MaybeAlign Align =
+ cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
EVT VT = Node->getValueType(0);
if (DAG.getMachineFunction().getFunction().hasFnAttribute(
@@ -8580,7 +9268,7 @@ AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
if (Align)
SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
- DAG.getConstant(-(uint64_t)Align, dl, VT));
+ DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
SDValue Ops[2] = {SP, Chain};
return DAG.getMergeValues(Ops, dl);
@@ -8595,7 +9283,7 @@ AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
if (Align)
SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
- DAG.getConstant(-(uint64_t)Align, dl, VT));
+ DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
@@ -8605,6 +9293,41 @@ AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
return DAG.getMergeValues(Ops, dl);
}
+SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
+ SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+ assert(VT != MVT::i64 && "Expected illegal VSCALE node");
+
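+  // Lower via a legal i64 VSCALE, then zero-extend or truncate the result to
+  // the requested type.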
+ SDLoc DL(Op);
+ APInt MulImm = cast<ConstantSDNode>(Op.getOperand(0))->getAPIntValue();
+ return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sextOrSelf(64)),
+ DL, VT);
+}
+
+/// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics.
+template <unsigned NumVecs>
+static bool setInfoSVEStN(AArch64TargetLowering::IntrinsicInfo &Info,
+ const CallInst &CI) {
+ Info.opc = ISD::INTRINSIC_VOID;
+ // Retrieve EC from first vector argument.
+ const EVT VT = EVT::getEVT(CI.getArgOperand(0)->getType());
+ ElementCount EC = VT.getVectorElementCount();
+#ifndef NDEBUG
+ // Check the assumption that all input vectors are the same type.
+ for (unsigned I = 0; I < NumVecs; ++I)
+ assert(VT == EVT::getEVT(CI.getArgOperand(I)->getType()) &&
+ "Invalid type.");
+#endif
+ // memVT is `NumVecs * VT`.
+ Info.memVT = EVT::getVectorVT(CI.getType()->getContext(), VT.getScalarType(),
+ EC * NumVecs);
+ Info.ptrVal = CI.getArgOperand(CI.getNumArgOperands() - 1);
+ Info.offset = 0;
+ Info.align.reset();
+ Info.flags = MachineMemOperand::MOStore;
+ return true;
+}
+
/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
/// specified in the intrinsic calls.
@@ -8614,6 +9337,12 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
unsigned Intrinsic) const {
auto &DL = I.getModule()->getDataLayout();
switch (Intrinsic) {
+ case Intrinsic::aarch64_sve_st2:
+ return setInfoSVEStN<2>(Info, I);
+ case Intrinsic::aarch64_sve_st3:
+ return setInfoSVEStN<3>(Info, I);
+ case Intrinsic::aarch64_sve_st4:
+ return setInfoSVEStN<4>(Info, I);
case Intrinsic::aarch64_neon_ld2:
case Intrinsic::aarch64_neon_ld3:
case Intrinsic::aarch64_neon_ld4:
@@ -8670,7 +9399,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.memVT = MVT::getVT(PtrTy->getElementType());
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
- Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType()));
+ Info.align = DL.getABITypeAlign(PtrTy->getElementType());
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
return true;
}
@@ -8681,7 +9410,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.memVT = MVT::getVT(PtrTy->getElementType());
Info.ptrVal = I.getArgOperand(1);
Info.offset = 0;
- Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType()));
+ Info.align = DL.getABITypeAlign(PtrTy->getElementType());
Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
return true;
}
@@ -8706,21 +9435,25 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
case Intrinsic::aarch64_sve_ldnt1: {
PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
Info.opc = ISD::INTRINSIC_W_CHAIN;
- Info.memVT = MVT::getVT(PtrTy->getElementType());
+ Info.memVT = MVT::getVT(I.getType());
Info.ptrVal = I.getArgOperand(1);
Info.offset = 0;
- Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType()));
- Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MONonTemporal;
+ Info.align = DL.getABITypeAlign(PtrTy->getElementType());
+ Info.flags = MachineMemOperand::MOLoad;
+ if (Intrinsic == Intrinsic::aarch64_sve_ldnt1)
+ Info.flags |= MachineMemOperand::MONonTemporal;
return true;
}
case Intrinsic::aarch64_sve_stnt1: {
PointerType *PtrTy = cast<PointerType>(I.getArgOperand(2)->getType());
Info.opc = ISD::INTRINSIC_W_CHAIN;
- Info.memVT = MVT::getVT(PtrTy->getElementType());
+ Info.memVT = MVT::getVT(I.getOperand(0)->getType());
Info.ptrVal = I.getArgOperand(2);
Info.offset = 0;
- Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType()));
- Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MONonTemporal;
+ Info.align = DL.getABITypeAlign(PtrTy->getElementType());
+ Info.flags = MachineMemOperand::MOStore;
+ if (Intrinsic == Intrinsic::aarch64_sve_stnt1)
+ Info.flags |= MachineMemOperand::MONonTemporal;
return true;
}
default:
@@ -8895,21 +9628,22 @@ bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
/// or upper half of the vector elements.
static bool areExtractShuffleVectors(Value *Op1, Value *Op2) {
auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
- auto *FullVT = cast<VectorType>(FullV->getType());
- auto *HalfVT = cast<VectorType>(HalfV->getType());
- return FullVT->getBitWidth() == 2 * HalfVT->getBitWidth();
+ auto *FullTy = FullV->getType();
+ auto *HalfTy = HalfV->getType();
+ return FullTy->getPrimitiveSizeInBits().getFixedSize() ==
+ 2 * HalfTy->getPrimitiveSizeInBits().getFixedSize();
};
auto extractHalf = [](Value *FullV, Value *HalfV) {
- auto *FullVT = cast<VectorType>(FullV->getType());
- auto *HalfVT = cast<VectorType>(HalfV->getType());
+ auto *FullVT = cast<FixedVectorType>(FullV->getType());
+ auto *HalfVT = cast<FixedVectorType>(HalfV->getType());
return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
};
- Constant *M1, *M2;
+ ArrayRef<int> M1, M2;
Value *S1Op1, *S2Op1;
- if (!match(Op1, m_ShuffleVector(m_Value(S1Op1), m_Undef(), m_Constant(M1))) ||
- !match(Op2, m_ShuffleVector(m_Value(S2Op1), m_Undef(), m_Constant(M2))))
+ if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) ||
+ !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2))))
return false;
// Check that the operands are half as wide as the result and we extract
@@ -8922,7 +9656,7 @@ static bool areExtractShuffleVectors(Value *Op1, Value *Op2) {
// elements.
int M1Start = -1;
int M2Start = -1;
- int NumElements = cast<VectorType>(Op1->getType())->getNumElements() * 2;
+ int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2;
if (!ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start) ||
!ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start) ||
M1Start != M2Start || (M1Start != 0 && M2Start != (NumElements / 2)))
@@ -8948,6 +9682,22 @@ static bool areExtractExts(Value *Ext1, Value *Ext2) {
return true;
}
+/// Check if Op could be used with vmull_high_p64 intrinsic.
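+/// That is, Op must be an extract of lane 1 from a two-element vector, which
+/// for pmull64 means (extractelement <2 x i64> %v, i64 1).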
+static bool isOperandOfVmullHighP64(Value *Op) {
+ Value *VectorOperand = nullptr;
+ ConstantInt *ElementIndex = nullptr;
+ return match(Op, m_ExtractElt(m_Value(VectorOperand),
+ m_ConstantInt(ElementIndex))) &&
+ ElementIndex->getValue() == 1 &&
+ isa<FixedVectorType>(VectorOperand->getType()) &&
+ cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2;
+}
+
+/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
+static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
+ return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2);
+}
+
/// Check if sinking \p I's operands to I's basic block is profitable, because
/// the operands can be folded into a target instruction, e.g.
/// shufflevector extracts and/or sext/zext can be folded into (u,s)subl(2).
@@ -8964,6 +9714,15 @@ bool AArch64TargetLowering::shouldSinkOperands(
Ops.push_back(&II->getOperandUse(0));
Ops.push_back(&II->getOperandUse(1));
return true;
+
+ case Intrinsic::aarch64_neon_pmull64:
+ if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
+ II->getArgOperand(1)))
+ return false;
+ Ops.push_back(&II->getArgOperandUse(0));
+ Ops.push_back(&II->getArgOperandUse(1));
+ return true;
+
default:
return false;
}
@@ -8996,12 +9755,12 @@ bool AArch64TargetLowering::shouldSinkOperands(
}
bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
- unsigned &RequiredAligment) const {
+ Align &RequiredAligment) const {
if (!LoadedType.isSimple() ||
(!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
return false;
// Cyclone supports unaligned accesses.
- RequiredAligment = 0;
+ RequiredAligment = Align(1);
unsigned NumBits = LoadedType.getSizeInBits();
return NumBits == 32 || NumBits == 64;
}
@@ -9015,7 +9774,7 @@ AArch64TargetLowering::getNumInterleavedAccesses(VectorType *VecTy,
}
MachineMemOperand::Flags
-AArch64TargetLowering::getMMOFlags(const Instruction &I) const {
+AArch64TargetLowering::getTargetMMOFlags(const Instruction &I) const {
if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor &&
I.getMetadata(FALKOR_STRIDED_ACCESS_MD) != nullptr)
return MOStridedAccess;
@@ -9029,7 +9788,7 @@ bool AArch64TargetLowering::isLegalInterleavedAccessType(
unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
// Ensure the number of vector elements is greater than 1.
- if (VecTy->getNumElements() < 2)
+ if (cast<FixedVectorType>(VecTy)->getNumElements() < 2)
return false;
// Ensure the element type is legal.
@@ -9063,22 +9822,24 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
const DataLayout &DL = LI->getModule()->getDataLayout();
- VectorType *VecTy = Shuffles[0]->getType();
+ VectorType *VTy = Shuffles[0]->getType();
// Skip if we do not have NEON and skip illegal vector types. We can
// "legalize" wide vector types into multiple interleaved accesses as long as
// the vector types are divisible by 128.
- if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(VecTy, DL))
+ if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(VTy, DL))
return false;
- unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);
+ unsigned NumLoads = getNumInterleavedAccesses(VTy, DL);
+
+ auto *FVTy = cast<FixedVectorType>(VTy);
// A pointer vector can not be the return type of the ldN intrinsics. Need to
// load integer vectors first and then convert to pointer vectors.
- Type *EltTy = VecTy->getVectorElementType();
+ Type *EltTy = FVTy->getElementType();
if (EltTy->isPointerTy())
- VecTy =
- VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements());
+ FVTy =
+ FixedVectorType::get(DL.getIntPtrType(EltTy), FVTy->getNumElements());
IRBuilder<> Builder(LI);
@@ -9088,19 +9849,19 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
if (NumLoads > 1) {
// If we're going to generate more than one load, reset the sub-vector type
// to something legal.
- VecTy = VectorType::get(VecTy->getVectorElementType(),
- VecTy->getVectorNumElements() / NumLoads);
+ FVTy = FixedVectorType::get(FVTy->getElementType(),
+ FVTy->getNumElements() / NumLoads);
// We will compute the pointer operand of each load from the original base
// address using GEPs. Cast the base address to a pointer to the scalar
// element type.
BaseAddr = Builder.CreateBitCast(
- BaseAddr, VecTy->getVectorElementType()->getPointerTo(
- LI->getPointerAddressSpace()));
+ BaseAddr,
+ FVTy->getElementType()->getPointerTo(LI->getPointerAddressSpace()));
}
- Type *PtrTy = VecTy->getPointerTo(LI->getPointerAddressSpace());
- Type *Tys[2] = {VecTy, PtrTy};
+ Type *PtrTy = FVTy->getPointerTo(LI->getPointerAddressSpace());
+ Type *Tys[2] = {FVTy, PtrTy};
static const Intrinsic::ID LoadInts[3] = {Intrinsic::aarch64_neon_ld2,
Intrinsic::aarch64_neon_ld3,
Intrinsic::aarch64_neon_ld4};
@@ -9117,9 +9878,8 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
// If we're generating more than one load, compute the base address of
// subsequent loads as an offset from the previous.
if (LoadCount > 0)
- BaseAddr =
- Builder.CreateConstGEP1_32(VecTy->getVectorElementType(), BaseAddr,
- VecTy->getVectorNumElements() * Factor);
+ BaseAddr = Builder.CreateConstGEP1_32(FVTy->getElementType(), BaseAddr,
+ FVTy->getNumElements() * Factor);
CallInst *LdN = Builder.CreateCall(
LdNFunc, Builder.CreateBitCast(BaseAddr, PtrTy), "ldN");
@@ -9134,8 +9894,8 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
// Convert the integer vector to pointer vector if the element is pointer.
if (EltTy->isPointerTy())
SubVec = Builder.CreateIntToPtr(
- SubVec, VectorType::get(SVI->getType()->getVectorElementType(),
- VecTy->getVectorNumElements()));
+ SubVec, FixedVectorType::get(SVI->getType()->getElementType(),
+ FVTy->getNumElements()));
SubVecs[SVI].push_back(SubVec);
}
}
@@ -9186,13 +9946,12 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");
- VectorType *VecTy = SVI->getType();
- assert(VecTy->getVectorNumElements() % Factor == 0 &&
- "Invalid interleaved store");
+ auto *VecTy = cast<FixedVectorType>(SVI->getType());
+ assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
- unsigned LaneLen = VecTy->getVectorNumElements() / Factor;
- Type *EltTy = VecTy->getVectorElementType();
- VectorType *SubVecTy = VectorType::get(EltTy, LaneLen);
+ unsigned LaneLen = VecTy->getNumElements() / Factor;
+ Type *EltTy = VecTy->getElementType();
+ auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
const DataLayout &DL = SI->getModule()->getDataLayout();
@@ -9212,14 +9971,15 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
// vectors to integer vectors.
if (EltTy->isPointerTy()) {
Type *IntTy = DL.getIntPtrType(EltTy);
- unsigned NumOpElts = Op0->getType()->getVectorNumElements();
+ unsigned NumOpElts =
+ cast<FixedVectorType>(Op0->getType())->getNumElements();
// Convert to the corresponding integer vector.
- Type *IntVecTy = VectorType::get(IntTy, NumOpElts);
+ auto *IntVecTy = FixedVectorType::get(IntTy, NumOpElts);
Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
- SubVecTy = VectorType::get(IntTy, LaneLen);
+ SubVecTy = FixedVectorType::get(IntTy, LaneLen);
}
// The base address of the store.
@@ -9229,14 +9989,14 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
// If we're going to generate more than one store, reset the lane length
// and sub-vector type to something legal.
LaneLen /= NumStores;
- SubVecTy = VectorType::get(SubVecTy->getVectorElementType(), LaneLen);
+ SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
// We will compute the pointer operand of each store from the original base
// address using GEPs. Cast the base address to a pointer to the scalar
// element type.
BaseAddr = Builder.CreateBitCast(
- BaseAddr, SubVecTy->getVectorElementType()->getPointerTo(
- SI->getPointerAddressSpace()));
+ BaseAddr,
+ SubVecTy->getElementType()->getPointerTo(SI->getPointerAddressSpace()));
}
auto Mask = SVI->getShuffleMask();
@@ -9258,7 +10018,7 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
unsigned IdxI = StoreCount * LaneLen * Factor + i;
if (Mask[IdxI] >= 0) {
Ops.push_back(Builder.CreateShuffleVector(
- Op0, Op1, createSequentialMask(Builder, Mask[IdxI], LaneLen, 0)));
+ Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0)));
} else {
unsigned StartMask = 0;
for (unsigned j = 1; j < LaneLen; j++) {
@@ -9274,14 +10034,14 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
// Note: StartMask cannot be negative, it's checked in
// isReInterleaveMask
Ops.push_back(Builder.CreateShuffleVector(
- Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0)));
+ Op0, Op1, createSequentialMask(StartMask, LaneLen, 0)));
}
}
// If we're generating more than one store, we compute the base address of
// subsequent stores as an offset from the previous.
if (StoreCount > 0)
- BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getVectorElementType(),
+ BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
BaseAddr, LaneLen * Factor);
Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy));
@@ -9290,16 +10050,59 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
return true;
}
-static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
- unsigned AlignCheck) {
- return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) &&
- (DstAlign == 0 || DstAlign % AlignCheck == 0));
+// Lower an SVE structured load intrinsic returning a tuple type to a
+// target-specific intrinsic taking the same input but returning a
+// multi-result value of the split tuple type.
+//
+// E.g. Lowering an LD3:
+//
+// call <vscale x 12 x i32> @llvm.aarch64.sve.ld3.nxv12i32(
+// <vscale x 4 x i1> %pred,
+// <vscale x 4 x i32>* %addr)
+//
+// Output DAG:
+//
+// t0: ch = EntryToken
+// t2: nxv4i1,ch = CopyFromReg t0, Register:nxv4i1 %0
+// t4: i64,ch = CopyFromReg t0, Register:i64 %1
+// t5: nxv4i32,nxv4i32,nxv4i32,ch = AArch64ISD::SVE_LD3 t0, t2, t4
+// t6: nxv12i32 = concat_vectors t5, t5:1, t5:2
+//
+// This is called pre-legalization to avoid widening/splitting issues with
+// non-power-of-2 tuple types used for LD3, such as nxv12i32.
+SDValue AArch64TargetLowering::LowerSVEStructLoad(unsigned Intrinsic,
+ ArrayRef<SDValue> LoadOps,
+ EVT VT, SelectionDAG &DAG,
+ const SDLoc &DL) const {
+ assert(VT.isScalableVector() && "Can only lower scalable vectors");
+
+ unsigned N, Opcode;
+ static std::map<unsigned, std::pair<unsigned, unsigned>> IntrinsicMap = {
+ {Intrinsic::aarch64_sve_ld2, {2, AArch64ISD::SVE_LD2_MERGE_ZERO}},
+ {Intrinsic::aarch64_sve_ld3, {3, AArch64ISD::SVE_LD3_MERGE_ZERO}},
+ {Intrinsic::aarch64_sve_ld4, {4, AArch64ISD::SVE_LD4_MERGE_ZERO}}};
+
+ std::tie(N, Opcode) = IntrinsicMap[Intrinsic];
+ assert(VT.getVectorElementCount().Min % N == 0 &&
+ "invalid tuple vector type!");
+
+ EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
+ VT.getVectorElementCount() / N);
+ assert(isTypeLegal(SplitVT));
+
+ SmallVector<EVT, 5> VTs(N, SplitVT);
+ VTs.push_back(MVT::Other); // Chain
+ SDVTList NodeTys = DAG.getVTList(VTs);
+
+ SDValue PseudoLoad = DAG.getNode(Opcode, DL, NodeTys, LoadOps);
+ SmallVector<SDValue, 4> PseudoLoadOps;
+ for (unsigned I = 0; I < N; ++I)
+ PseudoLoadOps.push_back(SDValue(PseudoLoad.getNode(), I));
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, PseudoLoadOps);
}
EVT AArch64TargetLowering::getOptimalMemOpType(
- uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
- bool ZeroMemset, bool MemcpyStrSrc,
- const AttributeList &FuncAttributes) const {
+ const MemOp &Op, const AttributeList &FuncAttributes) const {
bool CanImplicitFloat =
!FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat);
bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
@@ -9307,9 +10110,9 @@ EVT AArch64TargetLowering::getOptimalMemOpType(
// Only use AdvSIMD to implement memset of 32-byte and above. It would have
// taken one instruction to materialize the v2i64 zero and one store (with
// restrictive addressing mode). Just do i64 stores.
- bool IsSmallMemset = IsMemset && Size < 32;
- auto AlignmentIsAcceptable = [&](EVT VT, unsigned AlignCheck) {
- if (memOpAlign(SrcAlign, DstAlign, AlignCheck))
+ bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
+ auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
+ if (Op.isAligned(AlignCheck))
return true;
bool Fast;
return allowsMisalignedMemoryAccesses(VT, 0, 1, MachineMemOperand::MONone,
@@ -9317,22 +10120,20 @@ EVT AArch64TargetLowering::getOptimalMemOpType(
Fast;
};
- if (CanUseNEON && IsMemset && !IsSmallMemset &&
- AlignmentIsAcceptable(MVT::v2i64, 16))
+ if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
+ AlignmentIsAcceptable(MVT::v2i64, Align(16)))
return MVT::v2i64;
- if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, 16))
+ if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
return MVT::f128;
- if (Size >= 8 && AlignmentIsAcceptable(MVT::i64, 8))
+ if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
return MVT::i64;
- if (Size >= 4 && AlignmentIsAcceptable(MVT::i32, 4))
+ if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
return MVT::i32;
return MVT::Other;
}
LLT AArch64TargetLowering::getOptimalMemOpLLT(
- uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
- bool ZeroMemset, bool MemcpyStrSrc,
- const AttributeList &FuncAttributes) const {
+ const MemOp &Op, const AttributeList &FuncAttributes) const {
bool CanImplicitFloat =
!FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat);
bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
@@ -9340,9 +10141,9 @@ LLT AArch64TargetLowering::getOptimalMemOpLLT(
// Only use AdvSIMD to implement memset of 32-byte and above. It would have
// taken one instruction to materialize the v2i64 zero and one store (with
// restrictive addressing mode). Just do i64 stores.
- bool IsSmallMemset = IsMemset && Size < 32;
- auto AlignmentIsAcceptable = [&](EVT VT, unsigned AlignCheck) {
- if (memOpAlign(SrcAlign, DstAlign, AlignCheck))
+ bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
+ auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
+ if (Op.isAligned(AlignCheck))
return true;
bool Fast;
return allowsMisalignedMemoryAccesses(VT, 0, 1, MachineMemOperand::MONone,
@@ -9350,14 +10151,14 @@ LLT AArch64TargetLowering::getOptimalMemOpLLT(
Fast;
};
- if (CanUseNEON && IsMemset && !IsSmallMemset &&
- AlignmentIsAcceptable(MVT::v2i64, 16))
+ if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
+ AlignmentIsAcceptable(MVT::v2i64, Align(16)))
return LLT::vector(2, 64);
- if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, 16))
+ if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
return LLT::scalar(128);
- if (Size >= 8 && AlignmentIsAcceptable(MVT::i64, 8))
+ if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
return LLT::scalar(64);
- if (Size >= 4 && AlignmentIsAcceptable(MVT::i32, 4))
+ if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
return LLT::scalar(32);
return LLT();
}
@@ -9404,6 +10205,10 @@ bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
if (AM.HasBaseReg && AM.BaseOffs && AM.Scale)
return false;
+ // FIXME: Update this method to support scalable addressing modes.
+ if (isa<ScalableVectorType>(Ty))
+ return AM.HasBaseReg && !AM.BaseOffs && !AM.Scale;
+
// check reg + imm case:
// i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
uint64_t NumBytes = 0;
@@ -10110,7 +10915,7 @@ static SDValue tryCombineToBSL(SDNode *N,
}
if (FoundMatch)
- return DAG.getNode(AArch64ISD::BSL, DL, VT, SDValue(BVN0, 0),
+ return DAG.getNode(AArch64ISD::BSP, DL, VT, SDValue(BVN0, 0),
N0->getOperand(1 - i), N1->getOperand(1 - j));
}
@@ -10167,29 +10972,81 @@ static SDValue performSVEAndCombine(SDNode *N,
if (DCI.isBeforeLegalizeOps())
return SDValue();
+ SelectionDAG &DAG = DCI.DAG;
SDValue Src = N->getOperand(0);
+ unsigned Opc = Src->getOpcode();
+
+ // Zero/any extend of an unsigned unpack
+ if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
+ SDValue UnpkOp = Src->getOperand(0);
+ SDValue Dup = N->getOperand(1);
+
+ if (Dup.getOpcode() != AArch64ISD::DUP)
+ return SDValue();
+
+ SDLoc DL(N);
+    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Dup->getOperand(0));
+    if (!C)
+      return SDValue();
+    uint64_t ExtVal = C->getZExtValue();
+
+ // If the mask is fully covered by the unpack, we don't need to push
+ // a new AND onto the operand
+ EVT EltTy = UnpkOp->getValueType(0).getVectorElementType();
+ if ((ExtVal == 0xFF && EltTy == MVT::i8) ||
+ (ExtVal == 0xFFFF && EltTy == MVT::i16) ||
+ (ExtVal == 0xFFFFFFFF && EltTy == MVT::i32))
+ return Src;
+
+    // Truncate to prevent a DUP with an overly wide constant
+ APInt Mask = C->getAPIntValue().trunc(EltTy.getSizeInBits());
+
+ // Otherwise, make sure we propagate the AND to the operand
+ // of the unpack
+ Dup = DAG.getNode(AArch64ISD::DUP, DL,
+ UnpkOp->getValueType(0),
+ DAG.getConstant(Mask.zextOrTrunc(32), DL, MVT::i32));
+
+ SDValue And = DAG.getNode(ISD::AND, DL,
+ UnpkOp->getValueType(0), UnpkOp, Dup);
+
+ return DAG.getNode(Opc, DL, N->getValueType(0), And);
+ }
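+  // For example (illustrative, not part of this patch):
+  //   (and (uunpklo X:nxv16i8), (dup 0xFF)) is folded to (uunpklo X), while
+  //   (and (uunpklo X:nxv16i8), (dup 0x0F)) becomes (uunpklo (and X, dup 0x0F)).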
+
SDValue Mask = N->getOperand(1);
if (!Src.hasOneUse())
return SDValue();
- // GLD1* instructions perform an implicit zero-extend, which makes them
+ EVT MemVT;
+
+ // SVE load instructions perform an implicit zero-extend, which makes them
// perfect candidates for combining.
- switch (Src->getOpcode()) {
- case AArch64ISD::GLD1:
- case AArch64ISD::GLD1_SCALED:
- case AArch64ISD::GLD1_SXTW:
- case AArch64ISD::GLD1_SXTW_SCALED:
- case AArch64ISD::GLD1_UXTW:
- case AArch64ISD::GLD1_UXTW_SCALED:
- case AArch64ISD::GLD1_IMM:
+ switch (Opc) {
+ case AArch64ISD::LD1_MERGE_ZERO:
+ case AArch64ISD::LDNF1_MERGE_ZERO:
+ case AArch64ISD::LDFF1_MERGE_ZERO:
+ MemVT = cast<VTSDNode>(Src->getOperand(3))->getVT();
+ break;
+ case AArch64ISD::GLD1_MERGE_ZERO:
+ case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
+ case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
+ case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
+ case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
+ case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
+ case AArch64ISD::GLD1_IMM_MERGE_ZERO:
+ case AArch64ISD::GLDFF1_MERGE_ZERO:
+ case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
+ case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
+ case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
+ case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
+ case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
+ case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
+ case AArch64ISD::GLDNT1_MERGE_ZERO:
+ MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT();
break;
default:
return SDValue();
}
- EVT MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT();
-
if (isConstantSplatVectorMaskForType(Mask.getNode(), MemVT))
return Src;
@@ -10273,6 +11130,7 @@ static SDValue performConcatVectorsCombine(SDNode *N,
SDLoc dl(N);
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
+ unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode();
// Optimize concat_vectors of truncated vectors, where the intermediate
// type is illegal, to avoid said illegality, e.g.,
@@ -10285,9 +11143,8 @@ static SDValue performConcatVectorsCombine(SDNode *N,
// This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
// on both input and result type, so we might generate worse code.
// On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
- if (N->getNumOperands() == 2 &&
- N0->getOpcode() == ISD::TRUNCATE &&
- N1->getOpcode() == ISD::TRUNCATE) {
+ if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
+ N1Opc == ISD::TRUNCATE) {
SDValue N00 = N0->getOperand(0);
SDValue N10 = N1->getOperand(0);
EVT N00VT = N00.getValueType();
@@ -10312,6 +11169,52 @@ static SDValue performConcatVectorsCombine(SDNode *N,
if (DCI.isBeforeLegalizeOps())
return SDValue();
+ // Optimise concat_vectors of two [us]rhadds that use extracted subvectors
+ // from the same original vectors. Combine these into a single [us]rhadd that
+ // operates on the two original vectors. Example:
+ // (v16i8 (concat_vectors (v8i8 (urhadd (extract_subvector (v16i8 OpA, <0>),
+ // extract_subvector (v16i8 OpB,
+ // <0>))),
+ // (v8i8 (urhadd (extract_subvector (v16i8 OpA, <8>),
+ // extract_subvector (v16i8 OpB,
+ // <8>)))))
+ // ->
+ // (v16i8(urhadd(v16i8 OpA, v16i8 OpB)))
+ if (N->getNumOperands() == 2 && N0Opc == N1Opc &&
+ (N0Opc == AArch64ISD::URHADD || N0Opc == AArch64ISD::SRHADD)) {
+ SDValue N00 = N0->getOperand(0);
+ SDValue N01 = N0->getOperand(1);
+ SDValue N10 = N1->getOperand(0);
+ SDValue N11 = N1->getOperand(1);
+
+ EVT N00VT = N00.getValueType();
+ EVT N10VT = N10.getValueType();
+
+ if (N00->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ N01->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ N10->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ N11->getOpcode() == ISD::EXTRACT_SUBVECTOR && N00VT == N10VT) {
+ SDValue N00Source = N00->getOperand(0);
+ SDValue N01Source = N01->getOperand(0);
+ SDValue N10Source = N10->getOperand(0);
+ SDValue N11Source = N11->getOperand(0);
+
+ if (N00Source == N10Source && N01Source == N11Source &&
+ N00Source.getValueType() == VT && N01Source.getValueType() == VT) {
+ assert(N0.getValueType() == N1.getValueType());
+
+ uint64_t N00Index = N00.getConstantOperandVal(1);
+ uint64_t N01Index = N01.getConstantOperandVal(1);
+ uint64_t N10Index = N10.getConstantOperandVal(1);
+ uint64_t N11Index = N11.getConstantOperandVal(1);
+
+ if (N00Index == N01Index && N10Index == N11Index && N00Index == 0 &&
+ N10Index == N00VT.getVectorNumElements())
+ return DAG.getNode(N0Opc, dl, VT, N00Source, N01Source);
+ }
+ }
+ }
+
// If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
// splat. The indexed instructions are going to be expecting a DUPLANE64, so
// canonicalise to that.
@@ -10330,7 +11233,7 @@ static SDValue performConcatVectorsCombine(SDNode *N,
// becomes
// (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
- if (N1->getOpcode() != ISD::BITCAST)
+ if (N1Opc != ISD::BITCAST)
return SDValue();
SDValue RHS = N1->getOperand(0);
MVT RHSTy = RHS.getValueType().getSimpleVT();
@@ -10794,6 +11697,35 @@ static SDValue LowerSVEIntReduction(SDNode *N, unsigned Opc,
return SDValue();
}
+static SDValue LowerSVEIntrinsicIndex(SDNode *N, SelectionDAG &DAG) {
+ SDLoc DL(N);
+ SDValue Op1 = N->getOperand(1);
+ SDValue Op2 = N->getOperand(2);
+ EVT ScalarTy = Op1.getValueType();
+
+ if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16)) {
+ Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
+ Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
+ }
+
+ return DAG.getNode(AArch64ISD::INDEX_VECTOR, DL, N->getValueType(0),
+ Op1, Op2);
+}
+
+static SDValue LowerSVEIntrinsicDUP(SDNode *N, SelectionDAG &DAG) {
+ SDLoc dl(N);
+ SDValue Scalar = N->getOperand(3);
+ EVT ScalarTy = Scalar.getValueType();
+
+ if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
+ Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
+
+ SDValue Passthru = N->getOperand(1);
+ SDValue Pred = N->getOperand(2);
+ return DAG.getNode(AArch64ISD::DUP_MERGE_PASSTHRU, dl, N->getValueType(0),
+ Pred, Scalar, Passthru);
+}
+
static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG) {
SDLoc dl(N);
LLVMContext &Ctx = *DAG.getContext();
@@ -10819,8 +11751,7 @@ static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG) {
return DAG.getNode(ISD::BITCAST, dl, VT, EXT);
}
-static SDValue tryConvertSVEWideCompare(SDNode *N, unsigned ReplacementIID,
- bool Invert,
+static SDValue tryConvertSVEWideCompare(SDNode *N, ISD::CondCode CC,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
if (DCI.isBeforeLegalize())
@@ -10873,18 +11804,12 @@ static SDValue tryConvertSVEWideCompare(SDNode *N, unsigned ReplacementIID,
}
}
+ if (!Imm)
+ return SDValue();
+
SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, DL, CmpVT, Imm);
- SDValue ID = DAG.getTargetConstant(ReplacementIID, DL, MVT::i64);
- SDValue Op0, Op1;
- if (Invert) {
- Op0 = Splat;
- Op1 = N->getOperand(2);
- } else {
- Op0 = N->getOperand(2);
- Op1 = Splat;
- }
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
- ID, Pred, Op0, Op1);
+ return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, VT, Pred,
+ N->getOperand(2), Splat, DAG.getCondCode(CC));
}
return SDValue();
@@ -10914,6 +11839,46 @@ static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
return DAG.getZExtOrTrunc(Res, DL, VT);
}
+static SDValue combineSVEReductionFP(SDNode *N, unsigned Opc,
+ SelectionDAG &DAG) {
+ SDLoc DL(N);
+
+ SDValue Pred = N->getOperand(1);
+ SDValue VecToReduce = N->getOperand(2);
+
+ EVT ReduceVT = VecToReduce.getValueType();
+ SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
+
+  // SVE reductions write the whole vector register, with the reduction result
+  // in the first element, which we'll now extract.
+ SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
+ Zero);
+}
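+// E.g. (illustrative, not part of this patch): @llvm.aarch64.sve.faddv(pg, z)
+// becomes (extract_vector_elt (FADDV_PRED pg, z), 0).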
+
+static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc,
+ SelectionDAG &DAG) {
+ SDLoc DL(N);
+
+ SDValue Pred = N->getOperand(1);
+ SDValue InitVal = N->getOperand(2);
+ SDValue VecToReduce = N->getOperand(3);
+ EVT ReduceVT = VecToReduce.getValueType();
+
+  // Ordered reductions take the reduction's initial value in the first lane
+  // of a vector operand, so insert InitVal there.
+ SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
+ InitVal = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ReduceVT,
+ DAG.getUNDEF(ReduceVT), InitVal, Zero);
+
+ SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, InitVal, VecToReduce);
+
+  // SVE reductions write the whole vector register, with the reduction result
+  // in the first element, which we'll now extract.
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
+ Zero);
+}
+
static SDValue performIntrinsicCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *Subtarget) {
@@ -10982,38 +11947,107 @@ static SDValue performIntrinsicCombine(SDNode *N,
return LowerSVEIntReduction(N, AArch64ISD::EORV_PRED, DAG);
case Intrinsic::aarch64_sve_andv:
return LowerSVEIntReduction(N, AArch64ISD::ANDV_PRED, DAG);
+ case Intrinsic::aarch64_sve_index:
+ return LowerSVEIntrinsicIndex(N, DAG);
+ case Intrinsic::aarch64_sve_dup:
+ return LowerSVEIntrinsicDUP(N, DAG);
+ case Intrinsic::aarch64_sve_dup_x:
+ return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), N->getValueType(0),
+ N->getOperand(1));
case Intrinsic::aarch64_sve_ext:
return LowerSVEIntrinsicEXT(N, DAG);
+ case Intrinsic::aarch64_sve_smin:
+ return DAG.getNode(AArch64ISD::SMIN_MERGE_OP1, SDLoc(N), N->getValueType(0),
+ N->getOperand(1), N->getOperand(2), N->getOperand(3));
+ case Intrinsic::aarch64_sve_umin:
+ return DAG.getNode(AArch64ISD::UMIN_MERGE_OP1, SDLoc(N), N->getValueType(0),
+ N->getOperand(1), N->getOperand(2), N->getOperand(3));
+ case Intrinsic::aarch64_sve_smax:
+ return DAG.getNode(AArch64ISD::SMAX_MERGE_OP1, SDLoc(N), N->getValueType(0),
+ N->getOperand(1), N->getOperand(2), N->getOperand(3));
+ case Intrinsic::aarch64_sve_umax:
+ return DAG.getNode(AArch64ISD::UMAX_MERGE_OP1, SDLoc(N), N->getValueType(0),
+ N->getOperand(1), N->getOperand(2), N->getOperand(3));
+ case Intrinsic::aarch64_sve_lsl:
+ return DAG.getNode(AArch64ISD::SHL_MERGE_OP1, SDLoc(N), N->getValueType(0),
+ N->getOperand(1), N->getOperand(2), N->getOperand(3));
+ case Intrinsic::aarch64_sve_lsr:
+ return DAG.getNode(AArch64ISD::SRL_MERGE_OP1, SDLoc(N), N->getValueType(0),
+ N->getOperand(1), N->getOperand(2), N->getOperand(3));
+ case Intrinsic::aarch64_sve_asr:
+ return DAG.getNode(AArch64ISD::SRA_MERGE_OP1, SDLoc(N), N->getValueType(0),
+ N->getOperand(1), N->getOperand(2), N->getOperand(3));
+ case Intrinsic::aarch64_sve_cmphs:
+ if (!N->getOperand(2).getValueType().isFloatingPoint())
+ return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
+ N->getValueType(0), N->getOperand(1), N->getOperand(2),
+ N->getOperand(3), DAG.getCondCode(ISD::SETUGE));
+ break;
+ case Intrinsic::aarch64_sve_cmphi:
+ if (!N->getOperand(2).getValueType().isFloatingPoint())
+ return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
+ N->getValueType(0), N->getOperand(1), N->getOperand(2),
+ N->getOperand(3), DAG.getCondCode(ISD::SETUGT));
+ break;
+ case Intrinsic::aarch64_sve_cmpge:
+ if (!N->getOperand(2).getValueType().isFloatingPoint())
+ return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
+ N->getValueType(0), N->getOperand(1), N->getOperand(2),
+ N->getOperand(3), DAG.getCondCode(ISD::SETGE));
+ break;
+ case Intrinsic::aarch64_sve_cmpgt:
+ if (!N->getOperand(2).getValueType().isFloatingPoint())
+ return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
+ N->getValueType(0), N->getOperand(1), N->getOperand(2),
+ N->getOperand(3), DAG.getCondCode(ISD::SETGT));
+ break;
+ case Intrinsic::aarch64_sve_cmpeq:
+ if (!N->getOperand(2).getValueType().isFloatingPoint())
+ return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
+ N->getValueType(0), N->getOperand(1), N->getOperand(2),
+ N->getOperand(3), DAG.getCondCode(ISD::SETEQ));
+ break;
+ case Intrinsic::aarch64_sve_cmpne:
+ if (!N->getOperand(2).getValueType().isFloatingPoint())
+ return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
+ N->getValueType(0), N->getOperand(1), N->getOperand(2),
+ N->getOperand(3), DAG.getCondCode(ISD::SETNE));
+ break;
+ case Intrinsic::aarch64_sve_fadda:
+ return combineSVEReductionOrderedFP(N, AArch64ISD::FADDA_PRED, DAG);
+ case Intrinsic::aarch64_sve_faddv:
+ return combineSVEReductionFP(N, AArch64ISD::FADDV_PRED, DAG);
+ case Intrinsic::aarch64_sve_fmaxnmv:
+ return combineSVEReductionFP(N, AArch64ISD::FMAXNMV_PRED, DAG);
+ case Intrinsic::aarch64_sve_fmaxv:
+ return combineSVEReductionFP(N, AArch64ISD::FMAXV_PRED, DAG);
+ case Intrinsic::aarch64_sve_fminnmv:
+ return combineSVEReductionFP(N, AArch64ISD::FMINNMV_PRED, DAG);
+ case Intrinsic::aarch64_sve_fminv:
+ return combineSVEReductionFP(N, AArch64ISD::FMINV_PRED, DAG);
+ case Intrinsic::aarch64_sve_sel:
+ return DAG.getNode(ISD::VSELECT, SDLoc(N), N->getValueType(0),
+ N->getOperand(1), N->getOperand(2), N->getOperand(3));
case Intrinsic::aarch64_sve_cmpeq_wide:
- return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmpeq,
- false, DCI, DAG);
+ return tryConvertSVEWideCompare(N, ISD::SETEQ, DCI, DAG);
case Intrinsic::aarch64_sve_cmpne_wide:
- return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmpne,
- false, DCI, DAG);
+ return tryConvertSVEWideCompare(N, ISD::SETNE, DCI, DAG);
case Intrinsic::aarch64_sve_cmpge_wide:
- return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmpge,
- false, DCI, DAG);
+ return tryConvertSVEWideCompare(N, ISD::SETGE, DCI, DAG);
case Intrinsic::aarch64_sve_cmpgt_wide:
- return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmpgt,
- false, DCI, DAG);
+ return tryConvertSVEWideCompare(N, ISD::SETGT, DCI, DAG);
case Intrinsic::aarch64_sve_cmplt_wide:
- return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmpgt,
- true, DCI, DAG);
+ return tryConvertSVEWideCompare(N, ISD::SETLT, DCI, DAG);
case Intrinsic::aarch64_sve_cmple_wide:
- return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmpge,
- true, DCI, DAG);
+ return tryConvertSVEWideCompare(N, ISD::SETLE, DCI, DAG);
case Intrinsic::aarch64_sve_cmphs_wide:
- return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmphs,
- false, DCI, DAG);
+ return tryConvertSVEWideCompare(N, ISD::SETUGE, DCI, DAG);
case Intrinsic::aarch64_sve_cmphi_wide:
- return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmphi,
- false, DCI, DAG);
+ return tryConvertSVEWideCompare(N, ISD::SETUGT, DCI, DAG);
case Intrinsic::aarch64_sve_cmplo_wide:
- return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmphi, true,
- DCI, DAG);
+ return tryConvertSVEWideCompare(N, ISD::SETULT, DCI, DAG);
case Intrinsic::aarch64_sve_cmpls_wide:
- return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmphs, true,
- DCI, DAG);
+ return tryConvertSVEWideCompare(N, ISD::SETULE, DCI, DAG);
case Intrinsic::aarch64_sve_ptest_any:
return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
AArch64CC::ANY_ACTIVE);
@@ -11091,14 +12125,14 @@ static SDValue performExtendCombine(SDNode *N,
if (!ResVT.isSimple() || !SrcVT.isSimple())
return SDValue();
- // If the source VT is a 64-bit vector, we can play games and get the
- // better results we want.
- if (SrcVT.getSizeInBits() != 64)
+ // If the source VT is a 64-bit fixed or scalable vector, we can play games
+ // and get the better results we want.
+ if (SrcVT.getSizeInBits().getKnownMinSize() != 64)
return SDValue();
unsigned SrcEltSize = SrcVT.getScalarSizeInBits();
- unsigned ElementCount = SrcVT.getVectorNumElements();
- SrcVT = MVT::getVectorVT(MVT::getIntegerVT(SrcEltSize * 2), ElementCount);
+ ElementCount SrcEC = SrcVT.getVectorElementCount();
+ SrcVT = MVT::getVectorVT(MVT::getIntegerVT(SrcEltSize * 2), SrcEC);
SDLoc DL(N);
Src = DAG.getNode(N->getOpcode(), DL, SrcVT, Src);
@@ -11106,17 +12140,14 @@ static SDValue performExtendCombine(SDNode *N,
// bit source.
EVT LoVT, HiVT;
SDValue Lo, Hi;
- unsigned NumElements = ResVT.getVectorNumElements();
- assert(!(NumElements & 1) && "Splitting vector, but not in half!");
- LoVT = HiVT = EVT::getVectorVT(*DAG.getContext(),
- ResVT.getVectorElementType(), NumElements / 2);
+ LoVT = HiVT = ResVT.getHalfNumVectorElementsVT(*DAG.getContext());
EVT InNVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(),
- LoVT.getVectorNumElements());
+ LoVT.getVectorElementCount());
Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
DAG.getConstant(0, DL, MVT::i64));
Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
- DAG.getConstant(InNVT.getVectorNumElements(), DL, MVT::i64));
+ DAG.getConstant(InNVT.getVectorMinNumElements(), DL, MVT::i64));
Lo = DAG.getNode(N->getOpcode(), DL, LoVT, Lo);
Hi = DAG.getNode(N->getOpcode(), DL, HiVT, Hi);
@@ -11165,11 +12196,71 @@ static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
return NewST1;
}
+// Returns an SVE type that ContentTy can be trivially sign or zero extended
+// into.
+static MVT getSVEContainerType(EVT ContentTy) {
+ assert(ContentTy.isSimple() && "No SVE containers for extended types");
+
+ switch (ContentTy.getSimpleVT().SimpleTy) {
+ default:
+ llvm_unreachable("No known SVE container for this MVT type");
+ case MVT::nxv2i8:
+ case MVT::nxv2i16:
+ case MVT::nxv2i32:
+ case MVT::nxv2i64:
+ case MVT::nxv2f32:
+ case MVT::nxv2f64:
+ return MVT::nxv2i64;
+ case MVT::nxv4i8:
+ case MVT::nxv4i16:
+ case MVT::nxv4i32:
+ case MVT::nxv4f32:
+ return MVT::nxv4i32;
+ case MVT::nxv8i8:
+ case MVT::nxv8i16:
+ case MVT::nxv8f16:
+ case MVT::nxv8bf16:
+ return MVT::nxv8i16;
+ case MVT::nxv16i8:
+ return MVT::nxv16i8;
+ }
+}
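+// For example (illustrative note, not part of this patch), nxv2i16 maps to
+// nxv2i64: each i16 element already sits in its own 64-bit container lane, so
+// it can be sign or zero extended in place.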
+
+static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc) {
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+
+ if (VT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock)
+ return SDValue();
+
+ EVT ContainerVT = VT;
+ if (ContainerVT.isInteger())
+ ContainerVT = getSVEContainerType(ContainerVT);
+
+ SDVTList VTs = DAG.getVTList(ContainerVT, MVT::Other);
+ SDValue Ops[] = { N->getOperand(0), // Chain
+ N->getOperand(2), // Pg
+ N->getOperand(3), // Base
+ DAG.getValueType(VT) };
+
+ SDValue Load = DAG.getNode(Opc, DL, VTs, Ops);
+ SDValue LoadChain = SDValue(Load.getNode(), 1);
+
+ if (ContainerVT.isInteger() && (VT != ContainerVT))
+ Load = DAG.getNode(ISD::TRUNCATE, DL, VT, Load.getValue(0));
+
+ return DAG.getMergeValues({ Load, LoadChain }, DL);
+}
+
static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) {
SDLoc DL(N);
EVT VT = N->getValueType(0);
EVT PtrTy = N->getOperand(3).getValueType();
+ if (VT == MVT::nxv8bf16 &&
+ !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
+ return SDValue();
+
EVT LoadVT = VT;
if (VT.isFloatingPoint())
LoadVT = VT.changeTypeToInteger();
@@ -11190,6 +12281,58 @@ static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) {
return L;
}
+template <unsigned Opcode>
+static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG) {
+ static_assert(Opcode == AArch64ISD::LD1RQ_MERGE_ZERO ||
+ Opcode == AArch64ISD::LD1RO_MERGE_ZERO,
+ "Unsupported opcode.");
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+
+ EVT LoadVT = VT;
+ if (VT.isFloatingPoint())
+ LoadVT = VT.changeTypeToInteger();
+
+ SDValue Ops[] = {N->getOperand(0), N->getOperand(2), N->getOperand(3)};
+ SDValue Load = DAG.getNode(Opcode, DL, {LoadVT, MVT::Other}, Ops);
+ SDValue LoadChain = SDValue(Load.getNode(), 1);
+
+ if (VT.isFloatingPoint())
+ Load = DAG.getNode(ISD::BITCAST, DL, VT, Load.getValue(0));
+
+ return DAG.getMergeValues({Load, LoadChain}, DL);
+}
+
+static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG) {
+ SDLoc DL(N);
+ SDValue Data = N->getOperand(2);
+ EVT DataVT = Data.getValueType();
+ EVT HwSrcVt = getSVEContainerType(DataVT);
+ SDValue InputVT = DAG.getValueType(DataVT);
+
+ if (DataVT == MVT::nxv8bf16 &&
+ !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
+ return SDValue();
+
+ if (DataVT.isFloatingPoint())
+ InputVT = DAG.getValueType(HwSrcVt);
+
+ SDValue SrcNew;
+ if (Data.getValueType().isFloatingPoint())
+ SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Data);
+ else
+ SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Data);
+
+ SDValue Ops[] = { N->getOperand(0), // Chain
+ SrcNew,
+ N->getOperand(4), // Base
+ N->getOperand(3), // Pg
+ InputVT
+ };
+
+ return DAG.getNode(AArch64ISD::ST1_PRED, DL, N->getValueType(0), Ops);
+}
+
static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) {
SDLoc DL(N);
@@ -11197,6 +12340,10 @@ static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) {
EVT DataVT = Data.getValueType();
EVT PtrTy = N->getOperand(4).getValueType();
+ if (DataVT == MVT::nxv8bf16 &&
+ !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
+ return SDValue();
+
if (DataVT.isFloatingPoint())
Data = DAG.getNode(ISD::BITCAST, DL, DataVT.changeTypeToInteger(), Data);
@@ -11226,6 +12373,10 @@ static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
SDValue StVal = St.getValue();
EVT VT = StVal.getValueType();
+ // Avoid scalarizing zero splat stores for scalable vectors.
+ if (VT.isScalableVector())
+ return SDValue();
+
// It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or
// 2, 3 or 4 i32 elements.
int NumVecElts = VT.getVectorNumElements();
@@ -11348,7 +12499,8 @@ static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
SDValue StVal = S->getValue();
EVT VT = StVal.getValueType();
- if (!VT.isVector())
+
+ if (!VT.isFixedLengthVector())
return SDValue();
// If we get a splat of zeros, convert this vector store to a store of
@@ -11419,6 +12571,9 @@ static SDValue performPostLD1Combine(SDNode *N,
SelectionDAG &DAG = DCI.DAG;
EVT VT = N->getValueType(0);
+ if (VT.isScalableVector())
+ return SDValue();
+
unsigned LoadIdx = IsLaneOp ? 1 : 0;
SDNode *LD = N->getOperand(LoadIdx).getNode();
// If it is not LOAD, can not do such combine.
@@ -12258,32 +13413,57 @@ static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG,
DAG.getConstant(MinOffset, DL, MVT::i64));
}
-// Returns an SVE type that ContentTy can be trivially sign or zero extended
-// into.
-static MVT getSVEContainerType(EVT ContentTy) {
- assert(ContentTy.isSimple() && "No SVE containers for extended types");
+// Turns the vector of indices into a vector of byte offsets by scaling Offset
+// by (BitWidth / 8).
+static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset,
+ SDLoc DL, unsigned BitWidth) {
+ assert(Offset.getValueType().isScalableVector() &&
+ "This method is only for scalable vectors of offsets");
- switch (ContentTy.getSimpleVT().SimpleTy) {
- default:
- llvm_unreachable("No known SVE container for this MVT type");
- case MVT::nxv2i8:
- case MVT::nxv2i16:
- case MVT::nxv2i32:
- case MVT::nxv2i64:
- case MVT::nxv2f32:
- case MVT::nxv2f64:
- return MVT::nxv2i64;
- case MVT::nxv4i8:
- case MVT::nxv4i16:
- case MVT::nxv4i32:
- case MVT::nxv4f32:
- return MVT::nxv4i32;
- }
+ SDValue Shift = DAG.getConstant(Log2_32(BitWidth / 8), DL, MVT::i64);
+ SDValue SplatShift = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Shift);
+
+ return DAG.getNode(ISD::SHL, DL, MVT::nxv2i64, Offset, SplatShift);
}
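// E.g. (illustrative note, not part of this patch): for BitWidth == 32 each
// index i is shifted left by 2, i.e. becomes the byte offset 4 * i.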
-static SDValue performST1ScatterCombine(SDNode *N, SelectionDAG &DAG,
- unsigned Opcode,
- bool OnlyPackedOffsets = true) {
+/// Check if the value of \p OffsetInBytes can be used as an immediate for
+/// the gather load/prefetch and scatter store instructions with vector base and
+/// immediate offset addressing mode:
+///
+/// [<Zn>.[S|D]{, #<imm>}]
+///
+/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
+inline static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes,
+ unsigned ScalarSizeInBytes) {
+ // The immediate is not a multiple of the scalar size.
+ if (OffsetInBytes % ScalarSizeInBytes)
+ return false;
+
+ // The immediate is out of range.
+ if (OffsetInBytes / ScalarSizeInBytes > 31)
+ return false;
+
+ return true;
+}
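+// Worked examples (illustrative, not part of this patch) for 4-byte scalars:
+//   OffsetInBytes = 124: 124 % 4 == 0 and 124 / 4 == 31 -> valid
+//   OffsetInBytes = 126: 126 % 4 != 0                   -> invalid
+//   OffsetInBytes = 128: 128 / 4 == 32 > 31             -> invalid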
+
+/// Check if the value of \p Offset represents a valid immediate for the SVE
+/// gather load/prefetch and scatter store instructions with vector base and
+/// immediate offset addressing mode:
+///
+/// [<Zn>.[S|D]{, #<imm>}]
+///
+/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
+static bool isValidImmForSVEVecImmAddrMode(SDValue Offset,
+ unsigned ScalarSizeInBytes) {
+ ConstantSDNode *OffsetConst = dyn_cast<ConstantSDNode>(Offset.getNode());
+ return OffsetConst && isValidImmForSVEVecImmAddrMode(
+ OffsetConst->getZExtValue(), ScalarSizeInBytes);
+}
+
+static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG,
+ unsigned Opcode,
+ bool OnlyPackedOffsets = true) {
const SDValue Src = N->getOperand(2);
const EVT SrcVT = Src->getValueType(0);
assert(SrcVT.isScalableVector() &&
@@ -12303,11 +13483,46 @@ static SDValue performST1ScatterCombine(SDNode *N, SelectionDAG &DAG,
// Depending on the addressing mode, this is either a pointer or a vector of
// pointers (that fits into one register)
- const SDValue Base = N->getOperand(4);
+ SDValue Base = N->getOperand(4);
// Depending on the addressing mode, this is either a single offset or a
// vector of offsets (that fits into one register)
SDValue Offset = N->getOperand(5);
+ // For "scalar + vector of indices", just scale the indices. This only
+ // applies to non-temporal scatters because there's no instruction that takes
+  // indices.
+ if (Opcode == AArch64ISD::SSTNT1_INDEX_PRED) {
+ Offset =
+ getScaledOffsetForBitWidth(DAG, Offset, DL, SrcElVT.getSizeInBits());
+ Opcode = AArch64ISD::SSTNT1_PRED;
+ }
+
+  // In the case of non-temporal scatter stores there's only one SVE
+  // instruction per data-size: "scalar + vector", i.e.
+  // * stnt1{b|h|w|d} { z0.s }, p0, [z0.s, x0]
+  // Since the intrinsics allow the arguments in a different order, we may need
+  // to swap them here to match the spec.
+ if (Opcode == AArch64ISD::SSTNT1_PRED && Offset.getValueType().isVector())
+ std::swap(Base, Offset);
+
+ // SST1_IMM requires that the offset is an immediate that is:
+ // * a multiple of #SizeInBytes,
+ // * in the range [0, 31 x #SizeInBytes],
+ // where #SizeInBytes is the size in bytes of the stored items. For
+ // immediates outside that range and non-immediate scalar offsets use SST1 or
+ // SST1_UXTW instead.
+ if (Opcode == AArch64ISD::SST1_IMM_PRED) {
+ if (!isValidImmForSVEVecImmAddrMode(Offset,
+ SrcVT.getScalarSizeInBits() / 8)) {
+ if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
+ Opcode = AArch64ISD::SST1_UXTW_PRED;
+ else
+ Opcode = AArch64ISD::SST1_PRED;
+
+ std::swap(Base, Offset);
+ }
+ }
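+  // E.g. (illustrative, not part of this patch): an i32 scatter with vector
+  // base Zbase and immediate 128 exceeds 31 * 4 bytes, so it is rewritten as
+  // SST1_UXTW_PRED (or SST1_PRED for 64-bit bases) with the operands swapped:
+  // Base = 128, Offset = Zbase.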
+
auto &TLI = DAG.getTargetLoweringInfo();
if (!TLI.isTypeLegal(Base.getValueType()))
return SDValue();
@@ -12325,9 +13540,9 @@ static SDValue performST1ScatterCombine(SDNode *N, SelectionDAG &DAG,
// Source value type that is representable in hardware
EVT HwSrcVt = getSVEContainerType(SrcVT);
- // Keep the original type of the input data to store - this is needed to
- // differentiate between ST1B, ST1H, ST1W and ST1D. For FP values we want the
- // integer equivalent, so just use HwSrcVt.
+  // Keep the original type of the input data to store - this is needed to
+  // select the correct instruction, e.g. ST1B, ST1H, ST1W or ST1D. For FP
+  // values we want the integer equivalent, so just use HwSrcVt.
SDValue InputVT = DAG.getValueType(SrcVT);
if (SrcVT.isFloatingPoint())
InputVT = DAG.getValueType(HwSrcVt);
@@ -12350,24 +13565,67 @@ static SDValue performST1ScatterCombine(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(Opcode, DL, VTs, Ops);
}
-static SDValue performLD1GatherCombine(SDNode *N, SelectionDAG &DAG,
- unsigned Opcode,
- bool OnlyPackedOffsets = true) {
- EVT RetVT = N->getValueType(0);
+static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG,
+ unsigned Opcode,
+ bool OnlyPackedOffsets = true) {
+ const EVT RetVT = N->getValueType(0);
assert(RetVT.isScalableVector() &&
"Gather loads are only possible for SVE vectors");
+
SDLoc DL(N);
+ // Make sure that the loaded data will fit into an SVE register
if (RetVT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock)
return SDValue();
// Depending on the addressing mode, this is either a pointer or a vector of
// pointers (that fits into one register)
- const SDValue Base = N->getOperand(3);
+ SDValue Base = N->getOperand(3);
// Depending on the addressing mode, this is either a single offset or a
// vector of offsets (that fits into one register)
SDValue Offset = N->getOperand(4);
+ // For "scalar + vector of indices", just scale the indices. This only
+ // applies to non-temporal gathers because there's no instruction that takes
+  // indices.
+ if (Opcode == AArch64ISD::GLDNT1_INDEX_MERGE_ZERO) {
+ Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
+ RetVT.getScalarSizeInBits());
+ Opcode = AArch64ISD::GLDNT1_MERGE_ZERO;
+ }
+
+ // In the case of non-temporal gather loads there's only one SVE instruction
+ // per data-size: "scalar + vector", i.e.
+ // * ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
+  // Since the intrinsics allow the arguments in a different order, we may need
+  // to swap them here to match the spec.
+ if (Opcode == AArch64ISD::GLDNT1_MERGE_ZERO &&
+ Offset.getValueType().isVector())
+ std::swap(Base, Offset);
+
+ // GLD{FF}1_IMM requires that the offset is an immediate that is:
+ // * a multiple of #SizeInBytes,
+ // * in the range [0, 31 x #SizeInBytes],
+ // where #SizeInBytes is the size in bytes of the loaded items. For
+ // immediates outside that range and non-immediate scalar offsets use
+ // GLD1_MERGE_ZERO or GLD1_UXTW_MERGE_ZERO instead.
+ if (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO ||
+ Opcode == AArch64ISD::GLDFF1_IMM_MERGE_ZERO) {
+ if (!isValidImmForSVEVecImmAddrMode(Offset,
+ RetVT.getScalarSizeInBits() / 8)) {
+ if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
+ Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
+ ? AArch64ISD::GLD1_UXTW_MERGE_ZERO
+ : AArch64ISD::GLDFF1_UXTW_MERGE_ZERO;
+ else
+ Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
+ ? AArch64ISD::GLD1_MERGE_ZERO
+ : AArch64ISD::GLDFF1_MERGE_ZERO;
+
+ std::swap(Base, Offset);
+ }
+ }
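+  // The same operand swap applies here as in the scatter case, e.g. an
+  // out-of-range immediate gather becomes GLD1_UXTW_MERGE_ZERO with Base and
+  // Offset exchanged. (Illustrative note, not part of this patch.)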
+
auto &TLI = DAG.getTargetLoweringInfo();
if (!TLI.isTypeLegal(Base.getValueType()))
return SDValue();
@@ -12382,10 +13640,9 @@ static SDValue performLD1GatherCombine(SDNode *N, SelectionDAG &DAG,
// Return value type that is representable in hardware
EVT HwRetVt = getSVEContainerType(RetVT);
- // Keep the original output value type around - this will better inform
- // optimisations (e.g. instruction folding when load is followed by
- // zext/sext). This will only be used for ints, so the value for FPs
- // doesn't matter.
+  // Keep the original output value type around - this is needed to select the
+  // correct instruction, e.g. LD1B, LD1H, LD1W or LD1D. For FP values we want
+  // the integer equivalent, so just use HwRetVt.
SDValue OutVT = DAG.getValueType(RetVT);
if (RetVT.isFloatingPoint())
OutVT = DAG.getValueType(HwRetVt);
@@ -12409,55 +13666,126 @@ static SDValue performLD1GatherCombine(SDNode *N, SelectionDAG &DAG,
return DAG.getMergeValues({Load, LoadChain}, DL);
}
-
static SDValue
performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
if (DCI.isBeforeLegalizeOps())
return SDValue();
+ SDLoc DL(N);
SDValue Src = N->getOperand(0);
unsigned Opc = Src->getOpcode();
- // Gather load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates
+ // Sign extend of an unsigned unpack -> signed unpack
+ if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
+
+ unsigned SOpc = Opc == AArch64ISD::UUNPKHI ? AArch64ISD::SUNPKHI
+ : AArch64ISD::SUNPKLO;
+
+ // Push the sign extend to the operand of the unpack
+ // This is necessary where, for example, the operand of the unpack
+ // is another unpack:
+ // 4i32 sign_extend_inreg (4i32 uunpklo(8i16 uunpklo (16i8 opnd)), from 4i8)
+ // ->
+  // 4i32 sunpklo (8i16 sign_extend_inreg(8i16 uunpklo (16i8 opnd), from 8i8))
+ // ->
+ // 4i32 sunpklo(8i16 sunpklo(16i8 opnd))
+ SDValue ExtOp = Src->getOperand(0);
+ auto VT = cast<VTSDNode>(N->getOperand(1))->getVT();
+ EVT EltTy = VT.getVectorElementType();
+ (void)EltTy;
+
+ assert((EltTy == MVT::i8 || EltTy == MVT::i16 || EltTy == MVT::i32) &&
+ "Sign extending from an invalid type");
+
+ EVT ExtVT = EVT::getVectorVT(*DAG.getContext(),
+ VT.getVectorElementType(),
+ VT.getVectorElementCount() * 2);
+
+ SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ExtOp.getValueType(),
+ ExtOp, DAG.getValueType(ExtVT));
+
+ return DAG.getNode(SOpc, DL, N->getValueType(0), Ext);
+ }
+
+ // SVE load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates
// for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes.
unsigned NewOpc;
+ unsigned MemVTOpNum = 4;
switch (Opc) {
- case AArch64ISD::GLD1:
- NewOpc = AArch64ISD::GLD1S;
+ case AArch64ISD::LD1_MERGE_ZERO:
+ NewOpc = AArch64ISD::LD1S_MERGE_ZERO;
+ MemVTOpNum = 3;
+ break;
+ case AArch64ISD::LDNF1_MERGE_ZERO:
+ NewOpc = AArch64ISD::LDNF1S_MERGE_ZERO;
+ MemVTOpNum = 3;
+ break;
+ case AArch64ISD::LDFF1_MERGE_ZERO:
+ NewOpc = AArch64ISD::LDFF1S_MERGE_ZERO;
+ MemVTOpNum = 3;
+ break;
+ case AArch64ISD::GLD1_MERGE_ZERO:
+ NewOpc = AArch64ISD::GLD1S_MERGE_ZERO;
break;
- case AArch64ISD::GLD1_SCALED:
- NewOpc = AArch64ISD::GLD1S_SCALED;
+ case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
+ NewOpc = AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
break;
- case AArch64ISD::GLD1_SXTW:
- NewOpc = AArch64ISD::GLD1S_SXTW;
+ case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
+ NewOpc = AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
break;
- case AArch64ISD::GLD1_SXTW_SCALED:
- NewOpc = AArch64ISD::GLD1S_SXTW_SCALED;
+ case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
+ NewOpc = AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
break;
- case AArch64ISD::GLD1_UXTW:
- NewOpc = AArch64ISD::GLD1S_UXTW;
+ case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
+ NewOpc = AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
break;
- case AArch64ISD::GLD1_UXTW_SCALED:
- NewOpc = AArch64ISD::GLD1S_UXTW_SCALED;
+ case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
+ NewOpc = AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
break;
- case AArch64ISD::GLD1_IMM:
- NewOpc = AArch64ISD::GLD1S_IMM;
+ case AArch64ISD::GLD1_IMM_MERGE_ZERO:
+ NewOpc = AArch64ISD::GLD1S_IMM_MERGE_ZERO;
+ break;
+ case AArch64ISD::GLDFF1_MERGE_ZERO:
+ NewOpc = AArch64ISD::GLDFF1S_MERGE_ZERO;
+ break;
+ case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
+ NewOpc = AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO;
+ break;
+ case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
+ NewOpc = AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO;
+ break;
+ case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
+ NewOpc = AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO;
+ break;
+ case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
+ NewOpc = AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO;
+ break;
+ case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
+ NewOpc = AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO;
+ break;
+ case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
+ NewOpc = AArch64ISD::GLDFF1S_IMM_MERGE_ZERO;
+ break;
+ case AArch64ISD::GLDNT1_MERGE_ZERO:
+ NewOpc = AArch64ISD::GLDNT1S_MERGE_ZERO;
break;
default:
return SDValue();
}
EVT SignExtSrcVT = cast<VTSDNode>(N->getOperand(1))->getVT();
- EVT GLD1SrcMemVT = cast<VTSDNode>(Src->getOperand(4))->getVT();
+ EVT SrcMemVT = cast<VTSDNode>(Src->getOperand(MemVTOpNum))->getVT();
- if ((SignExtSrcVT != GLD1SrcMemVT) || !Src.hasOneUse())
+ if ((SignExtSrcVT != SrcMemVT) || !Src.hasOneUse())
return SDValue();
EVT DstVT = N->getValueType(0);
SDVTList VTs = DAG.getVTList(DstVT, MVT::Other);
- SDValue Ops[] = {Src->getOperand(0), Src->getOperand(1), Src->getOperand(2),
- Src->getOperand(3), Src->getOperand(4)};
+
+ SmallVector<SDValue, 5> Ops;
+ for (unsigned I = 0; I < Src->getNumOperands(); ++I)
+ Ops.push_back(Src->getOperand(I));
SDValue ExtLoad = DAG.getNode(NewOpc, SDLoc(N), VTs, Ops);
DCI.CombineTo(N, ExtLoad);
@@ -12467,6 +13795,51 @@ performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
return SDValue(N, 0);
}
+/// Legalize the gather prefetch (scalar + vector addressing mode) when the
+/// offset vector is an unpacked 32-bit scalable vector. The other cases (Offset
+/// != nxv2i32) do not need legalization.
+static SDValue legalizeSVEGatherPrefetchOffsVec(SDNode *N, SelectionDAG &DAG) {
+ const unsigned OffsetPos = 4;
+ SDValue Offset = N->getOperand(OffsetPos);
+
+ // Not an unpacked vector, bail out.
+ if (Offset.getValueType().getSimpleVT().SimpleTy != MVT::nxv2i32)
+ return SDValue();
+
+ // Extend the unpacked offset vector to 64-bit lanes.
+ SDLoc DL(N);
+ Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset);
+ SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
+ // Replace the offset operand with the 64-bit one.
+ Ops[OffsetPos] = Offset;
+
+ return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
+}
+
+/// Combines a node carrying the intrinsic
+/// `aarch64_sve_prf<T>_gather_scalar_offset` into a node that uses
+/// `aarch64_sve_prfb_gather_uxtw_index` when the scalar offset passed to
+/// `aarch64_sve_prf<T>_gather_scalar_offset` is not a valid immediate for the
+/// SVE gather prefetch instruction with vector plus immediate addressing mode.
+static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG,
+ unsigned ScalarSizeInBytes) {
+ const unsigned ImmPos = 4, OffsetPos = 3;
+ // No need to combine the node if the immediate is valid...
+ if (isValidImmForSVEVecImmAddrMode(N->getOperand(ImmPos), ScalarSizeInBytes))
+ return SDValue();
+
+ // ...otherwise swap the offset base with the offset...
+ SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
+ std::swap(Ops[ImmPos], Ops[OffsetPos]);
+ // ...and remap the intrinsic `aarch64_sve_prf<T>_gather_scalar_offset` to
+ // `aarch64_sve_prfb_gather_uxtw_index`.
+ SDLoc DL(N);
+ Ops[1] = DAG.getConstant(Intrinsic::aarch64_sve_prfb_gather_uxtw_index, DL,
+ MVT::i64);
+
+ return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
+}
+
SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -12531,6 +13904,23 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::INTRINSIC_VOID:
case ISD::INTRINSIC_W_CHAIN:
switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
+ case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
+ return combineSVEPrefetchVecBaseImmOff(N, DAG, 1 /*=ScalarSizeInBytes*/);
+ case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
+ return combineSVEPrefetchVecBaseImmOff(N, DAG, 2 /*=ScalarSizeInBytes*/);
+ case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
+ return combineSVEPrefetchVecBaseImmOff(N, DAG, 4 /*=ScalarSizeInBytes*/);
+ case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
+ return combineSVEPrefetchVecBaseImmOff(N, DAG, 8 /*=ScalarSizeInBytes*/);
+ case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
+ case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
+ case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
+ case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
+ case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
+ case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
+ case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
+ case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
+ return legalizeSVEGatherPrefetchOffsVec(N, DAG);
case Intrinsic::aarch64_neon_ld2:
case Intrinsic::aarch64_neon_ld3:
case Intrinsic::aarch64_neon_ld4:
@@ -12555,44 +13945,180 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
return performNEONPostLDSTCombine(N, DCI, DAG);
case Intrinsic::aarch64_sve_ldnt1:
return performLDNT1Combine(N, DAG);
+ case Intrinsic::aarch64_sve_ld1rq:
+ return performLD1ReplicateCombine<AArch64ISD::LD1RQ_MERGE_ZERO>(N, DAG);
+ case Intrinsic::aarch64_sve_ld1ro:
+ return performLD1ReplicateCombine<AArch64ISD::LD1RO_MERGE_ZERO>(N, DAG);
+ case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
+ return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
+ case Intrinsic::aarch64_sve_ldnt1_gather:
+ return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
+ case Intrinsic::aarch64_sve_ldnt1_gather_index:
+ return performGatherLoadCombine(N, DAG,
+ AArch64ISD::GLDNT1_INDEX_MERGE_ZERO);
+ case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
+ return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
+ case Intrinsic::aarch64_sve_ld1:
+ return performLD1Combine(N, DAG, AArch64ISD::LD1_MERGE_ZERO);
+ case Intrinsic::aarch64_sve_ldnf1:
+ return performLD1Combine(N, DAG, AArch64ISD::LDNF1_MERGE_ZERO);
+ case Intrinsic::aarch64_sve_ldff1:
+ return performLD1Combine(N, DAG, AArch64ISD::LDFF1_MERGE_ZERO);
+ case Intrinsic::aarch64_sve_st1:
+ return performST1Combine(N, DAG);
case Intrinsic::aarch64_sve_stnt1:
return performSTNT1Combine(N, DAG);
+ case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
+ return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
+ case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
+ return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
+ case Intrinsic::aarch64_sve_stnt1_scatter:
+ return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
+ case Intrinsic::aarch64_sve_stnt1_scatter_index:
+ return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_INDEX_PRED);
case Intrinsic::aarch64_sve_ld1_gather:
- return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1);
+ return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_MERGE_ZERO);
case Intrinsic::aarch64_sve_ld1_gather_index:
- return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_SCALED);
+ return performGatherLoadCombine(N, DAG,
+ AArch64ISD::GLD1_SCALED_MERGE_ZERO);
case Intrinsic::aarch64_sve_ld1_gather_sxtw:
- return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_SXTW,
+ return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_SXTW_MERGE_ZERO,
/*OnlyPackedOffsets=*/false);
case Intrinsic::aarch64_sve_ld1_gather_uxtw:
- return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_UXTW,
+ return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_UXTW_MERGE_ZERO,
/*OnlyPackedOffsets=*/false);
case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
- return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_SXTW_SCALED,
+ return performGatherLoadCombine(N, DAG,
+ AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO,
/*OnlyPackedOffsets=*/false);
case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
- return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_UXTW_SCALED,
+ return performGatherLoadCombine(N, DAG,
+ AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO,
+ /*OnlyPackedOffsets=*/false);
+ case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
+ return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_IMM_MERGE_ZERO);
+ case Intrinsic::aarch64_sve_ldff1_gather:
+ return performGatherLoadCombine(N, DAG, AArch64ISD::GLDFF1_MERGE_ZERO);
+ case Intrinsic::aarch64_sve_ldff1_gather_index:
+ return performGatherLoadCombine(N, DAG,
+ AArch64ISD::GLDFF1_SCALED_MERGE_ZERO);
+ case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
+ return performGatherLoadCombine(N, DAG,
+ AArch64ISD::GLDFF1_SXTW_MERGE_ZERO,
+ /*OnlyPackedOffsets=*/false);
+ case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
+ return performGatherLoadCombine(N, DAG,
+ AArch64ISD::GLDFF1_UXTW_MERGE_ZERO,
+ /*OnlyPackedOffsets=*/false);
+ case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
+ return performGatherLoadCombine(N, DAG,
+ AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO,
+ /*OnlyPackedOffsets=*/false);
+ case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
+ return performGatherLoadCombine(N, DAG,
+ AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO,
/*OnlyPackedOffsets=*/false);
- case Intrinsic::aarch64_sve_ld1_gather_imm:
- return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_IMM);
+ case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
+ return performGatherLoadCombine(N, DAG,
+ AArch64ISD::GLDFF1_IMM_MERGE_ZERO);
case Intrinsic::aarch64_sve_st1_scatter:
- return performST1ScatterCombine(N, DAG, AArch64ISD::SST1);
+ return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_PRED);
case Intrinsic::aarch64_sve_st1_scatter_index:
- return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_SCALED);
+ return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SCALED_PRED);
case Intrinsic::aarch64_sve_st1_scatter_sxtw:
- return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_SXTW,
- /*OnlyPackedOffsets=*/false);
+ return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SXTW_PRED,
+ /*OnlyPackedOffsets=*/false);
case Intrinsic::aarch64_sve_st1_scatter_uxtw:
- return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_UXTW,
- /*OnlyPackedOffsets=*/false);
+ return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_UXTW_PRED,
+ /*OnlyPackedOffsets=*/false);
case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
- return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_SXTW_SCALED,
- /*OnlyPackedOffsets=*/false);
+ return performScatterStoreCombine(N, DAG,
+ AArch64ISD::SST1_SXTW_SCALED_PRED,
+ /*OnlyPackedOffsets=*/false);
case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
- return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_UXTW_SCALED,
- /*OnlyPackedOffsets=*/false);
- case Intrinsic::aarch64_sve_st1_scatter_imm:
- return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_IMM);
+ return performScatterStoreCombine(N, DAG,
+ AArch64ISD::SST1_UXTW_SCALED_PRED,
+ /*OnlyPackedOffsets=*/false);
+ case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
+ return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_IMM_PRED);
+ case Intrinsic::aarch64_sve_tuple_get: {
+ SDLoc DL(N);
+ SDValue Chain = N->getOperand(0);
+ SDValue Src1 = N->getOperand(2);
+ SDValue Idx = N->getOperand(3);
+
+ uint64_t IdxConst = cast<ConstantSDNode>(Idx)->getZExtValue();
+ EVT ResVT = N->getValueType(0);
+ uint64_t NumLanes = ResVT.getVectorElementCount().Min;
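+    // E.g. (illustrative, not part of this patch): for a two-element tuple of
+    // nxv8i16 vectors held in one nxv16i16 value, Idx == 1 extracts lanes
+    // [8, 16).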
+ SDValue Val =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Src1,
+ DAG.getConstant(IdxConst * NumLanes, DL, MVT::i32));
+ return DAG.getMergeValues({Val, Chain}, DL);
+ }
+ case Intrinsic::aarch64_sve_tuple_set: {
+ SDLoc DL(N);
+ SDValue Chain = N->getOperand(0);
+ SDValue Tuple = N->getOperand(2);
+ SDValue Idx = N->getOperand(3);
+ SDValue Vec = N->getOperand(4);
+
+ EVT TupleVT = Tuple.getValueType();
+ uint64_t TupleLanes = TupleVT.getVectorElementCount().Min;
+
+ uint64_t IdxConst = cast<ConstantSDNode>(Idx)->getZExtValue();
+ uint64_t NumLanes = Vec.getValueType().getVectorElementCount().Min;
+
+ if ((TupleLanes % NumLanes) != 0)
+ report_fatal_error("invalid tuple vector!");
+
+ uint64_t NumVecs = TupleLanes / NumLanes;
+
+ SmallVector<SDValue, 4> Opnds;
+ for (unsigned I = 0; I < NumVecs; ++I) {
+ if (I == IdxConst)
+ Opnds.push_back(Vec);
+ else {
+ Opnds.push_back(
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Vec.getValueType(), Tuple,
+ DAG.getConstant(I * NumLanes, DL, MVT::i32)));
+ }
+ }
+ SDValue Concat =
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, Tuple.getValueType(), Opnds);
+ return DAG.getMergeValues({Concat, Chain}, DL);
+ }
+ case Intrinsic::aarch64_sve_tuple_create2:
+ case Intrinsic::aarch64_sve_tuple_create3:
+ case Intrinsic::aarch64_sve_tuple_create4: {
+ SDLoc DL(N);
+ SDValue Chain = N->getOperand(0);
+
+ SmallVector<SDValue, 4> Opnds;
+ for (unsigned I = 2; I < N->getNumOperands(); ++I)
+ Opnds.push_back(N->getOperand(I));
+
+ EVT VT = Opnds[0].getValueType();
+ EVT EltVT = VT.getVectorElementType();
+ EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
+ VT.getVectorElementCount() *
+ (N->getNumOperands() - 2));
+ SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, DestVT, Opnds);
+ return DAG.getMergeValues({Concat, Chain}, DL);
+ }
+ case Intrinsic::aarch64_sve_ld2:
+ case Intrinsic::aarch64_sve_ld3:
+ case Intrinsic::aarch64_sve_ld4: {
+ SDLoc DL(N);
+ SDValue Chain = N->getOperand(0);
+ SDValue Mask = N->getOperand(2);
+ SDValue BasePtr = N->getOperand(3);
+ SDValue LoadOps[] = {Chain, Mask, BasePtr};
+ unsigned IntrinsicID =
+ cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ SDValue Result =
+ LowerSVEStructLoad(IntrinsicID, LoadOps, N->getValueType(0), DAG, DL);
+ return DAG.getMergeValues({Result, Chain}, DL);
+ }
default:
break;
}
@@ -12724,7 +14250,8 @@ static void ReplaceBITCASTResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
SDLoc DL(N);
SDValue Op = N->getOperand(0);
- if (N->getValueType(0) != MVT::i16 || Op.getValueType() != MVT::f16)
+ if (N->getValueType(0) != MVT::i16 ||
+ (Op.getValueType() != MVT::f16 && Op.getValueType() != MVT::bf16))
return;
Op = SDValue(
@@ -12759,6 +14286,40 @@ static std::pair<SDValue, SDValue> splitInt128(SDValue N, SelectionDAG &DAG) {
return std::make_pair(Lo, Hi);
}
+void AArch64TargetLowering::ReplaceExtractSubVectorResults(
+ SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
+ SDValue In = N->getOperand(0);
+ EVT InVT = In.getValueType();
+
+ // Common code will handle these just fine.
+ if (!InVT.isScalableVector() || !InVT.isInteger())
+ return;
+
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+
+ // The following checks bail if this is not a halving operation.
+
+ ElementCount ResEC = VT.getVectorElementCount();
+
+ if (InVT.getVectorElementCount().Min != (ResEC.Min * 2))
+ return;
+
+ auto *CIndex = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (!CIndex)
+ return;
+
+ unsigned Index = CIndex->getZExtValue();
+ if ((Index != 0) && (Index != ResEC.Min))
+ return;
+
+ unsigned Opcode = (Index == 0) ? AArch64ISD::UUNPKLO : AArch64ISD::UUNPKHI;
+ EVT ExtendedHalfVT = VT.widenIntegerVectorElementType(*DAG.getContext());
+
+ SDValue Half = DAG.getNode(Opcode, DL, ExtendedHalfVT, N->getOperand(0));
+ Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, Half));
+}
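+// For example (illustrative, not part of this patch):
+//   (nxv2i32 extract_subvector (nxv4i32 X), 0) -> (trunc (uunpklo X))
+//   (nxv2i32 extract_subvector (nxv4i32 X), 2) -> (trunc (uunpkhi X))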
+
// Create an even/odd pair of X registers holding integer value V.
static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
SDLoc dl(V.getNode());
@@ -12822,10 +14383,12 @@ static void ReplaceCMP_SWAP_128Results(SDNode *N,
unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64;
if (DAG.getDataLayout().isBigEndian())
std::swap(SubReg1, SubReg2);
- Results.push_back(DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64,
- SDValue(CmpSwap, 0)));
- Results.push_back(DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64,
- SDValue(CmpSwap, 0)));
+ SDValue Lo = DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64,
+ SDValue(CmpSwap, 0));
+ SDValue Hi = DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64,
+ SDValue(CmpSwap, 0));
+ Results.push_back(
+ DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
Results.push_back(SDValue(CmpSwap, 1)); // Chain out
return;
}
@@ -12841,8 +14404,8 @@ static void ReplaceCMP_SWAP_128Results(SDNode *N,
MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
- Results.push_back(SDValue(CmpSwap, 0));
- Results.push_back(SDValue(CmpSwap, 1));
+ Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
+ SDValue(CmpSwap, 0), SDValue(CmpSwap, 1)));
Results.push_back(SDValue(CmpSwap, 3));
}
@@ -12862,6 +14425,9 @@ void AArch64TargetLowering::ReplaceNodeResults(
Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG));
return;
+ case ISD::CTPOP:
+ Results.push_back(LowerCTPOP(SDValue(N, 0), DAG));
+ return;
case AArch64ISD::SADDV:
ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV);
return;
@@ -12909,6 +14475,9 @@ void AArch64TargetLowering::ReplaceNodeResults(
Results.append({Pair, Result.getValue(2) /* Chain */});
return;
}
+ case ISD::EXTRACT_SUBVECTOR:
+ ReplaceExtractSubVectorResults(N, Results, DAG);
+ return;
case ISD::INTRINSIC_WO_CHAIN: {
EVT VT = N->getValueType(0);
assert((VT == MVT::i8 || VT == MVT::i16) &&
@@ -13019,7 +14588,7 @@ AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR(
// on the stack and close enough to the spill slot, this can lead to a
// situation where the monitor always gets cleared and the atomic operation
// can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
- if (getTargetMachine().getOptLevel() == 0)
+ if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
return AtomicExpansionKind::None;
return AtomicExpansionKind::LLSC;
}
@@ -13278,8 +14847,7 @@ bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
// integer division, leaving the division as-is is a loss even in terms of
// size, because it will have to be scalarized, while the alternative code
// sequence can be performed in vector form.
- bool OptSize =
- Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize);
+ bool OptSize = Attr.hasFnAttribute(Attribute::MinSize);
return OptSize && !VT.isVector();
}
@@ -13309,3 +14877,280 @@ void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
bool AArch64TargetLowering::needsFixedCatchObjects() const {
return false;
}
+
+bool AArch64TargetLowering::shouldLocalize(
+ const MachineInstr &MI, const TargetTransformInfo *TTI) const {
+ switch (MI.getOpcode()) {
+ case TargetOpcode::G_GLOBAL_VALUE: {
+ // On Darwin, TLS global vars get selected into function calls, which
+    // we don't want localized, as they can get moved into the middle of
+    // another call sequence.
+ const GlobalValue &GV = *MI.getOperand(1).getGlobal();
+ if (GV.isThreadLocal() && Subtarget->isTargetMachO())
+ return false;
+ break;
+ }
+ // If we legalized G_GLOBAL_VALUE into ADRP + G_ADD_LOW, mark both as being
+ // localizable.
+ case AArch64::ADRP:
+ case AArch64::G_ADD_LOW:
+ return true;
+ default:
+ break;
+ }
+ return TargetLoweringBase::shouldLocalize(MI, TTI);
+}
+
+bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const {
+ if (isa<ScalableVectorType>(Inst.getType()))
+ return true;
+
+ for (unsigned i = 0; i < Inst.getNumOperands(); ++i)
+ if (isa<ScalableVectorType>(Inst.getOperand(i)->getType()))
+ return true;
+
+ return false;
+}
+
+// Return the largest legal scalable vector type that matches VT's element type.
+static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT) {
+ assert(VT.isFixedLengthVector() &&
+ DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
+ "Expected legal fixed length vector!");
+ switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
+ default:
+ llvm_unreachable("unexpected element type for SVE container");
+ case MVT::i8:
+ return EVT(MVT::nxv16i8);
+ case MVT::i16:
+ return EVT(MVT::nxv8i16);
+ case MVT::i32:
+ return EVT(MVT::nxv4i32);
+ case MVT::i64:
+ return EVT(MVT::nxv2i64);
+ case MVT::f16:
+ return EVT(MVT::nxv8f16);
+ case MVT::f32:
+ return EVT(MVT::nxv4f32);
+ case MVT::f64:
+ return EVT(MVT::nxv2f64);
+ }
+}
+
+// Return a PTRUE with active lanes corresponding to the extent of VT.
+static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL,
+ EVT VT) {
+ assert(VT.isFixedLengthVector() &&
+ DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
+ "Expected legal fixed length vector!");
+
+ int PgPattern;
+ switch (VT.getVectorNumElements()) {
+ default:
+ llvm_unreachable("unexpected element count for SVE predicate");
+ case 1:
+ PgPattern = AArch64SVEPredPattern::vl1;
+ break;
+ case 2:
+ PgPattern = AArch64SVEPredPattern::vl2;
+ break;
+ case 4:
+ PgPattern = AArch64SVEPredPattern::vl4;
+ break;
+ case 8:
+ PgPattern = AArch64SVEPredPattern::vl8;
+ break;
+ case 16:
+ PgPattern = AArch64SVEPredPattern::vl16;
+ break;
+ case 32:
+ PgPattern = AArch64SVEPredPattern::vl32;
+ break;
+ case 64:
+ PgPattern = AArch64SVEPredPattern::vl64;
+ break;
+ case 128:
+ PgPattern = AArch64SVEPredPattern::vl128;
+ break;
+ case 256:
+ PgPattern = AArch64SVEPredPattern::vl256;
+ break;
+ }
+
+ // TODO: For vectors that are exactly getMaxSVEVectorSizeInBits big, we can
+ // use AArch64SVEPredPattern::all, which can enable the use of unpredicated
+ // variants of instructions when available.
+
+ MVT MaskVT;
+ switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
+ default:
+ llvm_unreachable("unexpected element type for SVE predicate");
+ case MVT::i8:
+ MaskVT = MVT::nxv16i1;
+ break;
+ case MVT::i16:
+ case MVT::f16:
+ MaskVT = MVT::nxv8i1;
+ break;
+ case MVT::i32:
+ case MVT::f32:
+ MaskVT = MVT::nxv4i1;
+ break;
+ case MVT::i64:
+ case MVT::f64:
+ MaskVT = MVT::nxv2i1;
+ break;
+ }
+
+ return DAG.getNode(AArch64ISD::PTRUE, DL, MaskVT,
+ DAG.getTargetConstant(PgPattern, DL, MVT::i64));
+}
+
+static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL,
+ EVT VT) {
+ assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
+ "Expected legal scalable vector!");
+ auto PredTy = VT.changeVectorElementType(MVT::i1);
+ return getPTrue(DAG, DL, PredTy, AArch64SVEPredPattern::all);
+}
+
+static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT) {
+ if (VT.isFixedLengthVector())
+ return getPredicateForFixedLengthVector(DAG, DL, VT);
+
+ return getPredicateForScalableVector(DAG, DL, VT);
+}
+
+// Grow V to consume an entire SVE register.
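+// For example, a v4i32 value becomes lanes 0-3 of an nxv4i32 register, with
+// any remaining lanes undefined.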
+static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
+ assert(VT.isScalableVector() &&
+ "Expected to convert into a scalable vector!");
+ assert(V.getValueType().isFixedLengthVector() &&
+ "Expected a fixed length vector operand!");
+ SDLoc DL(V);
+ SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, Zero);
+}
+
+// Shrink V so it's just big enough to maintain a VT's worth of data.
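+// The extract at index 0 simply reinterprets the low VT-sized portion of the
+// scalable register, so no data movement is expected.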
+static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
+ assert(VT.isFixedLengthVector() &&
+ "Expected to convert into a fixed length vector!");
+ assert(V.getValueType().isScalableVector() &&
+ "Expected a scalable vector operand!");
+ SDLoc DL(V);
+ SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, Zero);
+}
+
+// Convert all fixed length vector loads larger than NEON to masked_loads.
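+// For example, a v8i32 load is lowered to a masked load of the nxv4i32
+// container predicated by "ptrue p<n>.s, vl8", which can select to ld1w.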
+SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE(
+ SDValue Op, SelectionDAG &DAG) const {
+ auto Load = cast<LoadSDNode>(Op);
+
+ SDLoc DL(Op);
+ EVT VT = Op.getValueType();
+ EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
+
+ auto NewLoad = DAG.getMaskedLoad(
+ ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(),
+ getPredicateForFixedLengthVector(DAG, DL, VT), DAG.getUNDEF(ContainerVT),
+ Load->getMemoryVT(), Load->getMemOperand(), Load->getAddressingMode(),
+ Load->getExtensionType());
+
+ auto Result = convertFromScalableVector(DAG, VT, NewLoad);
+ SDValue MergedValues[2] = {Result, Load->getChain()};
+ return DAG.getMergeValues(MergedValues, DL);
+}
+
+// Convert all fixed length vector stores larger than NEON to masked_stores.
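+// Stores mirror the loads above, e.g. a v8i32 store becomes a masked store of
+// the nxv4i32 container under the same vl8 predicate.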
+SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
+ SDValue Op, SelectionDAG &DAG) const {
+ auto Store = cast<StoreSDNode>(Op);
+
+ SDLoc DL(Op);
+ EVT VT = Store->getValue().getValueType();
+ EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
+
+ auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
+ return DAG.getMaskedStore(
+ Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),
+ getPredicateForFixedLengthVector(DAG, DL, VT), Store->getMemoryVT(),
+ Store->getMemOperand(), Store->getAddressingMode(),
+ Store->isTruncatingStore());
+}
+
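+// Lower fixed length vector truncates by repeatedly halving the element size
+// with UZP1 (which keeps the even-indexed, i.e. low, elements). For example,
+// truncating v4i64 to v4i8 walks nxv2i64 -> nxv4i32 -> nxv8i16 -> nxv16i8.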
+SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE(
+ SDValue Op, SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+ assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
+
+ SDLoc DL(Op);
+ SDValue Val = Op.getOperand(0);
+ EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
+ Val = convertToScalableVector(DAG, ContainerVT, Val);
+
+ // Repeatedly truncate Val until the result is of the desired element type.
+ switch (ContainerVT.getSimpleVT().SimpleTy) {
+ default:
+ llvm_unreachable("unimplemented container type");
+ case MVT::nxv2i64:
+ Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv4i32, Val);
+ Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv4i32, Val, Val);
+ if (VT.getVectorElementType() == MVT::i32)
+ break;
+ LLVM_FALLTHROUGH;
+ case MVT::nxv4i32:
+ Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv8i16, Val);
+ Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv8i16, Val, Val);
+ if (VT.getVectorElementType() == MVT::i16)
+ break;
+ LLVM_FALLTHROUGH;
+ case MVT::nxv8i16:
+ Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i8, Val);
+ Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv16i8, Val, Val);
+ assert(VT.getVectorElementType() == MVT::i8 && "Unexpected element type!");
+ break;
+ }
+
+ return convertFromScalableVector(DAG, VT, Val);
+}
+
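+// Replace Op with the SVE operation NewOp, prepending a governing predicate
+// that is all-active for scalable types or VL-wide for fixed length types.
+// For example, a vector ISD::ADD becomes "NewOp(ptrue, a, b)".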
+SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
+ SelectionDAG &DAG,
+ unsigned NewOp) const {
+ EVT VT = Op.getValueType();
+ SDLoc DL(Op);
+ auto Pg = getPredicateForVector(DAG, DL, VT);
+
+ if (useSVEForFixedLengthVectorVT(VT)) {
+ EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
+
+ // Create list of operands by converting existing ones to scalable types.
+ SmallVector<SDValue, 4> Operands = {Pg};
+ for (const SDValue &V : Op->op_values()) {
+ if (isa<CondCodeSDNode>(V)) {
+ Operands.push_back(V);
+ continue;
+ }
+
+ assert(useSVEForFixedLengthVectorVT(V.getValueType()) &&
+ "Only fixed length vectors are supported!");
+ Operands.push_back(convertToScalableVector(DAG, ContainerVT, V));
+ }
+
+ auto ScalableRes = DAG.getNode(NewOp, DL, ContainerVT, Operands);
+ return convertFromScalableVector(DAG, VT, ScalableRes);
+ }
+
+ assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");
+
+ SmallVector<SDValue, 4> Operands = {Pg};
+ for (const SDValue &V : Op->op_values()) {
+ assert((isa<CondCodeSDNode>(V) || V.getValueType().isScalableVector()) &&
+ "Only scalable vectors are supported!");
+ Operands.push_back(V);
+ }
+
+ return DAG.getNode(NewOp, DL, VT, Operands);
+}