path: root/lib/Target/ARM/ARMISelLowering.cpp
Diffstat (limited to 'lib/Target/ARM/ARMISelLowering.cpp')
-rw-r--r--  lib/Target/ARM/ARMISelLowering.cpp | 1556
1 file changed, 1129 insertions(+), 427 deletions(-)
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 21de0f6a7630..18bb9bf3eccc 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -1,9 +1,8 @@
//===- ARMISelLowering.cpp - ARM DAG Lowering Implementation --------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -80,6 +79,7 @@
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
@@ -113,6 +113,7 @@
#include <vector>
using namespace llvm;
+using namespace llvm::PatternMatch;
#define DEBUG_TYPE "arm-isel"
@@ -220,6 +221,121 @@ void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
addTypeForNEON(VT, MVT::v2f64, MVT::v4i32);
}
+void ARMTargetLowering::setAllExpand(MVT VT) {
+ for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
+ setOperationAction(Opc, VT, Expand);
+
+ // We support these really simple operations even on types where all
+ // the actual arithmetic has to be broken down into simpler
+ // operations or turned into library calls.
+ setOperationAction(ISD::BITCAST, VT, Legal);
+ setOperationAction(ISD::LOAD, VT, Legal);
+ setOperationAction(ISD::STORE, VT, Legal);
+ setOperationAction(ISD::UNDEF, VT, Legal);
+}
+
+void ARMTargetLowering::addAllExtLoads(const MVT From, const MVT To,
+ LegalizeAction Action) {
+ setLoadExtAction(ISD::EXTLOAD, From, To, Action);
+ setLoadExtAction(ISD::ZEXTLOAD, From, To, Action);
+ setLoadExtAction(ISD::SEXTLOAD, From, To, Action);
+}
+
+void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
+ const MVT IntTypes[] = { MVT::v16i8, MVT::v8i16, MVT::v4i32 };
+
+ for (auto VT : IntTypes) {
+ addRegisterClass(VT, &ARM::QPRRegClass);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::SHL, VT, Custom);
+ setOperationAction(ISD::SRA, VT, Custom);
+ setOperationAction(ISD::SRL, VT, Custom);
+ setOperationAction(ISD::SMIN, VT, Legal);
+ setOperationAction(ISD::SMAX, VT, Legal);
+ setOperationAction(ISD::UMIN, VT, Legal);
+ setOperationAction(ISD::UMAX, VT, Legal);
+ setOperationAction(ISD::ABS, VT, Legal);
+
+ // No native support for these.
+ setOperationAction(ISD::UDIV, VT, Expand);
+ setOperationAction(ISD::SDIV, VT, Expand);
+ setOperationAction(ISD::UREM, VT, Expand);
+ setOperationAction(ISD::SREM, VT, Expand);
+
+ if (!HasMVEFP) {
+ setOperationAction(ISD::SINT_TO_FP, VT, Expand);
+ setOperationAction(ISD::UINT_TO_FP, VT, Expand);
+ setOperationAction(ISD::FP_TO_SINT, VT, Expand);
+ setOperationAction(ISD::FP_TO_UINT, VT, Expand);
+ }
+ }
+
+ const MVT FloatTypes[] = { MVT::v8f16, MVT::v4f32 };
+ for (auto VT : FloatTypes) {
+ addRegisterClass(VT, &ARM::QPRRegClass);
+ if (!HasMVEFP)
+ setAllExpand(VT);
+
+ // These are legal or custom whether or not we have MVE.fp
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT.getVectorElementType(), Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, VT.getVectorElementType(), Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Legal);
+
+ if (HasMVEFP) {
+ setOperationAction(ISD::FMINNUM, VT, Legal);
+ setOperationAction(ISD::FMAXNUM, VT, Legal);
+ setOperationAction(ISD::FROUND, VT, Legal);
+
+ // No native support for these.
+ setOperationAction(ISD::FDIV, VT, Expand);
+ setOperationAction(ISD::FREM, VT, Expand);
+ setOperationAction(ISD::FSQRT, VT, Expand);
+ setOperationAction(ISD::FSIN, VT, Expand);
+ setOperationAction(ISD::FCOS, VT, Expand);
+ setOperationAction(ISD::FPOW, VT, Expand);
+ setOperationAction(ISD::FLOG, VT, Expand);
+ setOperationAction(ISD::FLOG2, VT, Expand);
+ setOperationAction(ISD::FLOG10, VT, Expand);
+ setOperationAction(ISD::FEXP, VT, Expand);
+ setOperationAction(ISD::FEXP2, VT, Expand);
+ setOperationAction(ISD::FNEARBYINT, VT, Expand);
+ }
+ }
+
+ // We 'support' these types up to bitcast/load/store level, regardless of
+ // MVE integer-only / float support. Only doing FP data processing on the FP
+ // vector types is inhibited at integer-only level.
+ const MVT LongTypes[] = { MVT::v2i64, MVT::v2f64 };
+ for (auto VT : LongTypes) {
+ addRegisterClass(VT, &ARM::QPRRegClass);
+ setAllExpand(VT);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ }
+ // We can do bitwise operations on v2i64 vectors
+ setOperationAction(ISD::AND, MVT::v2i64, Legal);
+ setOperationAction(ISD::OR, MVT::v2i64, Legal);
+ setOperationAction(ISD::XOR, MVT::v2i64, Legal);
+
+ // It is legal to extload from v8i8 to v8i16, and from v4i8 or v4i16 to v4i32.
+ addAllExtLoads(MVT::v8i16, MVT::v8i8, Legal);
+ addAllExtLoads(MVT::v4i32, MVT::v4i16, Legal);
+ addAllExtLoads(MVT::v4i32, MVT::v4i8, Legal);
+
+ // Some truncating stores are legal too.
+ setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
+ setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
+ setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
+}
+
ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
const ARMSubtarget &STI)
: TargetLowering(TM), Subtarget(&STI) {
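
As a rough illustration (not part of the patch), the extending-load and truncating-store legality set up in addMVEVectorTypes above corresponds to widening and narrowing loops like the sketch below, which a vectorizer can then lower to a single widening vector load (e.g. v4i8 -> v4i32) or narrowing vector store instead of per-lane unpacking. The function names are made up for the example:

#include <cstddef>
#include <cstdint>

// Hypothetical source-level pattern: a zero-extending element load per
// iteration, matching the legal v8i8->v8i16 / v4i8->v4i32 / v4i16->v4i32
// extending loads marked Legal above.
void widen(const uint8_t *Src, uint32_t *Dst, size_t N) {
  for (size_t I = 0; I < N; ++I)
    Dst[I] = Src[I];
}

// The reverse direction, matching the legal truncating stores
// (v4i32->v4i16, v4i32->v4i8, v8i16->v8i8).
void narrow(const uint32_t *Src, uint8_t *Dst, size_t N) {
  for (size_t I = 0; I < N; ++I)
    Dst[I] = uint8_t(Src[I]);
}
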
@@ -240,7 +356,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
if (Subtarget->isTargetMachO()) {
// Uses VFP for Thumb libfuncs if available.
- if (Subtarget->isThumb() && Subtarget->hasVFP2() &&
+ if (Subtarget->isThumb() && Subtarget->hasVFP2Base() &&
Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) {
static const struct {
const RTLIB::Libcall Op;
@@ -509,10 +625,14 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
else
addRegisterClass(MVT::i32, &ARM::GPRRegClass);
- if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() &&
- !Subtarget->isThumb1Only()) {
+ if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only() &&
+ Subtarget->hasFPRegs()) {
addRegisterClass(MVT::f32, &ARM::SPRRegClass);
addRegisterClass(MVT::f64, &ARM::DPRRegClass);
+ if (!Subtarget->hasVFP2Base())
+ setAllExpand(MVT::f32);
+ if (!Subtarget->hasFP64())
+ setAllExpand(MVT::f64);
}
if (Subtarget->hasFullFP16()) {
@@ -528,9 +648,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
for (MVT VT : MVT::vector_valuetypes()) {
for (MVT InnerVT : MVT::vector_valuetypes()) {
setTruncStoreAction(VT, InnerVT, Expand);
- setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
- setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
- setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
+ addAllExtLoads(VT, InnerVT, Expand);
}
setOperationAction(ISD::MULHS, VT, Expand);
@@ -547,6 +665,13 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::READ_REGISTER, MVT::i64, Custom);
setOperationAction(ISD::WRITE_REGISTER, MVT::i64, Custom);
+ if (Subtarget->hasMVEIntegerOps())
+ addMVEVectorTypes(Subtarget->hasMVEFloatOps());
+
+ // Combine low-overhead loop intrinsics so that we can lower i1 types.
+ if (Subtarget->hasLOB())
+ setTargetDAGCombine(ISD::BRCOND);
+
if (Subtarget->hasNEON()) {
addDRTypeForNEON(MVT::v2f32);
addDRTypeForNEON(MVT::v8i8);
@@ -565,11 +690,11 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
addQRTypeForNEON(MVT::v8f16);
addDRTypeForNEON(MVT::v4f16);
}
+ }
+ if (Subtarget->hasMVEIntegerOps() || Subtarget->hasNEON()) {
// v2f64 is legal so that QR subregs can be extracted as f64 elements, but
- // neither Neon nor VFP support any arithmetic operations on it.
- // The same with v4f32. But keep in mind that vadd, vsub, vmul are natively
- // supported for v4f32.
+ // none of Neon, MVE or VFP supports any arithmetic operations on it.
setOperationAction(ISD::FADD, MVT::v2f64, Expand);
setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
@@ -603,7 +728,11 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand);
setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand);
setOperationAction(ISD::FMA, MVT::v2f64, Expand);
+ }
+ if (Subtarget->hasNEON()) {
+ // The same with v4f32. But keep in mind that vadd, vsub, vmul are natively
+ // supported for v4f32.
setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
@@ -697,7 +826,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom);
// NEON only has FMA instructions as of VFP4.
- if (!Subtarget->hasVFP4()) {
+ if (!Subtarget->hasVFP4Base()) {
setOperationAction(ISD::FMA, MVT::v2f32, Expand);
setOperationAction(ISD::FMA, MVT::v4f32, Expand);
}
@@ -711,9 +840,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::SIGN_EXTEND);
setTargetDAGCombine(ISD::ZERO_EXTEND);
setTargetDAGCombine(ISD::ANY_EXTEND);
- setTargetDAGCombine(ISD::BUILD_VECTOR);
- setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
- setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
setTargetDAGCombine(ISD::STORE);
setTargetDAGCombine(ISD::FP_TO_SINT);
setTargetDAGCombine(ISD::FP_TO_UINT);
@@ -731,7 +857,13 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
}
}
- if (Subtarget->isFPOnlySP()) {
+ if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) {
+ setTargetDAGCombine(ISD::BUILD_VECTOR);
+ setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
+ setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
+ }
+
+ if (!Subtarget->hasFP64()) {
// When targeting a floating-point unit with only single-precision
// operations, f64 is legal for the few double-precision instructions which
// are present. However, no double-precision operations other than moves,
@@ -767,9 +899,19 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FP_TO_SINT, MVT::f64, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::f64, Custom);
setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
+ }
+
+ if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()){
setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
+ setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
}
+ if (!Subtarget->hasFP16())
+ setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
+
+ if (!Subtarget->hasFP64())
+ setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
+
computeRegisterProperties(Subtarget->getRegisterInfo());
// ARM does not have floating-point extending loads.
@@ -832,6 +974,11 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SRA, MVT::i64, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
+ // MVE lowers 64-bit shifts to lsll and lsrl,
+ // assuming that ISD::SRL and SRA of i64 are already marked custom.
+ if (Subtarget->hasMVEIntegerOps())
+ setOperationAction(ISD::SHL, MVT::i64, Custom);
+
// Expand to __aeabi_l{lsl,lsr,asr} calls for Thumb1.
if (Subtarget->isThumb1Only()) {
setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand);
@@ -1029,7 +1176,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
}
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
- if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() &&
+ if (!Subtarget->useSoftFloat() && Subtarget->hasFPRegs() &&
!Subtarget->isThumb1Only()) {
// Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR
// iff target supports vfp2.
@@ -1079,7 +1226,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
setOperationAction(ISD::FREM, MVT::f64, Expand);
setOperationAction(ISD::FREM, MVT::f32, Expand);
- if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() &&
+ if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2Base() &&
!Subtarget->isThumb1Only()) {
setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
@@ -1087,7 +1234,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FPOW, MVT::f64, Expand);
setOperationAction(ISD::FPOW, MVT::f32, Expand);
- if (!Subtarget->hasVFP4()) {
+ if (!Subtarget->hasVFP4Base()) {
setOperationAction(ISD::FMA, MVT::f64, Expand);
setOperationAction(ISD::FMA, MVT::f32, Expand);
}
@@ -1095,7 +1242,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
// Various VFP goodness
if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) {
// FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be expanded.
- if (!Subtarget->hasFPARMv8() || Subtarget->isFPOnlySP()) {
+ if (!Subtarget->hasFPARMv8Base() || !Subtarget->hasFP64()) {
setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
}
@@ -1115,7 +1262,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
}
// FP-ARMv8 implements a lot of rounding-like FP operations.
- if (Subtarget->hasFPARMv8()) {
+ if (Subtarget->hasFPARMv8Base()) {
setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
setOperationAction(ISD::FCEIL, MVT::f32, Legal);
setOperationAction(ISD::FROUND, MVT::f32, Legal);
@@ -1124,12 +1271,14 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FRINT, MVT::f32, Legal);
setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
- setOperationAction(ISD::FMINNUM, MVT::v2f32, Legal);
- setOperationAction(ISD::FMAXNUM, MVT::v2f32, Legal);
- setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal);
- setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal);
+ if (Subtarget->hasNEON()) {
+ setOperationAction(ISD::FMINNUM, MVT::v2f32, Legal);
+ setOperationAction(ISD::FMAXNUM, MVT::v2f32, Legal);
+ setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal);
+ setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal);
+ }
- if (!Subtarget->isFPOnlySP()) {
+ if (Subtarget->hasFP64()) {
setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
setOperationAction(ISD::FCEIL, MVT::f64, Legal);
setOperationAction(ISD::FROUND, MVT::f64, Legal);
@@ -1141,6 +1290,24 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
}
}
+ // FP16 values often need to be promoted to call library functions
+ if (Subtarget->hasFullFP16()) {
+ setOperationAction(ISD::FREM, MVT::f16, Promote);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);
+ setOperationAction(ISD::FSIN, MVT::f16, Promote);
+ setOperationAction(ISD::FCOS, MVT::f16, Promote);
+ setOperationAction(ISD::FSINCOS, MVT::f16, Promote);
+ setOperationAction(ISD::FPOWI, MVT::f16, Promote);
+ setOperationAction(ISD::FPOW, MVT::f16, Promote);
+ setOperationAction(ISD::FEXP, MVT::f16, Promote);
+ setOperationAction(ISD::FEXP2, MVT::f16, Promote);
+ setOperationAction(ISD::FLOG, MVT::f16, Promote);
+ setOperationAction(ISD::FLOG10, MVT::f16, Promote);
+ setOperationAction(ISD::FLOG2, MVT::f16, Promote);
+
+ setOperationAction(ISD::FROUND, MVT::f16, Legal);
+ }
+
if (Subtarget->hasNEON()) {
// vmin and vmax aren't available in a scalar form, so we use
// a NEON instruction with an undef lane instead.
@@ -1177,11 +1344,13 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
if (Subtarget->hasV6Ops())
setTargetDAGCombine(ISD::SRL);
+ if (Subtarget->isThumb1Only())
+ setTargetDAGCombine(ISD::SHL);
setStackPointerRegisterToSaveRestore(ARM::SP);
if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() ||
- !Subtarget->hasVFP2())
+ !Subtarget->hasVFP2Base() || Subtarget->hasMinSize())
setSchedulingPreference(Sched::RegPressure);
else
setSchedulingPreference(Sched::Hybrid);
@@ -1204,6 +1373,9 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setPrefLoopAlignment(Subtarget->getPrefLoopAlignment());
setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2);
+
+ if (Subtarget->isThumb() || Subtarget->isThumb2())
+ setTargetDAGCombine(ISD::ABS);
}
bool ARMTargetLowering::useSoftFloat() const {
@@ -1288,6 +1460,10 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::SSAT: return "ARMISD::SSAT";
case ARMISD::USAT: return "ARMISD::USAT";
+ case ARMISD::ASRL: return "ARMISD::ASRL";
+ case ARMISD::LSRL: return "ARMISD::LSRL";
+ case ARMISD::LSLL: return "ARMISD::LSLL";
+
case ARMISD::SRL_FLAG: return "ARMISD::SRL_FLAG";
case ARMISD::SRA_FLAG: return "ARMISD::SRA_FLAG";
case ARMISD::RRX: return "ARMISD::RRX";
@@ -1332,23 +1508,25 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::VCGTU: return "ARMISD::VCGTU";
case ARMISD::VTST: return "ARMISD::VTST";
- case ARMISD::VSHL: return "ARMISD::VSHL";
- case ARMISD::VSHRs: return "ARMISD::VSHRs";
- case ARMISD::VSHRu: return "ARMISD::VSHRu";
- case ARMISD::VRSHRs: return "ARMISD::VRSHRs";
- case ARMISD::VRSHRu: return "ARMISD::VRSHRu";
- case ARMISD::VRSHRN: return "ARMISD::VRSHRN";
- case ARMISD::VQSHLs: return "ARMISD::VQSHLs";
- case ARMISD::VQSHLu: return "ARMISD::VQSHLu";
- case ARMISD::VQSHLsu: return "ARMISD::VQSHLsu";
- case ARMISD::VQSHRNs: return "ARMISD::VQSHRNs";
- case ARMISD::VQSHRNu: return "ARMISD::VQSHRNu";
- case ARMISD::VQSHRNsu: return "ARMISD::VQSHRNsu";
- case ARMISD::VQRSHRNs: return "ARMISD::VQRSHRNs";
- case ARMISD::VQRSHRNu: return "ARMISD::VQRSHRNu";
- case ARMISD::VQRSHRNsu: return "ARMISD::VQRSHRNsu";
- case ARMISD::VSLI: return "ARMISD::VSLI";
- case ARMISD::VSRI: return "ARMISD::VSRI";
+ case ARMISD::VSHLs: return "ARMISD::VSHLs";
+ case ARMISD::VSHLu: return "ARMISD::VSHLu";
+ case ARMISD::VSHLIMM: return "ARMISD::VSHLIMM";
+ case ARMISD::VSHRsIMM: return "ARMISD::VSHRsIMM";
+ case ARMISD::VSHRuIMM: return "ARMISD::VSHRuIMM";
+ case ARMISD::VRSHRsIMM: return "ARMISD::VRSHRsIMM";
+ case ARMISD::VRSHRuIMM: return "ARMISD::VRSHRuIMM";
+ case ARMISD::VRSHRNIMM: return "ARMISD::VRSHRNIMM";
+ case ARMISD::VQSHLsIMM: return "ARMISD::VQSHLsIMM";
+ case ARMISD::VQSHLuIMM: return "ARMISD::VQSHLuIMM";
+ case ARMISD::VQSHLsuIMM: return "ARMISD::VQSHLsuIMM";
+ case ARMISD::VQSHRNsIMM: return "ARMISD::VQSHRNsIMM";
+ case ARMISD::VQSHRNuIMM: return "ARMISD::VQSHRNuIMM";
+ case ARMISD::VQSHRNsuIMM: return "ARMISD::VQSHRNsuIMM";
+ case ARMISD::VQRSHRNsIMM: return "ARMISD::VQRSHRNsIMM";
+ case ARMISD::VQRSHRNuIMM: return "ARMISD::VQRSHRNuIMM";
+ case ARMISD::VQRSHRNsuIMM: return "ARMISD::VQRSHRNsuIMM";
+ case ARMISD::VSLIIMM: return "ARMISD::VSLIIMM";
+ case ARMISD::VSRIIMM: return "ARMISD::VSRIIMM";
case ARMISD::VGETLANEu: return "ARMISD::VGETLANEu";
case ARMISD::VGETLANEs: return "ARMISD::VGETLANEs";
case ARMISD::VMOVIMM: return "ARMISD::VMOVIMM";
@@ -1410,6 +1588,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::VST2LN_UPD: return "ARMISD::VST2LN_UPD";
case ARMISD::VST3LN_UPD: return "ARMISD::VST3LN_UPD";
case ARMISD::VST4LN_UPD: return "ARMISD::VST4LN_UPD";
+ case ARMISD::WLS: return "ARMISD::WLS";
}
return nullptr;
}
@@ -1423,11 +1602,14 @@ EVT ARMTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
/// getRegClassFor - Return the register class that should be used for the
/// specified value type.
-const TargetRegisterClass *ARMTargetLowering::getRegClassFor(MVT VT) const {
+const TargetRegisterClass *
+ARMTargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
+ (void)isDivergent;
// Map v4i64 to QQ registers but do not make the type legal. Similarly map
// v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to
- // load / store 4 to 8 consecutive D registers.
- if (Subtarget->hasNEON()) {
+ // load / store 4 to 8 consecutive NEON D registers, or 2 to 4 consecutive
+ // MVE Q registers.
+ if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) {
if (VT == MVT::v4i64)
return &ARM::QQPRRegClass;
if (VT == MVT::v8i64)
@@ -1590,8 +1772,6 @@ static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
// Calling Convention Implementation
//===----------------------------------------------------------------------===//
-#include "ARMGenCallingConv.inc"
-
/// getEffectiveCallingConv - Get the effective calling convention, taking into
/// account presence of floating point hardware and calling convention
/// limitations, such as support for variadic functions.
@@ -1613,7 +1793,7 @@ ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC,
case CallingConv::C:
if (!Subtarget->isAAPCS_ABI())
return CallingConv::ARM_APCS;
- else if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() &&
+ else if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() &&
getTargetMachine().Options.FloatABIType == FloatABI::Hard &&
!isVarArg)
return CallingConv::ARM_AAPCS_VFP;
@@ -1622,10 +1802,11 @@ ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC,
case CallingConv::Fast:
case CallingConv::CXX_FAST_TLS:
if (!Subtarget->isAAPCS_ABI()) {
- if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && !isVarArg)
+ if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() && !isVarArg)
return CallingConv::Fast;
return CallingConv::ARM_APCS;
- } else if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && !isVarArg)
+ } else if (Subtarget->hasVFP2Base() &&
+ !Subtarget->isThumb1Only() && !isVarArg)
return CallingConv::ARM_AAPCS_VFP;
else
return CallingConv::ARM_AAPCS;
@@ -1807,29 +1988,42 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
bool isVarArg = CLI.IsVarArg;
MachineFunction &MF = DAG.getMachineFunction();
- bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
- bool isThisReturn = false;
- bool isSibCall = false;
+ bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
+ bool isThisReturn = false;
auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls");
+ bool PreferIndirect = false;
// Disable tail calls if they're not supported.
if (!Subtarget->supportsTailCall() || Attr.getValueAsString() == "true")
isTailCall = false;
+ if (isa<GlobalAddressSDNode>(Callee)) {
+ // If we're optimizing for minimum size and the function is called three or
+ // more times in this block, we can improve codesize by calling indirectly
+ // as BLXr has a 16-bit encoding.
+ auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
+ if (CLI.CS) {
+ auto *BB = CLI.CS.getParent();
+ PreferIndirect = Subtarget->isThumb() && Subtarget->hasMinSize() &&
+ count_if(GV->users(), [&BB](const User *U) {
+ return isa<Instruction>(U) &&
+ cast<Instruction>(U)->getParent() == BB;
+ }) > 2;
+ }
+ }
if (isTailCall) {
// Check if it's really possible to do a tail call.
- isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
- isVarArg, isStructRet, MF.getFunction().hasStructRetAttr(),
- Outs, OutVals, Ins, DAG);
+ isTailCall = IsEligibleForTailCallOptimization(
+ Callee, CallConv, isVarArg, isStructRet,
+ MF.getFunction().hasStructRetAttr(), Outs, OutVals, Ins, DAG,
+ PreferIndirect);
if (!isTailCall && CLI.CS && CLI.CS.isMustTailCall())
report_fatal_error("failed to perform tail call elimination on a call "
"site marked musttail");
// We don't support GuaranteedTailCallOpt for ARM, only automatically
// detected sibcalls.
- if (isTailCall) {
+ if (isTailCall)
++NumTailCalls;
- isSibCall = true;
- }
}
// Analyze operands of the call, assigning locations to each operand.
@@ -1841,14 +2035,14 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = CCInfo.getNextStackOffset();
- // For tail calls, memory operands are available in our caller's stack.
- if (isSibCall)
+ if (isTailCall) {
+ // For tail calls, memory operands are available in our caller's stack.
NumBytes = 0;
-
- // Adjust the stack pointer for the new arguments...
- // These operations are automatically eliminated by the prolog/epilog pass
- if (!isSibCall)
+ } else {
+ // Adjust the stack pointer for the new arguments...
+ // These operations are automatically eliminated by the prolog/epilog pass
Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
+ }
SDValue StackPtr =
DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout()));
@@ -1970,7 +2164,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs,
Ops));
}
- } else if (!isSibCall) {
+ } else if (!isTailCall) {
assert(VA.isMemLoc());
MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
@@ -1984,32 +2178,10 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Build a sequence of copy-to-reg nodes chained together with token chain
// and flag operands which copy the outgoing args into the appropriate regs.
SDValue InFlag;
- // Tail call byval lowering might overwrite argument registers so in case of
- // tail call optimization the copies to registers are lowered later.
- if (!isTailCall)
- for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
- Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
- RegsToPass[i].second, InFlag);
- InFlag = Chain.getValue(1);
- }
-
- // For tail calls lower the arguments to the 'real' stack slot.
- if (isTailCall) {
- // Force all the incoming stack arguments to be loaded from the stack
- // before any new outgoing arguments are stored to the stack, because the
- // outgoing stack slots may alias the incoming argument stack slots, and
- // the alias isn't otherwise explicit. This is slightly more conservative
- // than necessary, because it means that each store effectively depends
- // on every argument instead of just those arguments it would clobber.
-
- // Do not flag preceding copytoreg stuff together with the following stuff.
- InFlag = SDValue();
- for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
- Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
- RegsToPass[i].second, InFlag);
- InFlag = Chain.getValue(1);
- }
- InFlag = SDValue();
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+ Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+ RegsToPass[i].second, InFlag);
+ InFlag = Chain.getValue(1);
}
// If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
@@ -2064,17 +2236,6 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
}
} else if (isa<GlobalAddressSDNode>(Callee)) {
- // If we're optimizing for minimum size and the function is called three or
- // more times in this block, we can improve codesize by calling indirectly
- // as BLXr has a 16-bit encoding.
- auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
- auto *BB = CLI.CS.getParent();
- bool PreferIndirect =
- Subtarget->isThumb() && MF.getFunction().optForMinSize() &&
- count_if(GV->users(), [&BB](const User *U) {
- return isa<Instruction>(U) && cast<Instruction>(U)->getParent() == BB;
- }) > 2;
-
if (!PreferIndirect) {
isDirect = true;
bool isDef = GV->isStrongDefinitionForLinker();
@@ -2098,7 +2259,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
unsigned TargetFlags = GV->hasDLLImportStorageClass()
? ARMII::MO_DLLIMPORT
: ARMII::MO_NO_FLAG;
- Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, /*Offset=*/0,
+ Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, /*offset=*/0,
TargetFlags);
if (GV->hasDLLImportStorageClass())
Callee =
@@ -2142,7 +2303,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
CallOpc = ARMISD::CALL_NOLINK;
else if (doesNotRet && isDirect && Subtarget->hasRetAddrStack() &&
// Emit regular call when code size is the priority
- !MF.getFunction().optForMinSize())
+ !Subtarget->hasMinSize())
// "mov lr, pc; b _foo" to avoid confusing the RSP
CallOpc = ARMISD::CALL_NOLINK;
else
@@ -2306,28 +2467,25 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
/// IsEligibleForTailCallOptimization - Check whether the call is eligible
/// for tail call optimization. Targets which want to do tail call
/// optimization should implement this function.
-bool
-ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
- CallingConv::ID CalleeCC,
- bool isVarArg,
- bool isCalleeStructRet,
- bool isCallerStructRet,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- SelectionDAG& DAG) const {
+bool ARMTargetLowering::IsEligibleForTailCallOptimization(
+ SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
+ bool isCalleeStructRet, bool isCallerStructRet,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG,
+ const bool isIndirect) const {
MachineFunction &MF = DAG.getMachineFunction();
const Function &CallerF = MF.getFunction();
CallingConv::ID CallerCC = CallerF.getCallingConv();
assert(Subtarget->supportsTailCall());
- // Tail calls to function pointers cannot be optimized for Thumb1 if the args
+ // Indirect tail calls cannot be optimized for Thumb1 if the args
// to the call take up r0-r3. The reason is that there are no legal registers
// left to hold the pointer to the function to be called.
if (Subtarget->isThumb1Only() && Outs.size() >= 4 &&
- !isa<GlobalAddressSDNode>(Callee.getNode()))
- return false;
+ (!isa<GlobalAddressSDNode>(Callee.getNode()) || isIndirect))
+ return false;
// Look for obvious safe cases to perform tail call optimization that do not
// require ABI changes. This is what gcc calls sibcall.
@@ -2756,7 +2914,7 @@ SDValue ARMTargetLowering::LowerConstantPool(SDValue Op,
auto M = const_cast<Module*>(DAG.getMachineFunction().
getFunction().getParent());
auto GV = new GlobalVariable(
- *M, T, /*isConst=*/true, GlobalVariable::InternalLinkage, C,
+ *M, T, /*isConstant=*/true, GlobalVariable::InternalLinkage, C,
Twine(DAG.getDataLayout().getPrivateGlobalPrefix()) + "CP" +
Twine(DAG.getMachineFunction().getFunctionNumber()) + "_" +
Twine(AFI->createPICLabelUId())
@@ -3225,7 +3383,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
} else if (Subtarget->isRWPI() && !IsRO) {
// SB-relative.
SDValue RelAddr;
- if (Subtarget->useMovt(DAG.getMachineFunction())) {
+ if (Subtarget->useMovt()) {
++NumMovwMovt;
SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_SBREL);
RelAddr = DAG.getNode(ARMISD::Wrapper, dl, PtrVT, G);
@@ -3245,7 +3403,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
// If we have T2 ops, we can materialize the address directly via movt/movw
// pair. This is always cheaper.
- if (Subtarget->useMovt(DAG.getMachineFunction())) {
+ if (Subtarget->useMovt()) {
++NumMovwMovt;
// FIXME: Once remat is capable of dealing with instructions with register
// operands, expand this into two nodes.
@@ -3268,7 +3426,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
SDLoc dl(Op);
const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
- if (Subtarget->useMovt(DAG.getMachineFunction()))
+ if (Subtarget->useMovt())
++NumMovwMovt;
// FIXME: Once remat is capable of dealing with instructions with register
@@ -3288,7 +3446,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op,
SelectionDAG &DAG) const {
assert(Subtarget->isTargetWindows() && "non-Windows COFF is not supported");
- assert(Subtarget->useMovt(DAG.getMachineFunction()) &&
+ assert(Subtarget->useMovt() &&
"Windows on ARM expects to use movw/movt");
assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
"ROPI/RWPI not currently supported for Windows");
@@ -3309,7 +3467,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op,
// FIXME: Once remat is capable of dealing with instructions with register
// operands, expand this into two nodes.
Result = DAG.getNode(ARMISD::Wrapper, DL, PtrVT,
- DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*Offset=*/0,
+ DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*offset=*/0,
TargetFlags));
if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))
Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
@@ -3615,7 +3773,8 @@ void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
// argument passed via stack.
int FrameIndex = StoreByValRegs(CCInfo, DAG, dl, Chain, nullptr,
CCInfo.getInRegsParamsCount(),
- CCInfo.getNextStackOffset(), 4);
+ CCInfo.getNextStackOffset(),
+ std::max(4U, TotalArgRegsSaveSize));
AFI->setVarArgsFrameIndex(FrameIndex);
}
@@ -3891,6 +4050,22 @@ SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
}
ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
+
+ // If the RHS is a constant zero then the V (overflow) flag will never be
+ // set. This can allow us to simplify GE to PL or LT to MI, which can be
+ // simpler for other passes (like the peephole optimiser) to deal with.
+ if (isNullConstant(RHS)) {
+ switch (CondCode) {
+ default: break;
+ case ARMCC::GE:
+ CondCode = ARMCC::PL;
+ break;
+ case ARMCC::LT:
+ CondCode = ARMCC::MI;
+ break;
+ }
+ }
+
ARMISD::NodeType CompareType;
switch (CondCode) {
default:
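
A quick standalone check (not from the patch) of why the zero-RHS simplification above is sound: for CMP x, #0 the overflow flag V is always clear, so GE (N == V) degenerates to PL (N clear) and LT (N != V) to MI (N set). The flag model below is a simplification written only for this note:

#include <cassert>
#include <cstdint>

struct Flags { bool N, Z, C, V; };

// Model of the NZCV flags produced by "CMP X, #0" (i.e. X - 0); written only
// to demonstrate the equivalence, not taken from the compiler sources.
static Flags cmpWithZero(int32_t X) {
  Flags F;
  F.N = X < 0;   // the result of X - 0 is X itself
  F.Z = X == 0;
  F.C = true;    // subtracting zero never borrows
  F.V = false;   // X - 0 can never overflow, so V is always clear
  return F;
}

int main() {
  for (int64_t X = INT32_MIN; X <= INT32_MAX; X += 65537) {
    Flags F = cmpWithZero(static_cast<int32_t>(X));
    assert((F.N == F.V) == !F.N); // GE is equivalent to PL
    assert((F.N != F.V) == F.N);  // LT is equivalent to MI
  }
  return 0;
}
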
@@ -3910,7 +4085,7 @@ SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS,
SelectionDAG &DAG, const SDLoc &dl,
bool InvalidOnQNaN) const {
- assert(!Subtarget->isFPOnlySP() || RHS.getValueType() != MVT::f64);
+ assert(Subtarget->hasFP64() || RHS.getValueType() != MVT::f64);
SDValue Cmp;
SDValue C = DAG.getConstant(InvalidOnQNaN, dl, MVT::i32);
if (!isFloatingPointZero(RHS))
@@ -4175,18 +4350,18 @@ static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
// Start by selecting the GE condition code for opcodes that return true for
// 'equality'
if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE ||
- CC == ISD::SETULE)
+ CC == ISD::SETULE || CC == ISD::SETGE || CC == ISD::SETLE)
CondCode = ARMCC::GE;
// and GT for opcodes that return false for 'equality'.
else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT ||
- CC == ISD::SETULT)
+ CC == ISD::SETULT || CC == ISD::SETGT || CC == ISD::SETLT)
CondCode = ARMCC::GT;
// Since we are constrained to GE/GT, if the opcode contains 'less', we need
// to swap the compare operands.
if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT ||
- CC == ISD::SETULT)
+ CC == ISD::SETULT || CC == ISD::SETLE || CC == ISD::SETLT)
swpCmpOps = true;
// Both GT and GE are ordered comparisons, and return false for 'unordered'.
@@ -4212,8 +4387,9 @@ static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
}
// 'unordered or not equal' is 'anything but equal', so use the EQ condition
- // code and swap the VSEL operands.
- if (CC == ISD::SETUNE) {
+ // code and swap the VSEL operands. Also do this if we don't care about the
+ // unordered case.
+ if (CC == ISD::SETUNE || CC == ISD::SETNE) {
CondCode = ARMCC::EQ;
swpVselOps = true;
}
@@ -4222,7 +4398,7 @@ static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
SDValue ARMTargetLowering::getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal,
SDValue TrueVal, SDValue ARMcc, SDValue CCR,
SDValue Cmp, SelectionDAG &DAG) const {
- if (Subtarget->isFPOnlySP() && VT == MVT::f64) {
+ if (!Subtarget->hasFP64() && VT == MVT::f64) {
FalseVal = DAG.getNode(ARMISD::VMOVRRD, dl,
DAG.getVTList(MVT::i32, MVT::i32), FalseVal);
TrueVal = DAG.getNode(ARMISD::VMOVRRD, dl,
@@ -4428,6 +4604,16 @@ static bool isLowerSaturatingConditional(const SDValue &Op, SDValue &V,
return false;
}
+bool ARMTargetLowering::isUnsupportedFloatingType(EVT VT) const {
+ if (VT == MVT::f32)
+ return !Subtarget->hasVFP2Base();
+ if (VT == MVT::f64)
+ return !Subtarget->hasFP64();
+ if (VT == MVT::f16)
+ return !Subtarget->hasFullFP16();
+ return false;
+}
+
SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
SDLoc dl(Op);
@@ -4471,9 +4657,9 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
SDValue TrueVal = Op.getOperand(2);
SDValue FalseVal = Op.getOperand(3);
- if (Subtarget->isFPOnlySP() && LHS.getValueType() == MVT::f64) {
- DAG.getTargetLoweringInfo().softenSetCCOperands(DAG, MVT::f64, LHS, RHS, CC,
- dl);
+ if (isUnsupportedFloatingType(LHS.getValueType())) {
+ DAG.getTargetLoweringInfo().softenSetCCOperands(
+ DAG, LHS.getValueType(), LHS, RHS, CC, dl);
// If softenSetCCOperands only returned one value, we should compare it to
// zero.
@@ -4494,8 +4680,9 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
// inverting the compare condition, swapping 'less' and 'greater') and
// sometimes need to swap the operands to the VSEL (which inverts the
// condition in the sense of firing whenever the previous condition didn't)
- if (Subtarget->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 ||
- TrueVal.getValueType() == MVT::f64)) {
+ if (Subtarget->hasFPARMv8Base() && (TrueVal.getValueType() == MVT::f16 ||
+ TrueVal.getValueType() == MVT::f32 ||
+ TrueVal.getValueType() == MVT::f64)) {
ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
if (CondCode == ARMCC::LT || CondCode == ARMCC::LE ||
CondCode == ARMCC::VC || CondCode == ARMCC::NE) {
@@ -4507,6 +4694,9 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
SDValue ARMcc;
SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
+ // Choose GE over PL, which vsel does not support
+ if (cast<ConstantSDNode>(ARMcc)->getZExtValue() == ARMCC::PL)
+ ARMcc = DAG.getConstant(ARMCC::GE, dl, MVT::i32);
return getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
}
@@ -4514,12 +4704,15 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
bool InvalidOnQNaN;
FPCCToARMCC(CC, CondCode, CondCode2, InvalidOnQNaN);
- // Normalize the fp compare. If RHS is zero we keep it there so we match
- // CMPFPw0 instead of CMPFP.
- if (Subtarget->hasFPARMv8() && !isFloatingPointZero(RHS) &&
- (TrueVal.getValueType() == MVT::f16 ||
- TrueVal.getValueType() == MVT::f32 ||
- TrueVal.getValueType() == MVT::f64)) {
+ // Normalize the fp compare. If RHS is zero we prefer to keep it there so we
+ // match CMPFPw0 instead of CMPFP, though we don't do this for f16 because we
+ // must use VSEL (limited condition codes), due to not having conditional f16
+ // moves.
+ if (Subtarget->hasFPARMv8Base() &&
+ !(isFloatingPointZero(RHS) && TrueVal.getValueType() != MVT::f16) &&
+ (TrueVal.getValueType() == MVT::f16 ||
+ TrueVal.getValueType() == MVT::f32 ||
+ TrueVal.getValueType() == MVT::f64)) {
bool swpCmpOps = false;
bool swpVselOps = false;
checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps);
@@ -4708,9 +4901,9 @@ SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
SDValue Dest = Op.getOperand(4);
SDLoc dl(Op);
- if (Subtarget->isFPOnlySP() && LHS.getValueType() == MVT::f64) {
- DAG.getTargetLoweringInfo().softenSetCCOperands(DAG, MVT::f64, LHS, RHS, CC,
- dl);
+ if (isUnsupportedFloatingType(LHS.getValueType())) {
+ DAG.getTargetLoweringInfo().softenSetCCOperands(
+ DAG, LHS.getValueType(), LHS, RHS, CC, dl);
// If softenSetCCOperands only returned one value, we should compare it to
// zero.
@@ -4855,7 +5048,7 @@ SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
if (VT.isVector())
return LowerVectorFP_TO_INT(Op, DAG);
- if (Subtarget->isFPOnlySP() && Op.getOperand(0).getValueType() == MVT::f64) {
+ if (isUnsupportedFloatingType(Op.getOperand(0).getValueType())) {
RTLIB::Libcall LC;
if (Op.getOpcode() == ISD::FP_TO_SINT)
LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(),
@@ -4919,7 +5112,7 @@ SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
if (VT.isVector())
return LowerVectorINT_TO_FP(Op, DAG);
- if (Subtarget->isFPOnlySP() && Op.getValueType() == MVT::f64) {
+ if (isUnsupportedFloatingType(VT)) {
RTLIB::Libcall LC;
if (Op.getOpcode() == ISD::SINT_TO_FP)
LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(),
@@ -4952,7 +5145,7 @@ SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
DAG.getTargetConstant(EncodedVal, dl, MVT::i32));
EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64;
if (VT == MVT::f64)
- Mask = DAG.getNode(ARMISD::VSHL, dl, OpVT,
+ Mask = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,
DAG.getNode(ISD::BITCAST, dl, OpVT, Mask),
DAG.getConstant(32, dl, MVT::i32));
else /*if (VT == MVT::f32)*/
@@ -4960,11 +5153,11 @@ SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
if (SrcVT == MVT::f32) {
Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1);
if (VT == MVT::f64)
- Tmp1 = DAG.getNode(ARMISD::VSHL, dl, OpVT,
+ Tmp1 = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,
DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1),
DAG.getConstant(32, dl, MVT::i32));
} else if (VT == MVT::f32)
- Tmp1 = DAG.getNode(ARMISD::VSHRu, dl, MVT::v1i64,
+ Tmp1 = DAG.getNode(ARMISD::VSHRuIMM, dl, MVT::v1i64,
DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1),
DAG.getConstant(32, dl, MVT::i32));
Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0);
@@ -5469,40 +5662,100 @@ static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,
return Res;
}
+/// getVShiftImm - Check if this is a valid build_vector for the immediate
+/// operand of a vector shift operation, where all the elements of the
+/// build_vector must have the same constant integer value.
+static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
+ // Ignore bit_converts.
+ while (Op.getOpcode() == ISD::BITCAST)
+ Op = Op.getOperand(0);
+ BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
+ APInt SplatBits, SplatUndef;
+ unsigned SplatBitSize;
+ bool HasAnyUndefs;
+ if (!BVN ||
+ !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs,
+ ElementBits) ||
+ SplatBitSize > ElementBits)
+ return false;
+ Cnt = SplatBits.getSExtValue();
+ return true;
+}
+
+/// isVShiftLImm - Check if this is a valid build_vector for the immediate
+/// operand of a vector shift left operation. That value must be in the range:
+/// 0 <= Value < ElementBits for a left shift; or
+/// 0 <= Value <= ElementBits for a long left shift.
+static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
+ assert(VT.isVector() && "vector shift count is not a vector type");
+ int64_t ElementBits = VT.getScalarSizeInBits();
+ if (!getVShiftImm(Op, ElementBits, Cnt))
+ return false;
+ return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
+}
+
+/// isVShiftRImm - Check if this is a valid build_vector for the immediate
+/// operand of a vector shift right operation. For a shift opcode, the value
+/// is positive, but for an intrinsic the count must be negative. The
+/// absolute value must be in the range:
+/// 1 <= |Value| <= ElementBits for a right shift; or
+/// 1 <= |Value| <= ElementBits/2 for a narrow right shift.
+static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
+ int64_t &Cnt) {
+ assert(VT.isVector() && "vector shift count is not a vector type");
+ int64_t ElementBits = VT.getScalarSizeInBits();
+ if (!getVShiftImm(Op, ElementBits, Cnt))
+ return false;
+ if (!isIntrinsic)
+ return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
+ if (Cnt >= -(isNarrow ? ElementBits / 2 : ElementBits) && Cnt <= -1) {
+ Cnt = -Cnt;
+ return true;
+ }
+ return false;
+}
+
static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
const ARMSubtarget *ST) {
EVT VT = N->getValueType(0);
SDLoc dl(N);
+ int64_t Cnt;
if (!VT.isVector())
return SDValue();
- // Lower vector shifts on NEON to use VSHL.
- assert(ST->hasNEON() && "unexpected vector shift");
+ // We essentially have two forms here: shift by an immediate and shift by a
+ // vector register (there is also a shift by a GPR, but that is just handled
+ // with a tablegen pattern). We cannot easily match shift by an immediate in
+ // tablegen so we do that here and generate a VSHLIMM/VSHRsIMM/VSHRuIMM.
+ // For shifting by a vector, we don't have VSHR, only VSHL (which can be
+ // signed or unsigned, and a negative shift indicates a shift right).
+ if (N->getOpcode() == ISD::SHL) {
+ if (isVShiftLImm(N->getOperand(1), VT, false, Cnt))
+ return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0),
+ DAG.getConstant(Cnt, dl, MVT::i32));
+ return DAG.getNode(ARMISD::VSHLu, dl, VT, N->getOperand(0),
+ N->getOperand(1));
+ }
- // Left shifts translate directly to the vshiftu intrinsic.
- if (N->getOpcode() == ISD::SHL)
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(Intrinsic::arm_neon_vshiftu, dl,
- MVT::i32),
- N->getOperand(0), N->getOperand(1));
+ assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) &&
+ "unexpected vector shift opcode");
- assert((N->getOpcode() == ISD::SRA ||
- N->getOpcode() == ISD::SRL) && "unexpected vector shift opcode");
+ if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
+ unsigned VShiftOpc =
+ (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM);
+ return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
+ DAG.getConstant(Cnt, dl, MVT::i32));
+ }
- // NEON uses the same intrinsics for both left and right shifts. For
- // right shifts, the shift amounts are negative, so negate the vector of
- // shift amounts.
+ // For other right shifts we don't have an operation, so use a shift left by
+ // a negative amount.
EVT ShiftVT = N->getOperand(1).getValueType();
- SDValue NegatedCount = DAG.getNode(ISD::SUB, dl, ShiftVT,
- getZeroVector(ShiftVT, DAG, dl),
- N->getOperand(1));
- Intrinsic::ID vshiftInt = (N->getOpcode() == ISD::SRA ?
- Intrinsic::arm_neon_vshifts :
- Intrinsic::arm_neon_vshiftu);
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(vshiftInt, dl, MVT::i32),
- N->getOperand(0), NegatedCount);
+ SDValue NegatedCount = DAG.getNode(
+ ISD::SUB, dl, ShiftVT, getZeroVector(ShiftVT, DAG, dl), N->getOperand(1));
+ unsigned VShiftOpc =
+ (N->getOpcode() == ISD::SRA ? ARMISD::VSHLs : ARMISD::VSHLu);
+ return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), NegatedCount);
}
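
For reference, here is a minimal standalone sketch (not part of the patch) of the immediate ranges that isVShiftLImm and isVShiftRImm above accept, with the build_vector/splat handling stripped out:

#include <cassert>
#include <cstdint>

// Left-shift immediates: 0 <= Cnt < ElementBits, or <= ElementBits for a
// long left shift.
static bool isValidLeftShiftImm(int64_t Cnt, int64_t ElementBits, bool isLong) {
  return Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits;
}

// Right-shift immediates: 1 <= Cnt <= ElementBits, or ElementBits/2 for a
// narrowing right shift.
static bool isValidRightShiftImm(int64_t Cnt, int64_t ElementBits, bool isNarrow) {
  return Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits);
}

int main() {
  assert(isValidLeftShiftImm(7, 8, /*isLong=*/false));    // vshl.i8 #7 is fine
  assert(!isValidLeftShiftImm(8, 8, /*isLong=*/false));   // #8 only for long shifts
  assert(isValidLeftShiftImm(8, 8, /*isLong=*/true));
  assert(isValidRightShiftImm(8, 8, /*isNarrow=*/false)); // right shifts allow the full width
  assert(!isValidRightShiftImm(0, 8, /*isNarrow=*/false));
  assert(!isValidRightShiftImm(5, 8, /*isNarrow=*/true)); // narrowing shifts max out at 4
  return 0;
}
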
static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG,
@@ -5514,15 +5767,59 @@ static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG,
if (VT != MVT::i64)
return SDValue();
- assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
+ assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA ||
+ N->getOpcode() == ISD::SHL) &&
"Unknown shift to lower!");
+ unsigned ShOpc = N->getOpcode();
+ if (ST->hasMVEIntegerOps()) {
+ SDValue ShAmt = N->getOperand(1);
+ unsigned ShPartsOpc = ARMISD::LSLL;
+ ConstantSDNode *Con = dyn_cast<ConstantSDNode>(ShAmt);
+
+ // If the shift amount is greater than 32 then do the default optimisation
+ if (Con && Con->getZExtValue() > 32)
+ return SDValue();
+
+ // Extract the lower 32 bits of the shift amount if it's an i64
+ if (ShAmt->getValueType(0) == MVT::i64)
+ ShAmt = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, ShAmt,
+ DAG.getConstant(0, dl, MVT::i32));
+
+ if (ShOpc == ISD::SRL) {
+ if (!Con)
+ // There is no t2LSRLr instruction so negate and perform an lsll if the
+ // shift amount is in a register, emulating a right shift.
+ ShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
+ DAG.getConstant(0, dl, MVT::i32), ShAmt);
+ else
+ // Else generate an lsrl on the immediate shift amount
+ ShPartsOpc = ARMISD::LSRL;
+ } else if (ShOpc == ISD::SRA)
+ ShPartsOpc = ARMISD::ASRL;
+
+ // Lower 32 bits of the destination/source
+ SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
+ DAG.getConstant(0, dl, MVT::i32));
+ // Upper 32 bits of the destination/source
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
+ DAG.getConstant(1, dl, MVT::i32));
+
+ // Generate the shift operation as computed above
+ Lo = DAG.getNode(ShPartsOpc, dl, DAG.getVTList(MVT::i32, MVT::i32), Lo, Hi,
+ ShAmt);
+ // The upper 32 bits come from the second return value of lsll
+ Hi = SDValue(Lo.getNode(), 1);
+ return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
+ }
+
// We only lower SRA, SRL of 1 here, all others use generic lowering.
- if (!isOneConstant(N->getOperand(1)))
+ if (!isOneConstant(N->getOperand(1)) || N->getOpcode() == ISD::SHL)
return SDValue();
// If we are in thumb mode, we don't have RRX.
- if (ST->isThumb1Only()) return SDValue();
+ if (ST->isThumb1Only())
+ return SDValue();
// Okay, we have a 64-bit SRA or SRL of 1. Lower this to an RRX expr.
SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
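
Conceptually, the MVE path above keeps the i64 value in two 32-bit halves and performs a "long" shift that produces both halves at once. A plain C++ model of the immediate forms (shift amounts limited to 0..32, matching the early-out above; the register form instead negates the amount and reuses the left shift) might look like the following. This is only an illustration, not a specification of the instruction semantics:

#include <cassert>
#include <cstdint>

// Long shift left of an i64 held as two 32-bit halves, for Amt in 0..32.
static void lsll(uint32_t &Lo, uint32_t &Hi, unsigned Amt) {
  uint64_t Full = (uint64_t(Hi) << 32) | Lo;
  Full <<= Amt; // Amt <= 32, so this is well defined on uint64_t
  Lo = uint32_t(Full);
  Hi = uint32_t(Full >> 32);
}

// Long logical shift right, same representation.
static void lsrl(uint32_t &Lo, uint32_t &Hi, unsigned Amt) {
  uint64_t Full = (uint64_t(Hi) << 32) | Lo;
  Full >>= Amt;
  Lo = uint32_t(Full);
  Hi = uint32_t(Full >> 32);
}

int main() {
  uint32_t Lo = 0x80000001u, Hi = 0x00000001u; // 0x0000000180000001
  lsll(Lo, Hi, 4);
  assert(Lo == 0x00000010u && Hi == 0x00000018u);
  lsrl(Lo, Hi, 4);
  assert(Lo == 0x80000001u && Hi == 0x00000001u);
  return 0;
}
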
@@ -5731,7 +6028,7 @@ static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) {
}
/// isNEONModifiedImm - Check if the specified splat value corresponds to a
-/// valid vector constant for a NEON instruction with a "modified immediate"
+/// valid vector constant for a NEON or MVE instruction with a "modified immediate"
/// operand (e.g., VMOV). If so, return the encoded value.
static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
unsigned SplatBitSize, SelectionDAG &DAG,
@@ -5817,6 +6114,10 @@ static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
break;
}
+ // cmode == 0b1101 is not supported for MVE VMVN
+ if (type == MVEVMVNModImm)
+ return SDValue();
+
if ((SplatBits & ~0xffffff) == 0 &&
((SplatBits | SplatUndef) & 0xffff) == 0xffff) {
// Value = 0x00nnffff: Op=x, Cmode=1101.
@@ -5902,12 +6203,12 @@ SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
}
}
- if (!ST->hasVFP3())
+ if (!ST->hasVFP3Base())
return SDValue();
// Use the default (constant pool) lowering for double constants when we have
// an SP-only FPU
- if (IsDouble && Subtarget->isFPOnlySP())
+ if (IsDouble && !Subtarget->hasFP64())
return SDValue();
// Try splatting with a VMOV.f32...
@@ -6383,13 +6684,15 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
if (SplatUndef.isAllOnesValue())
return DAG.getUNDEF(VT);
- if (SplatBitSize <= 64) {
+ if ((ST->hasNEON() && SplatBitSize <= 64) ||
+ (ST->hasMVEIntegerOps() && SplatBitSize <= 32)) {
// Check if an immediate VMOV works.
EVT VmovVT;
SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(),
SplatUndef.getZExtValue(), SplatBitSize,
DAG, dl, VmovVT, VT.is128BitVector(),
VMOVModImm);
+
if (Val.getNode()) {
SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val);
return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
@@ -6397,10 +6700,10 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
// Try an immediate VMVN.
uint64_t NegatedImm = (~SplatBits).getZExtValue();
- Val = isNEONModifiedImm(NegatedImm,
- SplatUndef.getZExtValue(), SplatBitSize,
- DAG, dl, VmovVT, VT.is128BitVector(),
- VMVNModImm);
+ Val = isNEONModifiedImm(
+ NegatedImm, SplatUndef.getZExtValue(), SplatBitSize,
+ DAG, dl, VmovVT, VT.is128BitVector(),
+ ST->hasMVEIntegerOps() ? MVEVMVNModImm : VMVNModImm);
if (Val.getNode()) {
SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val);
return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
@@ -6515,10 +6818,13 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
}
if (VT.getVectorElementType().isFloatingPoint()) {
SmallVector<SDValue, 8> Ops;
+ MVT FVT = VT.getVectorElementType().getSimpleVT();
+ assert(FVT == MVT::f32 || FVT == MVT::f16);
+ MVT IVT = (FVT == MVT::f32) ? MVT::i32 : MVT::i16;
for (unsigned i = 0; i < NumElts; ++i)
- Ops.push_back(DAG.getNode(ISD::BITCAST, dl, MVT::i32,
+ Ops.push_back(DAG.getNode(ISD::BITCAST, dl, IVT,
Op.getOperand(i)));
- EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
+ EVT VecVT = EVT::getVectorVT(*DAG.getContext(), IVT, NumElts);
SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
Val = LowerBUILD_VECTOR(Val, DAG, ST);
if (Val.getNode())
@@ -6544,7 +6850,7 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
return shuffle;
}
- if (VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) {
+ if (ST->hasNEON() && VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) {
// If we haven't found an efficient lowering, try splitting a 128-bit vector
// into two 64-bit vectors; we might discover a better way to lower it.
SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElts);
@@ -6799,6 +7105,38 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
}
+enum ShuffleOpCodes {
+ OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
+ OP_VREV,
+ OP_VDUP0,
+ OP_VDUP1,
+ OP_VDUP2,
+ OP_VDUP3,
+ OP_VEXT1,
+ OP_VEXT2,
+ OP_VEXT3,
+ OP_VUZPL, // VUZP, left result
+ OP_VUZPR, // VUZP, right result
+ OP_VZIPL, // VZIP, left result
+ OP_VZIPR, // VZIP, right result
+ OP_VTRNL, // VTRN, left result
+ OP_VTRNR // VTRN, right result
+};
+
+static bool isLegalMVEShuffleOp(unsigned PFEntry) {
+ unsigned OpNum = (PFEntry >> 26) & 0x0F;
+ switch (OpNum) {
+ case OP_COPY:
+ case OP_VREV:
+ case OP_VDUP0:
+ case OP_VDUP1:
+ case OP_VDUP2:
+ case OP_VDUP3:
+ return true;
+ }
+ return false;
+}
+
/// isShuffleMaskLegal - Targets can use this to indicate that they only
/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
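
For readers unfamiliar with the perfect-shuffle machinery, the 32-bit PerfectShuffleTable entries consulted in isLegalMVEShuffleOp above and in the shuffle lowering below pack four fields, extracted exactly as in the surrounding code. A small hypothetical decoder, for illustration only:

#include <cstdint>
#include <cstdio>

// Field layout as used in this file: bits 31-30 = cost, bits 29-26 = opcode
// (one of the OP_* values), bits 25-13 = LHS entry id, bits 12-0 = RHS entry id.
struct PFEntryFields {
  unsigned Cost, OpNum, LHSID, RHSID;
};

static PFEntryFields decodePFEntry(uint32_t PFEntry) {
  PFEntryFields F;
  F.Cost = PFEntry >> 30;
  F.OpNum = (PFEntry >> 26) & 0x0F;
  F.LHSID = (PFEntry >> 13) & ((1u << 13) - 1);
  F.RHSID = PFEntry & ((1u << 13) - 1);
  return F;
}

int main() {
  // Re-pack and decode a made-up entry just to show the layout round-trips.
  uint32_t Entry = (2u << 30) | (1u /*OP_VREV*/ << 26) | (123u << 13) | 456u;
  PFEntryFields F = decodePFEntry(Entry);
  std::printf("cost=%u op=%u lhs=%u rhs=%u\n", F.Cost, F.OpNum, F.LHSID, F.RHSID);
  return 0;
}
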
@@ -6820,7 +7158,7 @@ bool ARMTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
unsigned Cost = (PFEntry >> 30);
- if (Cost <= 4)
+ if (Cost <= 4 && (Subtarget->hasNEON() || isLegalMVEShuffleOp(PFEntry)))
return true;
}
@@ -6828,15 +7166,22 @@ bool ARMTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
unsigned Imm, WhichResult;
unsigned EltSize = VT.getScalarSizeInBits();
- return (EltSize >= 32 ||
- ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
- isVREVMask(M, VT, 64) ||
- isVREVMask(M, VT, 32) ||
- isVREVMask(M, VT, 16) ||
- isVEXTMask(M, VT, ReverseVEXT, Imm) ||
- isVTBLMask(M, VT) ||
- isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF) ||
- ((VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(M, VT)));
+ if (EltSize >= 32 ||
+ ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
+ isVREVMask(M, VT, 64) ||
+ isVREVMask(M, VT, 32) ||
+ isVREVMask(M, VT, 16))
+ return true;
+ else if (Subtarget->hasNEON() &&
+ (isVEXTMask(M, VT, ReverseVEXT, Imm) ||
+ isVTBLMask(M, VT) ||
+ isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF)))
+ return true;
+ else if (Subtarget->hasNEON() && (VT == MVT::v8i16 || VT == MVT::v16i8) &&
+ isReverseMask(M, VT))
+ return true;
+ else
+ return false;
}
/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
@@ -6848,24 +7193,6 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
- enum {
- OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
- OP_VREV,
- OP_VDUP0,
- OP_VDUP1,
- OP_VDUP2,
- OP_VDUP3,
- OP_VEXT1,
- OP_VEXT2,
- OP_VEXT3,
- OP_VUZPL, // VUZP, left result
- OP_VUZPR, // VUZP, right result
- OP_VZIPL, // VZIP, left result
- OP_VZIPR, // VZIP, right result
- OP_VTRNL, // VTRN, left result
- OP_VTRNR // VTRN, right result
- };
-
if (OpNum == OP_COPY) {
if (LHSID == (1*9+2)*9+3) return LHS;
assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
@@ -6955,7 +7282,8 @@ static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op,
DAG.getConstant(ExtractNum, DL, MVT::i32));
}
-static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
+static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
+ const ARMSubtarget *ST) {
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
SDLoc dl(Op);
@@ -6999,9 +7327,9 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
DAG.getConstant(Lane, dl, MVT::i32));
}
- bool ReverseVEXT;
- unsigned Imm;
- if (isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) {
+ bool ReverseVEXT = false;
+ unsigned Imm = 0;
+ if (ST->hasNEON() && isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) {
if (ReverseVEXT)
std::swap(V1, V2);
return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2,
@@ -7015,7 +7343,7 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
if (isVREVMask(ShuffleMask, VT, 16))
return DAG.getNode(ARMISD::VREV16, dl, VT, V1);
- if (V2->isUndef() && isSingletonVEXTMask(ShuffleMask, VT, Imm)) {
+ if (ST->hasNEON() && V2->isUndef() &&
+ isSingletonVEXTMask(ShuffleMask, VT, Imm)) {
return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1,
DAG.getConstant(Imm, dl, MVT::i32));
}
@@ -7025,14 +7353,16 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
// source operands and with masks corresponding to both results of one of
// these operations, DAG memoization will ensure that a single node is
// used for both shuffles.
- unsigned WhichResult;
- bool isV_UNDEF;
- if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
- ShuffleMask, VT, WhichResult, isV_UNDEF)) {
- if (isV_UNDEF)
- V2 = V1;
- return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2)
- .getValue(WhichResult);
+ unsigned WhichResult = 0;
+ bool isV_UNDEF = false;
+ if (ST->hasNEON()) {
+ if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
+ ShuffleMask, VT, WhichResult, isV_UNDEF)) {
+ if (isV_UNDEF)
+ V2 = V1;
+ return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2)
+ .getValue(WhichResult);
+ }
}
// Also check for these shuffles through CONCAT_VECTORS: we canonicalize
@@ -7050,7 +7380,7 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
// ->
// concat(VZIP(v1, v2):0, :1)
//
- if (V1->getOpcode() == ISD::CONCAT_VECTORS && V2->isUndef()) {
+ if (ST->hasNEON() && V1->getOpcode() == ISD::CONCAT_VECTORS && V2->isUndef()) {
SDValue SubV1 = V1->getOperand(0);
SDValue SubV2 = V1->getOperand(1);
EVT SubVT = SubV1.getValueType();
@@ -7092,8 +7422,18 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
unsigned Cost = (PFEntry >> 30);
- if (Cost <= 4)
- return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
+ if (Cost <= 4) {
+ if (ST->hasNEON())
+ return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
+ else if (isLegalMVEShuffleOp(PFEntry)) {
+ unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
+ unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
+ unsigned PFEntryLHS = PerfectShuffleTable[LHSID];
+ unsigned PFEntryRHS = PerfectShuffleTable[RHSID];
+ if (isLegalMVEShuffleOp(PFEntryLHS) && isLegalMVEShuffleOp(PFEntryRHS))
+ return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
+ }
+ }
}
// Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs.
@@ -7118,22 +7458,50 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
return DAG.getNode(ISD::BITCAST, dl, VT, Val);
}
- if ((VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(ShuffleMask, VT))
+ if (ST->hasNEON() && (VT == MVT::v8i16 || VT == MVT::v16i8) &&
+ isReverseMask(ShuffleMask, VT))
return LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(Op, DAG);
- if (VT == MVT::v8i8)
+ if (ST->hasNEON() && VT == MVT::v8i8)
if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG))
return NewOp;
return SDValue();
}
-static SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
+SDValue ARMTargetLowering::
+LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
// INSERT_VECTOR_ELT is legal only for immediate indexes.
SDValue Lane = Op.getOperand(2);
if (!isa<ConstantSDNode>(Lane))
return SDValue();
+ SDValue Elt = Op.getOperand(1);
+ EVT EltVT = Elt.getValueType();
+ if (getTypeAction(*DAG.getContext(), EltVT) ==
+ TargetLowering::TypePromoteFloat) {
+ // INSERT_VECTOR_ELT doesn't want f16 operands promoting to f32,
+ // but the type system will try to do that if we don't intervene.
+ // Reinterpret any such vector-element insertion as one with the
+ // corresponding integer types.
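+ // For example (a sketch): inserting an f16 element into a v8f16 vector
+ // becomes an i16 insertion into the bitcast v8i16 vector, with the result
+ // bitcast back to v8f16.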
+
+ SDLoc dl(Op);
+
+ EVT IEltVT = MVT::getIntegerVT(EltVT.getScalarSizeInBits());
+ assert(getTypeAction(*DAG.getContext(), IEltVT) !=
+ TargetLowering::TypePromoteFloat);
+
+ SDValue VecIn = Op.getOperand(0);
+ EVT VecVT = VecIn.getValueType();
+ EVT IVecVT = EVT::getVectorVT(*DAG.getContext(), IEltVT,
+ VecVT.getVectorNumElements());
+
+ SDValue IElt = DAG.getNode(ISD::BITCAST, dl, IEltVT, Elt);
+ SDValue IVecIn = DAG.getNode(ISD::BITCAST, dl, IVecVT, VecIn);
+ SDValue IVecOut = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVecVT,
+ IVecIn, IElt, Lane);
+ return DAG.getNode(ISD::BITCAST, dl, VecVT, IVecOut);
+ }
+
return Op;
}
@@ -7809,8 +8177,7 @@ ARMTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
return SDValue();
const auto &ST = static_cast<const ARMSubtarget&>(DAG.getSubtarget());
- const auto &MF = DAG.getMachineFunction();
- const bool MinSize = MF.getFunction().optForMinSize();
+ const bool MinSize = ST.hasMinSize();
const bool HasDivide = ST.isThumb() ? ST.hasDivideInThumbMode()
: ST.hasDivideInARMMode();
@@ -8063,7 +8430,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget);
case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget);
- case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
+ case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG, Subtarget);
case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
@@ -8149,6 +8516,7 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
break;
case ISD::SRL:
case ISD::SRA:
+ case ISD::SHL:
Res = Expand64BitShift(N, DAG, Subtarget);
break;
case ISD::SREM:
@@ -8175,6 +8543,10 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
return;
case ISD::INTRINSIC_WO_CHAIN:
return ReplaceLongIntrinsic(N, Results, DAG);
+ case ISD::ABS:
+ lowerABS(N, Results, DAG);
+ return;
}
if (Res.getNode())
Results.push_back(Res);
@@ -8980,7 +9352,7 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI,
// Load an immediate to varEnd.
unsigned varEnd = MRI.createVirtualRegister(TRC);
- if (Subtarget->useMovt(*MF)) {
+ if (Subtarget->useMovt()) {
unsigned Vtmp = varEnd;
if ((LoopSize & 0xFFFF0000) != 0)
Vtmp = MRI.createVirtualRegister(TRC);
@@ -9003,18 +9375,23 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI,
if (Align == 0)
Align = MF->getDataLayout().getTypeAllocSize(C->getType());
unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
+ MachineMemOperand *CPMMO =
+ MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF),
+ MachineMemOperand::MOLoad, 4, 4);
if (IsThumb)
BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci))
.addReg(varEnd, RegState::Define)
.addConstantPoolIndex(Idx)
- .add(predOps(ARMCC::AL));
+ .add(predOps(ARMCC::AL))
+ .addMemOperand(CPMMO);
else
BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp))
.addReg(varEnd, RegState::Define)
.addConstantPoolIndex(Idx)
.addImm(0)
- .add(predOps(ARMCC::AL));
+ .add(predOps(ARMCC::AL))
+ .addMemOperand(CPMMO);
}
BB->addSuccessor(loopMBB);
@@ -9262,7 +9639,8 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
.add(MI.getOperand(2)) // Rn
.add(MI.getOperand(3)) // PredImm
.add(MI.getOperand(4)) // PredReg
- .add(MI.getOperand(0)); // Rt
+ .add(MI.getOperand(0)) // Rt
+ .cloneMemRefs(MI);
MI.eraseFromParent();
return BB;
}
@@ -10372,6 +10750,22 @@ static SDValue PerformAddeSubeCombine(SDNode *N,
return SDValue();
}
+static SDValue PerformABSCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const ARMSubtarget *Subtarget) {
+ SDValue res;
+ SelectionDAG &DAG = DCI.DAG;
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ if (TLI.isOperationLegal(N->getOpcode(), N->getValueType(0)))
+ return SDValue();
+
+ if (!TLI.expandABS(N, res, DAG))
+ return SDValue();
+
+ return res;
+}
+
/// PerformADDECombine - Target-specific dag combine transform from
/// ARMISD::ADDC, ARMISD::ADDE, and ISD::MUL_LOHI to MLAL or
/// ARMISD::ADDC, ARMISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL
@@ -10419,11 +10813,28 @@ ARMTargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
if (Level == BeforeLegalizeTypes)
return true;
- if (Subtarget->isThumb() && Subtarget->isThumb1Only())
+ if (N->getOpcode() != ISD::SHL)
return true;
- if (N->getOpcode() != ISD::SHL)
+ if (Subtarget->isThumb1Only()) {
+ // Avoid making expensive immediates by commuting shifts. (This logic
+ // only applies to Thumb1 because ARM and Thumb2 immediates can be shifted
+ // for free.)
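+ // For example (a sketch): (shl (add x, 200), 2) is left alone here, since
+ // 200 fits the 8-bit Thumb1 ADD immediate; commuting it to
+ // (add (shl x, 2), 800) would force 800 to be materialized separately.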
+ if (N->getOpcode() != ISD::SHL)
+ return true;
+ SDValue N1 = N->getOperand(0);
+ if (N1->getOpcode() != ISD::ADD && N1->getOpcode() != ISD::AND &&
+ N1->getOpcode() != ISD::OR && N1->getOpcode() != ISD::XOR)
+ return true;
+ if (auto *Const = dyn_cast<ConstantSDNode>(N1->getOperand(1))) {
+ if (Const->getAPIntValue().ult(256))
+ return false;
+ if (N1->getOpcode() == ISD::ADD && Const->getAPIntValue().slt(0) &&
+ Const->getAPIntValue().sgt(-256))
+ return false;
+ }
return true;
+ }
// Turn off commute-with-shift transform after legalization, so it doesn't
// conflict with PerformSHLSimplify. (We could try to detect when
@@ -10432,9 +10843,8 @@ ARMTargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
return false;
}
-bool
-ARMTargetLowering::shouldFoldShiftPairToMask(const SDNode *N,
- CombineLevel Level) const {
+bool ARMTargetLowering::shouldFoldConstantShiftPairToMask(
+ const SDNode *N, CombineLevel Level) const {
if (!Subtarget->isThumb1Only())
return true;
@@ -10444,6 +10854,15 @@ ARMTargetLowering::shouldFoldShiftPairToMask(const SDNode *N,
return false;
}
+bool ARMTargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
+ if (!Subtarget->hasNEON()) {
+ if (Subtarget->isThumb1Only())
+ return VT.getScalarSizeInBits() <= 32;
+ return true;
+ }
+ return VT.isScalarInteger();
+}
+
static SDValue PerformSHLSimplify(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
const ARMSubtarget *ST) {
@@ -10830,7 +11249,7 @@ static SDValue PerformANDCombine(SDNode *N,
APInt SplatBits, SplatUndef;
unsigned SplatBitSize;
bool HasAnyUndefs;
- if (BVN &&
+ if (BVN && Subtarget->hasNEON() &&
BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
if (SplatBitSize <= 64) {
EVT VbicVT;
@@ -11308,7 +11727,7 @@ static SDValue PerformVMOVRRDCombine(SDNode *N,
const ARMSubtarget *Subtarget) {
// vmovrrd(vmovdrr x, y) -> x,y
SDValue InDouble = N->getOperand(0);
- if (InDouble.getOpcode() == ARMISD::VMOVDRR && !Subtarget->isFPOnlySP())
+ if (InDouble.getOpcode() == ARMISD::VMOVDRR && Subtarget->hasFP64())
return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1));
// vmovrrd(load f64) -> (load i32), (load i32)
@@ -11329,9 +11748,11 @@ static SDValue PerformVMOVRRDCombine(SDNode *N,
SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
DAG.getConstant(4, DL, MVT::i32));
- SDValue NewLD2 = DAG.getLoad(
- MVT::i32, DL, NewLD1.getValue(1), OffsetPtr, LD->getPointerInfo(),
- std::min(4U, LD->getAlignment() / 2), LD->getMemOperand()->getFlags());
+
+ SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, LD->getChain(), OffsetPtr,
+ LD->getPointerInfo().getWithOffset(4),
+ std::min(4U, LD->getAlignment()),
+ LD->getMemOperand()->getFlags());
DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1));
if (DCI.DAG.getDataLayout().isBigEndian())
@@ -11922,10 +12343,14 @@ static SDValue PerformVDUPLANECombine(SDNode *N,
/// PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP.
static SDValue PerformVDUPCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI) {
+ TargetLowering::DAGCombinerInfo &DCI,
+ const ARMSubtarget *Subtarget) {
SelectionDAG &DAG = DCI.DAG;
SDValue Op = N->getOperand(0);
+ if (!Subtarget->hasNEON())
+ return SDValue();
+
// Match VDUP(LOAD) -> VLD1DUP.
// We match this pattern here rather than waiting for isel because the
// transform is only legal for unindexed loads.
@@ -12132,11 +12557,11 @@ static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG,
MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
uint32_t IntBits = IntTy.getSizeInBits();
unsigned NumLanes = Op.getValueType().getVectorNumElements();
- if (FloatBits != 32 || IntBits > 32 || NumLanes > 4) {
+ if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {
// These instructions only exist converting from f32 to i32. We can handle
// smaller integers by generating an extra truncate, but larger ones would
- // be lossy. We also can't handle more then 4 lanes, since these intructions
- // only support v2i32/v4i32 types.
+ // be lossy. We also can't handle anything other than 2 or 4 lanes, since
+ // these instructions only support v2i32/v4i32 types.
return SDValue();
}
@@ -12190,11 +12615,11 @@ static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG,
MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
uint32_t IntBits = IntTy.getSizeInBits();
unsigned NumLanes = Op.getValueType().getVectorNumElements();
- if (FloatBits != 32 || IntBits > 32 || NumLanes > 4) {
+ if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {
// These instructions only exist converting from i32 to f32. We can handle
// smaller integers by generating an extra extend, but larger ones would
- // be lossy. We also can't handle more then 4 lanes, since these intructions
- // only support v2i32/v4i32 types.
+ // be lossy. We also can't handle anything other than 2 or 4 lanes, since
+ // these instructions only support v2i32/v4i32 types.
return SDValue();
}
@@ -12220,58 +12645,6 @@ static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG,
ConvInput, DAG.getConstant(C, dl, MVT::i32));
}
-/// Getvshiftimm - Check if this is a valid build_vector for the immediate
-/// operand of a vector shift operation, where all the elements of the
-/// build_vector must have the same constant integer value.
-static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
- // Ignore bit_converts.
- while (Op.getOpcode() == ISD::BITCAST)
- Op = Op.getOperand(0);
- BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
- APInt SplatBits, SplatUndef;
- unsigned SplatBitSize;
- bool HasAnyUndefs;
- if (! BVN || ! BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
- HasAnyUndefs, ElementBits) ||
- SplatBitSize > ElementBits)
- return false;
- Cnt = SplatBits.getSExtValue();
- return true;
-}
-
-/// isVShiftLImm - Check if this is a valid build_vector for the immediate
-/// operand of a vector shift left operation. That value must be in the range:
-/// 0 <= Value < ElementBits for a left shift; or
-/// 0 <= Value <= ElementBits for a long left shift.
-static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
- assert(VT.isVector() && "vector shift count is not a vector type");
- int64_t ElementBits = VT.getScalarSizeInBits();
- if (! getVShiftImm(Op, ElementBits, Cnt))
- return false;
- return (Cnt >= 0 && (isLong ? Cnt-1 : Cnt) < ElementBits);
-}
-
-/// isVShiftRImm - Check if this is a valid build_vector for the immediate
-/// operand of a vector shift right operation. For a shift opcode, the value
-/// is positive, but for an intrinsic the value count must be negative. The
-/// absolute value must be in the range:
-/// 1 <= |Value| <= ElementBits for a right shift; or
-/// 1 <= |Value| <= ElementBits/2 for a narrow right shift.
-static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
- int64_t &Cnt) {
- assert(VT.isVector() && "vector shift count is not a vector type");
- int64_t ElementBits = VT.getScalarSizeInBits();
- if (! getVShiftImm(Op, ElementBits, Cnt))
- return false;
- if (!isIntrinsic)
- return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits/2 : ElementBits));
- if (Cnt >= -(isNarrow ? ElementBits/2 : ElementBits) && Cnt <= -1) {
- Cnt = -Cnt;
- return true;
- }
- return false;
-}
-
/// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
@@ -12307,12 +12680,12 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
case Intrinsic::arm_neon_vshifts:
case Intrinsic::arm_neon_vshiftu:
if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) {
- VShiftOpc = ARMISD::VSHL;
+ VShiftOpc = ARMISD::VSHLIMM;
break;
}
if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) {
- VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ?
- ARMISD::VSHRs : ARMISD::VSHRu);
+ VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ? ARMISD::VSHRsIMM
+ : ARMISD::VSHRuIMM);
break;
}
return SDValue();
@@ -12357,29 +12730,41 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
// Opcode already set above.
break;
case Intrinsic::arm_neon_vrshifts:
- VShiftOpc = ARMISD::VRSHRs; break;
+ VShiftOpc = ARMISD::VRSHRsIMM;
+ break;
case Intrinsic::arm_neon_vrshiftu:
- VShiftOpc = ARMISD::VRSHRu; break;
+ VShiftOpc = ARMISD::VRSHRuIMM;
+ break;
case Intrinsic::arm_neon_vrshiftn:
- VShiftOpc = ARMISD::VRSHRN; break;
+ VShiftOpc = ARMISD::VRSHRNIMM;
+ break;
case Intrinsic::arm_neon_vqshifts:
- VShiftOpc = ARMISD::VQSHLs; break;
+ VShiftOpc = ARMISD::VQSHLsIMM;
+ break;
case Intrinsic::arm_neon_vqshiftu:
- VShiftOpc = ARMISD::VQSHLu; break;
+ VShiftOpc = ARMISD::VQSHLuIMM;
+ break;
case Intrinsic::arm_neon_vqshiftsu:
- VShiftOpc = ARMISD::VQSHLsu; break;
+ VShiftOpc = ARMISD::VQSHLsuIMM;
+ break;
case Intrinsic::arm_neon_vqshiftns:
- VShiftOpc = ARMISD::VQSHRNs; break;
+ VShiftOpc = ARMISD::VQSHRNsIMM;
+ break;
case Intrinsic::arm_neon_vqshiftnu:
- VShiftOpc = ARMISD::VQSHRNu; break;
+ VShiftOpc = ARMISD::VQSHRNuIMM;
+ break;
case Intrinsic::arm_neon_vqshiftnsu:
- VShiftOpc = ARMISD::VQSHRNsu; break;
+ VShiftOpc = ARMISD::VQSHRNsuIMM;
+ break;
case Intrinsic::arm_neon_vqrshiftns:
- VShiftOpc = ARMISD::VQRSHRNs; break;
+ VShiftOpc = ARMISD::VQRSHRNsIMM;
+ break;
case Intrinsic::arm_neon_vqrshiftnu:
- VShiftOpc = ARMISD::VQRSHRNu; break;
+ VShiftOpc = ARMISD::VQRSHRNuIMM;
+ break;
case Intrinsic::arm_neon_vqrshiftnsu:
- VShiftOpc = ARMISD::VQRSHRNsu; break;
+ VShiftOpc = ARMISD::VQRSHRNsuIMM;
+ break;
}
SDLoc dl(N);
@@ -12393,9 +12778,9 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
unsigned VShiftOpc = 0;
if (isVShiftLImm(N->getOperand(3), VT, false, Cnt))
- VShiftOpc = ARMISD::VSLI;
+ VShiftOpc = ARMISD::VSLIIMM;
else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt))
- VShiftOpc = ARMISD::VSRI;
+ VShiftOpc = ARMISD::VSRIIMM;
else {
llvm_unreachable("invalid shift count for vsli/vsri intrinsic");
}
@@ -12420,8 +12805,10 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
/// combining instead of DAG legalizing because the build_vectors for 64-bit
/// vector element shift counts are generally not legal, and it is hard to see
/// their values after they get legalized to loads from a constant pool.
-static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG,
+static SDValue PerformShiftCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
const ARMSubtarget *ST) {
+ SelectionDAG &DAG = DCI.DAG;
EVT VT = N->getValueType(0);
if (N->getOpcode() == ISD::SRL && VT == MVT::i32 && ST->hasV6Ops()) {
// Canonicalize (srl (bswap x), 16) to (rotr (bswap x), 16) if the high
@@ -12436,12 +12823,47 @@ static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG,
}
}
+ if (ST->isThumb1Only() && N->getOpcode() == ISD::SHL && VT == MVT::i32 &&
+ N->getOperand(0)->getOpcode() == ISD::AND &&
+ N->getOperand(0)->hasOneUse()) {
+ if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
+ return SDValue();
+ // Look for the pattern (shl (and x, AndMask), ShiftAmt). This doesn't
+ // usually show up because instcombine prefers to canonicalize it to
+ // (and (shl x, ShiftAmt) (shl AndMask, ShiftAmt)), but the shift can come
+ // out of GEP lowering in some cases.
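+ // A worked example (sketch): for (shl (and x, 0x3ffff), 2), AndMask has 18
+ // set bits, so MaskedBits = 14 > ShiftAmt = 2 and we emit
+ // (srl (shl x, 14), 12), which equals ((x & 0x3ffff) << 2) without ever
+ // materializing the 0x3ffff mask.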
+ SDValue N0 = N->getOperand(0);
+ ConstantSDNode *ShiftAmtNode = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (!ShiftAmtNode)
+ return SDValue();
+ uint32_t ShiftAmt = static_cast<uint32_t>(ShiftAmtNode->getZExtValue());
+ ConstantSDNode *AndMaskNode = dyn_cast<ConstantSDNode>(N0->getOperand(1));
+ if (!AndMaskNode)
+ return SDValue();
+ uint32_t AndMask = static_cast<uint32_t>(AndMaskNode->getZExtValue());
+ // Don't transform uxtb/uxth.
+ if (AndMask == 255 || AndMask == 65535)
+ return SDValue();
+ if (isMask_32(AndMask)) {
+ uint32_t MaskedBits = countLeadingZeros(AndMask);
+ if (MaskedBits > ShiftAmt) {
+ SDLoc DL(N);
+ SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
+ DAG.getConstant(MaskedBits, DL, MVT::i32));
+ return DAG.getNode(
+ ISD::SRL, DL, MVT::i32, SHL,
+ DAG.getConstant(MaskedBits - ShiftAmt, DL, MVT::i32));
+ }
+ }
+ }
+
// Nothing to be done for scalar shifts.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (!VT.isVector() || !TLI.isTypeLegal(VT))
return SDValue();
+ if (ST->hasMVEIntegerOps() && VT == MVT::v2i64)
+ return SDValue();
- assert(ST->hasNEON() && "unexpected vector shift");
int64_t Cnt;
switch (N->getOpcode()) {
@@ -12450,7 +12872,7 @@ static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG,
case ISD::SHL:
if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) {
SDLoc dl(N);
- return DAG.getNode(ARMISD::VSHL, dl, VT, N->getOperand(0),
+ return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0),
DAG.getConstant(Cnt, dl, MVT::i32));
}
break;
@@ -12458,8 +12880,8 @@ static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG,
case ISD::SRA:
case ISD::SRL:
if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
- unsigned VShiftOpc = (N->getOpcode() == ISD::SRA ?
- ARMISD::VSHRs : ARMISD::VSHRu);
+ unsigned VShiftOpc =
+ (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM);
SDLoc dl(N);
return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
DAG.getConstant(Cnt, dl, MVT::i32));
@@ -12606,6 +13028,45 @@ SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &D
return V;
}
+static SDValue PerformHWLoopCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const ARMSubtarget *ST) {
+ // Look for (brcond (xor test.set.loop.iterations, -1))
+ SDValue CC = N->getOperand(1);
+ unsigned Opc = CC->getOpcode();
+ SDValue Int;
+
+ if ((Opc == ISD::XOR || Opc == ISD::SETCC) &&
+ (CC->getOperand(0)->getOpcode() == ISD::INTRINSIC_W_CHAIN)) {
+
+ assert((isa<ConstantSDNode>(CC->getOperand(1)) &&
+ cast<ConstantSDNode>(CC->getOperand(1))->isOne()) &&
+ "Expected to compare against 1");
+
+ Int = CC->getOperand(0);
+ } else if (CC->getOpcode() == ISD::INTRINSIC_W_CHAIN)
+ Int = CC;
+ else
+ return SDValue();
+
+ unsigned IntOp = cast<ConstantSDNode>(Int.getOperand(1))->getZExtValue();
+ if (IntOp != Intrinsic::test_set_loop_iterations)
+ return SDValue();
+
+ SDLoc dl(Int);
+ SDValue Chain = N->getOperand(0);
+ SDValue Elements = Int.getOperand(2);
+ SDValue ExitBlock = N->getOperand(2);
+
+ // TODO: Once we start supporting tail predication, we can add another
+ // operand to WLS for the number of elements processed in a vector loop.
+
+ SDValue Ops[] = { Chain, Elements, ExitBlock };
+ SDValue Res = DCI.DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
+ DCI.DAG.ReplaceAllUsesOfValueWith(Int.getValue(1), Int.getOperand(0));
+ return Res;
+}
+
/// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
SDValue
ARMTargetLowering::PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const {
@@ -12779,15 +13240,21 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
// On Thumb1, the DAG above may be further combined if z is a power of 2
// (z == 2 ^ K).
// CMOV (SUBS x, y), z, !=, (SUBS x, y):1 ->
- // merge t3, t4
- // where t1 = (SUBCARRY (SUB x, y), z, 0)
- // t2 = (SUBCARRY (SUB x, y), t1:0, t1:1)
- // t3 = if K != 0 then (SHL t2:0, K) else t2:0
- // t4 = (SUB 1, t2:1) [ we want a carry, not a borrow ]
+ // t1 = (USUBO (SUB x, y), 1)
+ // t2 = (SUBCARRY (SUB x, y), t1:0, t1:1)
+ // Result = if K != 0 then (SHL t2:0, K) else t2:0
+ //
+ // This also handles the special case of comparing against zero; it's
+ // essentially, the same pattern, except there's no SUBS:
+ // CMOV x, z, !=, (CMPZ x, 0) ->
+ // t1 = (USUBO x, 1)
+ // t2 = (SUBCARRY x, t1:0, t1:1)
+ // Result = if K != 0 then (SHL t2:0, K) else t2:0
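+ //
+ // A worked example (sketch): for CMOV (SUBS x, y), 4, !=, let d = x - y.
+ // t1 = (USUBO d, 1) produces d - 1 and a borrow that is set only when
+ // d == 0, so t2 = (SUBCARRY d, t1:0, t1:1) = d - (d - 1) - borrow is 1 when
+ // x != y and 0 otherwise; shifting it left by K = 2 yields 4 or 0, matching
+ // the original select (note x - y is 0 in the equal case).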
const APInt *TrueConst;
if (Subtarget->isThumb1Only() && CC == ARMCC::NE &&
- (FalseVal.getOpcode() == ARMISD::SUBS) &&
- (FalseVal.getOperand(0) == LHS) && (FalseVal.getOperand(1) == RHS) &&
+ ((FalseVal.getOpcode() == ARMISD::SUBS &&
+ FalseVal.getOperand(0) == LHS && FalseVal.getOperand(1) == RHS) ||
+ (FalseVal == LHS && isNullConstant(RHS))) &&
(TrueConst = isPowerOf2Constant(TrueVal))) {
SDVTList VTs = DAG.getVTList(VT, MVT::i32);
unsigned ShiftAmount = TrueConst->logBase2();
@@ -12795,10 +13262,6 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
TrueVal = DAG.getConstant(1, dl, VT);
SDValue Subc = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, TrueVal);
Res = DAG.getNode(ISD::SUBCARRY, dl, VTs, FalseVal, Subc, Subc.getValue(1));
- // Make it a carry, not a borrow.
- SDValue Carry = DAG.getNode(
- ISD::SUB, dl, VT, DAG.getConstant(1, dl, MVT::i32), Res.getValue(1));
- Res = DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Res, Carry);
if (ShiftAmount)
Res = DAG.getNode(ISD::SHL, dl, VT, Res,
@@ -12826,6 +13289,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
switch (N->getOpcode()) {
default: break;
+ case ISD::ABS: return PerformABSCombine(N, DCI, Subtarget);
case ARMISD::ADDE: return PerformADDECombine(N, DCI, Subtarget);
case ARMISD::UMLAL: return PerformUMLALCombine(N, DCI.DAG, Subtarget);
case ISD::ADD: return PerformADDCombine(N, DCI, Subtarget);
@@ -12834,6 +13298,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::OR: return PerformORCombine(N, DCI, Subtarget);
case ISD::XOR: return PerformXORCombine(N, DCI, Subtarget);
case ISD::AND: return PerformANDCombine(N, DCI, Subtarget);
+ case ISD::BRCOND: return PerformHWLoopCombine(N, DCI, Subtarget);
case ARMISD::ADDC:
case ARMISD::SUBC: return PerformAddcSubcCombine(N, DCI, Subtarget);
case ARMISD::SUBE: return PerformAddeSubeCombine(N, DCI, Subtarget);
@@ -12845,7 +13310,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI);
case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG);
case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI);
- case ARMISD::VDUP: return PerformVDUPCombine(N, DCI);
+ case ARMISD::VDUP: return PerformVDUPCombine(N, DCI, Subtarget);
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
return PerformVCVTCombine(N, DCI.DAG, Subtarget);
@@ -12854,7 +13319,8 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG);
case ISD::SHL:
case ISD::SRA:
- case ISD::SRL: return PerformShiftCombine(N, DCI.DAG, Subtarget);
+ case ISD::SRL:
+ return PerformShiftCombine(N, DCI, Subtarget);
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND:
case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget);
@@ -12957,9 +13423,9 @@ bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc,
return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE);
}
-bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
- unsigned,
- unsigned,
+bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned,
+ unsigned Alignment,
+ MachineMemOperand::Flags,
bool *Fast) const {
// Depends what it gets converted into if the type is weird.
if (!VT.isSimple())
@@ -12967,23 +13433,18 @@ bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
// The AllowsUnaligned flag models the SCTLR.A setting in ARM CPUs
bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
+ auto Ty = VT.getSimpleVT().SimpleTy;
- switch (VT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- case MVT::i16:
- case MVT::i32: {
+ if (Ty == MVT::i8 || Ty == MVT::i16 || Ty == MVT::i32) {
// Unaligned access can use (for example) LRDB, LRDH, LDR
if (AllowsUnaligned) {
if (Fast)
*Fast = Subtarget->hasV7Ops();
return true;
}
- return false;
}
- case MVT::f64:
- case MVT::v2f64: {
+
+ if (Ty == MVT::f64 || Ty == MVT::v2f64) {
// For any little-endian targets with neon, we can support unaligned ld/st
// of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
// A big-endian target may also explicitly support unaligned accesses
@@ -12992,9 +13453,54 @@ bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
*Fast = true;
return true;
}
- return false;
}
+
+ if (!Subtarget->hasMVEIntegerOps())
+ return false;
+ if (Ty != MVT::v16i8 && Ty != MVT::v8i16 && Ty != MVT::v8f16 &&
+ Ty != MVT::v4i32 && Ty != MVT::v4f32 && Ty != MVT::v2i64 &&
+ Ty != MVT::v2f64 &&
+ // These are for truncated stores
+ Ty != MVT::v4i8 && Ty != MVT::v8i8 && Ty != MVT::v4i16)
+ return false;
+
+ if (Subtarget->isLittle()) {
+ // In little-endian MVE, the store instructions VSTRB.U8,
+ // VSTRH.U16 and VSTRW.U32 all store the vector register in
+ // exactly the same format, and differ only in the range of
+ // their immediate offset field and the required alignment.
+ //
+ // In particular, VSTRB.U8 can store a vector at byte alignment.
+ // So at this stage we can simply say that loads/stores of all
+ // 128-bit wide vector types are permitted at any alignment,
+ // because we know at least _one_ instruction can manage that.
+ //
+ // Later on we might find that some of those loads are better
+ // generated as VLDRW.U32 if alignment permits, to take
+ // advantage of the larger immediate range. But for the moment,
+ // all that matters is that if we don't lower the load then
+ // _some_ instruction can handle it.
+ if (Fast)
+ *Fast = true;
+ return true;
+ } else {
+ // In big-endian MVE, those instructions aren't so similar
+ // after all, because they reorder the bytes of the vector
+ // differently. So this time we can only store a particular
+ // kind of vector if its alignment is at least the element
+ // type. And we can't store vectors of i64 or f64 at all
+ // without having to do some postprocessing, because there's
+ // no VSTRD.U64.
+ if (Ty == MVT::v16i8 ||
+ ((Ty == MVT::v8i16 || Ty == MVT::v8f16) && Alignment >= 2) ||
+ ((Ty == MVT::v4i32 || Ty == MVT::v4f32) && Alignment >= 4)) {
+ if (Fast)
+ *Fast = true;
+ return true;
+ }
}
+
+ return false;
}
static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
@@ -13003,24 +13509,24 @@ static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
(DstAlign == 0 || DstAlign % AlignCheck == 0));
}
-EVT ARMTargetLowering::getOptimalMemOpType(uint64_t Size,
- unsigned DstAlign, unsigned SrcAlign,
- bool IsMemset, bool ZeroMemset,
- bool MemcpyStrSrc,
- MachineFunction &MF) const {
- const Function &F = MF.getFunction();
-
+EVT ARMTargetLowering::getOptimalMemOpType(
+ uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
+ bool ZeroMemset, bool MemcpyStrSrc,
+ const AttributeList &FuncAttributes) const {
// See if we can use NEON instructions for this...
if ((!IsMemset || ZeroMemset) && Subtarget->hasNEON() &&
- !F.hasFnAttribute(Attribute::NoImplicitFloat)) {
+ !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) {
bool Fast;
if (Size >= 16 &&
(memOpAlign(SrcAlign, DstAlign, 16) ||
- (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, 1, &Fast) && Fast))) {
+ (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, 1,
+ MachineMemOperand::MONone, &Fast) &&
+ Fast))) {
return MVT::v2f64;
} else if (Size >= 8 &&
(memOpAlign(SrcAlign, DstAlign, 8) ||
- (allowsMisalignedMemoryAccesses(MVT::f64, 0, 1, &Fast) &&
+ (allowsMisalignedMemoryAccesses(
+ MVT::f64, 0, 1, MachineMemOperand::MONone, &Fast) &&
Fast))) {
return MVT::f64;
}
@@ -13089,6 +13595,46 @@ bool ARMTargetLowering::isFNegFree(EVT VT) const {
return false;
}
+/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
+/// of the vector elements.
+static bool areExtractExts(Value *Ext1, Value *Ext2) {
+ auto areExtDoubled = [](Instruction *Ext) {
+ return Ext->getType()->getScalarSizeInBits() ==
+ 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
+ };
+
+ if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
+ !match(Ext2, m_ZExtOrSExt(m_Value())) ||
+ !areExtDoubled(cast<Instruction>(Ext1)) ||
+ !areExtDoubled(cast<Instruction>(Ext2)))
+ return false;
+
+ return true;
+}
+
+/// Check if sinking \p I's operands to I's basic block is profitable, because
+/// the operands can be folded into a target instruction, e.g.
+/// sext/zext can be folded into vsubl.
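+///
+/// For example (a sketch): given IR such as
+///   %xw = sext <8 x i8> %x to <8 x i16>
+///   %yw = sext <8 x i8> %y to <8 x i16>
+///   %d  = sub <8 x i16> %xw, %yw
+/// where the extends live in another block, sinking them next to the sub lets
+/// instruction selection form a single widening NEON subtract (vsubl.s8)
+/// instead of separate extends and a plain subtract.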
+bool ARMTargetLowering::shouldSinkOperands(Instruction *I,
+ SmallVectorImpl<Use *> &Ops) const {
+ if (!Subtarget->hasNEON() || !I->getType()->isVectorTy())
+ return false;
+
+ switch (I->getOpcode()) {
+ case Instruction::Sub:
+ case Instruction::Add: {
+ if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
+ return false;
+ Ops.push_back(&I->getOperandUse(0));
+ Ops.push_back(&I->getOperandUse(1));
+ return true;
+ }
+ default:
+ return false;
+ }
+ return false;
+}
+
bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
EVT VT = ExtVal.getValueType();
@@ -13105,7 +13651,7 @@ bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
SDNode *U = *ExtVal->use_begin();
if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB ||
- U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHL))
+ U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHLIMM))
return false;
return true;
@@ -13142,7 +13688,6 @@ static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
unsigned Scale = 1;
switch (VT.getSimpleVT().SimpleTy) {
- default: return false;
case MVT::i1:
case MVT::i8:
// Scale == 1;
@@ -13151,7 +13696,8 @@ static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
// Scale == 2;
Scale = 2;
break;
- case MVT::i32:
+ default:
+ // On Thumb1 we load most things (i32, i64, floats, etc.) with an LDR
// Scale == 4;
Scale = 4;
break;
@@ -13159,38 +13705,58 @@ static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
if ((V & (Scale - 1)) != 0)
return false;
- V /= Scale;
- return V == (V & ((1LL << 5) - 1));
+ return isUInt<5>(V / Scale);
}
static bool isLegalT2AddressImmediate(int64_t V, EVT VT,
const ARMSubtarget *Subtarget) {
- bool isNeg = false;
+ if (!VT.isInteger() && !VT.isFloatingPoint())
+ return false;
+ if (VT.isVector() && Subtarget->hasNEON())
+ return false;
+ if (VT.isVector() && VT.isFloatingPoint() && Subtarget->hasMVEIntegerOps() &&
+ !Subtarget->hasMVEFloatOps())
+ return false;
+
+ bool IsNeg = false;
if (V < 0) {
- isNeg = true;
- V = - V;
+ IsNeg = true;
+ V = -V;
}
- switch (VT.getSimpleVT().SimpleTy) {
- default: return false;
- case MVT::i1:
- case MVT::i8:
- case MVT::i16:
- case MVT::i32:
- // + imm12 or - imm8
- if (isNeg)
- return V == (V & ((1LL << 8) - 1));
- return V == (V & ((1LL << 12) - 1));
- case MVT::f32:
- case MVT::f64:
- // Same as ARM mode. FIXME: NEON?
- if (!Subtarget->hasVFP2())
- return false;
- if ((V & 3) != 0)
+ unsigned NumBytes = std::max(VT.getSizeInBits() / 8, 1U);
+
+ // MVE: size * imm7
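+ // (A sketch of the resulting ranges: for i32/f32 elements the offset must
+ // be a multiple of 4 in [0, 508]; for i16/f16 a multiple of 2 in [0, 254];
+ // for i8 any value in [0, 127]. The sign was already stripped above.)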
+ if (VT.isVector() && Subtarget->hasMVEIntegerOps()) {
+ switch (VT.getSimpleVT().getVectorElementType().SimpleTy) {
+ case MVT::i32:
+ case MVT::f32:
+ return isShiftedUInt<7,2>(V);
+ case MVT::i16:
+ case MVT::f16:
+ return isShiftedUInt<7,1>(V);
+ case MVT::i8:
+ return isUInt<7>(V);
+ default:
return false;
- V >>= 2;
- return V == (V & ((1LL << 8) - 1));
+ }
}
+
+ // half VLDR: 2 * imm8
+ if (VT.isFloatingPoint() && NumBytes == 2 && Subtarget->hasFPRegs16())
+ return isShiftedUInt<8, 1>(V);
+ // VLDR and LDRD: 4 * imm8
+ if ((VT.isFloatingPoint() && Subtarget->hasVFP2Base()) || NumBytes == 8)
+ return isShiftedUInt<8, 2>(V);
+
+ if (NumBytes == 1 || NumBytes == 2 || NumBytes == 4) {
+ // + imm12 or - imm8
+ if (IsNeg)
+ return isUInt<8>(V);
+ return isUInt<12>(V);
+ }
+
+ return false;
}
/// isLegalAddressImmediate - Return true if the integer value can be used
@@ -13218,18 +13784,15 @@ static bool isLegalAddressImmediate(int64_t V, EVT VT,
case MVT::i8:
case MVT::i32:
// +- imm12
- return V == (V & ((1LL << 12) - 1));
+ return isUInt<12>(V);
case MVT::i16:
// +- imm8
- return V == (V & ((1LL << 8) - 1));
+ return isUInt<8>(V);
case MVT::f32:
case MVT::f64:
- if (!Subtarget->hasVFP2()) // FIXME: NEON?
+ if (!Subtarget->hasVFP2Base()) // FIXME: NEON?
return false;
- if ((V & 3) != 0)
- return false;
- V >>= 2;
- return V == (V & ((1LL << 8) - 1));
+ return isShiftedUInt<8, 2>(V);
}
}
@@ -13649,13 +14212,13 @@ void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
EVT VT = Op.getValueType();
const unsigned DstSz = VT.getScalarSizeInBits();
const unsigned SrcSz = VecVT.getVectorElementType().getSizeInBits();
+ (void)SrcSz;
assert(SrcSz == Known.getBitWidth());
assert(DstSz > SrcSz);
if (Op.getOpcode() == ARMISD::VGETLANEs)
Known = Known.sext(DstSz);
else {
- Known = Known.zext(DstSz);
- Known.Zero.setBitsFrom(SrcSz);
+ Known = Known.zext(DstSz, true /* extended bits are known zero */);
}
assert(DstSz == Known.getBitWidth());
break;
@@ -13790,7 +14353,7 @@ const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const {
// Although we are correct (we are free to emit anything, without
// constraints), we might break use cases that would expect us to be more
// efficient and emit something else.
- if (!Subtarget->hasVFP2())
+ if (!Subtarget->hasVFP2Base())
return "r";
if (ConstraintVT.isFloatingPoint())
return "w";
@@ -13822,6 +14385,7 @@ ARMTargetLowering::getConstraintType(StringRef Constraint) const {
} else if (Constraint.size() == 2) {
switch (Constraint[0]) {
default: break;
+ case 'T': return C_RegisterClass;
// All 'U+' constraints are addresses.
case 'U': return C_Memory;
}
@@ -13867,7 +14431,8 @@ using RCPair = std::pair<unsigned, const TargetRegisterClass *>;
RCPair ARMTargetLowering::getRegForInlineAsmConstraint(
const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
- if (Constraint.size() == 1) {
+ switch (Constraint.size()) {
+ case 1:
// GCC ARM Constraint Letters
switch (Constraint[0]) {
case 'l': // Low regs or general regs.
@@ -13913,7 +14478,25 @@ RCPair ARMTargetLowering::getRegForInlineAsmConstraint(
return RCPair(0U, &ARM::QPR_VFP2RegClass);
break;
}
+ break;
+
+ case 2:
+ if (Constraint[0] == 'T') {
+ switch (Constraint[1]) {
+ default:
+ break;
+ case 'e':
+ return RCPair(0U, &ARM::tGPREvenRegClass);
+ case 'o':
+ return RCPair(0U, &ARM::tGPROddRegClass);
+ }
+ }
+ break;
+
+ default:
+ break;
}
+
if (StringRef("{cc}").equals_lower(Constraint))
return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass);
@@ -14272,28 +14855,107 @@ ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
}
SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
- assert(Op.getValueType() == MVT::f64 && Subtarget->isFPOnlySP() &&
+ SDValue SrcVal = Op.getOperand(0);
+ const unsigned DstSz = Op.getValueType().getSizeInBits();
+ const unsigned SrcSz = SrcVal.getValueType().getSizeInBits();
+ assert(DstSz > SrcSz && DstSz <= 64 && SrcSz >= 16 &&
"Unexpected type for custom-lowering FP_EXTEND");
+ assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
+ "With both FP DP and 16, any FP conversion is legal!");
+
+ assert(!(DstSz == 32 && Subtarget->hasFP16()) &&
+ "With FP16, 16 to 32 conversion is legal!");
+
+ // Either we are converting from 16 -> 64 without FP16 and/or
+ // double-precision FP, or without Armv8 FP, in which case we must do it in
+ // two steps.
+ // Or we are converting from 32 -> 64 without double-precision FP, or from
+ // 16 -> 32 without FP16, in which case we must make a library call.
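+ // For example (a sketch): extending f16 -> f64 with neither FP16 nor FP64
+ // becomes a 16 -> 32 libcall followed by a 32 -> 64 libcall; with FP16 but
+ // no FP64, the first step is a single hardware conversion and only the
+ // second remains a libcall.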
+ SDLoc Loc(Op);
RTLIB::Libcall LC;
- LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType());
+ if (SrcSz == 16) {
+ // Instruction from 16 -> 32
+ if (Subtarget->hasFP16())
+ SrcVal = DAG.getNode(ISD::FP_EXTEND, Loc, MVT::f32, SrcVal);
+ // Lib call from 16 -> 32
+ else {
+ LC = RTLIB::getFPEXT(MVT::f16, MVT::f32);
+ assert(LC != RTLIB::UNKNOWN_LIBCALL &&
+ "Unexpected type for custom-lowering FP_EXTEND");
+ SrcVal =
+ makeLibCall(DAG, LC, MVT::f32, SrcVal, /*isSigned*/ false, Loc).first;
+ }
+ }
- SDValue SrcVal = Op.getOperand(0);
- return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false,
- SDLoc(Op)).first;
+ if (DstSz != 64)
+ return SrcVal;
+ // At this point SrcVal is known to be 32 bits
+ if (Subtarget->hasFP64()) // Instruction from 32 -> 64
+ return DAG.getNode(ISD::FP_EXTEND, Loc, MVT::f64, SrcVal);
+
+ LC = RTLIB::getFPEXT(MVT::f32, MVT::f64);
+ assert(LC != RTLIB::UNKNOWN_LIBCALL &&
+ "Unexpected type for custom-lowering FP_EXTEND");
+ return makeLibCall(DAG, LC, MVT::f64, SrcVal, /*isSigned*/ false, Loc).first;
}
SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
- assert(Op.getOperand(0).getValueType() == MVT::f64 &&
- Subtarget->isFPOnlySP() &&
+ SDValue SrcVal = Op.getOperand(0);
+ EVT SrcVT = SrcVal.getValueType();
+ EVT DstVT = Op.getValueType();
+ const unsigned DstSz = Op.getValueType().getSizeInBits();
+ const unsigned SrcSz = SrcVT.getSizeInBits();
+ (void)DstSz;
+ assert(DstSz < SrcSz && SrcSz <= 64 && DstSz >= 16 &&
"Unexpected type for custom-lowering FP_ROUND");
- RTLIB::Libcall LC;
- LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType());
+ assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
+ "With both FP DP and 16, any FP conversion is legal!");
- SDValue SrcVal = Op.getOperand(0);
- return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false,
- SDLoc(Op)).first;
+ SDLoc Loc(Op);
+
+ // Instruction from 32 -> 16 if FP16 is available
+ if (SrcSz == 32 && Subtarget->hasFP16())
+ return Op;
+
+ // Lib call from 32 -> 16 / 64 -> [32, 16]
+ RTLIB::Libcall LC = RTLIB::getFPROUND(SrcVT, DstVT);
+ assert(LC != RTLIB::UNKNOWN_LIBCALL &&
+ "Unexpected type for custom-lowering FP_ROUND");
+ return makeLibCall(DAG, LC, DstVT, SrcVal, /*isSigned*/ false, Loc).first;
+}
+
+void ARMTargetLowering::lowerABS(SDNode *N, SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const {
+ assert(N->getValueType(0) == MVT::i64 && "Unexpected type (!= i64) on ABS.");
+ MVT HalfT = MVT::i32;
+ SDLoc dl(N);
+ SDValue Hi, Lo, Tmp;
+
+ if (!isOperationLegalOrCustom(ISD::ADDCARRY, HalfT) ||
+ !isOperationLegalOrCustom(ISD::UADDO, HalfT))
+ return;
+
+ unsigned OpTypeBits = HalfT.getScalarSizeInBits();
+ SDVTList VTList = DAG.getVTList(HalfT, MVT::i1);
+
+ Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0),
+ DAG.getConstant(0, dl, HalfT));
+ Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0),
+ DAG.getConstant(1, dl, HalfT));
+
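+ // Expand abs(x) for i64 via the identity abs(x) = (x + s) ^ s, where
+ // s = x >> 63 (arithmetic), i.e. all-zeros for non-negative x and all-ones
+ // for negative x. Both 32-bit halves of s equal Tmp below, so the add is
+ // done as a UADDO/ADDCARRY pair and the xor is applied half by half.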
+ Tmp = DAG.getNode(ISD::SRA, dl, HalfT, Hi,
+ DAG.getConstant(OpTypeBits - 1, dl,
+ getShiftAmountTy(HalfT, DAG.getDataLayout())));
+ Lo = DAG.getNode(ISD::UADDO, dl, VTList, Tmp, Lo);
+ Hi = DAG.getNode(ISD::ADDCARRY, dl, VTList, Tmp, Hi,
+ SDValue(Lo.getNode(), 1));
+ Hi = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Hi);
+ Lo = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Lo);
+
+ Results.push_back(Lo);
+ Results.push_back(Hi);
}
bool
@@ -14314,14 +14976,15 @@ bool ARM::isBitFieldInvertedMask(unsigned v) {
/// isFPImmLegal - Returns true if the target can instruction select the
/// specified FP immediate natively. If false, the legalizer will
/// materialize the FP immediate as a load from a constant pool.
-bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
- if (!Subtarget->hasVFP3())
+bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
+ bool ForCodeSize) const {
+ if (!Subtarget->hasVFP3Base())
return false;
if (VT == MVT::f16 && Subtarget->hasFullFP16())
return ARM_AM::getFP16Imm(Imm) != -1;
if (VT == MVT::f32)
return ARM_AM::getFP32Imm(Imm) != -1;
- if (VT == MVT::f64 && !Subtarget->isFPOnlySP())
+ if (VT == MVT::f64 && Subtarget->hasFP64())
return ARM_AM::getFP64Imm(Imm) != -1;
return false;
}
@@ -14590,6 +15253,9 @@ ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
// and up to 64 bits on the non-M profiles
TargetLowering::AtomicExpansionKind
ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
+ if (AI->isFloatingPointOperation())
+ return AtomicExpansionKind::CmpXChg;
+
unsigned Size = AI->getType()->getPrimitiveSizeInBits();
bool hasAtomicRMW = !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps();
return (Size <= (Subtarget->isMClass() ? 32U : 64U) && hasAtomicRMW)
@@ -14621,6 +15287,36 @@ bool ARMTargetLowering::useLoadStackGuardNode() const {
return Subtarget->isTargetMachO();
}
+void ARMTargetLowering::insertSSPDeclarations(Module &M) const {
+ if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
+ return TargetLowering::insertSSPDeclarations(M);
+
+ // MSVC CRT has a global variable holding security cookie.
+ M.getOrInsertGlobal("__security_cookie",
+ Type::getInt8PtrTy(M.getContext()));
+
+ // MSVC CRT has a function to validate security cookie.
+ FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
+ "__security_check_cookie", Type::getVoidTy(M.getContext()),
+ Type::getInt8PtrTy(M.getContext()));
+ if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee()))
+ F->addAttribute(1, Attribute::AttrKind::InReg);
+}
+
+Value *ARMTargetLowering::getSDagStackGuard(const Module &M) const {
+ // MSVC CRT has a global variable holding security cookie.
+ if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
+ return M.getGlobalVariable("__security_cookie");
+ return TargetLowering::getSDagStackGuard(M);
+}
+
+Function *ARMTargetLowering::getSSPStackGuardCheck(const Module &M) const {
+ // MSVC CRT has a function to validate security cookie.
+ if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
+ return M.getFunction("__security_check_cookie");
+ return TargetLowering::getSSPStackGuardCheck(M);
+}
+
bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
unsigned &Cost) const {
// If we do not have NEON, vector types are not natively supported.
@@ -14658,6 +15354,10 @@ bool ARMTargetLowering::isCheapToSpeculateCtlz() const {
return Subtarget->hasV6T2Ops();
}
+bool ARMTargetLowering::shouldExpandShift(SelectionDAG &DAG, SDNode *N) const {
+ return !Subtarget->hasMinSize();
+}
+
Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
AtomicOrdering Ord) const {
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
@@ -14850,8 +15550,9 @@ bool ARMTargetLowering::lowerInterleavedLoad(
// If we're generating more than one load, compute the base address of
// subsequent loads as an offset from the previous.
if (LoadCount > 0)
- BaseAddr = Builder.CreateConstGEP1_32(
- BaseAddr, VecTy->getVectorNumElements() * Factor);
+ BaseAddr =
+ Builder.CreateConstGEP1_32(VecTy->getVectorElementType(), BaseAddr,
+ VecTy->getVectorNumElements() * Factor);
SmallVector<Value *, 2> Ops;
Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
@@ -14990,7 +15691,8 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
// If we're generating more than one store, we compute the base address of
// subsequent stores as an offset from the previous.
if (StoreCount > 0)
- BaseAddr = Builder.CreateConstGEP1_32(BaseAddr, LaneLen * Factor);
+ BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getVectorElementType(),
+ BaseAddr, LaneLen * Factor);
SmallVector<Value *, 6> Ops;
Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));