path: root/llvm/lib/Target/ARM/ARMISelLowering.cpp
author    Dimitry Andric <dim@FreeBSD.org>    2020-01-17 20:45:01 +0000
committer Dimitry Andric <dim@FreeBSD.org>    2020-01-17 20:45:01 +0000
commit    706b4fc47bbc608932d3b491ae19a3b9cde9497b (patch)
tree      4adf86a776049cbf7f69a1929c4babcbbef925eb /llvm/lib/Target/ARM/ARMISelLowering.cpp
parent    7cc9cf2bf09f069cb2dd947ead05d0b54301fb71 (diff)
Diffstat (limited to 'llvm/lib/Target/ARM/ARMISelLowering.cpp')
-rw-r--r--  llvm/lib/Target/ARM/ARMISelLowering.cpp  727
1 file changed, 552 insertions(+), 175 deletions(-)
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index db26feb570103..cf738cd664346 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -78,6 +78,7 @@
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsARM.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
@@ -142,6 +143,11 @@ static cl::opt<unsigned> ConstpoolPromotionMaxTotal(
cl::desc("Maximum size of ALL constants to promote into a constant pool"),
cl::init(128));
+static cl::opt<unsigned>
+MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden,
+ cl::desc("Maximum interleave factor for MVE VLDn to generate."),
+ cl::init(2));
+
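The new option is hidden but can still be passed to llc when experimenting with MVE interleaved accesses; an illustrative invocation (the input file name is a placeholder) would be something like:

    llc -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve -mve-max-interleave-factor=4 interleave.ll -o -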
// The APCS parameter registers.
static const MCPhysReg GPRArgRegs[] = {
ARM::R0, ARM::R1, ARM::R2, ARM::R3
@@ -209,6 +215,9 @@ void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT,
VT != MVT::v2i64 && VT != MVT::v1i64)
for (auto Opcode : {ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
setOperationAction(Opcode, VT, Legal);
+ if (!VT.isFloatingPoint())
+ for (auto Opcode : {ISD::SADDSAT, ISD::UADDSAT, ISD::SSUBSAT, ISD::USUBSAT})
+ setOperationAction(Opcode, VT, Legal);
}
void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
@@ -296,6 +305,8 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
setIndexedLoadAction(im, VT, Legal);
setIndexedStoreAction(im, VT, Legal);
+ setIndexedMaskedLoadAction(im, VT, Legal);
+ setIndexedMaskedStoreAction(im, VT, Legal);
}
}
@@ -322,6 +333,8 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
setIndexedLoadAction(im, VT, Legal);
setIndexedStoreAction(im, VT, Legal);
+ setIndexedMaskedLoadAction(im, VT, Legal);
+ setIndexedMaskedStoreAction(im, VT, Legal);
}
if (HasMVEFP) {
@@ -366,6 +379,13 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
addAllExtLoads(MVT::v4i32, MVT::v4i16, Legal);
addAllExtLoads(MVT::v4i32, MVT::v4i8, Legal);
+ // It is legal to sign extend from v4i8/v4i16 to v4i32 or v8i8 to v8i16.
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Legal);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Legal);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Legal);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v8i8, Legal);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v8i16, Legal);
+
// Some truncating stores are legal too.
setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
@@ -374,12 +394,12 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
// Pre and Post inc on these are legal, given the correct extends
for (unsigned im = (unsigned)ISD::PRE_INC;
im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
- setIndexedLoadAction(im, MVT::v8i8, Legal);
- setIndexedStoreAction(im, MVT::v8i8, Legal);
- setIndexedLoadAction(im, MVT::v4i8, Legal);
- setIndexedStoreAction(im, MVT::v4i8, Legal);
- setIndexedLoadAction(im, MVT::v4i16, Legal);
- setIndexedStoreAction(im, MVT::v4i16, Legal);
+ for (auto VT : {MVT::v8i8, MVT::v4i8, MVT::v4i16}) {
+ setIndexedLoadAction(im, VT, Legal);
+ setIndexedStoreAction(im, VT, Legal);
+ setIndexedMaskedLoadAction(im, VT, Legal);
+ setIndexedMaskedStoreAction(im, VT, Legal);
+ }
}
// Predicate types
@@ -446,7 +466,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
{ RTLIB::OGE_F32, "__gesf2vfp", ISD::SETNE },
{ RTLIB::OGT_F32, "__gtsf2vfp", ISD::SETNE },
{ RTLIB::UO_F32, "__unordsf2vfp", ISD::SETNE },
- { RTLIB::O_F32, "__unordsf2vfp", ISD::SETEQ },
// Double-precision comparisons.
{ RTLIB::OEQ_F64, "__eqdf2vfp", ISD::SETNE },
@@ -456,7 +475,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
{ RTLIB::OGE_F64, "__gedf2vfp", ISD::SETNE },
{ RTLIB::OGT_F64, "__gtdf2vfp", ISD::SETNE },
{ RTLIB::UO_F64, "__unorddf2vfp", ISD::SETNE },
- { RTLIB::O_F64, "__unorddf2vfp", ISD::SETEQ },
// Floating-point to integer conversions.
// i64 conversions are done via library routines even when generating VFP
@@ -520,7 +538,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
{ RTLIB::OGE_F64, "__aeabi_dcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
{ RTLIB::OGT_F64, "__aeabi_dcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
{ RTLIB::UO_F64, "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },
- { RTLIB::O_F64, "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETEQ },
// Single-precision floating-point arithmetic helper functions
// RTABI chapter 4.1.2, Table 4
@@ -538,7 +555,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
{ RTLIB::OGE_F32, "__aeabi_fcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
{ RTLIB::OGT_F32, "__aeabi_fcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
{ RTLIB::UO_F32, "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },
- { RTLIB::O_F32, "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETEQ },
// Floating-point to integer conversions.
// RTABI chapter 4.1.2, Table 6
@@ -964,19 +980,26 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FP_TO_SINT, MVT::f64, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::f64, Custom);
setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::f64, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::f64, Custom);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
}
if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) {
setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
- if (Subtarget->hasFullFP16())
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom);
+ if (Subtarget->hasFullFP16()) {
setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
+ }
}
- if (!Subtarget->hasFP16())
+ if (!Subtarget->hasFP16()) {
setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
-
- if (!Subtarget->hasFP64())
- setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom);
+ }
computeRegisterProperties(Subtarget->getRegisterInfo());
@@ -1050,6 +1073,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SRA, MVT::i64, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
+ setOperationAction(ISD::LOAD, MVT::i64, Custom);
+ setOperationAction(ISD::STORE, MVT::i64, Custom);
// MVE lowers 64 bit shifts to lsll and lsrl
// assuming that ISD::SRL and SRA of i64 are already marked custom
@@ -1170,9 +1195,11 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
}
- if (Subtarget->isTargetWindows() && Subtarget->getTargetTriple().isOSMSVCRT())
- for (auto &VT : {MVT::f32, MVT::f64})
- setOperationAction(ISD::FPOWI, VT, Custom);
+ if (Subtarget->getTargetTriple().isOSMSVCRT()) {
+ // MSVCRT doesn't have powi; fall back to pow
+ setLibcallName(RTLIB::POWI_F32, nullptr);
+ setLibcallName(RTLIB::POWI_F64, nullptr);
+ }
setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
@@ -1571,6 +1598,9 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::PRELOAD: return "ARMISD::PRELOAD";
+ case ARMISD::LDRD: return "ARMISD::LDRD";
+ case ARMISD::STRD: return "ARMISD::STRD";
+
case ARMISD::WIN__CHKSTK: return "ARMISD::WIN__CHKSTK";
case ARMISD::WIN__DBZCHK: return "ARMISD::WIN__DBZCHK";
@@ -1855,6 +1885,7 @@ ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC,
case CallingConv::ARM_AAPCS:
case CallingConv::ARM_APCS:
case CallingConv::GHC:
+ case CallingConv::CFGuard_Check:
return CC;
case CallingConv::PreserveMost:
return CallingConv::PreserveMost;
@@ -1914,6 +1945,8 @@ CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC);
case CallingConv::PreserveMost:
return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
+ case CallingConv::CFGuard_Check:
+ return (Return ? RetCC_ARM_AAPCS : CC_ARM_Win32_CFGuard_Check);
}
}
@@ -2062,11 +2095,10 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
MachineFunction::CallSiteInfo CSInfo;
bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
bool isThisReturn = false;
- auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls");
bool PreferIndirect = false;
// Disable tail calls if they're not supported.
- if (!Subtarget->supportsTailCall() || Attr.getValueAsString() == "true")
+ if (!Subtarget->supportsTailCall())
isTailCall = false;
if (isa<GlobalAddressSDNode>(Callee)) {
@@ -2331,12 +2363,14 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
} else if (Subtarget->isTargetCOFF()) {
assert(Subtarget->isTargetWindows() &&
"Windows is the only supported COFF target");
- unsigned TargetFlags = GV->hasDLLImportStorageClass()
- ? ARMII::MO_DLLIMPORT
- : ARMII::MO_NO_FLAG;
+ unsigned TargetFlags = ARMII::MO_NO_FLAG;
+ if (GV->hasDLLImportStorageClass())
+ TargetFlags = ARMII::MO_DLLIMPORT;
+ else if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
+ TargetFlags = ARMII::MO_COFFSTUB;
Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, /*offset=*/0,
TargetFlags);
- if (GV->hasDLLImportStorageClass())
+ if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))
Callee =
DAG.getLoad(PtrVt, dl, DAG.getEntryNode(),
DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee),
@@ -2941,9 +2975,7 @@ bool ARMTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
if (!Subtarget->supportsTailCall())
return false;
- auto Attr =
- CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
- if (!CI->isTailCall() || Attr.getValueAsString() == "true")
+ if (!CI->isTailCall())
return false;
return true;
@@ -3629,6 +3661,49 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
EVT PtrVT = getPointerTy(DAG.getDataLayout());
return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
}
+ case Intrinsic::arm_cls: {
+ const SDValue &Operand = Op.getOperand(1);
+ const EVT VTy = Op.getValueType();
+ SDValue SRA =
+ DAG.getNode(ISD::SRA, dl, VTy, Operand, DAG.getConstant(31, dl, VTy));
+ SDValue XOR = DAG.getNode(ISD::XOR, dl, VTy, SRA, Operand);
+ SDValue SHL =
+ DAG.getNode(ISD::SHL, dl, VTy, XOR, DAG.getConstant(1, dl, VTy));
+ SDValue OR =
+ DAG.getNode(ISD::OR, dl, VTy, SHL, DAG.getConstant(1, dl, VTy));
+ SDValue Result = DAG.getNode(ISD::CTLZ, dl, VTy, OR);
+ return Result;
+ }
+ case Intrinsic::arm_cls64: {
+ // cls(x) = if cls(hi(x)) != 31 then cls(hi(x))
+ // else 31 + clz(if hi(x) == 0 then lo(x) else not(lo(x)))
+ const SDValue &Operand = Op.getOperand(1);
+ const EVT VTy = Op.getValueType();
+
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VTy, Operand,
+ DAG.getConstant(1, dl, VTy));
+ SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VTy, Operand,
+ DAG.getConstant(0, dl, VTy));
+ SDValue Constant0 = DAG.getConstant(0, dl, VTy);
+ SDValue Constant1 = DAG.getConstant(1, dl, VTy);
+ SDValue Constant31 = DAG.getConstant(31, dl, VTy);
+ SDValue SRAHi = DAG.getNode(ISD::SRA, dl, VTy, Hi, Constant31);
+ SDValue XORHi = DAG.getNode(ISD::XOR, dl, VTy, SRAHi, Hi);
+ SDValue SHLHi = DAG.getNode(ISD::SHL, dl, VTy, XORHi, Constant1);
+ SDValue ORHi = DAG.getNode(ISD::OR, dl, VTy, SHLHi, Constant1);
+ SDValue CLSHi = DAG.getNode(ISD::CTLZ, dl, VTy, ORHi);
+ SDValue CheckLo =
+ DAG.getSetCC(dl, MVT::i1, CLSHi, Constant31, ISD::CondCode::SETEQ);
+ SDValue HiIsZero =
+ DAG.getSetCC(dl, MVT::i1, Hi, Constant0, ISD::CondCode::SETEQ);
+ SDValue AdjustedLo =
+ DAG.getSelect(dl, VTy, HiIsZero, Lo, DAG.getNOT(dl, Lo, VTy));
+ SDValue CLZAdjustedLo = DAG.getNode(ISD::CTLZ, dl, VTy, AdjustedLo);
+ SDValue Result =
+ DAG.getSelect(dl, VTy, CheckLo,
+ DAG.getNode(ISD::ADD, dl, VTy, CLZAdjustedLo, Constant31), CLSHi);
+ return Result;
+ }
case Intrinsic::eh_sjlj_lsda: {
MachineFunction &MF = DAG.getMachineFunction();
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
@@ -3698,6 +3773,10 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
case Intrinsic::arm_neon_vtbl2:
return DAG.getNode(ARMISD::VTBL2, SDLoc(Op), Op.getValueType(),
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+ case Intrinsic::arm_mve_pred_i2v:
+ case Intrinsic::arm_mve_pred_v2i:
+ return DAG.getNode(ARMISD::PREDICATE_CAST, SDLoc(Op), Op.getValueType(),
+ Op.getOperand(1));
}
}
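As a rough scalar sketch (illustrative only, not part of the patch), the node sequences built above for arm_cls and arm_cls64 correspond to the following C++, where the helper names are ad hoc:

    #include <cassert>
    #include <cstdint>

    // CTLZ with the ISD::CTLZ convention that clz(0) == 32.
    static unsigned clz32(uint32_t X) { return X ? __builtin_clz(X) : 32; }

    // arm_cls: count of leading bits equal to the sign bit, excluding the sign
    // bit itself.  Mirrors the SRA 31 / XOR / SHL 1 / OR 1 / CTLZ sequence.
    static unsigned cls32(int32_t X) {
      uint32_t T = (uint32_t)(X ^ (X >> 31)); // fold positive/negative cases
      return clz32((T << 1) | 1);             // OR 1 caps the result at 31
    }

    // arm_cls64, following the comment above:
    //   cls(x) = if cls(hi(x)) != 31 then cls(hi(x))
    //            else 31 + clz(hi(x) == 0 ? lo(x) : ~lo(x))
    static unsigned cls64(int64_t X) {
      int32_t Hi = (int32_t)(X >> 32);
      uint32_t Lo = (uint32_t)X;
      unsigned CLSHi = cls32(Hi);
      return CLSHi != 31 ? CLSHi : 31 + clz32(Hi == 0 ? Lo : ~Lo);
    }

    int main() {
      assert(cls32(0) == 31 && cls32(-1) == 31 && cls32(1) == 30);
      assert(cls64(0) == 63 && cls64(1) == 62 && cls64(INT64_C(0x80000000)) == 31);
      return 0;
    }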
@@ -4887,7 +4966,7 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
Opcode = ARMISD::CSINC;
std::swap(TrueVal, FalseVal);
std::swap(TVal, FVal);
- CC = ISD::getSetCCInverse(CC, true);
+ CC = ISD::getSetCCInverse(CC, LHS.getValueType());
}
if (Opcode) {
@@ -4897,7 +4976,7 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
HasLowerConstantMaterializationCost(FVal, TVal, Subtarget)) {
std::swap(TrueVal, FalseVal);
std::swap(TVal, FVal);
- CC = ISD::getSetCCInverse(CC, true);
+ CC = ISD::getSetCCInverse(CC, LHS.getValueType());
}
// Attempt to use ZR checking TVal is 0, possibly inverting the condition
@@ -4906,7 +4985,7 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
if (FVal == 0 && Opcode != ARMISD::CSINC) {
std::swap(TrueVal, FalseVal);
std::swap(TVal, FVal);
- CC = ISD::getSetCCInverse(CC, true);
+ CC = ISD::getSetCCInverse(CC, LHS.getValueType());
}
if (TVal == 0)
TrueVal = DAG.getRegister(ARM::ZR, MVT::i32);
@@ -4950,7 +5029,7 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
if (CondCode == ARMCC::LT || CondCode == ARMCC::LE ||
CondCode == ARMCC::VC || CondCode == ARMCC::NE) {
- CC = ISD::getSetCCInverse(CC, true);
+ CC = ISD::getSetCCInverse(CC, LHS.getValueType());
std::swap(TrueVal, FalseVal);
}
}
@@ -5310,17 +5389,31 @@ SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
if (VT.isVector())
return LowerVectorFP_TO_INT(Op, DAG);
- if (isUnsupportedFloatingType(Op.getOperand(0).getValueType())) {
+
+ bool IsStrict = Op->isStrictFPOpcode();
+ SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
+
+ if (isUnsupportedFloatingType(SrcVal.getValueType())) {
RTLIB::Libcall LC;
- if (Op.getOpcode() == ISD::FP_TO_SINT)
- LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(),
+ if (Op.getOpcode() == ISD::FP_TO_SINT ||
+ Op.getOpcode() == ISD::STRICT_FP_TO_SINT)
+ LC = RTLIB::getFPTOSINT(SrcVal.getValueType(),
Op.getValueType());
else
- LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(),
+ LC = RTLIB::getFPTOUINT(SrcVal.getValueType(),
Op.getValueType());
+ SDLoc Loc(Op);
MakeLibCallOptions CallOptions;
- return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0),
- CallOptions, SDLoc(Op)).first;
+ SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
+ SDValue Result;
+ std::tie(Result, Chain) = makeLibCall(DAG, LC, Op.getValueType(), SrcVal,
+ CallOptions, Loc, Chain);
+ return IsStrict ? DAG.getMergeValues({Result, Chain}, Loc) : Result;
+ }
+
+ // FIXME: Remove this when we have strict fp instruction selection patterns
+ if (IsStrict) {
+ DAG.mutateStrictFPToFP(Op.getNode());
}
return Op;
@@ -5517,7 +5610,7 @@ SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
-Register ARMTargetLowering::getRegisterByName(const char* RegName, EVT VT,
+Register ARMTargetLowering::getRegisterByName(const char* RegName, LLT VT,
const MachineFunction &MF) const {
Register Reg = StringSwitch<unsigned>(RegName)
.Case("sp", ARM::SP)
@@ -7745,6 +7838,92 @@ static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG,
DAG.getConstant(ARMCC::NE, dl, MVT::i32));
}
+static SDValue LowerVECTOR_SHUFFLEUsingMovs(SDValue Op,
+ ArrayRef<int> ShuffleMask,
+ SelectionDAG &DAG) {
+ // Attempt to lower the vector shuffle using as many whole register movs as
+ // possible. This is useful for types smaller than 32 bits, which would
+ // often otherwise become a series of GPR movs.
+ SDLoc dl(Op);
+ EVT VT = Op.getValueType();
+ if (VT.getScalarSizeInBits() >= 32)
+ return SDValue();
+
+ assert((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
+ "Unexpected vector type");
+ int NumElts = VT.getVectorNumElements();
+ int QuarterSize = NumElts / 4;
+ // The four final parts of the vector, as i32's
+ SDValue Parts[4];
+
+ // Look for full lane vmovs like <0,1,2,3> or <u,5,6,7> etc, (but not
+ // <u,u,u,u>), returning the vmov lane index
+ auto getMovIdx = [](ArrayRef<int> ShuffleMask, int Start, int Length) {
+ // Detect which mov lane this would be from the first non-undef element.
+ int MovIdx = -1;
+ for (int i = 0; i < Length; i++) {
+ if (ShuffleMask[Start + i] >= 0) {
+ if (ShuffleMask[Start + i] % Length != i)
+ return -1;
+ MovIdx = ShuffleMask[Start + i] / Length;
+ break;
+ }
+ }
+ // If all items are undef, leave this for other combines
+ if (MovIdx == -1)
+ return -1;
+ // Check the remaining values are the correct part of the same mov
+ for (int i = 1; i < Length; i++) {
+ if (ShuffleMask[Start + i] >= 0 &&
+ (ShuffleMask[Start + i] / Length != MovIdx ||
+ ShuffleMask[Start + i] % Length != i))
+ return -1;
+ }
+ return MovIdx;
+ };
+
+ for (int Part = 0; Part < 4; ++Part) {
+ // Does this part look like a mov
+ int Elt = getMovIdx(ShuffleMask, Part * QuarterSize, QuarterSize);
+ if (Elt != -1) {
+ SDValue Input = Op->getOperand(0);
+ if (Elt >= 4) {
+ Input = Op->getOperand(1);
+ Elt -= 4;
+ }
+ SDValue BitCast = DAG.getBitcast(MVT::v4i32, Input);
+ Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, BitCast,
+ DAG.getConstant(Elt, dl, MVT::i32));
+ }
+ }
+
+ // Nothing interesting found, just return
+ if (!Parts[0] && !Parts[1] && !Parts[2] && !Parts[3])
+ return SDValue();
+
+ // The other parts need to be built with the old shuffle vector, cast to a
+ // v4i32 and extract_vector_elts
+ if (!Parts[0] || !Parts[1] || !Parts[2] || !Parts[3]) {
+ SmallVector<int, 16> NewShuffleMask;
+ for (int Part = 0; Part < 4; ++Part)
+ for (int i = 0; i < QuarterSize; i++)
+ NewShuffleMask.push_back(
+ Parts[Part] ? -1 : ShuffleMask[Part * QuarterSize + i]);
+ SDValue NewShuffle = DAG.getVectorShuffle(
+ VT, dl, Op->getOperand(0), Op->getOperand(1), NewShuffleMask);
+ SDValue BitCast = DAG.getBitcast(MVT::v4i32, NewShuffle);
+
+ for (int Part = 0; Part < 4; ++Part)
+ if (!Parts[Part])
+ Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
+ BitCast, DAG.getConstant(Part, dl, MVT::i32));
+ }
+ // Build a vector out of the various parts and bitcast it back to the original
+ // type.
+ SDValue NewVec = DAG.getBuildVector(MVT::v4i32, dl, Parts);
+ return DAG.getBitcast(VT, NewVec);
+}
+
static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
const ARMSubtarget *ST) {
SDValue V1 = Op.getOperand(0);
@@ -7939,6 +8118,10 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG))
return NewOp;
+ if (ST->hasMVEIntegerOps())
+ if (SDValue NewOp = LowerVECTOR_SHUFFLEUsingMovs(Op, ShuffleMask, DAG))
+ return NewOp;
+
return SDValue();
}
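To make the mask classification in getMovIdx above concrete, here is a standalone sketch (illustrative only; it copies the lambda's logic) applied to a v8i16 shuffle, where QuarterSize == 2 and every quarter of the mask must read one whole 32-bit lane of either input:

    #include <cassert>
    #include <vector>

    // Returns the 32-bit source lane a quarter of the mask reads (0-3 for the
    // first input, 4-7 for the second, matching the Elt >= 4 handling above),
    // or -1 if the quarter is not a whole-lane mov.
    static int classifyQuarter(const std::vector<int> &Mask, int Start, int Length) {
      int MovIdx = -1;
      for (int i = 0; i < Length; i++) {
        if (Mask[Start + i] >= 0) {
          if (Mask[Start + i] % Length != i)
            return -1;
          MovIdx = Mask[Start + i] / Length;
          break;
        }
      }
      if (MovIdx == -1)
        return -1; // all-undef quarter: left for other combines
      for (int i = 1; i < Length; i++)
        if (Mask[Start + i] >= 0 && (Mask[Start + i] / Length != MovIdx ||
                                     Mask[Start + i] % Length != i))
          return -1;
      return MovIdx;
    }

    int main() {
      // <2,3, 8,9, u,5, 0,1>: lane 1 of op0, lane 0 of op1 (index 4), lane 2
      // of op0 (recovered from the single defined element), lane 0 of op0.
      std::vector<int> Mask = {2, 3, 8, 9, -1, 5, 0, 1};
      assert(classifyQuarter(Mask, 0, 2) == 1);
      assert(classifyQuarter(Mask, 2, 2) == 4);
      assert(classifyQuarter(Mask, 4, 2) == 2);
      assert(classifyQuarter(Mask, 6, 2) == 0);
      // Elements swapped inside a lane do not form a whole-register mov.
      std::vector<int> Swapped = {1, 0, 2, 3, 4, 5, 6, 7};
      assert(classifyQuarter(Swapped, 0, 2) == -1);
      return 0;
    }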
@@ -8905,6 +9088,24 @@ static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG) {
return DAG.getMergeValues({Pred, Load.getValue(1)}, dl);
}
+void ARMTargetLowering::LowerLOAD(SDNode *N, SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const {
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ EVT MemVT = LD->getMemoryVT();
+ assert(LD->isUnindexed() && "Loads should be unindexed at this point.");
+
+ if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() &&
+ !Subtarget->isThumb1Only() && LD->isVolatile()) {
+ SDLoc dl(N);
+ SDValue Result = DAG.getMemIntrinsicNode(
+ ARMISD::LDRD, dl, DAG.getVTList({MVT::i32, MVT::i32, MVT::Other}),
+ {LD->getChain(), LD->getBasePtr()}, MemVT, LD->getMemOperand());
+ SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64,
+ Result.getValue(0), Result.getValue(1));
+ Results.append({Pair, Result.getValue(2)});
+ }
+}
+
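For context, a hedged source-level example of what the new i64 path is aimed at (the expected ldrd/strd selection is an assumption based on the ARMISD::LDRD/STRD node names, not something this hunk shows):

    // An unindexed volatile 64-bit access on an ARMv5TE+ (non-Thumb1) target.
    volatile long long Counter;

    long long readCounter() { return Counter; }     // expected: single ldrd
    void writeCounter(long long V) { Counter = V; } // expected: single strd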
static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG) {
StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
EVT MemVT = ST->getMemoryVT();
@@ -8934,6 +9135,40 @@ static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG) {
ST->getMemOperand());
}
+static SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG,
+ const ARMSubtarget *Subtarget) {
+ StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
+ EVT MemVT = ST->getMemoryVT();
+ assert(ST->isUnindexed() && "Stores should be unindexed at this point.");
+
+ if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() &&
+ !Subtarget->isThumb1Only() && ST->isVolatile()) {
+ SDNode *N = Op.getNode();
+ SDLoc dl(N);
+
+ SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(),
+ DAG.getTargetConstant(0, dl, MVT::i32));
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(),
+ DAG.getTargetConstant(1, dl, MVT::i32));
+
+ return DAG.getMemIntrinsicNode(ARMISD::STRD, dl, DAG.getVTList(MVT::Other),
+ {ST->getChain(), Lo, Hi, ST->getBasePtr()},
+ MemVT, ST->getMemOperand());
+ } else if (Subtarget->hasMVEIntegerOps() &&
+ ((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
+ MemVT == MVT::v16i1))) {
+ return LowerPredicateStore(Op, DAG);
+ }
+
+ return SDValue();
+}
+
+static bool isZeroVector(SDValue N) {
+ return (ISD::isBuildVectorAllZeros(N.getNode()) ||
+ (N->getOpcode() == ARMISD::VMOVIMM &&
+ isNullConstant(N->getOperand(0))));
+}
+
static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) {
MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
MVT VT = Op.getSimpleValueType();
@@ -8941,13 +9176,7 @@ static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) {
SDValue PassThru = N->getPassThru();
SDLoc dl(Op);
- auto IsZero = [](SDValue PassThru) {
- return (ISD::isBuildVectorAllZeros(PassThru.getNode()) ||
- (PassThru->getOpcode() == ARMISD::VMOVIMM &&
- isNullConstant(PassThru->getOperand(0))));
- };
-
- if (IsZero(PassThru))
+ if (isZeroVector(PassThru))
return Op;
// MVE Masked loads use zero as the passthru value. Here we convert undef to
@@ -8955,12 +9184,13 @@ static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) {
SDValue ZeroVec = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
DAG.getTargetConstant(0, dl, MVT::i32));
SDValue NewLoad = DAG.getMaskedLoad(
- VT, dl, N->getChain(), N->getBasePtr(), Mask, ZeroVec, N->getMemoryVT(),
- N->getMemOperand(), N->getExtensionType(), N->isExpandingLoad());
+ VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask, ZeroVec,
+ N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
+ N->getExtensionType(), N->isExpandingLoad());
SDValue Combo = NewLoad;
if (!PassThru.isUndef() &&
(PassThru.getOpcode() != ISD::BITCAST ||
- !IsZero(PassThru->getOperand(0))))
+ !isZeroVector(PassThru->getOperand(0))))
Combo = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
return DAG.getMergeValues({Combo, NewLoad.getValue(1)}, dl);
}
@@ -9043,58 +9273,6 @@ static void ReplaceCMP_SWAP_64Results(SDNode *N,
Results.push_back(SDValue(CmpSwap, 2));
}
-static SDValue LowerFPOWI(SDValue Op, const ARMSubtarget &Subtarget,
- SelectionDAG &DAG) {
- const auto &TLI = DAG.getTargetLoweringInfo();
-
- assert(Subtarget.getTargetTriple().isOSMSVCRT() &&
- "Custom lowering is MSVCRT specific!");
-
- SDLoc dl(Op);
- SDValue Val = Op.getOperand(0);
- MVT Ty = Val->getSimpleValueType(0);
- SDValue Exponent = DAG.getNode(ISD::SINT_TO_FP, dl, Ty, Op.getOperand(1));
- SDValue Callee = DAG.getExternalSymbol(Ty == MVT::f32 ? "powf" : "pow",
- TLI.getPointerTy(DAG.getDataLayout()));
-
- TargetLowering::ArgListTy Args;
- TargetLowering::ArgListEntry Entry;
-
- Entry.Node = Val;
- Entry.Ty = Val.getValueType().getTypeForEVT(*DAG.getContext());
- Entry.IsZExt = true;
- Args.push_back(Entry);
-
- Entry.Node = Exponent;
- Entry.Ty = Exponent.getValueType().getTypeForEVT(*DAG.getContext());
- Entry.IsZExt = true;
- Args.push_back(Entry);
-
- Type *LCRTy = Val.getValueType().getTypeForEVT(*DAG.getContext());
-
- // In the in-chain to the call is the entry node If we are emitting a
- // tailcall, the chain will be mutated if the node has a non-entry input
- // chain.
- SDValue InChain = DAG.getEntryNode();
- SDValue TCChain = InChain;
-
- const Function &F = DAG.getMachineFunction().getFunction();
- bool IsTC = TLI.isInTailCallPosition(DAG, Op.getNode(), TCChain) &&
- F.getReturnType() == LCRTy;
- if (IsTC)
- InChain = TCChain;
-
- TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(dl)
- .setChain(InChain)
- .setCallee(CallingConv::ARM_AAPCS_VFP, LCRTy, Callee, std::move(Args))
- .setTailCall(IsTC);
- std::pair<SDValue, SDValue> CI = TLI.LowerCallTo(CLI);
-
- // Return the chain (the DAG root) if it is a tail call
- return !CI.second.getNode() ? DAG.getRoot() : CI.first;
-}
-
SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump());
switch (Op.getOpcode()) {
@@ -9114,6 +9292,8 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::PREFETCH: return LowerPREFETCH(Op, DAG, Subtarget);
case ISD::SINT_TO_FP:
case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG);
+ case ISD::STRICT_FP_TO_SINT:
+ case ISD::STRICT_FP_TO_UINT:
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
@@ -9170,7 +9350,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::LOAD:
return LowerPredicateLoad(Op, DAG);
case ISD::STORE:
- return LowerPredicateStore(Op, DAG);
+ return LowerSTORE(Op, DAG, Subtarget);
case ISD::MLOAD:
return LowerMLOAD(Op, DAG);
case ISD::ATOMIC_LOAD:
@@ -9182,9 +9362,10 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
if (Subtarget->isTargetWindows())
return LowerDYNAMIC_STACKALLOC(Op, DAG);
llvm_unreachable("Don't know how to custom lower this!");
+ case ISD::STRICT_FP_ROUND:
case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);
+ case ISD::STRICT_FP_EXTEND:
case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
- case ISD::FPOWI: return LowerFPOWI(Op, *Subtarget, DAG);
case ARMISD::WIN__DBZCHK: return SDValue();
}
}
@@ -9271,7 +9452,9 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
case ISD::ABS:
lowerABS(N, Results, DAG);
return ;
-
+ case ISD::LOAD:
+ LowerLOAD(N, Results, DAG);
+ break;
}
if (Res.getNode())
Results.push_back(Res);
@@ -11711,7 +11894,8 @@ static SDValue PerformADDCombine(SDNode *N,
/// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
///
static SDValue PerformSUBCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI) {
+ TargetLowering::DAGCombinerInfo &DCI,
+ const ARMSubtarget *Subtarget) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
@@ -11720,7 +11904,28 @@ static SDValue PerformSUBCombine(SDNode *N,
if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI))
return Result;
- return SDValue();
+ if (!Subtarget->hasMVEIntegerOps() || !N->getValueType(0).isVector())
+ return SDValue();
+
+ // Fold (sub (ARMvmovImm 0), (ARMvdup x)) -> (ARMvdup (sub 0, x))
+ // so that we can readily pattern match more mve instructions which can use
+ // a scalar operand.
+ SDValue VDup = N->getOperand(1);
+ if (VDup->getOpcode() != ARMISD::VDUP)
+ return SDValue();
+
+ SDValue VMov = N->getOperand(0);
+ if (VMov->getOpcode() == ISD::BITCAST)
+ VMov = VMov->getOperand(0);
+
+ if (VMov->getOpcode() != ARMISD::VMOVIMM || !isZeroVector(VMov))
+ return SDValue();
+
+ SDLoc dl(N);
+ SDValue Negate = DCI.DAG.getNode(ISD::SUB, dl, MVT::i32,
+ DCI.DAG.getConstant(0, dl, MVT::i32),
+ VDup->getOperand(0));
+ return DCI.DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0), Negate);
}
/// PerformVMULCombine
@@ -12736,6 +12941,39 @@ PerformPREDICATE_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
return SDValue();
}
+static SDValue PerformVCMPCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const ARMSubtarget *Subtarget) {
+ if (!Subtarget->hasMVEIntegerOps())
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+ ARMCC::CondCodes Cond =
+ (ARMCC::CondCodes)cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
+ SDLoc dl(N);
+
+ // vcmp X, 0, cc -> vcmpz X, cc
+ if (isZeroVector(Op1))
+ return DCI.DAG.getNode(ARMISD::VCMPZ, dl, VT, Op0,
+ N->getOperand(2));
+
+ unsigned SwappedCond = getSwappedCondition(Cond);
+ if (isValidMVECond(SwappedCond, VT.isFloatingPoint())) {
+ // vcmp 0, X, cc -> vcmpz X, reversed(cc)
+ if (isZeroVector(Op0))
+ return DCI.DAG.getNode(ARMISD::VCMPZ, dl, VT, Op1,
+ DCI.DAG.getConstant(SwappedCond, dl, MVT::i32));
+ // vcmp vdup(Y), X, cc -> vcmp X, vdup(Y), reversed(cc)
+ if (Op0->getOpcode() == ARMISD::VDUP && Op1->getOpcode() != ARMISD::VDUP)
+ return DCI.DAG.getNode(ARMISD::VCMP, dl, VT, Op1, Op0,
+ DCI.DAG.getConstant(SwappedCond, dl, MVT::i32));
+ }
+
+ return SDValue();
+}
+
/// PerformInsertEltCombine - Target-specific dag combine xforms for
/// ISD::INSERT_VECTOR_ELT.
static SDValue PerformInsertEltCombine(SDNode *N,
@@ -13844,11 +14082,12 @@ static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
const ARMSubtarget *ST) {
SDValue N0 = N->getOperand(0);
- // Check for sign- and zero-extensions of vector extract operations of 8-
- // and 16-bit vector elements. NEON supports these directly. They are
+ // Check for sign- and zero-extensions of vector extract operations of 8- and
+ // 16-bit vector elements. NEON and MVE support these directly. They are
// handled during DAG combining because type legalization will promote them
// to 32-bit types and it is messy to recognize the operations after that.
- if (ST->hasNEON() && N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+ if ((ST->hasNEON() || ST->hasMVEIntegerOps()) &&
+ N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
SDValue Vec = N0.getOperand(0);
SDValue Lane = N0.getOperand(1);
EVT VT = N->getValueType(0);
@@ -14067,7 +14306,7 @@ static SDValue PerformHWLoopCombine(SDNode *N,
return SDValue();
if (Negate)
- CC = ISD::getSetCCInverse(CC, true);
+ CC = ISD::getSetCCInverse(CC, /* Integer inverse */ MVT::i32);
auto IsTrueIfZero = [](ISD::CondCode CC, int Imm) {
return (CC == ISD::SETEQ && Imm == 0) ||
@@ -14371,7 +14610,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
case ARMISD::ADDE: return PerformADDECombine(N, DCI, Subtarget);
case ARMISD::UMLAL: return PerformUMLALCombine(N, DCI.DAG, Subtarget);
case ISD::ADD: return PerformADDCombine(N, DCI, Subtarget);
- case ISD::SUB: return PerformSUBCombine(N, DCI);
+ case ISD::SUB: return PerformSUBCombine(N, DCI, Subtarget);
case ISD::MUL: return PerformMULCombine(N, DCI, Subtarget);
case ISD::OR: return PerformORCombine(N, DCI, Subtarget);
case ISD::XOR: return PerformXORCombine(N, DCI, Subtarget);
@@ -14415,6 +14654,8 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
return PerformARMBUILD_VECTORCombine(N, DCI);
case ARMISD::PREDICATE_CAST:
return PerformPREDICATE_CASTCombine(N, DCI);
+ case ARMISD::VCMP:
+ return PerformVCMPCombine(N, DCI, Subtarget);
case ARMISD::SMULWB: {
unsigned BitWidth = N->getValueType(0).getSizeInBits();
APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
@@ -14523,7 +14764,7 @@ bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned,
if (!VT.isSimple())
return false;
- // The AllowsUnaliged flag models the SCTLR.A setting in ARM cpus
+ // The AllowsUnaligned flag models the SCTLR.A setting in ARM cpus
bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
auto Ty = VT.getSimpleVT().SimpleTy;
@@ -14725,8 +14966,12 @@ bool ARMTargetLowering::shouldSinkOperands(Instruction *I,
switch (I->getOpcode()) {
case Instruction::Add:
case Instruction::Mul:
+ case Instruction::ICmp:
return true;
case Instruction::Sub:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
return Operand == 1;
default:
return false;
@@ -14808,6 +15053,40 @@ int ARMTargetLowering::getScalingFactorCost(const DataLayout &DL,
return -1;
}
+/// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
+/// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
+/// expanded to FMAs when this method returns true, otherwise fmuladd is
+/// expanded to fmul + fadd.
+///
+/// ARM supports both fused and unfused multiply-add operations; we already
+/// lower a pair of fmul and fadd to the latter so it's not clear that there
+/// would be a gain or that the gain would be worthwhile enough to risk
+/// correctness bugs.
+///
+/// For MVE, we set this to true as it helps simplify the need for some
+/// patterns (and we don't have the non-fused floating point instruction).
+bool ARMTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
+ EVT VT) const {
+ if (!VT.isSimple())
+ return false;
+
+ switch (VT.getSimpleVT().SimpleTy) {
+ case MVT::v4f32:
+ case MVT::v8f16:
+ return Subtarget->hasMVEFloatOps();
+ case MVT::f16:
+ return Subtarget->useFPVFMx16();
+ case MVT::f32:
+ return Subtarget->useFPVFMx();
+ case MVT::f64:
+ return Subtarget->useFPVFMx64();
+ default:
+ break;
+ }
+
+ return false;
+}
+
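As a source-level illustration (assuming clang's default contraction, which forms llvm.fmuladd for a multiply feeding an add in the same expression), returning true for v4f32/v8f16 under MVE-FP lets a vectorized kernel like the hypothetical one below use a fused multiply-accumulate rather than separate multiply and add:

    // Hypothetical kernel, illustrative only.
    void fma_accumulate(const float *a, const float *b, float *acc, int n) {
      for (int i = 0; i < n; i++)
        acc[i] = a[i] * b[i] + acc[i]; // contracts to llvm.fmuladd -> fused FMA
    }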
static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
if (V < 0)
return false;
@@ -14850,7 +15129,7 @@ static bool isLegalT2AddressImmediate(int64_t V, EVT VT,
V = -V;
}
- unsigned NumBytes = std::max(VT.getSizeInBits() / 8, 1U);
+ unsigned NumBytes = std::max((unsigned)VT.getSizeInBits() / 8, 1U);
// MVE: size * imm7
if (VT.isVector() && Subtarget->hasMVEIntegerOps()) {
@@ -15155,14 +15434,19 @@ static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT,
}
static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, unsigned Align,
- bool isSEXTLoad, bool isLE, SDValue &Base,
- SDValue &Offset, bool &isInc,
- SelectionDAG &DAG) {
+ bool isSEXTLoad, bool IsMasked, bool isLE,
+ SDValue &Base, SDValue &Offset,
+ bool &isInc, SelectionDAG &DAG) {
if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
return false;
if (!isa<ConstantSDNode>(Ptr->getOperand(1)))
return false;
+ // We allow LE non-masked loads to change the type (for example use a vldrb.8
+ // as opposed to a vldrw.32). This can allow extra addressing modes or
+ // alignments for what is otherwise an equivalent instruction.
+ bool CanChangeType = isLE && !IsMasked;
+
ConstantSDNode *RHS = cast<ConstantSDNode>(Ptr->getOperand(1));
int RHSC = (int)RHS->getZExtValue();
@@ -15181,7 +15465,7 @@ static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, unsigned Align,
};
// Try to find a matching instruction based on s/zext, Alignment, Offset and
- // (in BE) type.
+ // (in BE/masked) type.
Base = Ptr->getOperand(0);
if (VT == MVT::v4i16) {
if (Align >= 2 && IsInRange(RHSC, 0x80, 2))
@@ -15189,13 +15473,15 @@ static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, unsigned Align,
} else if (VT == MVT::v4i8 || VT == MVT::v8i8) {
if (IsInRange(RHSC, 0x80, 1))
return true;
- } else if (Align >= 4 && (isLE || VT == MVT::v4i32 || VT == MVT::v4f32) &&
+ } else if (Align >= 4 &&
+ (CanChangeType || VT == MVT::v4i32 || VT == MVT::v4f32) &&
IsInRange(RHSC, 0x80, 4))
return true;
- else if (Align >= 2 && (isLE || VT == MVT::v8i16 || VT == MVT::v8f16) &&
+ else if (Align >= 2 &&
+ (CanChangeType || VT == MVT::v8i16 || VT == MVT::v8f16) &&
IsInRange(RHSC, 0x80, 2))
return true;
- else if ((isLE || VT == MVT::v16i8) && IsInRange(RHSC, 0x80, 1))
+ else if ((CanChangeType || VT == MVT::v16i8) && IsInRange(RHSC, 0x80, 1))
return true;
return false;
}
@@ -15215,6 +15501,7 @@ ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
SDValue Ptr;
unsigned Align;
bool isSEXTLoad = false;
+ bool IsMasked = false;
if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
Ptr = LD->getBasePtr();
VT = LD->getMemoryVT();
@@ -15224,6 +15511,17 @@ ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
Ptr = ST->getBasePtr();
VT = ST->getMemoryVT();
Align = ST->getAlignment();
+ } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
+ Ptr = LD->getBasePtr();
+ VT = LD->getMemoryVT();
+ Align = LD->getAlignment();
+ isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
+ IsMasked = true;
+ } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) {
+ Ptr = ST->getBasePtr();
+ VT = ST->getMemoryVT();
+ Align = ST->getAlignment();
+ IsMasked = true;
} else
return false;
@@ -15232,8 +15530,8 @@ ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
if (VT.isVector())
isLegal = Subtarget->hasMVEIntegerOps() &&
getMVEIndexedAddressParts(Ptr.getNode(), VT, Align, isSEXTLoad,
- Subtarget->isLittle(), Base, Offset,
- isInc, DAG);
+ IsMasked, Subtarget->isLittle(), Base,
+ Offset, isInc, DAG);
else {
if (Subtarget->isThumb2())
isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
@@ -15261,6 +15559,7 @@ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
SDValue Ptr;
unsigned Align;
bool isSEXTLoad = false, isNonExt;
+ bool IsMasked = false;
if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
VT = LD->getMemoryVT();
Ptr = LD->getBasePtr();
@@ -15272,6 +15571,19 @@ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
Ptr = ST->getBasePtr();
Align = ST->getAlignment();
isNonExt = !ST->isTruncatingStore();
+ } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
+ VT = LD->getMemoryVT();
+ Ptr = LD->getBasePtr();
+ Align = LD->getAlignment();
+ isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
+ isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
+ IsMasked = true;
+ } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) {
+ VT = ST->getMemoryVT();
+ Ptr = ST->getBasePtr();
+ Align = ST->getAlignment();
+ isNonExt = !ST->isTruncatingStore();
+ IsMasked = true;
} else
return false;
@@ -15295,7 +15607,7 @@ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
bool isLegal = false;
if (VT.isVector())
isLegal = Subtarget->hasMVEIntegerOps() &&
- getMVEIndexedAddressParts(Op, VT, Align, isSEXTLoad,
+ getMVEIndexedAddressParts(Op, VT, Align, isSEXTLoad, IsMasked,
Subtarget->isLittle(), Base, Offset,
isInc, DAG);
else {
@@ -16048,7 +16360,8 @@ ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
}
SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
- SDValue SrcVal = Op.getOperand(0);
+ bool IsStrict = Op->isStrictFPOpcode();
+ SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
const unsigned DstSz = Op.getValueType().getSizeInBits();
const unsigned SrcSz = SrcVal.getValueType().getSizeInBits();
assert(DstSz > SrcSz && DstSz <= 64 && SrcSz >= 16 &&
@@ -16068,34 +16381,35 @@ SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
SDLoc Loc(Op);
RTLIB::Libcall LC;
MakeLibCallOptions CallOptions;
- if (SrcSz == 16) {
- // Instruction from 16 -> 32
- if (Subtarget->hasFP16())
- SrcVal = DAG.getNode(ISD::FP_EXTEND, Loc, MVT::f32, SrcVal);
- // Lib call from 16 -> 32
- else {
- LC = RTLIB::getFPEXT(MVT::f16, MVT::f32);
+ SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
+ for (unsigned Sz = SrcSz; Sz <= 32 && Sz < DstSz; Sz *= 2) {
+ bool Supported = (Sz == 16 ? Subtarget->hasFP16() : Subtarget->hasFP64());
+ MVT SrcVT = (Sz == 16 ? MVT::f16 : MVT::f32);
+ MVT DstVT = (Sz == 16 ? MVT::f32 : MVT::f64);
+ if (Supported) {
+ if (IsStrict) {
+ SrcVal = DAG.getNode(ISD::STRICT_FP_EXTEND, Loc,
+ {DstVT, MVT::Other}, {Chain, SrcVal});
+ Chain = SrcVal.getValue(1);
+ } else {
+ SrcVal = DAG.getNode(ISD::FP_EXTEND, Loc, DstVT, SrcVal);
+ }
+ } else {
+ LC = RTLIB::getFPEXT(SrcVT, DstVT);
assert(LC != RTLIB::UNKNOWN_LIBCALL &&
"Unexpected type for custom-lowering FP_EXTEND");
- SrcVal =
- makeLibCall(DAG, LC, MVT::f32, SrcVal, CallOptions, Loc).first;
+ std::tie(SrcVal, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions,
+ Loc, Chain);
}
}
- if (DstSz != 64)
- return SrcVal;
- // For sure now SrcVal is 32 bits
- if (Subtarget->hasFP64()) // Instruction from 32 -> 64
- return DAG.getNode(ISD::FP_EXTEND, Loc, MVT::f64, SrcVal);
-
- LC = RTLIB::getFPEXT(MVT::f32, MVT::f64);
- assert(LC != RTLIB::UNKNOWN_LIBCALL &&
- "Unexpected type for custom-lowering FP_EXTEND");
- return makeLibCall(DAG, LC, MVT::f64, SrcVal, CallOptions, Loc).first;
+ return IsStrict ? DAG.getMergeValues({SrcVal, Chain}, Loc) : SrcVal;
}
SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
- SDValue SrcVal = Op.getOperand(0);
+ bool IsStrict = Op->isStrictFPOpcode();
+
+ SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
EVT SrcVT = SrcVal.getValueType();
EVT DstVT = Op.getValueType();
const unsigned DstSz = Op.getValueType().getSizeInBits();
@@ -16118,7 +16432,11 @@ SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
assert(LC != RTLIB::UNKNOWN_LIBCALL &&
"Unexpected type for custom-lowering FP_ROUND");
MakeLibCallOptions CallOptions;
- return makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions, Loc).first;
+ SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
+ SDValue Result;
+ std::tie(Result, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions,
+ Loc, Chain);
+ return IsStrict ? DAG.getMergeValues({Result, Chain}, Loc) : Result;
}
void ARMTargetLowering::lowerABS(SDNode *N, SmallVectorImpl<SDValue> &Results,
@@ -16644,15 +16962,20 @@ ARMTargetLowering::getNumInterleavedAccesses(VectorType *VecTy,
}
bool ARMTargetLowering::isLegalInterleavedAccessType(
- VectorType *VecTy, const DataLayout &DL) const {
+ unsigned Factor, VectorType *VecTy, const DataLayout &DL) const {
unsigned VecSize = DL.getTypeSizeInBits(VecTy);
unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
+ if (!Subtarget->hasNEON() && !Subtarget->hasMVEIntegerOps())
+ return false;
+
// Ensure the vector doesn't have f16 elements. Even though we could do an
// i16 vldN, we can't hold the f16 vectors and will end up converting via
// f32.
- if (VecTy->getElementType()->isHalfTy())
+ if (Subtarget->hasNEON() && VecTy->getElementType()->isHalfTy())
+ return false;
+ if (Subtarget->hasMVEIntegerOps() && Factor == 3)
return false;
// Ensure the number of vector elements is greater than 1.
@@ -16665,12 +16988,16 @@ bool ARMTargetLowering::isLegalInterleavedAccessType(
// Ensure the total vector size is 64 or a multiple of 128. Types larger than
// 128 will be split into multiple interleaved accesses.
- return VecSize == 64 || VecSize % 128 == 0;
+ if (Subtarget->hasNEON() && VecSize == 64)
+ return true;
+ return VecSize % 128 == 0;
}
unsigned ARMTargetLowering::getMaxSupportedInterleaveFactor() const {
if (Subtarget->hasNEON())
return 4;
+ if (Subtarget->hasMVEIntegerOps())
+ return MVEMaxSupportedInterleaveFactor;
return TargetLoweringBase::getMaxSupportedInterleaveFactor();
}
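For illustration, a hypothetical C loop (assuming the vectorizer picks an interleave factor of 2) showing the kind of factor-2 strided accesses that the MVE paths here and in lowerInterleavedLoad/lowerInterleavedStore below can now map onto arm_mve_vld2q/arm_mve_vst2q:

    // Hypothetical kernel, illustrative only: factor-2 interleaved loads and
    // stores after vectorization.
    void scale_pairs(int *buf, int n) {
      for (int i = 0; i < n; i++) {
        int even = buf[2 * i];      // de-interleaving load  -> vld2q candidate
        int odd  = buf[2 * i + 1];
        buf[2 * i]     = even * 3;  // re-interleaving store -> vst2q candidate
        buf[2 * i + 1] = odd  * 5;
      }
    }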
@@ -16702,7 +17029,7 @@ bool ARMTargetLowering::lowerInterleavedLoad(
// Skip if we do not have NEON and skip illegal vector types. We can
// "legalize" wide vector types into multiple interleaved accesses as long as
// the vector types are divisible by 128.
- if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(VecTy, DL))
+ if (!isLegalInterleavedAccessType(Factor, VecTy, DL))
return false;
unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);
@@ -16734,13 +17061,37 @@ bool ARMTargetLowering::lowerInterleavedLoad(
assert(isTypeLegal(EVT::getEVT(VecTy)) && "Illegal vldN vector type!");
- Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace());
- Type *Tys[] = {VecTy, Int8Ptr};
- static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
- Intrinsic::arm_neon_vld3,
- Intrinsic::arm_neon_vld4};
- Function *VldnFunc =
- Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);
+ auto createLoadIntrinsic = [&](Value *BaseAddr) {
+ if (Subtarget->hasNEON()) {
+ Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace());
+ Type *Tys[] = {VecTy, Int8Ptr};
+ static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
+ Intrinsic::arm_neon_vld3,
+ Intrinsic::arm_neon_vld4};
+ Function *VldnFunc =
+ Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);
+
+ SmallVector<Value *, 2> Ops;
+ Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
+ Ops.push_back(Builder.getInt32(LI->getAlignment()));
+
+ return Builder.CreateCall(VldnFunc, Ops, "vldN");
+ } else {
+ assert((Factor == 2 || Factor == 4) &&
+ "expected interleave factor of 2 or 4 for MVE");
+ Intrinsic::ID LoadInts =
+ Factor == 2 ? Intrinsic::arm_mve_vld2q : Intrinsic::arm_mve_vld4q;
+ Type *VecEltTy = VecTy->getVectorElementType()->getPointerTo(
+ LI->getPointerAddressSpace());
+ Type *Tys[] = {VecTy, VecEltTy};
+ Function *VldnFunc =
+ Intrinsic::getDeclaration(LI->getModule(), LoadInts, Tys);
+
+ SmallVector<Value *, 2> Ops;
+ Ops.push_back(Builder.CreateBitCast(BaseAddr, VecEltTy));
+ return Builder.CreateCall(VldnFunc, Ops, "vldN");
+ }
+ };
// Holds sub-vectors extracted from the load intrinsic return values. The
// sub-vectors are associated with the shufflevector instructions they will
@@ -16755,11 +17106,7 @@ bool ARMTargetLowering::lowerInterleavedLoad(
Builder.CreateConstGEP1_32(VecTy->getVectorElementType(), BaseAddr,
VecTy->getVectorNumElements() * Factor);
- SmallVector<Value *, 2> Ops;
- Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
- Ops.push_back(Builder.getInt32(LI->getAlignment()));
-
- CallInst *VldN = Builder.CreateCall(VldnFunc, Ops, "vldN");
+ CallInst *VldN = createLoadIntrinsic(BaseAddr);
// Replace uses of each shufflevector with the corresponding vector loaded
// by ldN.
@@ -16838,7 +17185,7 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
// Skip if we do not have NEON and skip illegal vector types. We can
// "legalize" wide vector types into multiple interleaved accesses as long as
// the vector types are divisible by 128.
- if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(SubVecTy, DL))
+ if (!isLegalInterleavedAccessType(Factor, SubVecTy, DL))
return false;
unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);
@@ -16882,11 +17229,46 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
auto Mask = SVI->getShuffleMask();
- Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace());
- Type *Tys[] = {Int8Ptr, SubVecTy};
- static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
- Intrinsic::arm_neon_vst3,
- Intrinsic::arm_neon_vst4};
+ auto createStoreIntrinsic = [&](Value *BaseAddr,
+ SmallVectorImpl<Value *> &Shuffles) {
+ if (Subtarget->hasNEON()) {
+ static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
+ Intrinsic::arm_neon_vst3,
+ Intrinsic::arm_neon_vst4};
+ Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace());
+ Type *Tys[] = {Int8Ptr, SubVecTy};
+
+ Function *VstNFunc = Intrinsic::getDeclaration(
+ SI->getModule(), StoreInts[Factor - 2], Tys);
+
+ SmallVector<Value *, 6> Ops;
+ Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
+ for (auto S : Shuffles)
+ Ops.push_back(S);
+ Ops.push_back(Builder.getInt32(SI->getAlignment()));
+ Builder.CreateCall(VstNFunc, Ops);
+ } else {
+ assert((Factor == 2 || Factor == 4) &&
+ "expected interleave factor of 2 or 4 for MVE");
+ Intrinsic::ID StoreInts =
+ Factor == 2 ? Intrinsic::arm_mve_vst2q : Intrinsic::arm_mve_vst4q;
+ Type *EltPtrTy = SubVecTy->getVectorElementType()->getPointerTo(
+ SI->getPointerAddressSpace());
+ Type *Tys[] = {EltPtrTy, SubVecTy};
+ Function *VstNFunc =
+ Intrinsic::getDeclaration(SI->getModule(), StoreInts, Tys);
+
+ SmallVector<Value *, 6> Ops;
+ Ops.push_back(Builder.CreateBitCast(BaseAddr, EltPtrTy));
+ for (auto S : Shuffles)
+ Ops.push_back(S);
+ for (unsigned F = 0; F < Factor; F++) {
+ Ops.push_back(Builder.getInt32(F));
+ Builder.CreateCall(VstNFunc, Ops);
+ Ops.pop_back();
+ }
+ }
+ };
for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
// If we are generating more than one store, we compute the base address of
@@ -16895,17 +17277,13 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getVectorElementType(),
BaseAddr, LaneLen * Factor);
- SmallVector<Value *, 6> Ops;
- Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
-
- Function *VstNFunc =
- Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys);
+ SmallVector<Value *, 4> Shuffles;
// Split the shufflevector operands into sub vectors for the new vstN call.
for (unsigned i = 0; i < Factor; i++) {
unsigned IdxI = StoreCount * LaneLen * Factor + i;
if (Mask[IdxI] >= 0) {
- Ops.push_back(Builder.CreateShuffleVector(
+ Shuffles.push_back(Builder.CreateShuffleVector(
Op0, Op1, createSequentialMask(Builder, Mask[IdxI], LaneLen, 0)));
} else {
unsigned StartMask = 0;
@@ -16922,13 +17300,12 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
// In the case of all undefs we're defaulting to using elems from 0
// Note: StartMask cannot be negative, it's checked in
// isReInterleaveMask
- Ops.push_back(Builder.CreateShuffleVector(
+ Shuffles.push_back(Builder.CreateShuffleVector(
Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0)));
}
}
- Ops.push_back(Builder.getInt32(SI->getAlignment()));
- Builder.CreateCall(VstNFunc, Ops);
+ createStoreIntrinsic(BaseAddr, Shuffles);
}
return true;
}