Diffstat (limited to 'llvm/lib/Target/ARM/ARMISelLowering.cpp')
-rw-r--r-- llvm/lib/Target/ARM/ARMISelLowering.cpp | 475
1 file changed, 346 insertions(+), 129 deletions(-)
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 743cca9ff71f..2e78b52d0993 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -110,6 +110,7 @@
#include <cstdlib>
#include <iterator>
#include <limits>
+#include <optional>
#include <string>
#include <tuple>
#include <utility>
@@ -1370,7 +1371,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
// instructions. (ARMv6 doesn't have dmb, but it has an equivalent
// encoding; see ARMISD::MEMBARRIER_MCR.)
setMaxAtomicSizeInBitsSupported(64);
- } else if (Subtarget->isMClass() && Subtarget->hasV8MBaselineOps()) {
+ } else if ((Subtarget->isMClass() && Subtarget->hasV8MBaselineOps()) ||
+ Subtarget->hasForced32BitAtomics()) {
// Cortex-M (besides Cortex-M0) have 32-bit atomics.
setMaxAtomicSizeInBitsSupported(32);
} else {
@@ -1379,6 +1381,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setMaxAtomicSizeInBitsSupported(0);
}
+ setMaxDivRemBitWidthSupported(64);
+
setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
// Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes.
@@ -1393,7 +1397,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
// Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR
// iff target supports vfp2.
setOperationAction(ISD::BITCAST, MVT::i64, Custom);
- setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
+ setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom);
setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
}
@@ -2627,11 +2631,11 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
const TargetMachine &TM = getTargetMachine();
const Module *Mod = MF.getFunction().getParent();
- const GlobalValue *GV = nullptr;
+ const GlobalValue *GVal = nullptr;
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
- GV = G->getGlobal();
+ GVal = G->getGlobal();
bool isStub =
- !TM.shouldAssumeDSOLocal(*Mod, GV) && Subtarget->isTargetMachO();
+ !TM.shouldAssumeDSOLocal(*Mod, GVal) && Subtarget->isTargetMachO();
bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass());
bool isLocalARMFunc = false;
@@ -2644,36 +2648,58 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// those, the target's already in a register, so we don't need to do
// anything extra.
if (isa<GlobalAddressSDNode>(Callee)) {
- // Create a constant pool entry for the callee address
- unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
- ARMConstantPoolValue *CPV =
- ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 0);
+ // When generating execute-only code we use movw movt pair.
+ // Currently execute-only is only available for architectures that
+ // support movw movt, so we are safe to assume that.
+ if (Subtarget->genExecuteOnly()) {
+ assert(Subtarget->useMovt() &&
+ "long-calls with execute-only requires movt and movw!");
+ ++NumMovwMovt;
+ Callee = DAG.getNode(ARMISD::Wrapper, dl, PtrVt,
+ DAG.getTargetGlobalAddress(GVal, dl, PtrVt));
+ } else {
+ // Create a constant pool entry for the callee address
+ unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
+ ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(
+ GVal, ARMPCLabelIndex, ARMCP::CPValue, 0);
- // Get the address of the callee into a register
- SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
- CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
- Callee = DAG.getLoad(
- PtrVt, dl, DAG.getEntryNode(), CPAddr,
- MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
+ // Get the address of the callee into a register
+ SDValue Addr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
+ Addr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Addr);
+ Callee = DAG.getLoad(
+ PtrVt, dl, DAG.getEntryNode(), Addr,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
+ }
} else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) {
const char *Sym = S->getSymbol();
- // Create a constant pool entry for the callee address
- unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
- ARMConstantPoolValue *CPV =
- ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
- ARMPCLabelIndex, 0);
- // Get the address of the callee into a register
- SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
- CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
- Callee = DAG.getLoad(
- PtrVt, dl, DAG.getEntryNode(), CPAddr,
- MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
+ // When generating execute-only code we use movw movt pair.
+ // Currently execute-only is only available for architectures that
+ // support movw movt, so we are safe to assume that.
+ if (Subtarget->genExecuteOnly()) {
+ assert(Subtarget->useMovt() &&
+ "long-calls with execute-only requires movt and movw!");
+ ++NumMovwMovt;
+ Callee = DAG.getNode(ARMISD::Wrapper, dl, PtrVt,
+ DAG.getTargetGlobalAddress(GVal, dl, PtrVt));
+ } else {
+ // Create a constant pool entry for the callee address
+ unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
+ ARMConstantPoolValue *CPV = ARMConstantPoolSymbol::Create(
+ *DAG.getContext(), Sym, ARMPCLabelIndex, 0);
+
+ // Get the address of the callee into a register
+ SDValue Addr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
+ Addr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Addr);
+ Callee = DAG.getLoad(
+ PtrVt, dl, DAG.getEntryNode(), Addr,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
+ }
}
} else if (isa<GlobalAddressSDNode>(Callee)) {
if (!PreferIndirect) {
isDirect = true;
- bool isDef = GV->isStrongDefinitionForLinker();
+ bool isDef = GVal->isStrongDefinitionForLinker();
// ARM call to a local ARM function is predicable.
isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking);
@@ -2682,7 +2708,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?");
Callee = DAG.getNode(
ARMISD::WrapperPIC, dl, PtrVt,
- DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, ARMII::MO_NONLAZY));
+ DAG.getTargetGlobalAddress(GVal, dl, PtrVt, 0, ARMII::MO_NONLAZY));
Callee = DAG.getLoad(
PtrVt, dl, DAG.getEntryNode(), Callee,
MachinePointerInfo::getGOT(DAG.getMachineFunction()), MaybeAlign(),
@@ -2692,11 +2718,11 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
assert(Subtarget->isTargetWindows() &&
"Windows is the only supported COFF target");
unsigned TargetFlags = ARMII::MO_NO_FLAG;
- if (GV->hasDLLImportStorageClass())
+ if (GVal->hasDLLImportStorageClass())
TargetFlags = ARMII::MO_DLLIMPORT;
- else if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
+ else if (!TM.shouldAssumeDSOLocal(*GVal->getParent(), GVal))
TargetFlags = ARMII::MO_COFFSTUB;
- Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, /*offset=*/0,
+ Callee = DAG.getTargetGlobalAddress(GVal, dl, PtrVt, /*offset=*/0,
TargetFlags);
if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))
Callee =
@@ -2704,7 +2730,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee),
MachinePointerInfo::getGOT(DAG.getMachineFunction()));
} else {
- Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, 0);
+ Callee = DAG.getTargetGlobalAddress(GVal, dl, PtrVt, 0, 0);
}
}
} else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
@@ -2775,8 +2801,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// we've carefully laid out the parameters so that when sp is reset they'll be
// in the correct location.
if (isTailCall && !isSibCall) {
- Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
- DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
+ Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InFlag, dl);
InFlag = Chain.getValue(1);
}
@@ -2837,9 +2862,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
uint64_t CalleePopBytes =
canGuaranteeTCO(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : -1ULL;
- Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
- DAG.getIntPtrConstant(CalleePopBytes, dl, true),
- InFlag, dl);
+ Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InFlag, dl);
if (!Ins.empty())
InFlag = Chain.getValue(1);
@@ -2915,7 +2938,7 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
int FI = std::numeric_limits<int>::max();
if (Arg.getOpcode() == ISD::CopyFromReg) {
Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
- if (!Register::isVirtualRegister(VR))
+ if (!VR.isVirtual())
return false;
MachineInstr *Def = MRI->getVRegDef(VR);
if (!Def)
@@ -3444,11 +3467,16 @@ SDValue ARMTargetLowering::LowerConstantPool(SDValue Op,
return LowerGlobalAddress(GA, DAG);
}
+ // The 16-bit ADR instruction can only encode offsets that are multiples of 4,
+ // so we need to align to at least 4 bytes when we don't have 32-bit ADR.
+ Align CPAlign = CP->getAlign();
+ if (Subtarget->isThumb1Only())
+ CPAlign = std::max(CPAlign, Align(4));
if (CP->isMachineConstantPoolEntry())
Res =
- DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, CP->getAlign());
+ DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, CPAlign);
else
- Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlign());
+ Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CPAlign);
return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res);
}
@@ -4393,7 +4421,7 @@ void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
bool ARMTargetLowering::splitValueIntoRegisterParts(
SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
- unsigned NumParts, MVT PartVT, Optional<CallingConv::ID> CC) const {
+ unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
bool IsABIRegCopy = CC.has_value();
EVT ValueVT = Val.getValueType();
if (IsABIRegCopy && (ValueVT == MVT::f16 || ValueVT == MVT::bf16) &&
@@ -4411,7 +4439,7 @@ bool ARMTargetLowering::splitValueIntoRegisterParts(
SDValue ARMTargetLowering::joinRegisterPartsIntoValue(
SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts,
- MVT PartVT, EVT ValueVT, Optional<CallingConv::ID> CC) const {
+ MVT PartVT, EVT ValueVT, std::optional<CallingConv::ID> CC) const {
bool IsABIRegCopy = CC.has_value();
if (IsABIRegCopy && (ValueVT == MVT::f16 || ValueVT == MVT::bf16) &&
PartVT == MVT::f32) {
@@ -4480,7 +4508,7 @@ SDValue ARMTargetLowering::LowerFormalArguments(
int lastInsIndex = -1;
if (isVarArg && MFI.hasVAStart()) {
unsigned RegIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
- if (RegIdx != array_lengthof(GPRArgRegs))
+ if (RegIdx != std::size(GPRArgRegs))
ArgRegBegin = std::min(ArgRegBegin, (unsigned)GPRArgRegs[RegIdx]);
}
@@ -6344,8 +6372,8 @@ SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op,
return DAG.getMergeValues(Ops, dl);
}
-SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
- SelectionDAG &DAG) const {
+SDValue ARMTargetLowering::LowerGET_ROUNDING(SDValue Op,
+ SelectionDAG &DAG) const {
// The rounding mode is in bits 23:22 of the FPSCR.
// The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
// The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3)
@@ -6738,23 +6766,23 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG,
if (ST->hasMVEFloatOps()) {
Opc = ARMCC::NE; break;
} else {
- Invert = true; LLVM_FALLTHROUGH;
+ Invert = true; [[fallthrough]];
}
case ISD::SETOEQ:
case ISD::SETEQ: Opc = ARMCC::EQ; break;
case ISD::SETOLT:
- case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH;
+ case ISD::SETLT: Swap = true; [[fallthrough]];
case ISD::SETOGT:
case ISD::SETGT: Opc = ARMCC::GT; break;
case ISD::SETOLE:
- case ISD::SETLE: Swap = true; LLVM_FALLTHROUGH;
+ case ISD::SETLE: Swap = true; [[fallthrough]];
case ISD::SETOGE:
case ISD::SETGE: Opc = ARMCC::GE; break;
- case ISD::SETUGE: Swap = true; LLVM_FALLTHROUGH;
+ case ISD::SETUGE: Swap = true; [[fallthrough]];
case ISD::SETULE: Invert = true; Opc = ARMCC::GT; break;
- case ISD::SETUGT: Swap = true; LLVM_FALLTHROUGH;
+ case ISD::SETUGT: Swap = true; [[fallthrough]];
case ISD::SETULT: Invert = true; Opc = ARMCC::GE; break;
- case ISD::SETUEQ: Invert = true; LLVM_FALLTHROUGH;
+ case ISD::SETUEQ: Invert = true; [[fallthrough]];
case ISD::SETONE: {
// Expand this to (OLT | OGT).
SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0,
@@ -6766,7 +6794,7 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG,
Result = DAG.getNOT(dl, Result, VT);
return Result;
}
- case ISD::SETUO: Invert = true; LLVM_FALLTHROUGH;
+ case ISD::SETUO: Invert = true; [[fallthrough]];
case ISD::SETO: {
// Expand this to (OLT | OGE).
SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0,
@@ -6787,16 +6815,16 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG,
if (ST->hasMVEIntegerOps()) {
Opc = ARMCC::NE; break;
} else {
- Invert = true; LLVM_FALLTHROUGH;
+ Invert = true; [[fallthrough]];
}
case ISD::SETEQ: Opc = ARMCC::EQ; break;
- case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH;
+ case ISD::SETLT: Swap = true; [[fallthrough]];
case ISD::SETGT: Opc = ARMCC::GT; break;
- case ISD::SETLE: Swap = true; LLVM_FALLTHROUGH;
+ case ISD::SETLE: Swap = true; [[fallthrough]];
case ISD::SETGE: Opc = ARMCC::GE; break;
- case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
+ case ISD::SETULT: Swap = true; [[fallthrough]];
case ISD::SETUGT: Opc = ARMCC::HI; break;
- case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
+ case ISD::SETULE: Swap = true; [[fallthrough]];
case ISD::SETUGE: Opc = ARMCC::HS; break;
}
@@ -6828,25 +6856,25 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG,
// If one of the operands is a constant vector zero, attempt to fold the
// comparison to a specialized compare-against-zero form.
- SDValue SingleOp;
- if (ISD::isBuildVectorAllZeros(Op1.getNode()))
- SingleOp = Op0;
- else if (ISD::isBuildVectorAllZeros(Op0.getNode())) {
+ if (ISD::isBuildVectorAllZeros(Op0.getNode()) &&
+ (Opc == ARMCC::GE || Opc == ARMCC::GT || Opc == ARMCC::EQ ||
+ Opc == ARMCC::NE)) {
if (Opc == ARMCC::GE)
Opc = ARMCC::LE;
else if (Opc == ARMCC::GT)
Opc = ARMCC::LT;
- SingleOp = Op1;
+ std::swap(Op0, Op1);
}
SDValue Result;
- if (SingleOp.getNode()) {
- Result = DAG.getNode(ARMISD::VCMPZ, dl, CmpVT, SingleOp,
+ if (ISD::isBuildVectorAllZeros(Op1.getNode()) &&
+ (Opc == ARMCC::GE || Opc == ARMCC::GT || Opc == ARMCC::LE ||
+ Opc == ARMCC::LT || Opc == ARMCC::NE || Opc == ARMCC::EQ))
+ Result = DAG.getNode(ARMISD::VCMPZ, dl, CmpVT, Op0,
DAG.getConstant(Opc, dl, MVT::i32));
- } else {
+ else
Result = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
DAG.getConstant(Opc, dl, MVT::i32));
- }
Result = DAG.getSExtOrTrunc(Result, dl, VT);
@@ -7485,6 +7513,28 @@ static bool isReverseMask(ArrayRef<int> M, EVT VT) {
return true;
}
+static bool isTruncMask(ArrayRef<int> M, EVT VT, bool Top, bool SingleSource) {
+ unsigned NumElts = VT.getVectorNumElements();
+ // Make sure the mask has the right size.
+ if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8))
+ return false;
+
+ // Half-width truncation patterns (e.g. v4i32 -> v8i16):
+ // !Top && SingleSource: <0, 2, 4, 6, 0, 2, 4, 6>
+ // !Top && !SingleSource: <0, 2, 4, 6, 8, 10, 12, 14>
+ // Top && SingleSource: <1, 3, 5, 7, 1, 3, 5, 7>
+ // Top && !SingleSource: <1, 3, 5, 7, 9, 11, 13, 15>
+ int Ofs = Top ? 1 : 0;
+ int Upper = SingleSource ? 0 : NumElts;
+ for (int i = 0, e = NumElts / 2; i != e; ++i) {
+ if (M[i] >= 0 && M[i] != (i * 2) + Ofs)
+ return false;
+ if (M[i + e] >= 0 && M[i + e] != (i * 2) + Ofs + Upper)
+ return false;
+ }
+ return true;
+}
+
static bool isVMOVNMask(ArrayRef<int> M, EVT VT, bool Top, bool SingleSource) {
unsigned NumElts = VT.getVectorNumElements();
// Make sure the mask has the right size.
@@ -7678,10 +7728,9 @@ static SDValue LowerBUILD_VECTOR_i1(SDValue Op, SelectionDAG &DAG,
// extend that single value
SDValue FirstOp = Op.getOperand(0);
if (!isa<ConstantSDNode>(FirstOp) &&
- std::all_of(std::next(Op->op_begin()), Op->op_end(),
- [&FirstOp](SDUse &U) {
- return U.get().isUndef() || U.get() == FirstOp;
- })) {
+ llvm::all_of(llvm::drop_begin(Op->ops()), [&FirstOp](const SDUse &U) {
+ return U.get().isUndef() || U.get() == FirstOp;
+ })) {
SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32, FirstOp,
DAG.getValueType(MVT::i1));
return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), Ext);
@@ -8009,12 +8058,11 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElts);
EVT ExtVT = VT.getVectorElementType();
EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElts / 2);
- SDValue Lower =
- DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElts / 2));
+ SDValue Lower = DAG.getBuildVector(HVT, dl, ArrayRef(&Ops[0], NumElts / 2));
if (Lower.getOpcode() == ISD::BUILD_VECTOR)
Lower = LowerBUILD_VECTOR(Lower, DAG, ST);
- SDValue Upper = DAG.getBuildVector(
- HVT, dl, makeArrayRef(&Ops[NumElts / 2], NumElts / 2));
+ SDValue Upper =
+ DAG.getBuildVector(HVT, dl, ArrayRef(&Ops[NumElts / 2], NumElts / 2));
if (Upper.getOpcode() == ISD::BUILD_VECTOR)
Upper = LowerBUILD_VECTOR(Upper, DAG, ST);
if (Lower && Upper)
@@ -8339,6 +8387,11 @@ bool ARMTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
(isVMOVNMask(M, VT, true, false) ||
isVMOVNMask(M, VT, false, false) || isVMOVNMask(M, VT, true, true)))
return true;
+ else if (Subtarget->hasMVEIntegerOps() &&
+ (isTruncMask(M, VT, false, false) ||
+ isTruncMask(M, VT, false, true) ||
+ isTruncMask(M, VT, true, false) || isTruncMask(M, VT, true, true)))
+ return true;
else
return false;
}
@@ -8367,15 +8420,13 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
default: llvm_unreachable("Unknown shuffle opcode!");
case OP_VREV:
// VREV divides the vector in half and swaps within the half.
- if (VT.getVectorElementType() == MVT::i32 ||
- VT.getVectorElementType() == MVT::f32)
+ if (VT.getScalarSizeInBits() == 32)
return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS);
// vrev <4 x i16> -> VREV32
- if (VT.getVectorElementType() == MVT::i16 ||
- VT.getVectorElementType() == MVT::f16)
+ if (VT.getScalarSizeInBits() == 16)
return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS);
// vrev <4 x i8> -> VREV16
- assert(VT.getVectorElementType() == MVT::i8);
+ assert(VT.getScalarSizeInBits() == 8);
return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS);
case OP_VDUP0:
case OP_VDUP1:
@@ -8503,6 +8554,7 @@ static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG,
"No support for vector shuffle of boolean predicates");
SDValue V1 = Op.getOperand(0);
+ SDValue V2 = Op.getOperand(1);
SDLoc dl(Op);
if (isReverseMask(ShuffleMask, VT)) {
SDValue cast = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, V1);
@@ -8520,12 +8572,16 @@ static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG,
// many cases the generated code might be even better than scalar code
// operating on bits. Just imagine trying to shuffle 8 arbitrary 2-bit
// fields in a register into 8 other arbitrary 2-bit fields!
- SDValue PredAsVector = PromoteMVEPredVector(dl, V1, VT, DAG);
- EVT NewVT = PredAsVector.getValueType();
+ SDValue PredAsVector1 = PromoteMVEPredVector(dl, V1, VT, DAG);
+ EVT NewVT = PredAsVector1.getValueType();
+ SDValue PredAsVector2 = V2.isUndef() ? DAG.getUNDEF(NewVT)
+ : PromoteMVEPredVector(dl, V2, VT, DAG);
+ assert(PredAsVector2.getValueType() == NewVT &&
+ "Expected identical vector type in expanded i1 shuffle!");
// Do the shuffle!
- SDValue Shuffled = DAG.getVectorShuffle(NewVT, dl, PredAsVector,
- DAG.getUNDEF(NewVT), ShuffleMask);
+ SDValue Shuffled = DAG.getVectorShuffle(NewVT, dl, PredAsVector1,
+ PredAsVector2, ShuffleMask);
// Now return the result of comparing the shuffled vector with zero,
// which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. For a v2i1
@@ -8813,10 +8869,29 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
}
}
- if (ST->hasMVEIntegerOps() && EltSize <= 32)
+ if (ST->hasMVEIntegerOps() && EltSize <= 32) {
if (SDValue V = LowerVECTOR_SHUFFLEUsingOneOff(Op, ShuffleMask, DAG))
return V;
+ for (bool Top : {false, true}) {
+ for (bool SingleSource : {false, true}) {
+ if (isTruncMask(ShuffleMask, VT, Top, SingleSource)) {
+ MVT FromSVT = MVT::getIntegerVT(EltSize * 2);
+ MVT FromVT = MVT::getVectorVT(FromSVT, ShuffleMask.size() / 2);
+ SDValue Lo = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, FromVT, V1);
+ SDValue Hi = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, FromVT,
+ SingleSource ? V1 : V2);
+ if (Top) {
+ SDValue Amt = DAG.getConstant(EltSize, dl, FromVT);
+ Lo = DAG.getNode(ISD::SRL, dl, FromVT, Lo, Amt);
+ Hi = DAG.getNode(ISD::SRL, dl, FromVT, Hi, Amt);
+ }
+ return DAG.getNode(ARMISD::MVETRUNC, dl, VT, Lo, Hi);
+ }
+ }
+ }
+ }
+
// If the shuffle is not directly supported and it has 4 elements, use
// the PerfectShuffle-generated table to synthesize it from other shuffles.
unsigned NumElts = VT.getVectorNumElements();
@@ -9015,7 +9090,7 @@ static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG,
// Extract the vector elements from Op1 and Op2 one by one and truncate them
// to be the right size for the destination. For example, if Op1 is v4i1
- // then the promoted vector is v4i32. The result of concatentation gives a
+ // then the promoted vector is v4i32. The result of concatenation gives a
// v8i1, which when promoted is v8i16. That means each i32 element from Op1
// needs truncating to i16 and inserting in the result.
EVT ConcatVT = MVT::getVectorVT(ElType, NumElts);
@@ -10391,7 +10466,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::TRUNCATE: return LowerTruncate(Op.getNode(), DAG, Subtarget);
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND: return LowerVectorExtend(Op.getNode(), DAG, Subtarget);
- case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
+ case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG);
case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
case ISD::MUL: return LowerMUL(Op, DAG);
case ISD::SDIV:
@@ -12214,7 +12289,7 @@ void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
// Any ARM instruction that sets the 's' bit should specify an optional
// "cc_out" operand in the last operand position.
- if (!MI.hasOptionalDef() || !MCID->OpInfo[ccOutIdx].isOptionalDef()) {
+ if (!MI.hasOptionalDef() || !MCID->operands()[ccOutIdx].isOptionalDef()) {
assert(!NewOpc && "Optional cc_out operand required");
return;
}
@@ -12300,7 +12375,7 @@ static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes,
// (zext cc) can never be the all ones value.
if (AllOnes)
return false;
- LLVM_FALLTHROUGH;
+ [[fallthrough]];
case ISD::SIGN_EXTEND: {
SDLoc dl(N);
EVT VT = N->getValueType(0);
@@ -13706,7 +13781,7 @@ static SDValue PerformSHLSimplify(SDNode *N,
return SDValue();
// Check that all the users could perform the shl themselves.
- for (auto U : N->uses()) {
+ for (auto *U : N->uses()) {
switch(U->getOpcode()) {
default:
return SDValue();
@@ -13748,10 +13823,13 @@ static SDValue PerformSHLSimplify(SDNode *N,
APInt C2Int = C2->getAPIntValue();
APInt C1Int = C1ShlC2->getAPIntValue();
+ unsigned C2Width = C2Int.getBitWidth();
+ if (C2Int.uge(C2Width))
+ return SDValue();
+ uint64_t C2Value = C2Int.getZExtValue();
// Check that performing a lshr will not lose any information.
- APInt Mask = APInt::getHighBitsSet(C2Int.getBitWidth(),
- C2Int.getBitWidth() - C2->getZExtValue());
+ APInt Mask = APInt::getHighBitsSet(C2Width, C2Width - C2Value);
if ((C1Int & Mask) != C1Int)
return SDValue();
@@ -14676,7 +14754,7 @@ static SDValue PerformBFICombine(SDNode *N, SelectionDAG &DAG) {
return SDValue();
unsigned InvMask = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
unsigned LSB = countTrailingZeros(~InvMask);
- unsigned Width = (32 - countLeadingZeros(~InvMask)) - LSB;
+ unsigned Width = llvm::bit_width<unsigned>(~InvMask) - LSB;
assert(Width <
static_cast<unsigned>(std::numeric_limits<unsigned>::digits) &&
"undefined behavior");
@@ -15823,7 +15901,7 @@ static bool TryCombineBaseUpdate(struct BaseUpdateTarget &Target,
Tys[n] = AlignedVecTy;
Tys[n++] = MVT::i32;
Tys[n] = MVT::Other;
- SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2));
+ SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));
// Then, gather the new node's operands.
SmallVector<SDValue, 8> Ops;
@@ -16134,7 +16212,7 @@ static SDValue PerformMVEVLDCombine(SDNode *N,
Tys[n] = VecTy;
Tys[n++] = MVT::i32;
Tys[n] = MVT::Other;
- SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2));
+ SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));
// Then, gather the new node's operands.
SmallVector<SDValue, 8> Ops;
@@ -16215,7 +16293,7 @@ static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
for (n = 0; n < NumVecs; ++n)
Tys[n] = VT;
Tys[n] = MVT::Other;
- SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumVecs+1));
+ SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumVecs + 1));
SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD);
SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys,
@@ -16512,7 +16590,7 @@ static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St,
SDValue Store = DAG.getTruncStore(
Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
- NewToVT, Alignment.value(), MMOFlags, AAInfo);
+ NewToVT, Alignment, MMOFlags, AAInfo);
Stores.push_back(Store);
}
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
@@ -16553,7 +16631,7 @@ static SDValue PerformSplittingMVETruncToNarrowingStores(StoreSDNode *St,
SDValue Extract = Trunc.getOperand(i);
SDValue Store = DAG.getTruncStore(
Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
- NewToVT, Alignment.value(), MMOFlags, AAInfo);
+ NewToVT, Alignment, MMOFlags, AAInfo);
Stores.push_back(Store);
}
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
@@ -16589,8 +16667,8 @@ static SDValue PerformExtractFpToIntStores(StoreSDNode *St, SelectionDAG &DAG) {
AAMDNodes AAInfo = St->getAAInfo();
EVT NewToVT = EVT::getIntegerVT(C, VT.getSizeInBits());
SDValue Store = DAG.getTruncStore(Ch, DL, SDValue(GetLane, 0), BasePtr,
- St->getPointerInfo(), NewToVT,
- Alignment.value(), MMOFlags, AAInfo);
+ St->getPointerInfo(), NewToVT, Alignment,
+ MMOFlags, AAInfo);
return Store;
}
@@ -18333,7 +18411,7 @@ SDValue ARMTargetLowering::PerformMVETruncCombine(
if (S0->getOperand(0) == S1->getOperand(0) &&
S0->getOperand(1) == S1->getOperand(1)) {
// Construct complete shuffle mask
- SmallVector<int, 8> Mask(S0->getMask().begin(), S0->getMask().end());
+ SmallVector<int, 8> Mask(S0->getMask());
Mask.append(S1->getMask().begin(), S1->getMask().end());
if (isVMOVNTruncMask(Mask, VT, false))
@@ -18793,7 +18871,7 @@ bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc,
bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned,
Align Alignment,
MachineMemOperand::Flags,
- bool *Fast) const {
+ unsigned *Fast) const {
// Depends what it gets converted into if the type is weird.
if (!VT.isSimple())
return false;
@@ -18817,7 +18895,7 @@ bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned,
// A big-endian target may also explicitly support unaligned accesses
if (Subtarget->hasNEON() && (AllowsUnaligned || Subtarget->isLittle())) {
if (Fast)
- *Fast = true;
+ *Fast = 1;
return true;
}
}
@@ -18829,7 +18907,7 @@ bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned,
if ((Ty == MVT::v16i1 || Ty == MVT::v8i1 || Ty == MVT::v4i1 ||
Ty == MVT::v2i1)) {
if (Fast)
- *Fast = true;
+ *Fast = 1;
return true;
}
@@ -18855,7 +18933,7 @@ bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned,
Ty == MVT::v4i32 || Ty == MVT::v4f32 || Ty == MVT::v2i64 ||
Ty == MVT::v2f64) {
if (Fast)
- *Fast = true;
+ *Fast = 1;
return true;
}
@@ -18868,7 +18946,7 @@ EVT ARMTargetLowering::getOptimalMemOpType(
// See if we can use NEON instructions for this...
if ((Op.isMemcpy() || Op.isZeroMemset()) && Subtarget->hasNEON() &&
!FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat)) {
- bool Fast;
+ unsigned Fast;
if (Op.size() >= 16 &&
(Op.isAligned(Align(16)) ||
(allowsMisalignedMemoryAccesses(MVT::v2f64, 0, Align(1),
@@ -19147,18 +19225,6 @@ bool ARMTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
return true;
}
-InstructionCost ARMTargetLowering::getScalingFactorCost(const DataLayout &DL,
- const AddrMode &AM,
- Type *Ty,
- unsigned AS) const {
- if (isLegalAddressingMode(DL, AM, Ty, AS)) {
- if (Subtarget->hasFPAO())
- return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster
- return 0;
- }
- return -1;
-}
-
/// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
/// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
/// expanded to FMAs when this method returns true, otherwise fmuladd is
@@ -20151,6 +20217,8 @@ RCPair ARMTargetLowering::getRegForInlineAsmConstraint(
case 'w':
if (VT == MVT::Other)
break;
+ if (VT == MVT::f16 || VT == MVT::bf16)
+ return RCPair(0U, &ARM::HPRRegClass);
if (VT == MVT::f32)
return RCPair(0U, &ARM::SPRRegClass);
if (VT.getSizeInBits() == 64)
@@ -20171,6 +20239,8 @@ RCPair ARMTargetLowering::getRegForInlineAsmConstraint(
case 't':
if (VT == MVT::Other)
break;
+ if (VT == MVT::f16 || VT == MVT::bf16)
+ return RCPair(0U, &ARM::HPRRegClass);
if (VT == MVT::f32 || VT == MVT::i32)
return RCPair(0U, &ARM::SPRRegClass);
if (VT.getSizeInBits() == 64)
@@ -20422,9 +20492,22 @@ SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
"Invalid opcode for Div/Rem lowering");
bool isSigned = (Opcode == ISD::SDIVREM);
EVT VT = Op->getValueType(0);
- Type *Ty = VT.getTypeForEVT(*DAG.getContext());
SDLoc dl(Op);
+ if (VT == MVT::i64 && isa<ConstantSDNode>(Op.getOperand(1))) {
+ SmallVector<SDValue> Result;
+ if (expandDIVREMByConstant(Op.getNode(), Result, MVT::i32, DAG)) {
+ SDValue Res0 =
+ DAG.getNode(ISD::BUILD_PAIR, dl, VT, Result[0], Result[1]);
+ SDValue Res1 =
+ DAG.getNode(ISD::BUILD_PAIR, dl, VT, Result[2], Result[3]);
+ return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
+ {Res0, Res1});
+ }
+ }
+
+ Type *Ty = VT.getTypeForEVT(*DAG.getContext());
+
// If the target has hardware divide, use divide + multiply + subtract:
// div = a / b
// rem = a - b * div
@@ -20473,11 +20556,20 @@ SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
// Lowers REM using divmod helpers
// see RTABI section 4.2/4.3
SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const {
+ EVT VT = N->getValueType(0);
+
+ if (VT == MVT::i64 && isa<ConstantSDNode>(N->getOperand(1))) {
+ SmallVector<SDValue> Result;
+ if (expandDIVREMByConstant(N, Result, MVT::i32, DAG))
+ return DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), N->getValueType(0),
+ Result[0], Result[1]);
+ }
+
// Build return types (div and rem)
std::vector<Type*> RetTyParams;
Type *RetTyElement;
- switch (N->getValueType(0).getSimpleVT().SimpleTy) {
+ switch (VT.getSimpleVT().SimpleTy) {
default: llvm_unreachable("Unexpected request for libcall!");
case MVT::i8: RetTyElement = Type::getInt8Ty(*DAG.getContext()); break;
case MVT::i16: RetTyElement = Type::getInt16Ty(*DAG.getContext()); break;
@@ -20978,7 +21070,7 @@ Instruction *ARMTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
case AtomicOrdering::SequentiallyConsistent:
if (!Inst->hasAtomicStore())
return nullptr; // Nothing to do
- LLVM_FALLTHROUGH;
+ [[fallthrough]];
case AtomicOrdering::Release:
case AtomicOrdering::AcquireRelease:
if (Subtarget->preferISHSTBarriers())
@@ -21105,7 +21197,10 @@ bool ARMTargetLowering::shouldInsertFencesForAtomic(
return InsertFencesForAtomic;
}
-bool ARMTargetLowering::useLoadStackGuardNode() const { return true; }
+bool ARMTargetLowering::useLoadStackGuardNode() const {
+ // ROPI/RWPI are not supported currently.
+ return !Subtarget->isROPI() && !Subtarget->isRWPI();
+}
void ARMTargetLowering::insertSSPDeclarations(Module &M) const {
if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
@@ -21156,7 +21251,7 @@ bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
return false;
assert(VectorTy->isVectorTy() && "VectorTy is not a vector type");
- unsigned BitWidth = VectorTy->getPrimitiveSizeInBits().getFixedSize();
+ unsigned BitWidth = VectorTy->getPrimitiveSizeInBits().getFixedValue();
// We can do a store + vector extract on any vector that fits perfectly in a D
// or Q register.
if (BitWidth == 64 || BitWidth == 128) {
@@ -21166,16 +21261,36 @@ bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
return false;
}
-bool ARMTargetLowering::isCheapToSpeculateCttz() const {
+bool ARMTargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
return Subtarget->hasV6T2Ops();
}
-bool ARMTargetLowering::isCheapToSpeculateCtlz() const {
+bool ARMTargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
return Subtarget->hasV6T2Ops();
}
-bool ARMTargetLowering::shouldExpandShift(SelectionDAG &DAG, SDNode *N) const {
- return !Subtarget->hasMinSize() || Subtarget->isTargetWindows();
+bool ARMTargetLowering::isMaskAndCmp0FoldingBeneficial(
+ const Instruction &AndI) const {
+ if (!Subtarget->hasV7Ops())
+ return false;
+
+ // Sink the `and` instruction only if the mask would fit into a modified
+ // immediate operand.
+ ConstantInt *Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
+ if (!Mask || Mask->getValue().getBitWidth() > 32u)
+ return false;
+ auto MaskVal = unsigned(Mask->getValue().getZExtValue());
+ return (Subtarget->isThumb2() ? ARM_AM::getT2SOImmVal(MaskVal)
+ : ARM_AM::getSOImmVal(MaskVal)) != -1;
+}
+
+TargetLowering::ShiftLegalizationStrategy
+ARMTargetLowering::preferredShiftLegalizationStrategy(
+ SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
+ if (Subtarget->hasMinSize() && !Subtarget->isTargetWindows())
+ return ShiftLegalizationStrategy::LowerToLibcall;
+ return TargetLowering::preferredShiftLegalizationStrategy(DAG, N,
+ ExpansionFactor);
}
Value *ARMTargetLowering::emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy,
@@ -21661,11 +21776,11 @@ static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base,
case HA_DOUBLE:
return false;
case HA_VECT64:
- return VT->getPrimitiveSizeInBits().getFixedSize() == 64;
+ return VT->getPrimitiveSizeInBits().getFixedValue() == 64;
case HA_VECT128:
- return VT->getPrimitiveSizeInBits().getFixedSize() == 128;
+ return VT->getPrimitiveSizeInBits().getFixedValue() == 128;
case HA_UNKNOWN:
- switch (VT->getPrimitiveSizeInBits().getFixedSize()) {
+ switch (VT->getPrimitiveSizeInBits().getFixedValue()) {
case 64:
Base = HA_VECT64;
return true;
@@ -21777,3 +21892,105 @@ void ARMTargetLowering::finalizeLowering(MachineFunction &MF) const {
MF.getFrameInfo().computeMaxCallFrameSize(MF);
TargetLoweringBase::finalizeLowering(MF);
}
+
+bool ARMTargetLowering::isComplexDeinterleavingSupported() const {
+ return Subtarget->hasMVEIntegerOps();
+}
+
+bool ARMTargetLowering::isComplexDeinterleavingOperationSupported(
+ ComplexDeinterleavingOperation Operation, Type *Ty) const {
+ auto *VTy = dyn_cast<FixedVectorType>(Ty);
+ if (!VTy)
+ return false;
+
+ auto *ScalarTy = VTy->getScalarType();
+ unsigned NumElements = VTy->getNumElements();
+
+ unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements;
+ if (VTyWidth < 128 || !llvm::isPowerOf2_32(VTyWidth))
+ return false;
+
+ // Both VCADD and VCMUL/VCMLA support the same types, F16 and F32
+ if (ScalarTy->isHalfTy() || ScalarTy->isFloatTy())
+ return Subtarget->hasMVEFloatOps();
+
+ if (Operation != ComplexDeinterleavingOperation::CAdd)
+ return false;
+
+ return Subtarget->hasMVEIntegerOps() &&
+ (ScalarTy->isIntegerTy(8) || ScalarTy->isIntegerTy(16) ||
+ ScalarTy->isIntegerTy(32));
+}
+
+Value *ARMTargetLowering::createComplexDeinterleavingIR(
+ Instruction *I, ComplexDeinterleavingOperation OperationType,
+ ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
+ Value *Accumulator) const {
+
+ FixedVectorType *Ty = cast<FixedVectorType>(InputA->getType());
+
+ IRBuilder<> B(I);
+
+ unsigned TyWidth = Ty->getScalarSizeInBits() * Ty->getNumElements();
+
+ assert(TyWidth >= 128 && "Width of vector type must be at least 128 bits");
+
+ if (TyWidth > 128) {
+ int Stride = Ty->getNumElements() / 2;
+ auto SplitSeq = llvm::seq<int>(0, Ty->getNumElements());
+ auto SplitSeqVec = llvm::to_vector(SplitSeq);
+ ArrayRef<int> LowerSplitMask(&SplitSeqVec[0], Stride);
+ ArrayRef<int> UpperSplitMask(&SplitSeqVec[Stride], Stride);
+
+ auto *LowerSplitA = B.CreateShuffleVector(InputA, LowerSplitMask);
+ auto *LowerSplitB = B.CreateShuffleVector(InputB, LowerSplitMask);
+ auto *UpperSplitA = B.CreateShuffleVector(InputA, UpperSplitMask);
+ auto *UpperSplitB = B.CreateShuffleVector(InputB, UpperSplitMask);
+ Value *LowerSplitAcc = nullptr;
+ Value *UpperSplitAcc = nullptr;
+
+ if (Accumulator) {
+ LowerSplitAcc = B.CreateShuffleVector(Accumulator, LowerSplitMask);
+ UpperSplitAcc = B.CreateShuffleVector(Accumulator, UpperSplitMask);
+ }
+
+ auto *LowerSplitInt = createComplexDeinterleavingIR(
+ I, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
+ auto *UpperSplitInt = createComplexDeinterleavingIR(
+ I, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);
+
+ ArrayRef<int> JoinMask(&SplitSeqVec[0], Ty->getNumElements());
+ return B.CreateShuffleVector(LowerSplitInt, UpperSplitInt, JoinMask);
+ }
+
+ auto *IntTy = Type::getInt32Ty(B.getContext());
+
+ ConstantInt *ConstRotation = nullptr;
+ if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
+ ConstRotation = ConstantInt::get(IntTy, (int)Rotation);
+
+ if (Accumulator)
+ return B.CreateIntrinsic(Intrinsic::arm_mve_vcmlaq, Ty,
+ {ConstRotation, Accumulator, InputB, InputA});
+ return B.CreateIntrinsic(Intrinsic::arm_mve_vcmulq, Ty,
+ {ConstRotation, InputB, InputA});
+ }
+
+ if (OperationType == ComplexDeinterleavingOperation::CAdd) {
+ // 1 means the value is not halved.
+ auto *ConstHalving = ConstantInt::get(IntTy, 1);
+
+ if (Rotation == ComplexDeinterleavingRotation::Rotation_90)
+ ConstRotation = ConstantInt::get(IntTy, 0);
+ else if (Rotation == ComplexDeinterleavingRotation::Rotation_270)
+ ConstRotation = ConstantInt::get(IntTy, 1);
+
+ if (!ConstRotation)
+ return nullptr; // Invalid rotation for arm_mve_vcaddq
+
+ return B.CreateIntrinsic(Intrinsic::arm_mve_vcaddq, Ty,
+ {ConstHalving, ConstRotation, InputA, InputB});
+ }
+
+ return nullptr;
+}
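For reference, the renamed LowerGET_ROUNDING hunk above keeps the existing comment that the ARM rounding mode lives in FPSCR bits 23:22 and maps onto FLT_ROUNDS as 0->1, 1->2, 2->3, 3->0 via ((FPSCR + (1 << 22)) >> 22) & 3. A minimal standalone C++ sketch (illustration only, not part of the patch) that checks this mapping:

#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
  for (uint32_t Mode = 0; Mode < 4; ++Mode) {
    // Place the rounding mode in FPSCR bits 23:22; other FPSCR bits are
    // irrelevant here because the final mask keeps only two bits.
    uint32_t FPSCR = Mode << 22;
    // Formula from LowerGET_ROUNDING, with explicit parentheses.
    uint32_t FltRounds = ((FPSCR + (1u << 22)) >> 22) & 3u;
    // Expected mapping: 0->1, 1->2, 2->3, 3->0.
    assert(FltRounds == ((Mode + 1) & 3u));
    std::printf("ARM rounding mode %u -> FLT_ROUNDS %u\n", Mode, FltRounds);
  }
  return 0;
}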