author    Dimitry Andric <dim@FreeBSD.org>  2020-01-24 22:00:03 +0000
committer Dimitry Andric <dim@FreeBSD.org>  2020-01-24 22:00:03 +0000
commit    480093f4440d54b30b3025afeac24b48f2ba7a2e (patch)
tree      162e72994062888647caf0d875428db9445491a8 /contrib/llvm-project/llvm/lib/Target/ARM
parent    489b1cf2ecf5b9b4a394857987014bfb09067726 (diff)
parent    706b4fc47bbc608932d3b491ae19a3b9cde9497b (diff)
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/ARM')
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/A15SDOptimizer.cpp | 3
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARM.h | 4
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARM.td | 32
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMAsmPrinter.cpp | 12
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp | 89
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.h | 128
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp | 9
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMCallLowering.cpp | 2
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMCallingConv.cpp | 46
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMCallingConv.h | 3
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMCallingConv.td | 35
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMCodeGenPrepare.cpp | 1069
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp | 20
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp | 21
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMFastISel.cpp | 10
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMFrameLowering.cpp | 12
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMFrameLowering.h | 2
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMHazardRecognizer.h | 7
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp | 611
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.cpp | 727
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.h | 26
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrInfo.td | 61
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrMVE.td | 2779
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrNEON.td | 169
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrThumb2.td | 228
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrVFP.td | 31
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMInstructionSelector.cpp | 18
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp | 10
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp | 27
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp | 968
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMParallelDSP.cpp | 15
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMPredicates.td | 8
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp | 7
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMRegisterBankInfo.h | 4
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMSubtarget.cpp | 17
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMSubtarget.h | 20
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetMachine.cpp | 33
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetMachine.h | 2
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp | 333
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetTransformInfo.h | 36
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp | 105
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp | 95
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp | 7
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp | 29
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp | 11
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h | 8
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp | 8
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp | 2
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h | 2
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp | 301
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/MVETailPredication.cpp | 167
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/MVEVPTBlockPass.cpp | 161
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp | 2
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp | 4
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/Thumb1InstrInfo.h | 2
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp | 121
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/Thumb2InstrInfo.h | 2
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/Utils/ARMBaseInfo.h | 53
58 files changed, 5581 insertions, 3133 deletions
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/A15SDOptimizer.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/A15SDOptimizer.cpp
index 30b9c8071ba2..f8a86a70c077 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/A15SDOptimizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/A15SDOptimizer.cpp
@@ -157,9 +157,8 @@ unsigned A15SDOptimizer::getPrefSPRLane(unsigned SReg) {
MachineInstr *MI = MRI->getVRegDef(SReg);
if (!MI) return ARM::ssub_0;
MachineOperand *MO = MI->findRegisterDefOperand(SReg);
-
- assert(MO->isReg() && "Non-register operand found!");
if (!MO) return ARM::ssub_0;
+ assert(MO->isReg() && "Non-register operand found!");
if (MI->isCopy() && usesRegClass(MI->getOperand(1),
&ARM::SPRRegClass)) {
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARM.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARM.h
index 2e6f756d522c..3412813a3ef2 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARM.h
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARM.h
@@ -43,7 +43,6 @@ FunctionPass *createARMISelDag(ARMBaseTargetMachine &TM,
FunctionPass *createA15SDOptimizerPass();
FunctionPass *createARMLoadStoreOptimizationPass(bool PreAlloc = false);
FunctionPass *createARMExpandPseudoPass();
-FunctionPass *createARMCodeGenPreparePass();
FunctionPass *createARMConstantIslandPass();
FunctionPass *createMLxExpansionPass();
FunctionPass *createThumb2ITBlockPass();
@@ -54,6 +53,7 @@ FunctionPass *createThumb2SizeReductionPass(
InstructionSelector *
createARMInstructionSelector(const ARMBaseTargetMachine &TM, const ARMSubtarget &STI,
const ARMRegisterBankInfo &RBI);
+Pass *createMVEGatherScatterLoweringPass();
void LowerARMMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
ARMAsmPrinter &AP);
@@ -61,7 +61,6 @@ void LowerARMMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
void initializeARMParallelDSPPass(PassRegistry &);
void initializeARMLoadStoreOptPass(PassRegistry &);
void initializeARMPreAllocLoadStoreOptPass(PassRegistry &);
-void initializeARMCodeGenPreparePass(PassRegistry &);
void initializeARMConstantIslandsPass(PassRegistry &);
void initializeARMExpandPseudoPass(PassRegistry &);
void initializeThumb2SizeReducePass(PassRegistry &);
@@ -69,6 +68,7 @@ void initializeThumb2ITBlockPass(PassRegistry &);
void initializeMVEVPTBlockPass(PassRegistry &);
void initializeARMLowOverheadLoopsPass(PassRegistry &);
void initializeMVETailPredicationPass(PassRegistry &);
+void initializeMVEGatherScatterLoweringPass(PassRegistry &);
} // end namespace llvm
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARM.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARM.td
index fed4cb2b9316..380eaa863689 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARM.td
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARM.td
@@ -303,6 +303,10 @@ def FeatureNonpipelinedVFP : SubtargetFeature<"nonpipelined-vfp",
def FeatureHasSlowFPVMLx : SubtargetFeature<"slowfpvmlx", "SlowFPVMLx", "true",
"Disable VFP / NEON MAC instructions">;
+// VFPv4 added VFMA instructions that can similarly be fast or slow.
+def FeatureHasSlowFPVFMx : SubtargetFeature<"slowfpvfmx", "SlowFPVFMx", "true",
+ "Disable VFP / NEON FMA instructions">;
+
// Cortex-A8 / A9 Advanced SIMD has multiplier accumulator forwarding.
def FeatureVMLxForwarding : SubtargetFeature<"vmlx-forwarding",
"HasVMLxForwarding", "true",
@@ -415,10 +419,6 @@ def FeatureNoPostRASched : SubtargetFeature<"disable-postra-scheduler",
"DisablePostRAScheduler", "true",
"Don't schedule again after register allocation">;
-// Enable use of alias analysis during code generation
-def FeatureUseAA : SubtargetFeature<"use-aa", "UseAA", "true",
- "Use alias analysis during codegen">;
-
// Armv8.5-A extensions
def FeatureSB : SubtargetFeature<"sb", "HasSB", "true",
@@ -584,7 +584,6 @@ def ProcExynos : SubtargetFeature<"exynos", "ARMProcFamily", "Exynos",
"Samsung Exynos processors",
[FeatureZCZeroing,
FeatureUseWideStrideVFP,
- FeatureUseAA,
FeatureSplatVFPToNeon,
FeatureSlowVGETLNi32,
FeatureSlowVDUP32,
@@ -593,6 +592,7 @@ def ProcExynos : SubtargetFeature<"exynos", "ARMProcFamily", "Exynos",
FeatureHWDivThumb,
FeatureHWDivARM,
FeatureHasSlowFPVMLx,
+ FeatureHasSlowFPVFMx,
FeatureHasRetAddrStack,
FeatureFuseLiterals,
FeatureFuseAES,
@@ -923,6 +923,7 @@ def : ProcessorModel<"cortex-a5", CortexA8Model, [ARMv7a, ProcA5,
FeatureTrustZone,
FeatureSlowFPBrcc,
FeatureHasSlowFPVMLx,
+ FeatureHasSlowFPVFMx,
FeatureVMLxForwarding,
FeatureMP,
FeatureVFP4]>;
@@ -933,6 +934,7 @@ def : ProcessorModel<"cortex-a7", CortexA8Model, [ARMv7a, ProcA7,
FeatureSlowFPBrcc,
FeatureHasVMLxHazards,
FeatureHasSlowFPVMLx,
+ FeatureHasSlowFPVFMx,
FeatureVMLxForwarding,
FeatureMP,
FeatureVFP4,
@@ -945,6 +947,7 @@ def : ProcessorModel<"cortex-a8", CortexA8Model, [ARMv7a, ProcA8,
FeatureSlowFPBrcc,
FeatureHasVMLxHazards,
FeatureHasSlowFPVMLx,
+ FeatureHasSlowFPVFMx,
FeatureVMLxForwarding]>;
def : ProcessorModel<"cortex-a9", CortexA9Model, [ARMv7a, ProcA9,
@@ -1014,6 +1017,7 @@ def : ProcessorModel<"swift", SwiftModel, [ARMv7a, ProcSwift,
FeatureAvoidPartialCPSR,
FeatureAvoidMOVsShOp,
FeatureHasSlowFPVMLx,
+ FeatureHasSlowFPVFMx,
FeatureHasVMLxHazards,
FeatureProfUnpredicate,
FeaturePrefISHSTBarrier,
@@ -1032,6 +1036,7 @@ def : ProcessorModel<"cortex-r4f", CortexA8Model, [ARMv7r, ProcR4,
FeatureHasRetAddrStack,
FeatureSlowFPBrcc,
FeatureHasSlowFPVMLx,
+ FeatureHasSlowFPVFMx,
FeatureVFP3_D16,
FeatureAvoidPartialCPSR]>;
@@ -1041,6 +1046,7 @@ def : ProcessorModel<"cortex-r5", CortexA8Model, [ARMv7r, ProcR5,
FeatureSlowFPBrcc,
FeatureHWDivARM,
FeatureHasSlowFPVMLx,
+ FeatureHasSlowFPVFMx,
FeatureAvoidPartialCPSR]>;
def : ProcessorModel<"cortex-r7", CortexA8Model, [ARMv7r, ProcR7,
@@ -1051,6 +1057,7 @@ def : ProcessorModel<"cortex-r7", CortexA8Model, [ARMv7r, ProcR7,
FeatureSlowFPBrcc,
FeatureHWDivARM,
FeatureHasSlowFPVMLx,
+ FeatureHasSlowFPVFMx,
FeatureAvoidPartialCPSR]>;
def : ProcessorModel<"cortex-r8", CortexA8Model, [ARMv7r,
@@ -1061,27 +1068,26 @@ def : ProcessorModel<"cortex-r8", CortexA8Model, [ARMv7r,
FeatureSlowFPBrcc,
FeatureHWDivARM,
FeatureHasSlowFPVMLx,
+ FeatureHasSlowFPVFMx,
FeatureAvoidPartialCPSR]>;
def : ProcessorModel<"cortex-m3", CortexM4Model, [ARMv7m,
ProcM3,
FeaturePrefLoopAlign32,
FeatureUseMISched,
- FeatureUseAA,
FeatureHasNoBranchPredictor]>;
def : ProcessorModel<"sc300", CortexM4Model, [ARMv7m,
ProcM3,
FeatureUseMISched,
- FeatureUseAA,
FeatureHasNoBranchPredictor]>;
def : ProcessorModel<"cortex-m4", CortexM4Model, [ARMv7em,
FeatureVFP4_D16_SP,
FeaturePrefLoopAlign32,
FeatureHasSlowFPVMLx,
+ FeatureHasSlowFPVFMx,
FeatureUseMISched,
- FeatureUseAA,
FeatureHasNoBranchPredictor]>;
def : ProcNoItin<"cortex-m7", [ARMv7em,
@@ -1095,8 +1101,8 @@ def : ProcessorModel<"cortex-m33", CortexM4Model, [ARMv8mMainline,
FeatureFPARMv8_D16_SP,
FeaturePrefLoopAlign32,
FeatureHasSlowFPVMLx,
+ FeatureHasSlowFPVFMx,
FeatureUseMISched,
- FeatureUseAA,
FeatureHasNoBranchPredictor]>;
def : ProcessorModel<"cortex-m35p", CortexM4Model, [ARMv8mMainline,
@@ -1104,8 +1110,8 @@ def : ProcessorModel<"cortex-m35p", CortexM4Model, [ARMv8mMainline,
FeatureFPARMv8_D16_SP,
FeaturePrefLoopAlign32,
FeatureHasSlowFPVMLx,
+ FeatureHasSlowFPVFMx,
FeatureUseMISched,
- FeatureUseAA,
FeatureHasNoBranchPredictor]>;
@@ -1192,13 +1198,12 @@ def : ProcessorModel<"cyclone", SwiftModel, [ARMv8a, ProcSwift,
FeatureAvoidPartialCPSR,
FeatureAvoidMOVsShOp,
FeatureHasSlowFPVMLx,
+ FeatureHasSlowFPVFMx,
FeatureCrypto,
FeatureUseMISched,
FeatureZCZeroing,
FeatureNoPostRASched]>;
-def : ProcNoItin<"exynos-m1", [ARMv8a, ProcExynos]>;
-def : ProcNoItin<"exynos-m2", [ARMv8a, ProcExynos]>;
def : ProcNoItin<"exynos-m3", [ARMv8a, ProcExynos]>;
def : ProcNoItin<"exynos-m4", [ARMv82a, ProcExynos,
FeatureFullFP16,
@@ -1215,8 +1220,7 @@ def : ProcNoItin<"kryo", [ARMv8a, ProcKryo,
def : ProcessorModel<"cortex-r52", CortexR52Model, [ARMv8r, ProcR52,
FeatureUseMISched,
- FeatureFPAO,
- FeatureUseAA]>;
+ FeatureFPAO]>;
//===----------------------------------------------------------------------===//
// Register File Description
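
The slowfpvfmx feature added above is the VFMA counterpart of the existing slowfpvmlx flag: it lets a core opt out of VFP/NEON FMA formation even though VFPv4 provides the instructions. As an illustration of how such a SubtargetFeature is typically consumed on the C++ side, here is a minimal sketch; the backing field name comes from the TableGen definition ("SlowFPVFMx"), while useFPVFMx() and hasVFP4Base() are assumed names for the purpose of the example and are not shown in this hunk.

    // Sketch only: the generated subtarget code sets SlowFPVFMx when the
    // "slowfpvfmx" feature string is enabled; code selection then asks a
    // predicate like this before forming VFMA/VFMS.
    class ARMSubtargetSketch {
      bool SlowFPVFMx = false;            // filled in from the feature bits

    public:
      bool hasVFP4Base() const;           // assumed existing predicate

      // Only prefer fused multiply-accumulate when the ISA has it and the
      // core has not marked it as slow.
      bool useFPVFMx() const { return hasVFP4Base() && !SlowFPVFMx; }
    };
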
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
index c8c91e53c44e..6f26ca127f94 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -54,8 +54,8 @@ using namespace llvm;
ARMAsmPrinter::ARMAsmPrinter(TargetMachine &TM,
std::unique_ptr<MCStreamer> Streamer)
- : AsmPrinter(TM, std::move(Streamer)), AFI(nullptr), MCP(nullptr),
- InConstantPool(false), OptimizationGoals(-1) {}
+ : AsmPrinter(TM, std::move(Streamer)), Subtarget(nullptr), AFI(nullptr),
+ MCP(nullptr), InConstantPool(false), OptimizationGoals(-1) {}
void ARMAsmPrinter::EmitFunctionBodyEnd() {
// Make sure to terminate any constant pools that were at the end
@@ -1170,10 +1170,16 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
break;
case ARM::ADDri:
case ARM::t2ADDri:
+ case ARM::t2ADDri12:
+ case ARM::t2ADDspImm:
+ case ARM::t2ADDspImm12:
Offset = -MI->getOperand(2).getImm();
break;
case ARM::SUBri:
case ARM::t2SUBri:
+ case ARM::t2SUBri12:
+ case ARM::t2SUBspImm:
+ case ARM::t2SUBspImm12:
Offset = MI->getOperand(2).getImm();
break;
case ARM::tSUBspi:
@@ -2142,7 +2148,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
//===----------------------------------------------------------------------===//
// Force static initialization.
-extern "C" void LLVMInitializeARMAsmPrinter() {
+extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeARMAsmPrinter() {
RegisterAsmPrinter<ARMAsmPrinter> X(getTheARMLETarget());
RegisterAsmPrinter<ARMAsmPrinter> Y(getTheARMBETarget());
RegisterAsmPrinter<ARMAsmPrinter> A(getTheThumbLETarget());
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 684cd1def977..48f781510254 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -134,7 +134,7 @@ ScheduleHazardRecognizer *ARMBaseInstrInfo::
CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
const ScheduleDAG *DAG) const {
if (Subtarget.isThumb2() || Subtarget.hasVFP2Base())
- return (ScheduleHazardRecognizer *)new ARMHazardRecognizer(II, DAG);
+ return new ARMHazardRecognizer(II, DAG);
return TargetInstrInfo::CreateTargetPostRAHazardRecognizer(II, DAG);
}
@@ -829,8 +829,8 @@ void llvm::addPredicatedMveVpredROp(MachineInstrBuilder &MIB,
void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
- const DebugLoc &DL, unsigned DestReg,
- unsigned SrcReg, bool KillSrc) const {
+ const DebugLoc &DL, MCRegister DestReg,
+ MCRegister SrcReg, bool KillSrc) const {
bool GPRDest = ARM::GPRRegClass.contains(DestReg);
bool GPRSrc = ARM::GPRRegClass.contains(SrcReg);
@@ -993,9 +993,8 @@ void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
Mov->addRegisterKilled(SrcReg, TRI);
}
-bool ARMBaseInstrInfo::isCopyInstrImpl(const MachineInstr &MI,
- const MachineOperand *&Src,
- const MachineOperand *&Dest) const {
+Optional<DestSourcePair>
+ARMBaseInstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
// VMOVRRD is also a copy instruction but it requires
// special way of handling. It is more complex copy version
// and since that we are not considering it. For recognition
@@ -1006,10 +1005,8 @@ bool ARMBaseInstrInfo::isCopyInstrImpl(const MachineInstr &MI,
if (!MI.isMoveReg() ||
(MI.getOpcode() == ARM::VORRq &&
MI.getOperand(1).getReg() != MI.getOperand(2).getReg()))
- return false;
- Dest = &MI.getOperand(0);
- Src = &MI.getOperand(1);
- return true;
+ return None;
+ return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
}
const MachineInstrBuilder &
@@ -2726,25 +2723,6 @@ static bool isSuitableForMask(MachineInstr *&MI, unsigned SrcReg,
return false;
}
-/// getSwappedCondition - assume the flags are set by MI(a,b), return
-/// the condition code if we modify the instructions such that flags are
-/// set by MI(b,a).
-inline static ARMCC::CondCodes getSwappedCondition(ARMCC::CondCodes CC) {
- switch (CC) {
- default: return ARMCC::AL;
- case ARMCC::EQ: return ARMCC::EQ;
- case ARMCC::NE: return ARMCC::NE;
- case ARMCC::HS: return ARMCC::LS;
- case ARMCC::LO: return ARMCC::HI;
- case ARMCC::HI: return ARMCC::LO;
- case ARMCC::LS: return ARMCC::HS;
- case ARMCC::GE: return ARMCC::LE;
- case ARMCC::LT: return ARMCC::GT;
- case ARMCC::GT: return ARMCC::LT;
- case ARMCC::LE: return ARMCC::GE;
- }
-}
-
/// getCmpToAddCondition - assume the flags are set by CMP(a,b), return
/// the condition code if we modify the instructions such that flags are
/// set by ADD(a,b,X).
@@ -3279,22 +3257,26 @@ bool ARMBaseInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
}
break;
case ARM::t2ADDrr:
- case ARM::t2SUBrr:
+ case ARM::t2SUBrr: {
if (UseOpc == ARM::t2SUBrr && Commute)
return false;
// ADD/SUB are special because they're essentially the same operation, so
// we can handle a larger range of immediates.
+ const bool ToSP = DefMI.getOperand(0).getReg() == ARM::SP;
+ const unsigned t2ADD = ToSP ? ARM::t2ADDspImm : ARM::t2ADDri;
+ const unsigned t2SUB = ToSP ? ARM::t2SUBspImm : ARM::t2SUBri;
if (ARM_AM::isT2SOImmTwoPartVal(ImmVal))
- NewUseOpc = UseOpc == ARM::t2ADDrr ? ARM::t2ADDri : ARM::t2SUBri;
+ NewUseOpc = UseOpc == ARM::t2ADDrr ? t2ADD : t2SUB;
else if (ARM_AM::isT2SOImmTwoPartVal(-ImmVal)) {
ImmVal = -ImmVal;
- NewUseOpc = UseOpc == ARM::t2ADDrr ? ARM::t2SUBri : ARM::t2ADDri;
+ NewUseOpc = UseOpc == ARM::t2ADDrr ? t2SUB : t2ADD;
} else
return false;
SOImmValV1 = (uint32_t)ARM_AM::getT2SOImmTwoPartFirst(ImmVal);
SOImmValV2 = (uint32_t)ARM_AM::getT2SOImmTwoPartSecond(ImmVal);
break;
+ }
case ARM::t2ORRrr:
case ARM::t2EORrr:
if (!ARM_AM::isT2SOImmTwoPartVal(ImmVal))
@@ -3314,7 +3296,8 @@ bool ARMBaseInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
unsigned OpIdx = Commute ? 2 : 1;
Register Reg1 = UseMI.getOperand(OpIdx).getReg();
bool isKill = UseMI.getOperand(OpIdx).isKill();
- Register NewReg = MRI->createVirtualRegister(MRI->getRegClass(Reg));
+ const TargetRegisterClass *TRC = MRI->getRegClass(Reg);
+ Register NewReg = MRI->createVirtualRegister(TRC);
BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), get(NewUseOpc),
NewReg)
.addReg(Reg1, getKillRegState(isKill))
@@ -3326,6 +3309,18 @@ bool ARMBaseInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
UseMI.getOperand(1).setIsKill();
UseMI.getOperand(2).ChangeToImmediate(SOImmValV2);
DefMI.eraseFromParent();
+ // FIXME: t2ADDrr should be split, as different rules apply when writing to SP.
+ // Just as t2ADDri was split into [t2ADDri, t2ADDspImm].
+ // Then the below code will not be needed, as the input/output register
+ // classes will be rgpr or gprSP.
+ // For now, we fix the UseMI operand explicitly here:
+ switch(NewUseOpc){
+ case ARM::t2ADDspImm:
+ case ARM::t2SUBspImm:
+ case ARM::t2ADDri:
+ case ARM::t2SUBri:
+ MRI->setRegClass(UseMI.getOperand(0).getReg(), TRC);
+ }
return true;
}
@@ -5350,6 +5345,34 @@ ARMBaseInstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
return makeArrayRef(TargetFlags);
}
+Optional<RegImmPair> ARMBaseInstrInfo::isAddImmediate(const MachineInstr &MI,
+ Register Reg) const {
+ int Sign = 1;
+ unsigned Opcode = MI.getOpcode();
+ int64_t Offset = 0;
+
+ // TODO: Handle cases where Reg is a super- or sub-register of the
+ // destination register.
+ if (Reg != MI.getOperand(0).getReg())
+ return None;
+
+ // We describe SUBri or ADDri instructions.
+ if (Opcode == ARM::SUBri)
+ Sign = -1;
+ else if (Opcode != ARM::ADDri)
+ return None;
+
+ // TODO: Third operand can be global address (usually some string). Since
+ // strings can be relocated we cannot calculate their offsets for
+ // now.
+ if (!MI.getOperand(0).isReg() || !MI.getOperand(1).isReg() ||
+ !MI.getOperand(2).isImm())
+ return None;
+
+ Offset = MI.getOperand(2).getImm() * Sign;
+ return RegImmPair{MI.getOperand(1).getReg(), Offset};
+}
+
bool llvm::registerDefinedBetween(unsigned Reg,
MachineBasicBlock::iterator From,
MachineBasicBlock::iterator To,
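
The isAddImmediate() hook added above describes a plain ADDri/SUBri as "destination = source + signed immediate", so callers no longer need to pattern-match opcodes themselves. A hedged usage sketch follows; it only relies on the Optional<RegImmPair> return shape visible in this diff and does not assume any field names.

    // Illustrative caller only, not part of the patch.
    void describeAddLikeDef(const ARMBaseInstrInfo &TII, const MachineInstr &MI,
                            Register Reg) {
      if (Optional<RegImmPair> RI = TII.isAddImmediate(MI, Reg)) {
        // Reg is defined as the described source register plus the signed
        // offset, which is exactly the shape a debug-entry-value or
        // location-tracking pass wants, without re-decoding the opcode.
        (void)RI;
      }
    }
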
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
index c232b6f0b45d..f6d4ebe3a090 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -99,12 +99,11 @@ protected:
MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI,
unsigned OpIdx1,
unsigned OpIdx2) const override;
-
- /// If the specific machine instruction is a instruction that moves/copies
- /// value from one register to another register return true along with
- /// @Source machine operand and @Destination machine operand.
- bool isCopyInstrImpl(const MachineInstr &MI, const MachineOperand *&Source,
- const MachineOperand *&Destination) const override;
+ /// If the specific machine instruction is an instruction that moves/copies
+ /// value from one register to another register return destination and source
+ /// registers as machine operands.
+ Optional<DestSourcePair>
+ isCopyInstrImpl(const MachineInstr &MI) const override;
public:
// Return whether the target has an explicit NOP encoding.
@@ -203,7 +202,7 @@ public:
const ARMSubtarget &Subtarget) const;
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
- const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
+ const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg,
bool KillSrc) const override;
void storeRegToStackSlot(MachineBasicBlock &MBB,
@@ -455,6 +454,9 @@ public:
// 3 - predicate reg
return MI.getOperand(3).getReg();
}
+
+ Optional<RegImmPair> isAddImmediate(const MachineInstr &MI,
+ Register Reg) const override;
};
/// Get the operands corresponding to the given \p Pred value. By default, the
@@ -486,6 +488,27 @@ bool isUncondBranchOpcode(int Opc) {
return Opc == ARM::B || Opc == ARM::tB || Opc == ARM::t2B;
}
+// This table shows the VPT instruction variants, i.e. the different
+// mask field encodings, see also B5.6. Predication/conditional execution in
+// the ArmARM.
+enum VPTMaskValue {
+ T = 8, // 0b1000
+ TT = 4, // 0b0100
+ TE = 12, // 0b1100
+ TTT = 2, // 0b0010
+ TTE = 6, // 0b0110
+ TEE = 10, // 0b1010
+ TET = 14, // 0b1110
+ TTTT = 1, // 0b0001
+ TTTE = 3, // 0b0011
+ TTEE = 5, // 0b0101
+ TTET = 7, // 0b0111
+ TEEE = 9, // 0b1001
+ TEET = 11, // 0b1011
+ TETT = 13, // 0b1101
+ TETE = 15 // 0b1111
+};
+
static inline bool isVPTOpcode(int Opc) {
return Opc == ARM::MVE_VPTv16i8 || Opc == ARM::MVE_VPTv16u8 ||
Opc == ARM::MVE_VPTv16s8 || Opc == ARM::MVE_VPTv8i16 ||
@@ -502,6 +525,97 @@ static inline bool isVPTOpcode(int Opc) {
}
static inline
+unsigned VCMPOpcodeToVPT(unsigned Opcode) {
+ switch (Opcode) {
+ default:
+ return 0;
+ case ARM::MVE_VCMPf32:
+ return ARM::MVE_VPTv4f32;
+ case ARM::MVE_VCMPf16:
+ return ARM::MVE_VPTv8f16;
+ case ARM::MVE_VCMPi8:
+ return ARM::MVE_VPTv16i8;
+ case ARM::MVE_VCMPi16:
+ return ARM::MVE_VPTv8i16;
+ case ARM::MVE_VCMPi32:
+ return ARM::MVE_VPTv4i32;
+ case ARM::MVE_VCMPu8:
+ return ARM::MVE_VPTv16u8;
+ case ARM::MVE_VCMPu16:
+ return ARM::MVE_VPTv8u16;
+ case ARM::MVE_VCMPu32:
+ return ARM::MVE_VPTv4u32;
+ case ARM::MVE_VCMPs8:
+ return ARM::MVE_VPTv16s8;
+ case ARM::MVE_VCMPs16:
+ return ARM::MVE_VPTv8s16;
+ case ARM::MVE_VCMPs32:
+ return ARM::MVE_VPTv4s32;
+
+ case ARM::MVE_VCMPf32r:
+ return ARM::MVE_VPTv4f32r;
+ case ARM::MVE_VCMPf16r:
+ return ARM::MVE_VPTv8f16r;
+ case ARM::MVE_VCMPi8r:
+ return ARM::MVE_VPTv16i8r;
+ case ARM::MVE_VCMPi16r:
+ return ARM::MVE_VPTv8i16r;
+ case ARM::MVE_VCMPi32r:
+ return ARM::MVE_VPTv4i32r;
+ case ARM::MVE_VCMPu8r:
+ return ARM::MVE_VPTv16u8r;
+ case ARM::MVE_VCMPu16r:
+ return ARM::MVE_VPTv8u16r;
+ case ARM::MVE_VCMPu32r:
+ return ARM::MVE_VPTv4u32r;
+ case ARM::MVE_VCMPs8r:
+ return ARM::MVE_VPTv16s8r;
+ case ARM::MVE_VCMPs16r:
+ return ARM::MVE_VPTv8s16r;
+ case ARM::MVE_VCMPs32r:
+ return ARM::MVE_VPTv4s32r;
+ }
+}
+
+static inline
+unsigned VCTPOpcodeToLSTP(unsigned Opcode, bool IsDoLoop) {
+ switch (Opcode) {
+ default:
+ llvm_unreachable("unhandled vctp opcode");
+ break;
+ case ARM::MVE_VCTP8:
+ return IsDoLoop ? ARM::MVE_DLSTP_8 : ARM::MVE_WLSTP_8;
+ case ARM::MVE_VCTP16:
+ return IsDoLoop ? ARM::MVE_DLSTP_16 : ARM::MVE_WLSTP_16;
+ case ARM::MVE_VCTP32:
+ return IsDoLoop ? ARM::MVE_DLSTP_32 : ARM::MVE_WLSTP_32;
+ case ARM::MVE_VCTP64:
+ return IsDoLoop ? ARM::MVE_DLSTP_64 : ARM::MVE_WLSTP_64;
+ }
+ return 0;
+}
+
+static inline
+bool isVCTP(MachineInstr *MI) {
+ switch (MI->getOpcode()) {
+ default:
+ break;
+ case ARM::MVE_VCTP8:
+ case ARM::MVE_VCTP16:
+ case ARM::MVE_VCTP32:
+ case ARM::MVE_VCTP64:
+ return true;
+ }
+ return false;
+}
+
+static inline
+bool isLoopStart(MachineInstr &MI) {
+ return MI.getOpcode() == ARM::t2DoLoopStart ||
+ MI.getOpcode() == ARM::t2WhileLoopStart;
+}
+
+static inline
bool isCondBranchOpcode(int Opc) {
return Opc == ARM::Bcc || Opc == ARM::tBcc || Opc == ARM::t2Bcc;
}
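
The VPTMaskValue table added above packs the then/else layout of an MVE VPT block into four bits: the lowest set bit terminates the mask (so its position gives the block size), and each higher bit states whether the next predicated slot inverts the predicate of the previous one. The helper below is a self-contained sketch that decodes a mask back into its "T/E" string purely from that rule; it is an illustration derived from the table, not an LLVM API.

    #include <cassert>
    #include <string>

    // Decode a 4-bit VPT mask into its slot pattern, e.g. 14 (TET) -> "TET".
    static std::string decodeVPTMask(unsigned Mask) {
      assert(Mask && Mask < 16 && "expected a non-zero 4-bit mask");
      unsigned TrailingZeros = 0;
      for (unsigned M = Mask; !(M & 1); M >>= 1)
        ++TrailingZeros;
      unsigned Size = 4 - TrailingZeros;  // number of predicated slots

      std::string Pattern = "T";          // the first slot is always Then
      char Prev = 'T';
      for (unsigned Slot = 1; Slot < Size; ++Slot) {
        bool Invert = Mask & (1u << (4 - Slot));
        Prev = Invert ? (Prev == 'T' ? 'E' : 'T') : Prev;
        Pattern += Prev;
      }
      return Pattern;                     // decodeVPTMask(TTEE) == "TTEE"
    }

This same four-bit field is presumably what the MVE VPT block formation writes when it folds one of the VCMP opcodes mapped by VCMPOpcodeToVPT above into a VPT instruction.
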
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
index 1eaf871867e0..52e6d05c3155 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -75,6 +75,8 @@ ARMBaseRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
// GHC set of callee saved regs is empty as all those regs are
// used for passing STG regs around
return CSR_NoRegs_SaveList;
+ } else if (F.getCallingConv() == CallingConv::CFGuard_Check) {
+ return CSR_Win_AAPCS_CFGuard_Check_SaveList;
} else if (F.hasFnAttribute("interrupt")) {
if (STI.isMClass()) {
// M-class CPUs have hardware which saves the registers needed to allow a
@@ -123,7 +125,8 @@ ARMBaseRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
if (CC == CallingConv::GHC)
// This is academic because all GHC calls are (supposed to be) tail calls
return CSR_NoRegs_RegMask;
-
+ if (CC == CallingConv::CFGuard_Check)
+ return CSR_Win_AAPCS_CFGuard_Check_RegMask;
if (STI.getTargetLowering()->supportSwiftError() &&
MF.getFunction().getAttributes().hasAttrSomewhere(Attribute::SwiftError))
return STI.isTargetDarwin() ? CSR_iOS_SwiftError_RegMask
@@ -191,7 +194,7 @@ getReservedRegs(const MachineFunction &MF) const {
markSuperRegs(Reserved, ARM::PC);
markSuperRegs(Reserved, ARM::FPSCR);
markSuperRegs(Reserved, ARM::APSR_NZCV);
- if (TFI->hasFP(MF) || STI.isTargetDarwin())
+ if (TFI->hasFP(MF))
markSuperRegs(Reserved, getFramePointerReg(STI));
if (hasBasePointer(MF))
markSuperRegs(Reserved, BasePtr);
@@ -385,7 +388,7 @@ bool ARMBaseRegisterInfo::hasBasePointer(const MachineFunction &MF) const {
return true;
// Thumb has trouble with negative offsets from the FP. Thumb2 has a limited
- // negative range for ldr/str (255), and thumb1 is positive offsets only.
+ // negative range for ldr/str (255), and Thumb1 is positive offsets only.
//
// It's going to be better to use the SP or Base Pointer instead. When there
// are variable sized objects, we can't reference off of the SP, so we
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMCallLowering.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMCallLowering.cpp
index d3b595ce8323..ce260a9ba145 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMCallLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMCallLowering.cpp
@@ -106,7 +106,7 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler {
MIRBuilder.buildConstant(OffsetReg, Offset);
Register AddrReg = MRI.createGenericVirtualRegister(p0);
- MIRBuilder.buildGEP(AddrReg, SPReg, OffsetReg);
+ MIRBuilder.buildPtrAdd(AddrReg, SPReg, OffsetReg);
MPO = MachinePointerInfo::getStack(MIRBuilder.getMF(), Offset);
return AddrReg;
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMCallingConv.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMCallingConv.cpp
index 92ebc542b423..a47c59512592 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMCallingConv.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMCallingConv.cpp
@@ -18,8 +18,8 @@
using namespace llvm;
// APCS f64 is in register pairs, possibly split to stack
-static bool f64AssignAPCS(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
- CCValAssign::LocInfo &LocInfo,
+static bool f64AssignAPCS(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
CCState &State, bool CanFail) {
static const MCPhysReg RegList[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 };
@@ -48,9 +48,9 @@ static bool f64AssignAPCS(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
return true;
}
-static bool CC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
- CCValAssign::LocInfo &LocInfo,
- ISD::ArgFlagsTy &ArgFlags,
+static bool CC_ARM_APCS_Custom_f64(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags,
CCState &State) {
if (!f64AssignAPCS(ValNo, ValVT, LocVT, LocInfo, State, true))
return false;
@@ -61,8 +61,8 @@ static bool CC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
}
// AAPCS f64 is in aligned register pairs
-static bool f64AssignAAPCS(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
- CCValAssign::LocInfo &LocInfo,
+static bool f64AssignAAPCS(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
CCState &State, bool CanFail) {
static const MCPhysReg HiRegList[] = { ARM::R0, ARM::R2 };
static const MCPhysReg LoRegList[] = { ARM::R1, ARM::R3 };
@@ -102,9 +102,9 @@ static bool f64AssignAAPCS(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
return true;
}
-static bool CC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
- CCValAssign::LocInfo &LocInfo,
- ISD::ArgFlagsTy &ArgFlags,
+static bool CC_ARM_AAPCS_Custom_f64(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags,
CCState &State) {
if (!f64AssignAAPCS(ValNo, ValVT, LocVT, LocInfo, State, true))
return false;
@@ -114,8 +114,8 @@ static bool CC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
return true; // we handled it
}
-static bool f64RetAssign(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
- CCValAssign::LocInfo &LocInfo, CCState &State) {
+static bool f64RetAssign(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo, CCState &State) {
static const MCPhysReg HiRegList[] = { ARM::R0, ARM::R2 };
static const MCPhysReg LoRegList[] = { ARM::R1, ARM::R3 };
@@ -134,9 +134,9 @@ static bool f64RetAssign(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
return true;
}
-static bool RetCC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
- CCValAssign::LocInfo &LocInfo,
- ISD::ArgFlagsTy &ArgFlags,
+static bool RetCC_ARM_APCS_Custom_f64(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags,
CCState &State) {
if (!f64RetAssign(ValNo, ValVT, LocVT, LocInfo, State))
return false;
@@ -145,9 +145,9 @@ static bool RetCC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
return true; // we handled it
}
-static bool RetCC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
- CCValAssign::LocInfo &LocInfo,
- ISD::ArgFlagsTy &ArgFlags,
+static bool RetCC_ARM_AAPCS_Custom_f64(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags,
CCState &State) {
return RetCC_ARM_APCS_Custom_f64(ValNo, ValVT, LocVT, LocInfo, ArgFlags,
State);
@@ -169,10 +169,10 @@ static const MCPhysReg QRegList[] = { ARM::Q0, ARM::Q1, ARM::Q2, ARM::Q3 };
// InConsecutiveRegsLast set. We must process all members of the HA before
// we can allocate it, as we need to know the total number of registers that
// will be needed in order to (attempt to) allocate a contiguous block.
-static bool CC_ARM_AAPCS_Custom_Aggregate(unsigned &ValNo, MVT &ValVT,
- MVT &LocVT,
- CCValAssign::LocInfo &LocInfo,
- ISD::ArgFlagsTy &ArgFlags,
+static bool CC_ARM_AAPCS_Custom_Aggregate(unsigned ValNo, MVT ValVT,
+ MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags,
CCState &State) {
SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs();
@@ -181,7 +181,7 @@ static bool CC_ARM_AAPCS_Custom_Aggregate(unsigned &ValNo, MVT &ValVT,
assert(PendingMembers[0].getLocVT() == LocVT);
// Add the argument to the list to be allocated once we know the size of the
- // aggregate. Store the type's required alignmnent as extra info for later: in
+ // aggregate. Store the type's required alignment as extra info for later: in
// the [N x i64] case all trace has been removed by the time we actually get
// to do allocation.
PendingMembers.push_back(CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo,
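
As the comment above the aggregate handler explains, every member of a homogeneous aggregate (HA) is queued as pending first, because under the VFP variant of AAPCS such an argument must be allocated either entirely to one contiguous block of floating-point registers or entirely to the stack. A small example of the kind of argument this rule covers, based on the general AAPCS-VFP treatment rather than on code in this patch:

    // A homogeneous aggregate: up to four members of a single FP type.
    struct HomogeneousAggregate {
      double A, B, C, D;
    };

    // Under CC_ARM_AAPCS_VFP this argument is expected to land in one
    // contiguous block such as D0-D3 when a block is free; otherwise the
    // whole aggregate goes to the stack. It is never split between the two.
    void Callee(HomogeneousAggregate HA);
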
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMCallingConv.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARMCallingConv.h
index 615634551d90..7c692f03b440 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMCallingConv.h
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMCallingConv.h
@@ -32,6 +32,9 @@ bool CC_ARM_APCS_GHC(unsigned ValNo, MVT ValVT, MVT LocVT,
bool FastCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
CCState &State);
+bool CC_ARM_Win32_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags, CCState &State);
bool RetCC_ARM_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
CCState &State);
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMCallingConv.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMCallingConv.td
index 61d2d83ddc40..5df5b56f5afa 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMCallingConv.td
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMCallingConv.td
@@ -20,7 +20,7 @@ def CC_ARM_APCS : CallingConv<[
// Handles byval parameters.
CCIfByVal<CCPassByVal<4, 4>>,
-
+
CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
// Pass SwiftSelf in a callee saved register.
@@ -80,7 +80,7 @@ def FastCC_ARM_APCS : CallingConv<[
S9, S10, S11, S12, S13, S14, S15]>>,
// CPRCs may be allocated to co-processor registers or the stack - they
- // may never be allocated to core registers.
+ // may never be allocated to core registers.
CCIfType<[f32], CCAssignToStackWithShadow<4, 4, [Q0, Q1, Q2, Q3]>>,
CCIfType<[f64], CCAssignToStackWithShadow<8, 4, [Q0, Q1, Q2, Q3]>>,
CCIfType<[v2f64], CCAssignToStackWithShadow<16, 4, [Q0, Q1, Q2, Q3]>>,
@@ -165,8 +165,8 @@ def CC_ARM_AAPCS : CallingConv<[
CCIfNest<CCAssignToReg<[R12]>>,
// Handle all vector types as either f64 or v2f64.
- CCIfType<[v1i64, v2i32, v4i16, v4f16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>,
- CCIfType<[v2i64, v4i32, v8i16, v8f16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+ CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>,
+ CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
// Pass SwiftSelf in a callee saved register.
CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
@@ -182,8 +182,8 @@ def CC_ARM_AAPCS : CallingConv<[
let Entry = 1 in
def RetCC_ARM_AAPCS : CallingConv<[
// Handle all vector types as either f64 or v2f64.
- CCIfType<[v1i64, v2i32, v4i16, v4f16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>,
- CCIfType<[v2i64, v4i32, v8i16, v8f16, v8f16,v16i8, v4f32], CCBitConvertToType<v2f64>>,
+ CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>,
+ CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
// Pass SwiftSelf in a callee saved register.
CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
@@ -208,8 +208,8 @@ def CC_ARM_AAPCS_VFP : CallingConv<[
CCIfByVal<CCPassByVal<4, 4>>,
// Handle all vector types as either f64 or v2f64.
- CCIfType<[v1i64, v2i32, v4i16, v4f16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>,
- CCIfType<[v2i64, v4i32, v8i16, v8f16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+ CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>,
+ CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
// Pass SwiftSelf in a callee saved register.
CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
@@ -230,8 +230,8 @@ def CC_ARM_AAPCS_VFP : CallingConv<[
let Entry = 1 in
def RetCC_ARM_AAPCS_VFP : CallingConv<[
// Handle all vector types as either f64 or v2f64.
- CCIfType<[v1i64, v2i32, v4i16, v4f16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>,
- CCIfType<[v2i64, v4i32, v8i16, v8f16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+ CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>,
+ CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
// Pass SwiftSelf in a callee saved register.
CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
@@ -246,6 +246,16 @@ def RetCC_ARM_AAPCS_VFP : CallingConv<[
CCDelegateTo<RetCC_ARM_AAPCS_Common>
]>;
+
+// Windows Control Flow Guard checks take a single argument (the target function
+// address) and have no return value.
+let Entry = 1 in
+def CC_ARM_Win32_CFGuard_Check : CallingConv<[
+ CCIfType<[i32], CCAssignToReg<[R0]>>
+]>;
+
+
+
//===----------------------------------------------------------------------===//
// Callee-saved register lists.
//===----------------------------------------------------------------------===//
@@ -256,6 +266,11 @@ def CSR_FPRegs : CalleeSavedRegs<(add (sequence "D%u", 0, 31))>;
def CSR_AAPCS : CalleeSavedRegs<(add LR, R11, R10, R9, R8, R7, R6, R5, R4,
(sequence "D%u", 15, 8))>;
+// The Windows Control Flow Guard Check function preserves the same registers as
+// AAPCS, and also preserves all floating point registers.
+def CSR_Win_AAPCS_CFGuard_Check : CalleeSavedRegs<(add LR, R11, R10, R9, R8, R7,
+ R6, R5, R4, (sequence "D%u", 15, 0))>;
+
// R8 is used to pass swifterror, remove it from CSR.
def CSR_AAPCS_SwiftError : CalleeSavedRegs<(sub CSR_AAPCS, R8)>;
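
The CC_ARM_Win32_CFGuard_Check convention defined above is deliberately tiny: the check routine receives a single i32 (the target function address) in R0, returns nothing, and via CSR_Win_AAPCS_CFGuard_Check preserves the AAPCS core registers plus all of D0-D15. For comparison, the same argument rule written by hand in the style of the custom handlers earlier in ARMCallingConv.cpp (which return true once they have handled a value) would look roughly like this; it is only a sketch, the real entry point is generated from the TableGen definition.

    // Hand-written sketch of the rule expressed by the TableGen entry above.
    static bool CC_ARM_Win32_CFGuard_Check_Sketch(unsigned ValNo, MVT ValVT,
                                                  MVT LocVT,
                                                  CCValAssign::LocInfo LocInfo,
                                                  ISD::ArgFlagsTy ArgFlags,
                                                  CCState &State) {
      if (LocVT != MVT::i32)
        return false;                 // only the i32 target address is expected
      if (unsigned Reg = State.AllocateReg(ARM::R0)) {
        State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
        return true;                  // handled: the address travels in R0
      }
      return false;                   // R0 unavailable, nothing else to try
    }
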
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMCodeGenPrepare.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMCodeGenPrepare.cpp
deleted file mode 100644
index 1c2c8aef55bb..000000000000
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMCodeGenPrepare.cpp
+++ /dev/null
@@ -1,1069 +0,0 @@
-//===----- ARMCodeGenPrepare.cpp ------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// This pass inserts intrinsics to handle small types that would otherwise be
-/// promoted during legalization. Here we can manually promote types or insert
-/// intrinsics which can handle narrow types that aren't supported by the
-/// register classes.
-//
-//===----------------------------------------------------------------------===//
-
-#include "ARM.h"
-#include "ARMSubtarget.h"
-#include "ARMTargetMachine.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/TargetPassConfig.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Value.h"
-#include "llvm/IR/Verifier.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-
-#define DEBUG_TYPE "arm-codegenprepare"
-
-using namespace llvm;
-
-static cl::opt<bool>
-DisableCGP("arm-disable-cgp", cl::Hidden, cl::init(true),
- cl::desc("Disable ARM specific CodeGenPrepare pass"));
-
-static cl::opt<bool>
-EnableDSP("arm-enable-scalar-dsp", cl::Hidden, cl::init(false),
- cl::desc("Use DSP instructions for scalar operations"));
-
-static cl::opt<bool>
-EnableDSPWithImms("arm-enable-scalar-dsp-imms", cl::Hidden, cl::init(false),
- cl::desc("Use DSP instructions for scalar operations\
- with immediate operands"));
-
-// The goal of this pass is to enable more efficient code generation for
-// operations on narrow types (i.e. types with < 32-bits) and this is a
-// motivating IR code example:
-//
-// define hidden i32 @cmp(i8 zeroext) {
-// %2 = add i8 %0, -49
-// %3 = icmp ult i8 %2, 3
-// ..
-// }
-//
-// The issue here is that i8 is type-legalized to i32 because i8 is not a
-// legal type. Thus, arithmetic is done in integer-precision, but then the
-// byte value is masked out as follows:
-//
-// t19: i32 = add t4, Constant:i32<-49>
-// t24: i32 = and t19, Constant:i32<255>
-//
-// Consequently, we generate code like this:
-//
-// subs r0, #49
-// uxtb r1, r0
-// cmp r1, #3
-//
-// This shows that masking out the byte value results in generation of
-// the UXTB instruction. This is not optimal as r0 already contains the byte
-// value we need, and so instead we can just generate:
-//
-// sub.w r1, r0, #49
-// cmp r1, #3
-//
-// We achieve this by type promoting the IR to i32 like so for this example:
-//
-// define i32 @cmp(i8 zeroext %c) {
-// %0 = zext i8 %c to i32
-// %c.off = add i32 %0, -49
-// %1 = icmp ult i32 %c.off, 3
-// ..
-// }
-//
-// For this to be valid and legal, we need to prove that the i32 add is
-// producing the same value as the i8 addition, and that e.g. no overflow
-// happens.
-//
-// A brief sketch of the algorithm and some terminology.
-// We pattern match interesting IR patterns:
-// - which have "sources": instructions producing narrow values (i8, i16), and
-// - they have "sinks": instructions consuming these narrow values.
-//
-// We collect all instruction connecting sources and sinks in a worklist, so
-// that we can mutate these instruction and perform type promotion when it is
-// legal to do so.
-
-namespace {
-class IRPromoter {
- SmallPtrSet<Value*, 8> NewInsts;
- SmallPtrSet<Instruction*, 4> InstsToRemove;
- DenseMap<Value*, SmallVector<Type*, 4>> TruncTysMap;
- SmallPtrSet<Value*, 8> Promoted;
- Module *M = nullptr;
- LLVMContext &Ctx;
- // The type we promote to: always i32
- IntegerType *ExtTy = nullptr;
- // The type of the value that the search began from, either i8 or i16.
- // This defines the max range of the values that we allow in the promoted
- // tree.
- IntegerType *OrigTy = nullptr;
- SetVector<Value*> *Visited;
- SmallPtrSetImpl<Value*> *Sources;
- SmallPtrSetImpl<Instruction*> *Sinks;
- SmallPtrSetImpl<Instruction*> *SafeToPromote;
- SmallPtrSetImpl<Instruction*> *SafeWrap;
-
- void ReplaceAllUsersOfWith(Value *From, Value *To);
- void PrepareWrappingAdds(void);
- void ExtendSources(void);
- void ConvertTruncs(void);
- void PromoteTree(void);
- void TruncateSinks(void);
- void Cleanup(void);
-
-public:
- IRPromoter(Module *M) : M(M), Ctx(M->getContext()),
- ExtTy(Type::getInt32Ty(Ctx)) { }
-
-
- void Mutate(Type *OrigTy,
- SetVector<Value*> &Visited,
- SmallPtrSetImpl<Value*> &Sources,
- SmallPtrSetImpl<Instruction*> &Sinks,
- SmallPtrSetImpl<Instruction*> &SafeToPromote,
- SmallPtrSetImpl<Instruction*> &SafeWrap);
-};
-
-class ARMCodeGenPrepare : public FunctionPass {
- const ARMSubtarget *ST = nullptr;
- IRPromoter *Promoter = nullptr;
- std::set<Value*> AllVisited;
- SmallPtrSet<Instruction*, 8> SafeToPromote;
- SmallPtrSet<Instruction*, 4> SafeWrap;
-
- bool isSafeWrap(Instruction *I);
- bool isSupportedValue(Value *V);
- bool isLegalToPromote(Value *V);
- bool TryToPromote(Value *V);
-
-public:
- static char ID;
- static unsigned TypeSize;
- Type *OrigTy = nullptr;
-
- ARMCodeGenPrepare() : FunctionPass(ID) {}
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<TargetPassConfig>();
- }
-
- StringRef getPassName() const override { return "ARM IR optimizations"; }
-
- bool doInitialization(Module &M) override;
- bool runOnFunction(Function &F) override;
- bool doFinalization(Module &M) override;
-};
-
-}
-
-static bool GenerateSignBits(Value *V) {
- if (!isa<Instruction>(V))
- return false;
-
- unsigned Opc = cast<Instruction>(V)->getOpcode();
- return Opc == Instruction::AShr || Opc == Instruction::SDiv ||
- Opc == Instruction::SRem || Opc == Instruction::SExt;
-}
-
-static bool EqualTypeSize(Value *V) {
- return V->getType()->getScalarSizeInBits() == ARMCodeGenPrepare::TypeSize;
-}
-
-static bool LessOrEqualTypeSize(Value *V) {
- return V->getType()->getScalarSizeInBits() <= ARMCodeGenPrepare::TypeSize;
-}
-
-static bool GreaterThanTypeSize(Value *V) {
- return V->getType()->getScalarSizeInBits() > ARMCodeGenPrepare::TypeSize;
-}
-
-static bool LessThanTypeSize(Value *V) {
- return V->getType()->getScalarSizeInBits() < ARMCodeGenPrepare::TypeSize;
-}
-
-/// Some instructions can use 8- and 16-bit operands, and we don't need to
-/// promote anything larger. We disallow booleans to make life easier when
-/// dealing with icmps but allow any other integer that is <= 16 bits. Void
-/// types are accepted so we can handle switches.
-static bool isSupportedType(Value *V) {
- Type *Ty = V->getType();
-
- // Allow voids and pointers, these won't be promoted.
- if (Ty->isVoidTy() || Ty->isPointerTy())
- return true;
-
- if (auto *Ld = dyn_cast<LoadInst>(V))
- Ty = cast<PointerType>(Ld->getPointerOperandType())->getElementType();
-
- if (!isa<IntegerType>(Ty) ||
- cast<IntegerType>(V->getType())->getBitWidth() == 1)
- return false;
-
- return LessOrEqualTypeSize(V);
-}
-
-/// Return true if the given value is a source in the use-def chain, producing
-/// a narrow 'TypeSize' value. These values will be zext to start the promotion
-/// of the tree to i32. We guarantee that these won't populate the upper bits
-/// of the register. ZExt on the loads will be free, and the same for call
-/// return values because we only accept ones that guarantee a zeroext ret val.
-/// Many arguments will have the zeroext attribute too, so those would be free
-/// too.
-static bool isSource(Value *V) {
- if (!isa<IntegerType>(V->getType()))
- return false;
-
- // TODO Allow zext to be sources.
- if (isa<Argument>(V))
- return true;
- else if (isa<LoadInst>(V))
- return true;
- else if (isa<BitCastInst>(V))
- return true;
- else if (auto *Call = dyn_cast<CallInst>(V))
- return Call->hasRetAttr(Attribute::AttrKind::ZExt);
- else if (auto *Trunc = dyn_cast<TruncInst>(V))
- return EqualTypeSize(Trunc);
- return false;
-}
-
-/// Return true if V will require any promoted values to be truncated for the
-/// the IR to remain valid. We can't mutate the value type of these
-/// instructions.
-static bool isSink(Value *V) {
- // TODO The truncate also isn't actually necessary because we would already
- // proved that the data value is kept within the range of the original data
- // type.
-
- // Sinks are:
- // - points where the value in the register is being observed, such as an
- // icmp, switch or store.
- // - points where value types have to match, such as calls and returns.
- // - zext are included to ease the transformation and are generally removed
- // later on.
- if (auto *Store = dyn_cast<StoreInst>(V))
- return LessOrEqualTypeSize(Store->getValueOperand());
- if (auto *Return = dyn_cast<ReturnInst>(V))
- return LessOrEqualTypeSize(Return->getReturnValue());
- if (auto *ZExt = dyn_cast<ZExtInst>(V))
- return GreaterThanTypeSize(ZExt);
- if (auto *Switch = dyn_cast<SwitchInst>(V))
- return LessThanTypeSize(Switch->getCondition());
- if (auto *ICmp = dyn_cast<ICmpInst>(V))
- return ICmp->isSigned() || LessThanTypeSize(ICmp->getOperand(0));
-
- return isa<CallInst>(V);
-}
-
-/// Return whether this instruction can safely wrap.
-bool ARMCodeGenPrepare::isSafeWrap(Instruction *I) {
- // We can support a, potentially, wrapping instruction (I) if:
- // - It is only used by an unsigned icmp.
- // - The icmp uses a constant.
- // - The wrapping value (I) is decreasing, i.e would underflow - wrapping
- // around zero to become a larger number than before.
- // - The wrapping instruction (I) also uses a constant.
- //
- // We can then use the two constants to calculate whether the result would
- // wrap in respect to itself in the original bitwidth. If it doesn't wrap,
- // just underflows the range, the icmp would give the same result whether the
- // result has been truncated or not. We calculate this by:
- // - Zero extending both constants, if needed, to 32-bits.
- // - Take the absolute value of I's constant, adding this to the icmp const.
- // - Check that this value is not out of range for small type. If it is, it
- // means that it has underflowed enough to wrap around the icmp constant.
- //
- // For example:
- //
- // %sub = sub i8 %a, 2
- // %cmp = icmp ule i8 %sub, 254
- //
- // If %a = 0, %sub = -2 == FE == 254
- // But if this is evalulated as a i32
- // %sub = -2 == FF FF FF FE == 4294967294
- // So the unsigned compares (i8 and i32) would not yield the same result.
- //
- // Another way to look at it is:
- // %a - 2 <= 254
- // %a + 2 <= 254 + 2
- // %a <= 256
- // And we can't represent 256 in the i8 format, so we don't support it.
- //
- // Whereas:
- //
- // %sub i8 %a, 1
- // %cmp = icmp ule i8 %sub, 254
- //
- // If %a = 0, %sub = -1 == FF == 255
- // As i32:
- // %sub = -1 == FF FF FF FF == 4294967295
- //
- // In this case, the unsigned compare results would be the same and this
- // would also be true for ult, uge and ugt:
- // - (255 < 254) == (0xFFFFFFFF < 254) == false
- // - (255 <= 254) == (0xFFFFFFFF <= 254) == false
- // - (255 > 254) == (0xFFFFFFFF > 254) == true
- // - (255 >= 254) == (0xFFFFFFFF >= 254) == true
- //
- // To demonstrate why we can't handle increasing values:
- //
- // %add = add i8 %a, 2
- // %cmp = icmp ult i8 %add, 127
- //
- // If %a = 254, %add = 256 == (i8 1)
- // As i32:
- // %add = 256
- //
- // (1 < 127) != (256 < 127)
-
- unsigned Opc = I->getOpcode();
- if (Opc != Instruction::Add && Opc != Instruction::Sub)
- return false;
-
- if (!I->hasOneUse() ||
- !isa<ICmpInst>(*I->user_begin()) ||
- !isa<ConstantInt>(I->getOperand(1)))
- return false;
-
- ConstantInt *OverflowConst = cast<ConstantInt>(I->getOperand(1));
- bool NegImm = OverflowConst->isNegative();
- bool IsDecreasing = ((Opc == Instruction::Sub) && !NegImm) ||
- ((Opc == Instruction::Add) && NegImm);
- if (!IsDecreasing)
- return false;
-
- // Don't support an icmp that deals with sign bits.
- auto *CI = cast<ICmpInst>(*I->user_begin());
- if (CI->isSigned() || CI->isEquality())
- return false;
-
- ConstantInt *ICmpConst = nullptr;
- if (auto *Const = dyn_cast<ConstantInt>(CI->getOperand(0)))
- ICmpConst = Const;
- else if (auto *Const = dyn_cast<ConstantInt>(CI->getOperand(1)))
- ICmpConst = Const;
- else
- return false;
-
- // Now check that the result can't wrap on itself.
- APInt Total = ICmpConst->getValue().getBitWidth() < 32 ?
- ICmpConst->getValue().zext(32) : ICmpConst->getValue();
-
- Total += OverflowConst->getValue().getBitWidth() < 32 ?
- OverflowConst->getValue().abs().zext(32) : OverflowConst->getValue().abs();
-
- APInt Max = APInt::getAllOnesValue(ARMCodeGenPrepare::TypeSize);
-
- if (Total.getBitWidth() > Max.getBitWidth()) {
- if (Total.ugt(Max.zext(Total.getBitWidth())))
- return false;
- } else if (Max.getBitWidth() > Total.getBitWidth()) {
- if (Total.zext(Max.getBitWidth()).ugt(Max))
- return false;
- } else if (Total.ugt(Max))
- return false;
-
- LLVM_DEBUG(dbgs() << "ARM CGP: Allowing safe overflow for " << *I << "\n");
- SafeWrap.insert(I);
- return true;
-}
-
-static bool shouldPromote(Value *V) {
- if (!isa<IntegerType>(V->getType()) || isSink(V))
- return false;
-
- if (isSource(V))
- return true;
-
- auto *I = dyn_cast<Instruction>(V);
- if (!I)
- return false;
-
- if (isa<ICmpInst>(I))
- return false;
-
- return true;
-}
-
-/// Return whether we can safely mutate V's type to ExtTy without having to be
-/// concerned with zero extending or truncation.
-static bool isPromotedResultSafe(Value *V) {
- if (GenerateSignBits(V))
- return false;
-
- if (!isa<Instruction>(V))
- return true;
-
- if (!isa<OverflowingBinaryOperator>(V))
- return true;
-
- return cast<Instruction>(V)->hasNoUnsignedWrap();
-}
-
-/// Return the intrinsic for the instruction that can perform the same
-/// operation but on a narrow type. This is using the parallel dsp intrinsics
-/// on scalar values.
-static Intrinsic::ID getNarrowIntrinsic(Instruction *I) {
- // Whether we use the signed or unsigned versions of these intrinsics
- // doesn't matter because we're not using the GE bits that they set in
- // the APSR.
- switch(I->getOpcode()) {
- default:
- break;
- case Instruction::Add:
- return ARMCodeGenPrepare::TypeSize == 16 ? Intrinsic::arm_uadd16 :
- Intrinsic::arm_uadd8;
- case Instruction::Sub:
- return ARMCodeGenPrepare::TypeSize == 16 ? Intrinsic::arm_usub16 :
- Intrinsic::arm_usub8;
- }
- llvm_unreachable("unhandled opcode for narrow intrinsic");
-}
-
-void IRPromoter::ReplaceAllUsersOfWith(Value *From, Value *To) {
- SmallVector<Instruction*, 4> Users;
- Instruction *InstTo = dyn_cast<Instruction>(To);
- bool ReplacedAll = true;
-
- LLVM_DEBUG(dbgs() << "ARM CGP: Replacing " << *From << " with " << *To
- << "\n");
-
- for (Use &U : From->uses()) {
- auto *User = cast<Instruction>(U.getUser());
- if (InstTo && User->isIdenticalTo(InstTo)) {
- ReplacedAll = false;
- continue;
- }
- Users.push_back(User);
- }
-
- for (auto *U : Users)
- U->replaceUsesOfWith(From, To);
-
- if (ReplacedAll)
- if (auto *I = dyn_cast<Instruction>(From))
- InstsToRemove.insert(I);
-}
-
-void IRPromoter::PrepareWrappingAdds() {
- LLVM_DEBUG(dbgs() << "ARM CGP: Prepare underflowing adds.\n");
- IRBuilder<> Builder{Ctx};
-
- // For adds that safely wrap and use a negative immediate as operand 1, we
- // create an equivalent instruction using a positive immediate.
- // That positive immediate can then be zext along with all the other
- // immediates later.
- for (auto *I : *SafeWrap) {
- if (I->getOpcode() != Instruction::Add)
- continue;
-
- LLVM_DEBUG(dbgs() << "ARM CGP: Adjusting " << *I << "\n");
- assert((isa<ConstantInt>(I->getOperand(1)) &&
- cast<ConstantInt>(I->getOperand(1))->isNegative()) &&
- "Wrapping should have a negative immediate as the second operand");
-
- auto Const = cast<ConstantInt>(I->getOperand(1));
- auto *NewConst = ConstantInt::get(Ctx, Const->getValue().abs());
- Builder.SetInsertPoint(I);
- Value *NewVal = Builder.CreateSub(I->getOperand(0), NewConst);
- if (auto *NewInst = dyn_cast<Instruction>(NewVal)) {
- NewInst->copyIRFlags(I);
- NewInsts.insert(NewInst);
- }
- InstsToRemove.insert(I);
- I->replaceAllUsesWith(NewVal);
- LLVM_DEBUG(dbgs() << "ARM CGP: New equivalent: " << *NewVal << "\n");
- }
- for (auto *I : NewInsts)
- Visited->insert(I);
-}
-
-void IRPromoter::ExtendSources() {
- IRBuilder<> Builder{Ctx};
-
- auto InsertZExt = [&](Value *V, Instruction *InsertPt) {
- assert(V->getType() != ExtTy && "zext already extends to i32");
- LLVM_DEBUG(dbgs() << "ARM CGP: Inserting ZExt for " << *V << "\n");
- Builder.SetInsertPoint(InsertPt);
- if (auto *I = dyn_cast<Instruction>(V))
- Builder.SetCurrentDebugLocation(I->getDebugLoc());
-
- Value *ZExt = Builder.CreateZExt(V, ExtTy);
- if (auto *I = dyn_cast<Instruction>(ZExt)) {
- if (isa<Argument>(V))
- I->moveBefore(InsertPt);
- else
- I->moveAfter(InsertPt);
- NewInsts.insert(I);
- }
-
- ReplaceAllUsersOfWith(V, ZExt);
- };
-
- // Now, insert extending instructions between the sources and their users.
- LLVM_DEBUG(dbgs() << "ARM CGP: Promoting sources:\n");
- for (auto V : *Sources) {
- LLVM_DEBUG(dbgs() << " - " << *V << "\n");
- if (auto *I = dyn_cast<Instruction>(V))
- InsertZExt(I, I);
- else if (auto *Arg = dyn_cast<Argument>(V)) {
- BasicBlock &BB = Arg->getParent()->front();
- InsertZExt(Arg, &*BB.getFirstInsertionPt());
- } else {
- llvm_unreachable("unhandled source that needs extending");
- }
- Promoted.insert(V);
- }
-}
-
-void IRPromoter::PromoteTree() {
- LLVM_DEBUG(dbgs() << "ARM CGP: Mutating the tree..\n");
-
- IRBuilder<> Builder{Ctx};
-
- // Mutate the types of the instructions within the tree. Here we handle
- // constant operands.
- for (auto *V : *Visited) {
- if (Sources->count(V))
- continue;
-
- auto *I = cast<Instruction>(V);
- if (Sinks->count(I))
- continue;
-
- for (unsigned i = 0, e = I->getNumOperands(); i < e; ++i) {
- Value *Op = I->getOperand(i);
- if ((Op->getType() == ExtTy) || !isa<IntegerType>(Op->getType()))
- continue;
-
- if (auto *Const = dyn_cast<ConstantInt>(Op)) {
- Constant *NewConst = ConstantExpr::getZExt(Const, ExtTy);
- I->setOperand(i, NewConst);
- } else if (isa<UndefValue>(Op))
- I->setOperand(i, UndefValue::get(ExtTy));
- }
-
- if (shouldPromote(I)) {
- I->mutateType(ExtTy);
- Promoted.insert(I);
- }
- }
-
- // Finally, any instructions that should be promoted but haven't yet been,
- // need to be handled using intrinsics.
- for (auto *V : *Visited) {
- auto *I = dyn_cast<Instruction>(V);
- if (!I)
- continue;
-
- if (Sources->count(I) || Sinks->count(I))
- continue;
-
- if (!shouldPromote(I) || SafeToPromote->count(I) || NewInsts.count(I))
- continue;
-
- assert(EnableDSP && "DSP intrinsic insertion not enabled!");
-
- // Replace unsafe instructions with appropriate intrinsic calls.
- LLVM_DEBUG(dbgs() << "ARM CGP: Inserting DSP intrinsic for "
- << *I << "\n");
- Function *DSPInst =
- Intrinsic::getDeclaration(M, getNarrowIntrinsic(I));
- Builder.SetInsertPoint(I);
- Builder.SetCurrentDebugLocation(I->getDebugLoc());
- Value *Args[] = { I->getOperand(0), I->getOperand(1) };
- CallInst *Call = Builder.CreateCall(DSPInst, Args);
- NewInsts.insert(Call);
- ReplaceAllUsersOfWith(I, Call);
- }
-}
-
-void IRPromoter::TruncateSinks() {
- LLVM_DEBUG(dbgs() << "ARM CGP: Fixing up the sinks:\n");
-
- IRBuilder<> Builder{Ctx};
-
- auto InsertTrunc = [&](Value *V, Type *TruncTy) -> Instruction* {
- if (!isa<Instruction>(V) || !isa<IntegerType>(V->getType()))
- return nullptr;
-
- if ((!Promoted.count(V) && !NewInsts.count(V)) || Sources->count(V))
- return nullptr;
-
- LLVM_DEBUG(dbgs() << "ARM CGP: Creating " << *TruncTy << " Trunc for "
- << *V << "\n");
- Builder.SetInsertPoint(cast<Instruction>(V));
- auto *Trunc = dyn_cast<Instruction>(Builder.CreateTrunc(V, TruncTy));
- if (Trunc)
- NewInsts.insert(Trunc);
- return Trunc;
- };
-
- // Fix up any stores or returns that use the results of the promoted
- // chain.
- for (auto I : *Sinks) {
- LLVM_DEBUG(dbgs() << "ARM CGP: For Sink: " << *I << "\n");
-
- // Handle calls separately as we need to iterate over arg operands.
- if (auto *Call = dyn_cast<CallInst>(I)) {
- for (unsigned i = 0; i < Call->getNumArgOperands(); ++i) {
- Value *Arg = Call->getArgOperand(i);
- Type *Ty = TruncTysMap[Call][i];
- if (Instruction *Trunc = InsertTrunc(Arg, Ty)) {
- Trunc->moveBefore(Call);
- Call->setArgOperand(i, Trunc);
- }
- }
- continue;
- }
-
- // Special case switches because we need to truncate the condition.
- if (auto *Switch = dyn_cast<SwitchInst>(I)) {
- Type *Ty = TruncTysMap[Switch][0];
- if (Instruction *Trunc = InsertTrunc(Switch->getCondition(), Ty)) {
- Trunc->moveBefore(Switch);
- Switch->setCondition(Trunc);
- }
- continue;
- }
-
- // Now handle the others.
- for (unsigned i = 0; i < I->getNumOperands(); ++i) {
- Type *Ty = TruncTysMap[I][i];
- if (Instruction *Trunc = InsertTrunc(I->getOperand(i), Ty)) {
- Trunc->moveBefore(I);
- I->setOperand(i, Trunc);
- }
- }
- }
-}
-
-void IRPromoter::Cleanup() {
- LLVM_DEBUG(dbgs() << "ARM CGP: Cleanup..\n");
- // Some zexts will now have become redundant, along with their trunc
- // operands, so remove them
- for (auto V : *Visited) {
- if (!isa<ZExtInst>(V))
- continue;
-
- auto ZExt = cast<ZExtInst>(V);
- if (ZExt->getDestTy() != ExtTy)
- continue;
-
- Value *Src = ZExt->getOperand(0);
- if (ZExt->getSrcTy() == ZExt->getDestTy()) {
- LLVM_DEBUG(dbgs() << "ARM CGP: Removing unnecessary cast: " << *ZExt
- << "\n");
- ReplaceAllUsersOfWith(ZExt, Src);
- continue;
- }
-
- // Unless they produce a value that is narrower than ExtTy, we can
- // replace the result of the zext with the input of a newly inserted
- // trunc.
- if (NewInsts.count(Src) && isa<TruncInst>(Src) &&
- Src->getType() == OrigTy) {
- auto *Trunc = cast<TruncInst>(Src);
- assert(Trunc->getOperand(0)->getType() == ExtTy &&
- "expected inserted trunc to be operating on i32");
- ReplaceAllUsersOfWith(ZExt, Trunc->getOperand(0));
- }
- }
-
- for (auto *I : InstsToRemove) {
- LLVM_DEBUG(dbgs() << "ARM CGP: Removing " << *I << "\n");
- I->dropAllReferences();
- I->eraseFromParent();
- }
-
- InstsToRemove.clear();
- NewInsts.clear();
- TruncTysMap.clear();
- Promoted.clear();
- SafeToPromote->clear();
- SafeWrap->clear();
-}
-
-void IRPromoter::ConvertTruncs() {
- LLVM_DEBUG(dbgs() << "ARM CGP: Converting truncs..\n");
- IRBuilder<> Builder{Ctx};
-
- for (auto *V : *Visited) {
- if (!isa<TruncInst>(V) || Sources->count(V))
- continue;
-
- auto *Trunc = cast<TruncInst>(V);
- Builder.SetInsertPoint(Trunc);
- IntegerType *SrcTy = cast<IntegerType>(Trunc->getOperand(0)->getType());
- IntegerType *DestTy = cast<IntegerType>(TruncTysMap[Trunc][0]);
-
- unsigned NumBits = DestTy->getScalarSizeInBits();
- ConstantInt *Mask =
- ConstantInt::get(SrcTy, APInt::getMaxValue(NumBits).getZExtValue());
- Value *Masked = Builder.CreateAnd(Trunc->getOperand(0), Mask);
-
- if (auto *I = dyn_cast<Instruction>(Masked))
- NewInsts.insert(I);
-
- ReplaceAllUsersOfWith(Trunc, Masked);
- }
-}
-
-void IRPromoter::Mutate(Type *OrigTy,
- SetVector<Value*> &Visited,
- SmallPtrSetImpl<Value*> &Sources,
- SmallPtrSetImpl<Instruction*> &Sinks,
- SmallPtrSetImpl<Instruction*> &SafeToPromote,
- SmallPtrSetImpl<Instruction*> &SafeWrap) {
- LLVM_DEBUG(dbgs() << "ARM CGP: Promoting use-def chains from "
- << ARMCodeGenPrepare::TypeSize << " to 32-bits\n");
-
- assert(isa<IntegerType>(OrigTy) && "expected integer type");
- this->OrigTy = cast<IntegerType>(OrigTy);
- assert(OrigTy->getPrimitiveSizeInBits() < ExtTy->getPrimitiveSizeInBits() &&
- "original type not smaller than extended type");
-
- this->Visited = &Visited;
- this->Sources = &Sources;
- this->Sinks = &Sinks;
- this->SafeToPromote = &SafeToPromote;
- this->SafeWrap = &SafeWrap;
-
- // Cache original types of the values that will likely need truncating
- for (auto *I : Sinks) {
- if (auto *Call = dyn_cast<CallInst>(I)) {
- for (unsigned i = 0; i < Call->getNumArgOperands(); ++i) {
- Value *Arg = Call->getArgOperand(i);
- TruncTysMap[Call].push_back(Arg->getType());
- }
- } else if (auto *Switch = dyn_cast<SwitchInst>(I))
- TruncTysMap[I].push_back(Switch->getCondition()->getType());
- else {
- for (unsigned i = 0; i < I->getNumOperands(); ++i)
- TruncTysMap[I].push_back(I->getOperand(i)->getType());
- }
- }
- for (auto *V : Visited) {
- if (!isa<TruncInst>(V) || Sources.count(V))
- continue;
- auto *Trunc = cast<TruncInst>(V);
- TruncTysMap[Trunc].push_back(Trunc->getDestTy());
- }
-
- // Convert adds using negative immediates to equivalent instructions that use
- // positive constants.
- PrepareWrappingAdds();
-
- // Insert zext instructions between sources and their users.
- ExtendSources();
-
- // Promote visited instructions, mutating their types in place. Also insert
- // DSP intrinsics, if enabled, for adds and subs which would be unsafe to
- // promote.
- PromoteTree();
-
- // Convert any truncs that aren't sources into AND masks.
- ConvertTruncs();
-
- // Insert trunc instructions for use by calls, stores etc...
- TruncateSinks();
-
- // Finally, remove unnecessary zexts and truncs, delete old instructions and
- // clear the data structures.
- Cleanup();
-
- LLVM_DEBUG(dbgs() << "ARM CGP: Mutation complete\n");
-}
-
-/// We accept most instructions, as well as Arguments and ConstantInts. We
-/// disallow casts other than zext and trunc and only allow calls if their
-/// return value is zeroext. We don't allow opcodes that can introduce sign
-/// bits.
-bool ARMCodeGenPrepare::isSupportedValue(Value *V) {
- if (auto *I = dyn_cast<Instruction>(V)) {
- switch (I->getOpcode()) {
- default:
- return isa<BinaryOperator>(I) && isSupportedType(I) &&
- !GenerateSignBits(I);
- case Instruction::GetElementPtr:
- case Instruction::Store:
- case Instruction::Br:
- case Instruction::Switch:
- return true;
- case Instruction::PHI:
- case Instruction::Select:
- case Instruction::Ret:
- case Instruction::Load:
- case Instruction::Trunc:
- case Instruction::BitCast:
- return isSupportedType(I);
- case Instruction::ZExt:
- return isSupportedType(I->getOperand(0));
- case Instruction::ICmp:
- // Now that we allow smaller types than TypeSize, only allow icmp of
- // TypeSize because they will require a trunc to be legalised.
- // TODO: Allow icmp of smaller types, and calculate at the end
- // whether the transform would be beneficial.
- if (isa<PointerType>(I->getOperand(0)->getType()))
- return true;
- return EqualTypeSize(I->getOperand(0));
- case Instruction::Call: {
- // Special cases for calls as we need to check for zeroext.
- // TODO: We should accept calls even if they don't have zeroext, as they
- // can still be sinks.
- auto *Call = cast<CallInst>(I);
- return isSupportedType(Call) &&
- Call->hasRetAttr(Attribute::AttrKind::ZExt);
- }
- }
- } else if (isa<Constant>(V) && !isa<ConstantExpr>(V)) {
- return isSupportedType(V);
- } else if (isa<Argument>(V))
- return isSupportedType(V);
-
- return isa<BasicBlock>(V);
-}
-
-/// Check that the type of V would be promoted and that the original type is
-/// smaller than the targeted promoted type. Check that we're not trying to
-/// promote something larger than our base 'TypeSize' type.
-bool ARMCodeGenPrepare::isLegalToPromote(Value *V) {
-
- auto *I = dyn_cast<Instruction>(V);
- if (!I)
- return true;
-
- if (SafeToPromote.count(I))
- return true;
-
- if (isPromotedResultSafe(V) || isSafeWrap(I)) {
- SafeToPromote.insert(I);
- return true;
- }
-
- if (I->getOpcode() != Instruction::Add && I->getOpcode() != Instruction::Sub)
- return false;
-
- // If promotion is not safe, can we use a DSP instruction to natively
- // handle the narrow type?
- if (!ST->hasDSP() || !EnableDSP || !isSupportedType(I))
- return false;
-
- if (ST->isThumb() && !ST->hasThumb2())
- return false;
-
- // TODO
- // Would it be profitable? For Thumb code, these parallel DSP instructions
- // are only Thumb-2, so we wouldn't be able to dual issue on Cortex-M33. For
- // Cortex-A, specifically Cortex-A72, the latency is double and throughput is
- // halved. They also do not take immediates as operands.
- for (auto &Op : I->operands()) {
- if (isa<Constant>(Op)) {
- if (!EnableDSPWithImms)
- return false;
- }
- }
- LLVM_DEBUG(dbgs() << "ARM CGP: Will use an intrinsic for: " << *I << "\n");
- return true;
-}
-
-bool ARMCodeGenPrepare::TryToPromote(Value *V) {
- OrigTy = V->getType();
- TypeSize = OrigTy->getPrimitiveSizeInBits();
- if (TypeSize > 16 || TypeSize < 8)
- return false;
-
- SafeToPromote.clear();
- SafeWrap.clear();
-
- if (!isSupportedValue(V) || !shouldPromote(V) || !isLegalToPromote(V))
- return false;
-
- LLVM_DEBUG(dbgs() << "ARM CGP: TryToPromote: " << *V << ", TypeSize = "
- << TypeSize << "\n");
-
- SetVector<Value*> WorkList;
- SmallPtrSet<Value*, 8> Sources;
- SmallPtrSet<Instruction*, 4> Sinks;
- SetVector<Value*> CurrentVisited;
- WorkList.insert(V);
-
- // Return true if V was added to the worklist as a supported instruction,
- // if it was already visited, or if we don't need to explore it (e.g.
- // pointer values and GEPs), and false otherwise.
- auto AddLegalInst = [&](Value *V) {
- if (CurrentVisited.count(V))
- return true;
-
- // Ignore GEPs because they don't need promoting and the constant indices
- // will prevent the transformation.
- if (isa<GetElementPtrInst>(V))
- return true;
-
- if (!isSupportedValue(V) || (shouldPromote(V) && !isLegalToPromote(V))) {
- LLVM_DEBUG(dbgs() << "ARM CGP: Can't handle: " << *V << "\n");
- return false;
- }
-
- WorkList.insert(V);
- return true;
- };
-
- // Iterate through, and add to, a tree of operands and users in the use-def.
- while (!WorkList.empty()) {
- Value *V = WorkList.back();
- WorkList.pop_back();
- if (CurrentVisited.count(V))
- continue;
-
- // Ignore non-instructions, other than arguments.
- if (!isa<Instruction>(V) && !isSource(V))
- continue;
-
- // If we've already visited this value from somewhere, bail now because
- // the tree has already been explored.
- // TODO: This could limit the transform, i.e. if we try to promote something
- // from an i8 and fail first, before trying an i16.
- if (AllVisited.count(V))
- return false;
-
- CurrentVisited.insert(V);
- AllVisited.insert(V);
-
- // Calls can be both sources and sinks.
- if (isSink(V))
- Sinks.insert(cast<Instruction>(V));
-
- if (isSource(V))
- Sources.insert(V);
-
- if (!isSink(V) && !isSource(V)) {
- if (auto *I = dyn_cast<Instruction>(V)) {
- // Visit operands of any instruction visited.
- for (auto &U : I->operands()) {
- if (!AddLegalInst(U))
- return false;
- }
- }
- }
-
- // Don't visit users of a node which isn't going to be mutated unless it's a
- // source.
- if (isSource(V) || shouldPromote(V)) {
- for (Use &U : V->uses()) {
- if (!AddLegalInst(U.getUser()))
- return false;
- }
- }
- }
-
- LLVM_DEBUG(dbgs() << "ARM CGP: Visited nodes:\n";
- for (auto *I : CurrentVisited)
- I->dump();
- );
- unsigned ToPromote = 0;
- for (auto *V : CurrentVisited) {
- if (Sources.count(V))
- continue;
- if (Sinks.count(cast<Instruction>(V)))
- continue;
- ++ToPromote;
- }
-
- if (ToPromote < 2)
- return false;
-
- Promoter->Mutate(OrigTy, CurrentVisited, Sources, Sinks, SafeToPromote,
- SafeWrap);
- return true;
-}
-
-bool ARMCodeGenPrepare::doInitialization(Module &M) {
- Promoter = new IRPromoter(&M);
- return false;
-}
-
-bool ARMCodeGenPrepare::runOnFunction(Function &F) {
- if (skipFunction(F) || DisableCGP)
- return false;
-
- auto *TPC = &getAnalysis<TargetPassConfig>();
- if (!TPC)
- return false;
-
- const TargetMachine &TM = TPC->getTM<TargetMachine>();
- ST = &TM.getSubtarget<ARMSubtarget>(F);
- bool MadeChange = false;
- LLVM_DEBUG(dbgs() << "ARM CGP: Running on " << F.getName() << "\n");
-
- // Search up from icmps to try to promote their operands.
- for (BasicBlock &BB : F) {
- auto &Insts = BB.getInstList();
- for (auto &I : Insts) {
- if (AllVisited.count(&I))
- continue;
-
- if (isa<ICmpInst>(I)) {
- auto &CI = cast<ICmpInst>(I);
-
- // Skip signed or pointer compares
- if (CI.isSigned() || !isa<IntegerType>(CI.getOperand(0)->getType()))
- continue;
-
- LLVM_DEBUG(dbgs() << "ARM CGP: Searching from: " << CI << "\n");
-
- for (auto &Op : CI.operands()) {
- if (auto *I = dyn_cast<Instruction>(Op))
- MadeChange |= TryToPromote(I);
- }
- }
- }
- LLVM_DEBUG(if (verifyFunction(F, &dbgs())) {
- dbgs() << F;
- report_fatal_error("Broken function after type promotion");
- });
- }
- if (MadeChange)
- LLVM_DEBUG(dbgs() << "After ARMCodeGenPrepare: " << F << "\n");
-
- return MadeChange;
-}
-
-bool ARMCodeGenPrepare::doFinalization(Module &M) {
- delete Promoter;
- return false;
-}
-
-INITIALIZE_PASS_BEGIN(ARMCodeGenPrepare, DEBUG_TYPE,
- "ARM IR optimizations", false, false)
-INITIALIZE_PASS_END(ARMCodeGenPrepare, DEBUG_TYPE, "ARM IR optimizations",
- false, false)
-
-char ARMCodeGenPrepare::ID = 0;
-unsigned ARMCodeGenPrepare::TypeSize = 0;
-
-FunctionPass *llvm::createARMCodeGenPreparePass() {
- return new ARMCodeGenPrepare();
-}
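
For readers unfamiliar with the pass removed above: PrepareWrappingAdds turns an add of a negative constant into a sub of its absolute value, so the new positive immediate can later be zero-extended along with the rest of the tree. The snippet below is a rough, free-standing sketch of that rewrite only; rewriteWrappingAdd is a hypothetical helper written for illustration and is not part of this patch or of the LLVM API.

// Illustrative sketch only: a stand-alone rendering of the wrapping-add
// rewrite performed by the removed PrepareWrappingAdds.
#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Turns "%r = add i8 %x, -16" into "%r = sub i8 %x, 16" so that the positive
// immediate can be zero-extended to i32 together with the other constants.
static Value *rewriteWrappingAdd(BinaryOperator *Add) {
  if (Add->getOpcode() != Instruction::Add)
    return nullptr;
  auto *C = dyn_cast<ConstantInt>(Add->getOperand(1));
  if (!C || !C->isNegative())
    return nullptr;
  IRBuilder<> Builder(Add);
  Value *NewVal = Builder.CreateSub(
      Add->getOperand(0),
      ConstantInt::get(Add->getContext(), C->getValue().abs()));
  if (auto *NewInst = dyn_cast<Instruction>(NewVal))
    NewInst->copyIRFlags(Add); // preserve nuw/nsw, as the pass did
  Add->replaceAllUsesWith(NewVal);
  return NewVal;
}

The real pass additionally recorded the new instruction in NewInsts and queued the original add for removal, as PrepareWrappingAdds above shows.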
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp
index 24ca25f73e96..634fb89b8e89 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp
@@ -1917,6 +1917,7 @@ bool ARMConstantIslands::optimizeThumb2Branches() {
MachineInstrBuilder MIB = BuildMI(*MBB, Br.MI, Br.MI->getDebugLoc(),
TII->get(ARM::t2LE));
+ // Swapped a t2Bcc for a t2LE, so no need to update the size of the block.
MIB.add(Br.MI->getOperand(0));
Br.MI->eraseFromParent();
Br.MI = MIB;
@@ -1975,21 +1976,20 @@ bool ARMConstantIslands::optimizeThumb2Branches() {
.addMBB(DestBB, Br.MI->getOperand(0).getTargetFlags());
Cmp.MI->eraseFromParent();
- BBInfoVector &BBInfo = BBUtils->getBBInfo();
- BBInfo[MBB->getNumber()].Size -= 2;
if (Br.MI->getOpcode() == ARM::tBcc) {
Br.MI->eraseFromParent();
Br.MI = NewBR;
- } else if (&MBB->back() != Br.MI) {
- // We've generated an LE and already erased the original conditional
- // branch. The CBN?Z is now used to branch to the other successor, so an
- // unconditional branch terminator is now redundant.
+ BBUtils->adjustBBSize(MBB, -2);
+ } else if (MBB->back().getOpcode() != ARM::t2LE) {
+ // An LE has been generated, but it's not the terminator - that is an
+ // unconditional branch. However, the logic has now been reversed with the
+ // CBN?Z being the conditional branch and the LE being the unconditional
+ // branch. So this means we can remove the redundant unconditional branch
+ // at the end of the block.
MachineInstr *LastMI = &MBB->back();
- if (LastMI != Br.MI) {
- BBInfo[MBB->getNumber()].Size -= LastMI->getDesc().getSize();
- LastMI->eraseFromParent();
- }
+ BBUtils->adjustBBSize(MBB, -LastMI->getDesc().getSize());
+ LastMI->eraseFromParent();
}
BBUtils->adjustBBOffsetsAfter(MBB);
++NumCBZ;
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index 563fdda56104..2c3ac816219f 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -1213,9 +1213,10 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
MBBI = NewMI;
return true;
}
+ case ARM::VMOVHcc:
case ARM::VMOVScc:
case ARM::VMOVDcc: {
- unsigned newOpc = Opcode == ARM::VMOVScc ? ARM::VMOVS : ARM::VMOVD;
+ unsigned newOpc = Opcode != ARM::VMOVDcc ? ARM::VMOVS : ARM::VMOVD;
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(newOpc),
MI.getOperand(1).getReg())
.add(MI.getOperand(2))
@@ -1951,6 +1952,24 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
MI.eraseFromParent();
return true;
}
+ case ARM::LOADDUAL:
+ case ARM::STOREDUAL: {
+ Register PairReg = MI.getOperand(0).getReg();
+
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(),
+ TII->get(Opcode == ARM::LOADDUAL ? ARM::LDRD : ARM::STRD))
+ .addReg(TRI->getSubReg(PairReg, ARM::gsub_0),
+ Opcode == ARM::LOADDUAL ? RegState::Define : 0)
+ .addReg(TRI->getSubReg(PairReg, ARM::gsub_1),
+ Opcode == ARM::LOADDUAL ? RegState::Define : 0);
+ for (unsigned i = 1; i < MI.getNumOperands(); i++)
+ MIB.add(MI.getOperand(i));
+ MIB.add(predOps(ARMCC::AL));
+ MIB.cloneMemRefs(MI);
+ MI.eraseFromParent();
+ return true;
+ }
}
}
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMFastISel.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMFastISel.cpp
index 1fc5ff6921c6..6e19db3c7e22 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMFastISel.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMFastISel.cpp
@@ -1879,6 +1879,8 @@ CCAssignFn *ARMFastISel::CCAssignFnForCall(CallingConv::ID CC,
report_fatal_error("Can't return in GHC call convention");
else
return CC_ARM_APCS_GHC;
+ case CallingConv::CFGuard_Check:
+ return (Return ? RetCC_ARM_AAPCS : CC_ARM_Win32_CFGuard_Check);
}
}
@@ -2564,8 +2566,12 @@ bool ARMFastISel::SelectIntrinsicCall(const IntrinsicInst &I) {
return SelectCall(&I, "memset");
}
case Intrinsic::trap: {
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(
- Subtarget->useNaClTrap() ? ARM::TRAPNaCl : ARM::TRAP));
+ unsigned Opcode;
+ if (Subtarget->isThumb())
+ Opcode = ARM::tTRAP;
+ else
+ Opcode = Subtarget->useNaClTrap() ? ARM::TRAPNaCl : ARM::TRAP;
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opcode));
return true;
}
}
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMFrameLowering.cpp
index 01ae93086dcb..cb98b2b34efd 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMFrameLowering.cpp
@@ -2128,10 +2128,16 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
AFI->setLRIsSpilledForFarJump(true);
}
AFI->setLRIsSpilled(SavedRegs.test(ARM::LR));
+}
+
+void ARMFrameLowering::getCalleeSaves(const MachineFunction &MF,
+ BitVector &SavedRegs) const {
+ TargetFrameLowering::getCalleeSaves(MF, SavedRegs);
// If we have the "returned" parameter attribute which guarantees that we
// return the value which was passed in r0 unmodified (e.g. C++ 'structors),
// record that fact for IPRA.
+ const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
if (AFI->getPreservesR0())
SavedRegs.set(ARM::R0);
}
@@ -2418,7 +2424,8 @@ void ARMFrameLowering::adjustForSegmentedStacks(
} else {
// Get TLS base address from the coprocessor
// mrc p15, #0, SR0, c13, c0, #3
- BuildMI(McrMBB, DL, TII.get(ARM::MRC), ScratchReg0)
+ BuildMI(McrMBB, DL, TII.get(Thumb ? ARM::t2MRC : ARM::MRC),
+ ScratchReg0)
.addImm(15)
.addImm(0)
.addImm(13)
@@ -2432,7 +2439,8 @@ void ARMFrameLowering::adjustForSegmentedStacks(
// Get the stack limit from the right offset
// ldr SR0, [sr0, #4 * TlsOffset]
- BuildMI(GetMBB, DL, TII.get(ARM::LDRi12), ScratchReg0)
+ BuildMI(GetMBB, DL, TII.get(Thumb ? ARM::t2LDRi12 : ARM::LDRi12),
+ ScratchReg0)
.addReg(ScratchReg0)
.addImm(4 * TlsOffset)
.add(predOps(ARMCC::AL));
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMFrameLowering.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARMFrameLowering.h
index 6d8aee597945..0462b01af707 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMFrameLowering.h
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMFrameLowering.h
@@ -53,6 +53,8 @@ public:
int ResolveFrameIndexReference(const MachineFunction &MF, int FI,
unsigned &FrameReg, int SPAdj) const;
+ void getCalleeSaves(const MachineFunction &MF,
+ BitVector &SavedRegs) const override;
void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
RegScavenger *RS) const override;
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMHazardRecognizer.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARMHazardRecognizer.h
index b5ac694e01f7..ca02cc739e11 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMHazardRecognizer.h
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMHazardRecognizer.h
@@ -27,14 +27,13 @@ class MachineInstr;
/// ARM preRA scheduler uses an unspecialized instance of the
/// ScoreboardHazardRecognizer.
class ARMHazardRecognizer : public ScoreboardHazardRecognizer {
- MachineInstr *LastMI;
- unsigned FpMLxStalls;
+ MachineInstr *LastMI = nullptr;
+ unsigned FpMLxStalls = 0;
public:
ARMHazardRecognizer(const InstrItineraryData *ItinData,
const ScheduleDAG *DAG)
- : ScoreboardHazardRecognizer(ItinData, DAG, "post-RA-sched"),
- LastMI(nullptr) {}
+ : ScoreboardHazardRecognizer(ItinData, DAG, "post-RA-sched") {}
HazardType getHazardType(SUnit *SU, int Stalls) override;
void Reset() override;
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 8f6515c423eb..76a9ac12062d 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -28,6 +28,7 @@
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsARM.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
@@ -144,6 +145,8 @@ public:
// Thumb 2 Addressing Modes:
bool SelectT2AddrModeImm12(SDValue N, SDValue &Base, SDValue &OffImm);
+ template <unsigned Shift>
+ bool SelectT2AddrModeImm8(SDValue N, SDValue &Base, SDValue &OffImm);
bool SelectT2AddrModeImm8(SDValue N, SDValue &Base,
SDValue &OffImm);
bool SelectT2AddrModeImm8Offset(SDNode *Op, SDValue N,
@@ -158,6 +161,9 @@ public:
SDValue &OffReg, SDValue &ShImm);
bool SelectT2AddrModeExclusive(SDValue N, SDValue &Base, SDValue &OffImm);
+ template<int Min, int Max>
+ bool SelectImmediateInRange(SDValue N, SDValue &OffImm);
+
inline bool is_so_imm(unsigned Imm) const {
return ARM_AM::getSOImmVal(Imm) != -1;
}
@@ -209,6 +215,59 @@ private:
unsigned NumVecs, const uint16_t *DOpcodes,
const uint16_t *QOpcodes);
+ /// Helper functions for setting up clusters of MVE predication operands.
+ template <typename SDValueVector>
+ void AddMVEPredicateToOps(SDValueVector &Ops, SDLoc Loc,
+ SDValue PredicateMask);
+ template <typename SDValueVector>
+ void AddMVEPredicateToOps(SDValueVector &Ops, SDLoc Loc,
+ SDValue PredicateMask, SDValue Inactive);
+
+ template <typename SDValueVector>
+ void AddEmptyMVEPredicateToOps(SDValueVector &Ops, SDLoc Loc);
+ template <typename SDValueVector>
+ void AddEmptyMVEPredicateToOps(SDValueVector &Ops, SDLoc Loc, EVT InactiveTy);
+
+ /// SelectMVE_WB - Select MVE writeback load/store intrinsics.
+ void SelectMVE_WB(SDNode *N, const uint16_t *Opcodes, bool Predicated);
+
+ /// SelectMVE_LongShift - Select MVE 64-bit scalar shift intrinsics.
+ void SelectMVE_LongShift(SDNode *N, uint16_t Opcode, bool Immediate,
+ bool HasSaturationOperand);
+
+ /// SelectMVE_VADCSBC - Select MVE vector add/sub-with-carry intrinsics.
+ void SelectMVE_VADCSBC(SDNode *N, uint16_t OpcodeWithCarry,
+ uint16_t OpcodeWithNoCarry, bool Add, bool Predicated);
+
+ /// Select long MVE vector reductions with two vector operands
+ /// Stride is the number of vector element widths the instruction can operate
+ /// on:
+ /// 2 for long non-rounding variants, vml{a,s}ldav[a][x]: [i16, i32]
+ /// 1 for long rounding variants: vrml{a,s}ldavh[a][x]: [i32]
+ /// Stride is used when addressing the OpcodesS array which contains multiple
+ /// opcodes for each element width.
+ /// TySize is the index into the list of element types listed above
+ void SelectBaseMVE_VMLLDAV(SDNode *N, bool Predicated,
+ const uint16_t *OpcodesS, const uint16_t *OpcodesU,
+ size_t Stride, size_t TySize);
+
+ /// Select a 64-bit MVE vector reduction with two vector operands
+ /// arm_mve_vmlldava_[predicated]
+ void SelectMVE_VMLLDAV(SDNode *N, bool Predicated, const uint16_t *OpcodesS,
+ const uint16_t *OpcodesU);
+ /// Select a 72-bit MVE vector rounding reduction with two vector operands
+ /// int_arm_mve_vrmlldavha[_predicated]
+ void SelectMVE_VRMLLDAVH(SDNode *N, bool Predicated, const uint16_t *OpcodesS,
+ const uint16_t *OpcodesU);
+
+ /// SelectMVE_VLD - Select MVE interleaving load intrinsics. NumVecs
+ /// should be 2 or 4. The opcode array specifies the instructions
+ /// used for 8, 16 and 32-bit lane sizes respectively, and each
+ /// pointer points to a set of NumVecs sub-opcodes used for the
+ /// different stages (e.g. VLD20 versus VLD21) of each load family.
+ void SelectMVE_VLD(SDNode *N, unsigned NumVecs,
+ const uint16_t *const *Opcodes);
+
/// SelectVLDDup - Select NEON load-duplicate intrinsics. NumVecs
/// should be 1, 2, 3 or 4. The opcode array specifies the instructions used
/// for loading D registers.
@@ -1237,6 +1296,33 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm12(SDValue N,
return true;
}
+template <unsigned Shift>
+bool ARMDAGToDAGISel::SelectT2AddrModeImm8(SDValue N, SDValue &Base,
+ SDValue &OffImm) {
+ if (N.getOpcode() == ISD::SUB || CurDAG->isBaseWithConstantOffset(N)) {
+ int RHSC;
+ if (isScaledConstantInRange(N.getOperand(1), 1 << Shift, -255, 256, RHSC)) {
+ Base = N.getOperand(0);
+ if (Base.getOpcode() == ISD::FrameIndex) {
+ int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+ Base = CurDAG->getTargetFrameIndex(
+ FI, TLI->getPointerTy(CurDAG->getDataLayout()));
+ }
+
+ if (N.getOpcode() == ISD::SUB)
+ RHSC = -RHSC;
+ OffImm =
+ CurDAG->getTargetConstant(RHSC * (1 << Shift), SDLoc(N), MVT::i32);
+ return true;
+ }
+ }
+
+ // Base only.
+ Base = N;
+ OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32);
+ return true;
+}
+
bool ARMDAGToDAGISel::SelectT2AddrModeImm8(SDValue N,
SDValue &Base, SDValue &OffImm) {
// Match simple R - imm8 operands.
@@ -1319,11 +1405,27 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm7Offset(SDNode *Op, SDValue N,
SDValue &OffImm,
unsigned Shift) {
unsigned Opcode = Op->getOpcode();
- ISD::MemIndexedMode AM = (Opcode == ISD::LOAD)
- ? cast<LoadSDNode>(Op)->getAddressingMode()
- : cast<StoreSDNode>(Op)->getAddressingMode();
+ ISD::MemIndexedMode AM;
+ switch (Opcode) {
+ case ISD::LOAD:
+ AM = cast<LoadSDNode>(Op)->getAddressingMode();
+ break;
+ case ISD::STORE:
+ AM = cast<StoreSDNode>(Op)->getAddressingMode();
+ break;
+ case ISD::MLOAD:
+ AM = cast<MaskedLoadSDNode>(Op)->getAddressingMode();
+ break;
+ case ISD::MSTORE:
+ AM = cast<MaskedStoreSDNode>(Op)->getAddressingMode();
+ break;
+ default:
+ llvm_unreachable("Unexpected Opcode for Imm7Offset");
+ }
+
int RHSC;
- if (isScaledConstantInRange(N, 1 << Shift, 0, 0x80, RHSC)) { // 7 bits.
+ // 7 bit constant, shifted by Shift.
+ if (isScaledConstantInRange(N, 1 << Shift, 0, 0x80, RHSC)) {
OffImm =
((AM == ISD::PRE_INC) || (AM == ISD::POST_INC))
? CurDAG->getTargetConstant(RHSC * (1 << Shift), SDLoc(N), MVT::i32)
@@ -1334,6 +1436,16 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm7Offset(SDNode *Op, SDValue N,
return false;
}
+template <int Min, int Max>
+bool ARMDAGToDAGISel::SelectImmediateInRange(SDValue N, SDValue &OffImm) {
+ int Val;
+ if (isScaledConstantInRange(N, 1, Min, Max, Val)) {
+ OffImm = CurDAG->getTargetConstant(Val, SDLoc(N), MVT::i32);
+ return true;
+ }
+ return false;
+}
+
bool ARMDAGToDAGISel::SelectT2AddrModeSoReg(SDValue N,
SDValue &Base,
SDValue &OffReg, SDValue &ShImm) {
@@ -1593,58 +1705,93 @@ bool ARMDAGToDAGISel::tryT2IndexedLoad(SDNode *N) {
}
bool ARMDAGToDAGISel::tryMVEIndexedLoad(SDNode *N) {
- LoadSDNode *LD = cast<LoadSDNode>(N);
- ISD::MemIndexedMode AM = LD->getAddressingMode();
- if (AM == ISD::UNINDEXED)
- return false;
- EVT LoadedVT = LD->getMemoryVT();
- if (!LoadedVT.isVector())
- return false;
- bool isSExtLd = LD->getExtensionType() == ISD::SEXTLOAD;
- SDValue Offset;
- bool isPre = (AM == ISD::PRE_INC) || (AM == ISD::PRE_DEC);
+ EVT LoadedVT;
unsigned Opcode = 0;
- unsigned Align = LD->getAlignment();
- bool IsLE = Subtarget->isLittle();
+ bool isSExtLd, isPre;
+ unsigned Align;
+ ARMVCC::VPTCodes Pred;
+ SDValue PredReg;
+ SDValue Chain, Base, Offset;
+
+ if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
+ ISD::MemIndexedMode AM = LD->getAddressingMode();
+ if (AM == ISD::UNINDEXED)
+ return false;
+ LoadedVT = LD->getMemoryVT();
+ if (!LoadedVT.isVector())
+ return false;
+
+ Chain = LD->getChain();
+ Base = LD->getBasePtr();
+ Offset = LD->getOffset();
+ Align = LD->getAlignment();
+ isSExtLd = LD->getExtensionType() == ISD::SEXTLOAD;
+ isPre = (AM == ISD::PRE_INC) || (AM == ISD::PRE_DEC);
+ Pred = ARMVCC::None;
+ PredReg = CurDAG->getRegister(0, MVT::i32);
+ } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
+ ISD::MemIndexedMode AM = LD->getAddressingMode();
+ if (AM == ISD::UNINDEXED)
+ return false;
+ LoadedVT = LD->getMemoryVT();
+ if (!LoadedVT.isVector())
+ return false;
+ Chain = LD->getChain();
+ Base = LD->getBasePtr();
+ Offset = LD->getOffset();
+ Align = LD->getAlignment();
+ isSExtLd = LD->getExtensionType() == ISD::SEXTLOAD;
+ isPre = (AM == ISD::PRE_INC) || (AM == ISD::PRE_DEC);
+ Pred = ARMVCC::Then;
+ PredReg = LD->getMask();
+ } else
+ llvm_unreachable("Expected a Load or a Masked Load!");
+
+ // We allow LE non-masked loads to change the type (for example use a vldrb.8
+ // as opposed to a vldrw.32). This can allow extra addressing modes or
+ // alignments for what is otherwise an equivalent instruction.
+ bool CanChangeType = Subtarget->isLittle() && !isa<MaskedLoadSDNode>(N);
+
+ SDValue NewOffset;
if (Align >= 2 && LoadedVT == MVT::v4i16 &&
- SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 1)) {
+ SelectT2AddrModeImm7Offset(N, Offset, NewOffset, 1)) {
if (isSExtLd)
Opcode = isPre ? ARM::MVE_VLDRHS32_pre : ARM::MVE_VLDRHS32_post;
else
Opcode = isPre ? ARM::MVE_VLDRHU32_pre : ARM::MVE_VLDRHU32_post;
} else if (LoadedVT == MVT::v8i8 &&
- SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 0)) {
+ SelectT2AddrModeImm7Offset(N, Offset, NewOffset, 0)) {
if (isSExtLd)
Opcode = isPre ? ARM::MVE_VLDRBS16_pre : ARM::MVE_VLDRBS16_post;
else
Opcode = isPre ? ARM::MVE_VLDRBU16_pre : ARM::MVE_VLDRBU16_post;
} else if (LoadedVT == MVT::v4i8 &&
- SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 0)) {
+ SelectT2AddrModeImm7Offset(N, Offset, NewOffset, 0)) {
if (isSExtLd)
Opcode = isPre ? ARM::MVE_VLDRBS32_pre : ARM::MVE_VLDRBS32_post;
else
Opcode = isPre ? ARM::MVE_VLDRBU32_pre : ARM::MVE_VLDRBU32_post;
} else if (Align >= 4 &&
- (IsLE || LoadedVT == MVT::v4i32 || LoadedVT == MVT::v4f32) &&
- SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 2))
+ (CanChangeType || LoadedVT == MVT::v4i32 ||
+ LoadedVT == MVT::v4f32) &&
+ SelectT2AddrModeImm7Offset(N, Offset, NewOffset, 2))
Opcode = isPre ? ARM::MVE_VLDRWU32_pre : ARM::MVE_VLDRWU32_post;
else if (Align >= 2 &&
- (IsLE || LoadedVT == MVT::v8i16 || LoadedVT == MVT::v8f16) &&
- SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 1))
+ (CanChangeType || LoadedVT == MVT::v8i16 ||
+ LoadedVT == MVT::v8f16) &&
+ SelectT2AddrModeImm7Offset(N, Offset, NewOffset, 1))
Opcode = isPre ? ARM::MVE_VLDRHU16_pre : ARM::MVE_VLDRHU16_post;
- else if ((IsLE || LoadedVT == MVT::v16i8) &&
- SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 0))
+ else if ((CanChangeType || LoadedVT == MVT::v16i8) &&
+ SelectT2AddrModeImm7Offset(N, Offset, NewOffset, 0))
Opcode = isPre ? ARM::MVE_VLDRBU8_pre : ARM::MVE_VLDRBU8_post;
else
return false;
- SDValue Chain = LD->getChain();
- SDValue Base = LD->getBasePtr();
- SDValue Ops[] = {Base, Offset,
- CurDAG->getTargetConstant(ARMVCC::None, SDLoc(N), MVT::i32),
- CurDAG->getRegister(0, MVT::i32), Chain};
- SDNode *New = CurDAG->getMachineNode(Opcode, SDLoc(N), LD->getValueType(0),
+ SDValue Ops[] = {Base, NewOffset,
+ CurDAG->getTargetConstant(Pred, SDLoc(N), MVT::i32), PredReg,
+ Chain};
+ SDNode *New = CurDAG->getMachineNode(Opcode, SDLoc(N), N->getValueType(0),
MVT::i32, MVT::Other, Ops);
transferMemOperands(N, New);
ReplaceUses(SDValue(N, 0), SDValue(New, 1));
@@ -2304,6 +2451,268 @@ void ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, bool isUpdating,
CurDAG->RemoveDeadNode(N);
}
+template <typename SDValueVector>
+void ARMDAGToDAGISel::AddMVEPredicateToOps(SDValueVector &Ops, SDLoc Loc,
+ SDValue PredicateMask) {
+ Ops.push_back(CurDAG->getTargetConstant(ARMVCC::Then, Loc, MVT::i32));
+ Ops.push_back(PredicateMask);
+}
+
+template <typename SDValueVector>
+void ARMDAGToDAGISel::AddMVEPredicateToOps(SDValueVector &Ops, SDLoc Loc,
+ SDValue PredicateMask,
+ SDValue Inactive) {
+ Ops.push_back(CurDAG->getTargetConstant(ARMVCC::Then, Loc, MVT::i32));
+ Ops.push_back(PredicateMask);
+ Ops.push_back(Inactive);
+}
+
+template <typename SDValueVector>
+void ARMDAGToDAGISel::AddEmptyMVEPredicateToOps(SDValueVector &Ops, SDLoc Loc) {
+ Ops.push_back(CurDAG->getTargetConstant(ARMVCC::None, Loc, MVT::i32));
+ Ops.push_back(CurDAG->getRegister(0, MVT::i32));
+}
+
+template <typename SDValueVector>
+void ARMDAGToDAGISel::AddEmptyMVEPredicateToOps(SDValueVector &Ops, SDLoc Loc,
+ EVT InactiveTy) {
+ Ops.push_back(CurDAG->getTargetConstant(ARMVCC::None, Loc, MVT::i32));
+ Ops.push_back(CurDAG->getRegister(0, MVT::i32));
+ Ops.push_back(SDValue(
+ CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, Loc, InactiveTy), 0));
+}
+
+void ARMDAGToDAGISel::SelectMVE_WB(SDNode *N, const uint16_t *Opcodes,
+ bool Predicated) {
+ SDLoc Loc(N);
+ SmallVector<SDValue, 8> Ops;
+
+ uint16_t Opcode;
+ switch (N->getValueType(1).getVectorElementType().getSizeInBits()) {
+ case 32:
+ Opcode = Opcodes[0];
+ break;
+ case 64:
+ Opcode = Opcodes[1];
+ break;
+ default:
+ llvm_unreachable("bad vector element size in SelectMVE_WB");
+ }
+
+ Ops.push_back(N->getOperand(2)); // vector of base addresses
+
+ int32_t ImmValue = cast<ConstantSDNode>(N->getOperand(3))->getZExtValue();
+ Ops.push_back(getI32Imm(ImmValue, Loc)); // immediate offset
+
+ if (Predicated)
+ AddMVEPredicateToOps(Ops, Loc, N->getOperand(4));
+ else
+ AddEmptyMVEPredicateToOps(Ops, Loc);
+
+ Ops.push_back(N->getOperand(0)); // chain
+
+ CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), makeArrayRef(Ops));
+}
+
+void ARMDAGToDAGISel::SelectMVE_LongShift(SDNode *N, uint16_t Opcode,
+ bool Immediate,
+ bool HasSaturationOperand) {
+ SDLoc Loc(N);
+ SmallVector<SDValue, 8> Ops;
+
+ // Two 32-bit halves of the value to be shifted
+ Ops.push_back(N->getOperand(1));
+ Ops.push_back(N->getOperand(2));
+
+ // The shift count
+ if (Immediate) {
+ int32_t ImmValue = cast<ConstantSDNode>(N->getOperand(3))->getZExtValue();
+ Ops.push_back(getI32Imm(ImmValue, Loc)); // immediate shift count
+ } else {
+ Ops.push_back(N->getOperand(3));
+ }
+
+ // The immediate saturation operand, if any
+ if (HasSaturationOperand) {
+ int32_t SatOp = cast<ConstantSDNode>(N->getOperand(4))->getZExtValue();
+ int SatBit = (SatOp == 64 ? 0 : 1);
+ Ops.push_back(getI32Imm(SatBit, Loc));
+ }
+
+ // MVE scalar shifts are IT-predicable, so include the standard
+ // predicate arguments.
+ Ops.push_back(getAL(CurDAG, Loc));
+ Ops.push_back(CurDAG->getRegister(0, MVT::i32));
+
+ CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), makeArrayRef(Ops));
+}
+
+void ARMDAGToDAGISel::SelectMVE_VADCSBC(SDNode *N, uint16_t OpcodeWithCarry,
+ uint16_t OpcodeWithNoCarry,
+ bool Add, bool Predicated) {
+ SDLoc Loc(N);
+ SmallVector<SDValue, 8> Ops;
+ uint16_t Opcode;
+
+ unsigned FirstInputOp = Predicated ? 2 : 1;
+
+ // Two input vectors and the input carry flag
+ Ops.push_back(N->getOperand(FirstInputOp));
+ Ops.push_back(N->getOperand(FirstInputOp + 1));
+ SDValue CarryIn = N->getOperand(FirstInputOp + 2);
+ ConstantSDNode *CarryInConstant = dyn_cast<ConstantSDNode>(CarryIn);
+ uint32_t CarryMask = 1 << 29;
+ uint32_t CarryExpected = Add ? 0 : CarryMask;
+ if (CarryInConstant &&
+ (CarryInConstant->getZExtValue() & CarryMask) == CarryExpected) {
+ Opcode = OpcodeWithNoCarry;
+ } else {
+ Ops.push_back(CarryIn);
+ Opcode = OpcodeWithCarry;
+ }
+
+ if (Predicated)
+ AddMVEPredicateToOps(Ops, Loc,
+ N->getOperand(FirstInputOp + 3), // predicate
+ N->getOperand(FirstInputOp - 1)); // inactive
+ else
+ AddEmptyMVEPredicateToOps(Ops, Loc, N->getValueType(0));
+
+ CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), makeArrayRef(Ops));
+}
+
+static bool SDValueToConstBool(SDValue SDVal) {
+ assert(isa<ConstantSDNode>(SDVal) && "expected a compile-time constant");
+ ConstantSDNode *SDValConstant = dyn_cast<ConstantSDNode>(SDVal);
+ uint64_t Value = SDValConstant->getZExtValue();
+ assert((Value == 0 || Value == 1) && "expected value 0 or 1");
+ return Value;
+}
+
+void ARMDAGToDAGISel::SelectBaseMVE_VMLLDAV(SDNode *N, bool Predicated,
+ const uint16_t *OpcodesS,
+ const uint16_t *OpcodesU,
+ size_t Stride, size_t TySize) {
+ assert(TySize < Stride && "Invalid TySize");
+ bool IsUnsigned = SDValueToConstBool(N->getOperand(1));
+ bool IsSub = SDValueToConstBool(N->getOperand(2));
+ bool IsExchange = SDValueToConstBool(N->getOperand(3));
+ if (IsUnsigned) {
+ assert(!IsSub &&
+ "Unsigned versions of vmlsldav[a]/vrmlsldavh[a] do not exist");
+ assert(!IsExchange &&
+ "Unsigned versions of vmlaldav[a]x/vrmlaldavh[a]x do not exist");
+ }
+
+ auto OpIsZero = [N](size_t OpNo) {
+ if (ConstantSDNode *OpConst = dyn_cast<ConstantSDNode>(N->getOperand(OpNo)))
+ if (OpConst->getZExtValue() == 0)
+ return true;
+ return false;
+ };
+
+ // If the input accumulator value is not zero, select an instruction with an
+ // accumulator; otherwise select an instruction without one.
+ bool IsAccum = !(OpIsZero(4) && OpIsZero(5));
+
+ const uint16_t *Opcodes = IsUnsigned ? OpcodesU : OpcodesS;
+ if (IsSub)
+ Opcodes += 4 * Stride;
+ if (IsExchange)
+ Opcodes += 2 * Stride;
+ if (IsAccum)
+ Opcodes += Stride;
+ uint16_t Opcode = Opcodes[TySize];
+
+ SDLoc Loc(N);
+ SmallVector<SDValue, 8> Ops;
+ // Push the accumulator operands, if they are used
+ if (IsAccum) {
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(N->getOperand(5));
+ }
+ // Push the two vector operands
+ Ops.push_back(N->getOperand(6));
+ Ops.push_back(N->getOperand(7));
+
+ if (Predicated)
+ AddMVEPredicateToOps(Ops, Loc, N->getOperand(8));
+ else
+ AddEmptyMVEPredicateToOps(Ops, Loc);
+
+ CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), makeArrayRef(Ops));
+}
+
+void ARMDAGToDAGISel::SelectMVE_VMLLDAV(SDNode *N, bool Predicated,
+ const uint16_t *OpcodesS,
+ const uint16_t *OpcodesU) {
+ EVT VecTy = N->getOperand(6).getValueType();
+ size_t SizeIndex;
+ switch (VecTy.getVectorElementType().getSizeInBits()) {
+ case 16:
+ SizeIndex = 0;
+ break;
+ case 32:
+ SizeIndex = 1;
+ break;
+ default:
+ llvm_unreachable("bad vector element size");
+ }
+
+ SelectBaseMVE_VMLLDAV(N, Predicated, OpcodesS, OpcodesU, 2, SizeIndex);
+}
+
+void ARMDAGToDAGISel::SelectMVE_VRMLLDAVH(SDNode *N, bool Predicated,
+ const uint16_t *OpcodesS,
+ const uint16_t *OpcodesU) {
+ assert(
+ N->getOperand(6).getValueType().getVectorElementType().getSizeInBits() ==
+ 32 &&
+ "bad vector element size");
+ SelectBaseMVE_VMLLDAV(N, Predicated, OpcodesS, OpcodesU, 1, 0);
+}
+
+void ARMDAGToDAGISel::SelectMVE_VLD(SDNode *N, unsigned NumVecs,
+ const uint16_t *const *Opcodes) {
+ EVT VT = N->getValueType(0);
+ SDLoc Loc(N);
+
+ const uint16_t *OurOpcodes;
+ switch (VT.getVectorElementType().getSizeInBits()) {
+ case 8:
+ OurOpcodes = Opcodes[0];
+ break;
+ case 16:
+ OurOpcodes = Opcodes[1];
+ break;
+ case 32:
+ OurOpcodes = Opcodes[2];
+ break;
+ default:
+ llvm_unreachable("bad vector element size in SelectMVE_VLD");
+ }
+
+ EVT DataTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64, NumVecs * 2);
+ EVT ResultTys[] = {DataTy, MVT::Other};
+
+ auto Data = SDValue(
+ CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, Loc, DataTy), 0);
+ SDValue Chain = N->getOperand(0);
+ for (unsigned Stage = 0; Stage < NumVecs; ++Stage) {
+ SDValue Ops[] = {Data, N->getOperand(2), Chain};
+ auto LoadInst =
+ CurDAG->getMachineNode(OurOpcodes[Stage], Loc, ResultTys, Ops);
+ Data = SDValue(LoadInst, 0);
+ Chain = SDValue(LoadInst, 1);
+ }
+
+ for (unsigned i = 0; i < NumVecs; i++)
+ ReplaceUses(SDValue(N, i),
+ CurDAG->getTargetExtractSubreg(ARM::qsub_0 + i, Loc, VT, Data));
+ ReplaceUses(SDValue(N, NumVecs), Chain);
+ CurDAG->RemoveDeadNode(N);
+}
+
void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool IsIntrinsic,
bool isUpdating, unsigned NumVecs,
const uint16_t *DOpcodes,
@@ -3089,6 +3498,11 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
// Other cases are autogenerated.
break;
}
+ case ISD::MLOAD:
+ if (Subtarget->hasMVEIntegerOps() && tryMVEIndexedLoad(N))
+ return;
+ // Other cases are autogenerated.
+ break;
case ARMISD::WLS:
case ARMISD::LE: {
SDValue Ops[] = { N->getOperand(1),
@@ -3101,6 +3515,26 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
CurDAG->RemoveDeadNode(N);
return;
}
+ case ARMISD::LDRD: {
+ if (Subtarget->isThumb2())
+ break; // TableGen handles isel in this case.
+ SDValue Base, RegOffset, ImmOffset;
+ const SDValue &Chain = N->getOperand(0);
+ const SDValue &Addr = N->getOperand(1);
+ SelectAddrMode3(Addr, Base, RegOffset, ImmOffset);
+ SDValue Ops[] = {Base, RegOffset, ImmOffset, Chain};
+ SDNode *New = CurDAG->getMachineNode(ARM::LOADDUAL, dl,
+ {MVT::Untyped, MVT::Other}, Ops);
+ SDValue Lo = CurDAG->getTargetExtractSubreg(ARM::gsub_0, dl, MVT::i32,
+ SDValue(New, 0));
+ SDValue Hi = CurDAG->getTargetExtractSubreg(ARM::gsub_1, dl, MVT::i32,
+ SDValue(New, 0));
+ ReplaceUses(SDValue(N, 0), Lo);
+ ReplaceUses(SDValue(N, 1), Hi);
+ ReplaceUses(SDValue(N, 2), SDValue(New, 1));
+ CurDAG->RemoveDeadNode(N);
+ return;
+ }
case ARMISD::LOOP_DEC: {
SDValue Ops[] = { N->getOperand(1),
N->getOperand(2),
@@ -4028,6 +4462,117 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
SelectVLDSTLane(N, false, false, 4, DOpcodes, QOpcodes);
return;
}
+
+ case Intrinsic::arm_mve_vldr_gather_base_wb:
+ case Intrinsic::arm_mve_vldr_gather_base_wb_predicated: {
+ static const uint16_t Opcodes[] = {ARM::MVE_VLDRWU32_qi_pre,
+ ARM::MVE_VLDRDU64_qi_pre};
+ SelectMVE_WB(N, Opcodes,
+ IntNo == Intrinsic::arm_mve_vldr_gather_base_wb_predicated);
+ return;
+ }
+
+ case Intrinsic::arm_mve_vld2q: {
+ static const uint16_t Opcodes8[] = {ARM::MVE_VLD20_8, ARM::MVE_VLD21_8};
+ static const uint16_t Opcodes16[] = {ARM::MVE_VLD20_16,
+ ARM::MVE_VLD21_16};
+ static const uint16_t Opcodes32[] = {ARM::MVE_VLD20_32,
+ ARM::MVE_VLD21_32};
+ static const uint16_t *const Opcodes[] = {Opcodes8, Opcodes16, Opcodes32};
+ SelectMVE_VLD(N, 2, Opcodes);
+ return;
+ }
+
+ case Intrinsic::arm_mve_vld4q: {
+ static const uint16_t Opcodes8[] = {ARM::MVE_VLD40_8, ARM::MVE_VLD41_8,
+ ARM::MVE_VLD42_8, ARM::MVE_VLD43_8};
+ static const uint16_t Opcodes16[] = {ARM::MVE_VLD40_16, ARM::MVE_VLD41_16,
+ ARM::MVE_VLD42_16,
+ ARM::MVE_VLD43_16};
+ static const uint16_t Opcodes32[] = {ARM::MVE_VLD40_32, ARM::MVE_VLD41_32,
+ ARM::MVE_VLD42_32,
+ ARM::MVE_VLD43_32};
+ static const uint16_t *const Opcodes[] = {Opcodes8, Opcodes16, Opcodes32};
+ SelectMVE_VLD(N, 4, Opcodes);
+ return;
+ }
+ }
+ break;
+ }
+
+ case ISD::INTRINSIC_WO_CHAIN: {
+ unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+ switch (IntNo) {
+ default:
+ break;
+
+ case Intrinsic::arm_mve_urshrl:
+ SelectMVE_LongShift(N, ARM::MVE_URSHRL, true, false);
+ return;
+ case Intrinsic::arm_mve_uqshll:
+ SelectMVE_LongShift(N, ARM::MVE_UQSHLL, true, false);
+ return;
+ case Intrinsic::arm_mve_srshrl:
+ SelectMVE_LongShift(N, ARM::MVE_SRSHRL, true, false);
+ return;
+ case Intrinsic::arm_mve_sqshll:
+ SelectMVE_LongShift(N, ARM::MVE_SQSHLL, true, false);
+ return;
+ case Intrinsic::arm_mve_uqrshll:
+ SelectMVE_LongShift(N, ARM::MVE_UQRSHLL, false, true);
+ return;
+ case Intrinsic::arm_mve_sqrshrl:
+ SelectMVE_LongShift(N, ARM::MVE_SQRSHRL, false, true);
+ return;
+ case Intrinsic::arm_mve_lsll:
+ SelectMVE_LongShift(N, ARM::MVE_LSLLr, false, false);
+ return;
+ case Intrinsic::arm_mve_asrl:
+ SelectMVE_LongShift(N, ARM::MVE_ASRLr, false, false);
+ return;
+
+ case Intrinsic::arm_mve_vadc:
+ case Intrinsic::arm_mve_vadc_predicated:
+ SelectMVE_VADCSBC(N, ARM::MVE_VADC, ARM::MVE_VADCI, true,
+ IntNo == Intrinsic::arm_mve_vadc_predicated);
+ return;
+
+ case Intrinsic::arm_mve_vmlldava:
+ case Intrinsic::arm_mve_vmlldava_predicated: {
+ static const uint16_t OpcodesU[] = {
+ ARM::MVE_VMLALDAVu16, ARM::MVE_VMLALDAVu32,
+ ARM::MVE_VMLALDAVau16, ARM::MVE_VMLALDAVau32,
+ };
+ static const uint16_t OpcodesS[] = {
+ ARM::MVE_VMLALDAVs16, ARM::MVE_VMLALDAVs32,
+ ARM::MVE_VMLALDAVas16, ARM::MVE_VMLALDAVas32,
+ ARM::MVE_VMLALDAVxs16, ARM::MVE_VMLALDAVxs32,
+ ARM::MVE_VMLALDAVaxs16, ARM::MVE_VMLALDAVaxs32,
+ ARM::MVE_VMLSLDAVs16, ARM::MVE_VMLSLDAVs32,
+ ARM::MVE_VMLSLDAVas16, ARM::MVE_VMLSLDAVas32,
+ ARM::MVE_VMLSLDAVxs16, ARM::MVE_VMLSLDAVxs32,
+ ARM::MVE_VMLSLDAVaxs16, ARM::MVE_VMLSLDAVaxs32,
+ };
+ SelectMVE_VMLLDAV(N, IntNo == Intrinsic::arm_mve_vmlldava_predicated,
+ OpcodesS, OpcodesU);
+ return;
+ }
+
+ case Intrinsic::arm_mve_vrmlldavha:
+ case Intrinsic::arm_mve_vrmlldavha_predicated: {
+ static const uint16_t OpcodesU[] = {
+ ARM::MVE_VRMLALDAVHu32, ARM::MVE_VRMLALDAVHau32,
+ };
+ static const uint16_t OpcodesS[] = {
+ ARM::MVE_VRMLALDAVHs32, ARM::MVE_VRMLALDAVHas32,
+ ARM::MVE_VRMLALDAVHxs32, ARM::MVE_VRMLALDAVHaxs32,
+ ARM::MVE_VRMLSLDAVHs32, ARM::MVE_VRMLSLDAVHas32,
+ ARM::MVE_VRMLSLDAVHxs32, ARM::MVE_VRMLSLDAVHaxs32,
+ };
+ SelectMVE_VRMLLDAVH(N, IntNo == Intrinsic::arm_mve_vrmlldavha_predicated,
+ OpcodesS, OpcodesU);
+ return;
+ }
}
break;
}
@@ -4551,10 +5096,6 @@ SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
switch(ConstraintID) {
default:
llvm_unreachable("Unexpected asm memory constraint");
- case InlineAsm::Constraint_i:
- // FIXME: It seems strange that 'i' is needed here since it's supposed to
- // be an immediate and not a memory constraint.
- LLVM_FALLTHROUGH;
case InlineAsm::Constraint_m:
case InlineAsm::Constraint_o:
case InlineAsm::Constraint_Q:
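
For reference on how SelectBaseMVE_VMLLDAV above steps through its opcode tables, the snippet below is an illustrative, hypothetical helper (pickVMLLDAVOpcode is not part of this patch) that makes the [sub][exchange][accumulate][element width] indexing explicit.

// Illustrative only: the same table stepping as SelectBaseMVE_VMLLDAV above.
// Each OpcodesS/OpcodesU table holds groups of 'Stride' opcodes ordered as
// plain, accumulating (a), exchanging (x), accumulating+exchanging (ax),
// followed by the same four groups for the subtracting (vmlsldav) forms.
#include <cstddef>
#include <cstdint>

static uint16_t pickVMLLDAVOpcode(const uint16_t *Opcodes, bool IsSub,
                                  bool IsExchange, bool IsAccum, size_t Stride,
                                  size_t TySize) {
  size_t Index = 0;
  if (IsSub)
    Index += 4 * Stride; // skip the four vmlaldav groups
  if (IsExchange)
    Index += 2 * Stride; // skip the non-exchanging groups
  if (IsAccum)
    Index += Stride;     // skip the non-accumulating group
  return Opcodes[Index + TySize]; // finally pick by element width
}

With Stride == 2 and TySize == 1, for example, the non-accumulating, non-exchanging signed variant on 32-bit elements comes from slot 1 of OpcodesS, i.e. ARM::MVE_VMLALDAVs32 in the table shown above.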
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.cpp
index db26feb57010..cf738cd66434 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -78,6 +78,7 @@
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsARM.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
@@ -142,6 +143,11 @@ static cl::opt<unsigned> ConstpoolPromotionMaxTotal(
cl::desc("Maximum size of ALL constants to promote into a constant pool"),
cl::init(128));
+static cl::opt<unsigned>
+MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden,
+ cl::desc("Maximum interleave factor for MVE VLDn to generate."),
+ cl::init(2));
+
// The APCS parameter registers.
static const MCPhysReg GPRArgRegs[] = {
ARM::R0, ARM::R1, ARM::R2, ARM::R3
@@ -209,6 +215,9 @@ void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT,
VT != MVT::v2i64 && VT != MVT::v1i64)
for (auto Opcode : {ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
setOperationAction(Opcode, VT, Legal);
+ if (!VT.isFloatingPoint())
+ for (auto Opcode : {ISD::SADDSAT, ISD::UADDSAT, ISD::SSUBSAT, ISD::USUBSAT})
+ setOperationAction(Opcode, VT, Legal);
}
void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
@@ -296,6 +305,8 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
setIndexedLoadAction(im, VT, Legal);
setIndexedStoreAction(im, VT, Legal);
+ setIndexedMaskedLoadAction(im, VT, Legal);
+ setIndexedMaskedStoreAction(im, VT, Legal);
}
}
@@ -322,6 +333,8 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
setIndexedLoadAction(im, VT, Legal);
setIndexedStoreAction(im, VT, Legal);
+ setIndexedMaskedLoadAction(im, VT, Legal);
+ setIndexedMaskedStoreAction(im, VT, Legal);
}
if (HasMVEFP) {
@@ -366,6 +379,13 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
addAllExtLoads(MVT::v4i32, MVT::v4i16, Legal);
addAllExtLoads(MVT::v4i32, MVT::v4i8, Legal);
+ // It is legal to sign extend from v4i8/v4i16 to v4i32 or v8i8 to v8i16.
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Legal);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Legal);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Legal);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v8i8, Legal);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v8i16, Legal);
+
// Some truncating stores are legal too.
setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
@@ -374,12 +394,12 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
// Pre and Post inc on these are legal, given the correct extends
for (unsigned im = (unsigned)ISD::PRE_INC;
im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
- setIndexedLoadAction(im, MVT::v8i8, Legal);
- setIndexedStoreAction(im, MVT::v8i8, Legal);
- setIndexedLoadAction(im, MVT::v4i8, Legal);
- setIndexedStoreAction(im, MVT::v4i8, Legal);
- setIndexedLoadAction(im, MVT::v4i16, Legal);
- setIndexedStoreAction(im, MVT::v4i16, Legal);
+ for (auto VT : {MVT::v8i8, MVT::v4i8, MVT::v4i16}) {
+ setIndexedLoadAction(im, VT, Legal);
+ setIndexedStoreAction(im, VT, Legal);
+ setIndexedMaskedLoadAction(im, VT, Legal);
+ setIndexedMaskedStoreAction(im, VT, Legal);
+ }
}
// Predicate types
@@ -446,7 +466,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
{ RTLIB::OGE_F32, "__gesf2vfp", ISD::SETNE },
{ RTLIB::OGT_F32, "__gtsf2vfp", ISD::SETNE },
{ RTLIB::UO_F32, "__unordsf2vfp", ISD::SETNE },
- { RTLIB::O_F32, "__unordsf2vfp", ISD::SETEQ },
// Double-precision comparisons.
{ RTLIB::OEQ_F64, "__eqdf2vfp", ISD::SETNE },
@@ -456,7 +475,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
{ RTLIB::OGE_F64, "__gedf2vfp", ISD::SETNE },
{ RTLIB::OGT_F64, "__gtdf2vfp", ISD::SETNE },
{ RTLIB::UO_F64, "__unorddf2vfp", ISD::SETNE },
- { RTLIB::O_F64, "__unorddf2vfp", ISD::SETEQ },
// Floating-point to integer conversions.
// i64 conversions are done via library routines even when generating VFP
@@ -520,7 +538,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
{ RTLIB::OGE_F64, "__aeabi_dcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
{ RTLIB::OGT_F64, "__aeabi_dcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
{ RTLIB::UO_F64, "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },
- { RTLIB::O_F64, "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETEQ },
// Single-precision floating-point arithmetic helper functions
// RTABI chapter 4.1.2, Table 4
@@ -538,7 +555,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
{ RTLIB::OGE_F32, "__aeabi_fcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
{ RTLIB::OGT_F32, "__aeabi_fcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
{ RTLIB::UO_F32, "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },
- { RTLIB::O_F32, "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETEQ },
// Floating-point to integer conversions.
// RTABI chapter 4.1.2, Table 6
@@ -964,19 +980,26 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FP_TO_SINT, MVT::f64, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::f64, Custom);
setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::f64, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::f64, Custom);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
}
if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) {
setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
- if (Subtarget->hasFullFP16())
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom);
+ if (Subtarget->hasFullFP16()) {
setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
+ }
}
- if (!Subtarget->hasFP16())
+ if (!Subtarget->hasFP16()) {
setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
-
- if (!Subtarget->hasFP64())
- setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom);
+ }
computeRegisterProperties(Subtarget->getRegisterInfo());
@@ -1050,6 +1073,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SRA, MVT::i64, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
+ setOperationAction(ISD::LOAD, MVT::i64, Custom);
+ setOperationAction(ISD::STORE, MVT::i64, Custom);
// MVE lowers 64 bit shifts to lsll and lsrl
// assuming that ISD::SRL and SRA of i64 are already marked custom
@@ -1170,9 +1195,11 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
}
- if (Subtarget->isTargetWindows() && Subtarget->getTargetTriple().isOSMSVCRT())
- for (auto &VT : {MVT::f32, MVT::f64})
- setOperationAction(ISD::FPOWI, VT, Custom);
+ if (Subtarget->getTargetTriple().isOSMSVCRT()) {
+ // MSVCRT doesn't have powi; fall back to pow
+ setLibcallName(RTLIB::POWI_F32, nullptr);
+ setLibcallName(RTLIB::POWI_F64, nullptr);
+ }
setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
@@ -1571,6 +1598,9 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::PRELOAD: return "ARMISD::PRELOAD";
+ case ARMISD::LDRD: return "ARMISD::LDRD";
+ case ARMISD::STRD: return "ARMISD::STRD";
+
case ARMISD::WIN__CHKSTK: return "ARMISD::WIN__CHKSTK";
case ARMISD::WIN__DBZCHK: return "ARMISD::WIN__DBZCHK";
@@ -1855,6 +1885,7 @@ ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC,
case CallingConv::ARM_AAPCS:
case CallingConv::ARM_APCS:
case CallingConv::GHC:
+ case CallingConv::CFGuard_Check:
return CC;
case CallingConv::PreserveMost:
return CallingConv::PreserveMost;
@@ -1914,6 +1945,8 @@ CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC);
case CallingConv::PreserveMost:
return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
+ case CallingConv::CFGuard_Check:
+ return (Return ? RetCC_ARM_AAPCS : CC_ARM_Win32_CFGuard_Check);
}
}
@@ -2062,11 +2095,10 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
MachineFunction::CallSiteInfo CSInfo;
bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
bool isThisReturn = false;
- auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls");
bool PreferIndirect = false;
// Disable tail calls if they're not supported.
- if (!Subtarget->supportsTailCall() || Attr.getValueAsString() == "true")
+ if (!Subtarget->supportsTailCall())
isTailCall = false;
if (isa<GlobalAddressSDNode>(Callee)) {
@@ -2331,12 +2363,14 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
} else if (Subtarget->isTargetCOFF()) {
assert(Subtarget->isTargetWindows() &&
"Windows is the only supported COFF target");
- unsigned TargetFlags = GV->hasDLLImportStorageClass()
- ? ARMII::MO_DLLIMPORT
- : ARMII::MO_NO_FLAG;
+ unsigned TargetFlags = ARMII::MO_NO_FLAG;
+ if (GV->hasDLLImportStorageClass())
+ TargetFlags = ARMII::MO_DLLIMPORT;
+ else if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
+ TargetFlags = ARMII::MO_COFFSTUB;
Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, /*offset=*/0,
TargetFlags);
- if (GV->hasDLLImportStorageClass())
+ if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))
Callee =
DAG.getLoad(PtrVt, dl, DAG.getEntryNode(),
DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee),
@@ -2941,9 +2975,7 @@ bool ARMTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
if (!Subtarget->supportsTailCall())
return false;
- auto Attr =
- CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
- if (!CI->isTailCall() || Attr.getValueAsString() == "true")
+ if (!CI->isTailCall())
return false;
return true;
@@ -3629,6 +3661,49 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
EVT PtrVT = getPointerTy(DAG.getDataLayout());
return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
}
+ case Intrinsic::arm_cls: {
+ const SDValue &Operand = Op.getOperand(1);
+ const EVT VTy = Op.getValueType();
+ SDValue SRA =
+ DAG.getNode(ISD::SRA, dl, VTy, Operand, DAG.getConstant(31, dl, VTy));
+ SDValue XOR = DAG.getNode(ISD::XOR, dl, VTy, SRA, Operand);
+ SDValue SHL =
+ DAG.getNode(ISD::SHL, dl, VTy, XOR, DAG.getConstant(1, dl, VTy));
+ SDValue OR =
+ DAG.getNode(ISD::OR, dl, VTy, SHL, DAG.getConstant(1, dl, VTy));
+ SDValue Result = DAG.getNode(ISD::CTLZ, dl, VTy, OR);
+ return Result;
+ }
+ case Intrinsic::arm_cls64: {
+ // cls(x) = if cls(hi(x)) != 31 then cls(hi(x))
+ // else 31 + clz(if hi(x) == 0 then lo(x) else not(lo(x)))
+ const SDValue &Operand = Op.getOperand(1);
+ const EVT VTy = Op.getValueType();
+
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VTy, Operand,
+ DAG.getConstant(1, dl, VTy));
+ SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VTy, Operand,
+ DAG.getConstant(0, dl, VTy));
+ SDValue Constant0 = DAG.getConstant(0, dl, VTy);
+ SDValue Constant1 = DAG.getConstant(1, dl, VTy);
+ SDValue Constant31 = DAG.getConstant(31, dl, VTy);
+ SDValue SRAHi = DAG.getNode(ISD::SRA, dl, VTy, Hi, Constant31);
+ SDValue XORHi = DAG.getNode(ISD::XOR, dl, VTy, SRAHi, Hi);
+ SDValue SHLHi = DAG.getNode(ISD::SHL, dl, VTy, XORHi, Constant1);
+ SDValue ORHi = DAG.getNode(ISD::OR, dl, VTy, SHLHi, Constant1);
+ SDValue CLSHi = DAG.getNode(ISD::CTLZ, dl, VTy, ORHi);
+ SDValue CheckLo =
+ DAG.getSetCC(dl, MVT::i1, CLSHi, Constant31, ISD::CondCode::SETEQ);
+ SDValue HiIsZero =
+ DAG.getSetCC(dl, MVT::i1, Hi, Constant0, ISD::CondCode::SETEQ);
+ SDValue AdjustedLo =
+ DAG.getSelect(dl, VTy, HiIsZero, Lo, DAG.getNOT(dl, Lo, VTy));
+ SDValue CLZAdjustedLo = DAG.getNode(ISD::CTLZ, dl, VTy, AdjustedLo);
+ SDValue Result =
+ DAG.getSelect(dl, VTy, CheckLo,
+ DAG.getNode(ISD::ADD, dl, VTy, CLZAdjustedLo, Constant31), CLSHi);
+ return Result;
+ }
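A minimal scalar sketch of the two expansions above, implementing "count leading sign bits" with only shift, xor and ctlz exactly as the formulas in the comments describe (illustration only, not part of the patch):

#include <cstdint>

// cls32(x) = ctlz(((x ^ (x >> 31)) << 1) | 1); the forced low bit keeps the
// ctlz input non-zero, so the result is at most 31.
static int cls32(int32_t x) {
  uint32_t t = ((uint32_t)(x ^ (x >> 31)) << 1) | 1u;
  return __builtin_clz(t);
}

// cls64(x) = cls32(hi)                              if cls32(hi) != 31
//          = 31 + clz(hi == 0 ? lo : ~lo)           otherwise
// (clz(0) taken as 32, matching ARM's CLZ behaviour)
static int cls64(int64_t x) {
  int32_t hi = (int32_t)((uint64_t)x >> 32);
  uint32_t lo = (uint32_t)x;
  int ClsHi = cls32(hi);
  if (ClsHi != 31)
    return ClsHi;
  uint32_t Adjusted = (hi == 0) ? lo : ~lo;
  return 31 + (Adjusted == 0 ? 32 : __builtin_clz(Adjusted));
}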
case Intrinsic::eh_sjlj_lsda: {
MachineFunction &MF = DAG.getMachineFunction();
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
@@ -3698,6 +3773,10 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
case Intrinsic::arm_neon_vtbl2:
return DAG.getNode(ARMISD::VTBL2, SDLoc(Op), Op.getValueType(),
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+ case Intrinsic::arm_mve_pred_i2v:
+ case Intrinsic::arm_mve_pred_v2i:
+ return DAG.getNode(ARMISD::PREDICATE_CAST, SDLoc(Op), Op.getValueType(),
+ Op.getOperand(1));
}
}
@@ -4887,7 +4966,7 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
Opcode = ARMISD::CSINC;
std::swap(TrueVal, FalseVal);
std::swap(TVal, FVal);
- CC = ISD::getSetCCInverse(CC, true);
+ CC = ISD::getSetCCInverse(CC, LHS.getValueType());
}
if (Opcode) {
@@ -4897,7 +4976,7 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
HasLowerConstantMaterializationCost(FVal, TVal, Subtarget)) {
std::swap(TrueVal, FalseVal);
std::swap(TVal, FVal);
- CC = ISD::getSetCCInverse(CC, true);
+ CC = ISD::getSetCCInverse(CC, LHS.getValueType());
}
// Attempt to use ZR checking TVal is 0, possibly inverting the condition
@@ -4906,7 +4985,7 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
if (FVal == 0 && Opcode != ARMISD::CSINC) {
std::swap(TrueVal, FalseVal);
std::swap(TVal, FVal);
- CC = ISD::getSetCCInverse(CC, true);
+ CC = ISD::getSetCCInverse(CC, LHS.getValueType());
}
if (TVal == 0)
TrueVal = DAG.getRegister(ARM::ZR, MVT::i32);
@@ -4950,7 +5029,7 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
if (CondCode == ARMCC::LT || CondCode == ARMCC::LE ||
CondCode == ARMCC::VC || CondCode == ARMCC::NE) {
- CC = ISD::getSetCCInverse(CC, true);
+ CC = ISD::getSetCCInverse(CC, LHS.getValueType());
std::swap(TrueVal, FalseVal);
}
}
@@ -5310,17 +5389,31 @@ SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
if (VT.isVector())
return LowerVectorFP_TO_INT(Op, DAG);
- if (isUnsupportedFloatingType(Op.getOperand(0).getValueType())) {
+
+ bool IsStrict = Op->isStrictFPOpcode();
+ SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
+
+ if (isUnsupportedFloatingType(SrcVal.getValueType())) {
RTLIB::Libcall LC;
- if (Op.getOpcode() == ISD::FP_TO_SINT)
- LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(),
+ if (Op.getOpcode() == ISD::FP_TO_SINT ||
+ Op.getOpcode() == ISD::STRICT_FP_TO_SINT)
+ LC = RTLIB::getFPTOSINT(SrcVal.getValueType(),
Op.getValueType());
else
- LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(),
+ LC = RTLIB::getFPTOUINT(SrcVal.getValueType(),
Op.getValueType());
+ SDLoc Loc(Op);
MakeLibCallOptions CallOptions;
- return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0),
- CallOptions, SDLoc(Op)).first;
+ SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
+ SDValue Result;
+ std::tie(Result, Chain) = makeLibCall(DAG, LC, Op.getValueType(), SrcVal,
+ CallOptions, Loc, Chain);
+ return IsStrict ? DAG.getMergeValues({Result, Chain}, Loc) : Result;
+ }
+
+ // FIXME: Remove this when we have strict fp instruction selection patterns
+ if (IsStrict) {
+ DAG.mutateStrictFPToFP(Op.getNode());
}
return Op;
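For context, a hedged example of the libcall path above, assuming a single-precision-only FPU (e.g. FPv5-SP-D16) where f64 counts as an unsupported floating type:

#include <cstdint>

// double is lowered via RTLIB::getFPTOSINT here, i.e. an __aeabi_d2iz call on
// AAPCS targets; for a STRICT_FP_TO_SINT node the incoming chain (operand 0)
// is threaded through makeLibCall and merged back into the result.
int32_t to_int(double d) { return (int32_t)d; }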
@@ -5517,7 +5610,7 @@ SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
-Register ARMTargetLowering::getRegisterByName(const char* RegName, EVT VT,
+Register ARMTargetLowering::getRegisterByName(const char* RegName, LLT VT,
const MachineFunction &MF) const {
Register Reg = StringSwitch<unsigned>(RegName)
.Case("sp", ARM::SP)
@@ -7745,6 +7838,92 @@ static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG,
DAG.getConstant(ARMCC::NE, dl, MVT::i32));
}
+static SDValue LowerVECTOR_SHUFFLEUsingMovs(SDValue Op,
+ ArrayRef<int> ShuffleMask,
+ SelectionDAG &DAG) {
+  // Attempt to lower the vector shuffle using as many whole register movs as
+  // possible. This is useful for types smaller than 32 bits, which would
+  // otherwise often become a series of GPR movs.
+ SDLoc dl(Op);
+ EVT VT = Op.getValueType();
+ if (VT.getScalarSizeInBits() >= 32)
+ return SDValue();
+
+ assert((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
+ "Unexpected vector type");
+ int NumElts = VT.getVectorNumElements();
+ int QuarterSize = NumElts / 4;
+ // The four final parts of the vector, as i32's
+ SDValue Parts[4];
+
+  // Look for full lane vmovs like <0,1,2,3> or <u,5,6,7> etc. (but not
+  // <u,u,u,u>), returning the vmov lane index
+ auto getMovIdx = [](ArrayRef<int> ShuffleMask, int Start, int Length) {
+ // Detect which mov lane this would be from the first non-undef element.
+ int MovIdx = -1;
+ for (int i = 0; i < Length; i++) {
+ if (ShuffleMask[Start + i] >= 0) {
+ if (ShuffleMask[Start + i] % Length != i)
+ return -1;
+ MovIdx = ShuffleMask[Start + i] / Length;
+ break;
+ }
+ }
+ // If all items are undef, leave this for other combines
+ if (MovIdx == -1)
+ return -1;
+ // Check the remaining values are the correct part of the same mov
+ for (int i = 1; i < Length; i++) {
+ if (ShuffleMask[Start + i] >= 0 &&
+ (ShuffleMask[Start + i] / Length != MovIdx ||
+ ShuffleMask[Start + i] % Length != i))
+ return -1;
+ }
+ return MovIdx;
+ };
+
+ for (int Part = 0; Part < 4; ++Part) {
+ // Does this part look like a mov
+ int Elt = getMovIdx(ShuffleMask, Part * QuarterSize, QuarterSize);
+ if (Elt != -1) {
+ SDValue Input = Op->getOperand(0);
+ if (Elt >= 4) {
+ Input = Op->getOperand(1);
+ Elt -= 4;
+ }
+ SDValue BitCast = DAG.getBitcast(MVT::v4i32, Input);
+ Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, BitCast,
+ DAG.getConstant(Elt, dl, MVT::i32));
+ }
+ }
+
+ // Nothing interesting found, just return
+ if (!Parts[0] && !Parts[1] && !Parts[2] && !Parts[3])
+ return SDValue();
+
+ // The other parts need to be built with the old shuffle vector, cast to a
+ // v4i32 and extract_vector_elts
+ if (!Parts[0] || !Parts[1] || !Parts[2] || !Parts[3]) {
+ SmallVector<int, 16> NewShuffleMask;
+ for (int Part = 0; Part < 4; ++Part)
+ for (int i = 0; i < QuarterSize; i++)
+ NewShuffleMask.push_back(
+ Parts[Part] ? -1 : ShuffleMask[Part * QuarterSize + i]);
+ SDValue NewShuffle = DAG.getVectorShuffle(
+ VT, dl, Op->getOperand(0), Op->getOperand(1), NewShuffleMask);
+ SDValue BitCast = DAG.getBitcast(MVT::v4i32, NewShuffle);
+
+ for (int Part = 0; Part < 4; ++Part)
+ if (!Parts[Part])
+ Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
+ BitCast, DAG.getConstant(Part, dl, MVT::i32));
+ }
+ // Build a vector out of the various parts and bitcast it back to the original
+ // type.
+ SDValue NewVec = DAG.getBuildVector(MVT::v4i32, dl, Parts);
+ return DAG.getBitcast(VT, NewVec);
+}
+
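To make the lane-matching rule concrete, here is a small stand-alone model of getMovIdx with an example mask (illustrative C++ mirroring the lambda above, not part of the patch):

#include <cstdio>

// For one quarter of the mask, return the 32-bit lane it copies wholesale,
// or -1 if that quarter is not a whole-lane move or is entirely undef.
static int getMovIdx(const int *Mask, int Start, int Length) {
  int MovIdx = -1;
  for (int i = 0; i < Length; i++) {
    if (Mask[Start + i] < 0)
      continue;                              // undef entries are wildcards
    if (Mask[Start + i] % Length != i)
      return -1;                             // not element i of some lane
    MovIdx = Mask[Start + i] / Length;
    break;
  }
  if (MovIdx == -1)
    return -1;                               // all undef: leave to other combines
  for (int i = 1; i < Length; i++)
    if (Mask[Start + i] >= 0 && (Mask[Start + i] / Length != MovIdx ||
                                 Mask[Start + i] % Length != i))
      return -1;
  return MovIdx;
}

int main() {
  // v8i16 shuffle <4,5, 2,3, u,u, 14,15>: the four quarters read whole i32
  // lanes 2, 1, (undef) and 7, where lane 7 is lane 3 of the second input.
  const int Mask[8] = {4, 5, 2, 3, -1, -1, 14, 15};
  for (int Part = 0; Part < 4; ++Part)
    printf("quarter %d -> i32 lane %d\n", Part, getMovIdx(Mask, Part * 2, 2));
  return 0;
}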
static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
const ARMSubtarget *ST) {
SDValue V1 = Op.getOperand(0);
@@ -7939,6 +8118,10 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG))
return NewOp;
+ if (ST->hasMVEIntegerOps())
+ if (SDValue NewOp = LowerVECTOR_SHUFFLEUsingMovs(Op, ShuffleMask, DAG))
+ return NewOp;
+
return SDValue();
}
@@ -8905,6 +9088,24 @@ static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG) {
return DAG.getMergeValues({Pred, Load.getValue(1)}, dl);
}
+void ARMTargetLowering::LowerLOAD(SDNode *N, SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const {
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ EVT MemVT = LD->getMemoryVT();
+ assert(LD->isUnindexed() && "Loads should be unindexed at this point.");
+
+ if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() &&
+ !Subtarget->isThumb1Only() && LD->isVolatile()) {
+ SDLoc dl(N);
+ SDValue Result = DAG.getMemIntrinsicNode(
+ ARMISD::LDRD, dl, DAG.getVTList({MVT::i32, MVT::i32, MVT::Other}),
+ {LD->getChain(), LD->getBasePtr()}, MemVT, LD->getMemOperand());
+ SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64,
+ Result.getValue(0), Result.getValue(1));
+ Results.append({Pair, Result.getValue(2)});
+ }
+}
+
static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG) {
StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
EVT MemVT = ST->getMemoryVT();
@@ -8934,6 +9135,40 @@ static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG) {
ST->getMemOperand());
}
+static SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG,
+ const ARMSubtarget *Subtarget) {
+ StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
+ EVT MemVT = ST->getMemoryVT();
+ assert(ST->isUnindexed() && "Stores should be unindexed at this point.");
+
+ if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() &&
+ !Subtarget->isThumb1Only() && ST->isVolatile()) {
+ SDNode *N = Op.getNode();
+ SDLoc dl(N);
+
+ SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(),
+ DAG.getTargetConstant(0, dl, MVT::i32));
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(),
+ DAG.getTargetConstant(1, dl, MVT::i32));
+
+ return DAG.getMemIntrinsicNode(ARMISD::STRD, dl, DAG.getVTList(MVT::Other),
+ {ST->getChain(), Lo, Hi, ST->getBasePtr()},
+ MemVT, ST->getMemOperand());
+ } else if (Subtarget->hasMVEIntegerOps() &&
+ ((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
+ MemVT == MVT::v16i1))) {
+ return LowerPredicateStore(Op, DAG);
+ }
+
+ return SDValue();
+}
+
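A hedged example of the accesses the new ARMISD::LDRD/STRD lowering is aimed at, assuming an ARMv5TE+ target that is not Thumb1-only:

#include <cstdint>

// Volatile 64-bit accesses are now expected to select a single ldrd/strd
// (through the LOADDUAL/STOREDUAL pseudos defined later in this patch)
// instead of a pair of 32-bit accesses, which matters for device registers
// that must be read or written in one 64-bit transaction.
uint64_t read_reg64(volatile uint64_t *p) { return *p; }              // ldrd
void write_reg64(volatile uint64_t *p, uint64_t v) { *p = v; }        // strd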
+static bool isZeroVector(SDValue N) {
+ return (ISD::isBuildVectorAllZeros(N.getNode()) ||
+ (N->getOpcode() == ARMISD::VMOVIMM &&
+ isNullConstant(N->getOperand(0))));
+}
+
static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) {
MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
MVT VT = Op.getSimpleValueType();
@@ -8941,13 +9176,7 @@ static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) {
SDValue PassThru = N->getPassThru();
SDLoc dl(Op);
- auto IsZero = [](SDValue PassThru) {
- return (ISD::isBuildVectorAllZeros(PassThru.getNode()) ||
- (PassThru->getOpcode() == ARMISD::VMOVIMM &&
- isNullConstant(PassThru->getOperand(0))));
- };
-
- if (IsZero(PassThru))
+ if (isZeroVector(PassThru))
return Op;
// MVE Masked loads use zero as the passthru value. Here we convert undef to
@@ -8955,12 +9184,13 @@ static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) {
SDValue ZeroVec = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
DAG.getTargetConstant(0, dl, MVT::i32));
SDValue NewLoad = DAG.getMaskedLoad(
- VT, dl, N->getChain(), N->getBasePtr(), Mask, ZeroVec, N->getMemoryVT(),
- N->getMemOperand(), N->getExtensionType(), N->isExpandingLoad());
+ VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask, ZeroVec,
+ N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
+ N->getExtensionType(), N->isExpandingLoad());
SDValue Combo = NewLoad;
if (!PassThru.isUndef() &&
(PassThru.getOpcode() != ISD::BITCAST ||
- !IsZero(PassThru->getOperand(0))))
+ !isZeroVector(PassThru->getOperand(0))))
Combo = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
return DAG.getMergeValues({Combo, NewLoad.getValue(1)}, dl);
}
@@ -9043,58 +9273,6 @@ static void ReplaceCMP_SWAP_64Results(SDNode *N,
Results.push_back(SDValue(CmpSwap, 2));
}
-static SDValue LowerFPOWI(SDValue Op, const ARMSubtarget &Subtarget,
- SelectionDAG &DAG) {
- const auto &TLI = DAG.getTargetLoweringInfo();
-
- assert(Subtarget.getTargetTriple().isOSMSVCRT() &&
- "Custom lowering is MSVCRT specific!");
-
- SDLoc dl(Op);
- SDValue Val = Op.getOperand(0);
- MVT Ty = Val->getSimpleValueType(0);
- SDValue Exponent = DAG.getNode(ISD::SINT_TO_FP, dl, Ty, Op.getOperand(1));
- SDValue Callee = DAG.getExternalSymbol(Ty == MVT::f32 ? "powf" : "pow",
- TLI.getPointerTy(DAG.getDataLayout()));
-
- TargetLowering::ArgListTy Args;
- TargetLowering::ArgListEntry Entry;
-
- Entry.Node = Val;
- Entry.Ty = Val.getValueType().getTypeForEVT(*DAG.getContext());
- Entry.IsZExt = true;
- Args.push_back(Entry);
-
- Entry.Node = Exponent;
- Entry.Ty = Exponent.getValueType().getTypeForEVT(*DAG.getContext());
- Entry.IsZExt = true;
- Args.push_back(Entry);
-
- Type *LCRTy = Val.getValueType().getTypeForEVT(*DAG.getContext());
-
- // In the in-chain to the call is the entry node If we are emitting a
- // tailcall, the chain will be mutated if the node has a non-entry input
- // chain.
- SDValue InChain = DAG.getEntryNode();
- SDValue TCChain = InChain;
-
- const Function &F = DAG.getMachineFunction().getFunction();
- bool IsTC = TLI.isInTailCallPosition(DAG, Op.getNode(), TCChain) &&
- F.getReturnType() == LCRTy;
- if (IsTC)
- InChain = TCChain;
-
- TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(dl)
- .setChain(InChain)
- .setCallee(CallingConv::ARM_AAPCS_VFP, LCRTy, Callee, std::move(Args))
- .setTailCall(IsTC);
- std::pair<SDValue, SDValue> CI = TLI.LowerCallTo(CLI);
-
- // Return the chain (the DAG root) if it is a tail call
- return !CI.second.getNode() ? DAG.getRoot() : CI.first;
-}
-
SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump());
switch (Op.getOpcode()) {
@@ -9114,6 +9292,8 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::PREFETCH: return LowerPREFETCH(Op, DAG, Subtarget);
case ISD::SINT_TO_FP:
case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG);
+ case ISD::STRICT_FP_TO_SINT:
+ case ISD::STRICT_FP_TO_UINT:
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
@@ -9170,7 +9350,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::LOAD:
return LowerPredicateLoad(Op, DAG);
case ISD::STORE:
- return LowerPredicateStore(Op, DAG);
+ return LowerSTORE(Op, DAG, Subtarget);
case ISD::MLOAD:
return LowerMLOAD(Op, DAG);
case ISD::ATOMIC_LOAD:
@@ -9182,9 +9362,10 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
if (Subtarget->isTargetWindows())
return LowerDYNAMIC_STACKALLOC(Op, DAG);
llvm_unreachable("Don't know how to custom lower this!");
+ case ISD::STRICT_FP_ROUND:
case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);
+ case ISD::STRICT_FP_EXTEND:
case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
- case ISD::FPOWI: return LowerFPOWI(Op, *Subtarget, DAG);
case ARMISD::WIN__DBZCHK: return SDValue();
}
}
@@ -9271,7 +9452,9 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
case ISD::ABS:
lowerABS(N, Results, DAG);
return ;
-
+ case ISD::LOAD:
+ LowerLOAD(N, Results, DAG);
+ break;
}
if (Res.getNode())
Results.push_back(Res);
@@ -11711,7 +11894,8 @@ static SDValue PerformADDCombine(SDNode *N,
/// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
///
static SDValue PerformSUBCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI) {
+ TargetLowering::DAGCombinerInfo &DCI,
+ const ARMSubtarget *Subtarget) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
@@ -11720,7 +11904,28 @@ static SDValue PerformSUBCombine(SDNode *N,
if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI))
return Result;
- return SDValue();
+ if (!Subtarget->hasMVEIntegerOps() || !N->getValueType(0).isVector())
+ return SDValue();
+
+ // Fold (sub (ARMvmovImm 0), (ARMvdup x)) -> (ARMvdup (sub 0, x))
+ // so that we can readily pattern match more mve instructions which can use
+ // a scalar operand.
+ SDValue VDup = N->getOperand(1);
+ if (VDup->getOpcode() != ARMISD::VDUP)
+ return SDValue();
+
+ SDValue VMov = N->getOperand(0);
+ if (VMov->getOpcode() == ISD::BITCAST)
+ VMov = VMov->getOperand(0);
+
+ if (VMov->getOpcode() != ARMISD::VMOVIMM || !isZeroVector(VMov))
+ return SDValue();
+
+ SDLoc dl(N);
+ SDValue Negate = DCI.DAG.getNode(ISD::SUB, dl, MVT::i32,
+ DCI.DAG.getConstant(0, dl, MVT::i32),
+ VDup->getOperand(0));
+ return DCI.DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0), Negate);
}
/// PerformVMULCombine
@@ -12736,6 +12941,39 @@ PerformPREDICATE_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
return SDValue();
}
+static SDValue PerformVCMPCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const ARMSubtarget *Subtarget) {
+ if (!Subtarget->hasMVEIntegerOps())
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+ ARMCC::CondCodes Cond =
+ (ARMCC::CondCodes)cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
+ SDLoc dl(N);
+
+ // vcmp X, 0, cc -> vcmpz X, cc
+ if (isZeroVector(Op1))
+ return DCI.DAG.getNode(ARMISD::VCMPZ, dl, VT, Op0,
+ N->getOperand(2));
+
+ unsigned SwappedCond = getSwappedCondition(Cond);
+ if (isValidMVECond(SwappedCond, VT.isFloatingPoint())) {
+ // vcmp 0, X, cc -> vcmpz X, reversed(cc)
+ if (isZeroVector(Op0))
+ return DCI.DAG.getNode(ARMISD::VCMPZ, dl, VT, Op1,
+ DCI.DAG.getConstant(SwappedCond, dl, MVT::i32));
+ // vcmp vdup(Y), X, cc -> vcmp X, vdup(Y), reversed(cc)
+ if (Op0->getOpcode() == ARMISD::VDUP && Op1->getOpcode() != ARMISD::VDUP)
+ return DCI.DAG.getNode(ARMISD::VCMP, dl, VT, Op1, Op0,
+ DCI.DAG.getConstant(SwappedCond, dl, MVT::i32));
+ }
+
+ return SDValue();
+}
+
/// PerformInsertEltCombine - Target-specific dag combine xforms for
/// ISD::INSERT_VECTOR_ELT.
static SDValue PerformInsertEltCombine(SDNode *N,
@@ -13844,11 +14082,12 @@ static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
const ARMSubtarget *ST) {
SDValue N0 = N->getOperand(0);
- // Check for sign- and zero-extensions of vector extract operations of 8-
- // and 16-bit vector elements. NEON supports these directly. They are
+ // Check for sign- and zero-extensions of vector extract operations of 8- and
+ // 16-bit vector elements. NEON and MVE support these directly. They are
// handled during DAG combining because type legalization will promote them
// to 32-bit types and it is messy to recognize the operations after that.
- if (ST->hasNEON() && N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+ if ((ST->hasNEON() || ST->hasMVEIntegerOps()) &&
+ N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
SDValue Vec = N0.getOperand(0);
SDValue Lane = N0.getOperand(1);
EVT VT = N->getValueType(0);
@@ -14067,7 +14306,7 @@ static SDValue PerformHWLoopCombine(SDNode *N,
return SDValue();
if (Negate)
- CC = ISD::getSetCCInverse(CC, true);
+ CC = ISD::getSetCCInverse(CC, /* Integer inverse */ MVT::i32);
auto IsTrueIfZero = [](ISD::CondCode CC, int Imm) {
return (CC == ISD::SETEQ && Imm == 0) ||
@@ -14371,7 +14610,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
case ARMISD::ADDE: return PerformADDECombine(N, DCI, Subtarget);
case ARMISD::UMLAL: return PerformUMLALCombine(N, DCI.DAG, Subtarget);
case ISD::ADD: return PerformADDCombine(N, DCI, Subtarget);
- case ISD::SUB: return PerformSUBCombine(N, DCI);
+ case ISD::SUB: return PerformSUBCombine(N, DCI, Subtarget);
case ISD::MUL: return PerformMULCombine(N, DCI, Subtarget);
case ISD::OR: return PerformORCombine(N, DCI, Subtarget);
case ISD::XOR: return PerformXORCombine(N, DCI, Subtarget);
@@ -14415,6 +14654,8 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
return PerformARMBUILD_VECTORCombine(N, DCI);
case ARMISD::PREDICATE_CAST:
return PerformPREDICATE_CASTCombine(N, DCI);
+ case ARMISD::VCMP:
+ return PerformVCMPCombine(N, DCI, Subtarget);
case ARMISD::SMULWB: {
unsigned BitWidth = N->getValueType(0).getSizeInBits();
APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
@@ -14523,7 +14764,7 @@ bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned,
if (!VT.isSimple())
return false;
- // The AllowsUnaliged flag models the SCTLR.A setting in ARM cpus
+ // The AllowsUnaligned flag models the SCTLR.A setting in ARM cpus
bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
auto Ty = VT.getSimpleVT().SimpleTy;
@@ -14725,8 +14966,12 @@ bool ARMTargetLowering::shouldSinkOperands(Instruction *I,
switch (I->getOpcode()) {
case Instruction::Add:
case Instruction::Mul:
+ case Instruction::ICmp:
return true;
case Instruction::Sub:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
return Operand == 1;
default:
return false;
@@ -14808,6 +15053,40 @@ int ARMTargetLowering::getScalingFactorCost(const DataLayout &DL,
return -1;
}
+/// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
+/// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
+/// expanded to FMAs when this method returns true, otherwise fmuladd is
+/// expanded to fmul + fadd.
+///
+/// ARM supports both fused and unfused multiply-add operations; we already
+/// lower a pair of fmul and fadd to the latter so it's not clear that there
+/// would be a gain or that the gain would be worthwhile enough to risk
+/// correctness bugs.
+///
+/// For MVE, we set this to true as it helps simplify the need for some
+/// patterns (and we don't have the non-fused floating point instruction).
+bool ARMTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
+ EVT VT) const {
+ if (!VT.isSimple())
+ return false;
+
+ switch (VT.getSimpleVT().SimpleTy) {
+ case MVT::v4f32:
+ case MVT::v8f16:
+ return Subtarget->hasMVEFloatOps();
+ case MVT::f16:
+ return Subtarget->useFPVFMx16();
+ case MVT::f32:
+ return Subtarget->useFPVFMx();
+ case MVT::f64:
+ return Subtarget->useFPVFMx64();
+ default:
+ break;
+ }
+
+ return false;
+}
+
static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
if (V < 0)
return false;
@@ -14850,7 +15129,7 @@ static bool isLegalT2AddressImmediate(int64_t V, EVT VT,
V = -V;
}
- unsigned NumBytes = std::max(VT.getSizeInBits() / 8, 1U);
+ unsigned NumBytes = std::max((unsigned)VT.getSizeInBits() / 8, 1U);
// MVE: size * imm7
if (VT.isVector() && Subtarget->hasMVEIntegerOps()) {
@@ -15155,14 +15434,19 @@ static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT,
}
static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, unsigned Align,
- bool isSEXTLoad, bool isLE, SDValue &Base,
- SDValue &Offset, bool &isInc,
- SelectionDAG &DAG) {
+ bool isSEXTLoad, bool IsMasked, bool isLE,
+ SDValue &Base, SDValue &Offset,
+ bool &isInc, SelectionDAG &DAG) {
if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
return false;
if (!isa<ConstantSDNode>(Ptr->getOperand(1)))
return false;
+ // We allow LE non-masked loads to change the type (for example use a vldrb.8
+ // as opposed to a vldrw.32). This can allow extra addressing modes or
+ // alignments for what is otherwise an equivalent instruction.
+ bool CanChangeType = isLE && !IsMasked;
+
ConstantSDNode *RHS = cast<ConstantSDNode>(Ptr->getOperand(1));
int RHSC = (int)RHS->getZExtValue();
@@ -15181,7 +15465,7 @@ static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, unsigned Align,
};
// Try to find a matching instruction based on s/zext, Alignment, Offset and
- // (in BE) type.
+ // (in BE/masked) type.
Base = Ptr->getOperand(0);
if (VT == MVT::v4i16) {
if (Align >= 2 && IsInRange(RHSC, 0x80, 2))
@@ -15189,13 +15473,15 @@ static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, unsigned Align,
} else if (VT == MVT::v4i8 || VT == MVT::v8i8) {
if (IsInRange(RHSC, 0x80, 1))
return true;
- } else if (Align >= 4 && (isLE || VT == MVT::v4i32 || VT == MVT::v4f32) &&
+ } else if (Align >= 4 &&
+ (CanChangeType || VT == MVT::v4i32 || VT == MVT::v4f32) &&
IsInRange(RHSC, 0x80, 4))
return true;
- else if (Align >= 2 && (isLE || VT == MVT::v8i16 || VT == MVT::v8f16) &&
+ else if (Align >= 2 &&
+ (CanChangeType || VT == MVT::v8i16 || VT == MVT::v8f16) &&
IsInRange(RHSC, 0x80, 2))
return true;
- else if ((isLE || VT == MVT::v16i8) && IsInRange(RHSC, 0x80, 1))
+ else if ((CanChangeType || VT == MVT::v16i8) && IsInRange(RHSC, 0x80, 1))
return true;
return false;
}
@@ -15215,6 +15501,7 @@ ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
SDValue Ptr;
unsigned Align;
bool isSEXTLoad = false;
+ bool IsMasked = false;
if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
Ptr = LD->getBasePtr();
VT = LD->getMemoryVT();
@@ -15224,6 +15511,17 @@ ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
Ptr = ST->getBasePtr();
VT = ST->getMemoryVT();
Align = ST->getAlignment();
+ } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
+ Ptr = LD->getBasePtr();
+ VT = LD->getMemoryVT();
+ Align = LD->getAlignment();
+ isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
+ IsMasked = true;
+ } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) {
+ Ptr = ST->getBasePtr();
+ VT = ST->getMemoryVT();
+ Align = ST->getAlignment();
+ IsMasked = true;
} else
return false;
@@ -15232,8 +15530,8 @@ ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
if (VT.isVector())
isLegal = Subtarget->hasMVEIntegerOps() &&
getMVEIndexedAddressParts(Ptr.getNode(), VT, Align, isSEXTLoad,
- Subtarget->isLittle(), Base, Offset,
- isInc, DAG);
+ IsMasked, Subtarget->isLittle(), Base,
+ Offset, isInc, DAG);
else {
if (Subtarget->isThumb2())
isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
@@ -15261,6 +15559,7 @@ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
SDValue Ptr;
unsigned Align;
bool isSEXTLoad = false, isNonExt;
+ bool IsMasked = false;
if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
VT = LD->getMemoryVT();
Ptr = LD->getBasePtr();
@@ -15272,6 +15571,19 @@ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
Ptr = ST->getBasePtr();
Align = ST->getAlignment();
isNonExt = !ST->isTruncatingStore();
+ } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
+ VT = LD->getMemoryVT();
+ Ptr = LD->getBasePtr();
+ Align = LD->getAlignment();
+ isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
+ isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
+ IsMasked = true;
+ } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) {
+ VT = ST->getMemoryVT();
+ Ptr = ST->getBasePtr();
+ Align = ST->getAlignment();
+ isNonExt = !ST->isTruncatingStore();
+ IsMasked = true;
} else
return false;
@@ -15295,7 +15607,7 @@ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
bool isLegal = false;
if (VT.isVector())
isLegal = Subtarget->hasMVEIntegerOps() &&
- getMVEIndexedAddressParts(Op, VT, Align, isSEXTLoad,
+ getMVEIndexedAddressParts(Op, VT, Align, isSEXTLoad, IsMasked,
Subtarget->isLittle(), Base, Offset,
isInc, DAG);
else {
@@ -16048,7 +16360,8 @@ ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
}
SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
- SDValue SrcVal = Op.getOperand(0);
+ bool IsStrict = Op->isStrictFPOpcode();
+ SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
const unsigned DstSz = Op.getValueType().getSizeInBits();
const unsigned SrcSz = SrcVal.getValueType().getSizeInBits();
assert(DstSz > SrcSz && DstSz <= 64 && SrcSz >= 16 &&
@@ -16068,34 +16381,35 @@ SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
SDLoc Loc(Op);
RTLIB::Libcall LC;
MakeLibCallOptions CallOptions;
- if (SrcSz == 16) {
- // Instruction from 16 -> 32
- if (Subtarget->hasFP16())
- SrcVal = DAG.getNode(ISD::FP_EXTEND, Loc, MVT::f32, SrcVal);
- // Lib call from 16 -> 32
- else {
- LC = RTLIB::getFPEXT(MVT::f16, MVT::f32);
+ SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
+ for (unsigned Sz = SrcSz; Sz <= 32 && Sz < DstSz; Sz *= 2) {
+ bool Supported = (Sz == 16 ? Subtarget->hasFP16() : Subtarget->hasFP64());
+ MVT SrcVT = (Sz == 16 ? MVT::f16 : MVT::f32);
+ MVT DstVT = (Sz == 16 ? MVT::f32 : MVT::f64);
+ if (Supported) {
+ if (IsStrict) {
+ SrcVal = DAG.getNode(ISD::STRICT_FP_EXTEND, Loc,
+ {DstVT, MVT::Other}, {Chain, SrcVal});
+ Chain = SrcVal.getValue(1);
+ } else {
+ SrcVal = DAG.getNode(ISD::FP_EXTEND, Loc, DstVT, SrcVal);
+ }
+ } else {
+ LC = RTLIB::getFPEXT(SrcVT, DstVT);
assert(LC != RTLIB::UNKNOWN_LIBCALL &&
"Unexpected type for custom-lowering FP_EXTEND");
- SrcVal =
- makeLibCall(DAG, LC, MVT::f32, SrcVal, CallOptions, Loc).first;
+ std::tie(SrcVal, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions,
+ Loc, Chain);
}
}
- if (DstSz != 64)
- return SrcVal;
- // For sure now SrcVal is 32 bits
- if (Subtarget->hasFP64()) // Instruction from 32 -> 64
- return DAG.getNode(ISD::FP_EXTEND, Loc, MVT::f64, SrcVal);
-
- LC = RTLIB::getFPEXT(MVT::f32, MVT::f64);
- assert(LC != RTLIB::UNKNOWN_LIBCALL &&
- "Unexpected type for custom-lowering FP_EXTEND");
- return makeLibCall(DAG, LC, MVT::f64, SrcVal, CallOptions, Loc).first;
+ return IsStrict ? DAG.getMergeValues({SrcVal, Chain}, Loc) : SrcVal;
}
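As a hedged illustration of the staged loop above, assuming a core with FP16 conversion instructions but no double-precision unit (e.g. FPv5-SP):

// Uses the ARM __fp16 extension type. The loop is expected to emit a hardware
// f16 -> f32 extend for the first stage and the __aeabi_f2d libcall for the
// f32 -> f64 stage, carrying the chain between stages when the node is a
// STRICT_FP_EXTEND.
double widen(__fp16 x) { return (double)x; }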
SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
- SDValue SrcVal = Op.getOperand(0);
+ bool IsStrict = Op->isStrictFPOpcode();
+
+ SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
EVT SrcVT = SrcVal.getValueType();
EVT DstVT = Op.getValueType();
const unsigned DstSz = Op.getValueType().getSizeInBits();
@@ -16118,7 +16432,11 @@ SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
assert(LC != RTLIB::UNKNOWN_LIBCALL &&
"Unexpected type for custom-lowering FP_ROUND");
MakeLibCallOptions CallOptions;
- return makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions, Loc).first;
+ SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
+ SDValue Result;
+ std::tie(Result, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions,
+ Loc, Chain);
+ return IsStrict ? DAG.getMergeValues({Result, Chain}, Loc) : Result;
}
void ARMTargetLowering::lowerABS(SDNode *N, SmallVectorImpl<SDValue> &Results,
@@ -16644,15 +16962,20 @@ ARMTargetLowering::getNumInterleavedAccesses(VectorType *VecTy,
}
bool ARMTargetLowering::isLegalInterleavedAccessType(
- VectorType *VecTy, const DataLayout &DL) const {
+ unsigned Factor, VectorType *VecTy, const DataLayout &DL) const {
unsigned VecSize = DL.getTypeSizeInBits(VecTy);
unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
+ if (!Subtarget->hasNEON() && !Subtarget->hasMVEIntegerOps())
+ return false;
+
// Ensure the vector doesn't have f16 elements. Even though we could do an
// i16 vldN, we can't hold the f16 vectors and will end up converting via
// f32.
- if (VecTy->getElementType()->isHalfTy())
+ if (Subtarget->hasNEON() && VecTy->getElementType()->isHalfTy())
+ return false;
+ if (Subtarget->hasMVEIntegerOps() && Factor == 3)
return false;
// Ensure the number of vector elements is greater than 1.
@@ -16665,12 +16988,16 @@ bool ARMTargetLowering::isLegalInterleavedAccessType(
// Ensure the total vector size is 64 or a multiple of 128. Types larger than
// 128 will be split into multiple interleaved accesses.
- return VecSize == 64 || VecSize % 128 == 0;
+ if (Subtarget->hasNEON() && VecSize == 64)
+ return true;
+ return VecSize % 128 == 0;
}
unsigned ARMTargetLowering::getMaxSupportedInterleaveFactor() const {
if (Subtarget->hasNEON())
return 4;
+ if (Subtarget->hasMVEIntegerOps())
+ return MVEMaxSupportedInterleaveFactor;
return TargetLoweringBase::getMaxSupportedInterleaveFactor();
}
@@ -16702,7 +17029,7 @@ bool ARMTargetLowering::lowerInterleavedLoad(
// Skip if we do not have NEON and skip illegal vector types. We can
// "legalize" wide vector types into multiple interleaved accesses as long as
// the vector types are divisible by 128.
- if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(VecTy, DL))
+ if (!isLegalInterleavedAccessType(Factor, VecTy, DL))
return false;
unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);
@@ -16734,13 +17061,37 @@ bool ARMTargetLowering::lowerInterleavedLoad(
assert(isTypeLegal(EVT::getEVT(VecTy)) && "Illegal vldN vector type!");
- Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace());
- Type *Tys[] = {VecTy, Int8Ptr};
- static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
- Intrinsic::arm_neon_vld3,
- Intrinsic::arm_neon_vld4};
- Function *VldnFunc =
- Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);
+ auto createLoadIntrinsic = [&](Value *BaseAddr) {
+ if (Subtarget->hasNEON()) {
+ Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace());
+ Type *Tys[] = {VecTy, Int8Ptr};
+ static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
+ Intrinsic::arm_neon_vld3,
+ Intrinsic::arm_neon_vld4};
+ Function *VldnFunc =
+ Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);
+
+ SmallVector<Value *, 2> Ops;
+ Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
+ Ops.push_back(Builder.getInt32(LI->getAlignment()));
+
+ return Builder.CreateCall(VldnFunc, Ops, "vldN");
+ } else {
+ assert((Factor == 2 || Factor == 4) &&
+ "expected interleave factor of 2 or 4 for MVE");
+ Intrinsic::ID LoadInts =
+ Factor == 2 ? Intrinsic::arm_mve_vld2q : Intrinsic::arm_mve_vld4q;
+ Type *VecEltTy = VecTy->getVectorElementType()->getPointerTo(
+ LI->getPointerAddressSpace());
+ Type *Tys[] = {VecTy, VecEltTy};
+ Function *VldnFunc =
+ Intrinsic::getDeclaration(LI->getModule(), LoadInts, Tys);
+
+ SmallVector<Value *, 2> Ops;
+ Ops.push_back(Builder.CreateBitCast(BaseAddr, VecEltTy));
+ return Builder.CreateCall(VldnFunc, Ops, "vldN");
+ }
+ };
// Holds sub-vectors extracted from the load intrinsic return values. The
// sub-vectors are associated with the shufflevector instructions they will
@@ -16755,11 +17106,7 @@ bool ARMTargetLowering::lowerInterleavedLoad(
Builder.CreateConstGEP1_32(VecTy->getVectorElementType(), BaseAddr,
VecTy->getVectorNumElements() * Factor);
- SmallVector<Value *, 2> Ops;
- Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
- Ops.push_back(Builder.getInt32(LI->getAlignment()));
-
- CallInst *VldN = Builder.CreateCall(VldnFunc, Ops, "vldN");
+ CallInst *VldN = createLoadIntrinsic(BaseAddr);
// Replace uses of each shufflevector with the corresponding vector loaded
// by ldN.
@@ -16838,7 +17185,7 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
// Skip if we do not have NEON and skip illegal vector types. We can
// "legalize" wide vector types into multiple interleaved accesses as long as
// the vector types are divisible by 128.
- if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(SubVecTy, DL))
+ if (!isLegalInterleavedAccessType(Factor, SubVecTy, DL))
return false;
unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);
@@ -16882,11 +17229,46 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
auto Mask = SVI->getShuffleMask();
- Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace());
- Type *Tys[] = {Int8Ptr, SubVecTy};
- static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
- Intrinsic::arm_neon_vst3,
- Intrinsic::arm_neon_vst4};
+ auto createStoreIntrinsic = [&](Value *BaseAddr,
+ SmallVectorImpl<Value *> &Shuffles) {
+ if (Subtarget->hasNEON()) {
+ static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
+ Intrinsic::arm_neon_vst3,
+ Intrinsic::arm_neon_vst4};
+ Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace());
+ Type *Tys[] = {Int8Ptr, SubVecTy};
+
+ Function *VstNFunc = Intrinsic::getDeclaration(
+ SI->getModule(), StoreInts[Factor - 2], Tys);
+
+ SmallVector<Value *, 6> Ops;
+ Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
+ for (auto S : Shuffles)
+ Ops.push_back(S);
+ Ops.push_back(Builder.getInt32(SI->getAlignment()));
+ Builder.CreateCall(VstNFunc, Ops);
+ } else {
+ assert((Factor == 2 || Factor == 4) &&
+ "expected interleave factor of 2 or 4 for MVE");
+ Intrinsic::ID StoreInts =
+ Factor == 2 ? Intrinsic::arm_mve_vst2q : Intrinsic::arm_mve_vst4q;
+ Type *EltPtrTy = SubVecTy->getVectorElementType()->getPointerTo(
+ SI->getPointerAddressSpace());
+ Type *Tys[] = {EltPtrTy, SubVecTy};
+ Function *VstNFunc =
+ Intrinsic::getDeclaration(SI->getModule(), StoreInts, Tys);
+
+ SmallVector<Value *, 6> Ops;
+ Ops.push_back(Builder.CreateBitCast(BaseAddr, EltPtrTy));
+ for (auto S : Shuffles)
+ Ops.push_back(S);
+ for (unsigned F = 0; F < Factor; F++) {
+ Ops.push_back(Builder.getInt32(F));
+ Builder.CreateCall(VstNFunc, Ops);
+ Ops.pop_back();
+ }
+ }
+ };
for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
    // If we're generating more than one store, we compute the base address of
@@ -16895,17 +17277,13 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getVectorElementType(),
BaseAddr, LaneLen * Factor);
- SmallVector<Value *, 6> Ops;
- Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
-
- Function *VstNFunc =
- Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys);
+ SmallVector<Value *, 4> Shuffles;
// Split the shufflevector operands into sub vectors for the new vstN call.
for (unsigned i = 0; i < Factor; i++) {
unsigned IdxI = StoreCount * LaneLen * Factor + i;
if (Mask[IdxI] >= 0) {
- Ops.push_back(Builder.CreateShuffleVector(
+ Shuffles.push_back(Builder.CreateShuffleVector(
Op0, Op1, createSequentialMask(Builder, Mask[IdxI], LaneLen, 0)));
} else {
unsigned StartMask = 0;
@@ -16922,13 +17300,12 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
// In the case of all undefs we're defaulting to using elems from 0
// Note: StartMask cannot be negative, it's checked in
// isReInterleaveMask
- Ops.push_back(Builder.CreateShuffleVector(
+ Shuffles.push_back(Builder.CreateShuffleVector(
Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0)));
}
}
- Ops.push_back(Builder.getInt32(SI->getAlignment()));
- Builder.CreateCall(VstNFunc, Ops);
+ createStoreIntrinsic(BaseAddr, Shuffles);
}
return true;
}
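Finally, a sketch of the kind of source the MVE branches of these hooks target; whether it actually vectorizes this way depends on the loop vectorizer and cost model, so treat it as an assumption-laden example:

#include <cstdint>

// A factor-2 interleaved pattern: when vectorized for MVE, the interleaved
// access pass can now hand the group to the lambdas above, emitting
// @llvm.arm.mve.vld2q / vst2q rather than the NEON vldN/vstN intrinsics or a
// series of shuffles.
void cplx_conj_scale(const int32_t *in, int32_t *out, int32_t k, int n) {
  for (int i = 0; i < n; i++) {
    out[2 * i]     =  in[2 * i]     * k; // real lanes
    out[2 * i + 1] = -in[2 * i + 1] * k; // imaginary lanes, negated
  }
}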
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.h
index 53813fad5afd..1baa22a4fa56 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -278,7 +278,11 @@ class VectorType;
VST4_UPD,
VST2LN_UPD,
VST3LN_UPD,
- VST4LN_UPD
+ VST4LN_UPD,
+
+ // Load/Store of dual registers
+ LDRD,
+ STRD
};
} // end namespace ARMISD
@@ -377,7 +381,7 @@ class VectorType;
bool isLegalT2ScaledAddressingMode(const AddrMode &AM, EVT VT) const;
- /// Returns true if the addresing mode representing by AM is legal
+    /// Returns true if the addressing mode represented by AM is legal
/// for the Thumb1 target, for a load/store of the specified type.
bool isLegalT1ScaledAddressingMode(const AddrMode &AM, EVT VT) const;
@@ -604,7 +608,7 @@ class VectorType;
/// Returns true if \p VecTy is a legal interleaved access type. This
/// function checks the vector element type and the overall width of the
/// vector.
- bool isLegalInterleavedAccessType(VectorType *VecTy,
+ bool isLegalInterleavedAccessType(unsigned Factor, VectorType *VecTy,
const DataLayout &DL) const;
bool alignLoopsWithOptSize() const override;
@@ -731,23 +735,17 @@ class VectorType;
SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
void lowerABS(SDNode *N, SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const;
+ void LowerLOAD(SDNode *N, SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const;
- Register getRegisterByName(const char* RegName, EVT VT,
+ Register getRegisterByName(const char* RegName, LLT VT,
const MachineFunction &MF) const override;
SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
SmallVectorImpl<SDNode *> &Created) const override;
- /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
- /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
- /// expanded to FMAs when this method returns true, otherwise fmuladd is
- /// expanded to fmul + fadd.
- ///
- /// ARM supports both fused and unfused multiply-add operations; we already
- /// lower a pair of fmul and fadd to the latter so it's not clear that there
- /// would be a gain or that the gain would be worthwhile enough to risk
- /// correctness bugs.
- bool isFMAFasterThanFMulAndFAdd(EVT VT) const override { return false; }
+ bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
+ EVT VT) const override;
SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const;
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrInfo.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrInfo.td
index fe696222ec70..ce67af6f1b49 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrInfo.td
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrInfo.td
@@ -243,6 +243,12 @@ def ARMqsub8b : SDNode<"ARMISD::QSUB8b", SDT_ARMAnd, []>;
def ARMqadd16b : SDNode<"ARMISD::QADD16b", SDT_ARMAnd, []>;
def ARMqsub16b : SDNode<"ARMISD::QSUB16b", SDT_ARMAnd, []>;
+def SDT_ARMldrd : SDTypeProfile<2, 1, [SDTCisVT<0, i32>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
+def ARMldrd : SDNode<"ARMISD::LDRD", SDT_ARMldrd, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+
+def SDT_ARMstrd : SDTypeProfile<0, 3, [SDTCisVT<0, i32>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
+def ARMstrd : SDNode<"ARMISD::STRD", SDT_ARMstrd, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+
// Vector operations shared between NEON and MVE
def ARMvdup : SDNode<"ARMISD::VDUP", SDTypeProfile<1, 1, [SDTCisVec<0>]>>;
@@ -297,6 +303,28 @@ class RegConstraint<string C> {
string Constraints = C;
}
+// ARMCC condition codes. See ARMCC::CondCodes
+def ARMCCeq : PatLeaf<(i32 0)>;
+def ARMCCne : PatLeaf<(i32 1)>;
+def ARMCChs : PatLeaf<(i32 2)>;
+def ARMCClo : PatLeaf<(i32 3)>;
+def ARMCCmi : PatLeaf<(i32 4)>;
+def ARMCCpl : PatLeaf<(i32 5)>;
+def ARMCCvs : PatLeaf<(i32 6)>;
+def ARMCCvc : PatLeaf<(i32 7)>;
+def ARMCChi : PatLeaf<(i32 8)>;
+def ARMCCls : PatLeaf<(i32 9)>;
+def ARMCCge : PatLeaf<(i32 10)>;
+def ARMCClt : PatLeaf<(i32 11)>;
+def ARMCCgt : PatLeaf<(i32 12)>;
+def ARMCCle : PatLeaf<(i32 13)>;
+def ARMCCal : PatLeaf<(i32 14)>;
+
+// VCC predicates. See ARMVCC::VPTCodes
+def ARMVCCNone : PatLeaf<(i32 0)>;
+def ARMVCCThen : PatLeaf<(i32 1)>;
+def ARMVCCElse : PatLeaf<(i32 2)>;
+
//===----------------------------------------------------------------------===//
// ARM specific transformation functions and pattern fragments.
//
@@ -913,7 +941,10 @@ def MVEShiftImm1_7AsmOperand: ImmAsmOperand<1,7> {
// encodings allow.
let DiagnosticString = "operand must be an immediate in the range [1,8]";
}
-def mve_shift_imm1_7 : Operand<i32> {
+def mve_shift_imm1_7 : Operand<i32>,
+ // SelectImmediateInRange / isScaledConstantInRange uses a
+ // half-open interval, so the parameters <1,8> mean 1-7 inclusive
+ ComplexPattern<i32, 1, "SelectImmediateInRange<1,8>", [], []> {
let ParserMatchClass = MVEShiftImm1_7AsmOperand;
let EncoderMethod = "getMVEShiftImmOpValue";
}
@@ -926,7 +957,10 @@ def MVEShiftImm1_15AsmOperand: ImmAsmOperand<1,15> {
// encodings allow.
let DiagnosticString = "operand must be an immediate in the range [1,16]";
}
-def mve_shift_imm1_15 : Operand<i32> {
+def mve_shift_imm1_15 : Operand<i32>,
+ // SelectImmediateInRange / isScaledConstantInRange uses a
+ // half-open interval, so the parameters <1,16> mean 1-15 inclusive
+ ComplexPattern<i32, 1, "SelectImmediateInRange<1,16>", [], []> {
let ParserMatchClass = MVEShiftImm1_15AsmOperand;
let EncoderMethod = "getMVEShiftImmOpValue";
}
@@ -2667,6 +2701,14 @@ let mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1 in {
Requires<[IsARM, HasV5TE]>;
}
+let mayLoad = 1, hasSideEffects = 0, hasNoSchedulingInfo = 1 in {
+def LOADDUAL : ARMPseudoInst<(outs GPRPairOp:$Rt), (ins addrmode3:$addr),
+ 64, IIC_iLoad_d_r, []>,
+ Requires<[IsARM, HasV5TE]> {
+ let AM = AddrMode3;
+}
+}
+
def LDA : AIldracq<0b00, (outs GPR:$Rt), (ins addr_offset_none:$addr),
NoItinerary, "lda", "\t$Rt, $addr", []>;
def LDAB : AIldracq<0b10, (outs GPR:$Rt), (ins addr_offset_none:$addr),
@@ -2942,6 +2984,19 @@ let mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1 in {
}
}
+let mayStore = 1, hasSideEffects = 0, hasNoSchedulingInfo = 1 in {
+def STOREDUAL : ARMPseudoInst<(outs), (ins GPRPairOp:$Rt, addrmode3:$addr),
+ 64, IIC_iStore_d_r, []>,
+ Requires<[IsARM, HasV5TE]> {
+ let AM = AddrMode3;
+}
+}
+
+let Predicates = [IsARM, HasV5TE] in {
+def : Pat<(ARMstrd GPR:$Rt, GPR:$Rt2, addrmode3:$addr),
+ (STOREDUAL (REG_SEQUENCE GPRPair, GPR:$Rt, gsub_0, GPR:$Rt2, gsub_1), addrmode3:$addr)>;
+}
+
// Indexed stores
multiclass AI2_stridx<bit isByte, string opc,
InstrItinClass iii, InstrItinClass iir> {
@@ -6214,7 +6269,7 @@ def CMP_SWAP_64 : PseudoInst<(outs GPRPair:$Rd, GPR:$temp),
}
def CompilerBarrier : PseudoInst<(outs), (ins i32imm:$ordering), NoItinerary,
- [(atomic_fence imm:$ordering, 0)]> {
+ [(atomic_fence timm:$ordering, 0)]> {
let hasSideEffects = 1;
let Size = 0;
let AsmString = "@ COMPILER BARRIER";
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrMVE.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrMVE.td
index 4f67cd6e47cc..604291be822c 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -275,6 +275,83 @@ class mve_addr_q_shift<int shift> : MemOperand {
let MIOperandInfo = (ops MQPR:$base, i32imm:$imm);
}
+// A family of classes wrapping up information about the vector types
+// used by MVE.
+class MVEVectorVTInfo<ValueType vec, ValueType dblvec, ValueType pred,
+ bits<2> size, string suffixletter, bit unsigned> {
+ // The LLVM ValueType representing the vector, so we can use it in
+ // ISel patterns.
+ ValueType Vec = vec;
+
+ // The LLVM ValueType representing a vector with elements double the size
+ // of those in Vec, so we can use it in ISel patterns. It is up to the
+ // invoker of this class to ensure that this is a correct choice.
+ ValueType DblVec = dblvec;
+
+ // An LLVM ValueType representing a corresponding vector of
+ // predicate bits, for use in ISel patterns that handle an IR
+ // intrinsic describing the predicated form of the instruction.
+ //
+ // Usually, for a vector of N things, this will be vNi1. But for
+ // vectors of 2 values, we make an exception, and use v4i1 instead
+ // of v2i1. Rationale: MVE codegen doesn't support doing all the
+ // auxiliary operations on v2i1 (vector shuffles etc), and also,
+ // there's no MVE compare instruction that will _generate_ v2i1
+ // directly.
+ ValueType Pred = pred;
+
+ // The most common representation of the vector element size in MVE
+ // instruction encodings: a 2-bit value V representing an (8<<V)-bit
+ // vector element.
+ bits<2> Size = size;
+
+ // For vectors explicitly mentioning a signedness of integers: 0 for
+ // signed and 1 for unsigned. For anything else, undefined.
+ bit Unsigned = unsigned;
+
+ // The number of bits in a vector element, in integer form.
+ int LaneBits = !shl(8, Size);
+
+ // The suffix used in assembly language on an instruction operating
+ // on this lane if it only cares about number of bits.
+ string BitsSuffix = !if(!eq(suffixletter, "p"),
+ !if(!eq(unsigned, 0b0), "8", "16"),
+ !cast<string>(LaneBits));
+
+ // The suffix used on an instruction that mentions the whole type.
+ string Suffix = suffixletter ## BitsSuffix;
+
+ // The letter part of the suffix only.
+ string SuffixLetter = suffixletter;
+}
+
+// Integer vector types that don't treat signed and unsigned differently.
+def MVE_v16i8 : MVEVectorVTInfo<v16i8, v8i16, v16i1, 0b00, "i", ?>;
+def MVE_v8i16 : MVEVectorVTInfo<v8i16, v4i32, v8i1, 0b01, "i", ?>;
+def MVE_v4i32 : MVEVectorVTInfo<v4i32, v2i64, v4i1, 0b10, "i", ?>;
+def MVE_v2i64 : MVEVectorVTInfo<v2i64, ?, v4i1, 0b11, "i", ?>;
+
+// Explicitly signed and unsigned integer vectors. They map to the
+// same set of LLVM ValueTypes as above, but are represented
+// differently in assembly and instruction encodings.
+def MVE_v16s8 : MVEVectorVTInfo<v16i8, v8i16, v16i1, 0b00, "s", 0b0>;
+def MVE_v8s16 : MVEVectorVTInfo<v8i16, v4i32, v8i1, 0b01, "s", 0b0>;
+def MVE_v4s32 : MVEVectorVTInfo<v4i32, v2i64, v4i1, 0b10, "s", 0b0>;
+def MVE_v2s64 : MVEVectorVTInfo<v2i64, ?, v4i1, 0b11, "s", 0b0>;
+def MVE_v16u8 : MVEVectorVTInfo<v16i8, v8i16, v16i1, 0b00, "u", 0b1>;
+def MVE_v8u16 : MVEVectorVTInfo<v8i16, v4i32, v8i1, 0b01, "u", 0b1>;
+def MVE_v4u32 : MVEVectorVTInfo<v4i32, v2i64, v4i1, 0b10, "u", 0b1>;
+def MVE_v2u64 : MVEVectorVTInfo<v2i64, ?, v4i1, 0b11, "u", 0b1>;
+
+// FP vector types.
+def MVE_v8f16 : MVEVectorVTInfo<v8f16, v4f32, v8i1, 0b01, "f", ?>;
+def MVE_v4f32 : MVEVectorVTInfo<v4f32, v2f64, v4i1, 0b10, "f", ?>;
+def MVE_v2f64 : MVEVectorVTInfo<v2f64, ?, v4i1, 0b11, "f", ?>;
+
+// Polynomial vector types.
+def MVE_v16p8 : MVEVectorVTInfo<v16i8, v8i16, v16i1, 0b11, "p", 0b0>;
+def MVE_v8p16 : MVEVectorVTInfo<v8i16, v4i32, v8i1, 0b11, "p", 0b1>;
+
// --------- Start of base classes for the instructions themselves
class MVE_MI<dag oops, dag iops, InstrItinClass itin, string asm,
@@ -346,9 +423,12 @@ class MVE_ScalarShiftSingleReg<string iname, dag iops, string asm, string cstr,
let Inst{19-16} = RdaDest{3-0};
}
-class MVE_ScalarShiftSRegImm<string iname, bits<2> op5_4, list<dag> pattern=[]>
+class MVE_ScalarShiftSRegImm<string iname, bits<2> op5_4>
: MVE_ScalarShiftSingleReg<iname, (ins rGPR:$RdaSrc, long_shift:$imm),
- "$RdaSrc, $imm", "$RdaDest = $RdaSrc", pattern> {
+ "$RdaSrc, $imm", "$RdaDest = $RdaSrc",
+ [(set rGPR:$RdaDest,
+ (i32 (!cast<Intrinsic>("int_arm_mve_" # iname)
+ (i32 rGPR:$RdaSrc), (i32 imm:$imm))))]> {
bits<5> imm;
let Inst{15} = 0b0;
@@ -364,9 +444,12 @@ def MVE_SRSHR : MVE_ScalarShiftSRegImm<"srshr", 0b10>;
def MVE_UQSHL : MVE_ScalarShiftSRegImm<"uqshl", 0b00>;
def MVE_URSHR : MVE_ScalarShiftSRegImm<"urshr", 0b01>;
-class MVE_ScalarShiftSRegReg<string iname, bits<2> op5_4, list<dag> pattern=[]>
+class MVE_ScalarShiftSRegReg<string iname, bits<2> op5_4>
: MVE_ScalarShiftSingleReg<iname, (ins rGPR:$RdaSrc, rGPR:$Rm),
- "$RdaSrc, $Rm", "$RdaDest = $RdaSrc", pattern> {
+ "$RdaSrc, $Rm", "$RdaDest = $RdaSrc",
+ [(set rGPR:$RdaDest,
+ (i32 (!cast<Intrinsic>("int_arm_mve_" # iname)
+ (i32 rGPR:$RdaSrc), (i32 rGPR:$Rm))))]> {
bits<4> Rm;
let Inst{15-12} = Rm{3-0};
@@ -487,10 +570,10 @@ class MVE_rDest<dag oops, dag iops, InstrItinClass itin,
let Inst{4} = 0b0;
}
-class MVE_VABAV<string suffix, bit U, bits<2> size, list<dag> pattern=[]>
+class MVE_VABAV<string suffix, bit U, bits<2> size>
: MVE_rDest<(outs rGPR:$Rda), (ins rGPR:$Rda_src, MQPR:$Qn, MQPR:$Qm),
NoItinerary, "vabav", suffix, "$Rda, $Qn, $Qm", "$Rda = $Rda_src",
- pattern> {
+ []> {
bits<4> Qm;
bits<4> Qn;
bits<4> Rda;
@@ -509,12 +592,35 @@ class MVE_VABAV<string suffix, bit U, bits<2> size, list<dag> pattern=[]>
let Inst{0} = 0b1;
}
-def MVE_VABAVs8 : MVE_VABAV<"s8", 0b0, 0b00>;
-def MVE_VABAVs16 : MVE_VABAV<"s16", 0b0, 0b01>;
-def MVE_VABAVs32 : MVE_VABAV<"s32", 0b0, 0b10>;
-def MVE_VABAVu8 : MVE_VABAV<"u8", 0b1, 0b00>;
-def MVE_VABAVu16 : MVE_VABAV<"u16", 0b1, 0b01>;
-def MVE_VABAVu32 : MVE_VABAV<"u32", 0b1, 0b10>;
+multiclass MVE_VABAV_m<MVEVectorVTInfo VTI> {
+ def "" : MVE_VABAV<VTI.Suffix, VTI.Unsigned, VTI.Size>;
+ defvar Inst = !cast<Instruction>(NAME);
+
+ let Predicates = [HasMVEInt] in {
+ def : Pat<(i32 (int_arm_mve_vabav
+ (i32 VTI.Unsigned),
+ (i32 rGPR:$Rda_src),
+ (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm))),
+ (i32 (Inst (i32 rGPR:$Rda_src),
+ (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm)))>;
+
+ def : Pat<(i32 (int_arm_mve_vabav_predicated
+ (i32 VTI.Unsigned),
+ (i32 rGPR:$Rda_src),
+ (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm),
+ (VTI.Pred VCCR:$mask))),
+ (i32 (Inst (i32 rGPR:$Rda_src),
+ (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm),
+ ARMVCCThen, (VTI.Pred VCCR:$mask)))>;
+ }
+}
+
+defm MVE_VABAVs8 : MVE_VABAV_m<MVE_v16s8>;
+defm MVE_VABAVs16 : MVE_VABAV_m<MVE_v8s16>;
+defm MVE_VABAVs32 : MVE_VABAV_m<MVE_v4s32>;
+defm MVE_VABAVu8 : MVE_VABAV_m<MVE_v16u8>;
+defm MVE_VABAVu16 : MVE_VABAV_m<MVE_v8u16>;
+defm MVE_VABAVu32 : MVE_VABAV_m<MVE_v4u32>;
class MVE_VADDV<string iname, string suffix, dag iops, string cstr,
bit A, bit U, bits<2> size, list<dag> pattern=[]>
@@ -658,17 +764,31 @@ class MVE_VMINMAXV<string iname, string suffix, bit U, bits<2> size,
let Inst{0} = 0b0;
}
-multiclass MVE_VMINMAXV_ty<string iname, bit bit_7, list<dag> pattern=[]> {
- def s8 : MVE_VMINMAXV<iname, "s8", 0b0, 0b00, 0b1, bit_7>;
- def s16 : MVE_VMINMAXV<iname, "s16", 0b0, 0b01, 0b1, bit_7>;
- def s32 : MVE_VMINMAXV<iname, "s32", 0b0, 0b10, 0b1, bit_7>;
- def u8 : MVE_VMINMAXV<iname, "u8", 0b1, 0b00, 0b1, bit_7>;
- def u16 : MVE_VMINMAXV<iname, "u16", 0b1, 0b01, 0b1, bit_7>;
- def u32 : MVE_VMINMAXV<iname, "u32", 0b1, 0b10, 0b1, bit_7>;
+multiclass MVE_VMINMAXV_p<string iname, bit bit_17, bit bit_7,
+ MVEVectorVTInfo VTI, Intrinsic intr> {
+ def "": MVE_VMINMAXV<iname, VTI.Suffix, VTI.Unsigned, VTI.Size,
+ bit_17, bit_7>;
+ defvar Inst = !cast<Instruction>(NAME);
+
+ let Predicates = [HasMVEInt] in
+ def _pat : Pat<(i32 (intr (i32 rGPR:$prev), (VTI.Vec MQPR:$vec))),
+ (i32 (Inst (i32 rGPR:$prev), (VTI.Vec MQPR:$vec)))>;
+}
+
+multiclass MVE_VMINMAXV_ty<string iname, bit bit_7,
+ Intrinsic intr_s, Intrinsic intr_u> {
+ defm s8 : MVE_VMINMAXV_p<iname, 1, bit_7, MVE_v16s8, intr_s>;
+ defm s16: MVE_VMINMAXV_p<iname, 1, bit_7, MVE_v8s16, intr_s>;
+ defm s32: MVE_VMINMAXV_p<iname, 1, bit_7, MVE_v4s32, intr_s>;
+ defm u8 : MVE_VMINMAXV_p<iname, 1, bit_7, MVE_v16u8, intr_u>;
+ defm u16: MVE_VMINMAXV_p<iname, 1, bit_7, MVE_v8u16, intr_u>;
+ defm u32: MVE_VMINMAXV_p<iname, 1, bit_7, MVE_v4u32, intr_u>;
}
-defm MVE_VMINV : MVE_VMINMAXV_ty<"vminv", 0b1>;
-defm MVE_VMAXV : MVE_VMINMAXV_ty<"vmaxv", 0b0>;
+defm MVE_VMINV : MVE_VMINMAXV_ty<
+ "vminv", 0b1, int_arm_mve_minv_s, int_arm_mve_minv_u>;
+defm MVE_VMAXV : MVE_VMINMAXV_ty<
+ "vmaxv", 0b0, int_arm_mve_maxv_s, int_arm_mve_maxv_u>;
let Predicates = [HasMVEInt] in {
def : Pat<(i32 (vecreduce_smax (v16i8 MQPR:$src))),
@@ -709,10 +829,9 @@ defm MVE_VMINAV : MVE_VMINMAXAV_ty<"vminav", 0b1>;
defm MVE_VMAXAV : MVE_VMINMAXAV_ty<"vmaxav", 0b0>;
class MVE_VMLAMLSDAV<string iname, string suffix, dag iops, string cstr,
- bit sz, bit bit_28, bit A, bit X, bit bit_8, bit bit_0,
- list<dag> pattern=[]>
+ bit sz, bit bit_28, bit A, bit X, bit bit_8, bit bit_0>
: MVE_rDest<(outs tGPREven:$RdaDest), iops, NoItinerary, iname, suffix,
- "$RdaDest, $Qn, $Qm", cstr, pattern> {
+ "$RdaDest, $Qn, $Qm", cstr, []> {
bits<4> RdaDest;
bits<3> Qm;
bits<3> Qn;
@@ -730,47 +849,88 @@ class MVE_VMLAMLSDAV<string iname, string suffix, dag iops, string cstr,
let Inst{0} = bit_0;
}
-multiclass MVE_VMLAMLSDAV_A<string iname, string x, string suffix,
- bit sz, bit bit_28, bit X, bit bit_8, bit bit_0,
- list<dag> pattern=[]> {
- def ""#x#suffix : MVE_VMLAMLSDAV<iname # x, suffix,
+multiclass MVE_VMLAMLSDAV_A<string iname, string x, MVEVectorVTInfo VTI,
+ bit sz, bit bit_28, bit X, bit bit_8, bit bit_0> {
+ def ""#x#VTI.Suffix : MVE_VMLAMLSDAV<iname # x, VTI.Suffix,
(ins MQPR:$Qn, MQPR:$Qm), "",
- sz, bit_28, 0b0, X, bit_8, bit_0, pattern>;
- def "a"#x#suffix : MVE_VMLAMLSDAV<iname # "a" # x, suffix,
+ sz, bit_28, 0b0, X, bit_8, bit_0>;
+ def "a"#x#VTI.Suffix : MVE_VMLAMLSDAV<iname # "a" # x, VTI.Suffix,
(ins tGPREven:$RdaSrc, MQPR:$Qn, MQPR:$Qm),
"$RdaDest = $RdaSrc",
- sz, bit_28, 0b1, X, bit_8, bit_0, pattern>;
+ sz, bit_28, 0b1, X, bit_8, bit_0>;
+ let Predicates = [HasMVEInt] in {
+ def : Pat<(i32 (int_arm_mve_vmldava
+ (i32 VTI.Unsigned),
+ (i32 bit_0) /* subtract */,
+ (i32 X) /* exchange */,
+ (i32 0) /* accumulator */,
+ (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm))),
+ (i32 (!cast<Instruction>(NAME # x # VTI.Suffix)
+ (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm)))>;
+
+ def : Pat<(i32 (int_arm_mve_vmldava_predicated
+ (i32 VTI.Unsigned),
+ (i32 bit_0) /* subtract */,
+ (i32 X) /* exchange */,
+ (i32 0) /* accumulator */,
+ (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm),
+ (VTI.Pred VCCR:$mask))),
+ (i32 (!cast<Instruction>(NAME # x # VTI.Suffix)
+ (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm),
+ ARMVCCThen, (VTI.Pred VCCR:$mask)))>;
+
+ def : Pat<(i32 (int_arm_mve_vmldava
+ (i32 VTI.Unsigned),
+ (i32 bit_0) /* subtract */,
+ (i32 X) /* exchange */,
+ (i32 tGPREven:$RdaSrc),
+ (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm))),
+ (i32 (!cast<Instruction>(NAME # "a" # x # VTI.Suffix)
+ (i32 tGPREven:$RdaSrc),
+ (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm)))>;
+
+ def : Pat<(i32 (int_arm_mve_vmldava_predicated
+ (i32 VTI.Unsigned),
+ (i32 bit_0) /* subtract */,
+ (i32 X) /* exchange */,
+ (i32 tGPREven:$RdaSrc),
+ (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm),
+ (VTI.Pred VCCR:$mask))),
+ (i32 (!cast<Instruction>(NAME # "a" # x # VTI.Suffix)
+ (i32 tGPREven:$RdaSrc),
+ (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm),
+ ARMVCCThen, (VTI.Pred VCCR:$mask)))>;
+ }
}
-multiclass MVE_VMLAMLSDAV_AX<string iname, string suffix, bit sz, bit bit_28,
- bit bit_8, bit bit_0, list<dag> pattern=[]> {
- defm "" : MVE_VMLAMLSDAV_A<iname, "", suffix, sz, bit_28,
- 0b0, bit_8, bit_0, pattern>;
- defm "" : MVE_VMLAMLSDAV_A<iname, "x", suffix, sz, bit_28,
- 0b1, bit_8, bit_0, pattern>;
+multiclass MVE_VMLAMLSDAV_AX<string iname, MVEVectorVTInfo VTI, bit sz,
+ bit bit_28, bit bit_8, bit bit_0> {
+ defm "" : MVE_VMLAMLSDAV_A<iname, "", VTI, sz, bit_28,
+ 0b0, bit_8, bit_0>;
+ defm "" : MVE_VMLAMLSDAV_A<iname, "x", VTI, sz, bit_28,
+ 0b1, bit_8, bit_0>;
}
-multiclass MVE_VMLADAV_multi<string suffix, bit sz, bit bit_8,
- list<dag> pattern=[]> {
- defm "" : MVE_VMLAMLSDAV_AX<"vmladav", "s"#suffix,
- sz, 0b0, bit_8, 0b0, pattern>;
- defm "" : MVE_VMLAMLSDAV_A<"vmladav", "", "u"#suffix,
- sz, 0b1, 0b0, bit_8, 0b0, pattern>;
+multiclass MVE_VMLADAV_multi<MVEVectorVTInfo SVTI, MVEVectorVTInfo UVTI,
+ bit sz, bit bit_8> {
+ defm "" : MVE_VMLAMLSDAV_AX<"vmladav", SVTI,
+ sz, 0b0, bit_8, 0b0>;
+ defm "" : MVE_VMLAMLSDAV_A<"vmladav", "", UVTI,
+ sz, 0b1, 0b0, bit_8, 0b0>;
}
-multiclass MVE_VMLSDAV_multi<string suffix, bit sz, bit bit_28,
- list<dag> pattern=[]> {
- defm "" : MVE_VMLAMLSDAV_AX<"vmlsdav", "s"#suffix,
- sz, bit_28, 0b0, 0b1, pattern>;
+multiclass MVE_VMLSDAV_multi<MVEVectorVTInfo VTI, bit sz, bit bit_28> {
+ defm "" : MVE_VMLAMLSDAV_AX<"vmlsdav", VTI,
+ sz, bit_28, 0b0, 0b1>;
}
-defm MVE_VMLADAV : MVE_VMLADAV_multi< "8", 0b0, 0b1>;
-defm MVE_VMLADAV : MVE_VMLADAV_multi<"16", 0b0, 0b0>;
-defm MVE_VMLADAV : MVE_VMLADAV_multi<"32", 0b1, 0b0>;
+defm MVE_VMLADAV : MVE_VMLADAV_multi<MVE_v16s8, MVE_v16u8, 0b0, 0b1>;
+defm MVE_VMLADAV : MVE_VMLADAV_multi<MVE_v8s16, MVE_v8u16, 0b0, 0b0>;
+defm MVE_VMLADAV : MVE_VMLADAV_multi<MVE_v4s32, MVE_v4u32, 0b1, 0b0>;
-defm MVE_VMLSDAV : MVE_VMLSDAV_multi< "8", 0b0, 0b1>;
-defm MVE_VMLSDAV : MVE_VMLSDAV_multi<"16", 0b0, 0b0>;
-defm MVE_VMLSDAV : MVE_VMLSDAV_multi<"32", 0b1, 0b0>;
+defm MVE_VMLSDAV : MVE_VMLSDAV_multi<MVE_v16s8, 0b0, 0b1>;
+defm MVE_VMLSDAV : MVE_VMLSDAV_multi<MVE_v8s16, 0b0, 0b0>;
+defm MVE_VMLSDAV : MVE_VMLSDAV_multi<MVE_v4s32, 0b1, 0b0>;
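int_arm_mve_vmldava folds the whole VMLADAV/VMLSDAV family into one intrinsic, distinguished by the subtract/exchange/accumulator operands annotated in the patterns above. As a rough scalar model (illustrative C, not LLVM code), the accumulating signed 16-bit form is a lane-wise dot product:

#include <stdint.h>

/* Scalar model of VMLADAVA.S16 Rda, Qn, Qm: a lane-wise dot product
   accumulated into a 32-bit register that wraps on overflow.
   VMLSDAV subtracts the odd-lane products instead, and the 'X'
   (exchange) forms operate on crossed lane pairs; neither variation
   is modelled here. */
int32_t vmladava_s16_model(int32_t rda, const int16_t qn[8],
                           const int16_t qm[8]) {
    uint32_t acc = (uint32_t)rda;   /* unsigned so the wrap is well defined */
    for (int i = 0; i < 8; i++)
        acc += (uint32_t)((int32_t)qn[i] * (int32_t)qm[i]);
    return (int32_t)acc;
}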
// vmlav(a) is accepted as an assembly alias for vmladav(a)
foreach acc = ["", "a"] in {
@@ -932,6 +1092,16 @@ let Predicates = [HasMVEFloat] in {
(v4f32 (MVE_VMAXNMf32 (v4f32 MQPR:$val1), (v4f32 MQPR:$val2)))>;
def : Pat<(v8f16 (fmaxnum (v8f16 MQPR:$val1), (v8f16 MQPR:$val2))),
(v8f16 (MVE_VMAXNMf16 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2)))>;
+ def : Pat<(v4f32 (int_arm_mve_max_predicated (v4f32 MQPR:$val1), (v4f32 MQPR:$val2), (i32 0),
+ (v4i1 VCCR:$mask), (v4f32 MQPR:$inactive))),
+ (v4f32 (MVE_VMAXNMf32 (v4f32 MQPR:$val1), (v4f32 MQPR:$val2),
+ ARMVCCThen, (v4i1 VCCR:$mask),
+ (v4f32 MQPR:$inactive)))>;
+ def : Pat<(v8f16 (int_arm_mve_max_predicated (v8f16 MQPR:$val1), (v8f16 MQPR:$val2), (i32 0),
+ (v8i1 VCCR:$mask), (v8f16 MQPR:$inactive))),
+ (v8f16 (MVE_VMAXNMf16 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2),
+ ARMVCCThen, (v8i1 VCCR:$mask),
+ (v8f16 MQPR:$inactive)))>;
}
def MVE_VMINNMf32 : MVE_VMINMAXNM<"vminnm", "f32", 0b0, 0b1>;
@@ -942,6 +1112,16 @@ let Predicates = [HasMVEFloat] in {
(v4f32 (MVE_VMINNMf32 (v4f32 MQPR:$val1), (v4f32 MQPR:$val2)))>;
def : Pat<(v8f16 (fminnum (v8f16 MQPR:$val1), (v8f16 MQPR:$val2))),
(v8f16 (MVE_VMINNMf16 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2)))>;
+ def : Pat<(v4f32 (int_arm_mve_min_predicated (v4f32 MQPR:$val1), (v4f32 MQPR:$val2),
+ (i32 0), (v4i1 VCCR:$mask), (v4f32 MQPR:$inactive))),
+ (v4f32 (MVE_VMINNMf32 (v4f32 MQPR:$val1), (v4f32 MQPR:$val2),
+ ARMVCCThen, (v4i1 VCCR:$mask),
+ (v4f32 MQPR:$inactive)))>;
+ def : Pat<(v8f16 (int_arm_mve_min_predicated (v8f16 MQPR:$val1), (v8f16 MQPR:$val2),
+ (i32 0), (v8i1 VCCR:$mask), (v8f16 MQPR:$inactive))),
+ (v8f16 (MVE_VMINNMf16 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2),
+ ARMVCCThen, (v8i1 VCCR:$mask),
+ (v8f16 MQPR:$inactive)))>;
}
@@ -957,50 +1137,48 @@ class MVE_VMINMAX<string iname, string suffix, bit U, bits<2> size,
let Inst{8} = 0b0;
let Inst{6} = 0b1;
let Inst{4} = bit_4;
+ let validForTailPredication = 1;
}
-multiclass MVE_VMINMAX_all_sizes<string iname, bit bit_4> {
- def s8 : MVE_VMINMAX<iname, "s8", 0b0, 0b00, bit_4>;
- def s16 : MVE_VMINMAX<iname, "s16", 0b0, 0b01, bit_4>;
- def s32 : MVE_VMINMAX<iname, "s32", 0b0, 0b10, bit_4>;
- def u8 : MVE_VMINMAX<iname, "u8", 0b1, 0b00, bit_4>;
- def u16 : MVE_VMINMAX<iname, "u16", 0b1, 0b01, bit_4>;
- def u32 : MVE_VMINMAX<iname, "u32", 0b1, 0b10, bit_4>;
-}
+multiclass MVE_VMINMAX_m<string iname, bit bit_4, MVEVectorVTInfo VTI,
+ SDNode unpred_op, Intrinsic pred_int> {
+ def "" : MVE_VMINMAX<iname, VTI.Suffix, VTI.Unsigned, VTI.Size, bit_4>;
+ defvar Inst = !cast<Instruction>(NAME);
-defm MVE_VMAX : MVE_VMINMAX_all_sizes<"vmax", 0b0>;
-defm MVE_VMIN : MVE_VMINMAX_all_sizes<"vmin", 0b1>;
+ let Predicates = [HasMVEInt] in {
+ // Unpredicated min/max
+ def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
-let Predicates = [HasMVEInt] in {
- def : Pat<(v16i8 (smin (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))),
- (v16i8 (MVE_VMINs8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>;
- def : Pat<(v8i16 (smin (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))),
- (v8i16 (MVE_VMINs16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>;
- def : Pat<(v4i32 (smin (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))),
- (v4i32 (MVE_VMINs32 (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>;
-
- def : Pat<(v16i8 (smax (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))),
- (v16i8 (MVE_VMAXs8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>;
- def : Pat<(v8i16 (smax (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))),
- (v8i16 (MVE_VMAXs16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>;
- def : Pat<(v4i32 (smax (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))),
- (v4i32 (MVE_VMAXs32 (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>;
-
- def : Pat<(v16i8 (umin (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))),
- (v16i8 (MVE_VMINu8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>;
- def : Pat<(v8i16 (umin (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))),
- (v8i16 (MVE_VMINu16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>;
- def : Pat<(v4i32 (umin (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))),
- (v4i32 (MVE_VMINu32 (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>;
-
- def : Pat<(v16i8 (umax (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))),
- (v16i8 (MVE_VMAXu8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>;
- def : Pat<(v8i16 (umax (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))),
- (v8i16 (MVE_VMAXu16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>;
- def : Pat<(v4i32 (umax (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))),
- (v4i32 (MVE_VMAXu32 (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>;
+ // Predicated min/max
+ def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+ (i32 VTI.Unsigned), (VTI.Pred VCCR:$mask),
+ (VTI.Vec MQPR:$inactive))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+ ARMVCCThen, (VTI.Pred VCCR:$mask),
+ (VTI.Vec MQPR:$inactive)))>;
+ }
}
+multiclass MVE_VMAX<MVEVectorVTInfo VTI>
+ : MVE_VMINMAX_m<"vmax", 0b0, VTI, !if(VTI.Unsigned, umax, smax), int_arm_mve_max_predicated>;
+multiclass MVE_VMIN<MVEVectorVTInfo VTI>
+ : MVE_VMINMAX_m<"vmin", 0b1, VTI, !if(VTI.Unsigned, umin, smin), int_arm_mve_min_predicated>;
+
+defm MVE_VMINs8 : MVE_VMIN<MVE_v16s8>;
+defm MVE_VMINs16 : MVE_VMIN<MVE_v8s16>;
+defm MVE_VMINs32 : MVE_VMIN<MVE_v4s32>;
+defm MVE_VMINu8 : MVE_VMIN<MVE_v16u8>;
+defm MVE_VMINu16 : MVE_VMIN<MVE_v8u16>;
+defm MVE_VMINu32 : MVE_VMIN<MVE_v4u32>;
+
+defm MVE_VMAXs8 : MVE_VMAX<MVE_v16s8>;
+defm MVE_VMAXs16 : MVE_VMAX<MVE_v8s16>;
+defm MVE_VMAXs32 : MVE_VMAX<MVE_v4s32>;
+defm MVE_VMAXu8 : MVE_VMAX<MVE_v16u8>;
+defm MVE_VMAXu16 : MVE_VMAX<MVE_v8u16>;
+defm MVE_VMAXu32 : MVE_VMAX<MVE_v4u32>;
+
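All of the *_predicated intrinsics used in this file share one contract: lanes whose predicate bit is set receive the result of the operation, and the remaining lanes are taken from the $inactive operand, which models the destination being left untouched under VPT/VPST predication. A simplified C sketch of that merging behaviour for 32-bit lanes, with illustrative names:

#include <stdint.h>

/* Merge model for a predicated 4 x i32 operation: active lanes take
   op(a, b), inactive lanes are copied from 'inactive'.  Each 32-bit
   lane owns four predicate bits; testing the lowest one is enough for
   this sketch. */
void predicated_op_v4i32(int32_t dst[4], const int32_t a[4],
                         const int32_t b[4], const int32_t inactive[4],
                         uint16_t mask, int32_t (*op)(int32_t, int32_t)) {
    for (int i = 0; i < 4; i++)
        dst[i] = (mask & (1u << (4 * i))) ? op(a[i], b[i]) : inactive[i];
}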
// end of mve_comp instructions
// start of mve_bit instructions
@@ -1150,53 +1328,61 @@ foreach s=["s8", "s16", "s32", "u8", "u16", "u32", "i8", "i16", "i32", "f16", "f
(MVE_VAND MQPR:$QdSrc, MQPR:$QnSrc, MQPR:$QmSrc, vpred_r:$vp)>;
}
-let Predicates = [HasMVEInt] in {
- def : Pat<(v16i8 (and (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))),
- (v16i8 (MVE_VAND (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>;
- def : Pat<(v8i16 (and (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))),
- (v8i16 (MVE_VAND (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>;
- def : Pat<(v4i32 (and (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))),
- (v4i32 (MVE_VAND (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>;
- def : Pat<(v2i64 (and (v2i64 MQPR:$val1), (v2i64 MQPR:$val2))),
- (v2i64 (MVE_VAND (v2i64 MQPR:$val1), (v2i64 MQPR:$val2)))>;
-
- def : Pat<(v16i8 (or (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))),
- (v16i8 (MVE_VORR (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>;
- def : Pat<(v8i16 (or (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))),
- (v8i16 (MVE_VORR (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>;
- def : Pat<(v4i32 (or (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))),
- (v4i32 (MVE_VORR (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>;
- def : Pat<(v2i64 (or (v2i64 MQPR:$val1), (v2i64 MQPR:$val2))),
- (v2i64 (MVE_VORR (v2i64 MQPR:$val1), (v2i64 MQPR:$val2)))>;
-
- def : Pat<(v16i8 (xor (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))),
- (v16i8 (MVE_VEOR (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>;
- def : Pat<(v8i16 (xor (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))),
- (v8i16 (MVE_VEOR (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>;
- def : Pat<(v4i32 (xor (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))),
- (v4i32 (MVE_VEOR (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>;
- def : Pat<(v2i64 (xor (v2i64 MQPR:$val1), (v2i64 MQPR:$val2))),
- (v2i64 (MVE_VEOR (v2i64 MQPR:$val1), (v2i64 MQPR:$val2)))>;
-
- def : Pat<(v16i8 (and (v16i8 MQPR:$val1), (vnotq MQPR:$val2))),
- (v16i8 (MVE_VBIC (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>;
- def : Pat<(v8i16 (and (v8i16 MQPR:$val1), (vnotq MQPR:$val2))),
- (v8i16 (MVE_VBIC (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>;
- def : Pat<(v4i32 (and (v4i32 MQPR:$val1), (vnotq MQPR:$val2))),
- (v4i32 (MVE_VBIC (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>;
- def : Pat<(v2i64 (and (v2i64 MQPR:$val1), (vnotq MQPR:$val2))),
- (v2i64 (MVE_VBIC (v2i64 MQPR:$val1), (v2i64 MQPR:$val2)))>;
-
- def : Pat<(v16i8 (or (v16i8 MQPR:$val1), (vnotq MQPR:$val2))),
- (v16i8 (MVE_VORN (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>;
- def : Pat<(v8i16 (or (v8i16 MQPR:$val1), (vnotq MQPR:$val2))),
- (v8i16 (MVE_VORN (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>;
- def : Pat<(v4i32 (or (v4i32 MQPR:$val1), (vnotq MQPR:$val2))),
- (v4i32 (MVE_VORN (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>;
- def : Pat<(v2i64 (or (v2i64 MQPR:$val1), (vnotq MQPR:$val2))),
- (v2i64 (MVE_VORN (v2i64 MQPR:$val1), (v2i64 MQPR:$val2)))>;
+multiclass MVE_bit_op<MVEVectorVTInfo VTI, SDNode unpred_op, Intrinsic pred_int, MVE_bit_ops instruction> {
+ let Predicates = [HasMVEInt] in {
+ // Unpredicated operation
+ def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))),
+ (VTI.Vec (instruction (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
+ // Predicated operation
+ def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+ (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))),
+ (VTI.Vec (instruction
+ (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+ ARMVCCThen, (VTI.Pred VCCR:$mask),
+ (VTI.Vec MQPR:$inactive)))>;
+ }
+}
+
+defm : MVE_bit_op<MVE_v16i8, and, int_arm_mve_and_predicated, MVE_VAND>;
+defm : MVE_bit_op<MVE_v8i16, and, int_arm_mve_and_predicated, MVE_VAND>;
+defm : MVE_bit_op<MVE_v4i32, and, int_arm_mve_and_predicated, MVE_VAND>;
+defm : MVE_bit_op<MVE_v2i64, and, int_arm_mve_and_predicated, MVE_VAND>;
+
+defm : MVE_bit_op<MVE_v16i8, or, int_arm_mve_orr_predicated, MVE_VORR>;
+defm : MVE_bit_op<MVE_v8i16, or, int_arm_mve_orr_predicated, MVE_VORR>;
+defm : MVE_bit_op<MVE_v4i32, or, int_arm_mve_orr_predicated, MVE_VORR>;
+defm : MVE_bit_op<MVE_v2i64, or, int_arm_mve_orr_predicated, MVE_VORR>;
+
+defm : MVE_bit_op<MVE_v16i8, xor, int_arm_mve_eor_predicated, MVE_VEOR>;
+defm : MVE_bit_op<MVE_v8i16, xor, int_arm_mve_eor_predicated, MVE_VEOR>;
+defm : MVE_bit_op<MVE_v4i32, xor, int_arm_mve_eor_predicated, MVE_VEOR>;
+defm : MVE_bit_op<MVE_v2i64, xor, int_arm_mve_eor_predicated, MVE_VEOR>;
+
+multiclass MVE_bit_op_with_inv<MVEVectorVTInfo VTI, SDNode unpred_op, Intrinsic pred_int, MVE_bit_ops instruction> {
+ let Predicates = [HasMVEInt] in {
+ // Unpredicated operation
+ def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (vnotq (VTI.Vec MQPR:$Qn)))),
+ (VTI.Vec (instruction (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
+ // Predicated operation
+ def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+ (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))),
+ (VTI.Vec (instruction
+ (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+ ARMVCCThen, (VTI.Pred VCCR:$mask),
+ (VTI.Vec MQPR:$inactive)))>;
+ }
}
+defm : MVE_bit_op_with_inv<MVE_v16i8, and, int_arm_mve_bic_predicated, MVE_VBIC>;
+defm : MVE_bit_op_with_inv<MVE_v8i16, and, int_arm_mve_bic_predicated, MVE_VBIC>;
+defm : MVE_bit_op_with_inv<MVE_v4i32, and, int_arm_mve_bic_predicated, MVE_VBIC>;
+defm : MVE_bit_op_with_inv<MVE_v2i64, and, int_arm_mve_bic_predicated, MVE_VBIC>;
+
+defm : MVE_bit_op_with_inv<MVE_v16i8, or, int_arm_mve_orn_predicated, MVE_VORN>;
+defm : MVE_bit_op_with_inv<MVE_v8i16, or, int_arm_mve_orn_predicated, MVE_VORN>;
+defm : MVE_bit_op_with_inv<MVE_v4i32, or, int_arm_mve_orn_predicated, MVE_VORN>;
+defm : MVE_bit_op_with_inv<MVE_v2i64, or, int_arm_mve_orn_predicated, MVE_VORN>;
+
class MVE_bit_cmode<string iname, string suffix, bits<4> cmode, dag inOps>
: MVE_p<(outs MQPR:$Qd), inOps, NoItinerary,
iname, suffix, "$Qd, $imm", vpred_n, "$Qd = $Qd_src"> {
@@ -1429,8 +1615,9 @@ class MVE_int<string iname, string suffix, bits<2> size, list<dag> pattern=[]>
let Inst{3-1} = Qm{2-0};
}
-class MVE_VMULt1<string suffix, bits<2> size, list<dag> pattern=[]>
- : MVE_int<"vmul", suffix, size, pattern> {
+class MVE_VMULt1<string iname, string suffix, bits<2> size,
+ list<dag> pattern=[]>
+ : MVE_int<iname, suffix, size, pattern> {
let Inst{28} = 0b0;
let Inst{25-23} = 0b110;
@@ -1438,22 +1625,36 @@ class MVE_VMULt1<string suffix, bits<2> size, list<dag> pattern=[]>
let Inst{12-8} = 0b01001;
let Inst{4} = 0b1;
let Inst{0} = 0b0;
+ let validForTailPredication = 1;
}
-def MVE_VMULt1i8 : MVE_VMULt1<"i8", 0b00>;
-def MVE_VMULt1i16 : MVE_VMULt1<"i16", 0b01>;
-def MVE_VMULt1i32 : MVE_VMULt1<"i32", 0b10>;
+multiclass MVE_VMUL_m<string iname, MVEVectorVTInfo VTI,
+ SDNode unpred_op, Intrinsic pred_int> {
+ def "" : MVE_VMULt1<iname, VTI.Suffix, VTI.Size>;
+ defvar Inst = !cast<Instruction>(NAME);
-let Predicates = [HasMVEInt] in {
- def : Pat<(v16i8 (mul (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))),
- (v16i8 (MVE_VMULt1i8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>;
- def : Pat<(v8i16 (mul (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))),
- (v8i16 (MVE_VMULt1i16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>;
- def : Pat<(v4i32 (mul (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))),
- (v4i32 (MVE_VMULt1i32 (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>;
+ let Predicates = [HasMVEInt] in {
+ // Unpredicated multiply
+ def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
+
+ // Predicated multiply
+ def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+ (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+ ARMVCCThen, (VTI.Pred VCCR:$mask),
+ (VTI.Vec MQPR:$inactive)))>;
+ }
}
-class MVE_VQxDMULH<string iname, string suffix, bits<2> size, bit rounding,
+multiclass MVE_VMUL<MVEVectorVTInfo VTI>
+ : MVE_VMUL_m<"vmul", VTI, mul, int_arm_mve_mul_predicated>;
+
+defm MVE_VMULi8 : MVE_VMUL<MVE_v16i8>;
+defm MVE_VMULi16 : MVE_VMUL<MVE_v8i16>;
+defm MVE_VMULi32 : MVE_VMUL<MVE_v4i32>;
+
+class MVE_VQxDMULH_Base<string iname, string suffix, bits<2> size, bit rounding,
list<dag> pattern=[]>
: MVE_int<iname, suffix, size, pattern> {
@@ -1465,18 +1666,40 @@ class MVE_VQxDMULH<string iname, string suffix, bits<2> size, bit rounding,
let Inst{0} = 0b0;
}
-class MVE_VQDMULH<string suffix, bits<2> size, list<dag> pattern=[]>
- : MVE_VQxDMULH<"vqdmulh", suffix, size, 0b0, pattern>;
-class MVE_VQRDMULH<string suffix, bits<2> size, list<dag> pattern=[]>
- : MVE_VQxDMULH<"vqrdmulh", suffix, size, 0b1, pattern>;
+multiclass MVE_VQxDMULH_m<string iname, MVEVectorVTInfo VTI,
+ SDNode unpred_op, Intrinsic pred_int,
+ bit rounding> {
+ def "" : MVE_VQxDMULH_Base<iname, VTI.Suffix, VTI.Size, rounding>;
+ defvar Inst = !cast<Instruction>(NAME);
-def MVE_VQDMULHi8 : MVE_VQDMULH<"s8", 0b00>;
-def MVE_VQDMULHi16 : MVE_VQDMULH<"s16", 0b01>;
-def MVE_VQDMULHi32 : MVE_VQDMULH<"s32", 0b10>;
+ let Predicates = [HasMVEInt] in {
+ // Unpredicated multiply
+ def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
+
+ // Predicated multiply
+ def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+ (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+ ARMVCCThen, (VTI.Pred VCCR:$mask),
+ (VTI.Vec MQPR:$inactive)))>;
+ }
+}
-def MVE_VQRDMULHi8 : MVE_VQRDMULH<"s8", 0b00>;
-def MVE_VQRDMULHi16 : MVE_VQRDMULH<"s16", 0b01>;
-def MVE_VQRDMULHi32 : MVE_VQRDMULH<"s32", 0b10>;
+multiclass MVE_VQxDMULH<string iname, MVEVectorVTInfo VTI, bit rounding>
+ : MVE_VQxDMULH_m<iname, VTI, !if(rounding, int_arm_mve_vqrdmulh,
+ int_arm_mve_vqdmulh),
+ !if(rounding, int_arm_mve_qrdmulh_predicated,
+ int_arm_mve_qdmulh_predicated),
+ rounding>;
+
+defm MVE_VQDMULHi8 : MVE_VQxDMULH<"vqdmulh", MVE_v16s8, 0b0>;
+defm MVE_VQDMULHi16 : MVE_VQxDMULH<"vqdmulh", MVE_v8s16, 0b0>;
+defm MVE_VQDMULHi32 : MVE_VQxDMULH<"vqdmulh", MVE_v4s32, 0b0>;
+
+defm MVE_VQRDMULHi8 : MVE_VQxDMULH<"vqrdmulh", MVE_v16s8, 0b1>;
+defm MVE_VQRDMULHi16 : MVE_VQxDMULH<"vqrdmulh", MVE_v8s16, 0b1>;
+defm MVE_VQRDMULHi32 : MVE_VQxDMULH<"vqrdmulh", MVE_v4s32, 0b1>;
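The vqdmulh/vqrdmulh intrinsics selected here are assumed to have the usual saturating doubling multiply-return-high semantics, with the rounding form adding half an LSB before the shift. A one-lane C sketch for 16-bit elements (illustrative, assumes arithmetic right shift of negative values):

#include <stdint.h>

/* One lane of VQDMULH.S16 (rounding = 0) or VQRDMULH.S16 (rounding = 1):
   (2*a*b [+ 1<<15]) >> 16, saturated to int16_t.  Only a == b == INT16_MIN
   can actually overflow the result range. */
int16_t vqdmulh_s16_lane(int16_t a, int16_t b, int rounding) {
    int64_t prod = 2 * (int64_t)a * (int64_t)b + (rounding ? (1 << 15) : 0);
    int64_t res = prod >> 16;              /* assumes arithmetic shift */
    if (res > INT16_MAX) res = INT16_MAX;  /* saturate */
    if (res < INT16_MIN) res = INT16_MIN;
    return (int16_t)res;
}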
class MVE_VADDSUB<string iname, string suffix, bits<2> size, bit subtract,
list<dag> pattern=[]>
@@ -1491,39 +1714,40 @@ class MVE_VADDSUB<string iname, string suffix, bits<2> size, bit subtract,
let validForTailPredication = 1;
}
-class MVE_VADD<string suffix, bits<2> size, list<dag> pattern=[]>
- : MVE_VADDSUB<"vadd", suffix, size, 0b0, pattern>;
-class MVE_VSUB<string suffix, bits<2> size, list<dag> pattern=[]>
- : MVE_VADDSUB<"vsub", suffix, size, 0b1, pattern>;
+multiclass MVE_VADDSUB_m<string iname, MVEVectorVTInfo VTI, bit subtract,
+ SDNode unpred_op, Intrinsic pred_int> {
+ def "" : MVE_VADDSUB<iname, VTI.Suffix, VTI.Size, subtract>;
+ defvar Inst = !cast<Instruction>(NAME);
-def MVE_VADDi8 : MVE_VADD<"i8", 0b00>;
-def MVE_VADDi16 : MVE_VADD<"i16", 0b01>;
-def MVE_VADDi32 : MVE_VADD<"i32", 0b10>;
+ let Predicates = [HasMVEInt] in {
+ // Unpredicated add/subtract
+ def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
-let Predicates = [HasMVEInt] in {
- def : Pat<(v16i8 (add (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))),
- (v16i8 (MVE_VADDi8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>;
- def : Pat<(v8i16 (add (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))),
- (v8i16 (MVE_VADDi16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>;
- def : Pat<(v4i32 (add (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))),
- (v4i32 (MVE_VADDi32 (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>;
+ // Predicated add/subtract
+ def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+ (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+ ARMVCCThen, (VTI.Pred VCCR:$mask),
+ (VTI.Vec MQPR:$inactive)))>;
+ }
}
-def MVE_VSUBi8 : MVE_VSUB<"i8", 0b00>;
-def MVE_VSUBi16 : MVE_VSUB<"i16", 0b01>;
-def MVE_VSUBi32 : MVE_VSUB<"i32", 0b10>;
+multiclass MVE_VADD<MVEVectorVTInfo VTI>
+ : MVE_VADDSUB_m<"vadd", VTI, 0b0, add, int_arm_mve_add_predicated>;
+multiclass MVE_VSUB<MVEVectorVTInfo VTI>
+ : MVE_VADDSUB_m<"vsub", VTI, 0b1, sub, int_arm_mve_sub_predicated>;
-let Predicates = [HasMVEInt] in {
- def : Pat<(v16i8 (sub (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))),
- (v16i8 (MVE_VSUBi8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>;
- def : Pat<(v8i16 (sub (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))),
- (v8i16 (MVE_VSUBi16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>;
- def : Pat<(v4i32 (sub (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))),
- (v4i32 (MVE_VSUBi32 (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>;
-}
+defm MVE_VADDi8 : MVE_VADD<MVE_v16i8>;
+defm MVE_VADDi16 : MVE_VADD<MVE_v8i16>;
+defm MVE_VADDi32 : MVE_VADD<MVE_v4i32>;
+
+defm MVE_VSUBi8 : MVE_VSUB<MVE_v16i8>;
+defm MVE_VSUBi16 : MVE_VSUB<MVE_v8i16>;
+defm MVE_VSUBi32 : MVE_VSUB<MVE_v4i32>;
class MVE_VQADDSUB<string iname, string suffix, bit U, bit subtract,
- bits<2> size, ValueType vt>
+ bits<2> size>
: MVE_int<iname, suffix, size, []> {
let Inst{28} = U;
@@ -1535,50 +1759,75 @@ class MVE_VQADDSUB<string iname, string suffix, bit U, bit subtract,
let Inst{4} = 0b1;
let Inst{0} = 0b0;
let validForTailPredication = 1;
+}
- ValueType VT = vt;
+class MVE_VQADD_<string suffix, bit U, bits<2> size>
+ : MVE_VQADDSUB<"vqadd", suffix, U, 0b0, size>;
+class MVE_VQSUB_<string suffix, bit U, bits<2> size>
+ : MVE_VQADDSUB<"vqsub", suffix, U, 0b1, size>;
+
+multiclass MVE_VQADD_m<MVEVectorVTInfo VTI,
+ SDNode unpred_op, Intrinsic pred_int> {
+ def "" : MVE_VQADD_<VTI.Suffix, VTI.Unsigned, VTI.Size>;
+ defvar Inst = !cast<Instruction>(NAME);
+
+ let Predicates = [HasMVEInt] in {
+ // Unpredicated saturating add
+ def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
+
+ // Predicated saturating add
+ def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+ (i32 VTI.Unsigned), (VTI.Pred VCCR:$mask),
+ (VTI.Vec MQPR:$inactive))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+ ARMVCCThen, (VTI.Pred VCCR:$mask),
+ (VTI.Vec MQPR:$inactive)))>;
+ }
}
-class MVE_VQADD<string suffix, bit U, bits<2> size, ValueType VT>
- : MVE_VQADDSUB<"vqadd", suffix, U, 0b0, size, VT>;
-class MVE_VQSUB<string suffix, bit U, bits<2> size, ValueType VT>
- : MVE_VQADDSUB<"vqsub", suffix, U, 0b1, size, VT>;
+multiclass MVE_VQADD<MVEVectorVTInfo VTI, SDNode unpred_op>
+ : MVE_VQADD_m<VTI, unpred_op, int_arm_mve_qadd_predicated>;
+
+defm MVE_VQADDs8 : MVE_VQADD<MVE_v16s8, saddsat>;
+defm MVE_VQADDs16 : MVE_VQADD<MVE_v8s16, saddsat>;
+defm MVE_VQADDs32 : MVE_VQADD<MVE_v4s32, saddsat>;
+defm MVE_VQADDu8 : MVE_VQADD<MVE_v16u8, uaddsat>;
+defm MVE_VQADDu16 : MVE_VQADD<MVE_v8u16, uaddsat>;
+defm MVE_VQADDu32 : MVE_VQADD<MVE_v4u32, uaddsat>;
+
+multiclass MVE_VQSUB_m<MVEVectorVTInfo VTI,
+ SDNode unpred_op, Intrinsic pred_int> {
+ def "" : MVE_VQSUB_<VTI.Suffix, VTI.Unsigned, VTI.Size>;
+ defvar Inst = !cast<Instruction>(NAME);
+
+ let Predicates = [HasMVEInt] in {
+ // Unpredicated saturating subtract
+ def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
+
+ // Predicated saturating subtract
+ def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+ (i32 VTI.Unsigned), (VTI.Pred VCCR:$mask),
+ (VTI.Vec MQPR:$inactive))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+ ARMVCCThen, (VTI.Pred VCCR:$mask),
+ (VTI.Vec MQPR:$inactive)))>;
+ }
+}
-def MVE_VQADDs8 : MVE_VQADD<"s8", 0b0, 0b00, v16i8>;
-def MVE_VQADDs16 : MVE_VQADD<"s16", 0b0, 0b01, v8i16>;
-def MVE_VQADDs32 : MVE_VQADD<"s32", 0b0, 0b10, v4i32>;
-def MVE_VQADDu8 : MVE_VQADD<"u8", 0b1, 0b00, v16i8>;
-def MVE_VQADDu16 : MVE_VQADD<"u16", 0b1, 0b01, v8i16>;
-def MVE_VQADDu32 : MVE_VQADD<"u32", 0b1, 0b10, v4i32>;
+multiclass MVE_VQSUB<MVEVectorVTInfo VTI, SDNode unpred_op>
+ : MVE_VQSUB_m<VTI, unpred_op, int_arm_mve_qsub_predicated>;
-def MVE_VQSUBs8 : MVE_VQSUB<"s8", 0b0, 0b00, v16i8>;
-def MVE_VQSUBs16 : MVE_VQSUB<"s16", 0b0, 0b01, v8i16>;
-def MVE_VQSUBs32 : MVE_VQSUB<"s32", 0b0, 0b10, v4i32>;
-def MVE_VQSUBu8 : MVE_VQSUB<"u8", 0b1, 0b00, v16i8>;
-def MVE_VQSUBu16 : MVE_VQSUB<"u16", 0b1, 0b01, v8i16>;
-def MVE_VQSUBu32 : MVE_VQSUB<"u32", 0b1, 0b10, v4i32>;
+defm MVE_VQSUBs8 : MVE_VQSUB<MVE_v16s8, ssubsat>;
+defm MVE_VQSUBs16 : MVE_VQSUB<MVE_v8s16, ssubsat>;
+defm MVE_VQSUBs32 : MVE_VQSUB<MVE_v4s32, ssubsat>;
+defm MVE_VQSUBu8 : MVE_VQSUB<MVE_v16u8, usubsat>;
+defm MVE_VQSUBu16 : MVE_VQSUB<MVE_v8u16, usubsat>;
+defm MVE_VQSUBu32 : MVE_VQSUB<MVE_v4u32, usubsat>;
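saddsat/uaddsat/ssubsat/usubsat are the generic saturating ISD nodes; per lane they clamp the exact result to the element type's range instead of wrapping. One signed 8-bit lane in illustrative C:

#include <stdint.h>

/* One lane of VQADD.S8: add and clamp to [-128, 127] instead of wrapping.
   VQSUB and the unsigned forms follow the same shape. */
int8_t vqadd_s8_lane(int8_t a, int8_t b) {
    int sum = (int)a + (int)b;
    if (sum > INT8_MAX) sum = INT8_MAX;
    if (sum < INT8_MIN) sum = INT8_MIN;
    return (int8_t)sum;
}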
-let Predicates = [HasMVEInt] in {
- foreach instr = [MVE_VQADDu8, MVE_VQADDu16, MVE_VQADDu32] in
- foreach VT = [instr.VT] in
- def : Pat<(VT (uaddsat (VT MQPR:$Qm), (VT MQPR:$Qn))),
- (VT (instr (VT MQPR:$Qm), (VT MQPR:$Qn)))>;
- foreach instr = [MVE_VQADDs8, MVE_VQADDs16, MVE_VQADDs32] in
- foreach VT = [instr.VT] in
- def : Pat<(VT (saddsat (VT MQPR:$Qm), (VT MQPR:$Qn))),
- (VT (instr (VT MQPR:$Qm), (VT MQPR:$Qn)))>;
- foreach instr = [MVE_VQSUBu8, MVE_VQSUBu16, MVE_VQSUBu32] in
- foreach VT = [instr.VT] in
- def : Pat<(VT (usubsat (VT MQPR:$Qm), (VT MQPR:$Qn))),
- (VT (instr (VT MQPR:$Qm), (VT MQPR:$Qn)))>;
- foreach instr = [MVE_VQSUBs8, MVE_VQSUBs16, MVE_VQSUBs32] in
- foreach VT = [instr.VT] in
- def : Pat<(VT (ssubsat (VT MQPR:$Qm), (VT MQPR:$Qn))),
- (VT (instr (VT MQPR:$Qm), (VT MQPR:$Qn)))>;
-}
-
-
-class MVE_VABD_int<string suffix, bit U, bits<2> size, list<dag> pattern=[]>
+class MVE_VABD_int<string suffix, bit U, bits<2> size,
+ list<dag> pattern=[]>
: MVE_int<"vabd", suffix, size, pattern> {
let Inst{28} = U;
@@ -1590,14 +1839,38 @@ class MVE_VABD_int<string suffix, bit U, bits<2> size, list<dag> pattern=[]>
let validForTailPredication = 1;
}
-def MVE_VABDs8 : MVE_VABD_int<"s8", 0b0, 0b00>;
-def MVE_VABDs16 : MVE_VABD_int<"s16", 0b0, 0b01>;
-def MVE_VABDs32 : MVE_VABD_int<"s32", 0b0, 0b10>;
-def MVE_VABDu8 : MVE_VABD_int<"u8", 0b1, 0b00>;
-def MVE_VABDu16 : MVE_VABD_int<"u16", 0b1, 0b01>;
-def MVE_VABDu32 : MVE_VABD_int<"u32", 0b1, 0b10>;
+multiclass MVE_VABD_m<MVEVectorVTInfo VTI,
+ Intrinsic unpred_int, Intrinsic pred_int> {
+ def "" : MVE_VABD_int<VTI.Suffix, VTI.Unsigned, VTI.Size>;
+ defvar Inst = !cast<Instruction>(NAME);
+
+ let Predicates = [HasMVEInt] in {
+ // Unpredicated absolute difference
+ def : Pat<(VTI.Vec (unpred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+ (i32 VTI.Unsigned))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
+
+ // Predicated absolute difference
+ def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+ (i32 VTI.Unsigned), (VTI.Pred VCCR:$mask),
+ (VTI.Vec MQPR:$inactive))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+ ARMVCCThen, (VTI.Pred VCCR:$mask),
+ (VTI.Vec MQPR:$inactive)))>;
+ }
+}
+
+multiclass MVE_VABD<MVEVectorVTInfo VTI>
+ : MVE_VABD_m<VTI, int_arm_mve_vabd, int_arm_mve_abd_predicated>;
-class MVE_VRHADD<string suffix, bit U, bits<2> size, list<dag> pattern=[]>
+defm MVE_VABDs8 : MVE_VABD<MVE_v16s8>;
+defm MVE_VABDs16 : MVE_VABD<MVE_v8s16>;
+defm MVE_VABDs32 : MVE_VABD<MVE_v4s32>;
+defm MVE_VABDu8 : MVE_VABD<MVE_v16u8>;
+defm MVE_VABDu16 : MVE_VABD<MVE_v8u16>;
+defm MVE_VABDu32 : MVE_VABD<MVE_v4u32>;
+
+class MVE_VRHADD_Base<string suffix, bit U, bits<2> size, list<dag> pattern=[]>
: MVE_int<"vrhadd", suffix, size, pattern> {
let Inst{28} = U;
@@ -1609,12 +1882,36 @@ class MVE_VRHADD<string suffix, bit U, bits<2> size, list<dag> pattern=[]>
let validForTailPredication = 1;
}
-def MVE_VRHADDs8 : MVE_VRHADD<"s8", 0b0, 0b00>;
-def MVE_VRHADDs16 : MVE_VRHADD<"s16", 0b0, 0b01>;
-def MVE_VRHADDs32 : MVE_VRHADD<"s32", 0b0, 0b10>;
-def MVE_VRHADDu8 : MVE_VRHADD<"u8", 0b1, 0b00>;
-def MVE_VRHADDu16 : MVE_VRHADD<"u16", 0b1, 0b01>;
-def MVE_VRHADDu32 : MVE_VRHADD<"u32", 0b1, 0b10>;
+multiclass MVE_VRHADD_m<MVEVectorVTInfo VTI,
+ SDNode unpred_op, Intrinsic pred_int> {
+ def "" : MVE_VRHADD_Base<VTI.Suffix, VTI.Unsigned, VTI.Size>;
+ defvar Inst = !cast<Instruction>(NAME);
+
+ let Predicates = [HasMVEInt] in {
+ // Unpredicated rounding add-with-divide-by-two
+ def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+ (i32 VTI.Unsigned))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
+
+    // Predicated rounding add-with-divide-by-two
+ def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+ (i32 VTI.Unsigned), (VTI.Pred VCCR:$mask),
+ (VTI.Vec MQPR:$inactive))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+ ARMVCCThen, (VTI.Pred VCCR:$mask),
+ (VTI.Vec MQPR:$inactive)))>;
+ }
+}
+
+multiclass MVE_VRHADD<MVEVectorVTInfo VTI>
+ : MVE_VRHADD_m<VTI, int_arm_mve_vrhadd, int_arm_mve_rhadd_predicated>;
+
+defm MVE_VRHADDs8 : MVE_VRHADD<MVE_v16s8>;
+defm MVE_VRHADDs16 : MVE_VRHADD<MVE_v8s16>;
+defm MVE_VRHADDs32 : MVE_VRHADD<MVE_v4s32>;
+defm MVE_VRHADDu8 : MVE_VRHADD<MVE_v16u8>;
+defm MVE_VRHADDu16 : MVE_VRHADD<MVE_v8u16>;
+defm MVE_VRHADDu32 : MVE_VRHADD<MVE_v4u32>;
class MVE_VHADDSUB<string iname, string suffix, bit U, bit subtract,
bits<2> size, list<dag> pattern=[]>
@@ -1631,81 +1928,73 @@ class MVE_VHADDSUB<string iname, string suffix, bit U, bit subtract,
let validForTailPredication = 1;
}
-class MVE_VHADD<string suffix, bit U, bits<2> size,
+class MVE_VHADD_<string suffix, bit U, bits<2> size,
list<dag> pattern=[]>
: MVE_VHADDSUB<"vhadd", suffix, U, 0b0, size, pattern>;
-class MVE_VHSUB<string suffix, bit U, bits<2> size,
+class MVE_VHSUB_<string suffix, bit U, bits<2> size,
list<dag> pattern=[]>
: MVE_VHADDSUB<"vhsub", suffix, U, 0b1, size, pattern>;
-def MVE_VHADDs8 : MVE_VHADD<"s8", 0b0, 0b00>;
-def MVE_VHADDs16 : MVE_VHADD<"s16", 0b0, 0b01>;
-def MVE_VHADDs32 : MVE_VHADD<"s32", 0b0, 0b10>;
-def MVE_VHADDu8 : MVE_VHADD<"u8", 0b1, 0b00>;
-def MVE_VHADDu16 : MVE_VHADD<"u16", 0b1, 0b01>;
-def MVE_VHADDu32 : MVE_VHADD<"u32", 0b1, 0b10>;
-
-def MVE_VHSUBs8 : MVE_VHSUB<"s8", 0b0, 0b00>;
-def MVE_VHSUBs16 : MVE_VHSUB<"s16", 0b0, 0b01>;
-def MVE_VHSUBs32 : MVE_VHSUB<"s32", 0b0, 0b10>;
-def MVE_VHSUBu8 : MVE_VHSUB<"u8", 0b1, 0b00>;
-def MVE_VHSUBu16 : MVE_VHSUB<"u16", 0b1, 0b01>;
-def MVE_VHSUBu32 : MVE_VHSUB<"u32", 0b1, 0b10>;
+multiclass MVE_VHADD_m<MVEVectorVTInfo VTI,
+ SDNode unpred_op, Intrinsic pred_int> {
+ def "" : MVE_VHADD_<VTI.Suffix, VTI.Unsigned, VTI.Size>;
+ defvar Inst = !cast<Instruction>(NAME);
+
+ let Predicates = [HasMVEInt] in {
+ // Unpredicated add-and-divide-by-two
+ def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), (i32 VTI.Unsigned))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
+
+ // Predicated add-and-divide-by-two
+ def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), (i32 VTI.Unsigned),
+ (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+ ARMVCCThen, (VTI.Pred VCCR:$mask),
+ (VTI.Vec MQPR:$inactive)))>;
+ }
+}
-let Predicates = [HasMVEInt] in {
- def : Pat<(v16i8 (ARMvshrsImm
- (v16i8 (add (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))), 1)),
- (v16i8 (MVE_VHADDs8
- (v16i8 MQPR:$v1), (v16i8 MQPR:$v2)))>;
- def : Pat<(v8i16 (ARMvshrsImm
- (v8i16 (add (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))), 1)),
- (v8i16 (MVE_VHADDs16
- (v8i16 MQPR:$v1), (v8i16 MQPR:$v2)))>;
- def : Pat<(v4i32 (ARMvshrsImm
- (v4i32 (add (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))), 1)),
- (v4i32 (MVE_VHADDs32
- (v4i32 MQPR:$v1), (v4i32 MQPR:$v2)))>;
-
- def : Pat<(v16i8 (ARMvshruImm
- (v16i8 (add (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))), 1)),
- (v16i8 (MVE_VHADDu8
- (v16i8 MQPR:$v1), (v16i8 MQPR:$v2)))>;
- def : Pat<(v8i16 (ARMvshruImm
- (v8i16 (add (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))), 1)),
- (v8i16 (MVE_VHADDu16
- (v8i16 MQPR:$v1), (v8i16 MQPR:$v2)))>;
- def : Pat<(v4i32 (ARMvshruImm
- (v4i32 (add (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))), 1)),
- (v4i32 (MVE_VHADDu32
- (v4i32 MQPR:$v1), (v4i32 MQPR:$v2)))>;
-
- def : Pat<(v16i8 (ARMvshrsImm
- (v16i8 (sub (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))), 1)),
- (v16i8 (MVE_VHSUBs8
- (v16i8 MQPR:$v1), (v16i8 MQPR:$v2)))>;
- def : Pat<(v8i16 (ARMvshrsImm
- (v8i16 (sub (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))), 1)),
- (v8i16 (MVE_VHSUBs16
- (v8i16 MQPR:$v1), (v8i16 MQPR:$v2)))>;
- def : Pat<(v4i32 (ARMvshrsImm
- (v4i32 (sub (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))), 1)),
- (v4i32 (MVE_VHSUBs32
- (v4i32 MQPR:$v1), (v4i32 MQPR:$v2)))>;
-
- def : Pat<(v16i8 (ARMvshruImm
- (v16i8 (sub (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))), 1)),
- (v16i8 (MVE_VHSUBu8
- (v16i8 MQPR:$v1), (v16i8 MQPR:$v2)))>;
- def : Pat<(v8i16 (ARMvshruImm
- (v8i16 (sub (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))), 1)),
- (v8i16 (MVE_VHSUBu16
- (v8i16 MQPR:$v1), (v8i16 MQPR:$v2)))>;
- def : Pat<(v4i32 (ARMvshruImm
- (v4i32 (sub (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))), 1)),
- (v4i32 (MVE_VHSUBu32
- (v4i32 MQPR:$v1), (v4i32 MQPR:$v2)))>;
+multiclass MVE_VHADD<MVEVectorVTInfo VTI>
+ : MVE_VHADD_m<VTI, int_arm_mve_vhadd, int_arm_mve_hadd_predicated>;
+
+defm MVE_VHADDs8 : MVE_VHADD<MVE_v16s8>;
+defm MVE_VHADDs16 : MVE_VHADD<MVE_v8s16>;
+defm MVE_VHADDs32 : MVE_VHADD<MVE_v4s32>;
+defm MVE_VHADDu8 : MVE_VHADD<MVE_v16u8>;
+defm MVE_VHADDu16 : MVE_VHADD<MVE_v8u16>;
+defm MVE_VHADDu32 : MVE_VHADD<MVE_v4u32>;
+
+multiclass MVE_VHSUB_m<MVEVectorVTInfo VTI,
+ SDNode unpred_op, Intrinsic pred_int> {
+ def "" : MVE_VHSUB_<VTI.Suffix, VTI.Unsigned, VTI.Size>;
+ defvar Inst = !cast<Instruction>(NAME);
+
+ let Predicates = [HasMVEInt] in {
+ // Unpredicated subtract-and-divide-by-two
+ def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+ (i32 VTI.Unsigned))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
+
+ // Predicated subtract-and-divide-by-two
+ def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+ (i32 VTI.Unsigned), (VTI.Pred VCCR:$mask),
+ (VTI.Vec MQPR:$inactive))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+ ARMVCCThen, (VTI.Pred VCCR:$mask),
+ (VTI.Vec MQPR:$inactive)))>;
+ }
}
+multiclass MVE_VHSUB<MVEVectorVTInfo VTI>
+ : MVE_VHSUB_m<VTI, int_arm_mve_vhsub, int_arm_mve_hsub_predicated>;
+
+defm MVE_VHSUBs8 : MVE_VHSUB<MVE_v16s8>;
+defm MVE_VHSUBs16 : MVE_VHSUB<MVE_v8s16>;
+defm MVE_VHSUBs32 : MVE_VHSUB<MVE_v4s32>;
+defm MVE_VHSUBu8 : MVE_VHSUB<MVE_v16u8>;
+defm MVE_VHSUBu16 : MVE_VHSUB<MVE_v8u16>;
+defm MVE_VHSUBu32 : MVE_VHSUB<MVE_v4u32>;
+
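vhadd/vhsub halve the exact sum or difference without intermediate overflow, and vrhadd rounds before halving, which is why only the non-rounding cases could previously be matched from the ARMvshrsImm/ARMvshruImm trees that this hunk removes. A one-lane C sketch for unsigned bytes, with illustrative names and semantics assumed to match the architecture:

#include <stdint.h>

/* One lane each of VHADD.U8, VHSUB.U8 and VRHADD.U8.  The additions are
   done in a wider type so they cannot overflow; the subtraction relies
   on unsigned wrap-around, whose low bits match the architectural
   result. */
uint8_t vhadd_u8_lane(uint8_t a, uint8_t b)  { return (uint8_t)(((unsigned)a + b) >> 1); }
uint8_t vhsub_u8_lane(uint8_t a, uint8_t b)  { return (uint8_t)(((unsigned)a - b) >> 1); }
uint8_t vrhadd_u8_lane(uint8_t a, uint8_t b) { return (uint8_t)(((unsigned)a + b + 1) >> 1); }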
class MVE_VDUP<string suffix, bit B, bit E, list<dag> pattern=[]>
: MVE_p<(outs MQPR:$Qd), (ins rGPR:$Rt), NoItinerary,
"vdup", suffix, "$Qd, $Rt", vpred_r, "", pattern> {
@@ -1873,6 +2162,49 @@ def MVE_VQNEGs8 : MVE_VQABSNEG<"vqneg", "s8", 0b00, 0b1>;
def MVE_VQNEGs16 : MVE_VQABSNEG<"vqneg", "s16", 0b01, 0b1>;
def MVE_VQNEGs32 : MVE_VQABSNEG<"vqneg", "s32", 0b10, 0b1>;
+// int_min/int_max: vectors with INT_MIN/INT_MAX of the lane type in every lane
+// zero_vec: an all-zeroes vector, v4i32-initialized and potentially wrapped in a bitconvert
+multiclass vqabsneg_pattern<MVEVectorVTInfo VTI, dag int_min, dag int_max,
+ dag zero_vec, MVE_VQABSNEG vqabs_instruction,
+ MVE_VQABSNEG vqneg_instruction> {
+ let Predicates = [HasMVEInt] in {
+ // The below tree can be replaced by a vqabs instruction, as it represents
+ // the following vectorized expression (r being the value in $reg):
+ // r > 0 ? r : (r == INT_MIN ? INT_MAX : -r)
+ def : Pat<(VTI.Vec (vselect
+ (VTI.Pred (ARMvcmpz (VTI.Vec MQPR:$reg), ARMCCgt)),
+ (VTI.Vec MQPR:$reg),
+ (VTI.Vec (vselect
+ (VTI.Pred (ARMvcmp (VTI.Vec MQPR:$reg), int_min, ARMCCeq)),
+ int_max,
+ (sub (VTI.Vec zero_vec), (VTI.Vec MQPR:$reg)))))),
+ (VTI.Vec (vqabs_instruction (VTI.Vec MQPR:$reg)))>;
+ // Similarly, this tree represents vqneg, i.e. the following vectorized expression:
+ // r == INT_MIN ? INT_MAX : -r
+ def : Pat<(VTI.Vec (vselect
+ (VTI.Pred (ARMvcmp (VTI.Vec MQPR:$reg), int_min, ARMCCeq)),
+ int_max,
+ (sub (VTI.Vec zero_vec), (VTI.Vec MQPR:$reg)))),
+ (VTI.Vec (vqneg_instruction (VTI.Vec MQPR:$reg)))>;
+ }
+}
+
+defm MVE_VQABSNEG_Ps8 : vqabsneg_pattern<MVE_v16i8,
+ (v16i8 (ARMvmovImm (i32 3712))),
+ (v16i8 (ARMvmovImm (i32 3711))),
+ (bitconvert (v4i32 (ARMvmovImm (i32 0)))),
+ MVE_VQABSs8, MVE_VQNEGs8>;
+defm MVE_VQABSNEG_Ps16 : vqabsneg_pattern<MVE_v8i16,
+ (v8i16 (ARMvmovImm (i32 2688))),
+ (v8i16 (ARMvmvnImm (i32 2688))),
+ (bitconvert (v4i32 (ARMvmovImm (i32 0)))),
+ MVE_VQABSs16, MVE_VQNEGs16>;
+defm MVE_VQABSNEG_Ps32 : vqabsneg_pattern<MVE_v4i32,
+ (v4i32 (ARMvmovImm (i32 1664))),
+ (v4i32 (ARMvmvnImm (i32 1664))),
+ (ARMvmovImm (i32 0)),
+ MVE_VQABSs32, MVE_VQNEGs32>;
+
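These selection trees exist because the IR has no dedicated node for saturating absolute value or negation; they recognise the vselect/sub shape corresponding to the expressions quoted in the comments. In scalar form that is, roughly (illustrative C):

#include <stdint.h>

/* One lane of VQABS.S32 and VQNEG.S32: INT32_MIN saturates to INT32_MAX
   instead of overflowing on negation. */
int32_t vqabs_s32_lane(int32_t r) {
    if (r > 0) return r;
    return r == INT32_MIN ? INT32_MAX : -r;
}
int32_t vqneg_s32_lane(int32_t r) {
    return r == INT32_MIN ? INT32_MAX : -r;
}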
class MVE_mod_imm<string iname, string suffix, bits<4> cmode, bit op,
dag iops, list<dag> pattern=[]>
: MVE_p<(outs MQPR:$Qd), iops, NoItinerary, iname, suffix, "$Qd, $imm",
@@ -1956,6 +2288,7 @@ class MVE_VMINMAXA<string iname, string suffix, bits<2> size,
let Inst{4} = 0b0;
let Inst{3-1} = Qm{2-0};
let Inst{0} = 0b1;
+ let validForTailPredication = 1;
}
def MVE_VMAXAs8 : MVE_VMINMAXA<"vmaxa", "s8", 0b00, 0b0>;
@@ -2049,8 +2382,8 @@ let Predicates = [HasMVEInt] in {
class MVE_VSHLL_imm<string iname, string suffix, bit U, bit th,
- dag immops, list<dag> pattern=[]>
- : MVE_shift_imm<(outs MQPR:$Qd), !con((ins MQPR:$Qm), immops),
+ Operand immtype, list<dag> pattern=[]>
+ : MVE_shift_imm<(outs MQPR:$Qd), (ins MQPR:$Qm, immtype:$imm),
iname, suffix, "$Qd, $Qm, $imm", vpred_r, "", pattern> {
let Inst{28} = U;
let Inst{25-23} = 0b101;
@@ -2059,6 +2392,9 @@ class MVE_VSHLL_imm<string iname, string suffix, bit U, bit th,
let Inst{11-6} = 0b111101;
let Inst{4} = 0b0;
let Inst{0} = 0b0;
+
+ // For the MVE_VSHLL_patterns multiclass to refer to
+ Operand immediateType = immtype;
}
// The immediate VSHLL instructions accept shift counts from 1 up to
@@ -2067,7 +2403,7 @@ class MVE_VSHLL_imm<string iname, string suffix, bit U, bit th,
class MVE_VSHLL_imm8<string iname, string suffix,
bit U, bit th, list<dag> pattern=[]>
- : MVE_VSHLL_imm<iname, suffix, U, th, (ins mve_shift_imm1_7:$imm), pattern> {
+ : MVE_VSHLL_imm<iname, suffix, U, th, mve_shift_imm1_7, pattern> {
bits<3> imm;
let Inst{20-19} = 0b01;
let Inst{18-16} = imm;
@@ -2075,7 +2411,7 @@ class MVE_VSHLL_imm8<string iname, string suffix,
class MVE_VSHLL_imm16<string iname, string suffix,
bit U, bit th, list<dag> pattern=[]>
- : MVE_VSHLL_imm<iname, suffix, U, th, (ins mve_shift_imm1_15:$imm), pattern> {
+ : MVE_VSHLL_imm<iname, suffix, U, th, mve_shift_imm1_15, pattern> {
bits<4> imm;
let Inst{20} = 0b1;
let Inst{19-16} = imm;
@@ -2119,11 +2455,50 @@ defm MVE_VSHLL_lws16 : MVE_VSHLL_lw<"vshll", "s16", 0b01, 0b0, "$Qd, $Qm, #16">;
defm MVE_VSHLL_lwu8 : MVE_VSHLL_lw<"vshll", "u8", 0b00, 0b1, "$Qd, $Qm, #8">;
defm MVE_VSHLL_lwu16 : MVE_VSHLL_lw<"vshll", "u16", 0b01, 0b1, "$Qd, $Qm, #16">;
+multiclass MVE_VSHLL_patterns<MVEVectorVTInfo VTI, int top> {
+ defvar suffix = !strconcat(VTI.Suffix, !if(top, "th", "bh"));
+ defvar inst_imm = !cast<MVE_VSHLL_imm>("MVE_VSHLL_imm" # suffix);
+ defvar inst_lw = !cast<MVE_VSHLL_by_lane_width>("MVE_VSHLL_lw" # suffix);
+ defvar unpred_int = int_arm_mve_vshll_imm;
+ defvar pred_int = int_arm_mve_vshll_imm_predicated;
+ defvar imm = inst_imm.immediateType;
+
+ def : Pat<(VTI.DblVec (unpred_int (VTI.Vec MQPR:$src), imm:$imm,
+ (i32 VTI.Unsigned), (i32 top))),
+ (VTI.DblVec (inst_imm (VTI.Vec MQPR:$src), imm:$imm))>;
+ def : Pat<(VTI.DblVec (unpred_int (VTI.Vec MQPR:$src), (i32 VTI.LaneBits),
+ (i32 VTI.Unsigned), (i32 top))),
+ (VTI.DblVec (inst_lw (VTI.Vec MQPR:$src)))>;
+
+ def : Pat<(VTI.DblVec (pred_int (VTI.Vec MQPR:$src), imm:$imm,
+ (i32 VTI.Unsigned), (i32 top),
+ (VTI.Pred VCCR:$mask),
+ (VTI.DblVec MQPR:$inactive))),
+ (VTI.DblVec (inst_imm (VTI.Vec MQPR:$src), imm:$imm,
+ ARMVCCThen, (VTI.Pred VCCR:$mask),
+ (VTI.DblVec MQPR:$inactive)))>;
+ def : Pat<(VTI.DblVec (pred_int (VTI.Vec MQPR:$src), (i32 VTI.LaneBits),
+ (i32 VTI.Unsigned), (i32 top),
+ (VTI.Pred VCCR:$mask),
+ (VTI.DblVec MQPR:$inactive))),
+ (VTI.DblVec (inst_lw (VTI.Vec MQPR:$src), ARMVCCThen,
+ (VTI.Pred VCCR:$mask),
+ (VTI.DblVec MQPR:$inactive)))>;
+}
+
+foreach VTI = [MVE_v16s8, MVE_v8s16, MVE_v16u8, MVE_v8u16] in
+ foreach top = [0, 1] in
+ defm : MVE_VSHLL_patterns<VTI, top>;
+
+class MVE_shift_imm_partial<Operand imm, string iname, string suffix>
+ : MVE_shift_imm<(outs MQPR:$Qd), (ins MQPR:$QdSrc, MQPR:$Qm, imm:$imm),
+ iname, suffix, "$Qd, $Qm, $imm", vpred_n, "$Qd = $QdSrc"> {
+ Operand immediateType = imm;
+}
+
class MVE_VxSHRN<string iname, string suffix, bit bit_12, bit bit_28,
- dag immops, list<dag> pattern=[]>
- : MVE_shift_imm<(outs MQPR:$Qd), !con((ins MQPR:$QdSrc, MQPR:$Qm), immops),
- iname, suffix, "$Qd, $Qm, $imm", vpred_n, "$Qd = $QdSrc",
- pattern> {
+ Operand imm, list<dag> pattern=[]>
+ : MVE_shift_imm_partial<imm, iname, suffix> {
bits<5> imm;
let Inst{28} = bit_28;
@@ -2136,45 +2511,35 @@ class MVE_VxSHRN<string iname, string suffix, bit bit_12, bit bit_28,
let Inst{0} = 0b1;
}
-def MVE_VRSHRNi16bh : MVE_VxSHRN<
- "vrshrnb", "i16", 0b0, 0b1, (ins shr_imm8:$imm)> {
+def MVE_VRSHRNi16bh : MVE_VxSHRN<"vrshrnb", "i16", 0b0, 0b1, shr_imm8> {
let Inst{20-19} = 0b01;
}
-def MVE_VRSHRNi16th : MVE_VxSHRN<
- "vrshrnt", "i16", 0b1, 0b1,(ins shr_imm8:$imm)> {
+def MVE_VRSHRNi16th : MVE_VxSHRN<"vrshrnt", "i16", 0b1, 0b1, shr_imm8> {
let Inst{20-19} = 0b01;
}
-def MVE_VRSHRNi32bh : MVE_VxSHRN<
- "vrshrnb", "i32", 0b0, 0b1, (ins shr_imm16:$imm)> {
+def MVE_VRSHRNi32bh : MVE_VxSHRN<"vrshrnb", "i32", 0b0, 0b1, shr_imm16> {
let Inst{20} = 0b1;
}
-def MVE_VRSHRNi32th : MVE_VxSHRN<
- "vrshrnt", "i32", 0b1, 0b1, (ins shr_imm16:$imm)> {
+def MVE_VRSHRNi32th : MVE_VxSHRN<"vrshrnt", "i32", 0b1, 0b1, shr_imm16> {
let Inst{20} = 0b1;
}
-def MVE_VSHRNi16bh : MVE_VxSHRN<
- "vshrnb", "i16", 0b0, 0b0, (ins shr_imm8:$imm)> {
+def MVE_VSHRNi16bh : MVE_VxSHRN<"vshrnb", "i16", 0b0, 0b0, shr_imm8> {
let Inst{20-19} = 0b01;
}
-def MVE_VSHRNi16th : MVE_VxSHRN<
- "vshrnt", "i16", 0b1, 0b0, (ins shr_imm8:$imm)> {
+def MVE_VSHRNi16th : MVE_VxSHRN<"vshrnt", "i16", 0b1, 0b0, shr_imm8> {
let Inst{20-19} = 0b01;
}
-def MVE_VSHRNi32bh : MVE_VxSHRN<
- "vshrnb", "i32", 0b0, 0b0, (ins shr_imm16:$imm)> {
+def MVE_VSHRNi32bh : MVE_VxSHRN<"vshrnb", "i32", 0b0, 0b0, shr_imm16> {
let Inst{20} = 0b1;
}
-def MVE_VSHRNi32th : MVE_VxSHRN<
- "vshrnt", "i32", 0b1, 0b0, (ins shr_imm16:$imm)> {
+def MVE_VSHRNi32th : MVE_VxSHRN<"vshrnt", "i32", 0b1, 0b0, shr_imm16> {
let Inst{20} = 0b1;
}
-class MVE_VxQRSHRUN<string iname, string suffix, bit bit_28, bit bit_12, dag immops,
- list<dag> pattern=[]>
- : MVE_shift_imm<(outs MQPR:$Qd), !con((ins MQPR:$QdSrc, MQPR:$Qm), immops),
- iname, suffix, "$Qd, $Qm, $imm", vpred_n, "$Qd = $QdSrc",
- pattern> {
+class MVE_VxQRSHRUN<string iname, string suffix, bit bit_28, bit bit_12,
+ Operand imm, list<dag> pattern=[]>
+ : MVE_shift_imm_partial<imm, iname, suffix> {
bits<5> imm;
let Inst{28} = bit_28;
@@ -2188,44 +2553,42 @@ class MVE_VxQRSHRUN<string iname, string suffix, bit bit_28, bit bit_12, dag imm
}
def MVE_VQRSHRUNs16bh : MVE_VxQRSHRUN<
- "vqrshrunb", "s16", 0b1, 0b0, (ins shr_imm8:$imm)> {
+ "vqrshrunb", "s16", 0b1, 0b0, shr_imm8> {
let Inst{20-19} = 0b01;
}
def MVE_VQRSHRUNs16th : MVE_VxQRSHRUN<
- "vqrshrunt", "s16", 0b1, 0b1, (ins shr_imm8:$imm)> {
+ "vqrshrunt", "s16", 0b1, 0b1, shr_imm8> {
let Inst{20-19} = 0b01;
}
def MVE_VQRSHRUNs32bh : MVE_VxQRSHRUN<
- "vqrshrunb", "s32", 0b1, 0b0, (ins shr_imm16:$imm)> {
+ "vqrshrunb", "s32", 0b1, 0b0, shr_imm16> {
let Inst{20} = 0b1;
}
def MVE_VQRSHRUNs32th : MVE_VxQRSHRUN<
- "vqrshrunt", "s32", 0b1, 0b1, (ins shr_imm16:$imm)> {
+ "vqrshrunt", "s32", 0b1, 0b1, shr_imm16> {
let Inst{20} = 0b1;
}
def MVE_VQSHRUNs16bh : MVE_VxQRSHRUN<
- "vqshrunb", "s16", 0b0, 0b0, (ins shr_imm8:$imm)> {
+ "vqshrunb", "s16", 0b0, 0b0, shr_imm8> {
let Inst{20-19} = 0b01;
}
def MVE_VQSHRUNs16th : MVE_VxQRSHRUN<
- "vqshrunt", "s16", 0b0, 0b1, (ins shr_imm8:$imm)> {
+ "vqshrunt", "s16", 0b0, 0b1, shr_imm8> {
let Inst{20-19} = 0b01;
}
def MVE_VQSHRUNs32bh : MVE_VxQRSHRUN<
- "vqshrunb", "s32", 0b0, 0b0, (ins shr_imm16:$imm)> {
+ "vqshrunb", "s32", 0b0, 0b0, shr_imm16> {
let Inst{20} = 0b1;
}
def MVE_VQSHRUNs32th : MVE_VxQRSHRUN<
- "vqshrunt", "s32", 0b0, 0b1, (ins shr_imm16:$imm)> {
+ "vqshrunt", "s32", 0b0, 0b1, shr_imm16> {
let Inst{20} = 0b1;
}
class MVE_VxQRSHRN<string iname, string suffix, bit bit_0, bit bit_12,
- dag immops, list<dag> pattern=[]>
- : MVE_shift_imm<(outs MQPR:$Qd), !con((ins MQPR:$QdSrc, MQPR:$Qm), immops),
- iname, suffix, "$Qd, $Qm, $imm", vpred_n, "$Qd = $QdSrc",
- pattern> {
+ Operand imm, list<dag> pattern=[]>
+ : MVE_shift_imm_partial<imm, iname, suffix> {
bits<5> imm;
let Inst{25-23} = 0b101;
@@ -2238,19 +2601,19 @@ class MVE_VxQRSHRN<string iname, string suffix, bit bit_0, bit bit_12,
}
multiclass MVE_VxQRSHRN_types<string iname, bit bit_0, bit bit_12> {
- def s16 : MVE_VxQRSHRN<iname, "s16", bit_0, bit_12, (ins shr_imm8:$imm)> {
+ def s16 : MVE_VxQRSHRN<iname, "s16", bit_0, bit_12, shr_imm8> {
let Inst{28} = 0b0;
let Inst{20-19} = 0b01;
}
- def u16 : MVE_VxQRSHRN<iname, "u16", bit_0, bit_12, (ins shr_imm8:$imm)> {
+ def u16 : MVE_VxQRSHRN<iname, "u16", bit_0, bit_12, shr_imm8> {
let Inst{28} = 0b1;
let Inst{20-19} = 0b01;
}
- def s32 : MVE_VxQRSHRN<iname, "s32", bit_0, bit_12, (ins shr_imm16:$imm)> {
+ def s32 : MVE_VxQRSHRN<iname, "s32", bit_0, bit_12, shr_imm16> {
let Inst{28} = 0b0;
let Inst{20} = 0b1;
}
- def u32 : MVE_VxQRSHRN<iname, "u32", bit_0, bit_12, (ins shr_imm16:$imm)> {
+ def u32 : MVE_VxQRSHRN<iname, "u32", bit_0, bit_12, shr_imm16> {
let Inst{28} = 0b1;
let Inst{20} = 0b1;
}
@@ -2261,6 +2624,63 @@ defm MVE_VQRSHRNth : MVE_VxQRSHRN_types<"vqrshrnt", 0b1, 0b1>;
defm MVE_VQSHRNbh : MVE_VxQRSHRN_types<"vqshrnb", 0b0, 0b0>;
defm MVE_VQSHRNth : MVE_VxQRSHRN_types<"vqshrnt", 0b0, 0b1>;
+multiclass MVE_VSHRN_patterns<MVE_shift_imm_partial inst,
+ MVEVectorVTInfo OutVTI, MVEVectorVTInfo InVTI,
+ bit q, bit r, bit top> {
+ defvar inparams = (? (OutVTI.Vec MQPR:$QdSrc), (InVTI.Vec MQPR:$Qm),
+ (inst.immediateType:$imm), (i32 q), (i32 r),
+ (i32 OutVTI.Unsigned), (i32 InVTI.Unsigned), (i32 top));
+ defvar outparams = (inst (OutVTI.Vec MQPR:$QdSrc), (InVTI.Vec MQPR:$Qm),
+ (imm:$imm));
+
+ def : Pat<(OutVTI.Vec !setop(inparams, int_arm_mve_vshrn)),
+ (OutVTI.Vec outparams)>;
+ def : Pat<(OutVTI.Vec !con(inparams, (int_arm_mve_vshrn_predicated
+ (InVTI.Pred VCCR:$pred)))),
+ (OutVTI.Vec !con(outparams, (? ARMVCCThen, VCCR:$pred)))>;
+}
+
+defm : MVE_VSHRN_patterns<MVE_VSHRNi16bh, MVE_v16s8, MVE_v8s16, 0,0,0>;
+defm : MVE_VSHRN_patterns<MVE_VSHRNi16th, MVE_v16s8, MVE_v8s16, 0,0,1>;
+defm : MVE_VSHRN_patterns<MVE_VSHRNi32bh, MVE_v8s16, MVE_v4s32, 0,0,0>;
+defm : MVE_VSHRN_patterns<MVE_VSHRNi32th, MVE_v8s16, MVE_v4s32, 0,0,1>;
+defm : MVE_VSHRN_patterns<MVE_VSHRNi16bh, MVE_v16u8, MVE_v8u16, 0,0,0>;
+defm : MVE_VSHRN_patterns<MVE_VSHRNi16th, MVE_v16u8, MVE_v8u16, 0,0,1>;
+defm : MVE_VSHRN_patterns<MVE_VSHRNi32bh, MVE_v8u16, MVE_v4u32, 0,0,0>;
+defm : MVE_VSHRN_patterns<MVE_VSHRNi32th, MVE_v8u16, MVE_v4u32, 0,0,1>;
+defm : MVE_VSHRN_patterns<MVE_VRSHRNi16bh, MVE_v16s8, MVE_v8s16, 0,1,0>;
+defm : MVE_VSHRN_patterns<MVE_VRSHRNi16th, MVE_v16s8, MVE_v8s16, 0,1,1>;
+defm : MVE_VSHRN_patterns<MVE_VRSHRNi32bh, MVE_v8s16, MVE_v4s32, 0,1,0>;
+defm : MVE_VSHRN_patterns<MVE_VRSHRNi32th, MVE_v8s16, MVE_v4s32, 0,1,1>;
+defm : MVE_VSHRN_patterns<MVE_VRSHRNi16bh, MVE_v16u8, MVE_v8u16, 0,1,0>;
+defm : MVE_VSHRN_patterns<MVE_VRSHRNi16th, MVE_v16u8, MVE_v8u16, 0,1,1>;
+defm : MVE_VSHRN_patterns<MVE_VRSHRNi32bh, MVE_v8u16, MVE_v4u32, 0,1,0>;
+defm : MVE_VSHRN_patterns<MVE_VRSHRNi32th, MVE_v8u16, MVE_v4u32, 0,1,1>;
+defm : MVE_VSHRN_patterns<MVE_VQSHRNbhs16, MVE_v16s8, MVE_v8s16, 1,0,0>;
+defm : MVE_VSHRN_patterns<MVE_VQSHRNths16, MVE_v16s8, MVE_v8s16, 1,0,1>;
+defm : MVE_VSHRN_patterns<MVE_VQSHRNbhs32, MVE_v8s16, MVE_v4s32, 1,0,0>;
+defm : MVE_VSHRN_patterns<MVE_VQSHRNths32, MVE_v8s16, MVE_v4s32, 1,0,1>;
+defm : MVE_VSHRN_patterns<MVE_VQSHRNbhu16, MVE_v16u8, MVE_v8u16, 1,0,0>;
+defm : MVE_VSHRN_patterns<MVE_VQSHRNthu16, MVE_v16u8, MVE_v8u16, 1,0,1>;
+defm : MVE_VSHRN_patterns<MVE_VQSHRNbhu32, MVE_v8u16, MVE_v4u32, 1,0,0>;
+defm : MVE_VSHRN_patterns<MVE_VQSHRNthu32, MVE_v8u16, MVE_v4u32, 1,0,1>;
+defm : MVE_VSHRN_patterns<MVE_VQRSHRNbhs16, MVE_v16s8, MVE_v8s16, 1,1,0>;
+defm : MVE_VSHRN_patterns<MVE_VQRSHRNths16, MVE_v16s8, MVE_v8s16, 1,1,1>;
+defm : MVE_VSHRN_patterns<MVE_VQRSHRNbhs32, MVE_v8s16, MVE_v4s32, 1,1,0>;
+defm : MVE_VSHRN_patterns<MVE_VQRSHRNths32, MVE_v8s16, MVE_v4s32, 1,1,1>;
+defm : MVE_VSHRN_patterns<MVE_VQRSHRNbhu16, MVE_v16u8, MVE_v8u16, 1,1,0>;
+defm : MVE_VSHRN_patterns<MVE_VQRSHRNthu16, MVE_v16u8, MVE_v8u16, 1,1,1>;
+defm : MVE_VSHRN_patterns<MVE_VQRSHRNbhu32, MVE_v8u16, MVE_v4u32, 1,1,0>;
+defm : MVE_VSHRN_patterns<MVE_VQRSHRNthu32, MVE_v8u16, MVE_v4u32, 1,1,1>;
+defm : MVE_VSHRN_patterns<MVE_VQSHRUNs16bh, MVE_v16u8, MVE_v8s16, 1,0,0>;
+defm : MVE_VSHRN_patterns<MVE_VQSHRUNs16th, MVE_v16u8, MVE_v8s16, 1,0,1>;
+defm : MVE_VSHRN_patterns<MVE_VQSHRUNs32bh, MVE_v8u16, MVE_v4s32, 1,0,0>;
+defm : MVE_VSHRN_patterns<MVE_VQSHRUNs32th, MVE_v8u16, MVE_v4s32, 1,0,1>;
+defm : MVE_VSHRN_patterns<MVE_VQRSHRUNs16bh, MVE_v16u8, MVE_v8s16, 1,1,0>;
+defm : MVE_VSHRN_patterns<MVE_VQRSHRUNs16th, MVE_v16u8, MVE_v8s16, 1,1,1>;
+defm : MVE_VSHRN_patterns<MVE_VQRSHRUNs32bh, MVE_v8u16, MVE_v4s32, 1,1,0>;
+defm : MVE_VSHRN_patterns<MVE_VQRSHRUNs32th, MVE_v8u16, MVE_v4s32, 1,1,1>;
+
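Every pattern in this table funnels into int_arm_mve_vshrn, whose extra i32 operands encode saturation (q), rounding (r) and whether the narrowed result is written to the bottom or top half of each destination lane (top). A rough C model of the plain q=0, r=0 i16-to-i8 case, with illustrative names and assumed semantics:

#include <stdint.h>

/* VSHRNB.I16 / VSHRNT.I16 Qd, Qm, #n: shift each 16-bit lane of Qm right
   by n and write the low 8 bits into the even (bottom, top = 0) or odd
   (top = 1) byte positions of Qd, leaving the other bytes unchanged. */
void vshrn_i16_model(uint8_t qd[16], const uint16_t qm[8], int n, int top) {
    for (int i = 0; i < 8; i++)
        qd[2 * i + (top ? 1 : 0)] = (uint8_t)(qm[i] >> n);
}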
// end of mve_imm_shift instructions
// start of mve_shift instructions
@@ -2293,13 +2713,31 @@ class MVE_shift_by_vec<string iname, string suffix, bit U,
let validForTailPredication = 1;
}
+multiclass MVE_shift_by_vec_p<string iname, MVEVectorVTInfo VTI, bit q, bit r> {
+ def "" : MVE_shift_by_vec<iname, VTI.Suffix, VTI.Unsigned, VTI.Size, q, r>;
+ defvar Inst = !cast<Instruction>(NAME);
+
+ def : Pat<(VTI.Vec (int_arm_mve_vshl_vector
+ (VTI.Vec MQPR:$in), (VTI.Vec MQPR:$sh),
+ (i32 q), (i32 r), (i32 VTI.Unsigned))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$in), (VTI.Vec MQPR:$sh)))>;
+
+ def : Pat<(VTI.Vec (int_arm_mve_vshl_vector_predicated
+ (VTI.Vec MQPR:$in), (VTI.Vec MQPR:$sh),
+ (i32 q), (i32 r), (i32 VTI.Unsigned),
+ (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$in), (VTI.Vec MQPR:$sh),
+ ARMVCCThen, (VTI.Pred VCCR:$mask),
+ (VTI.Vec MQPR:$inactive)))>;
+}
+
multiclass mve_shift_by_vec_multi<string iname, bit bit_4, bit bit_8> {
- def s8 : MVE_shift_by_vec<iname, "s8", 0b0, 0b00, bit_4, bit_8>;
- def s16 : MVE_shift_by_vec<iname, "s16", 0b0, 0b01, bit_4, bit_8>;
- def s32 : MVE_shift_by_vec<iname, "s32", 0b0, 0b10, bit_4, bit_8>;
- def u8 : MVE_shift_by_vec<iname, "u8", 0b1, 0b00, bit_4, bit_8>;
- def u16 : MVE_shift_by_vec<iname, "u16", 0b1, 0b01, bit_4, bit_8>;
- def u32 : MVE_shift_by_vec<iname, "u32", 0b1, 0b10, bit_4, bit_8>;
+ defm s8 : MVE_shift_by_vec_p<iname, MVE_v16s8, bit_4, bit_8>;
+ defm s16 : MVE_shift_by_vec_p<iname, MVE_v8s16, bit_4, bit_8>;
+ defm s32 : MVE_shift_by_vec_p<iname, MVE_v4s32, bit_4, bit_8>;
+ defm u8 : MVE_shift_by_vec_p<iname, MVE_v16u8, bit_4, bit_8>;
+ defm u16 : MVE_shift_by_vec_p<iname, MVE_v8u16, bit_4, bit_8>;
+ defm u32 : MVE_shift_by_vec_p<iname, MVE_v4u32, bit_4, bit_8>;
}
defm MVE_VSHL_by_vec : mve_shift_by_vec_multi<"vshl", 0b0, 0b0>;
@@ -2340,11 +2778,18 @@ class MVE_shift_with_imm<string iname, string suffix, dag oops, dag iops,
let Inst{3-1} = Qm{2-0};
let Inst{0} = 0b0;
let validForTailPredication = 1;
+
+ // For the MVE_shift_imm_patterns multiclass to refer to
+ MVEVectorVTInfo VTI;
+ Operand immediateType;
+ Intrinsic unpred_int;
+ Intrinsic pred_int;
+ dag unsignedFlag = (?);
}
-class MVE_VSxI_imm<string iname, string suffix, bit bit_8, dag imm>
+class MVE_VSxI_imm<string iname, string suffix, bit bit_8, Operand immType>
: MVE_shift_with_imm<iname, suffix, (outs MQPR:$Qd),
- !con((ins MQPR:$Qd_src, MQPR:$Qm), imm),
+ (ins MQPR:$Qd_src, MQPR:$Qm, immType:$imm),
"$Qd, $Qm, $imm", vpred_n, "$Qd = $Qd_src"> {
bits<6> imm;
let Inst{28} = 0b1;
@@ -2353,76 +2798,99 @@ class MVE_VSxI_imm<string iname, string suffix, bit bit_8, dag imm>
let Inst{10-9} = 0b10;
let Inst{8} = bit_8;
let validForTailPredication = 1;
+
+ Operand immediateType = immType;
}
-def MVE_VSRIimm8 : MVE_VSxI_imm<"vsri", "8", 0b0, (ins shr_imm8:$imm)> {
+def MVE_VSRIimm8 : MVE_VSxI_imm<"vsri", "8", 0b0, shr_imm8> {
let Inst{21-19} = 0b001;
}
-def MVE_VSRIimm16 : MVE_VSxI_imm<"vsri", "16", 0b0, (ins shr_imm16:$imm)> {
+def MVE_VSRIimm16 : MVE_VSxI_imm<"vsri", "16", 0b0, shr_imm16> {
let Inst{21-20} = 0b01;
}
-def MVE_VSRIimm32 : MVE_VSxI_imm<"vsri", "32", 0b0, (ins shr_imm32:$imm)> {
+def MVE_VSRIimm32 : MVE_VSxI_imm<"vsri", "32", 0b0, shr_imm32> {
let Inst{21} = 0b1;
}
-def MVE_VSLIimm8 : MVE_VSxI_imm<"vsli", "8", 0b1, (ins imm0_7:$imm)> {
+def MVE_VSLIimm8 : MVE_VSxI_imm<"vsli", "8", 0b1, imm0_7> {
let Inst{21-19} = 0b001;
}
-def MVE_VSLIimm16 : MVE_VSxI_imm<"vsli", "16", 0b1, (ins imm0_15:$imm)> {
+def MVE_VSLIimm16 : MVE_VSxI_imm<"vsli", "16", 0b1, imm0_15> {
let Inst{21-20} = 0b01;
}
-def MVE_VSLIimm32 : MVE_VSxI_imm<"vsli", "32", 0b1,(ins imm0_31:$imm)> {
+def MVE_VSLIimm32 : MVE_VSxI_imm<"vsli", "32", 0b1, imm0_31> {
let Inst{21} = 0b1;
}
-class MVE_VQSHL_imm<string suffix, dag imm>
- : MVE_shift_with_imm<"vqshl", suffix, (outs MQPR:$Qd),
- !con((ins MQPR:$Qm), imm), "$Qd, $Qm, $imm",
+multiclass MVE_VSxI_patterns<MVE_VSxI_imm inst, string name,
+ MVEVectorVTInfo VTI> {
+ defvar inparams = (? (VTI.Vec MQPR:$QdSrc), (VTI.Vec MQPR:$Qm),
+ (inst.immediateType:$imm));
+ defvar outparams = (inst (VTI.Vec MQPR:$QdSrc), (VTI.Vec MQPR:$Qm),
+ (inst.immediateType:$imm));
+ defvar unpred_int = !cast<Intrinsic>("int_arm_mve_" # name);
+ defvar pred_int = !cast<Intrinsic>("int_arm_mve_" # name # "_predicated");
+
+ def : Pat<(VTI.Vec !setop(inparams, unpred_int)),
+ (VTI.Vec outparams)>;
+ def : Pat<(VTI.Vec !con(inparams, (pred_int (VTI.Pred VCCR:$pred)))),
+ (VTI.Vec !con(outparams, (? ARMVCCThen, VCCR:$pred)))>;
+}
+
+defm : MVE_VSxI_patterns<MVE_VSLIimm8, "vsli", MVE_v16i8>;
+defm : MVE_VSxI_patterns<MVE_VSLIimm16, "vsli", MVE_v8i16>;
+defm : MVE_VSxI_patterns<MVE_VSLIimm32, "vsli", MVE_v4i32>;
+defm : MVE_VSxI_patterns<MVE_VSRIimm8, "vsri", MVE_v16i8>;
+defm : MVE_VSxI_patterns<MVE_VSRIimm16, "vsri", MVE_v8i16>;
+defm : MVE_VSxI_patterns<MVE_VSRIimm32, "vsri", MVE_v4i32>;
+
+class MVE_VQSHL_imm<MVEVectorVTInfo VTI_, Operand immType>
+ : MVE_shift_with_imm<"vqshl", VTI_.Suffix, (outs MQPR:$Qd),
+ (ins MQPR:$Qm, immType:$imm), "$Qd, $Qm, $imm",
vpred_r, ""> {
bits<6> imm;
+ let Inst{28} = VTI_.Unsigned;
let Inst{25-24} = 0b11;
let Inst{21-16} = imm;
let Inst{10-8} = 0b111;
-}
-
-def MVE_VSLIimms8 : MVE_VQSHL_imm<"s8", (ins imm0_7:$imm)> {
- let Inst{28} = 0b0;
- let Inst{21-19} = 0b001;
-}
-
-def MVE_VSLIimmu8 : MVE_VQSHL_imm<"u8", (ins imm0_7:$imm)> {
- let Inst{28} = 0b1;
- let Inst{21-19} = 0b001;
-}
-def MVE_VSLIimms16 : MVE_VQSHL_imm<"s16", (ins imm0_15:$imm)> {
- let Inst{28} = 0b0;
- let Inst{21-20} = 0b01;
+ let VTI = VTI_;
+ let immediateType = immType;
+ let unsignedFlag = (? (i32 VTI.Unsigned));
}
-def MVE_VSLIimmu16 : MVE_VQSHL_imm<"u16", (ins imm0_15:$imm)> {
- let Inst{28} = 0b1;
- let Inst{21-20} = 0b01;
-}
-
-def MVE_VSLIimms32 : MVE_VQSHL_imm<"s32", (ins imm0_31:$imm)> {
- let Inst{28} = 0b0;
- let Inst{21} = 0b1;
-}
-
-def MVE_VSLIimmu32 : MVE_VQSHL_imm<"u32", (ins imm0_31:$imm)> {
- let Inst{28} = 0b1;
- let Inst{21} = 0b1;
+let unpred_int = int_arm_mve_vqshl_imm,
+ pred_int = int_arm_mve_vqshl_imm_predicated in {
+ def MVE_VQSHLimms8 : MVE_VQSHL_imm<MVE_v16s8, imm0_7> {
+ let Inst{21-19} = 0b001;
+ }
+ def MVE_VQSHLimmu8 : MVE_VQSHL_imm<MVE_v16u8, imm0_7> {
+ let Inst{21-19} = 0b001;
+ }
+
+ def MVE_VQSHLimms16 : MVE_VQSHL_imm<MVE_v8s16, imm0_15> {
+ let Inst{21-20} = 0b01;
+ }
+ def MVE_VQSHLimmu16 : MVE_VQSHL_imm<MVE_v8u16, imm0_15> {
+ let Inst{21-20} = 0b01;
+ }
+
+ def MVE_VQSHLimms32 : MVE_VQSHL_imm<MVE_v4s32, imm0_31> {
+ let Inst{21} = 0b1;
+ }
+ def MVE_VQSHLimmu32 : MVE_VQSHL_imm<MVE_v4u32, imm0_31> {
+ let Inst{21} = 0b1;
+ }
}
-class MVE_VQSHLU_imm<string suffix, dag imm>
- : MVE_shift_with_imm<"vqshlu", suffix, (outs MQPR:$Qd),
- !con((ins MQPR:$Qm), imm), "$Qd, $Qm, $imm",
+class MVE_VQSHLU_imm<MVEVectorVTInfo VTI_, Operand immType>
+ : MVE_shift_with_imm<"vqshlu", VTI_.Suffix, (outs MQPR:$Qd),
+ (ins MQPR:$Qm, immType:$imm), "$Qd, $Qm, $imm",
vpred_r, ""> {
bits<6> imm;
@@ -2430,61 +2898,103 @@ class MVE_VQSHLU_imm<string suffix, dag imm>
let Inst{25-24} = 0b11;
let Inst{21-16} = imm;
let Inst{10-8} = 0b110;
-}
-def MVE_VQSHLU_imms8 : MVE_VQSHLU_imm<"s8", (ins imm0_7:$imm)> {
- let Inst{21-19} = 0b001;
+ let VTI = VTI_;
+ let immediateType = immType;
}
-def MVE_VQSHLU_imms16 : MVE_VQSHLU_imm<"s16", (ins imm0_15:$imm)> {
- let Inst{21-20} = 0b01;
-}
+let unpred_int = int_arm_mve_vqshlu_imm,
+ pred_int = int_arm_mve_vqshlu_imm_predicated in {
+ def MVE_VQSHLU_imms8 : MVE_VQSHLU_imm<MVE_v16s8, imm0_7> {
+ let Inst{21-19} = 0b001;
+ }
-def MVE_VQSHLU_imms32 : MVE_VQSHLU_imm<"s32", (ins imm0_31:$imm)> {
- let Inst{21} = 0b1;
+ def MVE_VQSHLU_imms16 : MVE_VQSHLU_imm<MVE_v8s16, imm0_15> {
+ let Inst{21-20} = 0b01;
+ }
+
+ def MVE_VQSHLU_imms32 : MVE_VQSHLU_imm<MVE_v4s32, imm0_31> {
+ let Inst{21} = 0b1;
+ }
}
-class MVE_VRSHR_imm<string suffix, dag imm>
- : MVE_shift_with_imm<"vrshr", suffix, (outs MQPR:$Qd),
- !con((ins MQPR:$Qm), imm), "$Qd, $Qm, $imm",
+class MVE_VRSHR_imm<MVEVectorVTInfo VTI_, Operand immType>
+ : MVE_shift_with_imm<"vrshr", VTI_.Suffix, (outs MQPR:$Qd),
+ (ins MQPR:$Qm, immType:$imm), "$Qd, $Qm, $imm",
vpred_r, ""> {
bits<6> imm;
+ let Inst{28} = VTI_.Unsigned;
let Inst{25-24} = 0b11;
let Inst{21-16} = imm;
let Inst{10-8} = 0b010;
-}
-def MVE_VRSHR_imms8 : MVE_VRSHR_imm<"s8", (ins shr_imm8:$imm)> {
- let Inst{28} = 0b0;
- let Inst{21-19} = 0b001;
+ let VTI = VTI_;
+ let immediateType = immType;
+ let unsignedFlag = (? (i32 VTI.Unsigned));
}
-def MVE_VRSHR_immu8 : MVE_VRSHR_imm<"u8", (ins shr_imm8:$imm)> {
- let Inst{28} = 0b1;
- let Inst{21-19} = 0b001;
-}
+let unpred_int = int_arm_mve_vrshr_imm,
+ pred_int = int_arm_mve_vrshr_imm_predicated in {
+ def MVE_VRSHR_imms8 : MVE_VRSHR_imm<MVE_v16s8, shr_imm8> {
+ let Inst{21-19} = 0b001;
+ }
-def MVE_VRSHR_imms16 : MVE_VRSHR_imm<"s16", (ins shr_imm16:$imm)> {
- let Inst{28} = 0b0;
- let Inst{21-20} = 0b01;
-}
+ def MVE_VRSHR_immu8 : MVE_VRSHR_imm<MVE_v16u8, shr_imm8> {
+ let Inst{21-19} = 0b001;
+ }
-def MVE_VRSHR_immu16 : MVE_VRSHR_imm<"u16", (ins shr_imm16:$imm)> {
- let Inst{28} = 0b1;
- let Inst{21-20} = 0b01;
-}
+ def MVE_VRSHR_imms16 : MVE_VRSHR_imm<MVE_v8s16, shr_imm16> {
+ let Inst{21-20} = 0b01;
+ }
-def MVE_VRSHR_imms32 : MVE_VRSHR_imm<"s32", (ins shr_imm32:$imm)> {
- let Inst{28} = 0b0;
- let Inst{21} = 0b1;
-}
+ def MVE_VRSHR_immu16 : MVE_VRSHR_imm<MVE_v8u16, shr_imm16> {
+ let Inst{21-20} = 0b01;
+ }
-def MVE_VRSHR_immu32 : MVE_VRSHR_imm<"u32", (ins shr_imm32:$imm)> {
- let Inst{28} = 0b1;
- let Inst{21} = 0b1;
+ def MVE_VRSHR_imms32 : MVE_VRSHR_imm<MVE_v4s32, shr_imm32> {
+ let Inst{21} = 0b1;
+ }
+
+ def MVE_VRSHR_immu32 : MVE_VRSHR_imm<MVE_v4u32, shr_imm32> {
+ let Inst{21} = 0b1;
+ }
}
+multiclass MVE_shift_imm_patterns<MVE_shift_with_imm inst> {
+ def : Pat<(inst.VTI.Vec !con((inst.unpred_int (inst.VTI.Vec MQPR:$src),
+ inst.immediateType:$imm),
+ inst.unsignedFlag)),
+ (inst.VTI.Vec (inst (inst.VTI.Vec MQPR:$src),
+ inst.immediateType:$imm))>;
+
+ def : Pat<(inst.VTI.Vec !con((inst.pred_int (inst.VTI.Vec MQPR:$src),
+ inst.immediateType:$imm),
+ inst.unsignedFlag,
+ (? (inst.VTI.Pred VCCR:$mask),
+ (inst.VTI.Vec MQPR:$inactive)))),
+ (inst.VTI.Vec (inst (inst.VTI.Vec MQPR:$src),
+ inst.immediateType:$imm,
+ ARMVCCThen, (inst.VTI.Pred VCCR:$mask),
+ (inst.VTI.Vec MQPR:$inactive)))>;
+}
+
+defm : MVE_shift_imm_patterns<MVE_VQSHLimms8>;
+defm : MVE_shift_imm_patterns<MVE_VQSHLimmu8>;
+defm : MVE_shift_imm_patterns<MVE_VQSHLimms16>;
+defm : MVE_shift_imm_patterns<MVE_VQSHLimmu16>;
+defm : MVE_shift_imm_patterns<MVE_VQSHLimms32>;
+defm : MVE_shift_imm_patterns<MVE_VQSHLimmu32>;
+defm : MVE_shift_imm_patterns<MVE_VQSHLU_imms8>;
+defm : MVE_shift_imm_patterns<MVE_VQSHLU_imms16>;
+defm : MVE_shift_imm_patterns<MVE_VQSHLU_imms32>;
+defm : MVE_shift_imm_patterns<MVE_VRSHR_imms8>;
+defm : MVE_shift_imm_patterns<MVE_VRSHR_immu8>;
+defm : MVE_shift_imm_patterns<MVE_VRSHR_imms16>;
+defm : MVE_shift_imm_patterns<MVE_VRSHR_immu16>;
+defm : MVE_shift_imm_patterns<MVE_VRSHR_imms32>;
+defm : MVE_shift_imm_patterns<MVE_VRSHR_immu32>;
+
class MVE_VSHR_imm<string suffix, dag imm>
: MVE_shift_with_imm<"vshr", suffix, (outs MQPR:$Qd),
!con((ins MQPR:$Qm), imm), "$Qd, $Qm, $imm",
@@ -2550,27 +3060,39 @@ def MVE_VSHL_immi32 : MVE_VSHL_imm<"i32", (ins imm0_31:$imm)> {
let Inst{21} = 0b1;
}
+multiclass MVE_immediate_shift_patterns_inner<
+ MVEVectorVTInfo VTI, Operand imm_operand_type, SDNode unpred_op,
+ Intrinsic pred_int, Instruction inst, list<int> unsignedFlag = []> {
+
+ def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$src), imm_operand_type:$imm)),
+ (VTI.Vec (inst (VTI.Vec MQPR:$src), imm_operand_type:$imm))>;
+
+ def : Pat<(VTI.Vec !con((pred_int (VTI.Vec MQPR:$src), imm_operand_type:$imm),
+ !dag(pred_int, unsignedFlag, ?),
+ (pred_int (VTI.Pred VCCR:$mask),
+ (VTI.Vec MQPR:$inactive)))),
+ (VTI.Vec (inst (VTI.Vec MQPR:$src), imm_operand_type:$imm,
+ ARMVCCThen, (VTI.Pred VCCR:$mask),
+ (VTI.Vec MQPR:$inactive)))>;
+}
+
+multiclass MVE_immediate_shift_patterns<MVEVectorVTInfo VTI,
+ Operand imm_operand_type> {
+ defm : MVE_immediate_shift_patterns_inner<VTI, imm_operand_type,
+ ARMvshlImm, int_arm_mve_shl_imm_predicated,
+ !cast<Instruction>("MVE_VSHL_immi" # VTI.BitsSuffix)>;
+ defm : MVE_immediate_shift_patterns_inner<VTI, imm_operand_type,
+ ARMvshruImm, int_arm_mve_shr_imm_predicated,
+ !cast<Instruction>("MVE_VSHR_immu" # VTI.BitsSuffix), [1]>;
+ defm : MVE_immediate_shift_patterns_inner<VTI, imm_operand_type,
+ ARMvshrsImm, int_arm_mve_shr_imm_predicated,
+ !cast<Instruction>("MVE_VSHR_imms" # VTI.BitsSuffix), [0]>;
+}
+
let Predicates = [HasMVEInt] in {
- def : Pat<(v4i32 (ARMvshlImm (v4i32 MQPR:$src), imm0_31:$imm)),
- (v4i32 (MVE_VSHL_immi32 (v4i32 MQPR:$src), imm0_31:$imm))>;
- def : Pat<(v8i16 (ARMvshlImm (v8i16 MQPR:$src), imm0_15:$imm)),
- (v8i16 (MVE_VSHL_immi16 (v8i16 MQPR:$src), imm0_15:$imm))>;
- def : Pat<(v16i8 (ARMvshlImm (v16i8 MQPR:$src), imm0_7:$imm)),
- (v16i8 (MVE_VSHL_immi8 (v16i8 MQPR:$src), imm0_7:$imm))>;
-
- def : Pat<(v4i32 (ARMvshruImm (v4i32 MQPR:$src), imm0_31:$imm)),
- (v4i32 (MVE_VSHR_immu32 (v4i32 MQPR:$src), imm0_31:$imm))>;
- def : Pat<(v8i16 (ARMvshruImm (v8i16 MQPR:$src), imm0_15:$imm)),
- (v8i16 (MVE_VSHR_immu16 (v8i16 MQPR:$src), imm0_15:$imm))>;
- def : Pat<(v16i8 (ARMvshruImm (v16i8 MQPR:$src), imm0_7:$imm)),
- (v16i8 (MVE_VSHR_immu8 (v16i8 MQPR:$src), imm0_7:$imm))>;
-
- def : Pat<(v4i32 (ARMvshrsImm (v4i32 MQPR:$src), imm0_31:$imm)),
- (v4i32 (MVE_VSHR_imms32 (v4i32 MQPR:$src), imm0_31:$imm))>;
- def : Pat<(v8i16 (ARMvshrsImm (v8i16 MQPR:$src), imm0_15:$imm)),
- (v8i16 (MVE_VSHR_imms16 (v8i16 MQPR:$src), imm0_15:$imm))>;
- def : Pat<(v16i8 (ARMvshrsImm (v16i8 MQPR:$src), imm0_7:$imm)),
- (v16i8 (MVE_VSHR_imms8 (v16i8 MQPR:$src), imm0_7:$imm))>;
+ defm : MVE_immediate_shift_patterns<MVE_v16i8, imm0_7>;
+ defm : MVE_immediate_shift_patterns<MVE_v8i16, imm0_15>;
+ defm : MVE_immediate_shift_patterns<MVE_v4i32, imm0_31>;
}
// end of mve_shift instructions
@@ -2652,8 +3174,8 @@ class MVEFloatArithNeon<string iname, string suffix, bit size,
let Inst{16} = 0b0;
}
-class MVE_VMUL_fp<string suffix, bit size, list<dag> pattern=[]>
- : MVEFloatArithNeon<"vmul", suffix, size, (outs MQPR:$Qd),
+class MVE_VMUL_fp<string iname, string suffix, bit size, list<dag> pattern=[]>
+ : MVEFloatArithNeon<iname, suffix, size, (outs MQPR:$Qd),
(ins MQPR:$Qn, MQPR:$Qm), "$Qd, $Qn, $Qm", vpred_r, "",
pattern> {
bits<4> Qd;
@@ -2671,20 +3193,32 @@ class MVE_VMUL_fp<string suffix, bit size, list<dag> pattern=[]>
let validForTailPredication = 1;
}
-def MVE_VMULf32 : MVE_VMUL_fp<"f32", 0b0>;
-def MVE_VMULf16 : MVE_VMUL_fp<"f16", 0b1>;
+multiclass MVE_VMULT_fp_m<string iname, bit bit_21, MVEVectorVTInfo VTI,
+ SDNode unpred_op, Intrinsic pred_int> {
+ def "" : MVE_VMUL_fp<iname, VTI.Suffix, VTI.Size{0}>;
+ defvar Inst = !cast<Instruction>(NAME);
-let Predicates = [HasMVEFloat] in {
- def : Pat<(v4f32 (fmul (v4f32 MQPR:$val1), (v4f32 MQPR:$val2))),
- (v4f32 (MVE_VMULf32 (v4f32 MQPR:$val1), (v4f32 MQPR:$val2)))>;
- def : Pat<(v8f16 (fmul (v8f16 MQPR:$val1), (v8f16 MQPR:$val2))),
- (v8f16 (MVE_VMULf16 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2)))>;
+ let Predicates = [HasMVEFloat] in {
+ def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
+ def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+ (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+ ARMVCCThen, (VTI.Pred VCCR:$mask),
+ (VTI.Vec MQPR:$inactive)))>;
+ }
}
-class MVE_VCMLA<string suffix, bit size, list<dag> pattern=[]>
+multiclass MVE_VMUL_fp_m<MVEVectorVTInfo VTI>
+ : MVE_VMULT_fp_m<"vmul", 0, VTI, fmul, int_arm_mve_mul_predicated>;
+
+defm MVE_VMULf32 : MVE_VMUL_fp_m<MVE_v4f32>;
+defm MVE_VMULf16 : MVE_VMUL_fp_m<MVE_v8f16>;
+
+class MVE_VCMLA<string suffix, bit size>
: MVEFloatArithNeon<"vcmla", suffix, size, (outs MQPR:$Qd),
(ins MQPR:$Qd_src, MQPR:$Qn, MQPR:$Qm, complexrotateop:$rot),
- "$Qd, $Qn, $Qm, $rot", vpred_n, "$Qd = $Qd_src", pattern> {
+ "$Qd, $Qn, $Qm, $rot", vpred_n, "$Qd = $Qd_src", []> {
bits<4> Qd;
bits<4> Qn;
bits<2> rot;
@@ -2701,8 +3235,31 @@ class MVE_VCMLA<string suffix, bit size, list<dag> pattern=[]>
let Inst{4} = 0b0;
}
-def MVE_VCMLAf16 : MVE_VCMLA<"f16", 0b0>;
-def MVE_VCMLAf32 : MVE_VCMLA<"f32", 0b1>;
+multiclass MVE_VCMLA_m<MVEVectorVTInfo VTI, bit size> {
+ def "" : MVE_VCMLA<VTI.Suffix, size>;
+ defvar Inst = !cast<Instruction>(NAME);
+
+ let Predicates = [HasMVEFloat] in {
+ def : Pat<(VTI.Vec (int_arm_mve_vcmlaq
+ imm:$rot, (VTI.Vec MQPR:$Qd_src),
+ (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qd_src),
+ (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm),
+ imm:$rot))>;
+
+ def : Pat<(VTI.Vec (int_arm_mve_vcmlaq_predicated
+ imm:$rot, (VTI.Vec MQPR:$Qd_src),
+ (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm),
+ (VTI.Pred VCCR:$mask))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qd_src), (VTI.Vec MQPR:$Qn),
+ (VTI.Vec MQPR:$Qm), imm:$rot,
+ ARMVCCThen, (VTI.Pred VCCR:$mask)))>;
+
+ }
+}
+
+defm MVE_VCMLAf16 : MVE_VCMLA_m<MVE_v8f16, 0b0>;
+defm MVE_VCMLAf32 : MVE_VCMLA_m<MVE_v4f32, 0b1>;
class MVE_VADDSUBFMA_fp<string iname, string suffix, bit size, bit bit_4,
bit bit_8, bit bit_21, dag iops=(ins),
@@ -2736,63 +3293,50 @@ def MVE_VFMSf32 : MVE_VADDSUBFMA_fp<"vfms", "f32", 0b0, 0b1, 0b0, 0b1,
def MVE_VFMSf16 : MVE_VADDSUBFMA_fp<"vfms", "f16", 0b1, 0b1, 0b0, 0b1,
(ins MQPR:$Qd_src), vpred_n, "$Qd = $Qd_src">;
-let Predicates = [HasMVEFloat, UseFusedMAC] in {
- def : Pat<(v8f16 (fadd (v8f16 MQPR:$src1),
- (fmul (v8f16 MQPR:$src2),
- (v8f16 MQPR:$src3)))),
- (v8f16 (MVE_VFMAf16 $src1, $src2, $src3))>;
- def : Pat<(v4f32 (fadd (v4f32 MQPR:$src1),
- (fmul (v4f32 MQPR:$src2),
- (v4f32 MQPR:$src3)))),
- (v4f32 (MVE_VFMAf32 $src1, $src2, $src3))>;
-
- def : Pat<(v8f16 (fsub (v8f16 MQPR:$src1),
- (fmul (v8f16 MQPR:$src2),
- (v8f16 MQPR:$src3)))),
- (v8f16 (MVE_VFMSf16 $src1, $src2, $src3))>;
- def : Pat<(v4f32 (fsub (v4f32 MQPR:$src1),
- (fmul (v4f32 MQPR:$src2),
- (v4f32 MQPR:$src3)))),
- (v4f32 (MVE_VFMSf32 $src1, $src2, $src3))>;
-}
-
let Predicates = [HasMVEFloat] in {
def : Pat<(v8f16 (fma (v8f16 MQPR:$src1), (v8f16 MQPR:$src2), (v8f16 MQPR:$src3))),
(v8f16 (MVE_VFMAf16 $src3, $src1, $src2))>;
def : Pat<(v4f32 (fma (v4f32 MQPR:$src1), (v4f32 MQPR:$src2), (v4f32 MQPR:$src3))),
(v4f32 (MVE_VFMAf32 $src3, $src1, $src2))>;
+ def : Pat<(v8f16 (fma (fneg (v8f16 MQPR:$src1)), (v8f16 MQPR:$src2), (v8f16 MQPR:$src3))),
+ (v8f16 (MVE_VFMSf16 $src3, $src1, $src2))>;
+ def : Pat<(v4f32 (fma (fneg (v4f32 MQPR:$src1)), (v4f32 MQPR:$src2), (v4f32 MQPR:$src3))),
+ (v4f32 (MVE_VFMSf32 $src3, $src1, $src2))>;
}
-
-let validForTailPredication = 1 in {
- def MVE_VADDf32 : MVE_VADDSUBFMA_fp<"vadd", "f32", 0b0, 0b0, 0b1, 0b0>;
- def MVE_VADDf16 : MVE_VADDSUBFMA_fp<"vadd", "f16", 0b1, 0b0, 0b1, 0b0>;
-}
-
-let Predicates = [HasMVEFloat] in {
- def : Pat<(v4f32 (fadd (v4f32 MQPR:$val1), (v4f32 MQPR:$val2))),
- (v4f32 (MVE_VADDf32 (v4f32 MQPR:$val1), (v4f32 MQPR:$val2)))>;
- def : Pat<(v8f16 (fadd (v8f16 MQPR:$val1), (v8f16 MQPR:$val2))),
- (v8f16 (MVE_VADDf16 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2)))>;
+multiclass MVE_VADDSUB_fp_m<string iname, bit bit_21, MVEVectorVTInfo VTI,
+ SDNode unpred_op, Intrinsic pred_int> {
+ def "" : MVE_VADDSUBFMA_fp<iname, VTI.Suffix, VTI.Size{0}, 0, 1, bit_21> {
+ let validForTailPredication = 1;
+ }
+ defvar Inst = !cast<Instruction>(NAME);
+
+ let Predicates = [HasMVEFloat] in {
+ def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
+ def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+ (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+ ARMVCCThen, (VTI.Pred VCCR:$mask),
+ (VTI.Vec MQPR:$inactive)))>;
+ }
}
+multiclass MVE_VADD_fp_m<MVEVectorVTInfo VTI>
+ : MVE_VADDSUB_fp_m<"vadd", 0, VTI, fadd, int_arm_mve_add_predicated>;
+multiclass MVE_VSUB_fp_m<MVEVectorVTInfo VTI>
+ : MVE_VADDSUB_fp_m<"vsub", 1, VTI, fsub, int_arm_mve_sub_predicated>;
-let validForTailPredication = 1 in {
- def MVE_VSUBf32 : MVE_VADDSUBFMA_fp<"vsub", "f32", 0b0, 0b0, 0b1, 0b1>;
- def MVE_VSUBf16 : MVE_VADDSUBFMA_fp<"vsub", "f16", 0b1, 0b0, 0b1, 0b1>;
-}
+defm MVE_VADDf32 : MVE_VADD_fp_m<MVE_v4f32>;
+defm MVE_VADDf16 : MVE_VADD_fp_m<MVE_v8f16>;
-let Predicates = [HasMVEFloat] in {
- def : Pat<(v4f32 (fsub (v4f32 MQPR:$val1), (v4f32 MQPR:$val2))),
- (v4f32 (MVE_VSUBf32 (v4f32 MQPR:$val1), (v4f32 MQPR:$val2)))>;
- def : Pat<(v8f16 (fsub (v8f16 MQPR:$val1), (v8f16 MQPR:$val2))),
- (v8f16 (MVE_VSUBf16 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2)))>;
-}
+defm MVE_VSUBf32 : MVE_VSUB_fp_m<MVE_v4f32>;
+defm MVE_VSUBf16 : MVE_VSUB_fp_m<MVE_v8f16>;
-class MVE_VCADD<string suffix, bit size, string cstr="", list<dag> pattern=[]>
+class MVE_VCADD<string suffix, bit size, string cstr="">
: MVEFloatArithNeon<"vcadd", suffix, size, (outs MQPR:$Qd),
(ins MQPR:$Qn, MQPR:$Qm, complexrotateopodd:$rot),
- "$Qd, $Qn, $Qm, $rot", vpred_r, cstr, pattern> {
+ "$Qd, $Qn, $Qm, $rot", vpred_r, cstr, []> {
bits<4> Qd;
bits<4> Qn;
bit rot;
@@ -2810,8 +3354,29 @@ class MVE_VCADD<string suffix, bit size, string cstr="", list<dag> pattern=[]>
let Inst{4} = 0b0;
}
-def MVE_VCADDf16 : MVE_VCADD<"f16", 0b0>;
-def MVE_VCADDf32 : MVE_VCADD<"f32", 0b1, "@earlyclobber $Qd">;
+multiclass MVE_VCADD_m<MVEVectorVTInfo VTI, bit size, string cstr=""> {
+ def "" : MVE_VCADD<VTI.Suffix, size, cstr>;
+ defvar Inst = !cast<Instruction>(NAME);
+
+ let Predicates = [HasMVEFloat] in {
+ def : Pat<(VTI.Vec (int_arm_mve_vcaddq (i32 1),
+ imm:$rot, (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm),
+ imm:$rot))>;
+
+ def : Pat<(VTI.Vec (int_arm_mve_vcaddq_predicated (i32 1),
+ imm:$rot, (VTI.Vec MQPR:$inactive),
+ (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm),
+ (VTI.Pred VCCR:$mask))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm),
+ imm:$rot, ARMVCCThen, (VTI.Pred VCCR:$mask),
+ (VTI.Vec MQPR:$inactive)))>;
+
+ }
+}
+
+defm MVE_VCADDf16 : MVE_VCADD_m<MVE_v8f16, 0b0>;
+defm MVE_VCADDf32 : MVE_VCADD_m<MVE_v4f32, 0b1, "@earlyclobber $Qd">;
class MVE_VABD_fp<string suffix, bit size>
: MVE_float<"vabd", suffix, (outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm),
@@ -2833,8 +3398,29 @@ class MVE_VABD_fp<string suffix, bit size>
let validForTailPredication = 1;
}
-def MVE_VABDf32 : MVE_VABD_fp<"f32", 0b0>;
-def MVE_VABDf16 : MVE_VABD_fp<"f16", 0b1>;
+multiclass MVE_VABDT_fp_m<MVEVectorVTInfo VTI,
+ Intrinsic unpred_int, Intrinsic pred_int> {
+ def "" : MVE_VABD_fp<VTI.Suffix, VTI.Size{0}>;
+ defvar Inst = !cast<Instruction>(NAME);
+
+ let Predicates = [HasMVEFloat] in {
+ def : Pat<(VTI.Vec (unpred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+ (i32 0))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
+ def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+ (i32 0), (VTI.Pred VCCR:$mask),
+ (VTI.Vec MQPR:$inactive))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+ ARMVCCThen, (VTI.Pred VCCR:$mask),
+ (VTI.Vec MQPR:$inactive)))>;
+ }
+}
+
+multiclass MVE_VABD_fp_m<MVEVectorVTInfo VTI>
+ : MVE_VABDT_fp_m<VTI, int_arm_mve_vabd, int_arm_mve_abd_predicated>;
+
+defm MVE_VABDf32 : MVE_VABD_fp_m<MVE_v4f32>;
+defm MVE_VABDf16 : MVE_VABD_fp_m<MVE_v8f16>;
class MVE_VCVT_fix<string suffix, bit fsi, bit U, bit op,
Operand imm_operand_type, list<dag> pattern=[]>
@@ -3186,120 +3772,120 @@ def MVE_VCMPs8r : MVE_VCMPqrs<"s8", 0b00>;
def MVE_VCMPs16r : MVE_VCMPqrs<"s16", 0b01>;
def MVE_VCMPs32r : MVE_VCMPqrs<"s32", 0b10>;
-multiclass unpred_vcmp_z<string suffix, int fc> {
- def i8 : Pat<(v16i1 (ARMvcmpz (v16i8 MQPR:$v1), (i32 fc))),
+multiclass unpred_vcmp_z<string suffix, PatLeaf fc> {
+ def i8 : Pat<(v16i1 (ARMvcmpz (v16i8 MQPR:$v1), fc)),
(v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), ZR, fc))>;
- def i16 : Pat<(v8i1 (ARMvcmpz (v8i16 MQPR:$v1), (i32 fc))),
+ def i16 : Pat<(v8i1 (ARMvcmpz (v8i16 MQPR:$v1), fc)),
(v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), ZR, fc))>;
- def i32 : Pat<(v4i1 (ARMvcmpz (v4i32 MQPR:$v1), (i32 fc))),
+ def i32 : Pat<(v4i1 (ARMvcmpz (v4i32 MQPR:$v1), fc)),
(v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), ZR, fc))>;
- def : Pat<(v16i1 (and (v16i1 VCCR:$p1), (v16i1 (ARMvcmpz (v16i8 MQPR:$v1), (i32 fc))))),
- (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), ZR, fc, 1, VCCR:$p1))>;
- def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmpz (v8i16 MQPR:$v1), (i32 fc))))),
- (v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), ZR, fc, 1, VCCR:$p1))>;
- def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmpz (v4i32 MQPR:$v1), (i32 fc))))),
- (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), ZR, fc, 1, VCCR:$p1))>;
+ def : Pat<(v16i1 (and (v16i1 VCCR:$p1), (v16i1 (ARMvcmpz (v16i8 MQPR:$v1), fc)))),
+ (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), ZR, fc, ARMVCCThen, VCCR:$p1))>;
+ def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmpz (v8i16 MQPR:$v1), fc)))),
+ (v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), ZR, fc, ARMVCCThen, VCCR:$p1))>;
+ def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmpz (v4i32 MQPR:$v1), fc)))),
+ (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), ZR, fc, ARMVCCThen, VCCR:$p1))>;
}
-multiclass unpred_vcmp_r<string suffix, int fc> {
- def i8 : Pat<(v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), (i32 fc))),
+multiclass unpred_vcmp_r<string suffix, PatLeaf fc> {
+ def i8 : Pat<(v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), fc)),
(v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8") (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), fc))>;
- def i16 : Pat<(v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 MQPR:$v2), (i32 fc))),
+ def i16 : Pat<(v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 MQPR:$v2), fc)),
(v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16") (v8i16 MQPR:$v1), (v8i16 MQPR:$v2), fc))>;
- def i32 : Pat<(v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), (i32 fc))),
+ def i32 : Pat<(v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), fc)),
(v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32") (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), fc))>;
- def i8r : Pat<(v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 (ARMvdup GPR:$v2)), (i32 fc))),
+ def i8r : Pat<(v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 (ARMvdup GPR:$v2)), fc)),
(v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), (i32 GPR:$v2), fc))>;
- def i16r : Pat<(v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 (ARMvdup GPR:$v2)), (i32 fc))),
+ def i16r : Pat<(v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 (ARMvdup GPR:$v2)), fc)),
(v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), (i32 GPR:$v2), fc))>;
- def i32r : Pat<(v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 (ARMvdup GPR:$v2)), (i32 fc))),
+ def i32r : Pat<(v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 (ARMvdup GPR:$v2)), fc)),
(v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), (i32 GPR:$v2), fc))>;
- def : Pat<(v16i1 (and (v16i1 VCCR:$p1), (v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), (i32 fc))))),
- (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8") (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), fc, 1, VCCR:$p1))>;
- def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 MQPR:$v2), (i32 fc))))),
- (v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16") (v8i16 MQPR:$v1), (v8i16 MQPR:$v2), fc, 1, VCCR:$p1))>;
- def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), (i32 fc))))),
- (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32") (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), fc, 1, VCCR:$p1))>;
+ def : Pat<(v16i1 (and (v16i1 VCCR:$p1), (v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), fc)))),
+ (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8") (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
+ def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 MQPR:$v2), fc)))),
+ (v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16") (v8i16 MQPR:$v1), (v8i16 MQPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
+ def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), fc)))),
+ (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32") (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
- def : Pat<(v16i1 (and (v16i1 VCCR:$p1), (v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 (ARMvdup GPR:$v2)), (i32 fc))))),
- (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), (i32 GPR:$v2), fc, 1, VCCR:$p1))>;
- def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 (ARMvdup GPR:$v2)), (i32 fc))))),
- (v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), (i32 GPR:$v2), fc, 1, VCCR:$p1))>;
- def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 (ARMvdup GPR:$v2)), (i32 fc))))),
- (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), (i32 GPR:$v2), fc, 1, VCCR:$p1))>;
+ def : Pat<(v16i1 (and (v16i1 VCCR:$p1), (v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 (ARMvdup GPR:$v2)), fc)))),
+ (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), (i32 GPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
+ def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 (ARMvdup GPR:$v2)), fc)))),
+ (v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), (i32 GPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
+ def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 (ARMvdup GPR:$v2)), fc)))),
+ (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), (i32 GPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
}
-multiclass unpred_vcmpf_z<int fc> {
- def f16 : Pat<(v8i1 (ARMvcmpz (v8f16 MQPR:$v1), (i32 fc))),
+multiclass unpred_vcmpf_z<PatLeaf fc> {
+ def f16 : Pat<(v8i1 (ARMvcmpz (v8f16 MQPR:$v1), fc)),
(v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), ZR, fc))>;
- def f32 : Pat<(v4i1 (ARMvcmpz (v4f32 MQPR:$v1), (i32 fc))),
+ def f32 : Pat<(v4i1 (ARMvcmpz (v4f32 MQPR:$v1), fc)),
(v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), ZR, fc))>;
- def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmpz (v8f16 MQPR:$v1), (i32 fc))))),
- (v8i1 (MVE_VCMPf32r (v8f16 MQPR:$v1), ZR, fc, 1, VCCR:$p1))>;
- def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmpz (v4f32 MQPR:$v1), (i32 fc))))),
- (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), ZR, fc, 1, VCCR:$p1))>;
+ def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmpz (v8f16 MQPR:$v1), fc)))),
+            (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), ZR, fc, ARMVCCThen, VCCR:$p1))>;
+ def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmpz (v4f32 MQPR:$v1), fc)))),
+ (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), ZR, fc, ARMVCCThen, VCCR:$p1))>;
}
-multiclass unpred_vcmpf_r<int fc> {
+multiclass unpred_vcmpf_r<PatLeaf fc> {
- def f16 : Pat<(v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), (i32 fc))),
+ def f16 : Pat<(v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc)),
(v8i1 (MVE_VCMPf16 (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc))>;
- def f32 : Pat<(v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), (i32 fc))),
+ def f32 : Pat<(v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc)),
(v4i1 (MVE_VCMPf32 (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc))>;
- def f16r : Pat<(v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 (ARMvdup HPR:$v2)), (i32 fc))),
+ def f16r : Pat<(v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 (ARMvdup HPR:$v2)), fc)),
(v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f16 HPR:$v2), rGPR)), fc))>;
- def f32r : Pat<(v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 (ARMvdup SPR:$v2)), (i32 fc))),
+ def f32r : Pat<(v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 (ARMvdup SPR:$v2)), fc)),
(v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f32 SPR:$v2), rGPR)), fc))>;
- def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), (i32 fc))))),
- (v8i1 (MVE_VCMPf16 (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc, 1, VCCR:$p1))>;
- def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), (i32 fc))))),
- (v4i1 (MVE_VCMPf32 (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc, 1, VCCR:$p1))>;
+ def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc)))),
+ (v8i1 (MVE_VCMPf16 (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
+ def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc)))),
+ (v4i1 (MVE_VCMPf32 (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
- def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 (ARMvdup HPR:$v2)), (i32 fc))))),
- (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f16 HPR:$v2), rGPR)), fc, 1, VCCR:$p1))>;
- def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 (ARMvdup SPR:$v2)), (i32 fc))))),
- (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f32 SPR:$v2), rGPR)), fc, 1, VCCR:$p1))>;
+ def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 (ARMvdup HPR:$v2)), fc)))),
+ (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f16 HPR:$v2), rGPR)), fc, ARMVCCThen, VCCR:$p1))>;
+ def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 (ARMvdup SPR:$v2)), fc)))),
+ (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f32 SPR:$v2), rGPR)), fc, ARMVCCThen, VCCR:$p1))>;
}
let Predicates = [HasMVEInt] in {
- defm MVE_VCEQZ : unpred_vcmp_z<"i", 0>;
- defm MVE_VCNEZ : unpred_vcmp_z<"i", 1>;
- defm MVE_VCGEZ : unpred_vcmp_z<"s", 10>;
- defm MVE_VCLTZ : unpred_vcmp_z<"s", 11>;
- defm MVE_VCGTZ : unpred_vcmp_z<"s", 12>;
- defm MVE_VCLEZ : unpred_vcmp_z<"s", 13>;
- defm MVE_VCGTUZ : unpred_vcmp_z<"u", 8>;
- defm MVE_VCGEUZ : unpred_vcmp_z<"u", 2>;
-
- defm MVE_VCEQ : unpred_vcmp_r<"i", 0>;
- defm MVE_VCNE : unpred_vcmp_r<"i", 1>;
- defm MVE_VCGE : unpred_vcmp_r<"s", 10>;
- defm MVE_VCLT : unpred_vcmp_r<"s", 11>;
- defm MVE_VCGT : unpred_vcmp_r<"s", 12>;
- defm MVE_VCLE : unpred_vcmp_r<"s", 13>;
- defm MVE_VCGTU : unpred_vcmp_r<"u", 8>;
- defm MVE_VCGEU : unpred_vcmp_r<"u", 2>;
+ defm MVE_VCEQZ : unpred_vcmp_z<"i", ARMCCeq>;
+ defm MVE_VCNEZ : unpred_vcmp_z<"i", ARMCCne>;
+ defm MVE_VCGEZ : unpred_vcmp_z<"s", ARMCCge>;
+ defm MVE_VCLTZ : unpred_vcmp_z<"s", ARMCClt>;
+ defm MVE_VCGTZ : unpred_vcmp_z<"s", ARMCCgt>;
+ defm MVE_VCLEZ : unpred_vcmp_z<"s", ARMCCle>;
+ defm MVE_VCGTUZ : unpred_vcmp_z<"u", ARMCChi>;
+ defm MVE_VCGEUZ : unpred_vcmp_z<"u", ARMCChs>;
+
+ defm MVE_VCEQ : unpred_vcmp_r<"i", ARMCCeq>;
+ defm MVE_VCNE : unpred_vcmp_r<"i", ARMCCne>;
+ defm MVE_VCGE : unpred_vcmp_r<"s", ARMCCge>;
+ defm MVE_VCLT : unpred_vcmp_r<"s", ARMCClt>;
+ defm MVE_VCGT : unpred_vcmp_r<"s", ARMCCgt>;
+ defm MVE_VCLE : unpred_vcmp_r<"s", ARMCCle>;
+ defm MVE_VCGTU : unpred_vcmp_r<"u", ARMCChi>;
+ defm MVE_VCGEU : unpred_vcmp_r<"u", ARMCChs>;
}
let Predicates = [HasMVEFloat] in {
- defm MVE_VFCEQZ : unpred_vcmpf_z<0>;
- defm MVE_VFCNEZ : unpred_vcmpf_z<1>;
- defm MVE_VFCGEZ : unpred_vcmpf_z<10>;
- defm MVE_VFCLTZ : unpred_vcmpf_z<11>;
- defm MVE_VFCGTZ : unpred_vcmpf_z<12>;
- defm MVE_VFCLEZ : unpred_vcmpf_z<13>;
+ defm MVE_VFCEQZ : unpred_vcmpf_z<ARMCCeq>;
+ defm MVE_VFCNEZ : unpred_vcmpf_z<ARMCCne>;
+ defm MVE_VFCGEZ : unpred_vcmpf_z<ARMCCge>;
+ defm MVE_VFCLTZ : unpred_vcmpf_z<ARMCClt>;
+ defm MVE_VFCGTZ : unpred_vcmpf_z<ARMCCgt>;
+ defm MVE_VFCLEZ : unpred_vcmpf_z<ARMCCle>;
- defm MVE_VFCEQ : unpred_vcmpf_r<0>;
- defm MVE_VFCNE : unpred_vcmpf_r<1>;
- defm MVE_VFCGE : unpred_vcmpf_r<10>;
- defm MVE_VFCLT : unpred_vcmpf_r<11>;
- defm MVE_VFCGT : unpred_vcmpf_r<12>;
- defm MVE_VFCLE : unpred_vcmpf_r<13>;
+ defm MVE_VFCEQ : unpred_vcmpf_r<ARMCCeq>;
+ defm MVE_VFCNE : unpred_vcmpf_r<ARMCCne>;
+ defm MVE_VFCGE : unpred_vcmpf_r<ARMCCge>;
+ defm MVE_VFCLT : unpred_vcmpf_r<ARMCClt>;
+ defm MVE_VFCGT : unpred_vcmpf_r<ARMCCgt>;
+ defm MVE_VFCLE : unpred_vcmpf_r<ARMCCle>;
}
@@ -3403,10 +3989,10 @@ defm MVE_VQDMLSDHX : MVE_VQxDMLxDH_multi<"vqdmlsdhx", 0b1, 0b0, 0b1>;
defm MVE_VQRDMLSDH : MVE_VQxDMLxDH_multi<"vqrdmlsdh", 0b0, 0b1, 0b1>;
defm MVE_VQRDMLSDHX : MVE_VQxDMLxDH_multi<"vqrdmlsdhx", 0b1, 0b1, 0b1>;
-class MVE_VCMUL<string iname, string suffix, bit size, string cstr="", list<dag> pattern=[]>
+class MVE_VCMUL<string iname, string suffix, bit size, string cstr="">
: MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd),
(ins MQPR:$Qn, MQPR:$Qm, complexrotateop:$rot),
- "$Qd, $Qn, $Qm, $rot", vpred_r, cstr, pattern> {
+ "$Qd, $Qn, $Qm, $rot", vpred_r, cstr, []> {
bits<4> Qn;
bits<2> rot;
@@ -3422,8 +4008,30 @@ class MVE_VCMUL<string iname, string suffix, bit size, string cstr="", list<dag>
let Predicates = [HasMVEFloat];
}
-def MVE_VCMULf16 : MVE_VCMUL<"vcmul", "f16", 0b0>;
-def MVE_VCMULf32 : MVE_VCMUL<"vcmul", "f32", 0b1, "@earlyclobber $Qd">;
+multiclass MVE_VCMUL_m<string iname, MVEVectorVTInfo VTI,
+ bit size, string cstr=""> {
+ def "" : MVE_VCMUL<iname, VTI.Suffix, size, cstr>;
+ defvar Inst = !cast<Instruction>(NAME);
+
+ let Predicates = [HasMVEFloat] in {
+ def : Pat<(VTI.Vec (int_arm_mve_vcmulq
+ imm:$rot, (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm),
+ imm:$rot))>;
+
+ def : Pat<(VTI.Vec (int_arm_mve_vcmulq_predicated
+ imm:$rot, (VTI.Vec MQPR:$inactive),
+ (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm),
+ (VTI.Pred VCCR:$mask))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm),
+ imm:$rot, ARMVCCThen, (VTI.Pred VCCR:$mask),
+ (VTI.Vec MQPR:$inactive)))>;
+
+ }
+}
+
+defm MVE_VCMULf16 : MVE_VCMUL_m<"vcmul", MVE_v8f16, 0b0>;
+defm MVE_VCMULf32 : MVE_VCMUL_m<"vcmul", MVE_v4f32, 0b1, "@earlyclobber $Qd">;
class MVE_VMULL<string iname, string suffix, bit bit_28, bits<2> bits_21_20,
bit T, string cstr, list<dag> pattern=[]>
@@ -3442,29 +4050,80 @@ class MVE_VMULL<string iname, string suffix, bit bit_28, bits<2> bits_21_20,
let Inst{8} = 0b0;
let Inst{7} = Qn{3};
let Inst{0} = 0b0;
+ let validForTailPredication = 1;
}
-multiclass MVE_VMULL_multi<string iname, string suffix,
- bit bit_28, bits<2> bits_21_20, string cstr=""> {
- def bh : MVE_VMULL<iname # "b", suffix, bit_28, bits_21_20, 0b0, cstr>;
- def th : MVE_VMULL<iname # "t", suffix, bit_28, bits_21_20, 0b1, cstr>;
+multiclass MVE_VMULL_m<MVEVectorVTInfo VTI,
+ SDNode unpred_op, Intrinsic pred_int,
+ bit Top, string cstr=""> {
+ def "" : MVE_VMULL<"vmull" # !if(Top, "t", "b"), VTI.Suffix, VTI.Unsigned,
+ VTI.Size, Top, cstr>;
+ defvar Inst = !cast<Instruction>(NAME);
+
+ let Predicates = [HasMVEInt] in {
+ defvar uflag = !if(!eq(VTI.SuffixLetter, "p"), (?), (? (i32 VTI.Unsigned)));
+
+ // Unpredicated multiply
+ def : Pat<(VTI.DblVec !con((unpred_op (VTI.Vec MQPR:$Qm),
+ (VTI.Vec MQPR:$Qn)),
+ uflag, (? (i32 Top)))),
+ (VTI.DblVec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
+
+ // Predicated multiply
+ def : Pat<(VTI.DblVec !con((pred_int (VTI.Vec MQPR:$Qm),
+ (VTI.Vec MQPR:$Qn)),
+ uflag, (? (i32 Top), (VTI.Pred VCCR:$mask),
+ (VTI.DblVec MQPR:$inactive)))),
+ (VTI.DblVec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+ ARMVCCThen, (VTI.Pred VCCR:$mask),
+ (VTI.DblVec MQPR:$inactive)))>;
+ }
}
-// For integer multiplies, bits 21:20 encode size, and bit 28 signedness.
-// For polynomial multiplies, bits 21:20 take the unused value 0b11, and
-// bit 28 switches to encoding the size.
-
-defm MVE_VMULLs8 : MVE_VMULL_multi<"vmull", "s8", 0b0, 0b00>;
-defm MVE_VMULLs16 : MVE_VMULL_multi<"vmull", "s16", 0b0, 0b01>;
-defm MVE_VMULLs32 : MVE_VMULL_multi<"vmull", "s32", 0b0, 0b10, "@earlyclobber $Qd">;
-defm MVE_VMULLu8 : MVE_VMULL_multi<"vmull", "u8", 0b1, 0b00>;
-defm MVE_VMULLu16 : MVE_VMULL_multi<"vmull", "u16", 0b1, 0b01>;
-defm MVE_VMULLu32 : MVE_VMULL_multi<"vmull", "u32", 0b1, 0b10, "@earlyclobber $Qd">;
-defm MVE_VMULLp8 : MVE_VMULL_multi<"vmull", "p8", 0b0, 0b11>;
-defm MVE_VMULLp16 : MVE_VMULL_multi<"vmull", "p16", 0b1, 0b11>;
-
-class MVE_VxMULH<string iname, string suffix, bit U, bits<2> size,
- bit round, list<dag> pattern=[]>
+// For polynomial multiplies, the size bits take the unused value 0b11, and
+// the unsigned bit switches to encoding the size.
+
+defm MVE_VMULLBs8 : MVE_VMULL_m<MVE_v16s8, int_arm_mve_vmull,
+ int_arm_mve_mull_int_predicated, 0b0>;
+defm MVE_VMULLTs8 : MVE_VMULL_m<MVE_v16s8, int_arm_mve_vmull,
+ int_arm_mve_mull_int_predicated, 0b1>;
+defm MVE_VMULLBs16 : MVE_VMULL_m<MVE_v8s16, int_arm_mve_vmull,
+ int_arm_mve_mull_int_predicated, 0b0>;
+defm MVE_VMULLTs16 : MVE_VMULL_m<MVE_v8s16, int_arm_mve_vmull,
+ int_arm_mve_mull_int_predicated, 0b1>;
+defm MVE_VMULLBs32 : MVE_VMULL_m<MVE_v4s32, int_arm_mve_vmull,
+ int_arm_mve_mull_int_predicated, 0b0,
+ "@earlyclobber $Qd">;
+defm MVE_VMULLTs32 : MVE_VMULL_m<MVE_v4s32, int_arm_mve_vmull,
+ int_arm_mve_mull_int_predicated, 0b1,
+ "@earlyclobber $Qd">;
+
+defm MVE_VMULLBu8 : MVE_VMULL_m<MVE_v16u8, int_arm_mve_vmull,
+ int_arm_mve_mull_int_predicated, 0b0>;
+defm MVE_VMULLTu8 : MVE_VMULL_m<MVE_v16u8, int_arm_mve_vmull,
+ int_arm_mve_mull_int_predicated, 0b1>;
+defm MVE_VMULLBu16 : MVE_VMULL_m<MVE_v8u16, int_arm_mve_vmull,
+ int_arm_mve_mull_int_predicated, 0b0>;
+defm MVE_VMULLTu16 : MVE_VMULL_m<MVE_v8u16, int_arm_mve_vmull,
+ int_arm_mve_mull_int_predicated, 0b1>;
+defm MVE_VMULLBu32 : MVE_VMULL_m<MVE_v4u32, int_arm_mve_vmull,
+ int_arm_mve_mull_int_predicated, 0b0,
+ "@earlyclobber $Qd">;
+defm MVE_VMULLTu32 : MVE_VMULL_m<MVE_v4u32, int_arm_mve_vmull,
+ int_arm_mve_mull_int_predicated, 0b1,
+ "@earlyclobber $Qd">;
+
+defm MVE_VMULLBp8 : MVE_VMULL_m<MVE_v16p8, int_arm_mve_vmull_poly,
+ int_arm_mve_mull_poly_predicated, 0b0>;
+defm MVE_VMULLTp8 : MVE_VMULL_m<MVE_v16p8, int_arm_mve_vmull_poly,
+ int_arm_mve_mull_poly_predicated, 0b1>;
+defm MVE_VMULLBp16 : MVE_VMULL_m<MVE_v8p16, int_arm_mve_vmull_poly,
+ int_arm_mve_mull_poly_predicated, 0b0>;
+defm MVE_VMULLTp16 : MVE_VMULL_m<MVE_v8p16, int_arm_mve_vmull_poly,
+ int_arm_mve_mull_poly_predicated, 0b1>;
+
+class MVE_VxMULH<string iname, string suffix, bit U, bits<2> size, bit round,
+ list<dag> pattern=[]>
: MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd),
(ins MQPR:$Qn, MQPR:$Qm), "$Qd, $Qn, $Qm",
vpred_r, "", pattern> {
@@ -3480,19 +4139,46 @@ class MVE_VxMULH<string iname, string suffix, bit U, bits<2> size,
let Inst{0} = 0b1;
}
-def MVE_VMULHs8 : MVE_VxMULH<"vmulh", "s8", 0b0, 0b00, 0b0>;
-def MVE_VMULHs16 : MVE_VxMULH<"vmulh", "s16", 0b0, 0b01, 0b0>;
-def MVE_VMULHs32 : MVE_VxMULH<"vmulh", "s32", 0b0, 0b10, 0b0>;
-def MVE_VMULHu8 : MVE_VxMULH<"vmulh", "u8", 0b1, 0b00, 0b0>;
-def MVE_VMULHu16 : MVE_VxMULH<"vmulh", "u16", 0b1, 0b01, 0b0>;
-def MVE_VMULHu32 : MVE_VxMULH<"vmulh", "u32", 0b1, 0b10, 0b0>;
+multiclass MVE_VxMULH_m<string iname, MVEVectorVTInfo VTI, SDNode unpred_op,
+ Intrinsic pred_int, bit round> {
+ def "" : MVE_VxMULH<iname, VTI.Suffix, VTI.Unsigned, VTI.Size, round>;
+ defvar Inst = !cast<Instruction>(NAME);
+
+ let Predicates = [HasMVEInt] in {
+ // Unpredicated multiply returning high bits
+ def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+ (i32 VTI.Unsigned))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
+
+ // Predicated multiply returning high bits
+ def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+ (i32 VTI.Unsigned), (VTI.Pred VCCR:$mask),
+ (VTI.Vec MQPR:$inactive))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+ ARMVCCThen, (VTI.Pred VCCR:$mask),
+ (VTI.Vec MQPR:$inactive)))>;
+ }
+}
+
+multiclass MVE_VMULT<string iname, MVEVectorVTInfo VTI, bit round>
+ : MVE_VxMULH_m<iname, VTI, !if(round, int_arm_mve_vrmulh, int_arm_mve_vmulh),
+ !if(round, int_arm_mve_rmulh_predicated,
+ int_arm_mve_mulh_predicated),
+ round>;
+
+defm MVE_VMULHs8 : MVE_VMULT<"vmulh", MVE_v16s8, 0b0>;
+defm MVE_VMULHs16 : MVE_VMULT<"vmulh", MVE_v8s16, 0b0>;
+defm MVE_VMULHs32 : MVE_VMULT<"vmulh", MVE_v4s32, 0b0>;
+defm MVE_VMULHu8 : MVE_VMULT<"vmulh", MVE_v16u8, 0b0>;
+defm MVE_VMULHu16 : MVE_VMULT<"vmulh", MVE_v8u16, 0b0>;
+defm MVE_VMULHu32 : MVE_VMULT<"vmulh", MVE_v4u32, 0b0>;
-def MVE_VRMULHs8 : MVE_VxMULH<"vrmulh", "s8", 0b0, 0b00, 0b1>;
-def MVE_VRMULHs16 : MVE_VxMULH<"vrmulh", "s16", 0b0, 0b01, 0b1>;
-def MVE_VRMULHs32 : MVE_VxMULH<"vrmulh", "s32", 0b0, 0b10, 0b1>;
-def MVE_VRMULHu8 : MVE_VxMULH<"vrmulh", "u8", 0b1, 0b00, 0b1>;
-def MVE_VRMULHu16 : MVE_VxMULH<"vrmulh", "u16", 0b1, 0b01, 0b1>;
-def MVE_VRMULHu32 : MVE_VxMULH<"vrmulh", "u32", 0b1, 0b10, 0b1>;
+defm MVE_VRMULHs8 : MVE_VMULT<"vrmulh", MVE_v16s8, 0b1>;
+defm MVE_VRMULHs16 : MVE_VMULT<"vrmulh", MVE_v8s16, 0b1>;
+defm MVE_VRMULHs32 : MVE_VMULT<"vrmulh", MVE_v4s32, 0b1>;
+defm MVE_VRMULHu8 : MVE_VMULT<"vrmulh", MVE_v16u8, 0b1>;
+defm MVE_VRMULHu16 : MVE_VMULT<"vrmulh", MVE_v8u16, 0b1>;
+defm MVE_VRMULHu32 : MVE_VMULT<"vrmulh", MVE_v4u32, 0b1>;
class MVE_VxMOVxN<string iname, string suffix, bit bit_28, bit bit_17,
bits<2> size, bit T, list<dag> pattern=[]>
@@ -3551,19 +4237,36 @@ class MVE_VCVT_ff<string iname, string suffix, bit op, bit T,
let Predicates = [HasMVEFloat];
}
-multiclass MVE_VCVT_ff_halves<string suffix, bit op> {
- def bh : MVE_VCVT_ff<"vcvtb", suffix, op, 0b0>;
- def th : MVE_VCVT_ff<"vcvtt", suffix, op, 0b1>;
+multiclass MVE_VCVT_f2h_m<string iname, int half> {
+ def "": MVE_VCVT_ff<iname, "f16.f32", 0b0, half>;
+ defvar Inst = !cast<Instruction>(NAME);
+
+ let Predicates = [HasMVEFloat] in {
+ def : Pat<(v8f16 (int_arm_mve_vcvt_narrow
+ (v8f16 MQPR:$Qd_src), (v4f32 MQPR:$Qm), (i32 half))),
+ (v8f16 (Inst (v8f16 MQPR:$Qd_src), (v4f32 MQPR:$Qm)))>;
+ def : Pat<(v8f16 (int_arm_mve_vcvt_narrow_predicated
+ (v8f16 MQPR:$Qd_src), (v4f32 MQPR:$Qm), (i32 half),
+ (v4i1 VCCR:$mask))),
+ (v8f16 (Inst (v8f16 MQPR:$Qd_src), (v4f32 MQPR:$Qm),
+ ARMVCCThen, (v4i1 VCCR:$mask)))>;
+ }
}
-defm MVE_VCVTf16f32 : MVE_VCVT_ff_halves<"f16.f32", 0b0>;
-defm MVE_VCVTf32f16 : MVE_VCVT_ff_halves<"f32.f16", 0b1>;
+multiclass MVE_VCVT_h2f_m<string iname, int half> {
+ def "": MVE_VCVT_ff<iname, "f32.f16", 0b1, half>;
+}
+
+defm MVE_VCVTf16f32bh : MVE_VCVT_f2h_m<"vcvtb", 0b0>;
+defm MVE_VCVTf16f32th : MVE_VCVT_f2h_m<"vcvtt", 0b1>;
+defm MVE_VCVTf32f16bh : MVE_VCVT_h2f_m<"vcvtb", 0b0>;
+defm MVE_VCVTf32f16th : MVE_VCVT_h2f_m<"vcvtt", 0b1>;
class MVE_VxCADD<string iname, string suffix, bits<2> size, bit halve,
- string cstr="", list<dag> pattern=[]>
+ string cstr="">
: MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd),
(ins MQPR:$Qn, MQPR:$Qm, complexrotateopodd:$rot),
- "$Qd, $Qn, $Qm, $rot", vpred_r, cstr, pattern> {
+ "$Qd, $Qn, $Qm, $rot", vpred_r, cstr, []> {
bits<4> Qn;
bit rot;
@@ -3577,13 +4280,35 @@ class MVE_VxCADD<string iname, string suffix, bits<2> size, bit halve,
let Inst{0} = 0b0;
}
-def MVE_VCADDi8 : MVE_VxCADD<"vcadd", "i8", 0b00, 0b1>;
-def MVE_VCADDi16 : MVE_VxCADD<"vcadd", "i16", 0b01, 0b1>;
-def MVE_VCADDi32 : MVE_VxCADD<"vcadd", "i32", 0b10, 0b1, "@earlyclobber $Qd">;
+multiclass MVE_VxCADD_m<string iname, MVEVectorVTInfo VTI,
+ bit halve, string cstr=""> {
+ def "" : MVE_VxCADD<iname, VTI.Suffix, VTI.Size, halve, cstr>;
+ defvar Inst = !cast<Instruction>(NAME);
+
+ let Predicates = [HasMVEInt] in {
+ def : Pat<(VTI.Vec (int_arm_mve_vcaddq halve,
+ imm:$rot, (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm),
+ imm:$rot))>;
+
+ def : Pat<(VTI.Vec (int_arm_mve_vcaddq_predicated halve,
+ imm:$rot, (VTI.Vec MQPR:$inactive),
+ (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm),
+ (VTI.Pred VCCR:$mask))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qn), (VTI.Vec MQPR:$Qm),
+ imm:$rot, ARMVCCThen, (VTI.Pred VCCR:$mask),
+ (VTI.Vec MQPR:$inactive)))>;
+
+ }
+}
+
+defm MVE_VCADDi8 : MVE_VxCADD_m<"vcadd", MVE_v16i8, 0b1>;
+defm MVE_VCADDi16 : MVE_VxCADD_m<"vcadd", MVE_v8i16, 0b1>;
+defm MVE_VCADDi32 : MVE_VxCADD_m<"vcadd", MVE_v4i32, 0b1, "@earlyclobber $Qd">;
-def MVE_VHCADDs8 : MVE_VxCADD<"vhcadd", "s8", 0b00, 0b0>;
-def MVE_VHCADDs16 : MVE_VxCADD<"vhcadd", "s16", 0b01, 0b0>;
-def MVE_VHCADDs32 : MVE_VxCADD<"vhcadd", "s32", 0b10, 0b0, "@earlyclobber $Qd">;
+defm MVE_VHCADDs8 : MVE_VxCADD_m<"vhcadd", MVE_v16s8, 0b0>;
+defm MVE_VHCADDs16 : MVE_VxCADD_m<"vhcadd", MVE_v8s16, 0b0>;
+defm MVE_VHCADDs32 : MVE_VxCADD_m<"vhcadd", MVE_v4s32, 0b0, "@earlyclobber $Qd">;
class MVE_VADCSBC<string iname, bit I, bit subtract,
dag carryin, list<dag> pattern=[]>
@@ -3627,6 +4352,7 @@ class MVE_VQDMULL<string iname, string suffix, bit size, bit T,
let Inst{8} = 0b1;
let Inst{7} = Qn{3};
let Inst{0} = 0b1;
+ let validForTailPredication = 1;
}
multiclass MVE_VQDMULL_halves<string suffix, bit size, string cstr=""> {
@@ -3742,6 +4468,7 @@ class MVE_VQDMULL_qr<string iname, string suffix, bit size,
let Inst{12} = T;
let Inst{8} = 0b1;
let Inst{5} = 0b1;
+ let validForTailPredication = 1;
}
multiclass MVE_VQDMULL_qr_halves<string suffix, bit size, string cstr=""> {
@@ -3804,13 +4531,30 @@ class MVE_VxSHL_qr<string iname, string suffix, bit U, bits<2> size,
let validForTailPredication = 1;
}
+multiclass MVE_VxSHL_qr_p<string iname, MVEVectorVTInfo VTI, bit q, bit r> {
+ def "" : MVE_VxSHL_qr<iname, VTI.Suffix, VTI.Unsigned, VTI.Size, q, r>;
+ defvar Inst = !cast<Instruction>(NAME);
+
+ def : Pat<(VTI.Vec (int_arm_mve_vshl_scalar
+ (VTI.Vec MQPR:$in), (i32 rGPR:$sh),
+ (i32 q), (i32 r), (i32 VTI.Unsigned))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$in), (i32 rGPR:$sh)))>;
+
+ def : Pat<(VTI.Vec (int_arm_mve_vshl_scalar_predicated
+ (VTI.Vec MQPR:$in), (i32 rGPR:$sh),
+ (i32 q), (i32 r), (i32 VTI.Unsigned),
+ (VTI.Pred VCCR:$mask))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$in), (i32 rGPR:$sh),
+ ARMVCCThen, (VTI.Pred VCCR:$mask)))>;
+}
+
multiclass MVE_VxSHL_qr_types<string iname, bit bit_7, bit bit_17> {
- def s8 : MVE_VxSHL_qr<iname, "s8", 0b0, 0b00, bit_7, bit_17>;
- def s16 : MVE_VxSHL_qr<iname, "s16", 0b0, 0b01, bit_7, bit_17>;
- def s32 : MVE_VxSHL_qr<iname, "s32", 0b0, 0b10, bit_7, bit_17>;
- def u8 : MVE_VxSHL_qr<iname, "u8", 0b1, 0b00, bit_7, bit_17>;
- def u16 : MVE_VxSHL_qr<iname, "u16", 0b1, 0b01, bit_7, bit_17>;
- def u32 : MVE_VxSHL_qr<iname, "u32", 0b1, 0b10, bit_7, bit_17>;
+ defm s8 : MVE_VxSHL_qr_p<iname, MVE_v16s8, bit_7, bit_17>;
+ defm s16 : MVE_VxSHL_qr_p<iname, MVE_v8s16, bit_7, bit_17>;
+ defm s32 : MVE_VxSHL_qr_p<iname, MVE_v4s32, bit_7, bit_17>;
+ defm u8 : MVE_VxSHL_qr_p<iname, MVE_v16u8, bit_7, bit_17>;
+ defm u16 : MVE_VxSHL_qr_p<iname, MVE_v8u16, bit_7, bit_17>;
+ defm u32 : MVE_VxSHL_qr_p<iname, MVE_v4u32, bit_7, bit_17>;
}
defm MVE_VSHL_qr : MVE_VxSHL_qr_types<"vshl", 0b0, 0b0>;
@@ -4054,7 +4798,7 @@ def MVE_VDWDUPu16 : MVE_VxWDUP<"vdwdup", "u16", 0b01, 0b1>;
def MVE_VDWDUPu32 : MVE_VxWDUP<"vdwdup", "u32", 0b10, 0b1>;
let hasSideEffects = 1 in
-class MVE_VCTP<string suffix, bits<2> size, list<dag> pattern=[]>
+class MVE_VCTPInst<string suffix, bits<2> size, list<dag> pattern=[]>
: MVE_p<(outs VCCR:$P0), (ins rGPR:$Rn), NoItinerary, "vctp", suffix,
"$Rn", vpred_n, "", pattern> {
bits<4> Rn;
@@ -4072,20 +4816,22 @@ class MVE_VCTP<string suffix, bits<2> size, list<dag> pattern=[]>
let validForTailPredication = 1;
}
-def MVE_VCTP8 : MVE_VCTP<"8", 0b00>;
-def MVE_VCTP16 : MVE_VCTP<"16", 0b01>;
-def MVE_VCTP32 : MVE_VCTP<"32", 0b10>;
-def MVE_VCTP64 : MVE_VCTP<"64", 0b11>;
+multiclass MVE_VCTP<MVEVectorVTInfo VTI, Intrinsic intr> {
+ def "": MVE_VCTPInst<VTI.BitsSuffix, VTI.Size>;
+ defvar Inst = !cast<Instruction>(NAME);
-let Predicates = [HasMVEInt] in {
- def : Pat<(int_arm_vctp8 rGPR:$Rn),
- (v16i1 (MVE_VCTP8 rGPR:$Rn))>;
- def : Pat<(int_arm_vctp16 rGPR:$Rn),
- (v8i1 (MVE_VCTP16 rGPR:$Rn))>;
- def : Pat<(int_arm_vctp32 rGPR:$Rn),
- (v4i1 (MVE_VCTP32 rGPR:$Rn))>;
+ let Predicates = [HasMVEInt] in {
+ def : Pat<(intr rGPR:$Rn), (VTI.Pred (Inst rGPR:$Rn))>;
+ def : Pat<(and (intr rGPR:$Rn), (VTI.Pred VCCR:$mask)),
+ (VTI.Pred (Inst rGPR:$Rn, ARMVCCThen, VCCR:$mask))>;
+ }
}
+defm MVE_VCTP8 : MVE_VCTP<MVE_v16i8, int_arm_mve_vctp8>;
+defm MVE_VCTP16 : MVE_VCTP<MVE_v8i16, int_arm_mve_vctp16>;
+defm MVE_VCTP32 : MVE_VCTP<MVE_v4i32, int_arm_mve_vctp32>;
+defm MVE_VCTP64 : MVE_VCTP<MVE_v2i64, int_arm_mve_vctp64>;
+
// end of mve_qDest_rSrc
// start of coproc mov
@@ -4258,6 +5004,29 @@ foreach wb = [MVE_vldst24_writeback<
"vst" # n.nvecs # stage # "." # s.lanesize>;
}
+multiclass MVE_vst24_patterns<int lanesize, ValueType VT> {
+ foreach stage = [0,1] in
+ def : Pat<(int_arm_mve_vst2q i32:$addr,
+ (VT MQPR:$v0), (VT MQPR:$v1), (i32 stage)),
+ (!cast<Instruction>("MVE_VST2"#stage#"_"#lanesize)
+ (REG_SEQUENCE QQPR, VT:$v0, qsub_0, VT:$v1, qsub_1),
+ t2_addr_offset_none:$addr)>;
+
+ foreach stage = [0,1,2,3] in
+ def : Pat<(int_arm_mve_vst4q i32:$addr,
+ (VT MQPR:$v0), (VT MQPR:$v1),
+ (VT MQPR:$v2), (VT MQPR:$v3), (i32 stage)),
+ (!cast<Instruction>("MVE_VST4"#stage#"_"#lanesize)
+ (REG_SEQUENCE QQQQPR, VT:$v0, qsub_0, VT:$v1, qsub_1,
+ VT:$v2, qsub_2, VT:$v3, qsub_3),
+ t2_addr_offset_none:$addr)>;
+}
+defm : MVE_vst24_patterns<8, v16i8>;
+defm : MVE_vst24_patterns<16, v8i16>;
+defm : MVE_vst24_patterns<32, v4i32>;
+defm : MVE_vst24_patterns<16, v8f16>;
+defm : MVE_vst24_patterns<32, v4f32>;
+
// end of MVE interleaving load/store
// start of MVE predicable load/store
@@ -4513,28 +5282,90 @@ class MVE_VLDRSTR_rq_b<MVE_ldst_direction dir, MVE_memsz memsz,
string asm, string suffix, bit U, bits<2> size>
: MVE_VLDRSTR_rq<dir, memsz, U, size, 0, asm, suffix, 0>;
+// Multiclasses wrapping that to add ISel patterns for intrinsics.
+multiclass MVE_VLDR_rq_w<MVE_memsz memsz, list<MVEVectorVTInfo> VTIs> {
+ defm "": MVE_VLDRSTR_rq_w<MVE_ld, memsz, "vldr" # memsz.MnemonicLetter,
+ VTIs[0].Suffix, VTIs[0].Unsigned, VTIs[0].Size>;
+ defvar Inst = !cast<Instruction>(NAME);
+ defvar InstU = !cast<Instruction>(NAME # "_u");
+
+ foreach VTI = VTIs in
+ foreach UnsignedFlag = !if(!eq(VTI.Size, memsz.encoding),
+ [0,1], [VTI.Unsigned]) in {
+ def : Pat<(VTI.Vec (int_arm_mve_vldr_gather_offset GPR:$base, (VTIs[0].Vec MQPR:$offsets), memsz.TypeBits, 0, UnsignedFlag)),
+ (VTI.Vec (InstU GPR:$base, MQPR:$offsets))>;
+ def : Pat<(VTI.Vec (int_arm_mve_vldr_gather_offset GPR:$base, (VTIs[0].Vec MQPR:$offsets), memsz.TypeBits, memsz.shift, UnsignedFlag)),
+ (VTI.Vec (Inst GPR:$base, MQPR:$offsets))>;
+ def : Pat<(VTI.Vec (int_arm_mve_vldr_gather_offset_predicated GPR:$base, (VTIs[0].Vec MQPR:$offsets), memsz.TypeBits, 0, UnsignedFlag, (VTI.Pred VCCR:$pred))),
+ (VTI.Vec (InstU GPR:$base, MQPR:$offsets, ARMVCCThen, VCCR:$pred))>;
+ def : Pat<(VTI.Vec (int_arm_mve_vldr_gather_offset_predicated GPR:$base, (VTIs[0].Vec MQPR:$offsets), memsz.TypeBits, memsz.shift, UnsignedFlag, (VTI.Pred VCCR:$pred))),
+ (VTI.Vec (Inst GPR:$base, MQPR:$offsets, ARMVCCThen, VCCR:$pred))>;
+ }
+}
+multiclass MVE_VLDR_rq_b<list<MVEVectorVTInfo> VTIs> {
+ def "": MVE_VLDRSTR_rq_b<MVE_ld, MVE_memB, "vldrb",
+ VTIs[0].Suffix, VTIs[0].Unsigned, VTIs[0].Size>;
+ defvar Inst = !cast<Instruction>(NAME);
+
+ foreach VTI = VTIs in {
+ def : Pat<(VTI.Vec (int_arm_mve_vldr_gather_offset GPR:$base, (VTIs[0].Vec MQPR:$offsets), 8, 0, VTI.Unsigned)),
+ (VTI.Vec (Inst GPR:$base, MQPR:$offsets))>;
+ def : Pat<(VTI.Vec (int_arm_mve_vldr_gather_offset_predicated GPR:$base, (VTIs[0].Vec MQPR:$offsets), 8, 0, VTI.Unsigned, (VTI.Pred VCCR:$pred))),
+ (VTI.Vec (Inst GPR:$base, MQPR:$offsets, ARMVCCThen, VCCR:$pred))>;
+ }
+}
+multiclass MVE_VSTR_rq_w<MVE_memsz memsz, list<MVEVectorVTInfo> VTIs> {
+ defm "": MVE_VLDRSTR_rq_w<MVE_st, memsz, "vstr" # memsz.MnemonicLetter,
+ VTIs[0].BitsSuffix, 0, VTIs[0].Size>;
+ defvar Inst = !cast<Instruction>(NAME);
+ defvar InstU = !cast<Instruction>(NAME # "_u");
+
+ foreach VTI = VTIs in {
+ def : Pat<(int_arm_mve_vstr_scatter_offset GPR:$base, (VTIs[0].Vec MQPR:$offsets), (VTI.Vec MQPR:$data), memsz.TypeBits, 0),
+ (InstU MQPR:$data, GPR:$base, MQPR:$offsets)>;
+ def : Pat<(int_arm_mve_vstr_scatter_offset GPR:$base, (VTIs[0].Vec MQPR:$offsets), (VTI.Vec MQPR:$data), memsz.TypeBits, memsz.shift),
+ (Inst MQPR:$data, GPR:$base, MQPR:$offsets)>;
+ def : Pat<(int_arm_mve_vstr_scatter_offset_predicated GPR:$base, (VTIs[0].Vec MQPR:$offsets), (VTI.Vec MQPR:$data), memsz.TypeBits, 0, (VTI.Pred VCCR:$pred)),
+ (InstU MQPR:$data, GPR:$base, MQPR:$offsets, ARMVCCThen, VCCR:$pred)>;
+ def : Pat<(int_arm_mve_vstr_scatter_offset_predicated GPR:$base, (VTIs[0].Vec MQPR:$offsets), (VTI.Vec MQPR:$data), memsz.TypeBits, memsz.shift, (VTI.Pred VCCR:$pred)),
+ (Inst MQPR:$data, GPR:$base, MQPR:$offsets, ARMVCCThen, VCCR:$pred)>;
+ }
+}
+multiclass MVE_VSTR_rq_b<list<MVEVectorVTInfo> VTIs> {
+ def "": MVE_VLDRSTR_rq_b<MVE_st, MVE_memB, "vstrb",
+ VTIs[0].BitsSuffix, 0, VTIs[0].Size>;
+ defvar Inst = !cast<Instruction>(NAME);
+
+ foreach VTI = VTIs in {
+ def : Pat<(int_arm_mve_vstr_scatter_offset GPR:$base, (VTIs[0].Vec MQPR:$offsets), (VTI.Vec MQPR:$data), 8, 0),
+ (Inst MQPR:$data, GPR:$base, MQPR:$offsets)>;
+ def : Pat<(int_arm_mve_vstr_scatter_offset_predicated GPR:$base, (VTIs[0].Vec MQPR:$offsets), (VTI.Vec MQPR:$data), 8, 0, (VTI.Pred VCCR:$pred)),
+ (Inst MQPR:$data, GPR:$base, MQPR:$offsets, ARMVCCThen, VCCR:$pred)>;
+ }
+}
+
// Actually define all the loads and stores in this family.
-def MVE_VLDRBU8_rq : MVE_VLDRSTR_rq_b<MVE_ld, MVE_memB, "vldrb","u8", 1,0b00>;
-def MVE_VLDRBU16_rq: MVE_VLDRSTR_rq_b<MVE_ld, MVE_memB, "vldrb","u16", 1,0b01>;
-def MVE_VLDRBS16_rq: MVE_VLDRSTR_rq_b<MVE_ld, MVE_memB, "vldrb","s16", 0,0b01>;
-def MVE_VLDRBU32_rq: MVE_VLDRSTR_rq_b<MVE_ld, MVE_memB, "vldrb","u32", 1,0b10>;
-def MVE_VLDRBS32_rq: MVE_VLDRSTR_rq_b<MVE_ld, MVE_memB, "vldrb","s32", 0,0b10>;
+defm MVE_VLDRBU8_rq : MVE_VLDR_rq_b<[MVE_v16u8,MVE_v16s8]>;
+defm MVE_VLDRBU16_rq: MVE_VLDR_rq_b<[MVE_v8u16]>;
+defm MVE_VLDRBS16_rq: MVE_VLDR_rq_b<[MVE_v8s16]>;
+defm MVE_VLDRBU32_rq: MVE_VLDR_rq_b<[MVE_v4u32]>;
+defm MVE_VLDRBS32_rq: MVE_VLDR_rq_b<[MVE_v4s32]>;
-defm MVE_VLDRHU16_rq: MVE_VLDRSTR_rq_w<MVE_ld, MVE_memH, "vldrh","u16", 1,0b01>;
-defm MVE_VLDRHU32_rq: MVE_VLDRSTR_rq_w<MVE_ld, MVE_memH, "vldrh","u32", 1,0b10>;
-defm MVE_VLDRHS32_rq: MVE_VLDRSTR_rq_w<MVE_ld, MVE_memH, "vldrh","s32", 0,0b10>;
-defm MVE_VLDRWU32_rq: MVE_VLDRSTR_rq_w<MVE_ld, MVE_memW, "vldrw","u32", 1,0b10>;
-defm MVE_VLDRDU64_rq: MVE_VLDRSTR_rq_w<MVE_ld, MVE_memD, "vldrd","u64", 1,0b11>;
+defm MVE_VLDRHU16_rq: MVE_VLDR_rq_w<MVE_memH, [MVE_v8u16,MVE_v8s16,MVE_v8f16]>;
+defm MVE_VLDRHU32_rq: MVE_VLDR_rq_w<MVE_memH, [MVE_v4u32]>;
+defm MVE_VLDRHS32_rq: MVE_VLDR_rq_w<MVE_memH, [MVE_v4s32]>;
+defm MVE_VLDRWU32_rq: MVE_VLDR_rq_w<MVE_memW, [MVE_v4u32,MVE_v4s32,MVE_v4f32]>;
+defm MVE_VLDRDU64_rq: MVE_VLDR_rq_w<MVE_memD, [MVE_v2u64,MVE_v2s64]>;
-def MVE_VSTRB8_rq : MVE_VLDRSTR_rq_b<MVE_st, MVE_memB, "vstrb","8", 0,0b00>;
-def MVE_VSTRB16_rq : MVE_VLDRSTR_rq_b<MVE_st, MVE_memB, "vstrb","16", 0,0b01>;
-def MVE_VSTRB32_rq : MVE_VLDRSTR_rq_b<MVE_st, MVE_memB, "vstrb","32", 0,0b10>;
+defm MVE_VSTRB8_rq : MVE_VSTR_rq_b<[MVE_v16i8]>;
+defm MVE_VSTRB16_rq : MVE_VSTR_rq_b<[MVE_v8i16]>;
+defm MVE_VSTRB32_rq : MVE_VSTR_rq_b<[MVE_v4i32]>;
-defm MVE_VSTRH16_rq : MVE_VLDRSTR_rq_w<MVE_st, MVE_memH, "vstrh","16", 0,0b01>;
-defm MVE_VSTRH32_rq : MVE_VLDRSTR_rq_w<MVE_st, MVE_memH, "vstrh","32", 0,0b10>;
-defm MVE_VSTRW32_rq : MVE_VLDRSTR_rq_w<MVE_st, MVE_memW, "vstrw","32", 0,0b10>;
-defm MVE_VSTRD64_rq : MVE_VLDRSTR_rq_w<MVE_st, MVE_memD, "vstrd","64", 0,0b11>;
+defm MVE_VSTRH16_rq : MVE_VSTR_rq_w<MVE_memH, [MVE_v8i16,MVE_v8f16]>;
+defm MVE_VSTRH32_rq : MVE_VSTR_rq_w<MVE_memH, [MVE_v4i32]>;
+defm MVE_VSTRW32_rq : MVE_VSTR_rq_w<MVE_memW, [MVE_v4i32,MVE_v4f32]>;
+defm MVE_VSTRD64_rq : MVE_VSTR_rq_w<MVE_memD, [MVE_v2i64]>;
// Gather loads / scatter stores whose address operand is of the form
// [Qm,#imm], i.e. a vector containing a full base address for each
@@ -4573,11 +5404,58 @@ multiclass MVE_VLDRSTR_qi_m<MVE_ldst_direction dir, MVE_memsz memsz,
}
}
+// Multiclasses wrapping that one, adding selection patterns for the
+// non-writeback loads and all the stores. (The writeback loads must
+// deliver multiple output values, so they have to be selected by C++
+// code.)
+multiclass MVE_VLDR_qi<MVE_memsz memsz, MVEVectorVTInfo AVTI,
+ list<MVEVectorVTInfo> DVTIs> {
+ defm "" : MVE_VLDRSTR_qi_m<MVE_ld, memsz, "vldr" # memsz.MnemonicLetter,
+ "u" # memsz.TypeBits>;
+ defvar Inst = !cast<Instruction>(NAME);
+
+ foreach DVTI = DVTIs in {
+ def : Pat<(DVTI.Vec (int_arm_mve_vldr_gather_base
+ (AVTI.Vec MQPR:$addr), (i32 imm:$offset))),
+ (DVTI.Vec (Inst (AVTI.Vec MQPR:$addr), (i32 imm:$offset)))>;
+ def : Pat<(DVTI.Vec (int_arm_mve_vldr_gather_base_predicated
+ (AVTI.Vec MQPR:$addr), (i32 imm:$offset), (AVTI.Pred VCCR:$pred))),
+ (DVTI.Vec (Inst (AVTI.Vec MQPR:$addr), (i32 imm:$offset),
+ ARMVCCThen, VCCR:$pred))>;
+ }
+}
+multiclass MVE_VSTR_qi<MVE_memsz memsz, MVEVectorVTInfo AVTI,
+ list<MVEVectorVTInfo> DVTIs> {
+ defm "" : MVE_VLDRSTR_qi_m<MVE_st, memsz, "vstr" # memsz.MnemonicLetter,
+ !cast<string>(memsz.TypeBits)>;
+ defvar Inst = !cast<Instruction>(NAME);
+ defvar InstPre = !cast<Instruction>(NAME # "_pre");
+
+ foreach DVTI = DVTIs in {
+ def : Pat<(int_arm_mve_vstr_scatter_base
+ (AVTI.Vec MQPR:$addr), (i32 imm:$offset), (DVTI.Vec MQPR:$data)),
+ (Inst (DVTI.Vec MQPR:$data), (AVTI.Vec MQPR:$addr),
+ (i32 imm:$offset))>;
+ def : Pat<(int_arm_mve_vstr_scatter_base_predicated
+ (AVTI.Vec MQPR:$addr), (i32 imm:$offset), (DVTI.Vec MQPR:$data), (AVTI.Pred VCCR:$pred)),
+ (Inst (DVTI.Vec MQPR:$data), (AVTI.Vec MQPR:$addr),
+ (i32 imm:$offset), ARMVCCThen, VCCR:$pred)>;
+ def : Pat<(AVTI.Vec (int_arm_mve_vstr_scatter_base_wb
+ (AVTI.Vec MQPR:$addr), (i32 imm:$offset), (DVTI.Vec MQPR:$data))),
+ (AVTI.Vec (InstPre (DVTI.Vec MQPR:$data), (AVTI.Vec MQPR:$addr),
+ (i32 imm:$offset)))>;
+ def : Pat<(AVTI.Vec (int_arm_mve_vstr_scatter_base_wb_predicated
+ (AVTI.Vec MQPR:$addr), (i32 imm:$offset), (DVTI.Vec MQPR:$data), (AVTI.Pred VCCR:$pred))),
+ (AVTI.Vec (InstPre (DVTI.Vec MQPR:$data), (AVTI.Vec MQPR:$addr),
+ (i32 imm:$offset), ARMVCCThen, VCCR:$pred))>;
+ }
+}
+
// Actual instruction definitions.
-defm MVE_VLDRWU32_qi: MVE_VLDRSTR_qi_m<MVE_ld, MVE_memW, "vldrw", "u32">;
-defm MVE_VLDRDU64_qi: MVE_VLDRSTR_qi_m<MVE_ld, MVE_memD, "vldrd", "u64">;
-defm MVE_VSTRW32_qi: MVE_VLDRSTR_qi_m<MVE_st, MVE_memW, "vstrw", "32">;
-defm MVE_VSTRD64_qi: MVE_VLDRSTR_qi_m<MVE_st, MVE_memD, "vstrd", "64">;
+defm MVE_VLDRWU32_qi: MVE_VLDR_qi<MVE_memW, MVE_v4i32, [MVE_v4i32,MVE_v4f32]>;
+defm MVE_VLDRDU64_qi: MVE_VLDR_qi<MVE_memD, MVE_v2i64, [MVE_v2i64,MVE_v2f64]>;
+defm MVE_VSTRW32_qi: MVE_VSTR_qi<MVE_memW, MVE_v4i32, [MVE_v4i32,MVE_v4f32]>;
+defm MVE_VSTRD64_qi: MVE_VSTR_qi<MVE_memD, MVE_v2i64, [MVE_v2i64,MVE_v2f64]>;
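For reference, a defm such as MVE_VLDRWU32_qi above pulls in the MVE_VLDRSTR_qi_m instruction and then adds one pair of selection patterns per destination type; its v4i32 entry comes out roughly as in this sketch (a hand expansion of the multiclass, taking MVE_v4i32's predicate type to be v4i1, not literal tblgen output):

def : Pat<(v4i32 (int_arm_mve_vldr_gather_base
                     (v4i32 MQPR:$addr), (i32 imm:$offset))),
          (v4i32 (MVE_VLDRWU32_qi (v4i32 MQPR:$addr), (i32 imm:$offset)))>;
def : Pat<(v4i32 (int_arm_mve_vldr_gather_base_predicated
                     (v4i32 MQPR:$addr), (i32 imm:$offset), (v4i1 VCCR:$pred))),
          (v4i32 (MVE_VLDRWU32_qi (v4i32 MQPR:$addr), (i32 imm:$offset),
                                  ARMVCCThen, VCCR:$pred))>;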
// Define aliases for all the instructions where memory size and
// vector lane size are the same. These are mnemonic aliases, so they
@@ -4595,21 +5473,21 @@ defm MVE_VSTRD64_qi: MVE_VLDRSTR_qi_m<MVE_st, MVE_memD, "vstrd", "64">;
foreach vpt_cond = ["", "t", "e"] in
foreach memsz = [MVE_memB, MVE_memH, MVE_memW, MVE_memD] in
foreach suffix = memsz.suffixes in {
+ // Define an alias with every suffix in the list, except for the one
+ // used by the real Instruction record (i.e. the one that all the
+ // rest are aliases *for*).
+
+ if !ne(suffix, memsz.CanonLoadSuffix) then {
+ def : MnemonicAlias<
+ "vldr" # memsz.MnemonicLetter # vpt_cond # suffix,
+ "vldr" # memsz.MnemonicLetter # vpt_cond # memsz.CanonLoadSuffix>;
+ }
- // These foreaches are conceptually ifs, implemented by iterating a
- // dummy variable over a list with 0 or 1 elements depending on the
- // condition. The idea is to iterate over _nearly_ all the suffixes
- // in memsz.suffixes, but omit the one we want all the others to alias.
-
- foreach _ = !if(!ne(suffix, memsz.CanonLoadSuffix), [1], []<int>) in
- def : MnemonicAlias<
- "vldr" # memsz.MnemonicLetter # vpt_cond # suffix,
- "vldr" # memsz.MnemonicLetter # vpt_cond # memsz.CanonLoadSuffix>;
-
- foreach _ = !if(!ne(suffix, memsz.CanonStoreSuffix), [1], []<int>) in
- def : MnemonicAlias<
- "vstr" # memsz.MnemonicLetter # vpt_cond # suffix,
- "vstr" # memsz.MnemonicLetter # vpt_cond # memsz.CanonStoreSuffix>;
+ if !ne(suffix, memsz.CanonStoreSuffix) then {
+ def : MnemonicAlias<
+ "vstr" # memsz.MnemonicLetter # vpt_cond # suffix,
+ "vstr" # memsz.MnemonicLetter # vpt_cond # memsz.CanonStoreSuffix>;
+ }
}
// end of MVE predicable load/store
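As an illustration of the alias generation (assuming MVE_memB, defined earlier in the file, lists the suffixes .8/.u8/.s8 with .u8 as the canonical load suffix and .8 as the canonical store suffix), the iteration with vpt_cond = "t" and suffix = ".s8" would emit roughly:

def : MnemonicAlias<"vldrbt.s8", "vldrbt.u8">;
def : MnemonicAlias<"vstrbt.s8", "vstrbt.8">;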
@@ -4632,7 +5510,6 @@ class MVE_VPT<string suffix, bits<2> size, dag iops, string asm, list<dag> patte
let Inst{4} = 0b0;
let Defs = [VPR];
- let validForTailPredication = 1;
}
class MVE_VPTt1<string suffix, bits<2> size, dag iops>
@@ -4644,7 +5521,6 @@ class MVE_VPTt1<string suffix, bits<2> size, dag iops>
let Inst{5} = Qm{3};
let Inst{3-1} = Qm{2-0};
let Inst{0} = fc{1};
- let validForTailPredication = 1;
}
class MVE_VPTt1i<string suffix, bits<2> size>
@@ -4746,7 +5622,6 @@ class MVE_VPTf<string suffix, bit size, dag iops, string asm, list<dag> pattern=
let Defs = [VPR];
let Predicates = [HasMVEFloat];
- let validForTailPredication = 1;
}
class MVE_VPTft1<string suffix, bit size>
@@ -4816,7 +5691,6 @@ def MVE_VPSEL : MVE_p<(outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm), NoItinerary,
let Inst{4} = 0b0;
let Inst{3-1} = Qm{2-0};
let Inst{0} = 0b1;
- let validForTailPredication = 1;
}
foreach suffix = ["s8", "s16", "s32", "u8", "u16", "u32",
@@ -4826,87 +5700,87 @@ def : MVEInstAlias<"vpsel${vp}." # suffix # "\t$Qd, $Qn, $Qm",
let Predicates = [HasMVEInt] in {
def : Pat<(v16i8 (vselect (v16i1 VCCR:$pred), (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))),
- (v16i8 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, VCCR:$pred))>;
+ (v16i8 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone, VCCR:$pred))>;
def : Pat<(v8i16 (vselect (v8i1 VCCR:$pred), (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))),
- (v8i16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, VCCR:$pred))>;
+ (v8i16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone, VCCR:$pred))>;
def : Pat<(v4i32 (vselect (v4i1 VCCR:$pred), (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))),
- (v4i32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, VCCR:$pred))>;
+ (v4i32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone, VCCR:$pred))>;
def : Pat<(v8f16 (vselect (v8i1 VCCR:$pred), (v8f16 MQPR:$v1), (v8f16 MQPR:$v2))),
- (v8f16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, VCCR:$pred))>;
+ (v8f16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone, VCCR:$pred))>;
def : Pat<(v4f32 (vselect (v4i1 VCCR:$pred), (v4f32 MQPR:$v1), (v4f32 MQPR:$v2))),
- (v4f32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, VCCR:$pred))>;
+ (v4f32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone, VCCR:$pred))>;
def : Pat<(v16i8 (vselect (v16i8 MQPR:$pred), (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))),
- (v16i8 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0,
- (MVE_VCMPi8 (v16i8 MQPR:$pred), (MVE_VMOVimmi8 0), 1)))>;
+ (v16i8 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone,
+ (MVE_VCMPi8 (v16i8 MQPR:$pred), (MVE_VMOVimmi8 0), ARMCCne)))>;
def : Pat<(v8i16 (vselect (v8i16 MQPR:$pred), (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))),
- (v8i16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0,
- (MVE_VCMPi16 (v8i16 MQPR:$pred), (MVE_VMOVimmi16 0), 1)))>;
+ (v8i16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone,
+ (MVE_VCMPi16 (v8i16 MQPR:$pred), (MVE_VMOVimmi16 0), ARMCCne)))>;
def : Pat<(v4i32 (vselect (v4i32 MQPR:$pred), (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))),
- (v4i32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0,
- (MVE_VCMPi32 (v4i32 MQPR:$pred), (MVE_VMOVimmi32 0), 1)))>;
+ (v4i32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone,
+ (MVE_VCMPi32 (v4i32 MQPR:$pred), (MVE_VMOVimmi32 0), ARMCCne)))>;
def : Pat<(v8f16 (vselect (v8i16 MQPR:$pred), (v8f16 MQPR:$v1), (v8f16 MQPR:$v2))),
- (v8f16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0,
- (MVE_VCMPi16 (v8i16 MQPR:$pred), (MVE_VMOVimmi16 0), 1)))>;
+ (v8f16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone,
+ (MVE_VCMPi16 (v8i16 MQPR:$pred), (MVE_VMOVimmi16 0), ARMCCne)))>;
def : Pat<(v4f32 (vselect (v4i32 MQPR:$pred), (v4f32 MQPR:$v1), (v4f32 MQPR:$v2))),
- (v4f32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0,
- (MVE_VCMPi32 (v4i32 MQPR:$pred), (MVE_VMOVimmi32 0), 1)))>;
+ (v4f32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone,
+ (MVE_VCMPi32 (v4i32 MQPR:$pred), (MVE_VMOVimmi32 0), ARMCCne)))>;
// Pred <-> Int
def : Pat<(v16i8 (zext (v16i1 VCCR:$pred))),
- (v16i8 (MVE_VPSEL (MVE_VMOVimmi8 1), (MVE_VMOVimmi8 0), 0, VCCR:$pred))>;
+ (v16i8 (MVE_VPSEL (MVE_VMOVimmi8 1), (MVE_VMOVimmi8 0), ARMVCCNone, VCCR:$pred))>;
def : Pat<(v8i16 (zext (v8i1 VCCR:$pred))),
- (v8i16 (MVE_VPSEL (MVE_VMOVimmi16 1), (MVE_VMOVimmi16 0), 0, VCCR:$pred))>;
+ (v8i16 (MVE_VPSEL (MVE_VMOVimmi16 1), (MVE_VMOVimmi16 0), ARMVCCNone, VCCR:$pred))>;
def : Pat<(v4i32 (zext (v4i1 VCCR:$pred))),
- (v4i32 (MVE_VPSEL (MVE_VMOVimmi32 1), (MVE_VMOVimmi32 0), 0, VCCR:$pred))>;
+ (v4i32 (MVE_VPSEL (MVE_VMOVimmi32 1), (MVE_VMOVimmi32 0), ARMVCCNone, VCCR:$pred))>;
def : Pat<(v16i8 (sext (v16i1 VCCR:$pred))),
- (v16i8 (MVE_VPSEL (MVE_VMOVimmi8 255), (MVE_VMOVimmi8 0), 0, VCCR:$pred))>;
+ (v16i8 (MVE_VPSEL (MVE_VMOVimmi8 255), (MVE_VMOVimmi8 0), ARMVCCNone, VCCR:$pred))>;
def : Pat<(v8i16 (sext (v8i1 VCCR:$pred))),
- (v8i16 (MVE_VPSEL (MVE_VMOVimmi8 255), (MVE_VMOVimmi16 0), 0, VCCR:$pred))>;
+ (v8i16 (MVE_VPSEL (MVE_VMOVimmi8 255), (MVE_VMOVimmi16 0), ARMVCCNone, VCCR:$pred))>;
def : Pat<(v4i32 (sext (v4i1 VCCR:$pred))),
- (v4i32 (MVE_VPSEL (MVE_VMOVimmi8 255), (MVE_VMOVimmi32 0), 0, VCCR:$pred))>;
+ (v4i32 (MVE_VPSEL (MVE_VMOVimmi8 255), (MVE_VMOVimmi32 0), ARMVCCNone, VCCR:$pred))>;
def : Pat<(v16i8 (anyext (v16i1 VCCR:$pred))),
- (v16i8 (MVE_VPSEL (MVE_VMOVimmi8 1), (MVE_VMOVimmi8 0), 0, VCCR:$pred))>;
+ (v16i8 (MVE_VPSEL (MVE_VMOVimmi8 1), (MVE_VMOVimmi8 0), ARMVCCNone, VCCR:$pred))>;
def : Pat<(v8i16 (anyext (v8i1 VCCR:$pred))),
- (v8i16 (MVE_VPSEL (MVE_VMOVimmi16 1), (MVE_VMOVimmi16 0), 0, VCCR:$pred))>;
+ (v8i16 (MVE_VPSEL (MVE_VMOVimmi16 1), (MVE_VMOVimmi16 0), ARMVCCNone, VCCR:$pred))>;
def : Pat<(v4i32 (anyext (v4i1 VCCR:$pred))),
- (v4i32 (MVE_VPSEL (MVE_VMOVimmi32 1), (MVE_VMOVimmi32 0), 0, VCCR:$pred))>;
+ (v4i32 (MVE_VPSEL (MVE_VMOVimmi32 1), (MVE_VMOVimmi32 0), ARMVCCNone, VCCR:$pred))>;
def : Pat<(v16i1 (trunc (v16i8 MQPR:$v1))),
- (v16i1 (MVE_VCMPi32r (v16i8 MQPR:$v1), ZR, 1))>;
+ (v16i1 (MVE_VCMPi32r (v16i8 MQPR:$v1), ZR, ARMCCne))>;
def : Pat<(v8i1 (trunc (v8i16 MQPR:$v1))),
- (v8i1 (MVE_VCMPi32r (v8i16 MQPR:$v1), ZR, 1))>;
+ (v8i1 (MVE_VCMPi32r (v8i16 MQPR:$v1), ZR, ARMCCne))>;
def : Pat<(v4i1 (trunc (v4i32 MQPR:$v1))),
- (v4i1 (MVE_VCMPi32r (v4i32 MQPR:$v1), ZR, 1))>;
+ (v4i1 (MVE_VCMPi32r (v4i32 MQPR:$v1), ZR, ARMCCne))>;
}
let Predicates = [HasMVEFloat] in {
// Pred <-> Float
// 112 is 1.0 in float
def : Pat<(v4f32 (uint_to_fp (v4i1 VCCR:$pred))),
- (v4f32 (MVE_VPSEL (v4f32 (MVE_VMOVimmf32 112)), (v4f32 (MVE_VMOVimmi32 0)), 0, VCCR:$pred))>;
+ (v4f32 (MVE_VPSEL (v4f32 (MVE_VMOVimmf32 112)), (v4f32 (MVE_VMOVimmi32 0)), ARMVCCNone, VCCR:$pred))>;
  // 2620 is 1.0 in half
def : Pat<(v8f16 (uint_to_fp (v8i1 VCCR:$pred))),
- (v8f16 (MVE_VPSEL (v8f16 (MVE_VMOVimmi16 2620)), (v8f16 (MVE_VMOVimmi16 0)), 0, VCCR:$pred))>;
+ (v8f16 (MVE_VPSEL (v8f16 (MVE_VMOVimmi16 2620)), (v8f16 (MVE_VMOVimmi16 0)), ARMVCCNone, VCCR:$pred))>;
// 240 is -1.0 in float
def : Pat<(v4f32 (sint_to_fp (v4i1 VCCR:$pred))),
- (v4f32 (MVE_VPSEL (v4f32 (MVE_VMOVimmf32 240)), (v4f32 (MVE_VMOVimmi32 0)), 0, VCCR:$pred))>;
+ (v4f32 (MVE_VPSEL (v4f32 (MVE_VMOVimmf32 240)), (v4f32 (MVE_VMOVimmi32 0)), ARMVCCNone, VCCR:$pred))>;
// 2748 is -1.0 in half
def : Pat<(v8f16 (sint_to_fp (v8i1 VCCR:$pred))),
- (v8f16 (MVE_VPSEL (v8f16 (MVE_VMOVimmi16 2748)), (v8f16 (MVE_VMOVimmi16 0)), 0, VCCR:$pred))>;
+ (v8f16 (MVE_VPSEL (v8f16 (MVE_VMOVimmi16 2748)), (v8f16 (MVE_VMOVimmi16 0)), ARMVCCNone, VCCR:$pred))>;
def : Pat<(v4i1 (fp_to_uint (v4f32 MQPR:$v1))),
- (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), ZR, 1))>;
+ (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), ZR, ARMCCne))>;
def : Pat<(v8i1 (fp_to_uint (v8f16 MQPR:$v1))),
- (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), ZR, 1))>;
+ (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), ZR, ARMCCne))>;
def : Pat<(v4i1 (fp_to_sint (v4f32 MQPR:$v1))),
- (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), ZR, 1))>;
+ (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), ZR, ARMCCne))>;
def : Pat<(v8i1 (fp_to_sint (v8f16 MQPR:$v1))),
- (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), ZR, 1))>;
+ (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), ZR, ARMCCne))>;
}
def MVE_VPNOT : MVE_p<(outs VCCR:$P0), (ins VCCR:$P0_in), NoItinerary,
@@ -4955,6 +5829,8 @@ class MVE_WLSTP<string asm, bits<2> size>
let Inst{13} = 0b0;
let Inst{11} = label{0};
let Inst{10-1} = label{10-1};
+ let isBranch = 1;
+ let isTerminator = 1;
}
def MVE_DLSTP_8 : MVE_DLSTP<"dlstp.8", 0b00>;
@@ -4983,6 +5859,8 @@ def MVE_LETP : MVE_loltp_end<(outs GPRlr:$LRout),
let Inst{13} = 0b0;
let Inst{11} = label{0};
let Inst{10-1} = label{10-1};
+ let isBranch = 1;
+ let isTerminator = 1;
}
def MVE_LCTP : MVE_loltp_end<(outs), (ins pred:$p), "lctp${p}", ""> {
@@ -4998,61 +5876,7 @@ def MVE_LCTP : MVE_loltp_end<(outs), (ins pred:$p), "lctp${p}", ""> {
// Patterns
//===----------------------------------------------------------------------===//
-class MVE_vector_store_typed<ValueType Ty, Instruction RegImmInst,
- PatFrag StoreKind, int shift>
- : Pat<(StoreKind (Ty MQPR:$val), t2addrmode_imm7<shift>:$addr),
- (RegImmInst (Ty MQPR:$val), t2addrmode_imm7<shift>:$addr)>;
-class MVE_vector_maskedstore_typed<ValueType Ty, Instruction RegImmInst,
- PatFrag StoreKind, int shift>
- : Pat<(StoreKind (Ty MQPR:$val), t2addrmode_imm7<shift>:$addr, VCCR:$pred),
- (RegImmInst (Ty MQPR:$val), t2addrmode_imm7<shift>:$addr, (i32 1), VCCR:$pred)>;
-
-multiclass MVE_vector_store<Instruction RegImmInst, PatFrag StoreKind,
- int shift> {
- def : MVE_vector_store_typed<v16i8, RegImmInst, StoreKind, shift>;
- def : MVE_vector_store_typed<v8i16, RegImmInst, StoreKind, shift>;
- def : MVE_vector_store_typed<v8f16, RegImmInst, StoreKind, shift>;
- def : MVE_vector_store_typed<v4i32, RegImmInst, StoreKind, shift>;
- def : MVE_vector_store_typed<v4f32, RegImmInst, StoreKind, shift>;
- def : MVE_vector_store_typed<v2i64, RegImmInst, StoreKind, shift>;
- def : MVE_vector_store_typed<v2f64, RegImmInst, StoreKind, shift>;
-}
-
-class MVE_vector_load_typed<ValueType Ty, Instruction RegImmInst,
- PatFrag LoadKind, int shift>
- : Pat<(Ty (LoadKind t2addrmode_imm7<shift>:$addr)),
- (Ty (RegImmInst t2addrmode_imm7<shift>:$addr))>;
-class MVE_vector_maskedload_typed<ValueType Ty, Instruction RegImmInst,
- PatFrag LoadKind, int shift>
- : Pat<(Ty (LoadKind t2addrmode_imm7<shift>:$addr, VCCR:$pred, (Ty NEONimmAllZerosV))),
- (Ty (RegImmInst t2addrmode_imm7<shift>:$addr, (i32 1), VCCR:$pred))>;
-
-multiclass MVE_vector_load<Instruction RegImmInst, PatFrag LoadKind,
- int shift> {
- def : MVE_vector_load_typed<v16i8, RegImmInst, LoadKind, shift>;
- def : MVE_vector_load_typed<v8i16, RegImmInst, LoadKind, shift>;
- def : MVE_vector_load_typed<v8f16, RegImmInst, LoadKind, shift>;
- def : MVE_vector_load_typed<v4i32, RegImmInst, LoadKind, shift>;
- def : MVE_vector_load_typed<v4f32, RegImmInst, LoadKind, shift>;
- def : MVE_vector_load_typed<v2i64, RegImmInst, LoadKind, shift>;
- def : MVE_vector_load_typed<v2f64, RegImmInst, LoadKind, shift>;
-}
-
-class MVE_vector_offset_store_typed<ValueType Ty, Instruction Opcode,
- PatFrag StoreKind, int shift>
- : Pat<(StoreKind (Ty MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<shift>:$addr),
- (Opcode MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<shift>:$addr)>;
-
-multiclass MVE_vector_offset_store<Instruction RegImmInst, PatFrag StoreKind,
- int shift> {
- def : MVE_vector_offset_store_typed<v16i8, RegImmInst, StoreKind, shift>;
- def : MVE_vector_offset_store_typed<v8i16, RegImmInst, StoreKind, shift>;
- def : MVE_vector_offset_store_typed<v8f16, RegImmInst, StoreKind, shift>;
- def : MVE_vector_offset_store_typed<v4i32, RegImmInst, StoreKind, shift>;
- def : MVE_vector_offset_store_typed<v4f32, RegImmInst, StoreKind, shift>;
- def : MVE_vector_offset_store_typed<v2i64, RegImmInst, StoreKind, shift>;
- def : MVE_vector_offset_store_typed<v2f64, RegImmInst, StoreKind, shift>;
-}
+// PatFrags for loads and stores, trying to keep the names semi-consistent.
def aligned32_pre_store : PatFrag<(ops node:$val, node:$ptr, node:$offset),
(pre_store node:$val, node:$ptr, node:$offset), [{
@@ -5072,77 +5896,249 @@ def aligned16_post_store : PatFrag<(ops node:$val, node:$ptr, node:$offset),
}]>;
-def maskedload8 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
- (masked_ld node:$ptr, node:$pred, node:$passthru), [{
+def aligned_maskedloadvi8 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
+ (masked_ld node:$ptr, undef, node:$pred, node:$passthru), [{
auto *Ld = cast<MaskedLoadSDNode>(N);
return Ld->getMemoryVT().getScalarType() == MVT::i8;
}]>;
-def sextmaskedload8 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
- (maskedload8 node:$ptr, node:$pred, node:$passthru), [{
+def aligned_sextmaskedloadvi8 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
+ (aligned_maskedloadvi8 node:$ptr, node:$pred, node:$passthru), [{
return cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::SEXTLOAD;
}]>;
-def zextmaskedload8 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
- (maskedload8 node:$ptr, node:$pred, node:$passthru), [{
+def aligned_zextmaskedloadvi8 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
+ (aligned_maskedloadvi8 node:$ptr, node:$pred, node:$passthru), [{
return cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::ZEXTLOAD;
}]>;
-def extmaskedload8 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
- (maskedload8 node:$ptr, node:$pred, node:$passthru), [{
+def aligned_extmaskedloadvi8 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
+ (aligned_maskedloadvi8 node:$ptr, node:$pred, node:$passthru), [{
auto *Ld = cast<MaskedLoadSDNode>(N);
EVT ScalarVT = Ld->getMemoryVT().getScalarType();
return ScalarVT.isInteger() && Ld->getExtensionType() == ISD::EXTLOAD;
}]>;
-def alignedmaskedload16: PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
- (masked_ld node:$ptr, node:$pred, node:$passthru), [{
+def aligned_maskedloadvi16: PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
+ (masked_ld node:$ptr, undef, node:$pred, node:$passthru), [{
auto *Ld = cast<MaskedLoadSDNode>(N);
EVT ScalarVT = Ld->getMemoryVT().getScalarType();
return (ScalarVT == MVT::i16 || ScalarVT == MVT::f16) && Ld->getAlignment() >= 2;
}]>;
-def sextmaskedload16 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
- (alignedmaskedload16 node:$ptr, node:$pred, node:$passthru), [{
+def aligned_sextmaskedloadvi16 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
+ (aligned_maskedloadvi16 node:$ptr, node:$pred, node:$passthru), [{
return cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::SEXTLOAD;
}]>;
-def zextmaskedload16 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
- (alignedmaskedload16 node:$ptr, node:$pred, node:$passthru), [{
+def aligned_zextmaskedloadvi16 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
+ (aligned_maskedloadvi16 node:$ptr, node:$pred, node:$passthru), [{
return cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::ZEXTLOAD;
}]>;
-def extmaskedload16 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
- (alignedmaskedload16 node:$ptr, node:$pred, node:$passthru), [{
+def aligned_extmaskedloadvi16 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
+ (aligned_maskedloadvi16 node:$ptr, node:$pred, node:$passthru), [{
auto *Ld = cast<MaskedLoadSDNode>(N);
EVT ScalarVT = Ld->getMemoryVT().getScalarType();
return ScalarVT.isInteger() && Ld->getExtensionType() == ISD::EXTLOAD;
}]>;
-def alignedmaskedload32: PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
- (masked_ld node:$ptr, node:$pred, node:$passthru), [{
+def aligned_maskedloadvi32: PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
+ (masked_ld node:$ptr, undef, node:$pred, node:$passthru), [{
auto *Ld = cast<MaskedLoadSDNode>(N);
EVT ScalarVT = Ld->getMemoryVT().getScalarType();
return (ScalarVT == MVT::i32 || ScalarVT == MVT::f32) && Ld->getAlignment() >= 4;
}]>;
-def maskedstore8 : PatFrag<(ops node:$val, node:$ptr, node:$pred),
- (masked_st node:$val, node:$ptr, node:$pred), [{
+def aligned_maskedstvi8 : PatFrag<(ops node:$val, node:$ptr, node:$pred),
+ (masked_st node:$val, node:$ptr, undef, node:$pred), [{
return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
}]>;
-def truncatingmaskedstore8 : PatFrag<(ops node:$val, node:$ptr, node:$pred),
- (maskedstore8 node:$val, node:$ptr, node:$pred), [{
- return cast<MaskedStoreSDNode>(N)->isTruncatingStore();
+def aligned_maskedstvi16 : PatFrag<(ops node:$val, node:$ptr, node:$pred),
+ (masked_st node:$val, node:$ptr, undef, node:$pred), [{
+ auto *St = cast<MaskedStoreSDNode>(N);
+ EVT ScalarVT = St->getMemoryVT().getScalarType();
+ return (ScalarVT == MVT::i16 || ScalarVT == MVT::f16) && St->getAlignment() >= 2;
+}]>;
+def aligned_maskedstvi32 : PatFrag<(ops node:$val, node:$ptr, node:$pred),
+ (masked_st node:$val, node:$ptr, undef, node:$pred), [{
+ auto *St = cast<MaskedStoreSDNode>(N);
+ EVT ScalarVT = St->getMemoryVT().getScalarType();
+ return (ScalarVT == MVT::i32 || ScalarVT == MVT::f32) && St->getAlignment() >= 4;
}]>;
-def maskedstore16 : PatFrag<(ops node:$val, node:$ptr, node:$pred),
- (masked_st node:$val, node:$ptr, node:$pred), [{
+
+def pre_maskedstore : PatFrag<(ops node:$val, node:$base, node:$offset, node:$mask),
+ (masked_st node:$val, node:$base, node:$offset, node:$mask), [{
+ ISD::MemIndexedMode AM = cast<MaskedStoreSDNode>(N)->getAddressingMode();
+ return AM == ISD::PRE_INC || AM == ISD::PRE_DEC;
+}]>;
+def post_maskedstore : PatFrag<(ops node:$val, node:$base, node:$offset, node:$mask),
+ (masked_st node:$val, node:$base, node:$offset, node:$mask), [{
+ ISD::MemIndexedMode AM = cast<MaskedStoreSDNode>(N)->getAddressingMode();
+ return AM == ISD::POST_INC || AM == ISD::POST_DEC;
+}]>;
+def aligned_pre_maskedstorevi8 : PatFrag<(ops node:$val, node:$ptr, node:$offset, node:$mask),
+ (pre_maskedstore node:$val, node:$ptr, node:$offset, node:$mask), [{
+ return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
+}]>;
+def aligned_post_maskedstorevi8 : PatFrag<(ops node:$val, node:$ptr, node:$offset, node:$mask),
+ (post_maskedstore node:$val, node:$ptr, node:$offset, node:$mask), [{
+ return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
+}]>;
+def aligned_pre_maskedstorevi16 : PatFrag<(ops node:$val, node:$ptr, node:$offset, node:$mask),
+ (pre_maskedstore node:$val, node:$ptr, node:$offset, node:$mask), [{
auto *St = cast<MaskedStoreSDNode>(N);
EVT ScalarVT = St->getMemoryVT().getScalarType();
return (ScalarVT == MVT::i16 || ScalarVT == MVT::f16) && St->getAlignment() >= 2;
}]>;
+def aligned_post_maskedstorevi16 : PatFrag<(ops node:$val, node:$ptr, node:$offset, node:$mask),
+ (post_maskedstore node:$val, node:$ptr, node:$offset, node:$mask), [{
+ auto *St = cast<MaskedStoreSDNode>(N);
+ EVT ScalarVT = St->getMemoryVT().getScalarType();
+ return (ScalarVT == MVT::i16 || ScalarVT == MVT::f16) && St->getAlignment() >= 2;
+}]>;
+def aligned_pre_maskedstorevi32 : PatFrag<(ops node:$val, node:$ptr, node:$offset, node:$mask),
+ (pre_maskedstore node:$val, node:$ptr, node:$offset, node:$mask), [{
+ auto *St = cast<MaskedStoreSDNode>(N);
+ EVT ScalarVT = St->getMemoryVT().getScalarType();
+ return (ScalarVT == MVT::i32 || ScalarVT == MVT::f32) && St->getAlignment() >= 4;
+}]>;
+def aligned_post_maskedstorevi32 : PatFrag<(ops node:$val, node:$ptr, node:$offset, node:$mask),
+ (post_maskedstore node:$val, node:$ptr, node:$offset, node:$mask), [{
+ auto *St = cast<MaskedStoreSDNode>(N);
+ EVT ScalarVT = St->getMemoryVT().getScalarType();
+ return (ScalarVT == MVT::i32 || ScalarVT == MVT::f32) && St->getAlignment() >= 4;
+}]>;
+
+
+// PatFrags for "Aligned" extending / truncating
-def truncatingmaskedstore16 : PatFrag<(ops node:$val, node:$ptr, node:$pred),
- (maskedstore16 node:$val, node:$ptr, node:$pred), [{
+def aligned_extloadvi8 : PatFrag<(ops node:$ptr), (extloadvi8 node:$ptr)>;
+def aligned_sextloadvi8 : PatFrag<(ops node:$ptr), (sextloadvi8 node:$ptr)>;
+def aligned_zextloadvi8 : PatFrag<(ops node:$ptr), (zextloadvi8 node:$ptr)>;
+
+def aligned_truncstvi8 : PatFrag<(ops node:$val, node:$ptr),
+ (truncstorevi8 node:$val, node:$ptr)>;
+def aligned_post_truncstvi8 : PatFrag<(ops node:$val, node:$base, node:$offset),
+ (post_truncstvi8 node:$val, node:$base, node:$offset)>;
+def aligned_pre_truncstvi8 : PatFrag<(ops node:$val, node:$base, node:$offset),
+ (pre_truncstvi8 node:$val, node:$base, node:$offset)>;
+
+let MinAlignment = 2 in {
+ def aligned_extloadvi16 : PatFrag<(ops node:$ptr), (extloadvi16 node:$ptr)>;
+ def aligned_sextloadvi16 : PatFrag<(ops node:$ptr), (sextloadvi16 node:$ptr)>;
+ def aligned_zextloadvi16 : PatFrag<(ops node:$ptr), (zextloadvi16 node:$ptr)>;
+
+ def aligned_truncstvi16 : PatFrag<(ops node:$val, node:$ptr),
+ (truncstorevi16 node:$val, node:$ptr)>;
+ def aligned_post_truncstvi16 : PatFrag<(ops node:$val, node:$base, node:$offset),
+ (post_truncstvi16 node:$val, node:$base, node:$offset)>;
+ def aligned_pre_truncstvi16 : PatFrag<(ops node:$val, node:$base, node:$offset),
+ (pre_truncstvi16 node:$val, node:$base, node:$offset)>;
+}
+
+def truncmaskedst : PatFrag<(ops node:$val, node:$base, node:$pred),
+ (masked_st node:$val, node:$base, undef, node:$pred), [{
return cast<MaskedStoreSDNode>(N)->isTruncatingStore();
}]>;
-def maskedstore32 : PatFrag<(ops node:$val, node:$ptr, node:$pred),
- (masked_st node:$val, node:$ptr, node:$pred), [{
+def aligned_truncmaskedstvi8 : PatFrag<(ops node:$val, node:$base, node:$pred),
+ (truncmaskedst node:$val, node:$base, node:$pred), [{
+ return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
+}]>;
+def aligned_truncmaskedstvi16 : PatFrag<(ops node:$val, node:$base, node:$pred),
+ (truncmaskedst node:$val, node:$base, node:$pred), [{
auto *St = cast<MaskedStoreSDNode>(N);
EVT ScalarVT = St->getMemoryVT().getScalarType();
- return (ScalarVT == MVT::i32 || ScalarVT == MVT::f32) && St->getAlignment() >= 4;
+ return (ScalarVT == MVT::i16 || ScalarVT == MVT::f16) && St->getAlignment() >= 2;
}]>;
+def pre_truncmaskedst : PatFrag<(ops node:$val, node:$base, node:$offset, node:$pred),
+ (masked_st node:$val, node:$base, node:$offset, node:$pred), [{
+ ISD::MemIndexedMode AM = cast<MaskedStoreSDNode>(N)->getAddressingMode();
+ return cast<MaskedStoreSDNode>(N)->isTruncatingStore() && (AM == ISD::PRE_INC || AM == ISD::PRE_DEC);
+}]>;
+def aligned_pre_truncmaskedstvi8 : PatFrag<(ops node:$val, node:$base, node:$offset, node:$pred),
+ (pre_truncmaskedst node:$val, node:$base, node:$offset, node:$pred), [{
+ return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
+}]>;
+def aligned_pre_truncmaskedstvi16 : PatFrag<(ops node:$val, node:$base, node:$offset, node:$pred),
+ (pre_truncmaskedst node:$val, node:$base, node:$offset, node:$pred), [{
+ auto *St = cast<MaskedStoreSDNode>(N);
+ EVT ScalarVT = St->getMemoryVT().getScalarType();
+ return (ScalarVT == MVT::i16 || ScalarVT == MVT::f16) && St->getAlignment() >= 2;
+}]>;
+def post_truncmaskedst : PatFrag<(ops node:$val, node:$base, node:$offset, node:$postd),
+ (masked_st node:$val, node:$base, node:$offset, node:$postd), [{
+ ISD::MemIndexedMode AM = cast<MaskedStoreSDNode>(N)->getAddressingMode();
+ return cast<MaskedStoreSDNode>(N)->isTruncatingStore() && (AM == ISD::POST_INC || AM == ISD::POST_DEC);
+}]>;
+def aligned_post_truncmaskedstvi8 : PatFrag<(ops node:$val, node:$base, node:$offset, node:$postd),
+ (post_truncmaskedst node:$val, node:$base, node:$offset, node:$postd), [{
+ return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
+}]>;
+def aligned_post_truncmaskedstvi16 : PatFrag<(ops node:$val, node:$base, node:$offset, node:$postd),
+ (post_truncmaskedst node:$val, node:$base, node:$offset, node:$postd), [{
+ auto *St = cast<MaskedStoreSDNode>(N);
+ EVT ScalarVT = St->getMemoryVT().getScalarType();
+ return (ScalarVT == MVT::i16 || ScalarVT == MVT::f16) && St->getAlignment() >= 2;
+}]>;
+
+// Load/store patterns
+
+class MVE_vector_store_typed<ValueType Ty, Instruction RegImmInst,
+ PatFrag StoreKind, int shift>
+ : Pat<(StoreKind (Ty MQPR:$val), t2addrmode_imm7<shift>:$addr),
+ (RegImmInst (Ty MQPR:$val), t2addrmode_imm7<shift>:$addr)>;
+
+class MVE_vector_maskedstore_typed<ValueType Ty, Instruction RegImmInst,
+ PatFrag StoreKind, int shift>
+ : Pat<(StoreKind (Ty MQPR:$val), t2addrmode_imm7<shift>:$addr, VCCR:$pred),
+ (RegImmInst (Ty MQPR:$val), t2addrmode_imm7<shift>:$addr, ARMVCCThen, VCCR:$pred)>;
+
+multiclass MVE_vector_store<Instruction RegImmInst, PatFrag StoreKind,
+ int shift> {
+ def : MVE_vector_store_typed<v16i8, RegImmInst, StoreKind, shift>;
+ def : MVE_vector_store_typed<v8i16, RegImmInst, StoreKind, shift>;
+ def : MVE_vector_store_typed<v8f16, RegImmInst, StoreKind, shift>;
+ def : MVE_vector_store_typed<v4i32, RegImmInst, StoreKind, shift>;
+ def : MVE_vector_store_typed<v4f32, RegImmInst, StoreKind, shift>;
+ def : MVE_vector_store_typed<v2i64, RegImmInst, StoreKind, shift>;
+ def : MVE_vector_store_typed<v2f64, RegImmInst, StoreKind, shift>;
+}
+
+class MVE_vector_load_typed<ValueType Ty, Instruction RegImmInst,
+ PatFrag LoadKind, int shift>
+ : Pat<(Ty (LoadKind t2addrmode_imm7<shift>:$addr)),
+ (Ty (RegImmInst t2addrmode_imm7<shift>:$addr))>;
+
+class MVE_vector_maskedload_typed<ValueType Ty, Instruction RegImmInst,
+ PatFrag LoadKind, int shift>
+ : Pat<(Ty (LoadKind t2addrmode_imm7<shift>:$addr, VCCR:$pred, (Ty NEONimmAllZerosV))),
+ (Ty (RegImmInst t2addrmode_imm7<shift>:$addr, ARMVCCThen, VCCR:$pred))>;
+
+multiclass MVE_vector_load<Instruction RegImmInst, PatFrag LoadKind,
+ int shift> {
+ def : MVE_vector_load_typed<v16i8, RegImmInst, LoadKind, shift>;
+ def : MVE_vector_load_typed<v8i16, RegImmInst, LoadKind, shift>;
+ def : MVE_vector_load_typed<v8f16, RegImmInst, LoadKind, shift>;
+ def : MVE_vector_load_typed<v4i32, RegImmInst, LoadKind, shift>;
+ def : MVE_vector_load_typed<v4f32, RegImmInst, LoadKind, shift>;
+ def : MVE_vector_load_typed<v2i64, RegImmInst, LoadKind, shift>;
+ def : MVE_vector_load_typed<v2f64, RegImmInst, LoadKind, shift>;
+}
+
+class MVE_vector_offset_store_typed<ValueType Ty, Instruction Opcode,
+ PatFrag StoreKind, int shift>
+ : Pat<(StoreKind (Ty MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<shift>:$addr),
+ (Opcode MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<shift>:$addr)>;
+
+class MVE_vector_offset_maskedstore_typed<ValueType Ty, Instruction Opcode,
+ PatFrag StoreKind, int shift>
+ : Pat<(StoreKind (Ty MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<shift>:$addr, VCCR:$pred),
+ (Opcode MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<shift>:$addr, ARMVCCThen, VCCR:$pred)>;
+
+multiclass MVE_vector_offset_store<Instruction RegImmInst, PatFrag StoreKind,
+ int shift> {
+ def : MVE_vector_offset_store_typed<v16i8, RegImmInst, StoreKind, shift>;
+ def : MVE_vector_offset_store_typed<v8i16, RegImmInst, StoreKind, shift>;
+ def : MVE_vector_offset_store_typed<v8f16, RegImmInst, StoreKind, shift>;
+ def : MVE_vector_offset_store_typed<v4i32, RegImmInst, StoreKind, shift>;
+ def : MVE_vector_offset_store_typed<v4f32, RegImmInst, StoreKind, shift>;
+ def : MVE_vector_offset_store_typed<v2i64, RegImmInst, StoreKind, shift>;
+ def : MVE_vector_offset_store_typed<v2f64, RegImmInst, StoreKind, shift>;
+}
+
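Plugging the first masked-store instantiation below into this class, MVE_vector_maskedstore_typed<v16i8, MVE_VSTRBU8, aligned_maskedstvi8, 0> boils down to a pattern of roughly this shape (sketch):

def : Pat<(aligned_maskedstvi8 (v16i8 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
          (MVE_VSTRBU8 (v16i8 MQPR:$val), t2addrmode_imm7<0>:$addr, ARMVCCThen, VCCR:$pred)>;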
let Predicates = [HasMVEInt, IsLE] in {
// Stores
@@ -5220,116 +6216,73 @@ let Predicates = [HasMVEInt, IsBE] in {
let Predicates = [HasMVEInt] in {
// Aligned masked store, shared between LE and BE
- def : MVE_vector_maskedstore_typed<v16i8, MVE_VSTRBU8, maskedstore8, 0>;
- def : MVE_vector_maskedstore_typed<v8i16, MVE_VSTRHU16, maskedstore16, 1>;
- def : MVE_vector_maskedstore_typed<v8f16, MVE_VSTRHU16, maskedstore16, 1>;
- def : MVE_vector_maskedstore_typed<v4i32, MVE_VSTRWU32, maskedstore32, 2>;
- def : MVE_vector_maskedstore_typed<v4f32, MVE_VSTRWU32, maskedstore32, 2>;
- // Truncating stores
- def : Pat<(truncatingmaskedstore8 (v8i16 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
- (MVE_VSTRB16 MQPR:$val, t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
- def : Pat<(truncatingmaskedstore8 (v4i32 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
- (MVE_VSTRB32 MQPR:$val, t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
- def : Pat<(truncatingmaskedstore16 (v4i32 MQPR:$val), t2addrmode_imm7<1>:$addr, VCCR:$pred),
- (MVE_VSTRH32 MQPR:$val, t2addrmode_imm7<1>:$addr, (i32 1), VCCR:$pred)>;
+ def : MVE_vector_maskedstore_typed<v16i8, MVE_VSTRBU8, aligned_maskedstvi8, 0>;
+ def : MVE_vector_maskedstore_typed<v8i16, MVE_VSTRHU16, aligned_maskedstvi16, 1>;
+ def : MVE_vector_maskedstore_typed<v8f16, MVE_VSTRHU16, aligned_maskedstvi16, 1>;
+ def : MVE_vector_maskedstore_typed<v4i32, MVE_VSTRWU32, aligned_maskedstvi32, 2>;
+ def : MVE_vector_maskedstore_typed<v4f32, MVE_VSTRWU32, aligned_maskedstvi32, 2>;
+
+ // Pre/Post inc masked stores
+ def : MVE_vector_offset_maskedstore_typed<v16i8, MVE_VSTRBU8_pre, aligned_pre_maskedstorevi8, 0>;
+ def : MVE_vector_offset_maskedstore_typed<v16i8, MVE_VSTRBU8_post, aligned_post_maskedstorevi8, 0>;
+ def : MVE_vector_offset_maskedstore_typed<v8i16, MVE_VSTRHU16_pre, aligned_pre_maskedstorevi16, 1>;
+ def : MVE_vector_offset_maskedstore_typed<v8i16, MVE_VSTRHU16_post, aligned_post_maskedstorevi16, 1>;
+ def : MVE_vector_offset_maskedstore_typed<v8f16, MVE_VSTRHU16_pre, aligned_pre_maskedstorevi16, 1>;
+ def : MVE_vector_offset_maskedstore_typed<v8f16, MVE_VSTRHU16_post, aligned_post_maskedstorevi16, 1>;
+ def : MVE_vector_offset_maskedstore_typed<v4i32, MVE_VSTRWU32_pre, aligned_pre_maskedstorevi32, 2>;
+ def : MVE_vector_offset_maskedstore_typed<v4i32, MVE_VSTRWU32_post, aligned_post_maskedstorevi32, 2>;
+ def : MVE_vector_offset_maskedstore_typed<v4f32, MVE_VSTRWU32_pre, aligned_pre_maskedstorevi32, 2>;
+ def : MVE_vector_offset_maskedstore_typed<v4f32, MVE_VSTRWU32_post, aligned_post_maskedstorevi32, 2>;
+
// Aligned masked loads
- def : MVE_vector_maskedload_typed<v16i8, MVE_VLDRBU8, maskedload8, 0>;
- def : MVE_vector_maskedload_typed<v8i16, MVE_VLDRHU16, alignedmaskedload16, 1>;
- def : MVE_vector_maskedload_typed<v8f16, MVE_VLDRHU16, alignedmaskedload16, 1>;
- def : MVE_vector_maskedload_typed<v4i32, MVE_VLDRWU32, alignedmaskedload32, 2>;
- def : MVE_vector_maskedload_typed<v4f32, MVE_VLDRWU32, alignedmaskedload32, 2>;
- // Extending masked loads.
- def : Pat<(v8i16 (sextmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred,
- (v8i16 NEONimmAllZerosV))),
- (v8i16 (MVE_VLDRBS16 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
- def : Pat<(v4i32 (sextmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred,
- (v4i32 NEONimmAllZerosV))),
- (v4i32 (MVE_VLDRBS32 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
- def : Pat<(v8i16 (zextmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred,
- (v8i16 NEONimmAllZerosV))),
- (v8i16 (MVE_VLDRBU16 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
- def : Pat<(v4i32 (zextmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred,
- (v4i32 NEONimmAllZerosV))),
- (v4i32 (MVE_VLDRBU32 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
- def : Pat<(v8i16 (extmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred,
- (v8i16 NEONimmAllZerosV))),
- (v8i16 (MVE_VLDRBU16 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
- def : Pat<(v4i32 (extmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred,
- (v4i32 NEONimmAllZerosV))),
- (v4i32 (MVE_VLDRBU32 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
- def : Pat<(v4i32 (sextmaskedload16 t2addrmode_imm7<1>:$addr, VCCR:$pred,
- (v4i32 NEONimmAllZerosV))),
- (v4i32 (MVE_VLDRHS32 t2addrmode_imm7<1>:$addr, (i32 1), VCCR:$pred))>;
- def : Pat<(v4i32 (zextmaskedload16 t2addrmode_imm7<1>:$addr, VCCR:$pred,
- (v4i32 NEONimmAllZerosV))),
- (v4i32 (MVE_VLDRHU32 t2addrmode_imm7<1>:$addr, (i32 1), VCCR:$pred))>;
- def : Pat<(v4i32 (extmaskedload16 t2addrmode_imm7<1>:$addr, VCCR:$pred,
- (v4i32 NEONimmAllZerosV))),
- (v4i32 (MVE_VLDRHU32 t2addrmode_imm7<1>:$addr, (i32 1), VCCR:$pred))>;
+ def : MVE_vector_maskedload_typed<v16i8, MVE_VLDRBU8, aligned_maskedloadvi8, 0>;
+ def : MVE_vector_maskedload_typed<v8i16, MVE_VLDRHU16, aligned_maskedloadvi16, 1>;
+ def : MVE_vector_maskedload_typed<v8f16, MVE_VLDRHU16, aligned_maskedloadvi16, 1>;
+ def : MVE_vector_maskedload_typed<v4i32, MVE_VLDRWU32, aligned_maskedloadvi32, 2>;
+ def : MVE_vector_maskedload_typed<v4f32, MVE_VLDRWU32, aligned_maskedloadvi32, 2>;
}
// Widening/Narrowing Loads/Stores
-let MinAlignment = 2 in {
- def truncstorevi16_align2 : PatFrag<(ops node:$val, node:$ptr),
- (truncstorevi16 node:$val, node:$ptr)>;
- def post_truncstvi16_align2 : PatFrag<(ops node:$val, node:$base, node:$offset),
- (post_truncstvi16 node:$val, node:$base, node:$offset)>;
- def pre_truncstvi16_align2 : PatFrag<(ops node:$val, node:$base, node:$offset),
- (pre_truncstvi16 node:$val, node:$base, node:$offset)>;
-}
-
-let Predicates = [HasMVEInt] in {
- def : Pat<(truncstorevi8 (v8i16 MQPR:$val), taddrmode_imm7<0>:$addr),
- (MVE_VSTRB16 MQPR:$val, taddrmode_imm7<0>:$addr)>;
- def : Pat<(truncstorevi8 (v4i32 MQPR:$val), taddrmode_imm7<0>:$addr),
- (MVE_VSTRB32 MQPR:$val, taddrmode_imm7<0>:$addr)>;
- def : Pat<(truncstorevi16_align2 (v4i32 MQPR:$val), taddrmode_imm7<1>:$addr),
- (MVE_VSTRH32 MQPR:$val, taddrmode_imm7<1>:$addr)>;
-
- def : Pat<(post_truncstvi8 (v8i16 MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<0>:$addr),
- (MVE_VSTRB16_post MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<0>:$addr)>;
- def : Pat<(post_truncstvi8 (v4i32 MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<0>:$addr),
- (MVE_VSTRB32_post MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<0>:$addr)>;
- def : Pat<(post_truncstvi16_align2 (v4i32 MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<1>:$addr),
- (MVE_VSTRH32_post MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<1>:$addr)>;
-
- def : Pat<(pre_truncstvi8 (v8i16 MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<0>:$addr),
- (MVE_VSTRB16_pre MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<0>:$addr)>;
- def : Pat<(pre_truncstvi8 (v4i32 MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<0>:$addr),
- (MVE_VSTRB32_pre MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<0>:$addr)>;
- def : Pat<(pre_truncstvi16_align2 (v4i32 MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<1>:$addr),
- (MVE_VSTRH32_pre MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<1>:$addr)>;
-}
-
-
-let MinAlignment = 2 in {
- def extloadvi16_align2 : PatFrag<(ops node:$ptr), (extloadvi16 node:$ptr)>;
- def sextloadvi16_align2 : PatFrag<(ops node:$ptr), (sextloadvi16 node:$ptr)>;
- def zextloadvi16_align2 : PatFrag<(ops node:$ptr), (zextloadvi16 node:$ptr)>;
-}
-
-multiclass MVEExtLoad<string DestLanes, string DestElemBits,
- string SrcElemBits, string SrcElemType,
- string Align, Operand am> {
- def _Any : Pat<(!cast<ValueType>("v" # DestLanes # "i" # DestElemBits)
- (!cast<PatFrag>("extloadvi" # SrcElemBits # Align) am:$addr)),
- (!cast<Instruction>("MVE_VLDR" # SrcElemType # "U" # DestElemBits)
- am:$addr)>;
- def _Z : Pat<(!cast<ValueType>("v" # DestLanes # "i" # DestElemBits)
- (!cast<PatFrag>("zextloadvi" # SrcElemBits # Align) am:$addr)),
- (!cast<Instruction>("MVE_VLDR" # SrcElemType # "U" # DestElemBits)
- am:$addr)>;
- def _S : Pat<(!cast<ValueType>("v" # DestLanes # "i" # DestElemBits)
- (!cast<PatFrag>("sextloadvi" # SrcElemBits # Align) am:$addr)),
- (!cast<Instruction>("MVE_VLDR" # SrcElemType # "S" # DestElemBits)
- am:$addr)>;
+multiclass MVEExtLoadStore<Instruction LoadSInst, Instruction LoadUInst, string StoreInst,
+ string Amble, ValueType VT, int Shift> {
+ // Trunc stores
+ def : Pat<(!cast<PatFrag>("aligned_truncst"#Amble) (VT MQPR:$val), taddrmode_imm7<Shift>:$addr),
+ (!cast<Instruction>(StoreInst) MQPR:$val, taddrmode_imm7<Shift>:$addr)>;
+ def : Pat<(!cast<PatFrag>("aligned_post_truncst"#Amble) (VT MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<Shift>:$addr),
+ (!cast<Instruction>(StoreInst#"_post") MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<Shift>:$addr)>;
+ def : Pat<(!cast<PatFrag>("aligned_pre_truncst"#Amble) (VT MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<Shift>:$addr),
+ (!cast<Instruction>(StoreInst#"_pre") MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<Shift>:$addr)>;
+
+ // Masked trunc stores
+ def : Pat<(!cast<PatFrag>("aligned_truncmaskedst"#Amble) (VT MQPR:$val), taddrmode_imm7<Shift>:$addr, VCCR:$pred),
+ (!cast<Instruction>(StoreInst) MQPR:$val, taddrmode_imm7<Shift>:$addr, ARMVCCThen, VCCR:$pred)>;
+ def : Pat<(!cast<PatFrag>("aligned_post_truncmaskedst"#Amble) (VT MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<Shift>:$addr, VCCR:$pred),
+ (!cast<Instruction>(StoreInst#"_post") MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<Shift>:$addr, ARMVCCThen, VCCR:$pred)>;
+ def : Pat<(!cast<PatFrag>("aligned_pre_truncmaskedst"#Amble) (VT MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<Shift>:$addr, VCCR:$pred),
+ (!cast<Instruction>(StoreInst#"_pre") MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<Shift>:$addr, ARMVCCThen, VCCR:$pred)>;
+
+ // Ext loads
+ def : Pat<(VT (!cast<PatFrag>("aligned_extload"#Amble) taddrmode_imm7<Shift>:$addr)),
+ (VT (LoadUInst taddrmode_imm7<Shift>:$addr))>;
+ def : Pat<(VT (!cast<PatFrag>("aligned_sextload"#Amble) taddrmode_imm7<Shift>:$addr)),
+ (VT (LoadSInst taddrmode_imm7<Shift>:$addr))>;
+ def : Pat<(VT (!cast<PatFrag>("aligned_zextload"#Amble) taddrmode_imm7<Shift>:$addr)),
+ (VT (LoadUInst taddrmode_imm7<Shift>:$addr))>;
+
+ // Masked ext loads
+ def : Pat<(VT (!cast<PatFrag>("aligned_extmaskedload"#Amble) taddrmode_imm7<Shift>:$addr, VCCR:$pred, (VT NEONimmAllZerosV))),
+ (VT (LoadUInst taddrmode_imm7<Shift>:$addr, ARMVCCThen, VCCR:$pred))>;
+ def : Pat<(VT (!cast<PatFrag>("aligned_sextmaskedload"#Amble) taddrmode_imm7<Shift>:$addr, VCCR:$pred, (VT NEONimmAllZerosV))),
+ (VT (LoadSInst taddrmode_imm7<Shift>:$addr, ARMVCCThen, VCCR:$pred))>;
+ def : Pat<(VT (!cast<PatFrag>("aligned_zextmaskedload"#Amble) taddrmode_imm7<Shift>:$addr, VCCR:$pred, (VT NEONimmAllZerosV))),
+ (VT (LoadUInst taddrmode_imm7<Shift>:$addr, ARMVCCThen, VCCR:$pred))>;
}
let Predicates = [HasMVEInt] in {
- defm : MVEExtLoad<"4", "32", "8", "B", "", taddrmode_imm7<0>>;
- defm : MVEExtLoad<"8", "16", "8", "B", "", taddrmode_imm7<0>>;
- defm : MVEExtLoad<"4", "32", "16", "H", "_align2", taddrmode_imm7<1>>;
+ defm : MVEExtLoadStore<MVE_VLDRBS16, MVE_VLDRBU16, "MVE_VSTRB16", "vi8", v8i16, 0>;
+ defm : MVEExtLoadStore<MVE_VLDRBS32, MVE_VLDRBU32, "MVE_VSTRB32", "vi8", v4i32, 0>;
+ defm : MVEExtLoadStore<MVE_VLDRHS32, MVE_VLDRHU32, "MVE_VSTRH32", "vi16", v4i32, 1>;
}
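As an example of what the last defm above expands to, the sign-extending-load half of the vi16/v4i32 instantiation produces roughly (sketch):

def : Pat<(v4i32 (aligned_sextloadvi16 taddrmode_imm7<1>:$addr)),
          (v4i32 (MVE_VLDRHS32 taddrmode_imm7<1>:$addr))>;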
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrNEON.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrNEON.td
index 60ca92e58041..6244d8d9e27e 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrNEON.td
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrNEON.td
@@ -3314,30 +3314,30 @@ class N2VCvtQ<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
// source operand element sizes of 8, 16 and 32 bits:
multiclass N2V_QHS_cmp<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
bits<5> op11_7, bit op4, string opc, string Dt,
- string asm, int fc> {
+ string asm, PatFrag fc> {
// 64-bit vector types.
def v8i8 : N2V<op24_23, op21_20, 0b00, op17_16, op11_7, 0, op4,
(outs DPR:$Vd), (ins DPR:$Vm), NoItinerary,
opc, !strconcat(Dt, "8"), asm, "",
- [(set DPR:$Vd, (v8i8 (ARMvcmpz (v8i8 DPR:$Vm), (i32 fc))))]>;
+ [(set DPR:$Vd, (v8i8 (ARMvcmpz (v8i8 DPR:$Vm), fc)))]>;
def v4i16 : N2V<op24_23, op21_20, 0b01, op17_16, op11_7, 0, op4,
(outs DPR:$Vd), (ins DPR:$Vm), NoItinerary,
opc, !strconcat(Dt, "16"), asm, "",
- [(set DPR:$Vd, (v4i16 (ARMvcmpz (v4i16 DPR:$Vm), (i32 fc))))]>;
+ [(set DPR:$Vd, (v4i16 (ARMvcmpz (v4i16 DPR:$Vm), fc)))]>;
def v2i32 : N2V<op24_23, op21_20, 0b10, op17_16, op11_7, 0, op4,
(outs DPR:$Vd), (ins DPR:$Vm), NoItinerary,
opc, !strconcat(Dt, "32"), asm, "",
- [(set DPR:$Vd, (v2i32 (ARMvcmpz (v2i32 DPR:$Vm), (i32 fc))))]>;
+ [(set DPR:$Vd, (v2i32 (ARMvcmpz (v2i32 DPR:$Vm), fc)))]>;
def v2f32 : N2V<op24_23, op21_20, 0b10, op17_16, op11_7, 0, op4,
(outs DPR:$Vd), (ins DPR:$Vm), NoItinerary,
opc, "f32", asm, "",
- [(set DPR:$Vd, (v2i32 (ARMvcmpz (v2f32 DPR:$Vm), (i32 fc))))]> {
+ [(set DPR:$Vd, (v2i32 (ARMvcmpz (v2f32 DPR:$Vm), fc)))]> {
let Inst{10} = 1; // overwrite F = 1
}
def v4f16 : N2V<op24_23, op21_20, 0b01, op17_16, op11_7, 0, op4,
(outs DPR:$Vd), (ins DPR:$Vm), NoItinerary,
opc, "f16", asm, "",
- [(set DPR:$Vd, (v4i16 (ARMvcmpz (v4f16 DPR:$Vm), (i32 fc))))]>,
+ [(set DPR:$Vd, (v4i16 (ARMvcmpz (v4f16 DPR:$Vm), fc)))]>,
Requires<[HasNEON,HasFullFP16]> {
let Inst{10} = 1; // overwrite F = 1
}
@@ -3346,25 +3346,25 @@ multiclass N2V_QHS_cmp<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
def v16i8 : N2V<op24_23, op21_20, 0b00, op17_16, op11_7, 1, op4,
(outs QPR:$Vd), (ins QPR:$Vm), NoItinerary,
opc, !strconcat(Dt, "8"), asm, "",
- [(set QPR:$Vd, (v16i8 (ARMvcmpz (v16i8 QPR:$Vm), (i32 fc))))]>;
+ [(set QPR:$Vd, (v16i8 (ARMvcmpz (v16i8 QPR:$Vm), fc)))]>;
def v8i16 : N2V<op24_23, op21_20, 0b01, op17_16, op11_7, 1, op4,
(outs QPR:$Vd), (ins QPR:$Vm), NoItinerary,
opc, !strconcat(Dt, "16"), asm, "",
- [(set QPR:$Vd, (v8i16 (ARMvcmpz (v8i16 QPR:$Vm), (i32 fc))))]>;
+ [(set QPR:$Vd, (v8i16 (ARMvcmpz (v8i16 QPR:$Vm), fc)))]>;
def v4i32 : N2V<op24_23, op21_20, 0b10, op17_16, op11_7, 1, op4,
(outs QPR:$Vd), (ins QPR:$Vm), NoItinerary,
opc, !strconcat(Dt, "32"), asm, "",
- [(set QPR:$Vd, (v4i32 (ARMvcmpz (v4i32 QPR:$Vm), (i32 fc))))]>;
+ [(set QPR:$Vd, (v4i32 (ARMvcmpz (v4i32 QPR:$Vm), fc)))]>;
def v4f32 : N2V<op24_23, op21_20, 0b10, op17_16, op11_7, 1, op4,
(outs QPR:$Vd), (ins QPR:$Vm), NoItinerary,
opc, "f32", asm, "",
- [(set QPR:$Vd, (v4i32 (ARMvcmpz (v4f32 QPR:$Vm), (i32 fc))))]> {
+ [(set QPR:$Vd, (v4i32 (ARMvcmpz (v4f32 QPR:$Vm), fc)))]> {
let Inst{10} = 1; // overwrite F = 1
}
def v8f16 : N2V<op24_23, op21_20, 0b01, op17_16, op11_7, 1, op4,
(outs QPR:$Vd), (ins QPR:$Vm), NoItinerary,
opc, "f16", asm, "",
- [(set QPR:$Vd, (v8i16 (ARMvcmpz (v8f16 QPR:$Vm), (i32 fc))))]>,
+ [(set QPR:$Vd, (v8i16 (ARMvcmpz (v8f16 QPR:$Vm), fc)))]>,
Requires<[HasNEON,HasFullFP16]> {
let Inst{10} = 1; // overwrite F = 1
}
@@ -3373,11 +3373,11 @@ multiclass N2V_QHS_cmp<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
// Neon 3-register comparisons.
class N3VQ_cmp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
InstrItinClass itin, string OpcodeStr, string Dt,
- ValueType ResTy, ValueType OpTy, int fc, bit Commutable>
+ ValueType ResTy, ValueType OpTy, PatFrag fc, bit Commutable>
: N3V<op24, op23, op21_20, op11_8, 1, op4,
(outs QPR:$Vd), (ins QPR:$Vn, QPR:$Vm), N3RegFrm, itin,
OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
- [(set QPR:$Vd, (ResTy (ARMvcmp (OpTy QPR:$Vn), (OpTy QPR:$Vm), (i32 fc))))]> {
+ [(set QPR:$Vd, (ResTy (ARMvcmp (OpTy QPR:$Vn), (OpTy QPR:$Vm), fc)))]> {
// All of these have a two-operand InstAlias.
let TwoOperandAliasConstraint = "$Vn = $Vd";
let isCommutable = Commutable;
@@ -3385,11 +3385,11 @@ class N3VQ_cmp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
class N3VD_cmp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
InstrItinClass itin, string OpcodeStr, string Dt,
- ValueType ResTy, ValueType OpTy, int fc, bit Commutable>
+ ValueType ResTy, ValueType OpTy, PatFrag fc, bit Commutable>
: N3V<op24, op23, op21_20, op11_8, 0, op4,
(outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
- [(set DPR:$Vd, (ResTy (ARMvcmp (OpTy DPR:$Vn), (OpTy DPR:$Vm), (i32 fc))))]> {
+ [(set DPR:$Vd, (ResTy (ARMvcmp (OpTy DPR:$Vn), (OpTy DPR:$Vm), fc)))]> {
// All of these have a two-operand InstAlias.
let TwoOperandAliasConstraint = "$Vn = $Vd";
let isCommutable = Commutable;
@@ -3399,7 +3399,7 @@ multiclass N3V_QHS_cmp<bit op24, bit op23, bits<4> op11_8, bit op4,
InstrItinClass itinD16, InstrItinClass itinD32,
InstrItinClass itinQ16, InstrItinClass itinQ32,
string OpcodeStr, string Dt,
- int fc, bit Commutable = 0> {
+ PatFrag fc, bit Commutable = 0> {
// 64-bit vector types.
def v8i8 : N3VD_cmp<op24, op23, 0b00, op11_8, op4, itinD16,
OpcodeStr, !strconcat(Dt, "8"),
@@ -4287,10 +4287,10 @@ defm VRHADDu : N3VInt_QHS<1, 0, 0b0001, 0, N3RegFrm,
// VQADD : Vector Saturating Add
defm VQADDs : N3VInt_QHSD<0, 0, 0b0000, 1, N3RegFrm,
IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q,
- "vqadd", "s", int_arm_neon_vqadds, 1>;
+ "vqadd", "s", saddsat, 1>;
defm VQADDu : N3VInt_QHSD<1, 0, 0b0000, 1, N3RegFrm,
IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q,
- "vqadd", "u", int_arm_neon_vqaddu, 1>;
+ "vqadd", "u", uaddsat, 1>;
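The N3VInt_QHSD multiclass (defined elsewhere in this file) folds that operand into its per-type patterns, so the v8i8 instance now effectively matches the target-independent saturating-add node along these lines (a sketch; the VQADDsv8i8 record name is assumed from the usual NEON <defm><type> naming):

def : Pat<(v8i8 (saddsat (v8i8 DPR:$Vn), (v8i8 DPR:$Vm))),
          (VQADDsv8i8 DPR:$Vn, DPR:$Vm)>;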
// VADDHN : Vector Add and Narrow Returning High Half (D = Q + Q)
defm VADDHN : N3VNInt_HSD<0,1,0b0100,0, "vaddhn", "i", null_frag, 1>;
// VRADDHN : Vector Rounding Add and Narrow Returning High Half (D = Q + Q)
@@ -4527,22 +4527,22 @@ let Predicates = [HasNEON, HasV8_1a] in {
defm VQRDMLAH : N3VInt3_HS<1, 0, 0b1011, 1, IIC_VMACi16D, IIC_VMACi32D,
IIC_VMACi16Q, IIC_VMACi32Q, "vqrdmlah", "s",
null_frag>;
- def : Pat<(v4i16 (int_arm_neon_vqadds
+ def : Pat<(v4i16 (saddsat
(v4i16 DPR:$src1),
(v4i16 (int_arm_neon_vqrdmulh (v4i16 DPR:$Vn),
(v4i16 DPR:$Vm))))),
(v4i16 (VQRDMLAHv4i16 DPR:$src1, DPR:$Vn, DPR:$Vm))>;
- def : Pat<(v2i32 (int_arm_neon_vqadds
+ def : Pat<(v2i32 (saddsat
(v2i32 DPR:$src1),
(v2i32 (int_arm_neon_vqrdmulh (v2i32 DPR:$Vn),
(v2i32 DPR:$Vm))))),
(v2i32 (VQRDMLAHv2i32 DPR:$src1, DPR:$Vn, DPR:$Vm))>;
- def : Pat<(v8i16 (int_arm_neon_vqadds
+ def : Pat<(v8i16 (saddsat
(v8i16 QPR:$src1),
(v8i16 (int_arm_neon_vqrdmulh (v8i16 QPR:$Vn),
(v8i16 QPR:$Vm))))),
(v8i16 (VQRDMLAHv8i16 QPR:$src1, QPR:$Vn, QPR:$Vm))>;
- def : Pat<(v4i32 (int_arm_neon_vqadds
+ def : Pat<(v4i32 (saddsat
(v4i32 QPR:$src1),
(v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$Vn),
(v4i32 QPR:$Vm))))),
@@ -4551,7 +4551,7 @@ let Predicates = [HasNEON, HasV8_1a] in {
defm VQRDMLAHsl : N3VMulOpSL_HS<0b1110, IIC_VMACi16D, IIC_VMACi32D,
IIC_VMACi16Q, IIC_VMACi32Q, "vqrdmlah", "s",
null_frag>;
- def : Pat<(v4i16 (int_arm_neon_vqadds
+ def : Pat<(v4i16 (saddsat
(v4i16 DPR:$src1),
(v4i16 (int_arm_neon_vqrdmulh
(v4i16 DPR:$Vn),
@@ -4559,7 +4559,7 @@ let Predicates = [HasNEON, HasV8_1a] in {
imm:$lane)))))),
(v4i16 (VQRDMLAHslv4i16 DPR:$src1, DPR:$Vn, DPR_8:$Vm,
imm:$lane))>;
- def : Pat<(v2i32 (int_arm_neon_vqadds
+ def : Pat<(v2i32 (saddsat
(v2i32 DPR:$src1),
(v2i32 (int_arm_neon_vqrdmulh
(v2i32 DPR:$Vn),
@@ -4567,7 +4567,7 @@ let Predicates = [HasNEON, HasV8_1a] in {
imm:$lane)))))),
(v2i32 (VQRDMLAHslv2i32 DPR:$src1, DPR:$Vn, DPR_VFP2:$Vm,
imm:$lane))>;
- def : Pat<(v8i16 (int_arm_neon_vqadds
+ def : Pat<(v8i16 (saddsat
(v8i16 QPR:$src1),
(v8i16 (int_arm_neon_vqrdmulh
(v8i16 QPR:$src2),
@@ -4579,7 +4579,7 @@ let Predicates = [HasNEON, HasV8_1a] in {
QPR:$src3,
(DSubReg_i16_reg imm:$lane))),
(SubReg_i16_lane imm:$lane)))>;
- def : Pat<(v4i32 (int_arm_neon_vqadds
+ def : Pat<(v4i32 (saddsat
(v4i32 QPR:$src1),
(v4i32 (int_arm_neon_vqrdmulh
(v4i32 QPR:$src2),
@@ -4597,22 +4597,22 @@ let Predicates = [HasNEON, HasV8_1a] in {
defm VQRDMLSH : N3VInt3_HS<1, 0, 0b1100, 1, IIC_VMACi16D, IIC_VMACi32D,
IIC_VMACi16Q, IIC_VMACi32Q, "vqrdmlsh", "s",
null_frag>;
- def : Pat<(v4i16 (int_arm_neon_vqsubs
+ def : Pat<(v4i16 (ssubsat
(v4i16 DPR:$src1),
(v4i16 (int_arm_neon_vqrdmulh (v4i16 DPR:$Vn),
(v4i16 DPR:$Vm))))),
(v4i16 (VQRDMLSHv4i16 DPR:$src1, DPR:$Vn, DPR:$Vm))>;
- def : Pat<(v2i32 (int_arm_neon_vqsubs
+ def : Pat<(v2i32 (ssubsat
(v2i32 DPR:$src1),
(v2i32 (int_arm_neon_vqrdmulh (v2i32 DPR:$Vn),
(v2i32 DPR:$Vm))))),
(v2i32 (VQRDMLSHv2i32 DPR:$src1, DPR:$Vn, DPR:$Vm))>;
- def : Pat<(v8i16 (int_arm_neon_vqsubs
+ def : Pat<(v8i16 (ssubsat
(v8i16 QPR:$src1),
(v8i16 (int_arm_neon_vqrdmulh (v8i16 QPR:$Vn),
(v8i16 QPR:$Vm))))),
(v8i16 (VQRDMLSHv8i16 QPR:$src1, QPR:$Vn, QPR:$Vm))>;
- def : Pat<(v4i32 (int_arm_neon_vqsubs
+ def : Pat<(v4i32 (ssubsat
(v4i32 QPR:$src1),
(v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$Vn),
(v4i32 QPR:$Vm))))),
@@ -4621,14 +4621,14 @@ let Predicates = [HasNEON, HasV8_1a] in {
defm VQRDMLSHsl : N3VMulOpSL_HS<0b1111, IIC_VMACi16D, IIC_VMACi32D,
IIC_VMACi16Q, IIC_VMACi32Q, "vqrdmlsh", "s",
null_frag>;
- def : Pat<(v4i16 (int_arm_neon_vqsubs
+ def : Pat<(v4i16 (ssubsat
(v4i16 DPR:$src1),
(v4i16 (int_arm_neon_vqrdmulh
(v4i16 DPR:$Vn),
(v4i16 (ARMvduplane (v4i16 DPR_8:$Vm),
imm:$lane)))))),
(v4i16 (VQRDMLSHslv4i16 DPR:$src1, DPR:$Vn, DPR_8:$Vm, imm:$lane))>;
- def : Pat<(v2i32 (int_arm_neon_vqsubs
+ def : Pat<(v2i32 (ssubsat
(v2i32 DPR:$src1),
(v2i32 (int_arm_neon_vqrdmulh
(v2i32 DPR:$Vn),
@@ -4636,7 +4636,7 @@ let Predicates = [HasNEON, HasV8_1a] in {
imm:$lane)))))),
(v2i32 (VQRDMLSHslv2i32 DPR:$src1, DPR:$Vn, DPR_VFP2:$Vm,
imm:$lane))>;
- def : Pat<(v8i16 (int_arm_neon_vqsubs
+ def : Pat<(v8i16 (ssubsat
(v8i16 QPR:$src1),
(v8i16 (int_arm_neon_vqrdmulh
(v8i16 QPR:$src2),
@@ -4648,7 +4648,7 @@ let Predicates = [HasNEON, HasV8_1a] in {
QPR:$src3,
(DSubReg_i16_reg imm:$lane))),
(SubReg_i16_lane imm:$lane)))>;
- def : Pat<(v4i32 (int_arm_neon_vqsubs
+ def : Pat<(v4i32 (ssubsat
(v4i32 QPR:$src1),
(v4i32 (int_arm_neon_vqrdmulh
(v4i32 QPR:$src2),
@@ -4667,20 +4667,20 @@ defm VQDMLAL : N3VLInt3_HS<0, 1, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D,
defm VQDMLALsl: N3VLInt3SL_HS<0, 0b0011, "vqdmlal", "s", null_frag>;
let Predicates = [HasNEON] in {
-def : Pat<(v4i32 (int_arm_neon_vqadds (v4i32 QPR:$src1),
+def : Pat<(v4i32 (saddsat (v4i32 QPR:$src1),
(v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn),
(v4i16 DPR:$Vm))))),
(VQDMLALv4i32 QPR:$src1, DPR:$Vn, DPR:$Vm)>;
-def : Pat<(v2i64 (int_arm_neon_vqadds (v2i64 QPR:$src1),
+def : Pat<(v2i64 (saddsat (v2i64 QPR:$src1),
(v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn),
(v2i32 DPR:$Vm))))),
(VQDMLALv2i64 QPR:$src1, DPR:$Vn, DPR:$Vm)>;
-def : Pat<(v4i32 (int_arm_neon_vqadds (v4i32 QPR:$src1),
+def : Pat<(v4i32 (saddsat (v4i32 QPR:$src1),
(v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn),
(v4i16 (ARMvduplane (v4i16 DPR_8:$Vm),
imm:$lane)))))),
(VQDMLALslv4i16 QPR:$src1, DPR:$Vn, DPR_8:$Vm, imm:$lane)>;
-def : Pat<(v2i64 (int_arm_neon_vqadds (v2i64 QPR:$src1),
+def : Pat<(v2i64 (saddsat (v2i64 QPR:$src1),
(v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn),
(v2i32 (ARMvduplane (v2i32 DPR_VFP2:$Vm),
imm:$lane)))))),
@@ -4759,20 +4759,20 @@ defm VQDMLSL : N3VLInt3_HS<0, 1, 0b1011, 0, IIC_VMACi16D, IIC_VMACi32D,
defm VQDMLSLsl: N3VLInt3SL_HS<0, 0b0111, "vqdmlsl", "s", null_frag>;
let Predicates = [HasNEON] in {
-def : Pat<(v4i32 (int_arm_neon_vqsubs (v4i32 QPR:$src1),
+def : Pat<(v4i32 (ssubsat (v4i32 QPR:$src1),
(v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn),
(v4i16 DPR:$Vm))))),
(VQDMLSLv4i32 QPR:$src1, DPR:$Vn, DPR:$Vm)>;
-def : Pat<(v2i64 (int_arm_neon_vqsubs (v2i64 QPR:$src1),
+def : Pat<(v2i64 (ssubsat (v2i64 QPR:$src1),
(v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn),
(v2i32 DPR:$Vm))))),
(VQDMLSLv2i64 QPR:$src1, DPR:$Vn, DPR:$Vm)>;
-def : Pat<(v4i32 (int_arm_neon_vqsubs (v4i32 QPR:$src1),
+def : Pat<(v4i32 (ssubsat (v4i32 QPR:$src1),
(v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn),
(v4i16 (ARMvduplane (v4i16 DPR_8:$Vm),
imm:$lane)))))),
(VQDMLSLslv4i16 QPR:$src1, DPR:$Vn, DPR_8:$Vm, imm:$lane)>;
-def : Pat<(v2i64 (int_arm_neon_vqsubs (v2i64 QPR:$src1),
+def : Pat<(v2i64 (ssubsat (v2i64 QPR:$src1),
(v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn),
(v2i32 (ARMvduplane (v2i32 DPR_VFP2:$Vm),
imm:$lane)))))),
@@ -5012,6 +5012,27 @@ defm VCMLA : N3VCP8ComplexTied<1, 0, "vcmla", null_frag>;
defm VCADD : N3VCP8ComplexOdd<1, 0, 0, "vcadd", null_frag>;
defm VCMLA : N3VCP8ComplexTiedLane<0, "vcmla", null_frag>;
+let Predicates = [HasNEON,HasV8_3a,HasFullFP16] in {
+ def : Pat<(v4f16 (int_arm_neon_vcadd_rot90 (v4f16 DPR:$Rn), (v4f16 DPR:$Rm))),
+ (VCADDv4f16 (v4f16 DPR:$Rn), (v4f16 DPR:$Rm), (i32 0))>;
+ def : Pat<(v4f16 (int_arm_neon_vcadd_rot270 (v4f16 DPR:$Rn), (v4f16 DPR:$Rm))),
+ (VCADDv4f16 (v4f16 DPR:$Rn), (v4f16 DPR:$Rm), (i32 1))>;
+ def : Pat<(v8f16 (int_arm_neon_vcadd_rot90 (v8f16 QPR:$Rn), (v8f16 QPR:$Rm))),
+ (VCADDv8f16 (v8f16 QPR:$Rn), (v8f16 QPR:$Rm), (i32 0))>;
+ def : Pat<(v8f16 (int_arm_neon_vcadd_rot270 (v8f16 QPR:$Rn), (v8f16 QPR:$Rm))),
+ (VCADDv8f16 (v8f16 QPR:$Rn), (v8f16 QPR:$Rm), (i32 1))>;
+}
+let Predicates = [HasNEON,HasV8_3a] in {
+ def : Pat<(v2f32 (int_arm_neon_vcadd_rot90 (v2f32 DPR:$Rn), (v2f32 DPR:$Rm))),
+ (VCADDv2f32 (v2f32 DPR:$Rn), (v2f32 DPR:$Rm), (i32 0))>;
+ def : Pat<(v2f32 (int_arm_neon_vcadd_rot270 (v2f32 DPR:$Rn), (v2f32 DPR:$Rm))),
+ (VCADDv2f32 (v2f32 DPR:$Rn), (v2f32 DPR:$Rm), (i32 1))>;
+ def : Pat<(v4f32 (int_arm_neon_vcadd_rot90 (v4f32 QPR:$Rn), (v4f32 QPR:$Rm))),
+ (VCADDv4f32 (v4f32 QPR:$Rn), (v4f32 QPR:$Rm), (i32 0))>;
+ def : Pat<(v4f32 (int_arm_neon_vcadd_rot270 (v4f32 QPR:$Rn), (v4f32 QPR:$Rm))),
+ (VCADDv4f32 (v4f32 QPR:$Rn), (v4f32 QPR:$Rm), (i32 1))>;
+}
+
// Vector Subtract Operations.
// VSUB : Vector Subtract (integer and floating-point)
@@ -5045,10 +5066,10 @@ defm VHSUBu : N3VInt_QHS<1, 0, 0b0010, 0, N3RegFrm,
// VQSUB : Vector Saturating Subtract
defm VQSUBs : N3VInt_QHSD<0, 0, 0b0010, 1, N3RegFrm,
IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
- "vqsub", "s", int_arm_neon_vqsubs, 0>;
+ "vqsub", "s", ssubsat, 0>;
defm VQSUBu : N3VInt_QHSD<1, 0, 0b0010, 1, N3RegFrm,
IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
- "vqsub", "u", int_arm_neon_vqsubu, 0>;
+ "vqsub", "u", usubsat, 0>;
// VSUBHN : Vector Subtract and Narrow Returning High Half (D = Q - Q)
defm VSUBHN : N3VNInt_HSD<0,1,0b0110,0, "vsubhn", "i", null_frag, 0>;
// VRSUBHN : Vector Rounding Subtract and Narrow Returning High Half (D=Q-Q)
@@ -5068,66 +5089,66 @@ def : Pat<(v2i32 (trunc (ARMvshruImm (sub (v2i64 QPR:$Vn), QPR:$Vm), 32))),
// VCEQ : Vector Compare Equal
defm VCEQ : N3V_QHS_cmp<1, 0, 0b1000, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q,
- IIC_VSUBi4Q, "vceq", "i", 0, 1>;
+ IIC_VSUBi4Q, "vceq", "i", ARMCCeq, 1>;
def VCEQfd : N3VD_cmp<0,0,0b00,0b1110,0, IIC_VBIND, "vceq", "f32", v2i32, v2f32,
- 0, 1>;
+ ARMCCeq, 1>;
def VCEQfq : N3VQ_cmp<0,0,0b00,0b1110,0, IIC_VBINQ, "vceq", "f32", v4i32, v4f32,
- 0, 1>;
+ ARMCCeq, 1>;
def VCEQhd : N3VD_cmp<0,0,0b01,0b1110,0, IIC_VBIND, "vceq", "f16", v4i16, v4f16,
- 0, 1>,
+ ARMCCeq, 1>,
Requires<[HasNEON, HasFullFP16]>;
def VCEQhq : N3VQ_cmp<0,0,0b01,0b1110,0, IIC_VBINQ, "vceq", "f16", v8i16, v8f16,
- 0, 1>,
+ ARMCCeq, 1>,
Requires<[HasNEON, HasFullFP16]>;
let TwoOperandAliasConstraint = "$Vm = $Vd" in
defm VCEQz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00010, 0, "vceq", "i",
- "$Vd, $Vm, #0", 0>;
+ "$Vd, $Vm, #0", ARMCCeq>;
// VCGE : Vector Compare Greater Than or Equal
defm VCGEs : N3V_QHS_cmp<0, 0, 0b0011, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q,
- IIC_VSUBi4Q, "vcge", "s", 10, 0>;
+ IIC_VSUBi4Q, "vcge", "s", ARMCCge, 0>;
defm VCGEu : N3V_QHS_cmp<1, 0, 0b0011, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q,
- IIC_VSUBi4Q, "vcge", "u", 2, 0>;
+ IIC_VSUBi4Q, "vcge", "u", ARMCChs, 0>;
def VCGEfd : N3VD_cmp<1,0,0b00,0b1110,0, IIC_VBIND, "vcge", "f32", v2i32, v2f32,
- 10, 0>;
+ ARMCCge, 0>;
def VCGEfq : N3VQ_cmp<1,0,0b00,0b1110,0, IIC_VBINQ, "vcge", "f32", v4i32, v4f32,
- 10, 0>;
+ ARMCCge, 0>;
def VCGEhd : N3VD_cmp<1,0,0b01,0b1110,0, IIC_VBIND, "vcge", "f16", v4i16, v4f16,
- 10, 0>,
+ ARMCCge, 0>,
Requires<[HasNEON, HasFullFP16]>;
def VCGEhq : N3VQ_cmp<1,0,0b01,0b1110,0, IIC_VBINQ, "vcge", "f16", v8i16, v8f16,
- 10, 0>,
+ ARMCCge, 0>,
Requires<[HasNEON, HasFullFP16]>;
let TwoOperandAliasConstraint = "$Vm = $Vd" in {
defm VCGEz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00001, 0, "vcge", "s",
- "$Vd, $Vm, #0", 10>;
+ "$Vd, $Vm, #0", ARMCCge>;
defm VCLEz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00011, 0, "vcle", "s",
- "$Vd, $Vm, #0", 13>;
+ "$Vd, $Vm, #0", ARMCCle>;
}
// VCGT : Vector Compare Greater Than
defm VCGTs : N3V_QHS_cmp<0, 0, 0b0011, 0, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q,
- IIC_VSUBi4Q, "vcgt", "s", 12, 0>;
+ IIC_VSUBi4Q, "vcgt", "s", ARMCCgt, 0>;
defm VCGTu : N3V_QHS_cmp<1, 0, 0b0011, 0, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q,
- IIC_VSUBi4Q, "vcgt", "u", 8, 0>;
+ IIC_VSUBi4Q, "vcgt", "u", ARMCChi, 0>;
def VCGTfd : N3VD_cmp<1,0,0b10,0b1110,0, IIC_VBIND, "vcgt", "f32", v2i32, v2f32,
- 12, 0>;
+ ARMCCgt, 0>;
def VCGTfq : N3VQ_cmp<1,0,0b10,0b1110,0, IIC_VBINQ, "vcgt", "f32", v4i32, v4f32,
- 12, 0>;
+ ARMCCgt, 0>;
def VCGThd : N3VD_cmp<1,0,0b11,0b1110,0, IIC_VBIND, "vcgt", "f16", v4i16, v4f16,
- 12, 0>,
+ ARMCCgt, 0>,
Requires<[HasNEON, HasFullFP16]>;
def VCGThq : N3VQ_cmp<1,0,0b11,0b1110,0, IIC_VBINQ, "vcgt", "f16", v8i16, v8f16,
- 12, 0>,
+ ARMCCgt, 0>,
Requires<[HasNEON, HasFullFP16]>;
let TwoOperandAliasConstraint = "$Vm = $Vd" in {
defm VCGTz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00000, 0, "vcgt", "s",
- "$Vd, $Vm, #0", 12>;
+ "$Vd, $Vm, #0", ARMCCgt>;
defm VCLTz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00100, 0, "vclt", "s",
- "$Vd, $Vm, #0", 11>;
+ "$Vd, $Vm, #0", ARMCClt>;
}
// VACGE : Vector Absolute Compare Greater Than or Equal (aka VCAGE)
@@ -6797,9 +6818,12 @@ def VREV64q16 : VREV64Q<0b01, "vrev64", "16", v8i16>;
def VREV64q32 : VREV64Q<0b10, "vrev64", "32", v4i32>;
let Predicates = [HasNEON] in {
-def : Pat<(v4f32 (ARMvrev64 (v4f32 QPR:$Vm))), (VREV64q32 QPR:$Vm)>;
-def : Pat<(v8f16 (ARMvrev64 (v8f16 QPR:$Vm))), (VREV64q16 QPR:$Vm)>;
-def : Pat<(v4f16 (ARMvrev64 (v4f16 DPR:$Vm))), (VREV64d16 DPR:$Vm)>;
+ def : Pat<(v4f32 (ARMvrev64 (v4f32 QPR:$Vm))),
+ (VREV64q32 QPR:$Vm)>;
+ def : Pat<(v8f16 (ARMvrev64 (v8f16 QPR:$Vm))),
+ (VREV64q16 QPR:$Vm)>;
+ def : Pat<(v4f16 (ARMvrev64 (v4f16 DPR:$Vm))),
+ (VREV64d16 DPR:$Vm)>;
}
// VREV32 : Vector Reverse elements within 32-bit words
@@ -6821,6 +6845,13 @@ def VREV32d16 : VREV32D<0b01, "vrev32", "16", v4i16>;
def VREV32q8 : VREV32Q<0b00, "vrev32", "8", v16i8>;
def VREV32q16 : VREV32Q<0b01, "vrev32", "16", v8i16>;
+let Predicates = [HasNEON] in {
+ def : Pat<(v8f16 (ARMvrev32 (v8f16 QPR:$Vm))),
+ (VREV32q16 QPR:$Vm)>;
+ def : Pat<(v4f16 (ARMvrev32 (v4f16 DPR:$Vm))),
+ (VREV32d16 DPR:$Vm)>;
+}
+
// VREV16 : Vector Reverse elements within 16-bit halfwords
class VREV16D<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty>
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrThumb2.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrThumb2.td
index 25a45b39fa0c..4193e8147f47 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrThumb2.td
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrThumb2.td
@@ -270,7 +270,8 @@ def t2am_imm8_offset : MemOperand,
// t2addrmode_imm8s4 := reg +/- (imm8 << 2)
def MemImm8s4OffsetAsmOperand : AsmOperandClass {let Name = "MemImm8s4Offset";}
-class T2AddrMode_Imm8s4 : MemOperand {
+class T2AddrMode_Imm8s4 : MemOperand,
+ ComplexPattern<i32, 2, "SelectT2AddrModeImm8<2>", []> {
let EncoderMethod = "getT2AddrModeImm8s4OpValue";
let DecoderMethod = "DecodeT2AddrModeImm8s4";
let ParserMatchClass = MemImm8s4OffsetAsmOperand;
@@ -917,10 +918,26 @@ multiclass T2I_bin_ii12rs<bits<3> op23_21, string opc, SDNode opnode,
// The register-immediate version is re-materializable. This is useful
// in particular for taking the address of a local.
let isReMaterializable = 1 in {
+ def spImm : T2sTwoRegImm<
+ (outs GPRsp:$Rd), (ins GPRsp:$Rn, t2_so_imm:$imm), IIC_iALUi,
+ opc, ".w\t$Rd, $Rn, $imm",
+ []>,
+ Sched<[WriteALU, ReadALU]> {
+ let Rn = 13;
+ let Rd = 13;
+
+ let Inst{31-27} = 0b11110;
+ let Inst{25-24} = 0b01;
+ let Inst{23-21} = op23_21;
+ let Inst{15} = 0;
+
+ let DecoderMethod = "DecodeT2AddSubSPImm";
+ }
+
def ri : T2sTwoRegImm<
- (outs GPRnopc:$Rd), (ins GPRnopc:$Rn, t2_so_imm:$imm), IIC_iALUi,
+ (outs rGPR:$Rd), (ins GPRnopc:$Rn, t2_so_imm:$imm), IIC_iALUi,
opc, ".w\t$Rd, $Rn, $imm",
- [(set GPRnopc:$Rd, (opnode GPRnopc:$Rn, t2_so_imm:$imm))]>,
+ [(set rGPR:$Rd, (opnode GPRnopc:$Rn, t2_so_imm:$imm))]>,
Sched<[WriteALU, ReadALU]> {
let Inst{31-27} = 0b11110;
let Inst{25} = 0;
@@ -931,9 +948,9 @@ multiclass T2I_bin_ii12rs<bits<3> op23_21, string opc, SDNode opnode,
}
// 12-bit imm
def ri12 : T2I<
- (outs GPRnopc:$Rd), (ins GPR:$Rn, imm0_4095:$imm), IIC_iALUi,
+ (outs rGPR:$Rd), (ins GPR:$Rn, imm0_4095:$imm), IIC_iALUi,
!strconcat(opc, "w"), "\t$Rd, $Rn, $imm",
- [(set GPRnopc:$Rd, (opnode GPR:$Rn, imm0_4095:$imm))]>,
+ [(set rGPR:$Rd, (opnode GPR:$Rn, imm0_4095:$imm))]>,
Sched<[WriteALU, ReadALU]> {
bits<4> Rd;
bits<4> Rn;
@@ -949,6 +966,26 @@ multiclass T2I_bin_ii12rs<bits<3> op23_21, string opc, SDNode opnode,
let Inst{11-8} = Rd;
let Inst{7-0} = imm{7-0};
}
+ def spImm12 : T2I<
+ (outs GPRsp:$Rd), (ins GPRsp:$Rn, imm0_4095:$imm), IIC_iALUi,
+ !strconcat(opc, "w"), "\t$Rd, $Rn, $imm",
+ []>,
+ Sched<[WriteALU, ReadALU]> {
+ bits<4> Rd = 13;
+ bits<4> Rn = 13;
+ bits<12> imm;
+ let Inst{31-27} = 0b11110;
+ let Inst{26} = imm{11};
+ let Inst{25-24} = 0b10;
+ let Inst{23-21} = op23_21;
+ let Inst{20} = 0; // The S bit.
+ let Inst{19-16} = Rn;
+ let Inst{15} = 0;
+ let Inst{14-12} = imm{10-8};
+ let Inst{11-8} = Rd;
+ let Inst{7-0} = imm{7-0};
+ let DecoderMethod = "DecodeT2AddSubSPImm";
+ }
// register
def rr : T2sThreeReg<(outs GPRnopc:$Rd), (ins GPRnopc:$Rn, rGPR:$Rm),
IIC_iALUr, opc, ".w\t$Rd, $Rn, $Rm",
@@ -1412,7 +1449,8 @@ let mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1 in {
// Load doubleword
def t2LDRDi8 : T2Ii8s4<1, 0, 1, (outs rGPR:$Rt, rGPR:$Rt2),
(ins t2addrmode_imm8s4:$addr),
- IIC_iLoad_d_i, "ldrd", "\t$Rt, $Rt2, $addr", "", []>,
+ IIC_iLoad_d_i, "ldrd", "\t$Rt, $Rt2, $addr", "",
+ [(set rGPR:$Rt, rGPR:$Rt2, (ARMldrd t2addrmode_imm8s4:$addr))]>,
Sched<[WriteLd]>;
} // mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1
@@ -1593,7 +1631,8 @@ defm t2STRH:T2I_st<0b01,"strh", IIC_iStore_bh_i, IIC_iStore_bh_si,
let mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1 in
def t2STRDi8 : T2Ii8s4<1, 0, 0, (outs),
(ins rGPR:$Rt, rGPR:$Rt2, t2addrmode_imm8s4:$addr),
- IIC_iStore_d_r, "strd", "\t$Rt, $Rt2, $addr", "", []>,
+ IIC_iStore_d_r, "strd", "\t$Rt, $Rt2, $addr", "",
+ [(ARMstrd rGPR:$Rt, rGPR:$Rt2, t2addrmode_imm8s4:$addr)]>,
Sched<[WriteST]>;
// Indexed stores
@@ -2264,19 +2303,29 @@ def : t2InstSubst<"sbc${s}${p} $rd, $rn, $imm",
(t2ADCri rGPR:$rd, rGPR:$rn, t2_so_imm_not:$imm, pred:$p, s_cc_out:$s)>;
def : t2InstSubst<"add${s}${p}.w $rd, $rn, $imm",
- (t2SUBri GPRnopc:$rd, GPRnopc:$rn, t2_so_imm_neg:$imm, pred:$p, s_cc_out:$s)>;
-def : t2InstSubst<"addw${p} $rd, $rn, $imm",
- (t2SUBri12 GPRnopc:$rd, GPR:$rn, t2_so_imm_neg:$imm, pred:$p)>;
+ (t2SUBri rGPR:$rd, GPRnopc:$rn, t2_so_imm_neg:$imm, pred:$p, s_cc_out:$s)>;
+def : t2InstSubst<"sub${s}${p}.w $rd, $rn, $imm",
+ (t2ADDri rGPR:$rd, GPRnopc:$rn, t2_so_imm_neg:$imm, pred:$p, s_cc_out:$s)>;
+def : t2InstSubst<"subw${p} $Rd, $Rn, $imm",
+ (t2ADDri12 rGPR:$Rd, GPR:$Rn, imm0_4095_neg:$imm, pred:$p)>;
+def : t2InstSubst<"sub${s}${p} $rd, $rn, $imm",
+ (t2ADDri rGPR:$rd, GPRnopc:$rn, t2_so_imm_neg:$imm, pred:$p, s_cc_out:$s)>;
+def : t2InstSubst<"sub${p} $rd, $rn, $imm",
+ (t2ADDri12 rGPR:$rd, GPR:$rn, imm0_4095_neg:$imm, pred:$p)>;
+
+// SP to SP alike
+def : t2InstSubst<"add${s}${p}.w $rd, $rn, $imm",
+ (t2SUBspImm GPRsp:$rd, GPRsp:$rn, t2_so_imm_neg:$imm, pred:$p, s_cc_out:$s)>;
def : t2InstSubst<"sub${s}${p}.w $rd, $rn, $imm",
- (t2ADDri GPRnopc:$rd, GPRnopc:$rn, t2_so_imm_neg:$imm, pred:$p, s_cc_out:$s)>;
-def : t2InstSubst<"subw${p} $rd, $rn, $imm",
- (t2ADDri12 GPRnopc:$rd, GPR:$rn, t2_so_imm_neg:$imm, pred:$p)>;
+ (t2ADDspImm GPRsp:$rd, GPRsp:$rn, t2_so_imm_neg:$imm, pred:$p, s_cc_out:$s)>;
def : t2InstSubst<"subw${p} $Rd, $Rn, $imm",
- (t2ADDri12 GPRnopc:$Rd, GPR:$Rn, imm0_4095_neg:$imm, pred:$p)>;
+ (t2ADDspImm12 GPRsp:$Rd, GPRsp:$Rn, imm0_4095_neg:$imm, pred:$p)>;
def : t2InstSubst<"sub${s}${p} $rd, $rn, $imm",
- (t2ADDri GPRnopc:$rd, GPRnopc:$rn, t2_so_imm_neg:$imm, pred:$p, s_cc_out:$s)>;
+ (t2ADDspImm GPRsp:$rd, GPRsp:$rn, t2_so_imm_neg:$imm, pred:$p, s_cc_out:$s)>;
def : t2InstSubst<"sub${p} $rd, $rn, $imm",
- (t2ADDri12 GPRnopc:$rd, GPR:$rn, t2_so_imm_neg:$imm, pred:$p)>;
+ (t2ADDspImm12 GPRsp:$rd, GPRsp:$rn, imm0_4095_neg:$imm, pred:$p)>;
+
+
// RSB
defm t2RSB : T2I_rbin_irs <0b1110, "rsb", sub>;
@@ -2292,12 +2341,12 @@ defm t2RSBS : T2I_rbin_s_is <ARMsubc>;
// The AddedComplexity prefers the first variant over the others since
// it can be shrunk to a 16-bit wide encoding, while the others cannot.
let AddedComplexity = 1 in
-def : T2Pat<(add GPR:$src, imm1_255_neg:$imm),
- (t2SUBri GPR:$src, imm1_255_neg:$imm)>;
-def : T2Pat<(add GPR:$src, t2_so_imm_neg:$imm),
- (t2SUBri GPR:$src, t2_so_imm_neg:$imm)>;
-def : T2Pat<(add GPR:$src, imm0_4095_neg:$imm),
- (t2SUBri12 GPR:$src, imm0_4095_neg:$imm)>;
+def : T2Pat<(add rGPR:$src, imm1_255_neg:$imm),
+ (t2SUBri rGPR:$src, imm1_255_neg:$imm)>;
+def : T2Pat<(add rGPR:$src, t2_so_imm_neg:$imm),
+ (t2SUBri rGPR:$src, t2_so_imm_neg:$imm)>;
+def : T2Pat<(add rGPR:$src, imm0_4095_neg:$imm),
+ (t2SUBri12 rGPR:$src, imm0_4095_neg:$imm)>;
def : T2Pat<(add GPR:$src, imm0_65535_neg:$imm),
(t2SUBrr GPR:$src, (t2MOVi16 (imm_neg_XFORM imm:$imm)))>;
@@ -2796,10 +2845,10 @@ def : T2Pat<(t2_so_imm_not:$src),
// Thumb2SizeReduction's chances later on we select a t2ADD for an or where
// possible.
def : T2Pat<(or AddLikeOrOp:$Rn, t2_so_imm:$imm),
- (t2ADDri $Rn, t2_so_imm:$imm)>;
+ (t2ADDri rGPR:$Rn, t2_so_imm:$imm)>;
def : T2Pat<(or AddLikeOrOp:$Rn, imm0_4095:$Rm),
- (t2ADDri12 $Rn, imm0_4095:$Rm)>;
+ (t2ADDri12 rGPR:$Rn, imm0_4095:$Rm)>;
def : T2Pat<(or AddLikeOrOp:$Rn, non_imm32:$Rm),
(t2ADDrr $Rn, $Rm)>;
@@ -4551,10 +4600,18 @@ class T2TT<bits<2> at, string asm, list<dag> pattern>
let Unpredictable{5-0} = 0b111111;
}
-def t2TT : T2TT<0b00, "tt", []>, Requires<[IsThumb,Has8MSecExt]>;
-def t2TTT : T2TT<0b01, "ttt", []>, Requires<[IsThumb,Has8MSecExt]>;
-def t2TTA : T2TT<0b10, "tta", []>, Requires<[IsThumb,Has8MSecExt]>;
-def t2TTAT : T2TT<0b11, "ttat", []>, Requires<[IsThumb,Has8MSecExt]>;
+def t2TT : T2TT<0b00, "tt",
+ [(set rGPR:$Rt, (int_arm_cmse_tt GPRnopc:$Rn))]>,
+ Requires<[IsThumb, Has8MSecExt]>;
+def t2TTT : T2TT<0b01, "ttt",
+ [(set rGPR:$Rt, (int_arm_cmse_ttt GPRnopc:$Rn))]>,
+ Requires<[IsThumb, Has8MSecExt]>;
+def t2TTA : T2TT<0b10, "tta",
+ [(set rGPR:$Rt, (int_arm_cmse_tta GPRnopc:$Rn))]>,
+ Requires<[IsThumb, Has8MSecExt]>;
+def t2TTAT : T2TT<0b11, "ttat",
+ [(set rGPR:$Rt, (int_arm_cmse_ttat GPRnopc:$Rn))]>,
+ Requires<[IsThumb, Has8MSecExt]>;
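The TT family now carries selection patterns for the CMSE test-target intrinsics (int_arm_cmse_tt and friends), so they can be generated straight from IR. As a hedged sketch of how these are usually reached from source, the ACLE helpers in arm_cmse.h are assumed below; the header name, cmse_TT and the flags.secure field are not taken from this diff:

    #include <arm_cmse.h>  // assumed: ARMv8-M Security Extension ACLE support header

    // Query the security attribution of an address; expected to lower to
    // llvm.arm.cmse.tt and from there to the t2TT instruction defined above.
    int address_is_secure(void *p) {
      cmse_address_info_t info = cmse_TT(p);
      return info.flags.secure;  // assumed field layout per the CMSE ACLE
    }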
//===----------------------------------------------------------------------===//
// Non-Instruction Patterns
@@ -4655,10 +4712,10 @@ def : t2InstAlias<"sbc${s}${p} $Rd, $Rn, $ShiftedRm",
// Aliases for ADD without the ".w" optional width specifier.
def : t2InstAlias<"add${s}${p} $Rd, $Rn, $imm",
- (t2ADDri GPRnopc:$Rd, GPRnopc:$Rn, t2_so_imm:$imm, pred:$p,
+ (t2ADDri rGPR:$Rd, GPRnopc:$Rn, t2_so_imm:$imm, pred:$p,
cc_out:$s)>;
def : t2InstAlias<"add${p} $Rd, $Rn, $imm",
- (t2ADDri12 GPRnopc:$Rd, GPR:$Rn, imm0_4095:$imm, pred:$p)>;
+ (t2ADDri12 rGPR:$Rd, GPR:$Rn, imm0_4095:$imm, pred:$p)>;
def : t2InstAlias<"add${s}${p} $Rd, $Rn, $Rm",
(t2ADDrr GPRnopc:$Rd, GPRnopc:$Rn, rGPR:$Rm, pred:$p, cc_out:$s)>;
def : t2InstAlias<"add${s}${p} $Rd, $Rn, $ShiftedRm",
@@ -4666,9 +4723,11 @@ def : t2InstAlias<"add${s}${p} $Rd, $Rn, $ShiftedRm",
pred:$p, cc_out:$s)>;
// ... and with the destination and source register combined.
def : t2InstAlias<"add${s}${p} $Rdn, $imm",
- (t2ADDri GPRnopc:$Rdn, GPRnopc:$Rdn, t2_so_imm:$imm, pred:$p, cc_out:$s)>;
+ (t2ADDri rGPR:$Rdn, rGPR:$Rdn, t2_so_imm:$imm, pred:$p, cc_out:$s)>;
def : t2InstAlias<"add${p} $Rdn, $imm",
- (t2ADDri12 GPRnopc:$Rdn, GPRnopc:$Rdn, imm0_4095:$imm, pred:$p)>;
+ (t2ADDri12 rGPR:$Rdn, rGPR:$Rdn, imm0_4095:$imm, pred:$p)>;
+def : t2InstAlias<"addw${p} $Rdn, $imm",
+ (t2ADDri12 rGPR:$Rdn, rGPR:$Rdn, imm0_4095:$imm, pred:$p)>;
def : t2InstAlias<"add${s}${p} $Rdn, $Rm",
(t2ADDrr GPRnopc:$Rdn, GPRnopc:$Rdn, rGPR:$Rm, pred:$p, cc_out:$s)>;
def : t2InstAlias<"add${s}${p} $Rdn, $ShiftedRm",
@@ -4677,33 +4736,33 @@ def : t2InstAlias<"add${s}${p} $Rdn, $ShiftedRm",
// add w/ negative immediates is just a sub.
def : t2InstSubst<"add${s}${p} $Rd, $Rn, $imm",
- (t2SUBri GPRnopc:$Rd, GPRnopc:$Rn, t2_so_imm_neg:$imm, pred:$p,
+ (t2SUBri rGPR:$Rd, GPRnopc:$Rn, t2_so_imm_neg:$imm, pred:$p,
cc_out:$s)>;
def : t2InstSubst<"add${p} $Rd, $Rn, $imm",
- (t2SUBri12 GPRnopc:$Rd, GPR:$Rn, imm0_4095_neg:$imm, pred:$p)>;
+ (t2SUBri12 rGPR:$Rd, GPR:$Rn, imm0_4095_neg:$imm, pred:$p)>;
def : t2InstSubst<"add${s}${p} $Rdn, $imm",
- (t2SUBri GPRnopc:$Rdn, GPRnopc:$Rdn, t2_so_imm_neg:$imm, pred:$p,
+ (t2SUBri rGPR:$Rdn, rGPR:$Rdn, t2_so_imm_neg:$imm, pred:$p,
cc_out:$s)>;
def : t2InstSubst<"add${p} $Rdn, $imm",
- (t2SUBri12 GPRnopc:$Rdn, GPRnopc:$Rdn, imm0_4095_neg:$imm, pred:$p)>;
+ (t2SUBri12 rGPR:$Rdn, rGPR:$Rdn, imm0_4095_neg:$imm, pred:$p)>;
def : t2InstSubst<"add${s}${p}.w $Rd, $Rn, $imm",
- (t2SUBri GPRnopc:$Rd, GPRnopc:$Rn, t2_so_imm_neg:$imm, pred:$p,
+ (t2SUBri rGPR:$Rd, GPRnopc:$Rn, t2_so_imm_neg:$imm, pred:$p,
cc_out:$s)>;
def : t2InstSubst<"addw${p} $Rd, $Rn, $imm",
- (t2SUBri12 GPRnopc:$Rd, GPR:$Rn, imm0_4095_neg:$imm, pred:$p)>;
+ (t2SUBri12 rGPR:$Rd, rGPR:$Rn, imm0_4095_neg:$imm, pred:$p)>;
def : t2InstSubst<"add${s}${p}.w $Rdn, $imm",
- (t2SUBri GPRnopc:$Rdn, GPRnopc:$Rdn, t2_so_imm_neg:$imm, pred:$p,
+ (t2SUBri rGPR:$Rdn, rGPR:$Rdn, t2_so_imm_neg:$imm, pred:$p,
cc_out:$s)>;
def : t2InstSubst<"addw${p} $Rdn, $imm",
- (t2SUBri12 GPRnopc:$Rdn, GPRnopc:$Rdn, imm0_4095_neg:$imm, pred:$p)>;
+ (t2SUBri12 rGPR:$Rdn, rGPR:$Rdn, imm0_4095_neg:$imm, pred:$p)>;
// Aliases for SUB without the ".w" optional width specifier.
def : t2InstAlias<"sub${s}${p} $Rd, $Rn, $imm",
- (t2SUBri GPRnopc:$Rd, GPRnopc:$Rn, t2_so_imm:$imm, pred:$p, cc_out:$s)>;
+ (t2SUBri rGPR:$Rd, GPRnopc:$Rn, t2_so_imm:$imm, pred:$p, cc_out:$s)>;
def : t2InstAlias<"sub${p} $Rd, $Rn, $imm",
- (t2SUBri12 GPRnopc:$Rd, GPR:$Rn, imm0_4095:$imm, pred:$p)>;
+ (t2SUBri12 rGPR:$Rd, GPR:$Rn, imm0_4095:$imm, pred:$p)>;
def : t2InstAlias<"sub${s}${p} $Rd, $Rn, $Rm",
(t2SUBrr GPRnopc:$Rd, GPRnopc:$Rn, rGPR:$Rm, pred:$p, cc_out:$s)>;
def : t2InstAlias<"sub${s}${p} $Rd, $Rn, $ShiftedRm",
@@ -4711,9 +4770,11 @@ def : t2InstAlias<"sub${s}${p} $Rd, $Rn, $ShiftedRm",
pred:$p, cc_out:$s)>;
// ... and with the destination and source register combined.
def : t2InstAlias<"sub${s}${p} $Rdn, $imm",
- (t2SUBri GPRnopc:$Rdn, GPRnopc:$Rdn, t2_so_imm:$imm, pred:$p, cc_out:$s)>;
+ (t2SUBri rGPR:$Rdn, rGPR:$Rdn, t2_so_imm:$imm, pred:$p, cc_out:$s)>;
def : t2InstAlias<"sub${p} $Rdn, $imm",
- (t2SUBri12 GPRnopc:$Rdn, GPRnopc:$Rdn, imm0_4095:$imm, pred:$p)>;
+ (t2SUBri12 rGPR:$Rdn, rGPR:$Rdn, imm0_4095:$imm, pred:$p)>;
+def : t2InstAlias<"subw${p} $Rdn, $imm",
+ (t2SUBri12 rGPR:$Rdn, rGPR:$Rdn, imm0_4095:$imm, pred:$p)>;
def : t2InstAlias<"sub${s}${p}.w $Rdn, $Rm",
(t2SUBrr GPRnopc:$Rdn, GPRnopc:$Rdn, rGPR:$Rm, pred:$p, cc_out:$s)>;
def : t2InstAlias<"sub${s}${p} $Rdn, $Rm",
@@ -4722,6 +4783,65 @@ def : t2InstAlias<"sub${s}${p} $Rdn, $ShiftedRm",
(t2SUBrs GPRnopc:$Rdn, GPRnopc:$Rdn, t2_so_reg:$ShiftedRm,
pred:$p, cc_out:$s)>;
+// SP to SP alike aliases
+// Aliases for ADD without the ".w" optional width specifier.
+def : t2InstAlias<"add${s}${p} $Rd, $Rn, $imm",
+ (t2ADDspImm GPRsp:$Rd, GPRsp:$Rn, t2_so_imm:$imm, pred:$p,
+ cc_out:$s)>;
+def : t2InstAlias<"add${p} $Rd, $Rn, $imm",
+ (t2ADDspImm12 GPRsp:$Rd, GPRsp:$Rn, imm0_4095:$imm, pred:$p)>;
+// ... and with the destination and source register combined.
+def : t2InstAlias<"add${s}${p} $Rdn, $imm",
+ (t2ADDspImm GPRsp:$Rdn, GPRsp:$Rdn, t2_so_imm:$imm, pred:$p, cc_out:$s)>;
+
+def : t2InstAlias<"add${s}${p}.w $Rdn, $imm",
+ (t2ADDspImm GPRsp:$Rdn, GPRsp:$Rdn, t2_so_imm:$imm, pred:$p, cc_out:$s)>;
+
+def : t2InstAlias<"add${p} $Rdn, $imm",
+ (t2ADDspImm12 GPRsp:$Rdn, GPRsp:$Rdn, imm0_4095:$imm, pred:$p)>;
+
+def : t2InstAlias<"addw${p} $Rdn, $imm",
+ (t2ADDspImm12 GPRsp:$Rdn, GPRsp:$Rdn, imm0_4095:$imm, pred:$p)>;
+
+// add w/ negative immediates is just a sub.
+def : t2InstSubst<"add${s}${p} $Rd, $Rn, $imm",
+ (t2SUBspImm GPRsp:$Rd, GPRsp:$Rn, t2_so_imm_neg:$imm, pred:$p,
+ cc_out:$s)>;
+def : t2InstSubst<"add${p} $Rd, $Rn, $imm",
+ (t2SUBspImm12 GPRsp:$Rd, GPRsp:$Rn, imm0_4095_neg:$imm, pred:$p)>;
+def : t2InstSubst<"add${s}${p} $Rdn, $imm",
+ (t2SUBspImm GPRsp:$Rdn, GPRsp:$Rdn, t2_so_imm_neg:$imm, pred:$p,
+ cc_out:$s)>;
+def : t2InstSubst<"add${p} $Rdn, $imm",
+ (t2SUBspImm12 GPRsp:$Rdn, GPRsp:$Rdn, imm0_4095_neg:$imm, pred:$p)>;
+
+def : t2InstSubst<"add${s}${p}.w $Rd, $Rn, $imm",
+ (t2SUBspImm GPRsp:$Rd, GPRsp:$Rn, t2_so_imm_neg:$imm, pred:$p,
+ cc_out:$s)>;
+def : t2InstSubst<"addw${p} $Rd, $Rn, $imm",
+ (t2SUBspImm12 GPRsp:$Rd, GPRsp:$Rn, imm0_4095_neg:$imm, pred:$p)>;
+def : t2InstSubst<"add${s}${p}.w $Rdn, $imm",
+ (t2SUBspImm GPRsp:$Rdn, GPRsp:$Rdn, t2_so_imm_neg:$imm, pred:$p,
+ cc_out:$s)>;
+def : t2InstSubst<"addw${p} $Rdn, $imm",
+ (t2SUBspImm12 GPRsp:$Rdn, GPRsp:$Rdn, imm0_4095_neg:$imm, pred:$p)>;
+
+
+// Aliases for SUB without the ".w" optional width specifier.
+def : t2InstAlias<"sub${s}${p} $Rd, $Rn, $imm",
+ (t2SUBspImm GPRsp:$Rd, GPRsp:$Rn, t2_so_imm:$imm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"sub${p} $Rd, $Rn, $imm",
+ (t2SUBspImm12 GPRsp:$Rd, GPRsp:$Rn, imm0_4095:$imm, pred:$p)>;
+// ... and with the destination and source register combined.
+def : t2InstAlias<"sub${s}${p} $Rdn, $imm",
+ (t2SUBspImm GPRsp:$Rdn, GPRsp:$Rdn, t2_so_imm:$imm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"sub${s}${p}.w $Rdn, $imm",
+ (t2SUBspImm GPRsp:$Rdn, GPRsp:$Rdn, t2_so_imm:$imm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"sub${p} $Rdn, $imm",
+ (t2SUBspImm12 GPRsp:$Rdn, GPRsp:$Rdn, imm0_4095:$imm, pred:$p)>;
+def : t2InstAlias<"subw${p} $Rdn, $imm",
+ (t2SUBspImm12 GPRsp:$Rdn, GPRsp:$Rdn, imm0_4095:$imm, pred:$p)>;
+
// Alias for compares without the ".w" optional width specifier.
def : t2InstAlias<"cmn${p} $Rn, $Rm",
(t2CMNzrr GPRnopc:$Rn, rGPR:$Rm, pred:$p)>;
@@ -4978,10 +5098,16 @@ def : t2InstSubst<"orr${s}${p} $Rdn, $imm",
pred:$p, cc_out:$s)>;
// Likewise, "add Rd, t2_so_imm_neg" -> sub
def : t2InstSubst<"add${s}${p} $Rd, $Rn, $imm",
- (t2SUBri GPRnopc:$Rd, GPRnopc:$Rn, t2_so_imm_neg:$imm,
+ (t2SUBri rGPR:$Rd, GPRnopc:$Rn, t2_so_imm_neg:$imm,
+ pred:$p, cc_out:$s)>;
+def : t2InstSubst<"add${s}${p} $Rd, $Rn, $imm",
+ (t2SUBspImm GPRsp:$Rd, GPRsp:$Rn, t2_so_imm_neg:$imm,
+ pred:$p, cc_out:$s)>;
+def : t2InstSubst<"add${s}${p} $Rd, $imm",
+ (t2SUBri rGPR:$Rd, rGPR:$Rd, t2_so_imm_neg:$imm,
pred:$p, cc_out:$s)>;
def : t2InstSubst<"add${s}${p} $Rd, $imm",
- (t2SUBri GPRnopc:$Rd, GPRnopc:$Rd, t2_so_imm_neg:$imm,
+ (t2SUBspImm GPRsp:$Rd, GPRsp:$Rd, t2_so_imm_neg:$imm,
pred:$p, cc_out:$s)>;
// Same for CMP <--> CMN via t2_so_imm_neg
def : t2InstSubst<"cmp${p} $Rd, $imm",
@@ -5178,8 +5304,6 @@ class t2LOL<dag oops, dag iops, string asm, string ops>
let Inst{31-23} = 0b111100000;
let Inst{15-14} = 0b11;
let Inst{0} = 0b1;
- let isBranch = 1;
- let isTerminator = 1;
let DecoderMethod = "DecodeLOLoop";
let Predicates = [IsThumb2, HasV8_1MMainline, HasLOB];
}
@@ -5196,13 +5320,13 @@ def t2WLS : t2LOL<(outs GPRlr:$LR),
let Inst{11} = label{0};
let Inst{10-1} = label{10-1};
let usesCustomInserter = 1;
+ let isBranch = 1;
+ let isTerminator = 1;
}
def t2DLS : t2LOL<(outs GPRlr:$LR), (ins rGPR:$Rn),
"dls", "$LR, $Rn"> {
bits<4> Rn;
- let isBranch = 0;
- let isTerminator = 0;
let Inst{22-20} = 0b100;
let Inst{19-16} = Rn{3-0};
let Inst{13-1} = 0b1000000000000;
@@ -5218,6 +5342,8 @@ def t2LEUpdate : t2LOL<(outs GPRlr:$LRout),
let Inst{11} = label{0};
let Inst{10-1} = label{10-1};
let usesCustomInserter = 1;
+ let isBranch = 1;
+ let isTerminator = 1;
}
def t2LE : t2LOL<(outs ), (ins lelabel_u11:$label), "le", "$label"> {
@@ -5226,6 +5352,8 @@ def t2LE : t2LOL<(outs ), (ins lelabel_u11:$label), "le", "$label"> {
let Inst{13-12} = 0b00;
let Inst{11} = label{0};
let Inst{10-1} = label{10-1};
+ let isBranch = 1;
+ let isTerminator = 1;
}
def t2DoLoopStart :
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrVFP.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrVFP.td
index fdd961bfbb2f..a41a483d1a4c 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrVFP.td
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrVFP.td
@@ -277,7 +277,7 @@ def : MnemonicAlias<"vstm", "vstmia">;
//
let mayLoad = 1 in
def VLLDM : AXSI4<(outs), (ins GPRnopc:$Rn, pred:$p), IndexModeNone,
- IIC_fpLoad_m, "vlldm${p}\t$Rn", "", []>,
+ NoItinerary, "vlldm${p}\t$Rn", "", []>,
Requires<[HasV8MMainline, Has8MSecExt]> {
let Inst{24-23} = 0b00;
let Inst{22} = 0;
@@ -290,7 +290,7 @@ def VLLDM : AXSI4<(outs), (ins GPRnopc:$Rn, pred:$p), IndexModeNone,
let mayStore = 1 in
def VLSTM : AXSI4<(outs), (ins GPRnopc:$Rn, pred:$p), IndexModeNone,
- IIC_fpStore_m, "vlstm${p}\t$Rn", "", []>,
+ NoItinerary, "vlstm${p}\t$Rn", "", []>,
Requires<[HasV8MMainline, Has8MSecExt]> {
let Inst{24-23} = 0b00;
let Inst{22} = 0;
@@ -2143,6 +2143,9 @@ def : Pat<(f64 (fma (fneg DPR:$Dn), DPR:$Dm, DPR:$Ddin)),
def : Pat<(f32 (fma (fneg SPR:$Sn), SPR:$Sm, SPR:$Sdin)),
(VFMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
Requires<[HasVFP4]>;
+def : Pat<(f16 (fma (fneg HPR:$Sn), HPR:$Sm, HPR:$Sdin)),
+ (VFMSH HPR:$Sdin, HPR:$Sn, HPR:$Sm)>,
+ Requires<[HasFullFP16]>;
// (fma x, (fneg y), z) -> (vfms z, x, y)
def : Pat<(f64 (fma DPR:$Dn, (fneg DPR:$Dm), DPR:$Ddin)),
(VFMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
@@ -2150,6 +2153,9 @@ def : Pat<(f64 (fma DPR:$Dn, (fneg DPR:$Dm), DPR:$Ddin)),
def : Pat<(f32 (fma SPR:$Sn, (fneg SPR:$Sm), SPR:$Sdin)),
(VFMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
Requires<[HasVFP4]>;
+def : Pat<(f16 (fma HPR:$Sn, (fneg HPR:$Sm), HPR:$Sdin)),
+ (VFMSH HPR:$Sdin, HPR:$Sn, HPR:$Sm)>,
+ Requires<[HasFullFP16]>;
def VFNMAD : ADbI<0b11101, 0b01, 1, 0,
(outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
@@ -2196,6 +2202,9 @@ def : Pat<(fneg (fma (f64 DPR:$Dn), (f64 DPR:$Dm), (f64 DPR:$Ddin))),
def : Pat<(fneg (fma (f32 SPR:$Sn), (f32 SPR:$Sm), (f32 SPR:$Sdin))),
(VFNMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
Requires<[HasVFP4]>;
+def : Pat<(fneg (fma (f16 HPR:$Sn), (f16 HPR:$Sm), (f16 HPR:$Sdin))),
+ (VFNMAH HPR:$Sdin, HPR:$Sn, HPR:$Sm)>,
+ Requires<[HasFullFP16]>;
// (fma (fneg x), y, (fneg z)) -> (vfnma z, x, y)
def : Pat<(f64 (fma (fneg DPR:$Dn), DPR:$Dm, (fneg DPR:$Ddin))),
(VFNMAD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
@@ -2203,6 +2212,9 @@ def : Pat<(f64 (fma (fneg DPR:$Dn), DPR:$Dm, (fneg DPR:$Ddin))),
def : Pat<(f32 (fma (fneg SPR:$Sn), SPR:$Sm, (fneg SPR:$Sdin))),
(VFNMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
Requires<[HasVFP4]>;
+def : Pat<(f16 (fma (fneg HPR:$Sn), HPR:$Sm, (fneg HPR:$Sdin))),
+ (VFNMAH HPR:$Sdin, HPR:$Sn, HPR:$Sm)>,
+ Requires<[HasFullFP16]>;
def VFNMSD : ADbI<0b11101, 0b01, 0, 0,
(outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
@@ -2248,6 +2260,9 @@ def : Pat<(f64 (fma DPR:$Dn, DPR:$Dm, (fneg DPR:$Ddin))),
def : Pat<(f32 (fma SPR:$Sn, SPR:$Sm, (fneg SPR:$Sdin))),
(VFNMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
Requires<[HasVFP4]>;
+def : Pat<(f16 (fma HPR:$Sn, HPR:$Sm, (fneg HPR:$Sdin))),
+ (VFNMSH HPR:$Sdin, HPR:$Sn, HPR:$Sm)>,
+ Requires<[HasFullFP16]>;
// (fneg (fma (fneg x), y, z)) -> (vfnms z, x, y)
def : Pat<(fneg (f64 (fma (fneg DPR:$Dn), DPR:$Dm, DPR:$Ddin))),
(VFNMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
@@ -2255,6 +2270,9 @@ def : Pat<(fneg (f64 (fma (fneg DPR:$Dn), DPR:$Dm, DPR:$Ddin))),
def : Pat<(fneg (f32 (fma (fneg SPR:$Sn), SPR:$Sm, SPR:$Sdin))),
(VFNMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
Requires<[HasVFP4]>;
+def : Pat<(fneg (f16 (fma (fneg HPR:$Sn), HPR:$Sm, HPR:$Sdin))),
+ (VFNMSH HPR:$Sdin, HPR:$Sn, HPR:$Sm)>,
+ Requires<[HasFullFP16]>;
// (fneg (fma x, (fneg y), z)) -> (vfnms z, x, y)
def : Pat<(fneg (f64 (fma DPR:$Dn, (fneg DPR:$Dm), DPR:$Ddin))),
(VFNMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
@@ -2262,6 +2280,9 @@ def : Pat<(fneg (f64 (fma DPR:$Dn, (fneg DPR:$Dm), DPR:$Ddin))),
def : Pat<(fneg (f32 (fma SPR:$Sn, (fneg SPR:$Sm), SPR:$Sdin))),
(VFNMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
Requires<[HasVFP4]>;
+def : Pat<(fneg (f16 (fma HPR:$Sn, (fneg HPR:$Sm), HPR:$Sdin))),
+ (VFNMSH HPR:$Sdin, HPR:$Sn, HPR:$Sm)>,
+ Requires<[HasFullFP16]>;
//===----------------------------------------------------------------------===//
// FP Conditional moves.
@@ -2279,6 +2300,12 @@ def VMOVScc : PseudoInst<(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm, cmovpred:$p),
[(set (f32 SPR:$Sd),
(ARMcmov SPR:$Sn, SPR:$Sm, cmovpred:$p))]>,
RegConstraint<"$Sn = $Sd">, Requires<[HasFPRegs]>;
+
+def VMOVHcc : PseudoInst<(outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm, cmovpred:$p),
+ IIC_fpUNA16,
+ [(set (f16 HPR:$Sd),
+ (ARMcmov HPR:$Sn, HPR:$Sm, cmovpred:$p))]>,
+ RegConstraint<"$Sd = $Sn">, Requires<[HasFPRegs]>;
} // hasSideEffects
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstructionSelector.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstructionSelector.cpp
index 8e5e474c0f59..67816bc2103f 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstructionSelector.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstructionSelector.cpp
@@ -17,6 +17,7 @@
#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/IntrinsicsARM.h"
#include "llvm/Support/Debug.h"
#define DEBUG_TYPE "arm-isel"
@@ -137,8 +138,10 @@ private:
unsigned selectLoadStoreOpCode(unsigned Opc, unsigned RegBank,
unsigned Size) const;
- void renderVFPF32Imm(MachineInstrBuilder &New, const MachineInstr &Old) const;
- void renderVFPF64Imm(MachineInstrBuilder &New, const MachineInstr &Old) const;
+ void renderVFPF32Imm(MachineInstrBuilder &New, const MachineInstr &Old,
+ int OpIdx = -1) const;
+ void renderVFPF64Imm(MachineInstrBuilder &New, const MachineInstr &Old,
+ int OpIdx = -1) const;
#define GET_GLOBALISEL_PREDICATES_DECL
#include "ARMGenGlobalISel.inc"
@@ -810,9 +813,10 @@ bool ARMInstructionSelector::selectShift(unsigned ShiftOpc,
}
void ARMInstructionSelector::renderVFPF32Imm(
- MachineInstrBuilder &NewInstBuilder, const MachineInstr &OldInst) const {
+ MachineInstrBuilder &NewInstBuilder, const MachineInstr &OldInst,
+ int OpIdx) const {
assert(OldInst.getOpcode() == TargetOpcode::G_FCONSTANT &&
- "Expected G_FCONSTANT");
+ OpIdx == -1 && "Expected G_FCONSTANT");
APFloat FPImmValue = OldInst.getOperand(1).getFPImm()->getValueAPF();
int FPImmEncoding = ARM_AM::getFP32Imm(FPImmValue);
@@ -822,9 +826,9 @@ void ARMInstructionSelector::renderVFPF32Imm(
}
void ARMInstructionSelector::renderVFPF64Imm(
- MachineInstrBuilder &NewInstBuilder, const MachineInstr &OldInst) const {
+ MachineInstrBuilder &NewInstBuilder, const MachineInstr &OldInst, int OpIdx) const {
assert(OldInst.getOpcode() == TargetOpcode::G_FCONSTANT &&
- "Expected G_FCONSTANT");
+ OpIdx == -1 && "Expected G_FCONSTANT");
APFloat FPImmValue = OldInst.getOperand(1).getFPImm()->getValueAPF();
int FPImmEncoding = ARM_AM::getFP64Imm(FPImmValue);
@@ -1061,7 +1065,7 @@ bool ARMInstructionSelector::select(MachineInstr &I) {
case G_SHL: {
return selectShift(ARM_AM::ShiftOpc::lsl, MIB);
}
- case G_GEP:
+ case G_PTR_ADD:
I.setDesc(TII.get(Opcodes.ADDrr));
MIB.add(predOps(ARMCC::AL)).add(condCodeOp());
break;
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp
index 81414e6d76fe..e2dff51ea61c 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp
@@ -162,7 +162,7 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) {
.legalFor({s32, p0})
.minScalar(0, s32);
- getActionDefinitionsBuilder(G_GEP)
+ getActionDefinitionsBuilder(G_PTR_ADD)
.legalFor({{p0, s32}})
.minScalar(1, s32);
@@ -264,7 +264,7 @@ void ARMLegalizerInfo::setFCmpLibcallsAEABI() {
{RTLIB::OLE_F32, CmpInst::BAD_ICMP_PREDICATE}};
FCmp32Libcalls[CmpInst::FCMP_OLT] = {
{RTLIB::OLT_F32, CmpInst::BAD_ICMP_PREDICATE}};
- FCmp32Libcalls[CmpInst::FCMP_ORD] = {{RTLIB::O_F32, CmpInst::ICMP_EQ}};
+ FCmp32Libcalls[CmpInst::FCMP_ORD] = {{RTLIB::UO_F32, CmpInst::ICMP_EQ}};
FCmp32Libcalls[CmpInst::FCMP_UGE] = {{RTLIB::OLT_F32, CmpInst::ICMP_EQ}};
FCmp32Libcalls[CmpInst::FCMP_UGT] = {{RTLIB::OLE_F32, CmpInst::ICMP_EQ}};
FCmp32Libcalls[CmpInst::FCMP_ULE] = {{RTLIB::OGT_F32, CmpInst::ICMP_EQ}};
@@ -290,7 +290,7 @@ void ARMLegalizerInfo::setFCmpLibcallsAEABI() {
{RTLIB::OLE_F64, CmpInst::BAD_ICMP_PREDICATE}};
FCmp64Libcalls[CmpInst::FCMP_OLT] = {
{RTLIB::OLT_F64, CmpInst::BAD_ICMP_PREDICATE}};
- FCmp64Libcalls[CmpInst::FCMP_ORD] = {{RTLIB::O_F64, CmpInst::ICMP_EQ}};
+ FCmp64Libcalls[CmpInst::FCMP_ORD] = {{RTLIB::UO_F64, CmpInst::ICMP_EQ}};
FCmp64Libcalls[CmpInst::FCMP_UGE] = {{RTLIB::OLT_F64, CmpInst::ICMP_EQ}};
FCmp64Libcalls[CmpInst::FCMP_UGT] = {{RTLIB::OLE_F64, CmpInst::ICMP_EQ}};
FCmp64Libcalls[CmpInst::FCMP_ULE] = {{RTLIB::OGT_F64, CmpInst::ICMP_EQ}};
@@ -315,7 +315,7 @@ void ARMLegalizerInfo::setFCmpLibcallsGNU() {
FCmp32Libcalls[CmpInst::FCMP_OGT] = {{RTLIB::OGT_F32, CmpInst::ICMP_SGT}};
FCmp32Libcalls[CmpInst::FCMP_OLE] = {{RTLIB::OLE_F32, CmpInst::ICMP_SLE}};
FCmp32Libcalls[CmpInst::FCMP_OLT] = {{RTLIB::OLT_F32, CmpInst::ICMP_SLT}};
- FCmp32Libcalls[CmpInst::FCMP_ORD] = {{RTLIB::O_F32, CmpInst::ICMP_EQ}};
+ FCmp32Libcalls[CmpInst::FCMP_ORD] = {{RTLIB::UO_F32, CmpInst::ICMP_EQ}};
FCmp32Libcalls[CmpInst::FCMP_UGE] = {{RTLIB::OLT_F32, CmpInst::ICMP_SGE}};
FCmp32Libcalls[CmpInst::FCMP_UGT] = {{RTLIB::OLE_F32, CmpInst::ICMP_SGT}};
FCmp32Libcalls[CmpInst::FCMP_ULE] = {{RTLIB::OGT_F32, CmpInst::ICMP_SLE}};
@@ -333,7 +333,7 @@ void ARMLegalizerInfo::setFCmpLibcallsGNU() {
FCmp64Libcalls[CmpInst::FCMP_OGT] = {{RTLIB::OGT_F64, CmpInst::ICMP_SGT}};
FCmp64Libcalls[CmpInst::FCMP_OLE] = {{RTLIB::OLE_F64, CmpInst::ICMP_SLE}};
FCmp64Libcalls[CmpInst::FCMP_OLT] = {{RTLIB::OLT_F64, CmpInst::ICMP_SLT}};
- FCmp64Libcalls[CmpInst::FCMP_ORD] = {{RTLIB::O_F64, CmpInst::ICMP_EQ}};
+ FCmp64Libcalls[CmpInst::FCMP_ORD] = {{RTLIB::UO_F64, CmpInst::ICMP_EQ}};
FCmp64Libcalls[CmpInst::FCMP_UGE] = {{RTLIB::OLT_F64, CmpInst::ICMP_SGE}};
FCmp64Libcalls[CmpInst::FCMP_UGT] = {{RTLIB::OLE_F64, CmpInst::ICMP_SGT}};
FCmp64Libcalls[CmpInst::FCMP_ULE] = {{RTLIB::OGT_F64, CmpInst::ICMP_SLE}};
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
index 4a193fed04a3..12dddd29ca84 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -696,18 +696,23 @@ MachineInstr *ARMLoadStoreOpt::CreateLoadStoreMulti(
return nullptr;
}
- int BaseOpc =
- isThumb2 ? ARM::t2ADDri :
- (isThumb1 && Base == ARM::SP) ? ARM::tADDrSPi :
- (isThumb1 && Offset < 8) ? ARM::tADDi3 :
- isThumb1 ? ARM::tADDi8 : ARM::ADDri;
+ int BaseOpc = isThumb2 ? (BaseKill && Base == ARM::SP ? ARM::t2ADDspImm
+ : ARM::t2ADDri)
+ : (isThumb1 && Base == ARM::SP)
+ ? ARM::tADDrSPi
+ : (isThumb1 && Offset < 8)
+ ? ARM::tADDi3
+ : isThumb1 ? ARM::tADDi8 : ARM::ADDri;
if (Offset < 0) {
- Offset = - Offset;
- BaseOpc =
- isThumb2 ? ARM::t2SUBri :
- (isThumb1 && Offset < 8 && Base != ARM::SP) ? ARM::tSUBi3 :
- isThumb1 ? ARM::tSUBi8 : ARM::SUBri;
+ // FIXME: There are no Thumb1 load/store instructions with negative
+ // offsets. So the Base != ARM::SP might be unnecessary.
+ Offset = -Offset;
+ BaseOpc = isThumb2 ? (BaseKill && Base == ARM::SP ? ARM::t2SUBspImm
+ : ARM::t2SUBri)
+ : (isThumb1 && Offset < 8 && Base != ARM::SP)
+ ? ARM::tSUBi3
+ : isThumb1 ? ARM::tSUBi8 : ARM::SUBri;
}
if (!TL->isLegalAddImmediate(Offset))
@@ -1186,8 +1191,10 @@ static int isIncrementOrDecrement(const MachineInstr &MI, unsigned Reg,
case ARM::tADDi8: Scale = 4; CheckCPSRDef = true; break;
case ARM::tSUBi8: Scale = -4; CheckCPSRDef = true; break;
case ARM::t2SUBri:
+ case ARM::t2SUBspImm:
case ARM::SUBri: Scale = -1; CheckCPSRDef = true; break;
case ARM::t2ADDri:
+ case ARM::t2ADDspImm:
case ARM::ADDri: Scale = 1; CheckCPSRDef = true; break;
case ARM::tADDspi: Scale = 4; CheckCPSRDef = false; break;
case ARM::tSUBspi: Scale = -4; CheckCPSRDef = false; break;
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
index e1c5a9c3e223..6717d4706aef 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
@@ -15,6 +15,26 @@
/// - t2LoopDec - placed within the loop body.
/// - t2LoopEnd - the loop latch terminator.
///
+/// In addition to this, we also look for the presence of the VCTP instruction,
+/// which determines whether we can generate the tail-predicated low-overhead
+/// loop form.
+///
+/// Assumptions and Dependencies:
+/// Low-overhead loops are constructed and executed using a setup instruction:
+/// DLS, WLS, DLSTP or WLSTP and an instruction that loops back: LE or LETP.
+/// WLS(TP) and LE(TP) are branching instructions with a (large) limited range
+/// but fixed polarity: WLS can only branch forwards and LE can only branch
+/// backwards. These restrictions mean that this pass is dependent upon block
+/// layout and block sizes, which is why it's the last pass to run. The same is
+/// true for ConstantIslands, but this pass does not increase the size of the
+/// basic blocks, nor does it change the CFG. Instructions are mainly removed
+/// during the transform and pseudo instructions are replaced by real ones. In
+/// some cases, when we have to revert to a 'normal' loop, we have to introduce
+/// multiple instructions for a single pseudo (see RevertWhile and
+/// RevertLoopEnd). To handle this situation, t2WhileLoopStart and t2LoopEnd
+/// are defined to be as large as this maximum sequence of replacement
+/// instructions.
+///
//===----------------------------------------------------------------------===//
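For orientation, the kind of loop this pass rewrites can be pictured from a plain counted loop; the C++ below is illustrative only (nothing in it comes from this patch), with the expected lowering noted in comments using the instruction names listed above:

    // A simple counted loop is a candidate for the hardware-loop form:
    //   preheader: DLS lr, rN    (WLS when the trip count may be zero)
    //   body:      ...           LE lr, body
    // When every vector operation in the body is predicated by a VCTP on the
    // remaining element count, the DLSTP/LETP tail-predicated form is used.
    void scale(int *dst, const int *src, int n) {
      for (int i = 0; i < n; ++i)  // the trip count becomes the DLS/WLS operand
        dst[i] = src[i] * 3;
    }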
#include "ARM.h"
@@ -22,9 +42,16 @@
#include "ARMBaseRegisterInfo.h"
#include "ARMBasicBlockInfo.h"
#include "ARMSubtarget.h"
+#include "Thumb2InstrInfo.h"
+#include "llvm/ADT/SetOperations.h"
+#include "llvm/ADT/SmallSet.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineLoopUtils.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/ReachingDefAnalysis.h"
+#include "llvm/MC/MCInstrDesc.h"
using namespace llvm;
@@ -33,10 +60,154 @@ using namespace llvm;
namespace {
+ struct PredicatedMI {
+ MachineInstr *MI = nullptr;
+ SetVector<MachineInstr*> Predicates;
+
+ public:
+ PredicatedMI(MachineInstr *I, SetVector<MachineInstr*> &Preds) :
+ MI(I) {
+ Predicates.insert(Preds.begin(), Preds.end());
+ }
+ };
+
+ // Represents a VPT block: a list of instructions that begins with a VPST and
+ // has a maximum of four following instructions. All instructions within the
+ // block are predicated upon the VPR, and we allow instructions to define the
+ // VPR within the block too.
+ class VPTBlock {
+ std::unique_ptr<PredicatedMI> VPST;
+ PredicatedMI *Divergent = nullptr;
+ SmallVector<PredicatedMI, 4> Insts;
+
+ public:
+ VPTBlock(MachineInstr *MI, SetVector<MachineInstr*> &Preds) {
+ VPST = std::make_unique<PredicatedMI>(MI, Preds);
+ }
+
+ void addInst(MachineInstr *MI, SetVector<MachineInstr*> &Preds) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Adding predicated MI: " << *MI);
+ if (!Divergent && !set_difference(Preds, VPST->Predicates).empty()) {
+ Divergent = &Insts.back();
+ LLVM_DEBUG(dbgs() << " - has divergent predicate: " << *Divergent->MI);
+ }
+ Insts.emplace_back(MI, Preds);
+ assert(Insts.size() <= 4 && "Too many instructions in VPT block!");
+ }
+
+ // Have we found an instruction within the block which defines the vpr? If
+ // so, not all the instructions in the block will have the same predicate.
+ bool HasNonUniformPredicate() const {
+ return Divergent != nullptr;
+ }
+
+ // Is the given instruction part of the predicate set controlling the entry
+ // to the block?
+ bool IsPredicatedOn(MachineInstr *MI) const {
+ return VPST->Predicates.count(MI);
+ }
+
+ // Is the given instruction the only predicate which controls the entry to
+ // the block?
+ bool IsOnlyPredicatedOn(MachineInstr *MI) const {
+ return IsPredicatedOn(MI) && VPST->Predicates.size() == 1;
+ }
+
+ unsigned size() const { return Insts.size(); }
+ SmallVectorImpl<PredicatedMI> &getInsts() { return Insts; }
+ MachineInstr *getVPST() const { return VPST->MI; }
+ PredicatedMI *getDivergent() const { return Divergent; }
+ };
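As a rough sketch of how a VPT block gets built up while the loop body is scanned (a fragment meant to live inside this file; the usesVPRAsPredicate helper and the local names are hypothetical, the real logic is in ValidateMVEInst below):

    SmallVector<VPTBlock, 4> Blocks;
    SetVector<MachineInstr*> Preds;          // VPR-defining instructions seen so far
    for (MachineInstr &MI : *LoopBody) {     // LoopBody: the single loop block
      if (MI.getOpcode() == ARM::MVE_VPST)   // a VPST opens a new block
        Blocks.emplace_back(&MI, Preds);
      else if (usesVPRAsPredicate(MI))       // hypothetical check for a vpred use
        Blocks.back().addInst(&MI, Preds);   // asserts if the block exceeds 4 insts
    }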
+
+ struct LowOverheadLoop {
+
+ MachineLoop *ML = nullptr;
+ MachineFunction *MF = nullptr;
+ MachineInstr *InsertPt = nullptr;
+ MachineInstr *Start = nullptr;
+ MachineInstr *Dec = nullptr;
+ MachineInstr *End = nullptr;
+ MachineInstr *VCTP = nullptr;
+ VPTBlock *CurrentBlock = nullptr;
+ SetVector<MachineInstr*> CurrentPredicate;
+ SmallVector<VPTBlock, 4> VPTBlocks;
+ bool Revert = false;
+ bool CannotTailPredicate = false;
+
+ LowOverheadLoop(MachineLoop *ML) : ML(ML) {
+ MF = ML->getHeader()->getParent();
+ }
+
+ // If this is an MVE instruction, check that we know how to use tail
+ // predication with it. Record VPT blocks and return whether the
+ // instruction is valid for tail predication.
+ bool ValidateMVEInst(MachineInstr *MI);
+
+ void AnalyseMVEInst(MachineInstr *MI) {
+ CannotTailPredicate = !ValidateMVEInst(MI);
+ }
+
+ bool IsTailPredicationLegal() const {
+ // For now, let's keep things really simple and only support a single
+ // block for tail predication.
+ return !Revert && FoundAllComponents() && VCTP &&
+ !CannotTailPredicate && ML->getNumBlocks() == 1;
+ }
+
+ bool ValidateTailPredicate(MachineInstr *StartInsertPt,
+ ReachingDefAnalysis *RDA,
+ MachineLoopInfo *MLI);
+
+ // Is it safe to define LR with DLS/WLS?
+ // LR can be defined if it is the operand to start, because it's the same
+ // value, or if it's going to be equivalent to the operand to Start.
+ MachineInstr *IsSafeToDefineLR(ReachingDefAnalysis *RDA);
+
+ // Check that the branch targets are within range and that we satisfy our
+ // restrictions.
+ void CheckLegality(ARMBasicBlockUtils *BBUtils, ReachingDefAnalysis *RDA,
+ MachineLoopInfo *MLI);
+
+ bool FoundAllComponents() const {
+ return Start && Dec && End;
+ }
+
+ SmallVectorImpl<VPTBlock> &getVPTBlocks() { return VPTBlocks; }
+
+ // Return the loop iteration count, or the number of elements if we're tail
+ // predicating.
+ MachineOperand &getCount() {
+ return IsTailPredicationLegal() ?
+ VCTP->getOperand(1) : Start->getOperand(0);
+ }
+
+ unsigned getStartOpcode() const {
+ bool IsDo = Start->getOpcode() == ARM::t2DoLoopStart;
+ if (!IsTailPredicationLegal())
+ return IsDo ? ARM::t2DLS : ARM::t2WLS;
+
+ return VCTPOpcodeToLSTP(VCTP->getOpcode(), IsDo);
+ }
+
+ void dump() const {
+ if (Start) dbgs() << "ARM Loops: Found Loop Start: " << *Start;
+ if (Dec) dbgs() << "ARM Loops: Found Loop Dec: " << *Dec;
+ if (End) dbgs() << "ARM Loops: Found Loop End: " << *End;
+ if (VCTP) dbgs() << "ARM Loops: Found VCTP: " << *VCTP;
+ if (!FoundAllComponents())
+ dbgs() << "ARM Loops: Not a low-overhead loop.\n";
+ else if (!(Start && Dec && End))
+ dbgs() << "ARM Loops: Failed to find all loop components.\n";
+ }
+ };
+
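The pass drives this structure in a fixed order; compressed into a sketch of the flow that ProcessLoop and Expand implement further down (the instruction walk here is schematic, and ProcessLoop also picks out Dec/End/Start by opcode during the same walk):

    LowOverheadLoop LoLoop(ML);
    LoLoop.Start = SearchForStart(Preheader);      // t2DoLoopStart / t2WhileLoopStart
    for (MachineBasicBlock *MBB : ML->getBlocks())
      for (MachineInstr &MI : *MBB)
        LoLoop.AnalyseMVEInst(&MI);                // records VPT blocks and the VCTP
    LoLoop.CheckLegality(BBUtils.get(), RDA, MLI); // branch ranges, chooses InsertPt
    if (LoLoop.FoundAllComponents())
      Expand(LoLoop);                              // emit DLS/WLS(TP) + LE(TP), or revert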
class ARMLowOverheadLoops : public MachineFunctionPass {
MachineFunction *MF = nullptr;
+ MachineLoopInfo *MLI = nullptr;
+ ReachingDefAnalysis *RDA = nullptr;
const ARMBaseInstrInfo *TII = nullptr;
MachineRegisterInfo *MRI = nullptr;
+ const TargetRegisterInfo *TRI = nullptr;
std::unique_ptr<ARMBasicBlockUtils> BBUtils = nullptr;
public:
@@ -47,6 +218,7 @@ namespace {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
AU.addRequired<MachineLoopInfo>();
+ AU.addRequired<ReachingDefAnalysis>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -54,7 +226,8 @@ namespace {
MachineFunctionProperties getRequiredProperties() const override {
return MachineFunctionProperties().set(
- MachineFunctionProperties::Property::NoVRegs);
+ MachineFunctionProperties::Property::NoVRegs).set(
+ MachineFunctionProperties::Property::TracksLiveness);
}
StringRef getPassName() const override {
@@ -64,8 +237,6 @@ namespace {
private:
bool ProcessLoop(MachineLoop *ML);
- MachineInstr * IsSafeToDefineLR(MachineInstr *MI);
-
bool RevertNonLoops();
void RevertWhile(MachineInstr *MI) const;
@@ -74,9 +245,13 @@ namespace {
void RevertLoopEnd(MachineInstr *MI, bool SkipCmp = false) const;
- void Expand(MachineLoop *ML, MachineInstr *Start,
- MachineInstr *InsertPt, MachineInstr *Dec,
- MachineInstr *End, bool Revert);
+ void RemoveLoopUpdate(LowOverheadLoop &LoLoop);
+
+ void ConvertVPTBlocks(LowOverheadLoop &LoLoop);
+
+ MachineInstr *ExpandLoopStart(LowOverheadLoop &LoLoop);
+
+ void Expand(LowOverheadLoop &LoLoop);
};
}
@@ -86,128 +261,321 @@ char ARMLowOverheadLoops::ID = 0;
INITIALIZE_PASS(ARMLowOverheadLoops, DEBUG_TYPE, ARM_LOW_OVERHEAD_LOOPS_NAME,
false, false)
-bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) {
- const ARMSubtarget &ST = static_cast<const ARMSubtarget&>(mf.getSubtarget());
- if (!ST.hasLOB())
- return false;
+MachineInstr *LowOverheadLoop::IsSafeToDefineLR(ReachingDefAnalysis *RDA) {
+ // We can define LR because LR already contains the same value.
+ if (Start->getOperand(0).getReg() == ARM::LR)
+ return Start;
- MF = &mf;
- LLVM_DEBUG(dbgs() << "ARM Loops on " << MF->getName() << " ------------- \n");
+ unsigned CountReg = Start->getOperand(0).getReg();
+ auto IsMoveLR = [&CountReg](MachineInstr *MI) {
+ return MI->getOpcode() == ARM::tMOVr &&
+ MI->getOperand(0).getReg() == ARM::LR &&
+ MI->getOperand(1).getReg() == CountReg &&
+ MI->getOperand(2).getImm() == ARMCC::AL;
+ };
- auto &MLI = getAnalysis<MachineLoopInfo>();
- MF->getProperties().set(MachineFunctionProperties::Property::TracksLiveness);
- MRI = &MF->getRegInfo();
- TII = static_cast<const ARMBaseInstrInfo*>(ST.getInstrInfo());
- BBUtils = std::unique_ptr<ARMBasicBlockUtils>(new ARMBasicBlockUtils(*MF));
- BBUtils->computeAllBlockSizes();
- BBUtils->adjustBBOffsetsAfter(&MF->front());
+ MachineBasicBlock *MBB = Start->getParent();
- bool Changed = false;
- for (auto ML : MLI) {
- if (!ML->getParentLoop())
- Changed |= ProcessLoop(ML);
- }
- Changed |= RevertNonLoops();
- return Changed;
+ // Find an insertion point:
+ // - Is there a (mov lr, Count) before Start? If so, and nothing else writes
+ // to Count before Start, we can insert at that mov.
+ if (auto *LRDef = RDA->getReachingMIDef(Start, ARM::LR))
+ if (IsMoveLR(LRDef) && RDA->hasSameReachingDef(Start, LRDef, CountReg))
+ return LRDef;
+
+ // - Is there a (mov lr, Count) after Start? If so, and nothing else writes
+ // to Count after Start, we can insert at that mov.
+ if (auto *LRDef = RDA->getLocalLiveOutMIDef(MBB, ARM::LR))
+ if (IsMoveLR(LRDef) && RDA->hasSameReachingDef(Start, LRDef, CountReg))
+ return LRDef;
+
+ // We've found no suitable LR def and Start doesn't use LR directly. Can we
+ // just define LR anyway?
+ if (!RDA->isRegUsedAfter(Start, ARM::LR))
+ return Start;
+
+ return nullptr;
}
-static bool IsLoopStart(MachineInstr &MI) {
- return MI.getOpcode() == ARM::t2DoLoopStart ||
- MI.getOpcode() == ARM::t2WhileLoopStart;
+// Can we safely move 'From' to just before 'To'? To satisfy this, 'From' must
+// not define a register that is used by any instruction after, and including,
+// 'To'. These instructions also must not redefine any of From's operands.
+template<typename Iterator>
+static bool IsSafeToMove(MachineInstr *From, MachineInstr *To, ReachingDefAnalysis *RDA) {
+ SmallSet<int, 2> Defs;
+ // First check that From would compute the same value if moved.
+ for (auto &MO : From->operands()) {
+ if (!MO.isReg() || MO.isUndef() || !MO.getReg())
+ continue;
+ if (MO.isDef())
+ Defs.insert(MO.getReg());
+ else if (!RDA->hasSameReachingDef(From, To, MO.getReg()))
+ return false;
+ }
+
+ // Now walk checking that the rest of the instructions will compute the same
+ // value.
+ for (auto I = ++Iterator(From), E = Iterator(To); I != E; ++I) {
+ for (auto &MO : I->operands())
+ if (MO.isReg() && MO.getReg() && MO.isUse() && Defs.count(MO.getReg()))
+ return false;
+ }
+ return true;
}
-template<typename T>
-static MachineInstr* SearchForDef(MachineInstr *Begin, T End, unsigned Reg) {
- for(auto &MI : make_range(T(Begin), End)) {
- for (auto &MO : MI.operands()) {
- if (!MO.isReg() || !MO.isDef() || MO.getReg() != Reg)
+bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt,
+ ReachingDefAnalysis *RDA, MachineLoopInfo *MLI) {
+ assert(VCTP && "VCTP instruction expected but is not set");
+ // All predication within the loop should be based on vctp. If the block
+ // isn't predicated on entry, check whether the vctp is within the block
+ // and that all other instructions are then predicated on it.
+ for (auto &Block : VPTBlocks) {
+ if (Block.IsPredicatedOn(VCTP))
+ continue;
+ if (!Block.HasNonUniformPredicate() || !isVCTP(Block.getDivergent()->MI)) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Found unsupported diverging predicate: "
+ << *Block.getDivergent()->MI);
+ return false;
+ }
+ SmallVectorImpl<PredicatedMI> &Insts = Block.getInsts();
+ for (auto &PredMI : Insts) {
+ if (PredMI.Predicates.count(VCTP) || isVCTP(PredMI.MI))
continue;
- return &MI;
+ LLVM_DEBUG(dbgs() << "ARM Loops: Can't convert: " << *PredMI.MI
+ << " - which is predicated on:\n";
+ for (auto *MI : PredMI.Predicates)
+ dbgs() << " - " << *MI;
+ );
+ return false;
}
}
- return nullptr;
-}
-static MachineInstr* SearchForUse(MachineInstr *Begin,
- MachineBasicBlock::iterator End,
- unsigned Reg) {
- for(auto &MI : make_range(MachineBasicBlock::iterator(Begin), End)) {
- for (auto &MO : MI.operands()) {
- if (!MO.isReg() || !MO.isUse() || MO.getReg() != Reg)
- continue;
- return &MI;
+ // For tail predication, we need to provide the number of elements, instead
+ // of the iteration count, to the loop start instruction. The number of
+ // elements is provided to the vctp instruction, so we need to check that
+ // we can use this register at InsertPt.
+ Register NumElements = VCTP->getOperand(1).getReg();
+
+ // If the register is defined within loop, then we can't perform TP.
+ // TODO: Check whether this is just a mov of a register that would be
+ // available.
+ if (RDA->getReachingDef(VCTP, NumElements) >= 0) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: VCTP operand is defined in the loop.\n");
+ return false;
+ }
+
+ // The element count register may be defined after InsertPt, in which case we
+ // need to try to move either InsertPt or the def so that the [w|d]lstp can
+ // use the value.
+ MachineBasicBlock *InsertBB = InsertPt->getParent();
+ if (!RDA->isReachingDefLiveOut(InsertPt, NumElements)) {
+ if (auto *ElemDef = RDA->getLocalLiveOutMIDef(InsertBB, NumElements)) {
+ if (IsSafeToMove<MachineBasicBlock::reverse_iterator>(ElemDef, InsertPt, RDA)) {
+ ElemDef->removeFromParent();
+ InsertBB->insert(MachineBasicBlock::iterator(InsertPt), ElemDef);
+ LLVM_DEBUG(dbgs() << "ARM Loops: Moved element count def: "
+ << *ElemDef);
+ } else if (IsSafeToMove<MachineBasicBlock::iterator>(InsertPt, ElemDef, RDA)) {
+ InsertPt->removeFromParent();
+ InsertBB->insertAfter(MachineBasicBlock::iterator(ElemDef), InsertPt);
+ LLVM_DEBUG(dbgs() << "ARM Loops: Moved start past: " << *ElemDef);
+ } else {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Unable to move element count to loop "
+ << "start instruction.\n");
+ return false;
+ }
}
}
- return nullptr;
+
+ // Especially in the case of while loops, InsertBB may not be the
+ // preheader, so we need to check that the register isn't redefined
+ // before entering the loop.
+ auto CannotProvideElements = [&RDA](MachineBasicBlock *MBB,
+ Register NumElements) {
+ // NumElements is redefined in this block.
+ if (RDA->getReachingDef(&MBB->back(), NumElements) >= 0)
+ return true;
+
+ // Don't continue searching up through multiple predecessors.
+ if (MBB->pred_size() > 1)
+ return true;
+
+ return false;
+ };
+
+ // First, find the block that looks like the preheader.
+ MachineBasicBlock *MBB = MLI->findLoopPreheader(ML, true);
+ if (!MBB) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Didn't find preheader.\n");
+ return false;
+ }
+
+ // Then search backwards for a def, until we get to InsertBB.
+ while (MBB != InsertBB) {
+ if (CannotProvideElements(MBB, NumElements)) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Unable to provide element count.\n");
+ return false;
+ }
+ MBB = *MBB->pred_begin();
+ }
+
+ LLVM_DEBUG(dbgs() << "ARM Loops: Will use tail predication.\n");
+ return true;
}
-// Is it safe to define LR with DLS/WLS?
-// LR can defined if it is the operand to start, because it's the same value,
-// or if it's going to be equivalent to the operand to Start.
-MachineInstr *ARMLowOverheadLoops::IsSafeToDefineLR(MachineInstr *Start) {
+void LowOverheadLoop::CheckLegality(ARMBasicBlockUtils *BBUtils,
+ ReachingDefAnalysis *RDA,
+ MachineLoopInfo *MLI) {
+ if (Revert)
+ return;
- auto IsMoveLR = [](MachineInstr *MI, unsigned Reg) {
- return MI->getOpcode() == ARM::tMOVr &&
- MI->getOperand(0).getReg() == ARM::LR &&
- MI->getOperand(1).getReg() == Reg &&
- MI->getOperand(2).getImm() == ARMCC::AL;
- };
+ if (!End->getOperand(1).isMBB())
+ report_fatal_error("Expected LoopEnd to target basic block");
- MachineBasicBlock *MBB = Start->getParent();
- unsigned CountReg = Start->getOperand(0).getReg();
- // Walk forward and backward in the block to find the closest instructions
- // that define LR. Then also filter them out if they're not a mov lr.
- MachineInstr *PredLRDef = SearchForDef(Start, MBB->rend(), ARM::LR);
- if (PredLRDef && !IsMoveLR(PredLRDef, CountReg))
- PredLRDef = nullptr;
-
- MachineInstr *SuccLRDef = SearchForDef(Start, MBB->end(), ARM::LR);
- if (SuccLRDef && !IsMoveLR(SuccLRDef, CountReg))
- SuccLRDef = nullptr;
-
- // We've either found one, two or none mov lr instructions... Now figure out
- // if they are performing the equilvant mov that the Start instruction will.
- // Do this by scanning forward and backward to see if there's a def of the
- // register holding the count value. If we find a suitable def, return it as
- // the insert point. Later, if InsertPt != Start, then we can remove the
- // redundant instruction.
- if (SuccLRDef) {
- MachineBasicBlock::iterator End(SuccLRDef);
- if (!SearchForDef(Start, End, CountReg)) {
- return SuccLRDef;
- } else
- SuccLRDef = nullptr;
+ // TODO: Maybe there are cases where the target doesn't have to be the header,
+ // but for now be safe and revert.
+ if (End->getOperand(1).getMBB() != ML->getHeader()) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: LoopEnd is not targeting the header.\n");
+ Revert = true;
+ return;
}
- if (PredLRDef) {
- MachineBasicBlock::reverse_iterator End(PredLRDef);
- if (!SearchForDef(Start, End, CountReg)) {
- return PredLRDef;
- } else
- PredLRDef = nullptr;
+
+ // The WLS and LE instructions have 12 bits for the label offset. WLS
+ // requires a positive offset, while LE uses a negative one.
+ if (BBUtils->getOffsetOf(End) < BBUtils->getOffsetOf(ML->getHeader()) ||
+ !BBUtils->isBBInRange(End, ML->getHeader(), 4094)) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: LE offset is out-of-range\n");
+ Revert = true;
+ return;
}
- // We can define LR because LR already contains the same value.
- if (Start->getOperand(0).getReg() == ARM::LR)
- return Start;
+ if (Start->getOpcode() == ARM::t2WhileLoopStart &&
+ (BBUtils->getOffsetOf(Start) >
+ BBUtils->getOffsetOf(Start->getOperand(1).getMBB()) ||
+ !BBUtils->isBBInRange(Start, Start->getOperand(1).getMBB(), 4094))) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: WLS offset is out-of-range!\n");
+ Revert = true;
+ return;
+ }
- // We've found no suitable LR def and Start doesn't use LR directly. Can we
- // just define LR anyway?
- const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
- LivePhysRegs LiveRegs(*TRI);
- LiveRegs.addLiveOuts(*MBB);
+ InsertPt = Revert ? nullptr : IsSafeToDefineLR(RDA);
+ if (!InsertPt) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Unable to find safe insertion point.\n");
+ Revert = true;
+ return;
+ } else
+ LLVM_DEBUG(dbgs() << "ARM Loops: Start insertion point: " << *InsertPt);
- // Not if we've haven't found a suitable mov and LR is live out.
- if (LiveRegs.contains(ARM::LR))
- return nullptr;
+ if (!IsTailPredicationLegal()) {
+ LLVM_DEBUG(if (!VCTP)
+ dbgs() << "ARM Loops: Didn't find a VCTP instruction.\n";
+ dbgs() << "ARM Loops: Tail-predication is not valid.\n");
+ return;
+ }
- // If LR is not live out, we can insert the instruction if nothing else
- // uses LR after it.
- if (!SearchForUse(Start, MBB->end(), ARM::LR))
- return Start;
+ assert(ML->getBlocks().size() == 1 &&
+ "Shouldn't be processing a loop with more than one block");
+ CannotTailPredicate = !ValidateTailPredicate(InsertPt, RDA, MLI);
+ LLVM_DEBUG(if (CannotTailPredicate)
+ dbgs() << "ARM Loops: Couldn't validate tail predicate.\n");
+}
- LLVM_DEBUG(dbgs() << "ARM Loops: Failed to find suitable insertion point for"
- << " LR\n");
- return nullptr;
+bool LowOverheadLoop::ValidateMVEInst(MachineInstr* MI) {
+ if (CannotTailPredicate)
+ return false;
+
+ // Only support a single vctp.
+ if (isVCTP(MI) && VCTP)
+ return false;
+
+ // Start a new vpt block when we discover a vpt.
+ if (MI->getOpcode() == ARM::MVE_VPST) {
+ VPTBlocks.emplace_back(MI, CurrentPredicate);
+ CurrentBlock = &VPTBlocks.back();
+ return true;
+ } else if (isVCTP(MI))
+ VCTP = MI;
+ else if (MI->getOpcode() == ARM::MVE_VPSEL ||
+ MI->getOpcode() == ARM::MVE_VPNOT)
+ return false;
+
+ // TODO: Allow VPSEL and VPNOT; we currently cannot because:
+ // 1) It will use the VPR as a predicate operand, but doesn't have to be
+ // inside a VPT block, which means we can assert while building up
+ // the VPT block because we don't find another VPST to begin a new
+ // one.
+ // 2) VPSEL still requires a VPR operand even after tail predicating,
+ // which means we can't remove it unless there is another
+ // instruction, such as vcmp, that can provide the VPR def.
+
+ bool IsUse = false;
+ bool IsDef = false;
+ const MCInstrDesc &MCID = MI->getDesc();
+ for (int i = MI->getNumOperands() - 1; i >= 0; --i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isReg() || MO.getReg() != ARM::VPR)
+ continue;
+
+ if (MO.isDef()) {
+ CurrentPredicate.insert(MI);
+ IsDef = true;
+ } else if (ARM::isVpred(MCID.OpInfo[i].OperandType)) {
+ CurrentBlock->addInst(MI, CurrentPredicate);
+ IsUse = true;
+ } else {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Found instruction using vpr: " << *MI);
+ return false;
+ }
+ }
+
+ // If we find a vpr def that is not already predicated on the vctp, we've
+ // got disjoint predicates that may not be equivalent when we do the
+ // conversion.
+ if (IsDef && !IsUse && VCTP && !isVCTP(MI)) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Found disjoint vpr def: " << *MI);
+ return false;
+ }
+
+ uint64_t Flags = MCID.TSFlags;
+ if ((Flags & ARMII::DomainMask) != ARMII::DomainMVE)
+ return true;
+
+ // If we find an instruction that has been marked as not valid for tail
+ // predication, only allow the instruction if it's contained within a valid
+ // VPT block.
+ if ((Flags & ARMII::ValidForTailPredication) == 0 && !IsUse) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Can't tail predicate: " << *MI);
+ return false;
+ }
+
+ return true;
+}
+
+bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) {
+ const ARMSubtarget &ST = static_cast<const ARMSubtarget&>(mf.getSubtarget());
+ if (!ST.hasLOB())
+ return false;
+
+ MF = &mf;
+ LLVM_DEBUG(dbgs() << "ARM Loops on " << MF->getName() << " ------------- \n");
+
+ MLI = &getAnalysis<MachineLoopInfo>();
+ RDA = &getAnalysis<ReachingDefAnalysis>();
+ MF->getProperties().set(MachineFunctionProperties::Property::TracksLiveness);
+ MRI = &MF->getRegInfo();
+ TII = static_cast<const ARMBaseInstrInfo*>(ST.getInstrInfo());
+ TRI = ST.getRegisterInfo();
+ BBUtils = std::unique_ptr<ARMBasicBlockUtils>(new ARMBasicBlockUtils(*MF));
+ BBUtils->computeAllBlockSizes();
+ BBUtils->adjustBBOffsetsAfter(&MF->front());
+
+ bool Changed = false;
+ for (auto ML : *MLI) {
+ if (!ML->getParentLoop())
+ Changed |= ProcessLoop(ML);
+ }
+ Changed |= RevertNonLoops();
+ return Changed;
}
bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) {
@@ -218,14 +586,21 @@ bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) {
for (auto I = ML->begin(), E = ML->end(); I != E; ++I)
Changed |= ProcessLoop(*I);
- LLVM_DEBUG(dbgs() << "ARM Loops: Processing " << *ML);
+ LLVM_DEBUG(dbgs() << "ARM Loops: Processing loop containing:\n";
+ if (auto *Preheader = ML->getLoopPreheader())
+ dbgs() << " - " << Preheader->getName() << "\n";
+ else if (auto *Preheader = MLI->findLoopPreheader(ML))
+ dbgs() << " - " << Preheader->getName() << "\n";
+ for (auto *MBB : ML->getBlocks())
+ dbgs() << " - " << MBB->getName() << "\n";
+ );
// Search the given block for a loop start instruction. If one isn't found,
// and there's only one predecessor block, search that one too.
std::function<MachineInstr*(MachineBasicBlock*)> SearchForStart =
[&SearchForStart](MachineBasicBlock *MBB) -> MachineInstr* {
for (auto &MI : *MBB) {
- if (IsLoopStart(MI))
+ if (isLoopStart(MI))
return &MI;
}
if (MBB->pred_size() == 1)
@@ -233,53 +608,43 @@ bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) {
return nullptr;
};
- MachineInstr *Start = nullptr;
- MachineInstr *Dec = nullptr;
- MachineInstr *End = nullptr;
- bool Revert = false;
-
- // Search the preheader for the start intrinsic, or look through the
- // predecessors of the header to find exactly one set.iterations intrinsic.
+ LowOverheadLoop LoLoop(ML);
+ // Search the preheader for the start intrinsic.
// FIXME: I don't see why we shouldn't be supporting multiple predecessors
// with potentially multiple set.loop.iterations, so we need to enable this.
- if (auto *Preheader = ML->getLoopPreheader()) {
- Start = SearchForStart(Preheader);
- } else {
- LLVM_DEBUG(dbgs() << "ARM Loops: Failed to find loop preheader!\n"
- << " - Performing manual predecessor search.\n");
- MachineBasicBlock *Pred = nullptr;
- for (auto *MBB : ML->getHeader()->predecessors()) {
- if (!ML->contains(MBB)) {
- if (Pred) {
- LLVM_DEBUG(dbgs() << " - Found multiple out-of-loop preds.\n");
- Start = nullptr;
- break;
- }
- Pred = MBB;
- Start = SearchForStart(MBB);
- }
- }
- }
+ if (auto *Preheader = ML->getLoopPreheader())
+ LoLoop.Start = SearchForStart(Preheader);
+ else if (auto *Preheader = MLI->findLoopPreheader(ML, true))
+ LoLoop.Start = SearchForStart(Preheader);
+ else
+ return false;
// Find the low-overhead loop components and decide whether or not to fall
- // back to a normal loop.
+  // back to a normal loop. Also look for a vctp instruction and decide
+ // whether we can convert that predicate using tail predication.
for (auto *MBB : reverse(ML->getBlocks())) {
for (auto &MI : *MBB) {
if (MI.getOpcode() == ARM::t2LoopDec)
- Dec = &MI;
+ LoLoop.Dec = &MI;
else if (MI.getOpcode() == ARM::t2LoopEnd)
- End = &MI;
- else if (IsLoopStart(MI))
- Start = &MI;
+ LoLoop.End = &MI;
+ else if (isLoopStart(MI))
+ LoLoop.Start = &MI;
else if (MI.getDesc().isCall()) {
// TODO: Though the call will require LE to execute again, does this
// mean we should revert? Always executing LE hopefully should be
// faster than performing a sub,cmp,br or even subs,br.
- Revert = true;
+ LoLoop.Revert = true;
LLVM_DEBUG(dbgs() << "ARM Loops: Found call.\n");
+ } else {
+ // Record VPR defs and build up their corresponding vpt blocks.
+ // Check we know how to tail predicate any mve instructions.
+ LoLoop.AnalyseMVEInst(&MI);
}
- if (!Dec || End)
+      // We need to ensure that LR is not used or defined in between LoopDec
+      // and LoopEnd.
+ if (!LoLoop.Dec || LoLoop.End || LoLoop.Revert)
continue;
// If we find that LR has been written or read between LoopDec and
@@ -294,61 +659,21 @@ bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) {
if (MI.getOpcode() != ARM::t2LoopDec && MO.isReg() &&
MO.getReg() == ARM::LR) {
LLVM_DEBUG(dbgs() << "ARM Loops: Found LR Use/Def: " << MI);
- Revert = true;
+ LoLoop.Revert = true;
break;
}
}
}
-
- if (Dec && End && Revert)
- break;
}
- LLVM_DEBUG(if (Start) dbgs() << "ARM Loops: Found Loop Start: " << *Start;
- if (Dec) dbgs() << "ARM Loops: Found Loop Dec: " << *Dec;
- if (End) dbgs() << "ARM Loops: Found Loop End: " << *End;);
-
- if (!Start && !Dec && !End) {
- LLVM_DEBUG(dbgs() << "ARM Loops: Not a low-overhead loop.\n");
- return Changed;
- } else if (!(Start && Dec && End)) {
- LLVM_DEBUG(dbgs() << "ARM Loops: Failed to find all loop components.\n");
+ LLVM_DEBUG(LoLoop.dump());
+ if (!LoLoop.FoundAllComponents()) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Didn't find loop start, update, end\n");
return false;
}
- if (!End->getOperand(1).isMBB())
- report_fatal_error("Expected LoopEnd to target basic block");
-
- // TODO Maybe there's cases where the target doesn't have to be the header,
- // but for now be safe and revert.
- if (End->getOperand(1).getMBB() != ML->getHeader()) {
- LLVM_DEBUG(dbgs() << "ARM Loops: LoopEnd is not targetting header.\n");
- Revert = true;
- }
-
- // The WLS and LE instructions have 12-bits for the label offset. WLS
- // requires a positive offset, while LE uses negative.
- if (BBUtils->getOffsetOf(End) < BBUtils->getOffsetOf(ML->getHeader()) ||
- !BBUtils->isBBInRange(End, ML->getHeader(), 4094)) {
- LLVM_DEBUG(dbgs() << "ARM Loops: LE offset is out-of-range\n");
- Revert = true;
- }
- if (Start->getOpcode() == ARM::t2WhileLoopStart &&
- (BBUtils->getOffsetOf(Start) >
- BBUtils->getOffsetOf(Start->getOperand(1).getMBB()) ||
- !BBUtils->isBBInRange(Start, Start->getOperand(1).getMBB(), 4094))) {
- LLVM_DEBUG(dbgs() << "ARM Loops: WLS offset is out-of-range!\n");
- Revert = true;
- }
-
- MachineInstr *InsertPt = Revert ? nullptr : IsSafeToDefineLR(Start);
- if (!InsertPt) {
- LLVM_DEBUG(dbgs() << "ARM Loops: Unable to find safe insertion point.\n");
- Revert = true;
- } else
- LLVM_DEBUG(dbgs() << "ARM Loops: Start insertion point: " << *InsertPt);
-
- Expand(ML, Start, InsertPt, Dec, End, Revert);
+ LoLoop.CheckLegality(BBUtils.get(), RDA, MLI);
+ Expand(LoLoop);
return true;
}
@@ -365,7 +690,7 @@ void ARMLowOverheadLoops::RevertWhile(MachineInstr *MI) const {
MIB.addImm(0);
MIB.addImm(ARMCC::AL);
MIB.addReg(ARM::NoRegister);
-
+
MachineBasicBlock *DestBB = MI->getOperand(1).getMBB();
unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, 254) ?
ARM::tBcc : ARM::t2Bcc;
@@ -378,19 +703,15 @@ void ARMLowOverheadLoops::RevertWhile(MachineInstr *MI) const {
}
bool ARMLowOverheadLoops::RevertLoopDec(MachineInstr *MI,
- bool AllowFlags) const {
+ bool SetFlags) const {
LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to sub: " << *MI);
MachineBasicBlock *MBB = MI->getParent();
- // If nothing uses or defines CPSR between LoopDec and LoopEnd, use a t2SUBS.
- bool SetFlags = false;
- if (AllowFlags) {
- if (auto *Def = SearchForDef(MI, MBB->end(), ARM::CPSR)) {
- if (!SearchForUse(MI, MBB->end(), ARM::CPSR) &&
- Def->getOpcode() == ARM::t2LoopEnd)
- SetFlags = true;
- }
- }
+ // If nothing defines CPSR between LoopDec and LoopEnd, use a t2SUBS.
+ if (SetFlags &&
+ (RDA->isRegUsedAfter(MI, ARM::CPSR) ||
+ !RDA->hasSameReachingDef(MI, &MBB->back(), ARM::CPSR)))
+ SetFlags = false;
MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
TII->get(ARM::t2SUBri));
@@ -438,44 +759,223 @@ void ARMLowOverheadLoops::RevertLoopEnd(MachineInstr *MI, bool SkipCmp) const {
MI->eraseFromParent();
}
-void ARMLowOverheadLoops::Expand(MachineLoop *ML, MachineInstr *Start,
- MachineInstr *InsertPt,
- MachineInstr *Dec, MachineInstr *End,
- bool Revert) {
+MachineInstr* ARMLowOverheadLoops::ExpandLoopStart(LowOverheadLoop &LoLoop) {
+ MachineInstr *InsertPt = LoLoop.InsertPt;
+ MachineInstr *Start = LoLoop.Start;
+ MachineBasicBlock *MBB = InsertPt->getParent();
+ bool IsDo = Start->getOpcode() == ARM::t2DoLoopStart;
+ unsigned Opc = LoLoop.getStartOpcode();
+ MachineOperand &Count = LoLoop.getCount();
- auto ExpandLoopStart = [this](MachineLoop *ML, MachineInstr *Start,
- MachineInstr *InsertPt) {
- MachineBasicBlock *MBB = InsertPt->getParent();
- unsigned Opc = Start->getOpcode() == ARM::t2DoLoopStart ?
- ARM::t2DLS : ARM::t2WLS;
- MachineInstrBuilder MIB =
- BuildMI(*MBB, InsertPt, InsertPt->getDebugLoc(), TII->get(Opc));
+ MachineInstrBuilder MIB =
+ BuildMI(*MBB, InsertPt, InsertPt->getDebugLoc(), TII->get(Opc));
- MIB.addDef(ARM::LR);
- MIB.add(Start->getOperand(0));
- if (Opc == ARM::t2WLS)
- MIB.add(Start->getOperand(1));
-
- if (InsertPt != Start)
- InsertPt->eraseFromParent();
- Start->eraseFromParent();
- LLVM_DEBUG(dbgs() << "ARM Loops: Inserted start: " << *MIB);
- return &*MIB;
+ MIB.addDef(ARM::LR);
+ MIB.add(Count);
+ if (!IsDo)
+ MIB.add(Start->getOperand(1));
+
+ // When using tail-predication, try to delete the dead code that was used to
+ // calculate the number of loop iterations.
+ if (LoLoop.IsTailPredicationLegal()) {
+ SmallVector<MachineInstr*, 4> Killed;
+ SmallVector<MachineInstr*, 4> Dead;
+ if (auto *Def = RDA->getReachingMIDef(Start,
+ Start->getOperand(0).getReg())) {
+ Killed.push_back(Def);
+
+ while (!Killed.empty()) {
+ MachineInstr *Def = Killed.back();
+ Killed.pop_back();
+ Dead.push_back(Def);
+ for (auto &MO : Def->operands()) {
+ if (!MO.isReg() || !MO.isKill())
+ continue;
+
+ MachineInstr *Kill = RDA->getReachingMIDef(Def, MO.getReg());
+ if (Kill && RDA->getNumUses(Kill, MO.getReg()) == 1)
+ Killed.push_back(Kill);
+ }
+ }
+ for (auto *MI : Dead)
+ MI->eraseFromParent();
+ }
+ }
+
+ // If we're inserting at a mov lr, then remove it as it's redundant.
+ if (InsertPt != Start)
+ InsertPt->eraseFromParent();
+ Start->eraseFromParent();
+ LLVM_DEBUG(dbgs() << "ARM Loops: Inserted start: " << *MIB);
+ return &*MIB;
+}
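
The tail-predication clean-up above is a back-propagating worklist: start from the reaching definition of the start instruction's count operand, then keep pulling in any producer whose only use fed an instruction already marked dead. A minimal self-contained sketch of that shape, with a toy name-keyed def map and use counts standing in for MachineInstr and ReachingDefAnalysis (all identifiers invented for illustration):

    #include <iostream>
    #include <string>
    #include <unordered_map>
    #include <vector>

    // Toy "instruction": the value it defines plus the values it reads.
    struct Inst {
      std::string Def;
      std::vector<std::string> Uses;
    };

    // Erase Root and, transitively, any producer whose last remaining use fed
    // an already-dead instruction. Mirrors the Killed/Dead worklist above.
    static void eraseDeadCountCalc(const std::string &Root,
                                   std::unordered_map<std::string, Inst> &Defs,
                                   std::unordered_map<std::string, int> &UseCount) {
      std::vector<std::string> Killed{Root};
      std::vector<std::string> Dead;
      while (!Killed.empty()) {
        std::string Name = Killed.back();
        Killed.pop_back();
        Dead.push_back(Name);
        for (const std::string &Op : Defs[Name].Uses)
          if (Defs.count(Op) && --UseCount[Op] == 0)
            Killed.push_back(Op);
      }
      for (const std::string &Name : Dead) {
        std::cout << "erasing " << Name << "\n";
        Defs.erase(Name);
      }
    }

    int main() {
      // %n = add %a, %b ; %count = lshr %n, 2 ; loop start consumes %count.
      std::unordered_map<std::string, Inst> Defs = {
          {"%n", {"%n", {"%a", "%b"}}},
          {"%count", {"%count", {"%n"}}}};
      std::unordered_map<std::string, int> UseCount = {{"%n", 1}, {"%count", 1}};
      eraseDeadCountCalc("%count", Defs, UseCount);
    }
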
+
+// Goal is to optimise and clean-up these loops:
+//
+// vector.body:
+// renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg
+// renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3(tied-def 0), 4
+// ..
+// $lr = MVE_DLSTP_32 renamable $r3
+//
+// The SUB is the old update of the loop iteration count expression, which
+// is no longer needed. It is removed when the element count (r3 in this
+// example) is defined by an instruction in the loop and that definition has
+// no other uses.
+//
+void ARMLowOverheadLoops::RemoveLoopUpdate(LowOverheadLoop &LoLoop) {
+ Register ElemCount = LoLoop.VCTP->getOperand(1).getReg();
+ MachineInstr *LastInstrInBlock = &LoLoop.VCTP->getParent()->back();
+
+ LLVM_DEBUG(dbgs() << "ARM Loops: Trying to remove loop update stmt\n");
+
+ if (LoLoop.ML->getNumBlocks() != 1) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Single block loop expected\n");
+ return;
+ }
+
+ LLVM_DEBUG(dbgs() << "ARM Loops: Analyzing elemcount in operand: ";
+ LoLoop.VCTP->getOperand(1).dump());
+
+ // Find the definition we are interested in removing, if there is one.
+ MachineInstr *Def = RDA->getReachingMIDef(LastInstrInBlock, ElemCount);
+ if (!Def) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Can't find a def, nothing to do.\n");
+ return;
+ }
+
+ // Bail if we define CPSR and it is not dead
+ if (!Def->registerDefIsDead(ARM::CPSR, TRI)) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: CPSR is not dead\n");
+ return;
+ }
+
+ // Bail if elemcount is used in exit blocks, i.e. if it is live-in.
+ if (isRegLiveInExitBlocks(LoLoop.ML, ElemCount)) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Elemcount is live-out, can't remove stmt\n");
+ return;
+ }
+
+ // Bail if there are uses after this Def in the block.
+ SmallVector<MachineInstr*, 4> Uses;
+ RDA->getReachingLocalUses(Def, ElemCount, Uses);
+ if (Uses.size()) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Local uses in block, can't remove stmt\n");
+ return;
+ }
+
+ Uses.clear();
+ RDA->getAllInstWithUseBefore(Def, ElemCount, Uses);
+
+ // Remove Def if there are no uses, or if the only use is the VCTP
+ // instruction.
+ if (!Uses.size() || (Uses.size() == 1 && Uses[0] == LoLoop.VCTP)) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Removing loop update instruction: ";
+ Def->dump());
+ Def->eraseFromParent();
+ return;
+ }
+
+ LLVM_DEBUG(dbgs() << "ARM Loops: Can't remove loop update, it's used by:\n";
+ for (auto U : Uses) U->dump());
+}
+
+void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) {
+ auto RemovePredicate = [](MachineInstr *MI) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Removing predicate from: " << *MI);
+ if (int PIdx = llvm::findFirstVPTPredOperandIdx(*MI)) {
+ assert(MI->getOperand(PIdx).getImm() == ARMVCC::Then &&
+ "Expected Then predicate!");
+ MI->getOperand(PIdx).setImm(ARMVCC::None);
+ MI->getOperand(PIdx+1).setReg(0);
+ } else
+ llvm_unreachable("trying to unpredicate a non-predicated instruction");
};
+ // There are a few scenarios which we have to fix up:
+  // 1) A VPT block which is only predicated by the vctp and has no internal vpr
+ // defs.
+ // 2) A VPT block which is only predicated by the vctp but has an internal
+ // vpr def.
+ // 3) A VPT block which is predicated upon the vctp as well as another vpr
+ // def.
+ // 4) A VPT block which is not predicated upon a vctp, but contains it and
+  //    all instructions within the block are predicated upon it.
+
+ for (auto &Block : LoLoop.getVPTBlocks()) {
+ SmallVectorImpl<PredicatedMI> &Insts = Block.getInsts();
+ if (Block.HasNonUniformPredicate()) {
+ PredicatedMI *Divergent = Block.getDivergent();
+ if (isVCTP(Divergent->MI)) {
+ // The vctp will be removed, so the size of the vpt block needs to be
+ // modified.
+ uint64_t Size = getARMVPTBlockMask(Block.size() - 1);
+ Block.getVPST()->getOperand(0).setImm(Size);
+ LLVM_DEBUG(dbgs() << "ARM Loops: Modified VPT block mask.\n");
+ } else if (Block.IsOnlyPredicatedOn(LoLoop.VCTP)) {
+        // The VPT block has a non-uniform predicate but its entry is guarded
+ // only by a vctp, which means we:
+ // - Need to remove the original vpst.
+ // - Then need to unpredicate any following instructions, until
+ // we come across the divergent vpr def.
+        //   - Insert a new vpst to predicate the instruction(s) that follow
+ // the divergent vpr def.
+ // TODO: We could be producing more VPT blocks than necessary and could
+        //       fold the newly created one into a preceding one.
+ for (auto I = ++MachineBasicBlock::iterator(Block.getVPST()),
+ E = ++MachineBasicBlock::iterator(Divergent->MI); I != E; ++I)
+ RemovePredicate(&*I);
+
+ unsigned Size = 0;
+ auto E = MachineBasicBlock::reverse_iterator(Divergent->MI);
+ auto I = MachineBasicBlock::reverse_iterator(Insts.back().MI);
+ MachineInstr *InsertAt = nullptr;
+ while (I != E) {
+ InsertAt = &*I;
+ ++Size;
+ ++I;
+ }
+ MachineInstrBuilder MIB = BuildMI(*InsertAt->getParent(), InsertAt,
+ InsertAt->getDebugLoc(),
+ TII->get(ARM::MVE_VPST));
+ MIB.addImm(getARMVPTBlockMask(Size));
+ LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *Block.getVPST());
+ LLVM_DEBUG(dbgs() << "ARM Loops: Created VPST: " << *MIB);
+ Block.getVPST()->eraseFromParent();
+ }
+ } else if (Block.IsOnlyPredicatedOn(LoLoop.VCTP)) {
+ // A vpt block which is only predicated upon vctp and has no internal vpr
+ // defs:
+ // - Remove vpst.
+ // - Unpredicate the remaining instructions.
+ LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *Block.getVPST());
+ Block.getVPST()->eraseFromParent();
+ for (auto &PredMI : Insts)
+ RemovePredicate(PredMI.MI);
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "ARM Loops: Removing VCTP: " << *LoLoop.VCTP);
+ LoLoop.VCTP->eraseFromParent();
+}
+
+void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) {
+
// Combine the LoopDec and LoopEnd instructions into LE(TP).
- auto ExpandLoopEnd = [this](MachineLoop *ML, MachineInstr *Dec,
- MachineInstr *End) {
+ auto ExpandLoopEnd = [this](LowOverheadLoop &LoLoop) {
+ MachineInstr *End = LoLoop.End;
MachineBasicBlock *MBB = End->getParent();
+ unsigned Opc = LoLoop.IsTailPredicationLegal() ?
+ ARM::MVE_LETP : ARM::t2LEUpdate;
MachineInstrBuilder MIB = BuildMI(*MBB, End, End->getDebugLoc(),
- TII->get(ARM::t2LEUpdate));
+ TII->get(Opc));
MIB.addDef(ARM::LR);
MIB.add(End->getOperand(0));
MIB.add(End->getOperand(1));
LLVM_DEBUG(dbgs() << "ARM Loops: Inserted LE: " << *MIB);
- End->eraseFromParent();
- Dec->eraseFromParent();
+ LoLoop.End->eraseFromParent();
+ LoLoop.Dec->eraseFromParent();
return &*MIB;
};
@@ -496,18 +996,22 @@ void ARMLowOverheadLoops::Expand(MachineLoop *ML, MachineInstr *Start,
}
};
- if (Revert) {
- if (Start->getOpcode() == ARM::t2WhileLoopStart)
- RevertWhile(Start);
+ if (LoLoop.Revert) {
+ if (LoLoop.Start->getOpcode() == ARM::t2WhileLoopStart)
+ RevertWhile(LoLoop.Start);
else
- Start->eraseFromParent();
- bool FlagsAlreadySet = RevertLoopDec(Dec, true);
- RevertLoopEnd(End, FlagsAlreadySet);
+ LoLoop.Start->eraseFromParent();
+ bool FlagsAlreadySet = RevertLoopDec(LoLoop.Dec, true);
+ RevertLoopEnd(LoLoop.End, FlagsAlreadySet);
} else {
- Start = ExpandLoopStart(ML, Start, InsertPt);
- RemoveDeadBranch(Start);
- End = ExpandLoopEnd(ML, Dec, End);
- RemoveDeadBranch(End);
+ LoLoop.Start = ExpandLoopStart(LoLoop);
+ RemoveDeadBranch(LoLoop.Start);
+ LoLoop.End = ExpandLoopEnd(LoLoop);
+ RemoveDeadBranch(LoLoop.End);
+ if (LoLoop.IsTailPredicationLegal()) {
+ RemoveLoopUpdate(LoLoop);
+ ConvertVPTBlocks(LoLoop);
+ }
}
}
@@ -521,7 +1025,7 @@ bool ARMLowOverheadLoops::RevertNonLoops() {
SmallVector<MachineInstr*, 4> Ends;
for (auto &I : MBB) {
- if (IsLoopStart(I))
+ if (isLoopStart(I))
Starts.push_back(&I);
else if (I.getOpcode() == ARM::t2LoopDec)
Decs.push_back(&I);
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMParallelDSP.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMParallelDSP.cpp
index ae5657a0a2c1..e2c9335db419 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMParallelDSP.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMParallelDSP.cpp
@@ -14,23 +14,24 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/ADT/Statistic.h"
+#include "ARM.h"
+#include "ARMSubtarget.h"
#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/OrderedBasicBlock.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicsARM.h"
#include "llvm/IR/NoFolder.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/Pass.h"
#include "llvm/PassRegistry.h"
#include "llvm/PassSupport.h"
#include "llvm/Support/Debug.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/CodeGen/TargetPassConfig.h"
-#include "ARM.h"
-#include "ARMSubtarget.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
using namespace llvm;
using namespace PatternMatch;
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMPredicates.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMPredicates.td
index b008d3e2e296..dea1d767beb4 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMPredicates.td
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMPredicates.td
@@ -182,11 +182,9 @@ def UseMulOps : Predicate<"Subtarget->useMulOps()">;
// But only select them if more precision in FP computation is allowed, and when
// they are not slower than a mul + add sequence.
// Do not use them for Darwin platforms.
-def UseFusedMAC : Predicate<"(TM.Options.AllowFPOpFusion =="
- " FPOpFusion::Fast && "
- " Subtarget->hasVFP4Base()) && "
- "!Subtarget->isTargetDarwin() &&"
- "Subtarget->useFPVMLx()">;
+def UseFusedMAC : Predicate<"TM.Options.AllowFPOpFusion =="
+ " FPOpFusion::Fast && "
+ "Subtarget->useFPVFMx()">;
def HasFastVGETLNi32 : Predicate<"!Subtarget->hasSlowVGETLNi32()">;
def HasSlowVGETLNi32 : Predicate<"Subtarget->hasSlowVGETLNi32()">;
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp
index b100150175fc..43c8cd5a89be 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp
@@ -172,8 +172,9 @@ ARMRegisterBankInfo::ARMRegisterBankInfo(const TargetRegisterInfo &TRI)
#endif
}
-const RegisterBank &ARMRegisterBankInfo::getRegBankFromRegClass(
- const TargetRegisterClass &RC) const {
+const RegisterBank &
+ARMRegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
+ LLT) const {
using namespace ARM;
switch (RC.getID()) {
@@ -249,7 +250,7 @@ ARMRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case G_SEXT:
case G_ZEXT:
case G_ANYEXT:
- case G_GEP:
+ case G_PTR_ADD:
case G_INTTOPTR:
case G_PTRTOINT:
case G_CTLZ:
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMRegisterBankInfo.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARMRegisterBankInfo.h
index 1961f7af49bb..b8aff65a967e 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMRegisterBankInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMRegisterBankInfo.h
@@ -32,8 +32,8 @@ class ARMRegisterBankInfo final : public ARMGenRegisterBankInfo {
public:
ARMRegisterBankInfo(const TargetRegisterInfo &TRI);
- const RegisterBank &
- getRegBankFromRegClass(const TargetRegisterClass &RC) const override;
+ const RegisterBank &getRegBankFromRegClass(const TargetRegisterClass &RC,
+ LLT) const override;
const InstructionMapping &
getInstrMapping(const MachineInstr &MI) const override;
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMSubtarget.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMSubtarget.cpp
index 09603057b2c8..eb4d39b01cbb 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMSubtarget.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMSubtarget.cpp
@@ -72,6 +72,9 @@ static cl::opt<bool>
ForceFastISel("arm-force-fast-isel",
cl::init(false), cl::Hidden);
+static cl::opt<bool> EnableSubRegLiveness("arm-enable-subreg-liveness",
+ cl::init(false), cl::Hidden);
+
/// initializeSubtargetDependencies - Initializes using a CPU and feature string
/// so that we can use initializer lists for subtarget initialization.
ARMSubtarget &ARMSubtarget::initializeSubtargetDependencies(StringRef CPU,
@@ -379,11 +382,23 @@ bool ARMSubtarget::enableMachineScheduler() const {
return useMachineScheduler();
}
+bool ARMSubtarget::enableSubRegLiveness() const { return EnableSubRegLiveness; }
+
// This overrides the PostRAScheduler bit in the SchedModel for any CPU.
bool ARMSubtarget::enablePostRAScheduler() const {
+ if (enableMachineScheduler())
+ return false;
+ if (disablePostRAScheduler())
+ return false;
+ // Thumb1 cores will generally not benefit from post-ra scheduling
+ return !isThumb1Only();
+}
+
+bool ARMSubtarget::enablePostRAMachineScheduler() const {
+ if (!enableMachineScheduler())
+ return false;
if (disablePostRAScheduler())
return false;
- // Don't reschedule potential IT blocks.
return !isThumb1Only();
}
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMSubtarget.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARMSubtarget.h
index ef460342a69e..6bdd021970ef 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMSubtarget.h
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMSubtarget.h
@@ -203,6 +203,10 @@ protected:
/// whether the FP VML[AS] instructions are slow (if so, don't use them).
bool SlowFPVMLx = false;
+ /// SlowFPVFMx - If the VFP4 / NEON instructions are available, indicates
+ /// whether the FP VFM[AS] instructions are slow (if so, don't use them).
+ bool SlowFPVFMx = false;
+
/// HasVMLxForwarding - If true, NEON has special multiplier accumulator
/// forwarding to allow mul + mla being issued back to back.
bool HasVMLxForwarding = false;
@@ -223,9 +227,6 @@ protected:
/// register allocation.
bool DisablePostRAScheduler = false;
- /// UseAA - True if using AA during codegen (DAGCombine, MISched, etc)
- bool UseAA = false;
-
/// HasThumb2 - True if Thumb2 instructions are supported.
bool HasThumb2 = false;
@@ -635,6 +636,11 @@ public:
bool useMulOps() const { return UseMulOps; }
bool useFPVMLx() const { return !SlowFPVMLx; }
+ bool useFPVFMx() const {
+ return !isTargetDarwin() && hasVFP4Base() && !SlowFPVFMx;
+ }
+ bool useFPVFMx16() const { return useFPVFMx() && hasFullFP16(); }
+ bool useFPVFMx64() const { return useFPVFMx() && hasFP64(); }
bool hasVMLxForwarding() const { return HasVMLxForwarding; }
bool isFPBrccSlow() const { return SlowFPBrcc; }
bool hasFP64() const { return HasFP64; }
@@ -806,9 +812,15 @@ public:
/// True for some subtargets at > -O0.
bool enablePostRAScheduler() const override;
+ /// True for some subtargets at > -O0.
+ bool enablePostRAMachineScheduler() const override;
+
+ /// Check whether this subtarget wants to use subregister liveness.
+ bool enableSubRegLiveness() const override;
+
/// Enable use of alias analysis during code generation (during MI
/// scheduling, DAGCombine, etc.).
- bool useAA() const override { return UseAA; }
+ bool useAA() const override { return true; }
// enableAtomicExpand- True if we need to expand our atomics.
bool enableAtomicExpand() const override;
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetMachine.cpp
index 5c8007f101d9..84876eda33a6 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetMachine.cpp
@@ -46,6 +46,7 @@
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetOptions.h"
+#include "llvm/Transforms/CFGuard.h"
#include "llvm/Transforms/Scalar.h"
#include <cassert>
#include <memory>
@@ -78,7 +79,7 @@ namespace llvm {
void initializeARMExecutionDomainFixPass(PassRegistry&);
}
-extern "C" void LLVMInitializeARMTarget() {
+extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeARMTarget() {
// Register the target.
RegisterTargetMachine<ARMLETargetMachine> X(getTheARMLETarget());
RegisterTargetMachine<ARMLETargetMachine> A(getTheThumbLETarget());
@@ -90,7 +91,6 @@ extern "C" void LLVMInitializeARMTarget() {
initializeARMLoadStoreOptPass(Registry);
initializeARMPreAllocLoadStoreOptPass(Registry);
initializeARMParallelDSPPass(Registry);
- initializeARMCodeGenPreparePass(Registry);
initializeARMConstantIslandsPass(Registry);
initializeARMExecutionDomainFixPass(Registry);
initializeARMExpandPseudoPass(Registry);
@@ -98,6 +98,7 @@ extern "C" void LLVMInitializeARMTarget() {
initializeMVEVPTBlockPass(Registry);
initializeMVETailPredicationPass(Registry);
initializeARMLowOverheadLoopsPass(Registry);
+ initializeMVEGatherScatterLoweringPass(Registry);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -321,14 +322,7 @@ namespace {
class ARMPassConfig : public TargetPassConfig {
public:
ARMPassConfig(ARMBaseTargetMachine &TM, PassManagerBase &PM)
- : TargetPassConfig(TM, PM) {
- if (TM.getOptLevel() != CodeGenOpt::None) {
- ARMGenSubtargetInfo STI(TM.getTargetTriple(), TM.getTargetCPU(),
- TM.getTargetFeatureString());
- if (STI.hasFeature(ARM::FeatureUseMISched))
- substitutePass(&PostRASchedulerID, &PostMachineSchedulerID);
- }
- }
+ : TargetPassConfig(TM, PM) {}
ARMBaseTargetMachine &getARMTargetMachine() const {
return getTM<ARMBaseTargetMachine>();
@@ -411,6 +405,8 @@ void ARMPassConfig::addIRPasses() {
return ST.hasAnyDataBarrier() && !ST.isThumb1Only();
}));
+ addPass(createMVEGatherScatterLoweringPass());
+
TargetPassConfig::addIRPasses();
// Run the parallel DSP pass.
@@ -420,11 +416,15 @@ void ARMPassConfig::addIRPasses() {
// Match interleaved memory accesses to ldN/stN intrinsics.
if (TM->getOptLevel() != CodeGenOpt::None)
addPass(createInterleavedAccessPass());
+
+ // Add Control Flow Guard checks.
+ if (TM->getTargetTriple().isOSWindows())
+ addPass(createCFGuardCheckPass());
}
void ARMPassConfig::addCodeGenPrepare() {
if (getOptLevel() != CodeGenOpt::None)
- addPass(createARMCodeGenPreparePass());
+ addPass(createTypePromotionPass());
TargetPassConfig::addCodeGenPrepare();
}
@@ -518,6 +518,13 @@ void ARMPassConfig::addPreSched2() {
}
addPass(createMVEVPTBlockPass());
addPass(createThumb2ITBlockPass());
+
+ // Add both scheduling passes to give the subtarget an opportunity to pick
+ // between them.
+ if (getOptLevel() != CodeGenOpt::None) {
+ addPass(&PostMachineSchedulerID);
+ addPass(&PostRASchedulerID);
+ }
}
void ARMPassConfig::addPreEmitPass() {
@@ -534,4 +541,8 @@ void ARMPassConfig::addPreEmitPass() {
addPass(createARMConstantIslandPass());
addPass(createARMLowOverheadLoopsPass());
+
+ // Identify valid longjmp targets for Windows Control Flow Guard.
+ if (TM->getTargetTriple().isOSWindows())
+ addPass(createCFGuardLongjmpPass());
}
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetMachine.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetMachine.h
index cb8650d8139b..ac55d2bdcc2b 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetMachine.h
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetMachine.h
@@ -70,6 +70,8 @@ public:
TargetTriple.isOSWindows() ||
TargetABI == ARMBaseTargetMachine::ARM_ABI_AAPCS16;
}
+
+ bool targetSchedulesPostRAScheduling() const override { return true; };
};
/// ARM/Thumb little endian target machine.
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 86c8684d14dc..7ff05034c1f2 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -22,6 +22,7 @@
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/MC/SubtargetFeature.h"
#include "llvm/Support/Casting.h"
@@ -37,13 +38,17 @@ using namespace llvm;
#define DEBUG_TYPE "armtti"
static cl::opt<bool> EnableMaskedLoadStores(
- "enable-arm-maskedldst", cl::Hidden, cl::init(false),
+ "enable-arm-maskedldst", cl::Hidden, cl::init(true),
cl::desc("Enable the generation of masked loads and stores"));
static cl::opt<bool> DisableLowOverheadLoops(
"disable-arm-loloops", cl::Hidden, cl::init(false),
cl::desc("Disable the generation of low-overhead loops"));
+extern cl::opt<bool> DisableTailPredication;
+
+extern cl::opt<bool> EnableMaskedGatherScatters;
+
bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
const Function *Callee) const {
const TargetMachine &TM = getTLI()->getTargetMachine();
@@ -104,7 +109,7 @@ int ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
return 1;
}
-int ARMTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
+int ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm,
Type *Ty) {
// Division by a constant can be turned into multiplication, but only if we
// know it's constant. So it's not so much that the immediate is cheap (it's
@@ -512,6 +517,27 @@ bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, MaybeAlign Alignment) {
(EltWidth == 8);
}
+bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, MaybeAlign Alignment) {
+ if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps())
+ return false;
+
+ // This method is called in 2 places:
+ // - from the vectorizer with a scalar type, in which case we need to get
+ // this as good as we can with the limited info we have (and rely on the cost
+ // model for the rest).
+ // - from the masked intrinsic lowering pass with the actual vector type.
+ // For MVE, we have a custom lowering pass that will already have custom
+ // legalised any gathers that we can to MVE intrinsics, and want to expand all
+ // the rest. The pass runs before the masked intrinsic lowering pass, so if we
+ // are here, we know we want to expand.
+ if (isa<VectorType>(Ty))
+ return false;
+
+ unsigned EltWidth = Ty->getScalarSizeInBits();
+ return ((EltWidth == 32 && (!Alignment || Alignment >= 4)) ||
+ (EltWidth == 16 && (!Alignment || Alignment >= 2)) || EltWidth == 8);
+}
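
The scalar-type case above reduces to a width/alignment rule: 32-bit elements need at least 4-byte alignment (or no alignment specified), 16-bit elements at least 2 bytes, and 8-bit elements are always fine. A small standalone check mirroring that condition, useful for eyeballing the boundary cases (the helper name is made up; this is not the TTI hook itself):

    #include <cassert>
    #include <cstdint>
    #include <optional>

    // Mirrors the scalar-type case of isLegalMaskedGather above: MVE gathers
    // are only formed for 8/16/32-bit elements with at least natural alignment.
    static bool mveGatherLegalForScalar(unsigned EltWidthBits,
                                        std::optional<uint64_t> AlignBytes) {
      auto AlignAtLeast = [&](uint64_t N) { return !AlignBytes || *AlignBytes >= N; };
      return (EltWidthBits == 32 && AlignAtLeast(4)) ||
             (EltWidthBits == 16 && AlignAtLeast(2)) ||
             EltWidthBits == 8;
    }

    int main() {
      assert(mveGatherLegalForScalar(32, 4));
      assert(mveGatherLegalForScalar(16, std::nullopt)); // unknown alignment is fine
      assert(!mveGatherLegalForScalar(64, 8));           // no 64-bit gathers
      assert(!mveGatherLegalForScalar(32, 2));           // under-aligned i32
    }
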
+
int ARMTTIImpl::getMemcpyCost(const Instruction *I) {
const MemCpyInst *MI = dyn_cast<MemCpyInst>(I);
assert(MI && "MemcpyInst expected");
@@ -640,58 +666,60 @@ int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
return BaseCost * BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
}
-int ARMTTIImpl::getArithmeticInstrCost(
- unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info,
- TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo,
- TTI::OperandValueProperties Opd2PropInfo,
- ArrayRef<const Value *> Args) {
+int ARMTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
+ TTI::OperandValueKind Op1Info,
+ TTI::OperandValueKind Op2Info,
+ TTI::OperandValueProperties Opd1PropInfo,
+ TTI::OperandValueProperties Opd2PropInfo,
+ ArrayRef<const Value *> Args,
+ const Instruction *CxtI) {
int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
- const unsigned FunctionCallDivCost = 20;
- const unsigned ReciprocalDivCost = 10;
- static const CostTblEntry CostTbl[] = {
- // Division.
- // These costs are somewhat random. Choose a cost of 20 to indicate that
- // vectorizing devision (added function call) is going to be very expensive.
- // Double registers types.
- { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
- { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
- { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
- { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
- { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
- { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
- { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
- { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
- { ISD::SDIV, MVT::v4i16, ReciprocalDivCost},
- { ISD::UDIV, MVT::v4i16, ReciprocalDivCost},
- { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
- { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
- { ISD::SDIV, MVT::v8i8, ReciprocalDivCost},
- { ISD::UDIV, MVT::v8i8, ReciprocalDivCost},
- { ISD::SREM, MVT::v8i8, 8 * FunctionCallDivCost},
- { ISD::UREM, MVT::v8i8, 8 * FunctionCallDivCost},
- // Quad register types.
- { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
- { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
- { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
- { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
- { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
- { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
- { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
- { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
- { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
- { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
- { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
- { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
- { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
- { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
- { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
- { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
- // Multiplication.
- };
-
if (ST->hasNEON()) {
+ const unsigned FunctionCallDivCost = 20;
+ const unsigned ReciprocalDivCost = 10;
+ static const CostTblEntry CostTbl[] = {
+ // Division.
+ // These costs are somewhat random. Choose a cost of 20 to indicate that
+    // vectorizing division (added function call) is going to be very expensive.
+ // Double registers types.
+ { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
+ { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
+ { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
+ { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
+ { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
+ { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
+ { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
+ { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
+ { ISD::SDIV, MVT::v4i16, ReciprocalDivCost},
+ { ISD::UDIV, MVT::v4i16, ReciprocalDivCost},
+ { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
+ { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
+ { ISD::SDIV, MVT::v8i8, ReciprocalDivCost},
+ { ISD::UDIV, MVT::v8i8, ReciprocalDivCost},
+ { ISD::SREM, MVT::v8i8, 8 * FunctionCallDivCost},
+ { ISD::UREM, MVT::v8i8, 8 * FunctionCallDivCost},
+ // Quad register types.
+ { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
+ { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
+ { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
+ { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
+ { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
+ { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
+ { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
+ { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
+ { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
+ { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
+ { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
+ { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
+ { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
+ { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
+ { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
+ { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
+ // Multiplication.
+ };
+
if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
return LT.first * Entry->Cost;
@@ -712,6 +740,33 @@ int ARMTTIImpl::getArithmeticInstrCost(
return Cost;
}
+ // If this operation is a shift on arm/thumb2, it might well be folded into
+ // the following instruction, hence having a cost of 0.
+ auto LooksLikeAFreeShift = [&]() {
+ if (ST->isThumb1Only() || Ty->isVectorTy())
+ return false;
+
+ if (!CxtI || !CxtI->hasOneUse() || !CxtI->isShift())
+ return false;
+ if (Op2Info != TargetTransformInfo::OK_UniformConstantValue)
+ return false;
+
+    // Folded into an ADC/ADD/AND/BIC/CMP/EOR/MVN/ORR/ORN/RSB/SBC/SUB
+ switch (cast<Instruction>(CxtI->user_back())->getOpcode()) {
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::And:
+ case Instruction::Xor:
+ case Instruction::Or:
+ case Instruction::ICmp:
+ return true;
+ default:
+ return false;
+ }
+ };
+ if (LooksLikeAFreeShift())
+ return 0;
+
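
The pattern LooksLikeAFreeShift is looking for corresponds to source like the function below: a single-use shift by a constant feeding an ALU operation, which Thumb-2 can usually fold into the flexible second operand (something like add r0, r1, r0, lsl #2), so charging the shift separately would double-count. The generated assembly is indicative only, a sketch rather than a guarantee:

    #include <cstdint>

    // The single-use, constant-amount shift feeding the add is the pattern the
    // cost model treats as free: ARM/Thumb-2 can typically encode it as
    //   add r0, r1, r0, lsl #2
    // rather than a separate lsl followed by an add.
    uint32_t scaledAdd(uint32_t Base, uint32_t Index) {
      return Base + (Index << 2);
    }
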
int BaseCost = ST->hasMVEIntegerOps() && Ty->isVectorTy()
? ST->getMVEVectorCostFactor()
: 1;
@@ -735,11 +790,13 @@ int ARMTTIImpl::getArithmeticInstrCost(
return BaseCost;
}
-int ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
- unsigned AddressSpace, const Instruction *I) {
+int ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
+ MaybeAlign Alignment, unsigned AddressSpace,
+ const Instruction *I) {
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
- if (ST->hasNEON() && Src->isVectorTy() && Alignment != 16 &&
+ if (ST->hasNEON() && Src->isVectorTy() &&
+ (Alignment && *Alignment != Align(16)) &&
Src->getVectorElementType()->isDoubleTy()) {
// Unaligned loads/stores are extremely inefficient.
// We need 4 uops for vst.1/vld.1 vs 1uop for vldr/vstr.
@@ -751,13 +808,10 @@ int ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
return BaseCost * LT.first;
}
-int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
- unsigned Factor,
- ArrayRef<unsigned> Indices,
- unsigned Alignment,
- unsigned AddressSpace,
- bool UseMaskForCond,
- bool UseMaskForGaps) {
+int ARMTTIImpl::getInterleavedMemoryOpCost(
+ unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
+ unsigned Alignment, unsigned AddressSpace, bool UseMaskForCond,
+ bool UseMaskForGaps) {
assert(Factor >= 2 && "Invalid interleave factor");
assert(isa<VectorType>(VecTy) && "Expect a vector type");
@@ -772,9 +826,19 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
// vldN/vstN only support legal vector types of size 64 or 128 in bits.
// Accesses having vector types that are a multiple of 128 bits can be
// matched to more than one vldN/vstN instruction.
+ int BaseCost = ST->hasMVEIntegerOps() ? ST->getMVEVectorCostFactor() : 1;
if (NumElts % Factor == 0 &&
- TLI->isLegalInterleavedAccessType(SubVecTy, DL))
- return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL);
+ TLI->isLegalInterleavedAccessType(Factor, SubVecTy, DL))
+ return Factor * BaseCost * TLI->getNumInterleavedAccesses(SubVecTy, DL);
+
+ // Some smaller than legal interleaved patterns are cheap as we can make
+ // use of the vmovn or vrev patterns to interleave a standard load. This is
+ // true for v4i8, v8i8 and v4i16 at least (but not for v4f16 as it is
+ // promoted differently). The cost of 2 here is then a load and vrev or
+ // vmovn.
+ if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 &&
+ VecTy->isIntOrIntVectorTy() && DL.getTypeSizeInBits(SubVecTy) <= 64)
+ return 2 * BaseCost;
}
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
@@ -998,6 +1062,142 @@ bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
return true;
}
+static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
+ // We don't allow icmp's, and because we only look at single block loops,
+ // we simply count the icmps, i.e. there should only be 1 for the backedge.
+ if (isa<ICmpInst>(&I) && ++ICmpCount > 1)
+ return false;
+
+ if (isa<FCmpInst>(&I))
+ return false;
+
+ // We could allow extending/narrowing FP loads/stores, but codegen is
+ // too inefficient so reject this for now.
+ if (isa<FPExtInst>(&I) || isa<FPTruncInst>(&I))
+ return false;
+
+ // Extends have to be extending-loads
+  if (isa<SExtInst>(&I) || isa<ZExtInst>(&I))
+ if (!I.getOperand(0)->hasOneUse() || !isa<LoadInst>(I.getOperand(0)))
+ return false;
+
+ // Truncs have to be narrowing-stores
+  if (isa<TruncInst>(&I))
+ if (!I.hasOneUse() || !isa<StoreInst>(*I.user_begin()))
+ return false;
+
+ return true;
+}
+
+// To set up a tail-predicated loop, we need to know the total number of
+// elements processed by that loop. Thus, we need to determine the element
+// size and:
+// 1) it should be uniform for all operations in the vector loop, so we
+// e.g. don't want any widening/narrowing operations.
+// 2) it should be smaller than i64s because we don't have vector operations
+// that work on i64s.
+// 3) we don't want elements to be reversed or shuffled, to make sure the
+// tail-predication masks/predicates the right lanes.
+//
+static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
+ const DataLayout &DL,
+ const LoopAccessInfo *LAI) {
+ PredicatedScalarEvolution PSE = LAI->getPSE();
+ int ICmpCount = 0;
+ int Stride = 0;
+
+ LLVM_DEBUG(dbgs() << "tail-predication: checking allowed instructions\n");
+ SmallVector<Instruction *, 16> LoadStores;
+ for (BasicBlock *BB : L->blocks()) {
+ for (Instruction &I : BB->instructionsWithoutDebug()) {
+ if (isa<PHINode>(&I))
+ continue;
+ if (!canTailPredicateInstruction(I, ICmpCount)) {
+ LLVM_DEBUG(dbgs() << "Instruction not allowed: "; I.dump());
+ return false;
+ }
+
+ Type *T = I.getType();
+ if (T->isPointerTy())
+ T = T->getPointerElementType();
+
+ if (T->getScalarSizeInBits() > 32) {
+ LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump());
+ return false;
+ }
+
+ if (isa<StoreInst>(I) || isa<LoadInst>(I)) {
+ Value *Ptr = isa<LoadInst>(I) ? I.getOperand(0) : I.getOperand(1);
+ int64_t NextStride = getPtrStride(PSE, Ptr, L);
+ // TODO: for now only allow consecutive strides of 1. We could support
+ // other strides as long as it is uniform, but let's keep it simple for
+ // now.
+ if (Stride == 0 && NextStride == 1) {
+ Stride = NextStride;
+ continue;
+ }
+ if (Stride != NextStride) {
+ LLVM_DEBUG(dbgs() << "Different strides found, can't "
+                              "tail-predicate.\n");
+ return false;
+ }
+ }
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n");
+ return true;
+}
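
Taken together, the checks above are aimed at simple unit-stride kernels: a single block, one backedge compare, a uniform element size of at most 32 bits, and consecutive loads and stores. A loop of roughly this shape (illustrative C++, not taken from the LLVM tests) is the kind of candidate canTailPredicateLoop is written to accept:

    #include <cstddef>
    #include <cstdint>

    // Single-block loop, unit stride, uniform 32-bit elements: the shape the
    // tail-predication legality checks are written to accept.
    void addArrays(const int32_t *A, const int32_t *B, int32_t *C, size_t N) {
      for (size_t I = 0; I < N; ++I)
        C[I] = A[I] + B[I];
    }

A loop mixing element widths, or one indexing with a non-unit stride, would be rejected by the uniform-type and stride checks respectively.
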
+
+bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
+ ScalarEvolution &SE,
+ AssumptionCache &AC,
+ TargetLibraryInfo *TLI,
+ DominatorTree *DT,
+ const LoopAccessInfo *LAI) {
+ if (DisableTailPredication)
+ return false;
+
+ // Creating a predicated vector loop is the first step for generating a
+ // tail-predicated hardware loop, for which we need the MVE masked
+ // load/stores instructions:
+ if (!ST->hasMVEIntegerOps())
+ return false;
+
+ // For now, restrict this to single block loops.
+ if (L->getNumBlocks() > 1) {
+ LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block "
+ "loop.\n");
+ return false;
+ }
+
+ assert(L->empty() && "preferPredicateOverEpilogue: inner-loop expected");
+
+ HardwareLoopInfo HWLoopInfo(L);
+ if (!HWLoopInfo.canAnalyze(*LI)) {
+ LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
+ "analyzable.\n");
+ return false;
+ }
+
+ // This checks if we have the low-overhead branch architecture
+ // extension, and if we will create a hardware-loop:
+ if (!isHardwareLoopProfitable(L, SE, AC, TLI, HWLoopInfo)) {
+ LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
+ "profitable.\n");
+ return false;
+ }
+
+ if (!HWLoopInfo.isHardwareLoopCandidate(SE, *LI, *DT)) {
+ LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
+ "a candidate.\n");
+ return false;
+ }
+
+ return canTailPredicateLoop(L, LI, SE, DL, LAI);
+}
+
+
void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP) {
// Only currently enable these preferences for M-Class cores.
@@ -1035,6 +1235,11 @@ void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
unsigned Cost = 0;
for (auto *BB : L->getBlocks()) {
for (auto &I : *BB) {
+      // Don't unroll vectorised loops. MVE does not benefit from unrolling as
+      // much as scalar code does.
+ if (I.getType()->isVectorTy())
+ return;
+
if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
ImmutableCallSite CS(&I);
if (const Function *F = CS.getCalledFunction()) {
@@ -1043,10 +1248,6 @@ void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
}
return;
}
- // Don't unroll vectorised loop. MVE does not benefit from it as much as
- // scalar code.
- if (I.getType()->isVectorTy())
- return;
SmallVector<const Value*, 4> Operands(I.value_op_begin(),
I.value_op_end());
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index a878fdcfe3c7..880588adfdfd 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -69,15 +69,15 @@ class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> {
ARM::FeatureDontWidenVMOVS, ARM::FeatureExpandMLx,
ARM::FeatureHasVMLxHazards, ARM::FeatureNEONForFPMovs,
ARM::FeatureNEONForFP, ARM::FeatureCheckVLDnAlign,
- ARM::FeatureHasSlowFPVMLx, ARM::FeatureVMLxForwarding,
- ARM::FeaturePref32BitThumb, ARM::FeatureAvoidPartialCPSR,
- ARM::FeatureCheapPredicableCPSR, ARM::FeatureAvoidMOVsShOp,
- ARM::FeatureHasRetAddrStack, ARM::FeatureHasNoBranchPredictor,
- ARM::FeatureDSP, ARM::FeatureMP, ARM::FeatureVirtualization,
- ARM::FeatureMClass, ARM::FeatureRClass, ARM::FeatureAClass,
- ARM::FeatureNaClTrap, ARM::FeatureStrictAlign, ARM::FeatureLongCalls,
- ARM::FeatureExecuteOnly, ARM::FeatureReserveR9, ARM::FeatureNoMovt,
- ARM::FeatureNoNegativeImmediates
+ ARM::FeatureHasSlowFPVMLx, ARM::FeatureHasSlowFPVFMx,
+ ARM::FeatureVMLxForwarding, ARM::FeaturePref32BitThumb,
+ ARM::FeatureAvoidPartialCPSR, ARM::FeatureCheapPredicableCPSR,
+ ARM::FeatureAvoidMOVsShOp, ARM::FeatureHasRetAddrStack,
+ ARM::FeatureHasNoBranchPredictor, ARM::FeatureDSP, ARM::FeatureMP,
+ ARM::FeatureVirtualization, ARM::FeatureMClass, ARM::FeatureRClass,
+ ARM::FeatureAClass, ARM::FeatureNaClTrap, ARM::FeatureStrictAlign,
+ ARM::FeatureLongCalls, ARM::FeatureExecuteOnly, ARM::FeatureReserveR9,
+ ARM::FeatureNoMovt, ARM::FeatureNoNegativeImmediates
};
const ARMSubtarget *getST() const { return ST; }
@@ -115,7 +115,7 @@ public:
using BaseT::getIntImmCost;
int getIntImmCost(const APInt &Imm, Type *Ty);
- int getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty);
+ int getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty);
/// @}
@@ -159,6 +159,10 @@ public:
return isLegalMaskedLoad(DataTy, Alignment);
}
+ bool isLegalMaskedGather(Type *Ty, MaybeAlign Alignment);
+
+ bool isLegalMaskedScatter(Type *Ty, MaybeAlign Alignment) { return false; }
+
int getMemcpyCost(const Instruction *I);
int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
@@ -187,9 +191,10 @@ public:
TTI::OperandValueKind Op2Info = TTI::OK_AnyValue,
TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
- ArrayRef<const Value *> Args = ArrayRef<const Value *>());
+ ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
+ const Instruction *CxtI = nullptr);
- int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+ int getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
unsigned AddressSpace, const Instruction *I = nullptr);
int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
@@ -203,7 +208,12 @@ public:
AssumptionCache &AC,
TargetLibraryInfo *LibInfo,
HardwareLoopInfo &HWLoopInfo);
-
+ bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
+ ScalarEvolution &SE,
+ AssumptionCache &AC,
+ TargetLibraryInfo *TLI,
+ DominatorTree *DT,
+ const LoopAccessInfo *LAI);
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP);
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index d2c355c1da75..f6d76ee09534 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -6554,7 +6554,8 @@ bool ARMAsmParser::shouldOmitCCOutOperand(StringRef Mnemonic,
// Check against T3. If the second register is the PC, this is an
// alternate form of ADR, which uses encoding T4, so check for that too.
if (static_cast<ARMOperand &>(*Operands[4]).getReg() != ARM::PC &&
- static_cast<ARMOperand &>(*Operands[5]).isT2SOImm())
+ (static_cast<ARMOperand &>(*Operands[5]).isT2SOImm() ||
+ static_cast<ARMOperand &>(*Operands[5]).isT2SOImmNeg()))
return false;
// Otherwise, we use encoding T4, which does not have a cc_out
@@ -6609,9 +6610,34 @@ bool ARMAsmParser::shouldOmitCCOutOperand(StringRef Mnemonic,
static_cast<ARMOperand &>(*Operands[1]).getReg() == 0 &&
(static_cast<ARMOperand &>(*Operands[4]).isImm() ||
(Operands.size() == 6 &&
- static_cast<ARMOperand &>(*Operands[5]).isImm())))
- return true;
-
+ static_cast<ARMOperand &>(*Operands[5]).isImm()))) {
+ // Thumb2 (add|sub){s}{p}.w GPRnopc, sp, #{T2SOImm} has cc_out
+ return (!(isThumbTwo() &&
+ (static_cast<ARMOperand &>(*Operands[4]).isT2SOImm() ||
+ static_cast<ARMOperand &>(*Operands[4]).isT2SOImmNeg())));
+ }
+  // FIXME: Should join all the Thumb and Thumb2 (add|sub) handling in a
+  // single if case.
+ // Thumb2 ADD r0, #4095 -> ADDW r0, r0, #4095 (T4)
+ // Thumb2 SUB r0, #4095 -> SUBW r0, r0, #4095
+ if (isThumbTwo() && (Mnemonic == "add" || Mnemonic == "sub") &&
+ (Operands.size() == 5) &&
+ static_cast<ARMOperand &>(*Operands[3]).isReg() &&
+ static_cast<ARMOperand &>(*Operands[3]).getReg() != ARM::SP &&
+ static_cast<ARMOperand &>(*Operands[3]).getReg() != ARM::PC &&
+ static_cast<ARMOperand &>(*Operands[1]).getReg() == 0 &&
+ static_cast<ARMOperand &>(*Operands[4]).isImm()) {
+ const ARMOperand &IMM = static_cast<ARMOperand &>(*Operands[4]);
+ if (IMM.isT2SOImm() || IMM.isT2SOImmNeg())
+ return false; // add.w / sub.w
+ if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(IMM.getImm())) {
+ const int64_t Value = CE->getValue();
+ // Thumb1 imm8 sub / add
+ if ((Value < ((1 << 7) - 1) << 2) && inITBlock() && (!(Value & 3)) &&
+ isARMLowRegister(static_cast<ARMOperand &>(*Operands[3]).getReg()))
+ return false;
+ return true; // Thumb2 T4 addw / subw
+ }
+ }
return false;
}
@@ -6703,7 +6729,7 @@ static void applyMnemonicAliases(StringRef &Mnemonic,
// omitted. We don't have a way to do that in tablegen, so fix it up here.
//
// We have to be careful to not emit an invalid Rt2 here, because the rest of
-// the assmebly parser could then generate confusing diagnostics refering to
+// the assembly parser could then generate confusing diagnostics referring to
// it. If we do find anything that prevents us from doing the transformation we
// bail out, and let the assembly parser report an error on the instruction as
// it is written.
@@ -7707,12 +7733,8 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
}
break;
- case ARM::t2ADDri:
- case ARM::t2ADDri12:
case ARM::t2ADDrr:
case ARM::t2ADDrs:
- case ARM::t2SUBri:
- case ARM::t2SUBri12:
case ARM::t2SUBrr:
case ARM::t2SUBrs:
if (Inst.getOperand(0).getReg() == ARM::SP &&
@@ -7895,10 +7917,10 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
case ARM::MVE_VQDMULLs32bh:
case ARM::MVE_VQDMULLs32th:
case ARM::MVE_VCMULf32:
- case ARM::MVE_VMULLs32bh:
- case ARM::MVE_VMULLs32th:
- case ARM::MVE_VMULLu32bh:
- case ARM::MVE_VMULLu32th: {
+ case ARM::MVE_VMULLBs32:
+ case ARM::MVE_VMULLTs32:
+ case ARM::MVE_VMULLBu32:
+ case ARM::MVE_VMULLTu32: {
if (Operands[3]->getReg() == Operands[4]->getReg()) {
return Error (Operands[3]->getStartLoc(),
"Qd register and Qn register can't be identical");
@@ -9750,23 +9772,33 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
}
break;
case ARM::t2ADDri12:
- // If the immediate fits for encoding T3 (t2ADDri) and the generic "add"
- // mnemonic was used (not "addw"), encoding T3 is preferred.
- if (static_cast<ARMOperand &>(*Operands[0]).getToken() != "add" ||
- ARM_AM::getT2SOImmVal(Inst.getOperand(2).getImm()) == -1)
- break;
- Inst.setOpcode(ARM::t2ADDri);
- Inst.addOperand(MCOperand::createReg(0)); // cc_out
- break;
case ARM::t2SUBri12:
- // If the immediate fits for encoding T3 (t2SUBri) and the generic "sub"
- // mnemonic was used (not "subw"), encoding T3 is preferred.
- if (static_cast<ARMOperand &>(*Operands[0]).getToken() != "sub" ||
+ case ARM::t2ADDspImm12:
+ case ARM::t2SUBspImm12: {
+ // If the immediate fits for encoding T3 and the generic
+ // mnemonic was used, encoding T3 is preferred.
+ const StringRef Token = static_cast<ARMOperand &>(*Operands[0]).getToken();
+ if ((Token != "add" && Token != "sub") ||
ARM_AM::getT2SOImmVal(Inst.getOperand(2).getImm()) == -1)
break;
- Inst.setOpcode(ARM::t2SUBri);
+ switch (Inst.getOpcode()) {
+ case ARM::t2ADDri12:
+ Inst.setOpcode(ARM::t2ADDri);
+ break;
+ case ARM::t2SUBri12:
+ Inst.setOpcode(ARM::t2SUBri);
+ break;
+ case ARM::t2ADDspImm12:
+ Inst.setOpcode(ARM::t2ADDspImm);
+ break;
+ case ARM::t2SUBspImm12:
+ Inst.setOpcode(ARM::t2SUBspImm);
+ break;
+ }
+
Inst.addOperand(MCOperand::createReg(0)); // cc_out
- break;
+ return true;
+ }
case ARM::tADDi8:
// If the immediate is in the range 0-7, we want tADDi3 iff Rd was
// explicitly specified. From the ARM ARM: "Encoding T1 is preferred
@@ -9812,6 +9844,25 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
Inst = TmpInst;
return true;
}
+ case ARM::t2ADDspImm:
+ case ARM::t2SUBspImm: {
+ // Prefer T1 encoding if possible
+ if (Inst.getOperand(5).getReg() != 0 || HasWideQualifier)
+ break;
+ unsigned V = Inst.getOperand(2).getImm();
+ if (V & 3 || V > ((1 << 7) - 1) << 2)
+ break;
+ MCInst TmpInst;
+ TmpInst.setOpcode(Inst.getOpcode() == ARM::t2ADDspImm ? ARM::tADDspi
+ : ARM::tSUBspi);
+ TmpInst.addOperand(MCOperand::createReg(ARM::SP)); // destination reg
+ TmpInst.addOperand(MCOperand::createReg(ARM::SP)); // source reg
+ TmpInst.addOperand(MCOperand::createImm(V / 4)); // immediate
+ TmpInst.addOperand(Inst.getOperand(3)); // pred
+ TmpInst.addOperand(Inst.getOperand(4));
+ Inst = TmpInst;
+ return true;
+ }
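
The guard at the top of this case, V & 3 || V > ((1 << 7) - 1) << 2, is just "the immediate is a multiple of 4 and at most 508", i.e. it fits the 7-bit, word-scaled immediate of the T1 SP-adjusting encodings. A tiny standalone check of the same condition with the boundary values spelled out (the helper name is hypothetical):

    #include <cassert>

    // T1 tADDspi/tSUBspi take imm7 scaled by 4: multiples of 4 up to 508.
    static bool fitsT1SpImm(unsigned V) {
      return (V & 3) == 0 && V <= ((1u << 7) - 1) << 2; // 508
    }

    int main() {
      assert(fitsT1SpImm(0));
      assert(fitsT1SpImm(508));
      assert(!fitsT1SpImm(510));  // not a multiple of 4
      assert(!fitsT1SpImm(512));  // too large for imm7 << 2
    }
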
case ARM::t2ADDrr: {
// If the destination and first source operand are the same, and
// there's no setting of the flags, use encoding T2 instead of T3.
@@ -11495,7 +11546,7 @@ bool ARMAsmParser::parseDirectiveThumbSet(SMLoc L) {
}
/// Force static initialization.
-extern "C" void LLVMInitializeARMAsmParser() {
+extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeARMAsmParser() {
RegisterMCAsmParser<ARMAsmParser> X(getTheARMLETarget());
RegisterMCAsmParser<ARMAsmParser> Y(getTheARMBETarget());
RegisterMCAsmParser<ARMAsmParser> A(getTheThumbLETarget());
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
index eabc26d05f47..d26b04556abb 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
@@ -137,18 +137,15 @@ public:
DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size,
ArrayRef<uint8_t> Bytes, uint64_t Address,
- raw_ostream &VStream,
raw_ostream &CStream) const override;
private:
DecodeStatus getARMInstruction(MCInst &Instr, uint64_t &Size,
ArrayRef<uint8_t> Bytes, uint64_t Address,
- raw_ostream &VStream,
raw_ostream &CStream) const;
DecodeStatus getThumbInstruction(MCInst &Instr, uint64_t &Size,
ArrayRef<uint8_t> Bytes, uint64_t Address,
- raw_ostream &VStream,
raw_ostream &CStream) const;
mutable ITStatus ITBlock;
@@ -204,6 +201,9 @@ static DecodeStatus DecoderGPRRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address, const void *Decoder);
static DecodeStatus DecodeGPRPairRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeGPRspRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
static DecodeStatus DecodeHPRRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address, const void *Decoder);
static DecodeStatus DecodeSPRRegisterClass(MCInst &Inst, unsigned RegNo,
@@ -566,6 +566,9 @@ static DecodeStatus DecodeMVEVPNOT(MCInst &Inst, unsigned Insn,
static DecodeStatus DecodeMVEOverlappingLongShift(MCInst &Inst, unsigned Insn,
uint64_t Address,
const void *Decoder);
+static DecodeStatus DecodeT2AddSubSPImm(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+
#include "ARMGenDisassemblerTables.inc"
static MCDisassembler *createARMDisassembler(const Target &T,
@@ -576,8 +579,7 @@ static MCDisassembler *createARMDisassembler(const Target &T,
// Post-decoding checks
static DecodeStatus checkDecodedInstruction(MCInst &MI, uint64_t &Size,
- uint64_t Address, raw_ostream &OS,
- raw_ostream &CS,
+ uint64_t Address, raw_ostream &CS,
uint32_t Insn,
DecodeStatus Result) {
switch (MI.getOpcode()) {
@@ -609,17 +611,16 @@ static DecodeStatus checkDecodedInstruction(MCInst &MI, uint64_t &Size,
DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
ArrayRef<uint8_t> Bytes,
- uint64_t Address, raw_ostream &OS,
+ uint64_t Address,
raw_ostream &CS) const {
if (STI.getFeatureBits()[ARM::ModeThumb])
- return getThumbInstruction(MI, Size, Bytes, Address, OS, CS);
- return getARMInstruction(MI, Size, Bytes, Address, OS, CS);
+ return getThumbInstruction(MI, Size, Bytes, Address, CS);
+ return getARMInstruction(MI, Size, Bytes, Address, CS);
}
DecodeStatus ARMDisassembler::getARMInstruction(MCInst &MI, uint64_t &Size,
ArrayRef<uint8_t> Bytes,
uint64_t Address,
- raw_ostream &OS,
raw_ostream &CS) const {
CommentStream = &CS;
@@ -642,7 +643,7 @@ DecodeStatus ARMDisassembler::getARMInstruction(MCInst &MI, uint64_t &Size,
decodeInstruction(DecoderTableARM32, MI, Insn, Address, this, STI);
if (Result != MCDisassembler::Fail) {
Size = 4;
- return checkDecodedInstruction(MI, Size, Address, OS, CS, Insn, Result);
+ return checkDecodedInstruction(MI, Size, Address, CS, Insn, Result);
}
struct DecodeTable {
@@ -673,7 +674,7 @@ DecodeStatus ARMDisassembler::getARMInstruction(MCInst &MI, uint64_t &Size,
decodeInstruction(DecoderTableCoProc32, MI, Insn, Address, this, STI);
if (Result != MCDisassembler::Fail) {
Size = 4;
- return checkDecodedInstruction(MI, Size, Address, OS, CS, Insn, Result);
+ return checkDecodedInstruction(MI, Size, Address, CS, Insn, Result);
}
Size = 4;
@@ -906,7 +907,6 @@ void ARMDisassembler::UpdateThumbVFPPredicate(
DecodeStatus ARMDisassembler::getThumbInstruction(MCInst &MI, uint64_t &Size,
ArrayRef<uint8_t> Bytes,
uint64_t Address,
- raw_ostream &OS,
raw_ostream &CS) const {
CommentStream = &CS;
@@ -1010,7 +1010,7 @@ DecodeStatus ARMDisassembler::getThumbInstruction(MCInst &MI, uint64_t &Size,
if (Result != MCDisassembler::Fail) {
Size = 4;
Check(Result, AddThumbPredicate(MI));
- return checkDecodedInstruction(MI, Size, Address, OS, CS, Insn32, Result);
+ return checkDecodedInstruction(MI, Size, Address, CS, Insn32, Result);
}
if (fieldFromInstruction(Insn32, 28, 4) == 0xE) {
@@ -1099,7 +1099,7 @@ DecodeStatus ARMDisassembler::getThumbInstruction(MCInst &MI, uint64_t &Size,
return MCDisassembler::Fail;
}
-extern "C" void LLVMInitializeARMDisassembler() {
+extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeARMDisassembler() {
TargetRegistry::RegisterMCDisassembler(getTheARMLETarget(),
createARMDisassembler);
TargetRegistry::RegisterMCDisassembler(getTheARMBETarget(),
@@ -1231,6 +1231,17 @@ static DecodeStatus DecodeGPRPairRegisterClass(MCInst &Inst, unsigned RegNo,
return S;
}
+static DecodeStatus DecodeGPRspRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ if (RegNo != 13)
+ return MCDisassembler::Fail;
+
+ unsigned Register = GPRDecoderTable[RegNo];
+ Inst.addOperand(MCOperand::createReg(Register));
+ return MCDisassembler::Success;
+}
+
static DecodeStatus DecodetcGPRRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address, const void *Decoder) {
unsigned Register = 0;
@@ -5588,14 +5599,25 @@ static DecodeStatus DecodeT2Adr(MCInst &Inst, uint32_t Insn,
unsigned sign1 = fieldFromInstruction(Insn, 21, 1);
unsigned sign2 = fieldFromInstruction(Insn, 23, 1);
if (sign1 != sign2) return MCDisassembler::Fail;
+ const unsigned Rd = fieldFromInstruction(Insn, 8, 4);
+ assert(Inst.getNumOperands() == 0 && "We should receive an empty Inst");
+ DecodeStatus S = DecoderGPRRegisterClass(Inst, Rd, Address, Decoder);
unsigned Val = fieldFromInstruction(Insn, 0, 8);
Val |= fieldFromInstruction(Insn, 12, 3) << 8;
Val |= fieldFromInstruction(Insn, 26, 1) << 11;
- Val |= sign1 << 12;
- Inst.addOperand(MCOperand::createImm(SignExtend32<13>(Val)));
-
- return MCDisassembler::Success;
+ // If the sign bit is set, the offset decreases the address.
+ if (sign1) {
+ // Following the ARMv7 Architecture Manual, when the offset
+ // is zero, it is decoded as a subw, not as an adr.w
+ if (!Val) {
+ Inst.setOpcode(ARM::t2SUBri12);
+ Inst.addOperand(MCOperand::createReg(ARM::PC));
+ } else
+ Val = -Val;
+ }
+ Inst.addOperand(MCOperand::createImm(Val));
+ return S;
}
static DecodeStatus DecodeT2ShifterImmOperand(MCInst &Inst, uint32_t Val,
@@ -6595,3 +6617,40 @@ static DecodeStatus DecodeMVEVPNOT(MCInst &Inst, unsigned Insn, uint64_t Address
Inst.addOperand(MCOperand::createReg(ARM::VPR));
return S;
}
+
+static DecodeStatus DecodeT2AddSubSPImm(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ const unsigned Rd = fieldFromInstruction(Insn, 8, 4);
+ const unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+ const unsigned Imm12 = fieldFromInstruction(Insn, 26, 1) << 11 |
+ fieldFromInstruction(Insn, 12, 3) << 8 |
+ fieldFromInstruction(Insn, 0, 8);
+ const unsigned TypeT3 = fieldFromInstruction(Insn, 25, 1);
+ unsigned sign1 = fieldFromInstruction(Insn, 21, 1);
+ unsigned sign2 = fieldFromInstruction(Insn, 23, 1);
+ unsigned S = fieldFromInstruction(Insn, 20, 1);
+ if (sign1 != sign2)
+ return MCDisassembler::Fail;
+
+ // T3 does a zext of imm12, where T2 does a ThumbExpandImm (T2SOImm)
+ DecodeStatus DS = MCDisassembler::Success;
+ if ((!Check(DS,
+ DecodeGPRspRegisterClass(Inst, Rd, Address, Decoder))) || // dst
+ (!Check(DS, DecodeGPRspRegisterClass(Inst, Rn, Address, Decoder))))
+ return MCDisassembler::Fail;
+ if (TypeT3) {
+ Inst.setOpcode(sign1 ? ARM::t2SUBspImm12 : ARM::t2ADDspImm12);
+ S = 0;
+ Inst.addOperand(MCOperand::createImm(Imm12)); // zext imm12
+ } else {
+ Inst.setOpcode(sign1 ? ARM::t2SUBspImm : ARM::t2ADDspImm);
+ if (!Check(DS, DecodeT2SOImm(Inst, Imm12, Address, Decoder))) // imm12
+ return MCDisassembler::Fail;
+ }
+ if (!Check(DS, DecodeCCOutOperand(Inst, S, Address, Decoder))) // cc_out
+ return MCDisassembler::Fail;
+
+ Inst.addOperand(MCOperand::createReg(0)); // pred
+
+ return DS;
+}
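Editor's note: DecodeT2AddSubSPImm above rebuilds the 12-bit immediate from the split i:imm3:imm8 fields of the 32-bit encoding (bit 26, bits 12-14, bits 0-7), then uses bit 25 to pick the zero-extended imm12 flavour and bits 21/23 as the duplicated sign. A tiny sketch of that field surgery with plain shifts and masks; the helper names are illustrative, only the bit positions come from the code above.

    #include <cstdint>

    // Extract Len bits starting at bit Pos of a 32-bit Thumb-2 instruction word.
    static uint32_t field(uint32_t Insn, unsigned Pos, unsigned Len) {
      return (Insn >> Pos) & ((1u << Len) - 1);
    }

    // Rebuild imm12 = i:imm3:imm8 exactly as the decoder above does.
    static uint32_t decodeAddSubSPImm12(uint32_t Insn) {
      return field(Insn, 26, 1) << 11 |  // i
             field(Insn, 12, 3) << 8  |  // imm3
             field(Insn, 0, 8);          // imm8
    }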
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
index 1fee38821a49..2c26dd388c05 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
@@ -255,8 +255,11 @@ void ARMELFObjectWriter::addTargetSectionFlags(MCContext &Ctx,
// execute-only section in the object.
MCSectionELF *TextSection =
static_cast<MCSectionELF *>(Ctx.getObjectFileInfo()->getTextSection());
- if (Sec.getKind().isExecuteOnly() && !TextSection->hasInstructions() &&
- !TextSection->hasData()) {
+ if (Sec.getKind().isExecuteOnly() && !TextSection->hasInstructions()) {
+ for (auto &F : TextSection->getFragmentList())
+ if (auto *DF = dyn_cast<MCDataFragment>(&F))
+ if (!DF->getContents().empty())
+ return;
TextSection->setFlags(TextSection->getFlags() | ELF::SHF_ARM_PURECODE);
}
}
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
index f51fbdcd84da..f558ca8d2d9f 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
@@ -441,10 +441,12 @@ public:
friend class ARMTargetELFStreamer;
ARMELFStreamer(MCContext &Context, std::unique_ptr<MCAsmBackend> TAB,
- std::unique_ptr<MCObjectWriter> OW, std::unique_ptr<MCCodeEmitter> Emitter,
- bool IsThumb)
- : MCELFStreamer(Context, std::move(TAB), std::move(OW), std::move(Emitter)),
- IsThumb(IsThumb) {
+ std::unique_ptr<MCObjectWriter> OW,
+ std::unique_ptr<MCCodeEmitter> Emitter, bool IsThumb,
+ bool IsAndroid)
+ : MCELFStreamer(Context, std::move(TAB), std::move(OW),
+ std::move(Emitter)),
+ IsThumb(IsThumb), IsAndroid(IsAndroid) {
EHReset();
}
@@ -657,11 +659,10 @@ private:
uint64_t Offset) {
auto *Symbol = cast<MCSymbolELF>(getContext().getOrCreateSymbol(
Name + "." + Twine(MappingSymbolCounter++)));
- EmitLabel(Symbol, Loc, F);
+ EmitLabelAtPos(Symbol, Loc, F, Offset);
Symbol->setType(ELF::STT_NOTYPE);
Symbol->setBinding(ELF::STB_LOCAL);
Symbol->setExternal(false);
- Symbol->setOffset(Offset);
}
void EmitThumbFunc(MCSymbol *Func) override {
@@ -687,6 +688,7 @@ private:
void EmitFixup(const MCExpr *Expr, MCFixupKind Kind);
bool IsThumb;
+ bool IsAndroid;
int64_t MappingSymbolCounter = 0;
DenseMap<const MCSection *, std::unique_ptr<ElfMappingSymbolInfo>>
@@ -1269,7 +1271,12 @@ void ARMELFStreamer::emitFnEnd() {
// Emit the exception index table entry
SwitchToExIdxSection(*FnStart);
- if (PersonalityIndex < ARM::EHABI::NUM_PERSONALITY_INDEX)
+ // The EHABI requires a dependency preserving R_ARM_NONE relocation to the
+ // personality routine to protect it from an arbitrary platform's static
+ // linker garbage collection. We disable this for Android where the unwinder
+ // is either dynamically linked or directly references the personality
+ // routine.
+ if (PersonalityIndex < ARM::EHABI::NUM_PERSONALITY_INDEX && !IsAndroid)
EmitPersonalityFixup(GetAEABIUnwindPersonalityName(PersonalityIndex));
const MCSymbolRefExpr *FnStartRef =
@@ -1504,9 +1511,11 @@ MCELFStreamer *createARMELFStreamer(MCContext &Context,
std::unique_ptr<MCAsmBackend> TAB,
std::unique_ptr<MCObjectWriter> OW,
std::unique_ptr<MCCodeEmitter> Emitter,
- bool RelaxAll, bool IsThumb) {
- ARMELFStreamer *S = new ARMELFStreamer(Context, std::move(TAB), std::move(OW),
- std::move(Emitter), IsThumb);
+ bool RelaxAll, bool IsThumb,
+ bool IsAndroid) {
+ ARMELFStreamer *S =
+ new ARMELFStreamer(Context, std::move(TAB), std::move(OW),
+ std::move(Emitter), IsThumb, IsAndroid);
// FIXME: This should eventually end up somewhere else where more
// intelligent flag decisions can be made. For now we are just maintaining
// the status quo for ARM and setting EF_ARM_EABI_VER5 as the default.
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp
index a1def61b58d9..b36106a78b71 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp
@@ -88,8 +88,9 @@ void ARMInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
OS << markup("<reg:") << getRegisterName(RegNo, DefaultAltIdx) << markup(">");
}
-void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
- StringRef Annot, const MCSubtargetInfo &STI) {
+void ARMInstPrinter::printInst(const MCInst *MI, uint64_t Address,
+ StringRef Annot, const MCSubtargetInfo &STI,
+ raw_ostream &O) {
unsigned Opcode = MI->getOpcode();
switch (Opcode) {
@@ -275,7 +276,7 @@ void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
// Copy the rest operands into NewMI.
for (unsigned i = isStore ? 3 : 2; i < MI->getNumOperands(); ++i)
NewMI.addOperand(MI->getOperand(i));
- printInstruction(&NewMI, STI, O);
+ printInstruction(&NewMI, Address, STI, O);
return;
}
break;
@@ -288,7 +289,7 @@ void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
switch (MI->getOperand(0).getImm()) {
default:
if (!printAliasInstr(MI, STI, O))
- printInstruction(MI, STI, O);
+ printInstruction(MI, Address, STI, O);
break;
case 0:
O << "\tssbb";
@@ -302,7 +303,7 @@ void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
}
if (!printAliasInstr(MI, STI, O))
- printInstruction(MI, STI, O);
+ printInstruction(MI, Address, STI, O);
printAnnotation(O, Annot);
}
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h
index eeb811e216fc..20f901033395 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h
@@ -25,13 +25,13 @@ public:
bool applyTargetSpecificCLOption(StringRef Opt) override;
- void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot,
- const MCSubtargetInfo &STI) override;
+ void printInst(const MCInst *MI, uint64_t Address, StringRef Annot,
+ const MCSubtargetInfo &STI, raw_ostream &O) override;
void printRegName(raw_ostream &OS, unsigned RegNo) const override;
// Autogenerated by tblgen.
- void printInstruction(const MCInst *MI, const MCSubtargetInfo &STI,
- raw_ostream &O);
+ void printInstruction(const MCInst *MI, uint64_t Address,
+ const MCSubtargetInfo &STI, raw_ostream &O);
virtual bool printAliasInstr(const MCInst *MI, const MCSubtargetInfo &STI,
raw_ostream &O);
virtual void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
index 90022a8d88a6..9f60e70e0e02 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
@@ -187,7 +187,8 @@ static MCRegisterInfo *createARMMCRegisterInfo(const Triple &Triple) {
}
static MCAsmInfo *createARMMCAsmInfo(const MCRegisterInfo &MRI,
- const Triple &TheTriple) {
+ const Triple &TheTriple,
+ const MCTargetOptions &Options) {
MCAsmInfo *MAI;
if (TheTriple.isOSDarwin() || TheTriple.isOSBinFormatMachO())
MAI = new ARMMCAsmInfoDarwin(TheTriple);
@@ -211,7 +212,8 @@ static MCStreamer *createELFStreamer(const Triple &T, MCContext &Ctx,
bool RelaxAll) {
return createARMELFStreamer(
Ctx, std::move(MAB), std::move(OW), std::move(Emitter), false,
- (T.getArch() == Triple::thumb || T.getArch() == Triple::thumbeb));
+ (T.getArch() == Triple::thumb || T.getArch() == Triple::thumbeb),
+ T.isAndroid());
}
static MCStreamer *
@@ -315,7 +317,7 @@ static MCInstrAnalysis *createThumbMCInstrAnalysis(const MCInstrInfo *Info) {
}
// Force static initialization.
-extern "C" void LLVMInitializeARMTargetMC() {
+extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeARMTargetMC() {
for (Target *T : {&getTheARMLETarget(), &getTheARMBETarget(),
&getTheThumbLETarget(), &getTheThumbBETarget()}) {
// Register the MC asm info.
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp
index 38667d686b85..a9460b70da56 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
//
-// This file implements the unwind opcode assmebler for ARM exception handling
+// This file implements the unwind opcode assembler for ARM exception handling
// table.
//
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h
index c3134c04b33a..5fb7307159d1 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
//
-// This file declares the unwind opcode assmebler for ARM exception handling
+// This file declares the unwind opcode assembler for ARM exception handling
// table.
//
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
new file mode 100644
index 000000000000..9f64af02e698
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
@@ -0,0 +1,301 @@
+//===- MVEGatherScatterLowering.cpp - Gather/Scatter lowering -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// This pass custom lowers llvm.gather and llvm.scatter instructions to
+/// arm.mve.gather and arm.mve.scatter intrinsics, optimising the code to
+/// produce a better final result as we go.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMSubtarget.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsARM.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include <algorithm>
+#include <cassert>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mve-gather-scatter-lowering"
+
+cl::opt<bool> EnableMaskedGatherScatters(
+ "enable-arm-maskedgatscat", cl::Hidden, cl::init(false),
+ cl::desc("Enable the generation of masked gathers and scatters"));
+
+namespace {
+
+class MVEGatherScatterLowering : public FunctionPass {
+public:
+ static char ID; // Pass identification, replacement for typeid
+
+ explicit MVEGatherScatterLowering() : FunctionPass(ID) {
+ initializeMVEGatherScatterLoweringPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+
+ StringRef getPassName() const override {
+ return "MVE gather/scatter lowering";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<TargetPassConfig>();
+ FunctionPass::getAnalysisUsage(AU);
+ }
+
+private:
+ // Check this is a valid gather with correct alignment
+ bool isLegalTypeAndAlignment(unsigned NumElements, unsigned ElemSize,
+ unsigned Alignment);
+ // Check whether Ptr is hidden behind a bitcast and look through it
+ void lookThroughBitcast(Value *&Ptr);
+ // Check for a getelementptr and deduce base and offsets from it, on success
+ // returning the base directly and the offsets indirectly using the Offsets
+ // argument
+ Value *checkGEP(Value *&Offsets, Type *Ty, Value *Ptr, IRBuilder<> Builder);
+
+ bool lowerGather(IntrinsicInst *I);
+ // Create a gather from a base + vector of offsets
+ Value *tryCreateMaskedGatherOffset(IntrinsicInst *I, Value *Ptr,
+ IRBuilder<> Builder);
+ // Create a gather from a vector of pointers
+ Value *tryCreateMaskedGatherBase(IntrinsicInst *I, Value *Ptr,
+ IRBuilder<> Builder);
+};
+
+} // end anonymous namespace
+
+char MVEGatherScatterLowering::ID = 0;
+
+INITIALIZE_PASS(MVEGatherScatterLowering, DEBUG_TYPE,
+ "MVE gather/scattering lowering pass", false, false)
+
+Pass *llvm::createMVEGatherScatterLoweringPass() {
+ return new MVEGatherScatterLowering();
+}
+
+bool MVEGatherScatterLowering::isLegalTypeAndAlignment(unsigned NumElements,
+ unsigned ElemSize,
+ unsigned Alignment) {
+ // Only allow non-extending gathers for now
+ if (((NumElements == 4 && ElemSize == 32) ||
+ (NumElements == 8 && ElemSize == 16) ||
+ (NumElements == 16 && ElemSize == 8)) &&
+ ElemSize / 8 <= Alignment)
+ return true;
+ LLVM_DEBUG(dbgs() << "masked gathers: instruction does not have valid "
+ << "alignment or vector type \n");
+ return false;
+}
+
+Value *MVEGatherScatterLowering::checkGEP(Value *&Offsets, Type *Ty, Value *Ptr,
+ IRBuilder<> Builder) {
+ GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
+ if (!GEP) {
+ LLVM_DEBUG(dbgs() << "masked gathers: no getelementpointer found\n");
+ return nullptr;
+ }
+ LLVM_DEBUG(dbgs() << "masked gathers: getelementpointer found. Loading"
+ << " from base + vector of offsets\n");
+ Value *GEPPtr = GEP->getPointerOperand();
+ if (GEPPtr->getType()->isVectorTy()) {
+ LLVM_DEBUG(dbgs() << "masked gathers: gather from a vector of pointers"
+ << " hidden behind a getelementptr currently not"
+ << " supported. Expanding.\n");
+ return nullptr;
+ }
+ if (GEP->getNumOperands() != 2) {
+ LLVM_DEBUG(dbgs() << "masked gathers: getelementptr with too many"
+ << " operands. Expanding.\n");
+ return nullptr;
+ }
+ Offsets = GEP->getOperand(1);
+ // SExt offsets inside masked gathers are not permitted by the architecture;
+ // we therefore can't fold them
+ if (ZExtInst *ZextOffs = dyn_cast<ZExtInst>(Offsets))
+ Offsets = ZextOffs->getOperand(0);
+ Type *OffsType = VectorType::getInteger(cast<VectorType>(Ty));
+ // If the offset we found does not have the type the intrinsic expects,
+ // i.e., the same type as the gather itself, we need to convert it (only i
+ // types) or fall back to expanding the gather
+ if (OffsType != Offsets->getType()) {
+ if (OffsType->getScalarSizeInBits() >
+ Offsets->getType()->getScalarSizeInBits()) {
+ LLVM_DEBUG(dbgs() << "masked gathers: extending offsets\n");
+ Offsets = Builder.CreateZExt(Offsets, OffsType, "");
+ } else {
+ LLVM_DEBUG(dbgs() << "masked gathers: no correct offset type. Can't"
+ << " create masked gather\n");
+ return nullptr;
+ }
+ }
+ // If none of the checks failed, return the gep's base pointer
+ return GEPPtr;
+}
+
+void MVEGatherScatterLowering::lookThroughBitcast(Value *&Ptr) {
+ // Look through bitcast instruction if #elements is the same
+ if (auto *BitCast = dyn_cast<BitCastInst>(Ptr)) {
+ Type *BCTy = BitCast->getType();
+ Type *BCSrcTy = BitCast->getOperand(0)->getType();
+ if (BCTy->getVectorNumElements() == BCSrcTy->getVectorNumElements()) {
+ LLVM_DEBUG(dbgs() << "masked gathers: looking through bitcast\n");
+ Ptr = BitCast->getOperand(0);
+ }
+ }
+}
+
+bool MVEGatherScatterLowering::lowerGather(IntrinsicInst *I) {
+ using namespace PatternMatch;
+ LLVM_DEBUG(dbgs() << "masked gathers: checking transform preconditions\n");
+
+ // @llvm.masked.gather.*(Ptrs, alignment, Mask, Src0)
+ // Attempt to turn the masked gather in I into an MVE intrinsic,
+ // potentially optimising the addressing modes as we do so.
+ Type *Ty = I->getType();
+ Value *Ptr = I->getArgOperand(0);
+ unsigned Alignment = cast<ConstantInt>(I->getArgOperand(1))->getZExtValue();
+ Value *Mask = I->getArgOperand(2);
+ Value *PassThru = I->getArgOperand(3);
+
+ if (!isLegalTypeAndAlignment(Ty->getVectorNumElements(),
+ Ty->getScalarSizeInBits(), Alignment))
+ return false;
+ lookThroughBitcast(Ptr);
+ assert(Ptr->getType()->isVectorTy() && "Unexpected pointer type");
+
+ IRBuilder<> Builder(I->getContext());
+ Builder.SetInsertPoint(I);
+ Builder.SetCurrentDebugLocation(I->getDebugLoc());
+ Value *Load = tryCreateMaskedGatherOffset(I, Ptr, Builder);
+ if (!Load)
+ Load = tryCreateMaskedGatherBase(I, Ptr, Builder);
+ if (!Load)
+ return false;
+
+ if (!isa<UndefValue>(PassThru) && !match(PassThru, m_Zero())) {
+ LLVM_DEBUG(dbgs() << "masked gathers: found non-trivial passthru - "
+ << "creating select\n");
+ Load = Builder.CreateSelect(Mask, Load, PassThru);
+ }
+
+ LLVM_DEBUG(dbgs() << "masked gathers: successfully built masked gather\n");
+ I->replaceAllUsesWith(Load);
+ I->eraseFromParent();
+ return true;
+}
+
+Value *MVEGatherScatterLowering::tryCreateMaskedGatherBase(
+ IntrinsicInst *I, Value *Ptr, IRBuilder<> Builder) {
+ using namespace PatternMatch;
+ LLVM_DEBUG(dbgs() << "masked gathers: loading from vector of pointers\n");
+ Type *Ty = I->getType();
+ if (Ty->getVectorNumElements() != 4)
+ // Can't build an intrinsic for this
+ return nullptr;
+ Value *Mask = I->getArgOperand(2);
+ if (match(Mask, m_One()))
+ return Builder.CreateIntrinsic(Intrinsic::arm_mve_vldr_gather_base,
+ {Ty, Ptr->getType()},
+ {Ptr, Builder.getInt32(0)});
+ else
+ return Builder.CreateIntrinsic(
+ Intrinsic::arm_mve_vldr_gather_base_predicated,
+ {Ty, Ptr->getType(), Mask->getType()},
+ {Ptr, Builder.getInt32(0), Mask});
+}
+
+Value *MVEGatherScatterLowering::tryCreateMaskedGatherOffset(
+ IntrinsicInst *I, Value *Ptr, IRBuilder<> Builder) {
+ using namespace PatternMatch;
+ Type *Ty = I->getType();
+ Value *Offsets;
+ Value *BasePtr = checkGEP(Offsets, Ty, Ptr, Builder);
+ if (!BasePtr)
+ return nullptr;
+
+ unsigned Scale;
+ int GEPElemSize =
+ BasePtr->getType()->getPointerElementType()->getPrimitiveSizeInBits();
+ int ResultElemSize = Ty->getScalarSizeInBits();
+ // This can be a 32-bit load scaled by 4, a 16-bit load scaled by 2, or an
+ // 8-bit, 16-bit or 32-bit load scaled by 1
+ if (GEPElemSize == 32 && ResultElemSize == 32) {
+ Scale = 2;
+ } else if (GEPElemSize == 16 && ResultElemSize == 16) {
+ Scale = 1;
+ } else if (GEPElemSize == 8) {
+ Scale = 0;
+ } else {
+ LLVM_DEBUG(dbgs() << "masked gathers: incorrect scale for load. Can't"
+ << " create masked gather\n");
+ return nullptr;
+ }
+
+ Value *Mask = I->getArgOperand(2);
+ if (!match(Mask, m_One()))
+ return Builder.CreateIntrinsic(
+ Intrinsic::arm_mve_vldr_gather_offset_predicated,
+ {Ty, BasePtr->getType(), Offsets->getType(), Mask->getType()},
+ {BasePtr, Offsets, Builder.getInt32(Ty->getScalarSizeInBits()),
+ Builder.getInt32(Scale), Builder.getInt32(1), Mask});
+ else
+ return Builder.CreateIntrinsic(
+ Intrinsic::arm_mve_vldr_gather_offset,
+ {Ty, BasePtr->getType(), Offsets->getType()},
+ {BasePtr, Offsets, Builder.getInt32(Ty->getScalarSizeInBits()),
+ Builder.getInt32(Scale), Builder.getInt32(1)});
+}
+
+bool MVEGatherScatterLowering::runOnFunction(Function &F) {
+ if (!EnableMaskedGatherScatters)
+ return false;
+ auto &TPC = getAnalysis<TargetPassConfig>();
+ auto &TM = TPC.getTM<TargetMachine>();
+ auto *ST = &TM.getSubtarget<ARMSubtarget>(F);
+ if (!ST->hasMVEIntegerOps())
+ return false;
+ SmallVector<IntrinsicInst *, 4> Gathers;
+ for (BasicBlock &BB : F) {
+ for (Instruction &I : BB) {
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I);
+ if (II && II->getIntrinsicID() == Intrinsic::masked_gather)
+ Gathers.push_back(II);
+ }
+ }
+
+ if (Gathers.empty())
+ return false;
+
+ for (IntrinsicInst *I : Gathers)
+ lowerGather(I);
+
+ return true;
+}
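Editor's note: in tryCreateMaskedGatherOffset above, the scale operand passed to the offset-form intrinsic is log2 of the GEP element size in bytes, and only the combinations the MVE gather addressing modes support are folded. Below is a minimal sketch of that decision detached from LLVM's type system; the std::optional wrapper and function name are assumptions for illustration.

    #include <optional>

    // Sketch: returns log2(byte scale) for a gather of ResultElemSize-bit lanes
    // addressed through a GEP over GEPElemSize-bit elements, or nullopt when
    // the pass above would refuse to fold the offsets.
    static std::optional<unsigned> gatherOffsetScale(int GEPElemSize,
                                                     int ResultElemSize) {
      if (GEPElemSize == 32 && ResultElemSize == 32)
        return 2;              // 32-bit load, offsets scaled by 4
      if (GEPElemSize == 16 && ResultElemSize == 16)
        return 1;              // 16-bit load, offsets scaled by 2
      if (GEPElemSize == 8)
        return 0;              // byte offsets, any supported lane size
      return std::nullopt;     // fall back to expanding the gather
    }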
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MVETailPredication.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/MVETailPredication.cpp
index 4db8ab17c49b..038c68739cdf 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/MVETailPredication.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/MVETailPredication.cpp
@@ -20,7 +20,14 @@
/// - A tail-predicated loop, with implicit predication.
/// - A loop containing multiple VCPT instructions, predicating multiple VPT
/// blocks of instructions operating on different vector types.
+///
+/// This pass inserts the VCTP intrinsic to represent the effect of
+/// tail predication. This will be picked up by the ARM Low-overhead loop pass,
+/// which performs the final transformation to a DLSTP or WLSTP tail-predicated
+/// loop.
+#include "ARM.h"
+#include "ARMSubtarget.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolution.h"
@@ -28,20 +35,19 @@
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
-#include "llvm/IR/Instructions.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicsARM.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "ARM.h"
-#include "ARMSubtarget.h"
using namespace llvm;
#define DEBUG_TYPE "mve-tail-predication"
#define DESC "Transform predicated vector loops to use MVE tail predication"
-static cl::opt<bool>
+cl::opt<bool>
DisableTailPredication("disable-mve-tail-predication", cl::Hidden,
cl::init(true),
cl::desc("Disable MVE Tail Predication"));
@@ -85,6 +91,12 @@ private:
/// Is the icmp that generates an i1 vector, based upon a loop counter
/// and a limit that is defined outside the loop.
bool isTailPredicate(Instruction *Predicate, Value *NumElements);
+
+ /// Insert the intrinsic to represent the effect of tail predication.
+ void InsertVCTPIntrinsic(Instruction *Predicate,
+ DenseMap<Instruction*, Instruction*> &NewPredicates,
+ VectorType *VecTy,
+ Value *NumElements);
};
} // end namespace
@@ -123,7 +135,7 @@ bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) {
// The MVE and LOB extensions are combined to enable tail-predication, but
// there's nothing preventing us from generating VCTP instructions for v8.1m.
if (!ST->hasMVEIntegerOps() || !ST->hasV8_1MMainlineOps()) {
- LLVM_DEBUG(dbgs() << "TP: Not a v8.1m.main+mve target.\n");
+ LLVM_DEBUG(dbgs() << "ARM TP: Not a v8.1m.main+mve target.\n");
return false;
}
@@ -148,7 +160,7 @@ bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) {
// Look for the hardware loop intrinsic that sets the iteration count.
IntrinsicInst *Setup = FindLoopIterations(Preheader);
- // The test.set iteration could live in the pre- preheader.
+ // The test.set iteration could live in the pre-preheader.
if (!Setup) {
if (!Preheader->getSinglePredecessor())
return false;
@@ -171,11 +183,9 @@ bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) {
if (!Decrement)
return false;
- LLVM_DEBUG(dbgs() << "TP: Running on Loop: " << *L
- << *Setup << "\n"
+ LLVM_DEBUG(dbgs() << "ARM TP: Running on Loop: " << *L << *Setup << "\n"
<< *Decrement << "\n");
- bool Changed = TryConvert(Setup->getArgOperand(0));
- return Changed;
+ return TryConvert(Setup->getArgOperand(0));
}
bool MVETailPredication::isTailPredicate(Instruction *I, Value *NumElements) {
@@ -208,7 +218,7 @@ bool MVETailPredication::isTailPredicate(Instruction *I, Value *NumElements) {
// The vector icmp
if (!match(I, m_ICmp(Pred, m_Instruction(Induction),
m_Instruction(Shuffle))) ||
- Pred != ICmpInst::ICMP_ULE || !L->isLoopInvariant(Shuffle))
+ Pred != ICmpInst::ICMP_ULE)
return false;
// First find the stuff outside the loop which is setting up the limit
@@ -230,11 +240,11 @@ bool MVETailPredication::isTailPredicate(Instruction *I, Value *NumElements) {
if (!match(BECount, m_Add(m_Value(TripCount), m_AllOnes())))
return false;
- if (TripCount != NumElements)
+ if (TripCount != NumElements || !L->isLoopInvariant(BECount))
return false;
// Now back to searching inside the loop body...
- // Find the add with takes the index iv and adds a constant vector to it.
+ // Find the add that takes the index iv and adds a constant vector to it.
Instruction *BroadcastSplat = nullptr;
Constant *Const = nullptr;
if (!match(Induction, m_Add(m_Instruction(BroadcastSplat),
@@ -269,14 +279,14 @@ bool MVETailPredication::isTailPredicate(Instruction *I, Value *NumElements) {
Value *OnEntry = Phi->getIncomingValueForBlock(L->getLoopPreheader());
if (!match(OnEntry, m_Zero()))
return false;
-
+
Value *InLoop = Phi->getIncomingValueForBlock(L->getLoopLatch());
unsigned Lanes = cast<VectorType>(Insert->getType())->getNumElements();
Instruction *LHS = nullptr;
if (!match(InLoop, m_Add(m_Instruction(LHS), m_SpecificInt(Lanes))))
return false;
-
+
return LHS == Phi;
}
@@ -298,8 +308,8 @@ bool MVETailPredication::IsPredicatedVectorLoop() {
unsigned ElementWidth = VecTy->getScalarSizeInBits();
// MVE vectors are 128-bit, but don't support 128 x i1.
// TODO: Can we support vectors larger than 128-bits?
- unsigned MaxWidth = TTI->getRegisterBitWidth(true);
- if (Lanes * ElementWidth != MaxWidth || Lanes == MaxWidth)
+ unsigned MaxWidth = TTI->getRegisterBitWidth(true);
+ if (Lanes * ElementWidth > MaxWidth || Lanes == MaxWidth)
return false;
MaskedInsts.push_back(cast<IntrinsicInst>(&I));
} else if (auto *Int = dyn_cast<IntrinsicInst>(&I)) {
@@ -399,19 +409,25 @@ Value* MVETailPredication::ComputeElements(Value *TripCount,
// tail predicated loop.
static void Cleanup(DenseMap<Instruction*, Instruction*> &NewPredicates,
SetVector<Instruction*> &MaybeDead, Loop *L) {
- if (BasicBlock *Exit = L->getUniqueExitBlock()) {
- for (auto &Pair : NewPredicates) {
- Instruction *OldPred = Pair.first;
- Instruction *NewPred = Pair.second;
-
- for (auto &I : *Exit) {
- if (I.isSameOperationAs(OldPred)) {
- Instruction *PredClone = NewPred->clone();
- PredClone->insertBefore(&I);
- I.replaceAllUsesWith(PredClone);
- MaybeDead.insert(&I);
- break;
- }
+ BasicBlock *Exit = L->getUniqueExitBlock();
+ if (!Exit) {
+ LLVM_DEBUG(dbgs() << "ARM TP: can't find loop exit block\n");
+ return;
+ }
+
+ for (auto &Pair : NewPredicates) {
+ Instruction *OldPred = Pair.first;
+ Instruction *NewPred = Pair.second;
+
+ for (auto &I : *Exit) {
+ if (I.isSameOperationAs(OldPred)) {
+ Instruction *PredClone = NewPred->clone();
+ PredClone->insertBefore(&I);
+ I.replaceAllUsesWith(PredClone);
+ MaybeDead.insert(&I);
+ LLVM_DEBUG(dbgs() << "ARM TP: replacing: "; I.dump();
+ dbgs() << "ARM TP: with: "; PredClone->dump());
+ break;
}
}
}
@@ -432,23 +448,69 @@ static void Cleanup(DenseMap<Instruction*, Instruction*> &NewPredicates,
Dead.insert(I);
}
- for (auto *I : Dead)
+ for (auto *I : Dead) {
+ LLVM_DEBUG(dbgs() << "ARM TP: removing dead insn: "; I->dump());
I->eraseFromParent();
+ }
for (auto I : L->blocks())
DeleteDeadPHIs(I);
}
+void MVETailPredication::InsertVCTPIntrinsic(Instruction *Predicate,
+ DenseMap<Instruction*, Instruction*> &NewPredicates,
+ VectorType *VecTy, Value *NumElements) {
+ IRBuilder<> Builder(L->getHeader()->getFirstNonPHI());
+ Module *M = L->getHeader()->getModule();
+ Type *Ty = IntegerType::get(M->getContext(), 32);
+
+ // Insert a phi to count the number of elements processed by the loop.
+ PHINode *Processed = Builder.CreatePHI(Ty, 2);
+ Processed->addIncoming(NumElements, L->getLoopPreheader());
+
+ // Insert the intrinsic to represent the effect of tail predication.
+ Builder.SetInsertPoint(cast<Instruction>(Predicate));
+ ConstantInt *Factor =
+ ConstantInt::get(cast<IntegerType>(Ty), VecTy->getNumElements());
+
+ Intrinsic::ID VCTPID;
+ switch (VecTy->getNumElements()) {
+ default:
+ llvm_unreachable("unexpected number of lanes");
+ case 4: VCTPID = Intrinsic::arm_mve_vctp32; break;
+ case 8: VCTPID = Intrinsic::arm_mve_vctp16; break;
+ case 16: VCTPID = Intrinsic::arm_mve_vctp8; break;
+
+ // FIXME: vctp64 currently not supported because the predicate
+ // vector wants to be <2 x i1>, but v2i1 is not a legal MVE
+ // type, so problems happen at isel time.
+ // Intrinsic::arm_mve_vctp64 exists for ACLE intrinsics
+ // purposes, but takes a v4i1 instead of a v2i1.
+ }
+ Function *VCTP = Intrinsic::getDeclaration(M, VCTPID);
+ Value *TailPredicate = Builder.CreateCall(VCTP, Processed);
+ Predicate->replaceAllUsesWith(TailPredicate);
+ NewPredicates[Predicate] = cast<Instruction>(TailPredicate);
+
+ // Add the incoming value to the new phi.
+ // TODO: This add likely already exists in the loop.
+ Value *Remaining = Builder.CreateSub(Processed, Factor);
+ Processed->addIncoming(Remaining, L->getLoopLatch());
+ LLVM_DEBUG(dbgs() << "ARM TP: Insert processed elements phi: "
+ << *Processed << "\n"
+ << "ARM TP: Inserted VCTP: " << *TailPredicate << "\n");
+}
+
bool MVETailPredication::TryConvert(Value *TripCount) {
- if (!IsPredicatedVectorLoop())
+ if (!IsPredicatedVectorLoop()) {
+ LLVM_DEBUG(dbgs() << "ARM TP: no masked instructions in loop");
return false;
+ }
- LLVM_DEBUG(dbgs() << "TP: Found predicated vector loop.\n");
+ LLVM_DEBUG(dbgs() << "ARM TP: Found predicated vector loop.\n");
// Walk through the masked intrinsics and try to find whether the predicate
// operand is generated from an induction variable.
- Module *M = L->getHeader()->getModule();
- Type *Ty = IntegerType::get(M->getContext(), 32);
SetVector<Instruction*> Predicates;
DenseMap<Instruction*, Instruction*> NewPredicates;
@@ -465,43 +527,14 @@ bool MVETailPredication::TryConvert(Value *TripCount) {
continue;
if (!isTailPredicate(Predicate, NumElements)) {
- LLVM_DEBUG(dbgs() << "TP: Not tail predicate: " << *Predicate << "\n");
+ LLVM_DEBUG(dbgs() << "ARM TP: Not tail predicate: " << *Predicate << "\n");
continue;
}
- LLVM_DEBUG(dbgs() << "TP: Found tail predicate: " << *Predicate << "\n");
+ LLVM_DEBUG(dbgs() << "ARM TP: Found tail predicate: " << *Predicate << "\n");
Predicates.insert(Predicate);
- // Insert a phi to count the number of elements processed by the loop.
- IRBuilder<> Builder(L->getHeader()->getFirstNonPHI());
- PHINode *Processed = Builder.CreatePHI(Ty, 2);
- Processed->addIncoming(NumElements, L->getLoopPreheader());
-
- // Insert the intrinsic to represent the effect of tail predication.
- Builder.SetInsertPoint(cast<Instruction>(Predicate));
- ConstantInt *Factor =
- ConstantInt::get(cast<IntegerType>(Ty), VecTy->getNumElements());
- Intrinsic::ID VCTPID;
- switch (VecTy->getNumElements()) {
- default:
- llvm_unreachable("unexpected number of lanes");
- case 2: VCTPID = Intrinsic::arm_vctp64; break;
- case 4: VCTPID = Intrinsic::arm_vctp32; break;
- case 8: VCTPID = Intrinsic::arm_vctp16; break;
- case 16: VCTPID = Intrinsic::arm_vctp8; break;
- }
- Function *VCTP = Intrinsic::getDeclaration(M, VCTPID);
- Value *TailPredicate = Builder.CreateCall(VCTP, Processed);
- Predicate->replaceAllUsesWith(TailPredicate);
- NewPredicates[Predicate] = cast<Instruction>(TailPredicate);
-
- // Add the incoming value to the new phi.
- // TODO: This add likely already exists in the loop.
- Value *Remaining = Builder.CreateSub(Processed, Factor);
- Processed->addIncoming(Remaining, L->getLoopLatch());
- LLVM_DEBUG(dbgs() << "TP: Insert processed elements phi: "
- << *Processed << "\n"
- << "TP: Inserted VCTP: " << *TailPredicate << "\n");
+ InsertVCTPIntrinsic(Predicate, NewPredicates, VecTy, NumElements);
}
// Now clean up.
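Editor's note: the refactored InsertVCTPIntrinsic above models tail predication with a "remaining elements" phi that starts at NumElements and drops by the vector width each iteration, while the VCTP intrinsic turns that counter into a lane mask. A tiny scalar model of what vctp32 computes per iteration, as a sketch of the semantics rather than any LLVM API:

    #include <array>
    #include <cstdio>

    // vctp32: lane i is active while at least i+1 elements remain.
    static std::array<bool, 4> vctp32(int Remaining) {
      std::array<bool, 4> Mask{};
      for (int Lane = 0; Lane < 4; ++Lane)
        Mask[Lane] = Lane < Remaining;
      return Mask;
    }

    int main() {
      // A loop over 10 elements with 4 lanes: the masks are 1111, 1111, 1100.
      for (int Remaining = 10; Remaining > 0; Remaining -= 4) {
        auto M = vctp32(Remaining);
        std::printf("%d%d%d%d\n", M[0], M[1], M[2], M[3]);
      }
      return 0;
    }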
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MVEVPTBlockPass.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/MVEVPTBlockPass.cpp
index bc0a80b177ed..a5df46c94f42 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/MVEVPTBlockPass.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/MVEVPTBlockPass.cpp
@@ -22,9 +22,9 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineInstrBundle.h"
#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/ReachingDefAnalysis.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCInstrDesc.h"
-#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/Debug.h"
#include <cassert>
#include <new>
@@ -37,16 +37,21 @@ namespace {
class MVEVPTBlock : public MachineFunctionPass {
public:
static char ID;
- const Thumb2InstrInfo *TII;
- const TargetRegisterInfo *TRI;
MVEVPTBlock() : MachineFunctionPass(ID) {}
bool runOnMachineFunction(MachineFunction &Fn) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<ReachingDefAnalysis>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
MachineFunctionProperties getRequiredProperties() const override {
return MachineFunctionProperties().set(
- MachineFunctionProperties::Property::NoVRegs);
+ MachineFunctionProperties::Property::NoVRegs).set(
+ MachineFunctionProperties::Property::TracksLiveness);
}
StringRef getPassName() const override {
@@ -55,6 +60,9 @@ namespace {
private:
bool InsertVPTBlocks(MachineBasicBlock &MBB);
+
+ const Thumb2InstrInfo *TII = nullptr;
+ ReachingDefAnalysis *RDA = nullptr;
};
char MVEVPTBlock::ID = 0;
@@ -63,112 +71,32 @@ namespace {
INITIALIZE_PASS(MVEVPTBlock, DEBUG_TYPE, "ARM MVE VPT block pass", false, false)
-enum VPTMaskValue {
- T = 8, // 0b1000
- TT = 4, // 0b0100
- TE = 12, // 0b1100
- TTT = 2, // 0b0010
- TTE = 6, // 0b0110
- TEE = 10, // 0b1010
- TET = 14, // 0b1110
- TTTT = 1, // 0b0001
- TTTE = 3, // 0b0011
- TTEE = 5, // 0b0101
- TTET = 7, // 0b0111
- TEEE = 9, // 0b1001
- TEET = 11, // 0b1011
- TETT = 13, // 0b1101
- TETE = 15 // 0b1111
-};
-
-static unsigned VCMPOpcodeToVPT(unsigned Opcode) {
- switch (Opcode) {
- case ARM::MVE_VCMPf32:
- return ARM::MVE_VPTv4f32;
- case ARM::MVE_VCMPf16:
- return ARM::MVE_VPTv8f16;
- case ARM::MVE_VCMPi8:
- return ARM::MVE_VPTv16i8;
- case ARM::MVE_VCMPi16:
- return ARM::MVE_VPTv8i16;
- case ARM::MVE_VCMPi32:
- return ARM::MVE_VPTv4i32;
- case ARM::MVE_VCMPu8:
- return ARM::MVE_VPTv16u8;
- case ARM::MVE_VCMPu16:
- return ARM::MVE_VPTv8u16;
- case ARM::MVE_VCMPu32:
- return ARM::MVE_VPTv4u32;
- case ARM::MVE_VCMPs8:
- return ARM::MVE_VPTv16s8;
- case ARM::MVE_VCMPs16:
- return ARM::MVE_VPTv8s16;
- case ARM::MVE_VCMPs32:
- return ARM::MVE_VPTv4s32;
-
- case ARM::MVE_VCMPf32r:
- return ARM::MVE_VPTv4f32r;
- case ARM::MVE_VCMPf16r:
- return ARM::MVE_VPTv8f16r;
- case ARM::MVE_VCMPi8r:
- return ARM::MVE_VPTv16i8r;
- case ARM::MVE_VCMPi16r:
- return ARM::MVE_VPTv8i16r;
- case ARM::MVE_VCMPi32r:
- return ARM::MVE_VPTv4i32r;
- case ARM::MVE_VCMPu8r:
- return ARM::MVE_VPTv16u8r;
- case ARM::MVE_VCMPu16r:
- return ARM::MVE_VPTv8u16r;
- case ARM::MVE_VCMPu32r:
- return ARM::MVE_VPTv4u32r;
- case ARM::MVE_VCMPs8r:
- return ARM::MVE_VPTv16s8r;
- case ARM::MVE_VCMPs16r:
- return ARM::MVE_VPTv8s16r;
- case ARM::MVE_VCMPs32r:
- return ARM::MVE_VPTv4s32r;
-
- default:
- return 0;
- }
-}
-
-static MachineInstr *findVCMPToFoldIntoVPST(MachineBasicBlock::iterator MI,
- const TargetRegisterInfo *TRI,
+static MachineInstr *findVCMPToFoldIntoVPST(MachineInstr *MI,
+ ReachingDefAnalysis *RDA,
unsigned &NewOpcode) {
- // Search backwards to the instruction that defines VPR. This may or not
- // be a VCMP, we check that after this loop. If we find another instruction
- // that reads cpsr, we return nullptr.
- MachineBasicBlock::iterator CmpMI = MI;
- while (CmpMI != MI->getParent()->begin()) {
- --CmpMI;
- if (CmpMI->modifiesRegister(ARM::VPR, TRI))
- break;
- if (CmpMI->readsRegister(ARM::VPR, TRI))
- break;
- }
-
- if (CmpMI == MI)
- return nullptr;
- NewOpcode = VCMPOpcodeToVPT(CmpMI->getOpcode());
- if (NewOpcode == 0)
+ // First, search backwards to the instruction that defines VPR
+ auto *Def = RDA->getReachingMIDef(MI, ARM::VPR);
+ if (!Def)
return nullptr;
- // Search forward from CmpMI to MI, checking if either register was def'd
- if (registerDefinedBetween(CmpMI->getOperand(1).getReg(), std::next(CmpMI),
- MI, TRI))
+ // Now check that Def is a VCMP
+ if (!(NewOpcode = VCMPOpcodeToVPT(Def->getOpcode())))
return nullptr;
- if (registerDefinedBetween(CmpMI->getOperand(2).getReg(), std::next(CmpMI),
- MI, TRI))
+
+ // Check that Def's operands are not defined between the VCMP and MI, i.e.
+ // check that they have the same reaching def.
+ if (!RDA->hasSameReachingDef(Def, MI, Def->getOperand(1).getReg()) ||
+ !RDA->hasSameReachingDef(Def, MI, Def->getOperand(2).getReg()))
return nullptr;
- return &*CmpMI;
+
+ return Def;
}
bool MVEVPTBlock::InsertVPTBlocks(MachineBasicBlock &Block) {
bool Modified = false;
MachineBasicBlock::instr_iterator MBIter = Block.instr_begin();
MachineBasicBlock::instr_iterator EndIter = Block.instr_end();
+ SmallSet<MachineInstr *, 4> RemovedVCMPs;
while (MBIter != EndIter) {
MachineInstr *MI = &*MBIter;
@@ -208,29 +136,13 @@ bool MVEVPTBlock::InsertVPTBlocks(MachineBasicBlock &Block) {
++MBIter;
};
- unsigned BlockMask = 0;
- switch (VPTInstCnt) {
- case 1:
- BlockMask = VPTMaskValue::T;
- break;
- case 2:
- BlockMask = VPTMaskValue::TT;
- break;
- case 3:
- BlockMask = VPTMaskValue::TTT;
- break;
- case 4:
- BlockMask = VPTMaskValue::TTTT;
- break;
- default:
- llvm_unreachable("Unexpected number of instruction in a VPT block");
- };
+ unsigned BlockMask = getARMVPTBlockMask(VPTInstCnt);
// Search back for a VCMP that can be folded to create a VPT, or else create
// a VPST directly
MachineInstrBuilder MIBuilder;
unsigned NewOpcode;
- MachineInstr *VCMP = findVCMPToFoldIntoVPST(MI, TRI, NewOpcode);
+ MachineInstr *VCMP = findVCMPToFoldIntoVPST(MI, RDA, NewOpcode);
if (VCMP) {
LLVM_DEBUG(dbgs() << " folding VCMP into VPST: "; VCMP->dump());
MIBuilder = BuildMI(Block, MI, dl, TII->get(NewOpcode));
@@ -238,7 +150,11 @@ bool MVEVPTBlock::InsertVPTBlocks(MachineBasicBlock &Block) {
MIBuilder.add(VCMP->getOperand(1));
MIBuilder.add(VCMP->getOperand(2));
MIBuilder.add(VCMP->getOperand(3));
- VCMP->eraseFromParent();
+ // We delay removing the actual VCMP instruction by saving it to a list
+ // and deleting all instructions in this list in one go after we have
+ // created the VPT blocks. We do this in order not to invalidate the
+ // ReachingDefAnalysis that is queried by 'findVCMPToFoldIntoVPST'.
+ RemovedVCMPs.insert(VCMP);
} else {
MIBuilder = BuildMI(Block, MI, dl, TII->get(ARM::MVE_VPST));
MIBuilder.addImm(BlockMask);
@@ -249,10 +165,17 @@ bool MVEVPTBlock::InsertVPTBlocks(MachineBasicBlock &Block) {
Modified = true;
}
+
+ for (auto *I : RemovedVCMPs)
+ I->eraseFromParent();
+
return Modified;
}
bool MVEVPTBlock::runOnMachineFunction(MachineFunction &Fn) {
+ if (skipFunction(Fn.getFunction()))
+ return false;
+
const ARMSubtarget &STI =
static_cast<const ARMSubtarget &>(Fn.getSubtarget());
@@ -260,7 +183,7 @@ bool MVEVPTBlock::runOnMachineFunction(MachineFunction &Fn) {
return false;
TII = static_cast<const Thumb2InstrInfo *>(STI.getInstrInfo());
- TRI = STI.getRegisterInfo();
+ RDA = &getAnalysis<ReachingDefAnalysis>();
LLVM_DEBUG(dbgs() << "********** ARM MVE VPT BLOCKS **********\n"
<< "********** Function: " << Fn.getName() << '\n');
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp
index 86cb907abfa3..a7f7d75e356e 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp
@@ -27,7 +27,7 @@ Target &llvm::getTheThumbBETarget() {
return TheThumbBETarget;
}
-extern "C" void LLVMInitializeARMTargetInfo() {
+extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeARMTargetInfo() {
RegisterTarget<Triple::arm, /*HasJIT=*/true> X(getTheARMLETarget(), "arm",
"ARM", "ARM");
RegisterTarget<Triple::armeb, /*HasJIT=*/true> Y(getTheARMBETarget(), "armeb",
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp
index fccaa4c9cc8a..b08b71a4952d 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp
@@ -37,8 +37,8 @@ unsigned Thumb1InstrInfo::getUnindexedOpcode(unsigned Opc) const {
void Thumb1InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
- const DebugLoc &DL, unsigned DestReg,
- unsigned SrcReg, bool KillSrc) const {
+ const DebugLoc &DL, MCRegister DestReg,
+ MCRegister SrcReg, bool KillSrc) const {
// Need to check the arch.
MachineFunction &MF = *MBB.getParent();
const ARMSubtarget &st = MF.getSubtarget<ARMSubtarget>();
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb1InstrInfo.h b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb1InstrInfo.h
index bc433e7a7a93..530289fe8c5d 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb1InstrInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb1InstrInfo.h
@@ -38,7 +38,7 @@ public:
const ThumbRegisterInfo &getRegisterInfo() const override { return RI; }
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
- const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
+ const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg,
bool KillSrc) const override;
void storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
index af1f0aeb27ba..e06bb9546c03 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
@@ -120,8 +120,8 @@ Thumb2InstrInfo::isLegalToSplitMBBAt(MachineBasicBlock &MBB,
void Thumb2InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
- const DebugLoc &DL, unsigned DestReg,
- unsigned SrcReg, bool KillSrc) const {
+ const DebugLoc &DL, MCRegister DestReg,
+ MCRegister SrcReg, bool KillSrc) const {
// Handle SPR, DPR, and QPR copies.
if (!ARM::GPRRegClass.contains(DestReg, SrcReg))
return ARMBaseInstrInfo::copyPhysReg(MBB, I, DL, DestReg, SrcReg, KillSrc);
@@ -303,50 +303,45 @@ void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB,
continue;
}
- bool HasCCOut = true;
- if (BaseReg == ARM::SP) {
- // sub sp, sp, #imm7
- if (DestReg == ARM::SP && (ThisVal < ((1 << 7)-1) * 4)) {
- assert((ThisVal & 3) == 0 && "Stack update is not multiple of 4?");
- Opc = isSub ? ARM::tSUBspi : ARM::tADDspi;
- BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg)
- .addReg(BaseReg)
- .addImm(ThisVal / 4)
- .setMIFlags(MIFlags)
- .add(predOps(ARMCC::AL));
- NumBytes = 0;
- continue;
- }
+ assert((DestReg != ARM::SP || BaseReg == ARM::SP) &&
+ "Writing to SP, from other register.");
- // sub rd, sp, so_imm
- Opc = isSub ? ARM::t2SUBri : ARM::t2ADDri;
- if (ARM_AM::getT2SOImmVal(NumBytes) != -1) {
- NumBytes = 0;
- } else {
- // FIXME: Move this to ARMAddressingModes.h?
- unsigned RotAmt = countLeadingZeros(ThisVal);
- ThisVal = ThisVal & ARM_AM::rotr32(0xff000000U, RotAmt);
- NumBytes &= ~ThisVal;
- assert(ARM_AM::getT2SOImmVal(ThisVal) != -1 &&
- "Bit extraction didn't work?");
- }
+ // Try to use T1, as it is smaller
+ if ((DestReg == ARM::SP) && (ThisVal < ((1 << 7) - 1) * 4)) {
+ assert((ThisVal & 3) == 0 && "Stack update is not multiple of 4?");
+ Opc = isSub ? ARM::tSUBspi : ARM::tADDspi;
+ BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg)
+ .addReg(BaseReg)
+ .addImm(ThisVal / 4)
+ .setMIFlags(MIFlags)
+ .add(predOps(ARMCC::AL));
+ break;
+ }
+ bool HasCCOut = true;
+ int ImmIsT2SO = ARM_AM::getT2SOImmVal(ThisVal);
+ bool ToSP = DestReg == ARM::SP;
+ unsigned t2SUB = ToSP ? ARM::t2SUBspImm : ARM::t2SUBri;
+ unsigned t2ADD = ToSP ? ARM::t2ADDspImm : ARM::t2ADDri;
+ unsigned t2SUBi12 = ToSP ? ARM::t2SUBspImm12 : ARM::t2SUBri12;
+ unsigned t2ADDi12 = ToSP ? ARM::t2ADDspImm12 : ARM::t2ADDri12;
+ Opc = isSub ? t2SUB : t2ADD;
+ // Prefer T2: sub rd, rn, so_imm | sub sp, sp, so_imm
+ if (ImmIsT2SO != -1) {
+ NumBytes = 0;
+ } else if (ThisVal < 4096) {
+ // Prefer T3 if we can make it in a single go: subw rd, rn, imm12 | subw sp,
+ // sp, imm12
+ Opc = isSub ? t2SUBi12 : t2ADDi12;
+ HasCCOut = false;
+ NumBytes = 0;
} else {
- assert(DestReg != ARM::SP && BaseReg != ARM::SP);
- Opc = isSub ? ARM::t2SUBri : ARM::t2ADDri;
- if (ARM_AM::getT2SOImmVal(NumBytes) != -1) {
- NumBytes = 0;
- } else if (ThisVal < 4096) {
- Opc = isSub ? ARM::t2SUBri12 : ARM::t2ADDri12;
- HasCCOut = false;
- NumBytes = 0;
- } else {
- // FIXME: Move this to ARMAddressingModes.h?
- unsigned RotAmt = countLeadingZeros(ThisVal);
- ThisVal = ThisVal & ARM_AM::rotr32(0xff000000U, RotAmt);
- NumBytes &= ~ThisVal;
- assert(ARM_AM::getT2SOImmVal(ThisVal) != -1 &&
- "Bit extraction didn't work?");
- }
+ // Use one T2 instruction to reduce NumBytes
+ // FIXME: Move this to ARMAddressingModes.h?
+ unsigned RotAmt = countLeadingZeros(ThisVal);
+ ThisVal = ThisVal & ARM_AM::rotr32(0xff000000U, RotAmt);
+ NumBytes &= ~ThisVal;
+ assert(ARM_AM::getT2SOImmVal(ThisVal) != -1 &&
+ "Bit extraction didn't work?");
}
// Build the new ADD / SUB.
@@ -375,6 +370,8 @@ negativeOffsetOpcode(unsigned opcode)
case ARM::t2STRBi12: return ARM::t2STRBi8;
case ARM::t2STRHi12: return ARM::t2STRHi8;
case ARM::t2PLDi12: return ARM::t2PLDi8;
+ case ARM::t2PLDWi12: return ARM::t2PLDWi8;
+ case ARM::t2PLIi12: return ARM::t2PLIi8;
case ARM::t2LDRi8:
case ARM::t2LDRHi8:
@@ -385,13 +382,13 @@ negativeOffsetOpcode(unsigned opcode)
case ARM::t2STRBi8:
case ARM::t2STRHi8:
case ARM::t2PLDi8:
+ case ARM::t2PLDWi8:
+ case ARM::t2PLIi8:
return opcode;
default:
- break;
+ llvm_unreachable("unknown thumb2 opcode.");
}
-
- return 0;
}
static unsigned
@@ -407,6 +404,8 @@ positiveOffsetOpcode(unsigned opcode)
case ARM::t2STRBi8: return ARM::t2STRBi12;
case ARM::t2STRHi8: return ARM::t2STRHi12;
case ARM::t2PLDi8: return ARM::t2PLDi12;
+ case ARM::t2PLDWi8: return ARM::t2PLDWi12;
+ case ARM::t2PLIi8: return ARM::t2PLIi12;
case ARM::t2LDRi12:
case ARM::t2LDRHi12:
@@ -417,13 +416,13 @@ positiveOffsetOpcode(unsigned opcode)
case ARM::t2STRBi12:
case ARM::t2STRHi12:
case ARM::t2PLDi12:
+ case ARM::t2PLDWi12:
+ case ARM::t2PLIi12:
return opcode;
default:
- break;
+ llvm_unreachable("unknown thumb2 opcode.");
}
-
- return 0;
}
static unsigned
@@ -439,6 +438,8 @@ immediateOffsetOpcode(unsigned opcode)
case ARM::t2STRBs: return ARM::t2STRBi12;
case ARM::t2STRHs: return ARM::t2STRHi12;
case ARM::t2PLDs: return ARM::t2PLDi12;
+ case ARM::t2PLDWs: return ARM::t2PLDWi12;
+ case ARM::t2PLIs: return ARM::t2PLIi12;
case ARM::t2LDRi12:
case ARM::t2LDRHi12:
@@ -449,6 +450,8 @@ immediateOffsetOpcode(unsigned opcode)
case ARM::t2STRBi12:
case ARM::t2STRHi12:
case ARM::t2PLDi12:
+ case ARM::t2PLDWi12:
+ case ARM::t2PLIi12:
case ARM::t2LDRi8:
case ARM::t2LDRHi8:
case ARM::t2LDRBi8:
@@ -458,13 +461,13 @@ immediateOffsetOpcode(unsigned opcode)
case ARM::t2STRBi8:
case ARM::t2STRHi8:
case ARM::t2PLDi8:
+ case ARM::t2PLDWi8:
+ case ARM::t2PLIi8:
return opcode;
default:
- break;
+ llvm_unreachable("unknown thumb2 opcode.");
}
-
- return 0;
}
bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
@@ -484,7 +487,8 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
if (Opcode == ARM::INLINEASM || Opcode == ARM::INLINEASM_BR)
AddrMode = ARMII::AddrModeT2_i12; // FIXME. mode for thumb2?
- if (Opcode == ARM::t2ADDri || Opcode == ARM::t2ADDri12) {
+ const bool IsSP = Opcode == ARM::t2ADDspImm12 || Opcode == ARM::t2ADDspImm;
+ if (IsSP || Opcode == ARM::t2ADDri || Opcode == ARM::t2ADDri12) {
Offset += MI.getOperand(FrameRegIdx+1).getImm();
unsigned PredReg;
@@ -501,14 +505,14 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
return true;
}
- bool HasCCOut = Opcode != ARM::t2ADDri12;
+ bool HasCCOut = (Opcode != ARM::t2ADDspImm12 && Opcode != ARM::t2ADDri12);
if (Offset < 0) {
Offset = -Offset;
isSub = true;
- MI.setDesc(TII.get(ARM::t2SUBri));
+ MI.setDesc(IsSP ? TII.get(ARM::t2SUBspImm) : TII.get(ARM::t2SUBri));
} else {
- MI.setDesc(TII.get(ARM::t2ADDri));
+ MI.setDesc(IsSP ? TII.get(ARM::t2ADDspImm) : TII.get(ARM::t2ADDri));
}
// Common case: small offset, fits into instruction.
@@ -524,7 +528,8 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
// Another common case: imm12.
if (Offset < 4096 &&
(!HasCCOut || MI.getOperand(MI.getNumOperands()-1).getReg() == 0)) {
- unsigned NewOpc = isSub ? ARM::t2SUBri12 : ARM::t2ADDri12;
+ unsigned NewOpc = isSub ? IsSP ? ARM::t2SUBspImm12 : ARM::t2SUBri12
+ : IsSP ? ARM::t2ADDspImm12 : ARM::t2ADDri12;
MI.setDesc(TII.get(NewOpc));
MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
MI.getOperand(FrameRegIdx+1).ChangeToImmediate(Offset);
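Editor's note: when no single encoding fits, emitT2RegPlusImmediate above peels off the largest chunk that is still a modified immediate by masking ThisVal with an 8-bit window anchored at its most significant set bit (the countLeadingZeros/rotr32 trick), then loops on the remainder. A standalone sketch of that extraction step in plain C++, not the ARM_AM helpers:

    #include <cstdint>

    // Rotate a 32-bit value right by Amt (0 < Amt < 32).
    static uint32_t rotr32(uint32_t V, unsigned Amt) {
      return (V >> Amt) | (V << (32 - Amt));
    }

    // Keep only the 8 bits starting at the most significant set bit of V.
    // The result is always a Thumb-2 modified immediate, so one t2ADD/t2SUB
    // (or the SP form) can retire it; the caller clears those bits and repeats
    // until nothing remains.
    static uint32_t topByteChunk(uint32_t V) {
      if (V == 0)
        return 0;
      unsigned RotAmt = __builtin_clz(V); // leading zeros (GCC/Clang builtin)
      uint32_t Mask = RotAmt ? rotr32(0xFF000000u, RotAmt) : 0xFF000000u;
      return V & Mask;
    }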
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2InstrInfo.h b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2InstrInfo.h
index a6712d5a0e72..7d8dff14e1e7 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2InstrInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2InstrInfo.h
@@ -39,7 +39,7 @@ public:
MachineBasicBlock::iterator MBBI) const override;
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
- const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
+ const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg,
bool KillSrc) const override;
void storeRegToStackSlot(MachineBasicBlock &MBB,
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/Utils/ARMBaseInfo.h b/contrib/llvm-project/llvm/lib/Target/ARM/Utils/ARMBaseInfo.h
index aa3aca359cb8..27605422983d 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/Utils/ARMBaseInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/Utils/ARMBaseInfo.h
@@ -64,6 +64,25 @@ inline static CondCodes getOppositeCondition(CondCodes CC) {
case LE: return GT;
}
}
+
+/// getSwappedCondition - assume the flags are set by MI(a,b), return
+/// the condition code if we modify the instructions such that flags are
+/// set by MI(b,a).
+inline static ARMCC::CondCodes getSwappedCondition(ARMCC::CondCodes CC) {
+ switch (CC) {
+ default: return ARMCC::AL;
+ case ARMCC::EQ: return ARMCC::EQ;
+ case ARMCC::NE: return ARMCC::NE;
+ case ARMCC::HS: return ARMCC::LS;
+ case ARMCC::LO: return ARMCC::HI;
+ case ARMCC::HI: return ARMCC::LO;
+ case ARMCC::LS: return ARMCC::HS;
+ case ARMCC::GE: return ARMCC::LE;
+ case ARMCC::LT: return ARMCC::GT;
+ case ARMCC::GT: return ARMCC::LT;
+ case ARMCC::LE: return ARMCC::GE;
+ }
+}
} // end namespace ARMCC
namespace ARMVCC {
@@ -72,6 +91,40 @@ namespace ARMVCC {
Then,
Else
};
+
+ enum VPTMaskValue {
+ T = 8, // 0b1000
+ TT = 4, // 0b0100
+ TE = 12, // 0b1100
+ TTT = 2, // 0b0010
+ TTE = 6, // 0b0110
+ TEE = 10, // 0b1010
+ TET = 14, // 0b1110
+ TTTT = 1, // 0b0001
+ TTTE = 3, // 0b0011
+ TTEE = 5, // 0b0101
+ TTET = 7, // 0b0111
+ TEEE = 9, // 0b1001
+ TEET = 11, // 0b1011
+ TETT = 13, // 0b1101
+ TETE = 15 // 0b1111
+ };
+}
+
+inline static unsigned getARMVPTBlockMask(unsigned NumInsts) {
+ switch (NumInsts) {
+ case 1:
+ return ARMVCC::T;
+ case 2:
+ return ARMVCC::TT;
+ case 3:
+ return ARMVCC::TTT;
+ case 4:
+ return ARMVCC::TTTT;
+ default:
+ break;
+ };
+ llvm_unreachable("Unexpected number of instruction in a VPT block");
}
inline static const char *ARMVPTPredToString(ARMVCC::VPTCodes CC) {