author    Dimitry Andric <dim@FreeBSD.org>  2019-10-23 17:51:42 +0000
committer Dimitry Andric <dim@FreeBSD.org>  2019-10-23 17:51:42 +0000
commit    1d5ae1026e831016fc29fd927877c86af904481f
tree      2cdfd12620fcfa5d9e4a0389f85368e8e36f63f9 /lib/Target/ARM
parent    e6d1592492a3a379186bfb02bd0f4eda0669c0d5
Diffstat (limited to 'lib/Target/ARM')
 lib/Target/ARM/A15SDOptimizer.cpp | 54
 lib/Target/ARM/ARM.h | 2
 lib/Target/ARM/ARM.td | 42
 lib/Target/ARM/ARMAsmPrinter.cpp | 66
 lib/Target/ARM/ARMBaseInstrInfo.cpp | 216
 lib/Target/ARM/ARMBaseInstrInfo.h | 21
 lib/Target/ARM/ARMBaseRegisterInfo.cpp | 49
 lib/Target/ARM/ARMBaseRegisterInfo.h | 5
 lib/Target/ARM/ARMBasicBlockInfo.cpp | 16
 lib/Target/ARM/ARMBasicBlockInfo.h | 31
 lib/Target/ARM/ARMCallLowering.cpp | 54
 lib/Target/ARM/ARMCallLowering.h | 5
 lib/Target/ARM/ARMCallingConv.cpp | 2
 lib/Target/ARM/ARMCodeGenPrepare.cpp | 88
 lib/Target/ARM/ARMConstantIslandPass.cpp | 289
 lib/Target/ARM/ARMConstantPoolValue.cpp | 1
 lib/Target/ARM/ARMExpandPseudoInsts.cpp | 96
 lib/Target/ARM/ARMFastISel.cpp | 88
 lib/Target/ARM/ARMFrameLowering.cpp | 65
 lib/Target/ARM/ARMFrameLowering.h | 5
 lib/Target/ARM/ARMISelDAGToDAG.cpp | 224
 lib/Target/ARM/ARMISelLowering.cpp | 2073
 lib/Target/ARM/ARMISelLowering.h | 45
 lib/Target/ARM/ARMInstrFormats.td | 23
 lib/Target/ARM/ARMInstrInfo.cpp | 2
 lib/Target/ARM/ARMInstrInfo.td | 127
 lib/Target/ARM/ARMInstrMVE.td | 1430
 lib/Target/ARM/ARMInstrNEON.td | 191
 lib/Target/ARM/ARMInstrThumb.td | 16
 lib/Target/ARM/ARMInstrThumb2.td | 98
 lib/Target/ARM/ARMInstrVFP.td | 96
 lib/Target/ARM/ARMInstructionSelector.cpp | 41
 lib/Target/ARM/ARMLegalizerInfo.cpp | 2
 lib/Target/ARM/ARMLoadStoreOptimizer.cpp | 32
 lib/Target/ARM/ARMLowOverheadLoops.cpp | 364
 lib/Target/ARM/ARMMCInstLower.cpp | 4
 lib/Target/ARM/ARMMachineFunctionInfo.h | 8
 lib/Target/ARM/ARMParallelDSP.cpp | 675
 lib/Target/ARM/ARMPredicates.td | 2
 lib/Target/ARM/ARMRegisterInfo.td | 18
 lib/Target/ARM/ARMScheduleA9.td | 4
 lib/Target/ARM/ARMScheduleM4.td | 24
 lib/Target/ARM/ARMSubtarget.cpp | 14
 lib/Target/ARM/ARMSubtarget.h | 30
 lib/Target/ARM/ARMTargetMachine.cpp | 13
 lib/Target/ARM/ARMTargetTransformInfo.cpp | 370
 lib/Target/ARM/ARMTargetTransformInfo.h | 24
 lib/Target/ARM/AsmParser/ARMAsmParser.cpp | 251
 lib/Target/ARM/Disassembler/ARMDisassembler.cpp | 31
 lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h | 20
 lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp | 14
 lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h | 3
 lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp | 6
 lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp | 12
 lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h | 5
 lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp | 21
 lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp | 6
 lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp | 4
 lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp | 2
 lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp | 10
 lib/Target/ARM/MLxExpansionPass.cpp | 42
 lib/Target/ARM/MVETailPredication.cpp | 519
 lib/Target/ARM/MVEVPTBlockPass.cpp | 278
 lib/Target/ARM/Thumb1FrameLowering.cpp | 8
 lib/Target/ARM/Thumb1InstrInfo.cpp | 17
 lib/Target/ARM/Thumb2ITBlockPass.cpp | 134
 lib/Target/ARM/Thumb2InstrInfo.cpp | 38
 lib/Target/ARM/Thumb2SizeReduction.cpp | 28
 lib/Target/ARM/ThumbRegisterInfo.cpp | 11
 69 files changed, 6194 insertions(+), 2411 deletions(-)
diff --git a/lib/Target/ARM/A15SDOptimizer.cpp b/lib/Target/ARM/A15SDOptimizer.cpp
index fb238bfc9cbc..30b9c8071ba2 100644
--- a/lib/Target/ARM/A15SDOptimizer.cpp
+++ b/lib/Target/ARM/A15SDOptimizer.cpp
@@ -133,9 +133,9 @@ bool A15SDOptimizer::usesRegClass(MachineOperand &MO,
const TargetRegisterClass *TRC) {
if (!MO.isReg())
return false;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
- if (TargetRegisterInfo::isVirtualRegister(Reg))
+ if (Register::isVirtualRegister(Reg))
return MRI->getRegClass(Reg)->hasSuperClassEq(TRC);
else
return TRC->contains(Reg);
@@ -151,7 +151,7 @@ unsigned A15SDOptimizer::getDPRLaneFromSPR(unsigned SReg) {
// Get the subreg type that is most likely to be coalesced
// for an SPR register that will be used in VDUP32d pseudo.
unsigned A15SDOptimizer::getPrefSPRLane(unsigned SReg) {
- if (!TRI->isVirtualRegister(SReg))
+ if (!Register::isVirtualRegister(SReg))
return getDPRLaneFromSPR(SReg);
MachineInstr *MI = MRI->getVRegDef(SReg);
@@ -166,7 +166,7 @@ unsigned A15SDOptimizer::getPrefSPRLane(unsigned SReg) {
SReg = MI->getOperand(1).getReg();
}
- if (TargetRegisterInfo::isVirtualRegister(SReg)) {
+ if (Register::isVirtualRegister(SReg)) {
if (MO->getSubReg() == ARM::ssub_1) return ARM::ssub_1;
return ARM::ssub_0;
}
@@ -191,8 +191,8 @@ void A15SDOptimizer::eraseInstrWithNoUses(MachineInstr *MI) {
for (MachineOperand &MO : MI->operands()) {
if ((!MO.isReg()) || (!MO.isUse()))
continue;
- unsigned Reg = MO.getReg();
- if (!TRI->isVirtualRegister(Reg))
+ Register Reg = MO.getReg();
+ if (!Register::isVirtualRegister(Reg))
continue;
MachineOperand *Op = MI->findRegisterDefOperand(Reg);
@@ -213,8 +213,8 @@ void A15SDOptimizer::eraseInstrWithNoUses(MachineInstr *MI) {
for (MachineOperand &MODef : Def->operands()) {
if ((!MODef.isReg()) || (!MODef.isDef()))
continue;
- unsigned DefReg = MODef.getReg();
- if (!TRI->isVirtualRegister(DefReg)) {
+ Register DefReg = MODef.getReg();
+ if (!Register::isVirtualRegister(DefReg)) {
IsDead = false;
break;
}
@@ -245,10 +245,10 @@ unsigned A15SDOptimizer::optimizeSDPattern(MachineInstr *MI) {
}
if (MI->isInsertSubreg()) {
- unsigned DPRReg = MI->getOperand(1).getReg();
- unsigned SPRReg = MI->getOperand(2).getReg();
+ Register DPRReg = MI->getOperand(1).getReg();
+ Register SPRReg = MI->getOperand(2).getReg();
- if (TRI->isVirtualRegister(DPRReg) && TRI->isVirtualRegister(SPRReg)) {
+ if (Register::isVirtualRegister(DPRReg) && Register::isVirtualRegister(SPRReg)) {
MachineInstr *DPRMI = MRI->getVRegDef(MI->getOperand(1).getReg());
MachineInstr *SPRMI = MRI->getVRegDef(MI->getOperand(2).getReg());
@@ -267,7 +267,7 @@ unsigned A15SDOptimizer::optimizeSDPattern(MachineInstr *MI) {
// Find the thing we're subreg copying out of - is it of the same
// regclass as DPRMI? (i.e. a DPR or QPR).
- unsigned FullReg = SPRMI->getOperand(1).getReg();
+ Register FullReg = SPRMI->getOperand(1).getReg();
const TargetRegisterClass *TRC =
MRI->getRegClass(MI->getOperand(1).getReg());
if (TRC->hasSuperClassEq(MRI->getRegClass(FullReg))) {
@@ -296,9 +296,9 @@ unsigned A15SDOptimizer::optimizeSDPattern(MachineInstr *MI) {
if (!MI->getOperand(I).isReg())
continue;
++NumTotal;
- unsigned OpReg = MI->getOperand(I).getReg();
+ Register OpReg = MI->getOperand(I).getReg();
- if (!TRI->isVirtualRegister(OpReg))
+ if (!Register::isVirtualRegister(OpReg))
break;
MachineInstr *Def = MRI->getVRegDef(OpReg);
@@ -342,7 +342,7 @@ bool A15SDOptimizer::hasPartialWrite(MachineInstr *MI) {
MachineInstr *A15SDOptimizer::elideCopies(MachineInstr *MI) {
if (!MI->isFullCopy())
return MI;
- if (!TRI->isVirtualRegister(MI->getOperand(1).getReg()))
+ if (!Register::isVirtualRegister(MI->getOperand(1).getReg()))
return nullptr;
MachineInstr *Def = MRI->getVRegDef(MI->getOperand(1).getReg());
if (!Def)
@@ -369,8 +369,8 @@ void A15SDOptimizer::elideCopiesAndPHIs(MachineInstr *MI,
Reached.insert(MI);
if (MI->isPHI()) {
for (unsigned I = 1, E = MI->getNumOperands(); I != E; I += 2) {
- unsigned Reg = MI->getOperand(I).getReg();
- if (!TRI->isVirtualRegister(Reg)) {
+ Register Reg = MI->getOperand(I).getReg();
+ if (!Register::isVirtualRegister(Reg)) {
continue;
}
MachineInstr *NewMI = MRI->getVRegDef(Reg);
@@ -379,7 +379,7 @@ void A15SDOptimizer::elideCopiesAndPHIs(MachineInstr *MI,
Front.push_back(NewMI);
}
} else if (MI->isFullCopy()) {
- if (!TRI->isVirtualRegister(MI->getOperand(1).getReg()))
+ if (!Register::isVirtualRegister(MI->getOperand(1).getReg()))
continue;
MachineInstr *NewMI = MRI->getVRegDef(MI->getOperand(1).getReg());
if (!NewMI)
@@ -418,8 +418,8 @@ unsigned A15SDOptimizer::createDupLane(MachineBasicBlock &MBB,
MachineBasicBlock::iterator InsertBefore,
const DebugLoc &DL, unsigned Reg,
unsigned Lane, bool QPR) {
- unsigned Out = MRI->createVirtualRegister(QPR ? &ARM::QPRRegClass :
- &ARM::DPRRegClass);
+ Register Out =
+ MRI->createVirtualRegister(QPR ? &ARM::QPRRegClass : &ARM::DPRRegClass);
BuildMI(MBB, InsertBefore, DL,
TII->get(QPR ? ARM::VDUPLN32q : ARM::VDUPLN32d), Out)
.addReg(Reg)
@@ -434,7 +434,7 @@ unsigned A15SDOptimizer::createExtractSubreg(
MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
const DebugLoc &DL, unsigned DReg, unsigned Lane,
const TargetRegisterClass *TRC) {
- unsigned Out = MRI->createVirtualRegister(TRC);
+ Register Out = MRI->createVirtualRegister(TRC);
BuildMI(MBB,
InsertBefore,
DL,
@@ -448,7 +448,7 @@ unsigned A15SDOptimizer::createExtractSubreg(
unsigned A15SDOptimizer::createRegSequence(
MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
const DebugLoc &DL, unsigned Reg1, unsigned Reg2) {
- unsigned Out = MRI->createVirtualRegister(&ARM::QPRRegClass);
+ Register Out = MRI->createVirtualRegister(&ARM::QPRRegClass);
BuildMI(MBB,
InsertBefore,
DL,
@@ -466,7 +466,7 @@ unsigned A15SDOptimizer::createVExt(MachineBasicBlock &MBB,
MachineBasicBlock::iterator InsertBefore,
const DebugLoc &DL, unsigned Ssub0,
unsigned Ssub1) {
- unsigned Out = MRI->createVirtualRegister(&ARM::DPRRegClass);
+ Register Out = MRI->createVirtualRegister(&ARM::DPRRegClass);
BuildMI(MBB, InsertBefore, DL, TII->get(ARM::VEXTd32), Out)
.addReg(Ssub0)
.addReg(Ssub1)
@@ -478,7 +478,7 @@ unsigned A15SDOptimizer::createVExt(MachineBasicBlock &MBB,
unsigned A15SDOptimizer::createInsertSubreg(
MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
const DebugLoc &DL, unsigned DReg, unsigned Lane, unsigned ToInsert) {
- unsigned Out = MRI->createVirtualRegister(&ARM::DPR_VFP2RegClass);
+ Register Out = MRI->createVirtualRegister(&ARM::DPR_VFP2RegClass);
BuildMI(MBB,
InsertBefore,
DL,
@@ -494,7 +494,7 @@ unsigned
A15SDOptimizer::createImplicitDef(MachineBasicBlock &MBB,
MachineBasicBlock::iterator InsertBefore,
const DebugLoc &DL) {
- unsigned Out = MRI->createVirtualRegister(&ARM::DPRRegClass);
+ Register Out = MRI->createVirtualRegister(&ARM::DPRRegClass);
BuildMI(MBB,
InsertBefore,
DL,
@@ -602,7 +602,7 @@ bool A15SDOptimizer::runOnInstruction(MachineInstr *MI) {
// we can end up with multiple defs of this DPR.
SmallVector<MachineInstr *, 8> DefSrcs;
- if (!TRI->isVirtualRegister(*I))
+ if (!Register::isVirtualRegister(*I))
continue;
MachineInstr *Def = MRI->getVRegDef(*I);
if (!Def)
@@ -622,7 +622,7 @@ bool A15SDOptimizer::runOnInstruction(MachineInstr *MI) {
// Collect all the uses of this MI's DPR def for updating later.
SmallVector<MachineOperand*, 8> Uses;
- unsigned DPRDefReg = MI->getOperand(0).getReg();
+ Register DPRDefReg = MI->getOperand(0).getReg();
for (MachineRegisterInfo::use_iterator I = MRI->use_begin(DPRDefReg),
E = MRI->use_end(); I != E; ++I)
Uses.push_back(&*I);
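
The dominant change in this file is mechanical: "unsigned Reg" becomes the
new Register wrapper type, and the virtual/physical predicates move from
TargetRegisterInfo to static members of Register. A minimal sketch of the
new idiom, mirroring usesRegClass() above (the helper name is hypothetical,
not part of the patch):

    // Register converts implicitly to/from unsigned, so call sites keep
    // compiling; only the spelling of the predicates changes.
    static bool usesVirtualDPR(const MachineOperand &MO,
                               const MachineRegisterInfo &MRI) {
      if (!MO.isReg())
        return false;
      Register Reg = MO.getReg();           // was: unsigned Reg
      if (Register::isVirtualRegister(Reg)) // was: TargetRegisterInfo::...
        return MRI.getRegClass(Reg)->hasSuperClassEq(&ARM::DPRRegClass);
      return ARM::DPRRegClass.contains(Reg); // physical register case
    }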
diff --git a/lib/Target/ARM/ARM.h b/lib/Target/ARM/ARM.h
index bf8ed6562fe7..2e6f756d522c 100644
--- a/lib/Target/ARM/ARM.h
+++ b/lib/Target/ARM/ARM.h
@@ -35,6 +35,7 @@ class MachineInstr;
class MCInst;
class PassRegistry;
+Pass *createMVETailPredicationPass();
FunctionPass *createARMLowOverheadLoopsPass();
Pass *createARMParallelDSPPass();
FunctionPass *createARMISelDag(ARMBaseTargetMachine &TM,
@@ -67,6 +68,7 @@ void initializeThumb2SizeReducePass(PassRegistry &);
void initializeThumb2ITBlockPass(PassRegistry &);
void initializeMVEVPTBlockPass(PassRegistry &);
void initializeARMLowOverheadLoopsPass(PassRegistry &);
+void initializeMVETailPredicationPass(PassRegistry &);
} // end namespace llvm
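
ARM.h only declares the new MVE tail-predication pass; a sketch of the
conventional wiring, assuming ARMTargetMachine.cpp follows the same pattern
as the existing ARMLowOverheadLoops pass (the exact insertion points are an
assumption, since they are not shown in this diff):

    // In LLVMInitializeARMTarget(): register the pass with the registry.
    PassRegistry &Registry = *PassRegistry::getPassRegistry();
    initializeARMLowOverheadLoopsPass(Registry);
    initializeMVETailPredicationPass(Registry); // new in this import

    // In ARMPassConfig::addIRPasses(): schedule the IR-level pass.
    addPass(createMVETailPredicationPass());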
diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td
index b687db12eaf5..fed4cb2b9316 100644
--- a/lib/Target/ARM/ARM.td
+++ b/lib/Target/ARM/ARM.td
@@ -57,12 +57,15 @@ def FeatureD32 : SubtargetFeature<"d32", "HasD32", "true",
"Extend FP to 32 double registers">;
multiclass VFPver<string name, string query, string description,
- list<SubtargetFeature> prev = [],
- list<SubtargetFeature> otherimplies = []> {
+ list<SubtargetFeature> prev,
+ list<SubtargetFeature> otherimplies,
+ list<SubtargetFeature> vfp2prev = []> {
def _D16_SP: SubtargetFeature<
name#"d16sp", query#"D16SP", "true",
description#" with only 16 d-registers and no double precision",
- !foreach(v, prev, !cast<SubtargetFeature>(v # "_D16_SP")) # otherimplies>;
+ !foreach(v, prev, !cast<SubtargetFeature>(v # "_D16_SP")) #
+ !foreach(v, vfp2prev, !cast<SubtargetFeature>(v # "_SP")) #
+ otherimplies>;
def _SP: SubtargetFeature<
name#"sp", query#"SP", "true",
description#" with no double precision",
@@ -72,6 +75,7 @@ multiclass VFPver<string name, string query, string description,
name#"d16", query#"D16", "true",
description#" with only 16 d-registers",
!foreach(v, prev, !cast<SubtargetFeature>(v # "_D16")) #
+ vfp2prev #
otherimplies # [FeatureFP64, !cast<SubtargetFeature>(NAME # "_D16_SP")]>;
def "": SubtargetFeature<
name, query, "true", description,
@@ -80,11 +84,17 @@ multiclass VFPver<string name, string query, string description,
!cast<SubtargetFeature>(NAME # "_SP")]>;
}
-defm FeatureVFP2: VFPver<"vfp2", "HasVFPv2", "Enable VFP2 instructions",
- [], [FeatureFPRegs]>;
+def FeatureVFP2_SP : SubtargetFeature<"vfp2sp", "HasVFPv2SP", "true",
+ "Enable VFP2 instructions with "
+ "no double precision",
+ [FeatureFPRegs]>;
+
+def FeatureVFP2 : SubtargetFeature<"vfp2", "HasVFPv2", "true",
+ "Enable VFP2 instructions",
+ [FeatureFP64, FeatureVFP2_SP]>;
defm FeatureVFP3: VFPver<"vfp3", "HasVFPv3", "Enable VFP3 instructions",
- [FeatureVFP2]>;
+ [], [], [FeatureVFP2]>;
def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true",
"Enable NEON instructions",
@@ -98,7 +108,7 @@ defm FeatureVFP4: VFPver<"vfp4", "HasVFPv4", "Enable VFP4 instructions",
[FeatureVFP3], [FeatureFP16]>;
defm FeatureFPARMv8: VFPver<"fp-armv8", "HasFPARMv8", "Enable ARMv8 FP",
- [FeatureVFP4]>;
+ [FeatureVFP4], []>;
def FeatureFullFP16 : SubtargetFeature<"fullfp16", "HasFullFP16", "true",
"Enable full half-precision "
@@ -302,9 +312,18 @@ def FeatureVMLxForwarding : SubtargetFeature<"vmlx-forwarding",
def FeaturePref32BitThumb : SubtargetFeature<"32bit", "Pref32BitThumb", "true",
"Prefer 32-bit Thumb instrs">;
-def FeaturePrefLoopAlign32 : SubtargetFeature<"loop-align", "PrefLoopAlignment","2",
+def FeaturePrefLoopAlign32 : SubtargetFeature<"loop-align", "PrefLoopLogAlignment","2",
"Prefer 32-bit alignment for loops">;
+def FeatureMVEVectorCostFactor1 : SubtargetFeature<"mve1beat", "MVEVectorCostFactor", "1",
+ "Model MVE instructions as a 1 beat per tick architecture">;
+
+def FeatureMVEVectorCostFactor2 : SubtargetFeature<"mve2beat", "MVEVectorCostFactor", "2",
+ "Model MVE instructions as a 2 beats per tick architecture">;
+
+def FeatureMVEVectorCostFactor4 : SubtargetFeature<"mve4beat", "MVEVectorCostFactor", "4",
+ "Model MVE instructions as a 4 beats per tick architecture">;
+
/// Some instructions update CPSR partially, which can add false dependency for
/// out-of-order implementation, e.g. Cortex-A9, unless each individual bit is
/// mapped to a separate physical register. Avoid partial CPSR update for these
@@ -1156,6 +1175,13 @@ def : ProcNoItin<"cortex-a76ae", [ARMv82a, ProcA76,
FeatureFullFP16,
FeatureDotProd]>;
+def : ProcNoItin<"neoverse-n1", [ARMv82a,
+ FeatureHWDivThumb,
+ FeatureHWDivARM,
+ FeatureCrypto,
+ FeatureCRC,
+ FeatureDotProd]>;
+
def : ProcessorModel<"cyclone", SwiftModel, [ARMv8a, ProcSwift,
FeatureHasRetAddrStack,
FeatureNEONForFP,
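
One ARM.td change above is easy to misread: FeaturePrefLoopAlign32 now
writes to a field named PrefLoopLogAlignment, so the stored "2" is a log2
exponent rather than a byte count. A worked sketch of the convention
(variable names hypothetical):

    unsigned PrefLoopLogAlignment = 2;                     // from the feature
    unsigned LoopAlignBytes = 1u << PrefLoopLogAlignment;  // 4 bytes = 32 bits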
diff --git a/lib/Target/ARM/ARMAsmPrinter.cpp b/lib/Target/ARM/ARMAsmPrinter.cpp
index e29077266fcd..c8c91e53c44e 100644
--- a/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -168,7 +168,7 @@ bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
// relatively easy to exceed the thumb branch range within a TU.
if (! ThumbIndirectPads.empty()) {
OutStreamer->EmitAssemblerFlag(MCAF_Code16);
- EmitAlignment(1);
+ EmitAlignment(Align(2));
for (std::pair<unsigned, MCSymbol *> &TIP : ThumbIndirectPads) {
OutStreamer->EmitLabel(TIP.second);
EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tBX)
@@ -203,8 +203,8 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
switch (MO.getType()) {
default: llvm_unreachable("<unknown operand type>");
case MachineOperand::MO_Register: {
- unsigned Reg = MO.getReg();
- assert(TargetRegisterInfo::isPhysicalRegister(Reg));
+ Register Reg = MO.getReg();
+ assert(Register::isPhysicalRegister(Reg));
assert(!MO.getSubReg() && "Subregs should be eliminated!");
if(ARM::GPRPairRegClass.contains(Reg)) {
const MachineFunction &MF = *MI->getParent()->getParent();
@@ -275,7 +275,7 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
return false;
case 'y': // Print a VFP single precision register as indexed double.
if (MI->getOperand(OpNum).isReg()) {
- unsigned Reg = MI->getOperand(OpNum).getReg();
+ Register Reg = MI->getOperand(OpNum).getReg();
const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
// Find the 'd' register that has this 's' register as a sub-register,
// and determine the lane number.
@@ -302,14 +302,14 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
if (!MI->getOperand(OpNum).isReg())
return true;
const MachineOperand &MO = MI->getOperand(OpNum);
- unsigned RegBegin = MO.getReg();
+ Register RegBegin = MO.getReg();
// This takes advantage of the 2 operand-ness of ldm/stm and that we've
// already got the operands in registers that are operands to the
// inline asm statement.
O << "{";
if (ARM::GPRPairRegClass.contains(RegBegin)) {
const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
- unsigned Reg0 = TRI->getSubReg(RegBegin, ARM::gsub_0);
+ Register Reg0 = TRI->getSubReg(RegBegin, ARM::gsub_0);
O << ARMInstPrinter::getRegisterName(Reg0) << ", ";
RegBegin = TRI->getSubReg(RegBegin, ARM::gsub_1);
}
@@ -378,8 +378,8 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
if (!MO.isReg())
return true;
const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
- unsigned Reg = TRI->getSubReg(MO.getReg(), FirstHalf ?
- ARM::gsub_0 : ARM::gsub_1);
+ Register Reg =
+ TRI->getSubReg(MO.getReg(), FirstHalf ? ARM::gsub_0 : ARM::gsub_1);
O << ARMInstPrinter::getRegisterName(Reg);
return false;
}
@@ -391,7 +391,7 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
const MachineOperand &MO = MI->getOperand(RegOp);
if (!MO.isReg())
return true;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
O << ARMInstPrinter::getRegisterName(Reg);
return false;
}
@@ -400,12 +400,12 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
case 'f': { // The high doubleword register of a NEON quad register.
if (!MI->getOperand(OpNum).isReg())
return true;
- unsigned Reg = MI->getOperand(OpNum).getReg();
+ Register Reg = MI->getOperand(OpNum).getReg();
if (!ARM::QPRRegClass.contains(Reg))
return true;
const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
- unsigned SubReg = TRI->getSubReg(Reg, ExtraCode[0] == 'e' ?
- ARM::dsub_0 : ARM::dsub_1);
+ Register SubReg =
+ TRI->getSubReg(Reg, ExtraCode[0] == 'e' ? ARM::dsub_0 : ARM::dsub_1);
O << ARMInstPrinter::getRegisterName(SubReg);
return false;
}
@@ -419,7 +419,7 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
return true;
const MachineFunction &MF = *MI->getParent()->getParent();
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if(!ARM::GPRPairRegClass.contains(Reg))
return false;
Reg = TRI->getSubReg(Reg, ARM::gsub_1);
@@ -526,7 +526,7 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) {
if (!Stubs.empty()) {
// Switch with ".non_lazy_symbol_pointer" directive.
OutStreamer->SwitchSection(TLOFMacho.getNonLazySymbolPointerSection());
- EmitAlignment(2);
+ EmitAlignment(Align(4));
for (auto &Stub : Stubs)
emitNonLazySymbolPointer(*OutStreamer, Stub.first, Stub.second);
@@ -539,7 +539,7 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) {
if (!Stubs.empty()) {
// Switch with ".non_lazy_symbol_pointer" directive.
OutStreamer->SwitchSection(TLOFMacho.getThreadLocalPointerSection());
- EmitAlignment(2);
+ EmitAlignment(Align(4));
for (auto &Stub : Stubs)
emitNonLazySymbolPointer(*OutStreamer, Stub.first, Stub.second);
@@ -940,7 +940,7 @@ void ARMAsmPrinter::EmitJumpTableAddrs(const MachineInstr *MI) {
// Make sure the Thumb jump table is 4-byte aligned. This will be a nop for
// ARM mode tables.
- EmitAlignment(2);
+ EmitAlignment(Align(4));
// Emit a label for the jump table.
MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel(JTI);
@@ -986,7 +986,7 @@ void ARMAsmPrinter::EmitJumpTableInsts(const MachineInstr *MI) {
// Make sure the Thumb jump table is 4-byte aligned. This will be a nop for
// ARM mode tables.
- EmitAlignment(2);
+ EmitAlignment(Align(4));
// Emit a label for the jump table.
MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel(JTI);
@@ -1015,7 +1015,7 @@ void ARMAsmPrinter::EmitJumpTableTBInst(const MachineInstr *MI,
unsigned JTI = MO1.getIndex();
if (Subtarget->isThumb1Only())
- EmitAlignment(2);
+ EmitAlignment(Align(4));
MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel(JTI);
OutStreamer->EmitLabel(JTISymbol);
@@ -1058,7 +1058,7 @@ void ARMAsmPrinter::EmitJumpTableTBInst(const MachineInstr *MI,
OutStreamer->EmitDataRegion(MCDR_DataRegionEnd);
// Make sure the next instruction is 2-byte aligned.
- EmitAlignment(1);
+ EmitAlignment(Align(2));
}
void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
@@ -1072,7 +1072,7 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
MF.getSubtarget().getRegisterInfo();
const MachineRegisterInfo &MachineRegInfo = MF.getRegInfo();
- unsigned FramePtr = TargetRegInfo->getFrameRegister(MF);
+ Register FramePtr = TargetRegInfo->getFrameRegister(MF);
unsigned Opc = MI->getOpcode();
unsigned SrcReg, DstReg;
@@ -1136,7 +1136,7 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
}
// Check for registers that are remapped (for a Thumb1 prologue that
// saves high registers).
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (unsigned RemappedReg = AFI->EHPrologueRemappedRegs.lookup(Reg))
Reg = RemappedReg;
RegList.push_back(Reg);
@@ -1326,7 +1326,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// So here we generate a bl to a small jump pad that does bx rN.
// The jump pads are emitted after the function body.
- unsigned TReg = MI->getOperand(0).getReg();
+ Register TReg = MI->getOperand(0).getReg();
MCSymbol *TRegSym = nullptr;
for (std::pair<unsigned, MCSymbol *> &TIP : ThumbIndirectPads) {
if (TIP.first == TReg) {
@@ -1663,8 +1663,8 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
case ARM::tTBH_JT: {
bool Is8Bit = MI->getOpcode() == ARM::tTBB_JT;
- unsigned Base = MI->getOperand(0).getReg();
- unsigned Idx = MI->getOperand(1).getReg();
+ Register Base = MI->getOperand(0).getReg();
+ Register Idx = MI->getOperand(1).getReg();
assert(MI->getOperand(1).isKill() && "We need the index register as scratch!");
// Multiply up idx if necessary.
@@ -1844,8 +1844,8 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// b LSJLJEH
// movs r0, #1
// LSJLJEH:
- unsigned SrcReg = MI->getOperand(0).getReg();
- unsigned ValReg = MI->getOperand(1).getReg();
+ Register SrcReg = MI->getOperand(0).getReg();
+ Register ValReg = MI->getOperand(1).getReg();
MCSymbol *Label = OutContext.createTempSymbol("SJLJEH", false, true);
OutStreamer->AddComment("eh_setjmp begin");
EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tMOVr)
@@ -1910,8 +1910,8 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// mov r0, #0
// add pc, pc, #0
// mov r0, #1
- unsigned SrcReg = MI->getOperand(0).getReg();
- unsigned ValReg = MI->getOperand(1).getReg();
+ Register SrcReg = MI->getOperand(0).getReg();
+ Register ValReg = MI->getOperand(1).getReg();
OutStreamer->AddComment("eh_setjmp begin");
EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::ADDri)
@@ -1967,8 +1967,8 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// ldr $scratch, [$src, #4]
// ldr r7, [$src]
// bx $scratch
- unsigned SrcReg = MI->getOperand(0).getReg();
- unsigned ScratchReg = MI->getOperand(1).getReg();
+ Register SrcReg = MI->getOperand(0).getReg();
+ Register ScratchReg = MI->getOperand(1).getReg();
EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::LDRi12)
.addReg(ARM::SP)
.addReg(SrcReg)
@@ -2027,8 +2027,8 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// ldr $scratch, [$src, #4]
// ldr r7, [$src]
// bx $scratch
- unsigned SrcReg = MI->getOperand(0).getReg();
- unsigned ScratchReg = MI->getOperand(1).getReg();
+ Register SrcReg = MI->getOperand(0).getReg();
+ Register ScratchReg = MI->getOperand(1).getReg();
EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tLDRi)
.addReg(ScratchReg)
@@ -2095,7 +2095,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// ldr.w sp, [$src, #8]
// ldr.w pc, [$src, #4]
- unsigned SrcReg = MI->getOperand(0).getReg();
+ Register SrcReg = MI->getOperand(0).getReg();
EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::t2LDRi12)
.addReg(ARM::R11)
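
Every EmitAlignment call in this file changes units, not just type: the old
unsigned argument was a log2 exponent, while the new Align wraps the byte
value itself, so EmitAlignment(1) becomes EmitAlignment(Align(2)) and
EmitAlignment(2) becomes EmitAlignment(Align(4)). A standalone sketch of the
Align type's semantics:

    #include "llvm/Support/Alignment.h"
    using llvm::Align;

    void demo() {
      Align A(4);                  // a 4-byte alignment
      unsigned Bytes = A.value();  // 4, the byte value
      unsigned Exp = Log2(A);      // 2, the old log2 encoding
      (void)Bytes; (void)Exp;
    }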
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 222aa85856a2..684cd1def977 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -172,9 +172,9 @@ MachineInstr *ARMBaseInstrInfo::convertToThreeAddress(
const MachineOperand &WB = isLoad ? MI.getOperand(1) : MI.getOperand(0);
const MachineOperand &Base = MI.getOperand(2);
const MachineOperand &Offset = MI.getOperand(NumOps - 3);
- unsigned WBReg = WB.getReg();
- unsigned BaseReg = Base.getReg();
- unsigned OffReg = Offset.getReg();
+ Register WBReg = WB.getReg();
+ Register BaseReg = Base.getReg();
+ Register OffReg = Offset.getReg();
unsigned OffImm = MI.getOperand(NumOps - 2).getImm();
ARMCC::CondCodes Pred = (ARMCC::CondCodes)MI.getOperand(NumOps - 1).getImm();
switch (AddrMode) {
@@ -276,8 +276,8 @@ MachineInstr *ARMBaseInstrInfo::convertToThreeAddress(
if (LV) {
for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
MachineOperand &MO = MI.getOperand(i);
- if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
- unsigned Reg = MO.getReg();
+ if (MO.isReg() && Register::isVirtualRegister(MO.getReg())) {
+ Register Reg = MO.getReg();
LiveVariables::VarInfo &VI = LV->getVarInfo(Reg);
if (MO.isDef()) {
@@ -966,8 +966,8 @@ void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
SmallSet<unsigned, 4> DstRegs;
#endif
for (unsigned i = 0; i != SubRegs; ++i) {
- unsigned Dst = TRI->getSubReg(DestReg, BeginIdx + i * Spacing);
- unsigned Src = TRI->getSubReg(SrcReg, BeginIdx + i * Spacing);
+ Register Dst = TRI->getSubReg(DestReg, BeginIdx + i * Spacing);
+ Register Src = TRI->getSubReg(SrcReg, BeginIdx + i * Spacing);
assert(Dst && Src && "Bad sub-register");
#ifndef NDEBUG
assert(!DstRegs.count(Src) && "destructive vector copy");
@@ -1019,7 +1019,7 @@ ARMBaseInstrInfo::AddDReg(MachineInstrBuilder &MIB, unsigned Reg,
if (!SubIdx)
return MIB.addReg(Reg, State);
- if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ if (Register::isPhysicalRegister(Reg))
return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State);
return MIB.addReg(Reg, State, SubIdx);
}
@@ -1133,7 +1133,8 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
case 24:
if (ARM::DTripleRegClass.hasSubClassEq(RC)) {
// Use aligned spills if the stack can be realigned.
- if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
+ if (Align >= 16 && getRegisterInfo().canRealignStack(MF) &&
+ Subtarget.hasNEON()) {
BuildMI(MBB, I, DebugLoc(), get(ARM::VST1d64TPseudo))
.addFrameIndex(FI)
.addImm(16)
@@ -1155,7 +1156,8 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
break;
case 32:
if (ARM::QQPRRegClass.hasSubClassEq(RC) || ARM::DQuadRegClass.hasSubClassEq(RC)) {
- if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
+ if (Align >= 16 && getRegisterInfo().canRealignStack(MF) &&
+ Subtarget.hasNEON()) {
// FIXME: It's possible to only store part of the QQ register if the
// spilled def has a sub-register index.
BuildMI(MBB, I, DebugLoc(), get(ARM::VST1d64QPseudo))
@@ -1337,7 +1339,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
MIB = AddDReg(MIB, DestReg, ARM::gsub_1, RegState::DefineNoRead, TRI);
}
- if (TargetRegisterInfo::isPhysicalRegister(DestReg))
+ if (Register::isPhysicalRegister(DestReg))
MIB.addReg(DestReg, RegState::ImplicitDefine);
} else
llvm_unreachable("Unknown reg class!");
@@ -1368,7 +1370,8 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
break;
case 24:
if (ARM::DTripleRegClass.hasSubClassEq(RC)) {
- if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
+ if (Align >= 16 && getRegisterInfo().canRealignStack(MF) &&
+ Subtarget.hasNEON()) {
BuildMI(MBB, I, DL, get(ARM::VLD1d64TPseudo), DestReg)
.addFrameIndex(FI)
.addImm(16)
@@ -1382,7 +1385,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::DefineNoRead, TRI);
MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead, TRI);
MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead, TRI);
- if (TargetRegisterInfo::isPhysicalRegister(DestReg))
+ if (Register::isPhysicalRegister(DestReg))
MIB.addReg(DestReg, RegState::ImplicitDefine);
}
} else
@@ -1390,7 +1393,8 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
break;
case 32:
if (ARM::QQPRRegClass.hasSubClassEq(RC) || ARM::DQuadRegClass.hasSubClassEq(RC)) {
- if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
+ if (Align >= 16 && getRegisterInfo().canRealignStack(MF) &&
+ Subtarget.hasNEON()) {
BuildMI(MBB, I, DL, get(ARM::VLD1d64QPseudo), DestReg)
.addFrameIndex(FI)
.addImm(16)
@@ -1405,7 +1409,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead, TRI);
MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead, TRI);
MIB = AddDReg(MIB, DestReg, ARM::dsub_3, RegState::DefineNoRead, TRI);
- if (TargetRegisterInfo::isPhysicalRegister(DestReg))
+ if (Register::isPhysicalRegister(DestReg))
MIB.addReg(DestReg, RegState::ImplicitDefine);
}
} else
@@ -1425,7 +1429,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
MIB = AddDReg(MIB, DestReg, ARM::dsub_5, RegState::DefineNoRead, TRI);
MIB = AddDReg(MIB, DestReg, ARM::dsub_6, RegState::DefineNoRead, TRI);
MIB = AddDReg(MIB, DestReg, ARM::dsub_7, RegState::DefineNoRead, TRI);
- if (TargetRegisterInfo::isPhysicalRegister(DestReg))
+ if (Register::isPhysicalRegister(DestReg))
MIB.addReg(DestReg, RegState::ImplicitDefine);
} else
llvm_unreachable("Unknown reg class!");
@@ -1583,8 +1587,8 @@ bool ARMBaseInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
// Look for a copy between even S-registers. That is where we keep floats
// when using NEON v2f32 instructions for f32 arithmetic.
- unsigned DstRegS = MI.getOperand(0).getReg();
- unsigned SrcRegS = MI.getOperand(1).getReg();
+ Register DstRegS = MI.getOperand(0).getReg();
+ Register SrcRegS = MI.getOperand(1).getReg();
if (!ARM::SPRRegClass.contains(DstRegS, SrcRegS))
return false;
@@ -1794,12 +1798,11 @@ bool ARMBaseInstrInfo::produceSameValue(const MachineInstr &MI0,
if (MI0.getNumOperands() != MI1.getNumOperands())
return false;
- unsigned Addr0 = MI0.getOperand(1).getReg();
- unsigned Addr1 = MI1.getOperand(1).getReg();
+ Register Addr0 = MI0.getOperand(1).getReg();
+ Register Addr1 = MI1.getOperand(1).getReg();
if (Addr0 != Addr1) {
- if (!MRI ||
- !TargetRegisterInfo::isVirtualRegister(Addr0) ||
- !TargetRegisterInfo::isVirtualRegister(Addr1))
+ if (!MRI || !Register::isVirtualRegister(Addr0) ||
+ !Register::isVirtualRegister(Addr1))
return false;
// This assumes SSA form.
@@ -2076,6 +2079,38 @@ isProfitableToIfCvt(MachineBasicBlock &TBB,
return PredCost <= UnpredCost;
}
+unsigned
+ARMBaseInstrInfo::extraSizeToPredicateInstructions(const MachineFunction &MF,
+ unsigned NumInsts) const {
+ // Thumb2 needs a 2-byte IT instruction to predicate up to 4 instructions.
+ // ARM has a condition code field in every predicable instruction, using it
+ // doesn't change code size.
+ return Subtarget.isThumb2() ? divideCeil(NumInsts, 4) * 2 : 0;
+}
+
+unsigned
+ARMBaseInstrInfo::predictBranchSizeForIfCvt(MachineInstr &MI) const {
+ // If this branch is likely to be folded into the comparison to form a
+ // CB(N)Z, then removing it won't reduce code size at all, because that will
+ // just replace the CB(N)Z with a CMP.
+ if (MI.getOpcode() == ARM::t2Bcc &&
+ findCMPToFoldIntoCBZ(&MI, &getRegisterInfo()))
+ return 0;
+
+ unsigned Size = getInstSizeInBytes(MI);
+
+ // For Thumb2, all branches are 32-bit instructions during the if conversion
+ // pass, but may be replaced with 16-bit instructions during size reduction.
+ // Since the branches considered by if conversion tend to be forward branches
+ // over small basic blocks, they are very likely to be in range for the
+ // narrow instructions, so we assume the final code size will be half what it
+ // currently is.
+ if (Subtarget.isThumb2())
+ Size /= 2;
+
+ return Size;
+}
+
bool
ARMBaseInstrInfo::isProfitableToUnpredicate(MachineBasicBlock &TMBB,
MachineBasicBlock &FMBB) const {
@@ -2141,7 +2176,7 @@ MachineInstr *ARMBaseInstrInfo::commuteInstructionImpl(MachineInstr &MI,
MachineInstr *
ARMBaseInstrInfo::canFoldIntoMOVCC(unsigned Reg, const MachineRegisterInfo &MRI,
const TargetInstrInfo *TII) const {
- if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ if (!Register::isVirtualRegister(Reg))
return nullptr;
if (!MRI.hasOneNonDBGUse(Reg))
return nullptr;
@@ -2163,7 +2198,7 @@ ARMBaseInstrInfo::canFoldIntoMOVCC(unsigned Reg, const MachineRegisterInfo &MRI,
// MI can't have any tied operands, that would conflict with predication.
if (MO.isTied())
return nullptr;
- if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
+ if (Register::isPhysicalRegister(MO.getReg()))
return nullptr;
if (MO.isDef() && !MO.isDead())
return nullptr;
@@ -2211,7 +2246,7 @@ ARMBaseInstrInfo::optimizeSelect(MachineInstr &MI,
// Find new register class to use.
MachineOperand FalseReg = MI.getOperand(Invert ? 2 : 1);
- unsigned DestReg = MI.getOperand(0).getReg();
+ Register DestReg = MI.getOperand(0).getReg();
const TargetRegisterClass *PreviousClass = MRI.getRegClass(FalseReg.getReg());
if (!MRI.constrainRegClass(DestReg, PreviousClass))
return nullptr;
@@ -2298,6 +2333,7 @@ static const AddSubFlagsOpcodePair AddSubFlagsOpcodeMap[] = {
{ARM::tSUBSrr, ARM::tSUBrr},
{ARM::tSBCS, ARM::tSBC},
{ARM::tRSBS, ARM::tRSB},
+ {ARM::tLSLSri, ARM::tLSLri},
{ARM::t2ADDSri, ARM::t2ADDri},
{ARM::t2ADDSrr, ARM::t2ADDrr},
@@ -2420,7 +2456,8 @@ bool llvm::tryFoldSPUpdateIntoPushPop(const ARMSubtarget &Subtarget,
MachineOperand &MO = MI->getOperand(i);
RegList.push_back(MO);
- if (MO.isReg() && TRI->getEncodingValue(MO.getReg()) < FirstRegEnc)
+ if (MO.isReg() && !MO.isImplicit() &&
+ TRI->getEncodingValue(MO.getReg()) < FirstRegEnc)
FirstRegEnc = TRI->getEncodingValue(MO.getReg());
}
@@ -2430,7 +2467,7 @@ bool llvm::tryFoldSPUpdateIntoPushPop(const ARMSubtarget &Subtarget,
for (int CurRegEnc = FirstRegEnc - 1; CurRegEnc >= 0 && RegsNeeded;
--CurRegEnc) {
unsigned CurReg = RegClass->getRegister(CurRegEnc);
- if (IsT1PushPop && CurReg > ARM::R7)
+ if (IsT1PushPop && CurRegEnc > TRI->getEncodingValue(ARM::R7))
continue;
if (!IsPop) {
// Pushing any register is completely harmless, mark the register involved
@@ -3039,18 +3076,22 @@ bool ARMBaseInstrInfo::optimizeCompareInstr(
break;
case ARM::VSELEQD:
case ARM::VSELEQS:
+ case ARM::VSELEQH:
CC = ARMCC::EQ;
break;
case ARM::VSELGTD:
case ARM::VSELGTS:
+ case ARM::VSELGTH:
CC = ARMCC::GT;
break;
case ARM::VSELGED:
case ARM::VSELGES:
+ case ARM::VSELGEH:
CC = ARMCC::GE;
break;
- case ARM::VSELVSS:
case ARM::VSELVSD:
+ case ARM::VSELVSS:
+ case ARM::VSELVSH:
CC = ARMCC::VS;
break;
}
@@ -3271,9 +3312,9 @@ bool ARMBaseInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
}
unsigned OpIdx = Commute ? 2 : 1;
- unsigned Reg1 = UseMI.getOperand(OpIdx).getReg();
+ Register Reg1 = UseMI.getOperand(OpIdx).getReg();
bool isKill = UseMI.getOperand(OpIdx).isKill();
- unsigned NewReg = MRI->createVirtualRegister(MRI->getRegClass(Reg));
+ Register NewReg = MRI->createVirtualRegister(MRI->getRegClass(Reg));
BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), get(NewUseOpc),
NewReg)
.addReg(Reg1, getKillRegState(isKill))
@@ -3335,15 +3376,15 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData,
case ARM::LDRSB_POST:
case ARM::LDRSH_POST: {
- unsigned Rt = MI.getOperand(0).getReg();
- unsigned Rm = MI.getOperand(3).getReg();
+ Register Rt = MI.getOperand(0).getReg();
+ Register Rm = MI.getOperand(3).getReg();
return (Rt == Rm) ? 4 : 3;
}
case ARM::LDR_PRE_REG:
case ARM::LDRB_PRE_REG: {
- unsigned Rt = MI.getOperand(0).getReg();
- unsigned Rm = MI.getOperand(3).getReg();
+ Register Rt = MI.getOperand(0).getReg();
+ Register Rm = MI.getOperand(3).getReg();
if (Rt == Rm)
return 3;
unsigned ShOpVal = MI.getOperand(4).getImm();
@@ -3372,8 +3413,8 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData,
case ARM::LDRH_PRE:
case ARM::STRH_PRE: {
- unsigned Rt = MI.getOperand(0).getReg();
- unsigned Rm = MI.getOperand(3).getReg();
+ Register Rt = MI.getOperand(0).getReg();
+ Register Rm = MI.getOperand(3).getReg();
if (!Rm)
return 2;
if (Rt == Rm)
@@ -3384,8 +3425,8 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData,
case ARM::LDR_POST_REG:
case ARM::LDRB_POST_REG:
case ARM::LDRH_POST: {
- unsigned Rt = MI.getOperand(0).getReg();
- unsigned Rm = MI.getOperand(3).getReg();
+ Register Rt = MI.getOperand(0).getReg();
+ Register Rm = MI.getOperand(3).getReg();
return (Rt == Rm) ? 3 : 2;
}
@@ -3404,10 +3445,10 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData,
case ARM::LDRSB_PRE:
case ARM::LDRSH_PRE: {
- unsigned Rm = MI.getOperand(3).getReg();
+ Register Rm = MI.getOperand(3).getReg();
if (Rm == 0)
return 3;
- unsigned Rt = MI.getOperand(0).getReg();
+ Register Rt = MI.getOperand(0).getReg();
if (Rt == Rm)
return 4;
unsigned ShOpVal = MI.getOperand(4).getImm();
@@ -3422,9 +3463,9 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData,
}
case ARM::LDRD: {
- unsigned Rt = MI.getOperand(0).getReg();
- unsigned Rn = MI.getOperand(2).getReg();
- unsigned Rm = MI.getOperand(3).getReg();
+ Register Rt = MI.getOperand(0).getReg();
+ Register Rn = MI.getOperand(2).getReg();
+ Register Rm = MI.getOperand(3).getReg();
if (Rm)
return (ARM_AM::getAM3Op(MI.getOperand(4).getImm()) == ARM_AM::sub) ? 4
: 3;
@@ -3432,7 +3473,7 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData,
}
case ARM::STRD: {
- unsigned Rm = MI.getOperand(3).getReg();
+ Register Rm = MI.getOperand(3).getReg();
if (Rm)
return (ARM_AM::getAM3Op(MI.getOperand(4).getImm()) == ARM_AM::sub) ? 4
: 3;
@@ -3448,9 +3489,9 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData,
return 4;
case ARM::LDRD_PRE: {
- unsigned Rt = MI.getOperand(0).getReg();
- unsigned Rn = MI.getOperand(3).getReg();
- unsigned Rm = MI.getOperand(4).getReg();
+ Register Rt = MI.getOperand(0).getReg();
+ Register Rn = MI.getOperand(3).getReg();
+ Register Rm = MI.getOperand(4).getReg();
if (Rm)
return (ARM_AM::getAM3Op(MI.getOperand(5).getImm()) == ARM_AM::sub) ? 5
: 4;
@@ -3458,13 +3499,13 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData,
}
case ARM::t2LDRD_PRE: {
- unsigned Rt = MI.getOperand(0).getReg();
- unsigned Rn = MI.getOperand(3).getReg();
+ Register Rt = MI.getOperand(0).getReg();
+ Register Rn = MI.getOperand(3).getReg();
return (Rt == Rn) ? 4 : 3;
}
case ARM::STRD_PRE: {
- unsigned Rm = MI.getOperand(4).getReg();
+ Register Rm = MI.getOperand(4).getReg();
if (Rm)
return (ARM_AM::getAM3Op(MI.getOperand(5).getImm()) == ARM_AM::sub) ? 5
: 4;
@@ -3495,8 +3536,8 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData,
return 2;
case ARM::t2LDRDi8: {
- unsigned Rt = MI.getOperand(0).getReg();
- unsigned Rn = MI.getOperand(2).getReg();
+ Register Rt = MI.getOperand(0).getReg();
+ Register Rn = MI.getOperand(2).getReg();
return (Rt == Rn) ? 3 : 2;
}
@@ -3745,7 +3786,7 @@ ARMBaseInstrInfo::getVLDMDefCycle(const InstrItineraryData *ItinData,
}
bool ARMBaseInstrInfo::isLDMBaseRegInList(const MachineInstr &MI) const {
- unsigned BaseReg = MI.getOperand(0).getReg();
+ Register BaseReg = MI.getOperand(0).getReg();
for (unsigned i = 1, sz = MI.getNumOperands(); i < sz; ++i) {
const auto &Op = MI.getOperand(i);
if (Op.isReg() && Op.getReg() == BaseReg)
@@ -4219,7 +4260,7 @@ int ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
return -1;
const MachineOperand &DefMO = DefMI.getOperand(DefIdx);
- unsigned Reg = DefMO.getReg();
+ Register Reg = DefMO.getReg();
const MachineInstr *ResolvedDefMI = &DefMI;
unsigned DefAdj = 0;
@@ -4328,10 +4369,10 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
}
const MCInstrDesc &UseMCID = get(UseNode->getMachineOpcode());
- const MachineSDNode *DefMN = dyn_cast<MachineSDNode>(DefNode);
+ auto *DefMN = cast<MachineSDNode>(DefNode);
unsigned DefAlign = !DefMN->memoperands_empty()
? (*DefMN->memoperands_begin())->getAlignment() : 0;
- const MachineSDNode *UseMN = dyn_cast<MachineSDNode>(UseNode);
+ auto *UseMN = cast<MachineSDNode>(UseNode);
unsigned UseAlign = !UseMN->memoperands_empty()
? (*UseMN->memoperands_begin())->getAlignment() : 0;
int Latency = getOperandLatency(ItinData, DefMCID, DefIdx, DefAlign,
@@ -4708,7 +4749,7 @@ bool ARMBaseInstrInfo::verifyInstruction(const MachineInstr &MI,
if (MI.getOperand(i).isImplicit() ||
!MI.getOperand(i).isReg())
continue;
- unsigned Reg = MI.getOperand(i).getReg();
+ Register Reg = MI.getOperand(i).getReg();
if (Reg < ARM::R0 || Reg > ARM::R7) {
if (!(MI.getOpcode() == ARM::tPUSH && Reg == ARM::LR) &&
!(MI.getOpcode() == ARM::tPOP_RET && Reg == ARM::PC)) {
@@ -4731,7 +4772,7 @@ void ARMBaseInstrInfo::expandLoadStackGuardBase(MachineBasicBlock::iterator MI,
MachineBasicBlock &MBB = *MI->getParent();
DebugLoc DL = MI->getDebugLoc();
- unsigned Reg = MI->getOperand(0).getReg();
+ Register Reg = MI->getOperand(0).getReg();
const GlobalValue *GV =
cast<GlobalValue>((*MI->memoperands_begin())->getValue());
MachineInstrBuilder MIB;
@@ -5104,7 +5145,7 @@ unsigned ARMBaseInstrInfo::getPartialRegUpdateClearance(
const MachineOperand &MO = MI.getOperand(OpNum);
if (MO.readsReg())
return 0;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
int UseOp = -1;
switch (MI.getOpcode()) {
@@ -5134,7 +5175,7 @@ unsigned ARMBaseInstrInfo::getPartialRegUpdateClearance(
return 0;
// We must be able to clobber the whole D-reg.
- if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (Register::isVirtualRegister(Reg)) {
// Virtual register must be a def undef foo:ssub_0 operand.
if (!MO.getSubReg() || MI.readsVirtualRegister(Reg))
return 0;
@@ -5159,8 +5200,8 @@ void ARMBaseInstrInfo::breakPartialRegDependency(
assert(TRI && "Need TRI instance");
const MachineOperand &MO = MI.getOperand(OpNum);
- unsigned Reg = MO.getReg();
- assert(TargetRegisterInfo::isPhysicalRegister(Reg) &&
+ Register Reg = MO.getReg();
+ assert(Register::isPhysicalRegister(Reg) &&
"Can't break virtual register dependencies.");
unsigned DReg = Reg;
@@ -5337,7 +5378,7 @@ MachineInstr *llvm::findCMPToFoldIntoCBZ(MachineInstr *Br,
// is not redefined between the cmp and the br.
if (CmpMI->getOpcode() != ARM::tCMPi8 && CmpMI->getOpcode() != ARM::t2CMPri)
return nullptr;
- unsigned Reg = CmpMI->getOperand(0).getReg();
+ Register Reg = CmpMI->getOperand(0).getReg();
unsigned PredReg = 0;
ARMCC::CondCodes Pred = getInstrPredicate(*CmpMI, PredReg);
if (Pred != ARMCC::AL || CmpMI->getOperand(1).getImm() != 0)
@@ -5349,3 +5390,50 @@ MachineInstr *llvm::findCMPToFoldIntoCBZ(MachineInstr *Br,
return &*CmpMI;
}
+
+unsigned llvm::ConstantMaterializationCost(unsigned Val,
+ const ARMSubtarget *Subtarget,
+ bool ForCodesize) {
+ if (Subtarget->isThumb()) {
+ if (Val <= 255) // MOV
+ return ForCodesize ? 2 : 1;
+ if (Subtarget->hasV6T2Ops() && (Val <= 0xffff || // MOV
+ ARM_AM::getT2SOImmVal(Val) != -1 || // MOVW
+ ARM_AM::getT2SOImmVal(~Val) != -1)) // MVN
+ return ForCodesize ? 4 : 1;
+ if (Val <= 510) // MOV + ADDi8
+ return ForCodesize ? 4 : 2;
+ if (~Val <= 255) // MOV + MVN
+ return ForCodesize ? 4 : 2;
+ if (ARM_AM::isThumbImmShiftedVal(Val)) // MOV + LSL
+ return ForCodesize ? 4 : 2;
+ } else {
+ if (ARM_AM::getSOImmVal(Val) != -1) // MOV
+ return ForCodesize ? 4 : 1;
+ if (ARM_AM::getSOImmVal(~Val) != -1) // MVN
+ return ForCodesize ? 4 : 1;
+ if (Subtarget->hasV6T2Ops() && Val <= 0xffff) // MOVW
+ return ForCodesize ? 4 : 1;
+ if (ARM_AM::isSOImmTwoPartVal(Val)) // two instrs
+ return ForCodesize ? 8 : 2;
+ }
+ if (Subtarget->useMovt()) // MOVW + MOVT
+ return ForCodesize ? 8 : 2;
+ return ForCodesize ? 8 : 3; // Literal pool load
+}
+
+bool llvm::HasLowerConstantMaterializationCost(unsigned Val1, unsigned Val2,
+ const ARMSubtarget *Subtarget,
+ bool ForCodesize) {
+ // Check with ForCodesize
+ unsigned Cost1 = ConstantMaterializationCost(Val1, Subtarget, ForCodesize);
+ unsigned Cost2 = ConstantMaterializationCost(Val2, Subtarget, ForCodesize);
+ if (Cost1 < Cost2)
+ return true;
+ if (Cost1 > Cost2)
+ return false;
+
+ // If they are equal, try with !ForCodesize
+ return ConstantMaterializationCost(Val1, Subtarget, !ForCodesize) <
+ ConstantMaterializationCost(Val2, Subtarget, !ForCodesize);
+}
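
Tracing the new ConstantMaterializationCost through its Thumb2 branch gives
a feel for the returned codesize/instruction-count pairs; the calls below
are a hypothetical harness, but the values follow directly from the code
above (ST stands for a v6t2+ Thumb2 ARMSubtarget):

    //   Val = 0x42       -> Val <= 255, single MOV:   2 bytes / 1 instr
    //   Val = 0x1234     -> Val <= 0xffff, MOVW:      4 bytes / 1 instr
    //   Val = 0x12345678 -> no one-instruction form; with useMovt(),
    //                       MOVW+MOVT:                8 bytes / 2 instrs
    unsigned SizeCost = ConstantMaterializationCost(0x1234, ST, /*ForCodesize=*/true);  // 4
    unsigned InstCost = ConstantMaterializationCost(0x1234, ST, /*ForCodesize=*/false); // 1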
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h
index c28983fcc15c..c232b6f0b45d 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -276,6 +276,10 @@ public:
return NumCycles == 1;
}
+ unsigned extraSizeToPredicateInstructions(const MachineFunction &MF,
+ unsigned NumInsts) const override;
+ unsigned predictBranchSizeForIfCvt(MachineInstr &MI) const override;
+
bool isProfitableToUnpredicate(MachineBasicBlock &TMBB,
MachineBasicBlock &FMBB) const override;
@@ -601,7 +605,8 @@ bool rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
bool rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
unsigned FrameReg, int &Offset,
- const ARMBaseInstrInfo &TII);
+ const ARMBaseInstrInfo &TII,
+ const TargetRegisterInfo *TRI);
/// Return true if Reg is defd between From and To
bool registerDefinedBetween(unsigned Reg, MachineBasicBlock::iterator From,
@@ -620,6 +625,20 @@ void addPredicatedMveVpredNOp(MachineInstrBuilder &MIB, unsigned Cond);
void addPredicatedMveVpredROp(MachineInstrBuilder &MIB, unsigned Cond,
unsigned Inactive);
+/// Returns the number of instructions required to materialize the given
+/// constant in a register, or 3 if a literal pool load is needed.
+/// If ForCodesize is specified, an approximate cost in bytes is returned.
+unsigned ConstantMaterializationCost(unsigned Val,
+ const ARMSubtarget *Subtarget,
+ bool ForCodesize = false);
+
+/// Returns true if Val1 has a lower Constant Materialization Cost than Val2.
+/// Uses the cost from ConstantMaterializationCost, first with ForCodesize as
+/// specified. If the scores are equal, return the comparison for !ForCodesize.
+bool HasLowerConstantMaterializationCost(unsigned Val1, unsigned Val2,
+ const ARMSubtarget *Subtarget,
+ bool ForCodesize = false);
+
} // end namespace llvm
#endif // LLVM_LIB_TARGET_ARM_ARMBASEINSTRINFO_H
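
A short usage note on the tie-breaking helper declared above: it compares
with ForCodesize first and only falls back to the opposite metric when the
costs are equal. An illustrative call (Val and ST are hypothetical):

    // Prefer materializing ~Val plus an MVN if that is strictly cheaper.
    if (HasLowerConstantMaterializationCost(~Val, Val, ST))
      UseInvertedForm = true;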
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
index dc99b37742da..1eaf871867e0 100644
--- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -174,6 +174,12 @@ ARMBaseRegisterInfo::getThisReturnPreservedMask(const MachineFunction &MF,
: CSR_AAPCS_ThisReturn_RegMask;
}
+ArrayRef<MCPhysReg> ARMBaseRegisterInfo::getIntraCallClobberedRegs(
+ const MachineFunction *MF) const {
+ static const MCPhysReg IntraCallClobberedRegs[] = {ARM::R12};
+ return ArrayRef<MCPhysReg>(IntraCallClobberedRegs);
+}
+
BitVector ARMBaseRegisterInfo::
getReservedRegs(const MachineFunction &MF) const {
const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>();
@@ -185,7 +191,7 @@ getReservedRegs(const MachineFunction &MF) const {
markSuperRegs(Reserved, ARM::PC);
markSuperRegs(Reserved, ARM::FPSCR);
markSuperRegs(Reserved, ARM::APSR_NZCV);
- if (TFI->hasFP(MF))
+ if (TFI->hasFP(MF) || STI.isTargetDarwin())
markSuperRegs(Reserved, getFramePointerReg(STI));
if (hasBasePointer(MF))
markSuperRegs(Reserved, BasePtr);
@@ -217,7 +223,7 @@ isAsmClobberable(const MachineFunction &MF, unsigned PhysReg) const {
const TargetRegisterClass *
ARMBaseRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
- const MachineFunction &) const {
+ const MachineFunction &MF) const {
const TargetRegisterClass *Super = RC;
TargetRegisterClass::sc_iterator I = RC->getSuperClasses();
do {
@@ -225,11 +231,13 @@ ARMBaseRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
case ARM::GPRRegClassID:
case ARM::SPRRegClassID:
case ARM::DPRRegClassID:
+ case ARM::GPRPairRegClassID:
+ return Super;
case ARM::QPRRegClassID:
case ARM::QQPRRegClassID:
case ARM::QQQQPRRegClassID:
- case ARM::GPRPairRegClassID:
- return Super;
+ if (MF.getSubtarget<ARMSubtarget>().hasNEON())
+ return Super;
}
Super = *I++;
} while (Super);
@@ -317,7 +325,7 @@ ARMBaseRegisterInfo::getRegAllocationHints(unsigned VirtReg,
return false;
unsigned PairedPhys = 0;
- if (TargetRegisterInfo::isPhysicalRegister(Paired)) {
+ if (Register::isPhysicalRegister(Paired)) {
PairedPhys = Paired;
} else if (VRM && VRM->hasPhys(Paired)) {
PairedPhys = getPairedGPR(VRM->getPhys(Paired), Odd, this);
@@ -347,7 +355,7 @@ ARMBaseRegisterInfo::updateRegAllocHint(unsigned Reg, unsigned NewReg,
std::pair<unsigned, unsigned> Hint = MRI->getRegAllocationHint(Reg);
if ((Hint.first == (unsigned)ARMRI::RegPairOdd ||
Hint.first == (unsigned)ARMRI::RegPairEven) &&
- TargetRegisterInfo::isVirtualRegister(Hint.second)) {
+ Register::isVirtualRegister(Hint.second)) {
// If 'Reg' is one of the even / odd register pair and it's now changed
// (e.g. coalesced) into a different register. The other register of the
// pair allocation hint must be updated to reflect the relationship
@@ -357,7 +365,7 @@ ARMBaseRegisterInfo::updateRegAllocHint(unsigned Reg, unsigned NewReg,
// Make sure the pair has not already divorced.
if (Hint.second == Reg) {
MRI->setRegAllocationHint(OtherReg, Hint.first, NewReg);
- if (TargetRegisterInfo::isVirtualRegister(NewReg))
+ if (Register::isVirtualRegister(NewReg))
MRI->setRegAllocationHint(NewReg,
Hint.first == (unsigned)ARMRI::RegPairOdd ? ARMRI::RegPairEven
: ARMRI::RegPairOdd, OtherReg);
@@ -663,7 +671,7 @@ void ARMBaseRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
Done = rewriteARMFrameIndex(MI, i, BaseReg, Off, TII);
else {
assert(AFI->isThumb2Function());
- Done = rewriteT2FrameIndex(MI, i, BaseReg, Off, TII);
+ Done = rewriteT2FrameIndex(MI, i, BaseReg, Off, TII, this);
}
assert(Done && "Unable to resolve frame index!");
(void)Done;
@@ -775,7 +783,7 @@ ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
Done = rewriteARMFrameIndex(MI, FIOperandNum, FrameReg, Offset, TII);
else {
assert(AFI->isThumb2Function());
- Done = rewriteT2FrameIndex(MI, FIOperandNum, FrameReg, Offset, TII);
+ Done = rewriteT2FrameIndex(MI, FIOperandNum, FrameReg, Offset, TII, this);
}
if (Done)
return;
@@ -783,21 +791,32 @@ ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
// If we get here, the immediate doesn't fit into the instruction. We folded
// as much as possible above, handle the rest, providing a register that is
// SP+LargeImm.
- assert((Offset ||
- (MI.getDesc().TSFlags & ARMII::AddrModeMask) == ARMII::AddrMode4 ||
- (MI.getDesc().TSFlags & ARMII::AddrModeMask) == ARMII::AddrMode6) &&
- "This code isn't needed if offset already handled!");
+ assert(
+ (Offset ||
+ (MI.getDesc().TSFlags & ARMII::AddrModeMask) == ARMII::AddrMode4 ||
+ (MI.getDesc().TSFlags & ARMII::AddrModeMask) == ARMII::AddrMode6 ||
+ (MI.getDesc().TSFlags & ARMII::AddrModeMask) == ARMII::AddrModeT2_i7 ||
+ (MI.getDesc().TSFlags & ARMII::AddrModeMask) == ARMII::AddrModeT2_i7s2 ||
+ (MI.getDesc().TSFlags & ARMII::AddrModeMask) ==
+ ARMII::AddrModeT2_i7s4) &&
+ "This code isn't needed if offset already handled!");
unsigned ScratchReg = 0;
int PIdx = MI.findFirstPredOperandIdx();
ARMCC::CondCodes Pred = (PIdx == -1)
? ARMCC::AL : (ARMCC::CondCodes)MI.getOperand(PIdx).getImm();
Register PredReg = (PIdx == -1) ? Register() : MI.getOperand(PIdx+1).getReg();
- if (Offset == 0)
+
+ const MCInstrDesc &MCID = MI.getDesc();
+ const TargetRegisterClass *RegClass =
+ TII.getRegClass(MCID, FIOperandNum, this, *MI.getParent()->getParent());
+
+ if (Offset == 0 &&
+ (Register::isVirtualRegister(FrameReg) || RegClass->contains(FrameReg)))
// Must be addrmode4/6.
MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false, false, false);
else {
- ScratchReg = MF.getRegInfo().createVirtualRegister(&ARM::GPRRegClass);
+ ScratchReg = MF.getRegInfo().createVirtualRegister(RegClass);
if (!AFI->isThumbFunction())
emitARMRegPlusImmediate(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg,
Offset, Pred, PredReg, TII);
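
The new getIntraCallClobberedRegs hook reports R12 (IP, the AAPCS
intra-procedure-call scratch register) as clobberable by call veneers. A
hypothetical caller, not part of this patch:

    // Treat veneer-clobbered registers as dead across calls.
    BitVector Unavailable(TRI->getNumRegs());
    for (MCPhysReg R : TRI->getIntraCallClobberedRegs(&MF))
      Unavailable.set(R); // on ARM this marks R12 only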
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.h b/lib/Target/ARM/ARMBaseRegisterInfo.h
index 7e2c72b4d712..477f3ad0a9a7 100644
--- a/lib/Target/ARM/ARMBaseRegisterInfo.h
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.h
@@ -129,6 +129,9 @@ public:
const uint32_t *getThisReturnPreservedMask(const MachineFunction &MF,
CallingConv::ID) const;
+ ArrayRef<MCPhysReg>
+ getIntraCallClobberedRegs(const MachineFunction *MF) const override;
+
BitVector getReservedRegs(const MachineFunction &MF) const override;
bool isAsmClobberable(const MachineFunction &MF,
unsigned PhysReg) const override;
@@ -176,8 +179,6 @@ public:
Register getFrameRegister(const MachineFunction &MF) const override;
unsigned getBaseRegister() const { return BasePtr; }
- bool isLowRegister(unsigned Reg) const;
-
/// emitLoadConstPool - Emits a load from constpool to materialize the
/// specified immediate.
diff --git a/lib/Target/ARM/ARMBasicBlockInfo.cpp b/lib/Target/ARM/ARMBasicBlockInfo.cpp
index 2de90e816b33..00a2231f59e3 100644
--- a/lib/Target/ARM/ARMBasicBlockInfo.cpp
+++ b/lib/Target/ARM/ARMBasicBlockInfo.cpp
@@ -6,14 +6,16 @@
//
//===----------------------------------------------------------------------===//
+#include "ARMBasicBlockInfo.h"
#include "ARM.h"
#include "ARMBaseInstrInfo.h"
-#include "ARMBasicBlockInfo.h"
#include "ARMMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/Support/Debug.h"
#include <vector>
#define DEBUG_TYPE "arm-bb-utils"
@@ -47,7 +49,7 @@ void ARMBasicBlockUtils::computeBlockSize(MachineBasicBlock *MBB) {
BasicBlockInfo &BBI = BBInfo[MBB->getNumber()];
BBI.Size = 0;
BBI.Unalign = 0;
- BBI.PostAlign = 0;
+ BBI.PostAlign = Align::None();
for (MachineInstr &I : *MBB) {
BBI.Size += TII->getInstSizeInBytes(I);
@@ -62,8 +64,8 @@ void ARMBasicBlockUtils::computeBlockSize(MachineBasicBlock *MBB) {
// tBR_JTr contains a .align 2 directive.
if (!MBB->empty() && MBB->back().getOpcode() == ARM::tBR_JTr) {
- BBI.PostAlign = 2;
- MBB->getParent()->ensureAlignment(2);
+ BBI.PostAlign = Align(4);
+ MBB->getParent()->ensureAlignment(Align(4));
}
}
@@ -126,9 +128,9 @@ void ARMBasicBlockUtils::adjustBBOffsetsAfter(MachineBasicBlock *BB) {
for(unsigned i = BBNum + 1, e = MF.getNumBlockIDs(); i < e; ++i) {
// Get the offset and known bits at the end of the layout predecessor.
// Include the alignment of the current block.
- unsigned LogAlign = MF.getBlockNumbered(i)->getAlignment();
- unsigned Offset = BBInfo[i - 1].postOffset(LogAlign);
- unsigned KnownBits = BBInfo[i - 1].postKnownBits(LogAlign);
+ const Align Align = MF.getBlockNumbered(i)->getAlignment();
+ const unsigned Offset = BBInfo[i - 1].postOffset(Align);
+ const unsigned KnownBits = BBInfo[i - 1].postKnownBits(Align);
// This is where block i begins. Stop if the offset is already correct,
// and we have updated 2 blocks. This is the maximum number of blocks
diff --git a/lib/Target/ARM/ARMBasicBlockInfo.h b/lib/Target/ARM/ARMBasicBlockInfo.h
index 400bba351cec..13df399ed995 100644
--- a/lib/Target/ARM/ARMBasicBlockInfo.h
+++ b/lib/Target/ARM/ARMBasicBlockInfo.h
@@ -21,17 +21,18 @@
namespace llvm {
+struct BasicBlockInfo;
using BBInfoVector = SmallVectorImpl<BasicBlockInfo>;
/// UnknownPadding - Return the worst case padding that could result from
/// unknown offset bits. This does not include alignment padding caused by
/// known offset bits.
///
-/// @param LogAlign log2(alignment)
+/// @param Alignment The target alignment to pad up to.
/// @param KnownBits Number of known low offset bits.
-inline unsigned UnknownPadding(unsigned LogAlign, unsigned KnownBits) {
- if (KnownBits < LogAlign)
- return (1u << LogAlign) - (1u << KnownBits);
+inline unsigned UnknownPadding(Align Alignment, unsigned KnownBits) {
+ if (KnownBits < Log2(Alignment))
+ return Alignment.value() - (1ull << KnownBits);
return 0;
}
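Since UnknownPadding now takes a byte-based Align instead of a log2 value, a quick numeric illustration of the worst case it computes (the values are ours; assumes this header plus <cassert>):

    #include "ARMBasicBlockInfo.h" // declares llvm::UnknownPadding
    #include <cassert>

    void unknownPaddingExample() {
      // 8-byte alignment with only 1 known low offset bit: the offset may be
      // 2 (mod 8), so up to 8 - 2 = 6 bytes of padding may be required.
      assert(llvm::UnknownPadding(llvm::Align(8), /*KnownBits=*/1) == 6);
      // Once KnownBits >= Log2(Alignment) == 3, no padding is ever needed.
      assert(llvm::UnknownPadding(llvm::Align(8), /*KnownBits=*/3) == 0);
    }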
@@ -65,10 +66,9 @@ struct BasicBlockInfo {
/// multiple of 1 << Unalign.
uint8_t Unalign = 0;
- /// PostAlign - When non-zero, the block terminator contains a .align
- /// directive, so the end of the block is aligned to 1 << PostAlign
- /// bytes.
- uint8_t PostAlign = 0;
+ /// PostAlign - When > 1, the block terminator contains a .align
+ /// directive, so the end of the block is aligned to PostAlign bytes.
+ Align PostAlign;
BasicBlockInfo() = default;
@@ -84,16 +84,16 @@ struct BasicBlockInfo {
return Bits;
}
- /// Compute the offset immediately following this block. If LogAlign is
+ /// Compute the offset immediately following this block. If Align is
/// specified, return the offset the successor block will get if it has
/// this alignment.
- unsigned postOffset(unsigned LogAlign = 0) const {
+ unsigned postOffset(Align Alignment = Align::None()) const {
unsigned PO = Offset + Size;
- unsigned LA = std::max(unsigned(PostAlign), LogAlign);
- if (!LA)
+ const Align PA = std::max(PostAlign, Alignment);
+ if (PA == Align::None())
return PO;
// Add alignment padding from the terminator.
- return PO + UnknownPadding(LA, internalKnownBits());
+ return PO + UnknownPadding(PA, internalKnownBits());
}
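A worked example of the byte-based postOffset (a sketch; the field values are ours, chosen to exercise the padding path):

    #include "ARMBasicBlockInfo.h" // for llvm::BasicBlockInfo
    #include <cassert>

    void postOffsetExample() {
      llvm::BasicBlockInfo BBI;
      BBI.Offset = 0x100;             // block starts at 0x100
      BBI.Size = 0x34;                // 52 bytes of instructions (even)
      BBI.KnownBits = 1;              // offset known to be even, nothing more
      BBI.PostAlign = llvm::Align(4); // terminator emits a .align 2 directive
      // PO = 0x134; worst-case padding = UnknownPadding(Align(4), 1) = 2.
      assert(BBI.postOffset() == 0x136);
    }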
/// Compute the number of known low bits of postOffset. If this block
@@ -101,9 +101,8 @@ struct BasicBlockInfo {
/// instruction alignment. An aligned terminator may increase the number
/// of known bits.
/// If LogAlign is given, also consider the alignment of the next block.
- unsigned postKnownBits(unsigned LogAlign = 0) const {
- return std::max(std::max(unsigned(PostAlign), LogAlign),
- internalKnownBits());
+ unsigned postKnownBits(Align Align = Align::None()) const {
+ return std::max(Log2(std::max(PostAlign, Align)), internalKnownBits());
}
};
diff --git a/lib/Target/ARM/ARMCallLowering.cpp b/lib/Target/ARM/ARMCallLowering.cpp
index 0cbe6e1871e4..d3b595ce8323 100644
--- a/lib/Target/ARM/ARMCallLowering.cpp
+++ b/lib/Target/ARM/ARMCallLowering.cpp
@@ -90,6 +90,8 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler {
MachineInstrBuilder &MIB, CCAssignFn *AssignFn)
: ValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {}
+ bool isIncomingArgumentHandler() const override { return false; }
+
Register getStackAddress(uint64_t Size, int64_t Offset,
MachinePointerInfo &MPO) override {
assert((Size == 1 || Size == 2 || Size == 4 || Size == 8) &&
@@ -169,8 +171,9 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler {
bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
- const CallLowering::ArgInfo &Info, CCState &State) override {
- if (AssignFn(ValNo, ValVT, LocVT, LocInfo, Info.Flags, State))
+ const CallLowering::ArgInfo &Info, ISD::ArgFlagsTy Flags,
+ CCState &State) override {
+ if (AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State))
return true;
StackSize =
@@ -199,9 +202,8 @@ void ARMCallLowering::splitToValueTypes(const ArgInfo &OrigArg,
if (SplitVTs.size() == 1) {
// Even if there is no splitting to do, we still want to replace the
// original type (e.g. pointer type -> integer).
- auto Flags = OrigArg.Flags;
- unsigned OriginalAlignment = DL.getABITypeAlignment(OrigArg.Ty);
- Flags.setOrigAlign(OriginalAlignment);
+ auto Flags = OrigArg.Flags[0];
+ Flags.setOrigAlign(Align(DL.getABITypeAlignment(OrigArg.Ty)));
SplitArgs.emplace_back(OrigArg.Regs[0], SplitVTs[0].getTypeForEVT(Ctx),
Flags, OrigArg.IsFixed);
return;
@@ -211,10 +213,9 @@ void ARMCallLowering::splitToValueTypes(const ArgInfo &OrigArg,
for (unsigned i = 0, e = SplitVTs.size(); i != e; ++i) {
EVT SplitVT = SplitVTs[i];
Type *SplitTy = SplitVT.getTypeForEVT(Ctx);
- auto Flags = OrigArg.Flags;
+ auto Flags = OrigArg.Flags[0];
- unsigned OriginalAlignment = DL.getABITypeAlignment(SplitTy);
- Flags.setOrigAlign(OriginalAlignment);
+ Flags.setOrigAlign(Align(DL.getABITypeAlignment(SplitTy)));
bool NeedsConsecutiveRegisters =
TLI.functionArgumentNeedsConsecutiveRegisters(
@@ -286,7 +287,7 @@ struct IncomingValueHandler : public CallLowering::ValueHandler {
CCAssignFn AssignFn)
: ValueHandler(MIRBuilder, MRI, AssignFn) {}
- bool isArgumentHandler() const override { return true; }
+ bool isIncomingArgumentHandler() const override { return true; }
Register getStackAddress(uint64_t Size, int64_t Offset,
MachinePointerInfo &MPO) override {
@@ -298,7 +299,7 @@ struct IncomingValueHandler : public CallLowering::ValueHandler {
int FI = MFI.CreateFixedObject(Size, Offset, true);
MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
- unsigned AddrReg =
+ Register AddrReg =
MRI.createGenericVirtualRegister(LLT::pointer(MPO.getAddrSpace(), 32));
MIRBuilder.buildFrameIndex(AddrReg, FI);
@@ -405,6 +406,7 @@ struct FormalArgHandler : public IncomingValueHandler {
: IncomingValueHandler(MIRBuilder, MRI, AssignFn) {}
void markPhysRegUsed(unsigned PhysReg) override {
+ MIRBuilder.getMRI()->addLiveIn(PhysReg);
MIRBuilder.getMBB().addLiveIn(PhysReg);
}
};
@@ -498,11 +500,7 @@ unsigned getCallOpcode(const ARMSubtarget &STI, bool isDirect) {
}
} // end anonymous namespace
-bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
- CallingConv::ID CallConv,
- const MachineOperand &Callee,
- const ArgInfo &OrigRet,
- ArrayRef<ArgInfo> OrigArgs) const {
+bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
+                                CallLoweringInfo &Info) const {
MachineFunction &MF = MIRBuilder.getMF();
const auto &TLI = *getTLI<ARMTargetLowering>();
const auto &DL = MF.getDataLayout();
@@ -520,7 +518,7 @@ bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
// Create the call instruction so we can add the implicit uses of arg
// registers, but don't insert it yet.
- bool IsDirect = !Callee.isReg();
+ bool IsDirect = !Info.Callee.isReg();
auto CallOpcode = getCallOpcode(STI, IsDirect);
auto MIB = MIRBuilder.buildInstrNoInsert(CallOpcode);
@@ -528,35 +526,35 @@ bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
if (IsThumb)
MIB.add(predOps(ARMCC::AL));
- MIB.add(Callee);
+ MIB.add(Info.Callee);
if (!IsDirect) {
- auto CalleeReg = Callee.getReg();
- if (CalleeReg && !TRI->isPhysicalRegister(CalleeReg)) {
+ auto CalleeReg = Info.Callee.getReg();
+ if (CalleeReg && !Register::isPhysicalRegister(CalleeReg)) {
unsigned CalleeIdx = IsThumb ? 2 : 0;
MIB->getOperand(CalleeIdx).setReg(constrainOperandRegClass(
MF, *TRI, MRI, *STI.getInstrInfo(), *STI.getRegBankInfo(),
- *MIB.getInstr(), MIB->getDesc(), Callee, CalleeIdx));
+ *MIB.getInstr(), MIB->getDesc(), Info.Callee, CalleeIdx));
}
}
- MIB.addRegMask(TRI->getCallPreservedMask(MF, CallConv));
+ MIB.addRegMask(TRI->getCallPreservedMask(MF, Info.CallConv));
bool IsVarArg = false;
SmallVector<ArgInfo, 8> ArgInfos;
- for (auto Arg : OrigArgs) {
+ for (auto Arg : Info.OrigArgs) {
if (!isSupportedType(DL, TLI, Arg.Ty))
return false;
if (!Arg.IsFixed)
IsVarArg = true;
- if (Arg.Flags.isByVal())
+ if (Arg.Flags[0].isByVal())
return false;
splitToValueTypes(Arg, ArgInfos, MF);
}
- auto ArgAssignFn = TLI.CCAssignFnForCall(CallConv, IsVarArg);
+ auto ArgAssignFn = TLI.CCAssignFnForCall(Info.CallConv, IsVarArg);
OutgoingValueHandler ArgHandler(MIRBuilder, MRI, MIB, ArgAssignFn);
if (!handleAssignments(MIRBuilder, ArgInfos, ArgHandler))
return false;
@@ -564,13 +562,13 @@ bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
// Now we can add the actual call instruction to the correct basic block.
MIRBuilder.insertInstr(MIB);
- if (!OrigRet.Ty->isVoidTy()) {
- if (!isSupportedType(DL, TLI, OrigRet.Ty))
+ if (!Info.OrigRet.Ty->isVoidTy()) {
+ if (!isSupportedType(DL, TLI, Info.OrigRet.Ty))
return false;
ArgInfos.clear();
- splitToValueTypes(OrigRet, ArgInfos, MF);
- auto RetAssignFn = TLI.CCAssignFnForReturn(CallConv, IsVarArg);
+ splitToValueTypes(Info.OrigRet, ArgInfos, MF);
+ auto RetAssignFn = TLI.CCAssignFnForReturn(Info.CallConv, IsVarArg);
CallReturnHandler RetHandler(MIRBuilder, MRI, MIB, RetAssignFn);
if (!handleAssignments(MIRBuilder, ArgInfos, RetHandler))
return false;
diff --git a/lib/Target/ARM/ARMCallLowering.h b/lib/Target/ARM/ARMCallLowering.h
index 794127b5ebc7..ddbc9feb90e2 100644
--- a/lib/Target/ARM/ARMCallLowering.h
+++ b/lib/Target/ARM/ARMCallLowering.h
@@ -38,9 +38,8 @@ public:
bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
ArrayRef<ArrayRef<Register>> VRegs) const override;
- bool lowerCall(MachineIRBuilder &MIRBuilder, CallingConv::ID CallConv,
- const MachineOperand &Callee, const ArgInfo &OrigRet,
- ArrayRef<ArgInfo> OrigArgs) const override;
+ bool lowerCall(MachineIRBuilder &MIRBuilder,
+ CallLoweringInfo &Info) const override;
private:
bool lowerReturnVal(MachineIRBuilder &MIRBuilder, const Value *Val,
diff --git a/lib/Target/ARM/ARMCallingConv.cpp b/lib/Target/ARM/ARMCallingConv.cpp
index 5ede7c67f7c2..92ebc542b423 100644
--- a/lib/Target/ARM/ARMCallingConv.cpp
+++ b/lib/Target/ARM/ARMCallingConv.cpp
@@ -193,7 +193,7 @@ static bool CC_ARM_AAPCS_Custom_Aggregate(unsigned &ValNo, MVT &ValVT,
// Try to allocate a contiguous block of registers, each of the correct
// size to hold one member.
auto &DL = State.getMachineFunction().getDataLayout();
- unsigned StackAlign = DL.getStackAlignment();
+ unsigned StackAlign = DL.getStackAlignment().value();
unsigned Align = std::min(PendingMembers[0].getExtraInfo(), StackAlign);
ArrayRef<MCPhysReg> RegList;
diff --git a/lib/Target/ARM/ARMCodeGenPrepare.cpp b/lib/Target/ARM/ARMCodeGenPrepare.cpp
index 2fc5f4aaab50..1c2c8aef55bb 100644
--- a/lib/Target/ARM/ARMCodeGenPrepare.cpp
+++ b/lib/Target/ARM/ARMCodeGenPrepare.cpp
@@ -179,16 +179,12 @@ public:
}
static bool GenerateSignBits(Value *V) {
- if (auto *Arg = dyn_cast<Argument>(V))
- return Arg->hasSExtAttr();
-
if (!isa<Instruction>(V))
return false;
unsigned Opc = cast<Instruction>(V)->getOpcode();
return Opc == Instruction::AShr || Opc == Instruction::SDiv ||
- Opc == Instruction::SRem || Opc == Instruction::SExt ||
- Opc == Instruction::SIToFP;
+ Opc == Instruction::SRem || Opc == Instruction::SExt;
}
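The opcodes rejected here all propagate the sign bit, which breaks the pass's zero-extension-based promotion. A tiny standalone illustration of the mismatch (the values are ours; two's-complement assumed):

    #include <cstdint>

    void signBitsExample() {
      // i8 -128 (0x80) shifted arithmetically replicates the sign bit...
      uint32_t Narrow = uint32_t(int8_t(0x80) >> 1); // 0xFFFFFFC0
      // ...but the zero-extended i32 equivalent shifts zeros in instead.
      uint32_t Wide = 0x00000080u >> 1;              // 0x00000040
      (void)Narrow; (void)Wide; // the two results disagree
    }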
static bool EqualTypeSize(Value *V) {
@@ -806,54 +802,48 @@ void IRPromoter::Mutate(Type *OrigTy,
/// return value is zeroext. We don't allow opcodes that can introduce sign
/// bits.
bool ARMCodeGenPrepare::isSupportedValue(Value *V) {
- if (auto *I = dyn_cast<ICmpInst>(V)) {
- // Now that we allow small types than TypeSize, only allow icmp of
- // TypeSize because they will require a trunc to be legalised.
- // TODO: Allow icmp of smaller types, and calculate at the end
- // whether the transform would be beneficial.
- if (isa<PointerType>(I->getOperand(0)->getType()))
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ switch (I->getOpcode()) {
+ default:
+ return isa<BinaryOperator>(I) && isSupportedType(I) &&
+ !GenerateSignBits(I);
+ case Instruction::GetElementPtr:
+ case Instruction::Store:
+ case Instruction::Br:
+ case Instruction::Switch:
return true;
- return EqualTypeSize(I->getOperand(0));
- }
-
- if (GenerateSignBits(V)) {
- LLVM_DEBUG(dbgs() << "ARM CGP: No, instruction can generate sign bits.\n");
- return false;
- }
-
- // Memory instructions
- if (isa<StoreInst>(V) || isa<GetElementPtrInst>(V))
- return true;
-
- // Branches and targets.
- if( isa<BranchInst>(V) || isa<SwitchInst>(V) || isa<BasicBlock>(V))
- return true;
-
- // Non-instruction values that we can handle.
- if ((isa<Constant>(V) && !isa<ConstantExpr>(V)) || isa<Argument>(V))
+ case Instruction::PHI:
+ case Instruction::Select:
+ case Instruction::Ret:
+ case Instruction::Load:
+ case Instruction::Trunc:
+ case Instruction::BitCast:
+ return isSupportedType(I);
+ case Instruction::ZExt:
+ return isSupportedType(I->getOperand(0));
+ case Instruction::ICmp:
+      // Now that we allow types smaller than TypeSize, only allow icmps of
+      // TypeSize because a smaller icmp would require a trunc to be legalised.
+ // TODO: Allow icmp of smaller types, and calculate at the end
+ // whether the transform would be beneficial.
+ if (isa<PointerType>(I->getOperand(0)->getType()))
+ return true;
+ return EqualTypeSize(I->getOperand(0));
+ case Instruction::Call: {
+ // Special cases for calls as we need to check for zeroext
+ // TODO We should accept calls even if they don't have zeroext, as they
+ // can still be sinks.
+ auto *Call = cast<CallInst>(I);
+ return isSupportedType(Call) &&
+ Call->hasRetAttr(Attribute::AttrKind::ZExt);
+ }
+ }
+ } else if (isa<Constant>(V) && !isa<ConstantExpr>(V)) {
return isSupportedType(V);
-
- if (isa<PHINode>(V) || isa<SelectInst>(V) || isa<ReturnInst>(V) ||
- isa<LoadInst>(V))
+ } else if (isa<Argument>(V))
return isSupportedType(V);
- if (auto *Cast = dyn_cast<CastInst>(V))
- return isSupportedType(Cast) || isSupportedType(Cast->getOperand(0));
-
- // Special cases for calls as we need to check for zeroext
- // TODO We should accept calls even if they don't have zeroext, as they can
- // still be sinks.
- if (auto *Call = dyn_cast<CallInst>(V))
- return isSupportedType(Call) &&
- Call->hasRetAttr(Attribute::AttrKind::ZExt);
-
- if (!isa<BinaryOperator>(V))
- return false;
-
- if (!isSupportedType(V))
- return false;
-
- return true;
+ return isa<BasicBlock>(V);
}
/// Check that the type of V would be promoted and that the original type is
diff --git a/lib/Target/ARM/ARMConstantIslandPass.cpp b/lib/Target/ARM/ARMConstantIslandPass.cpp
index 60e5d7bf6098..24ca25f73e96 100644
--- a/lib/Target/ARM/ARMConstantIslandPass.cpp
+++ b/lib/Target/ARM/ARMConstantIslandPass.cpp
@@ -26,8 +26,10 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
@@ -69,6 +71,7 @@ STATISTIC(NumT2BrShrunk, "Number of Thumb2 immediate branches shrunk");
STATISTIC(NumCBZ, "Number of CBZ / CBNZ formed");
STATISTIC(NumJTMoved, "Number of jump table destination blocks moved");
STATISTIC(NumJTInserted, "Number of jump table intermediate blocks inserted");
+STATISTIC(NumLEInserted, "Number of LE backwards branches inserted");
static cl::opt<bool>
AdjustJumpTableBlocks("arm-adjust-jump-tables", cl::Hidden, cl::init(true),
@@ -212,6 +215,7 @@ namespace {
const ARMBaseInstrInfo *TII;
const ARMSubtarget *STI;
ARMFunctionInfo *AFI;
+ MachineDominatorTree *DT = nullptr;
bool isThumb;
bool isThumb1;
bool isThumb2;
@@ -224,6 +228,11 @@ namespace {
bool runOnMachineFunction(MachineFunction &MF) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineDominatorTree>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
MachineFunctionProperties getRequiredProperties() const override {
return MachineFunctionProperties().set(
MachineFunctionProperties::Property::NoVRegs);
@@ -238,7 +247,7 @@ namespace {
void doInitialJumpTablePlacement(std::vector<MachineInstr *> &CPEMIs);
bool BBHasFallthrough(MachineBasicBlock *MBB);
CPEntry *findConstPoolEntry(unsigned CPI, const MachineInstr *CPEMI);
- unsigned getCPELogAlign(const MachineInstr *CPEMI);
+ Align getCPEAlign(const MachineInstr *CPEMI);
void scanFunctionJumpTables();
void initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs);
MachineBasicBlock *splitBlockBeforeInstr(MachineInstr *MI);
@@ -327,8 +336,7 @@ LLVM_DUMP_METHOD void ARMConstantIslands::dumpBBs() {
const BasicBlockInfo &BBI = BBInfo[J];
dbgs() << format("%08x %bb.%u\t", BBI.Offset, J)
<< " kb=" << unsigned(BBI.KnownBits)
- << " ua=" << unsigned(BBI.Unalign)
- << " pa=" << unsigned(BBI.PostAlign)
+ << " ua=" << unsigned(BBI.Unalign) << " pa=" << Log2(BBI.PostAlign)
<< format(" size=%#x\n", BBInfo[J].Size);
}
});
@@ -349,6 +357,7 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) {
isPositionIndependentOrROPI =
STI->getTargetLowering()->isPositionIndependent() || STI->isROPI();
AFI = MF->getInfo<ARMFunctionInfo>();
+ DT = &getAnalysis<MachineDominatorTree>();
isThumb = AFI->isThumbFunction();
isThumb1 = AFI->isThumb1OnlyFunction();
@@ -357,9 +366,6 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) {
HasFarJump = false;
bool GenerateTBB = isThumb2 || (isThumb1 && SynthesizeThumb1TBB);
- // This pass invalidates liveness information when it splits basic blocks.
- MF->getRegInfo().invalidateLiveness();
-
// Renumber all of the machine basic blocks in the function, guaranteeing that
// the numbers agree with the position of the block in the function.
MF->RenumberBlocks();
@@ -398,7 +404,7 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) {
// Functions with jump tables need an alignment of 4 because they use the ADR
// instruction, which aligns the PC to 4 bytes before adding an offset.
if (!T2JumpTables.empty())
- MF->ensureAlignment(2);
+ MF->ensureAlignment(Align(4));
/// Remove dead constant pool entries.
MadeChange |= removeUnusedCPEntries();
@@ -487,8 +493,9 @@ ARMConstantIslands::doInitialConstPlacement(std::vector<MachineInstr*> &CPEMIs)
MachineBasicBlock *BB = MF->CreateMachineBasicBlock();
MF->push_back(BB);
- // MachineConstantPool measures alignment in bytes. We measure in log2(bytes).
- unsigned MaxAlign = Log2_32(MCP->getConstantPoolAlignment());
+ // MachineConstantPool measures alignment in bytes.
+ const Align MaxAlign(MCP->getConstantPoolAlignment());
+ const unsigned MaxLogAlign = Log2(MaxAlign);
// Mark the basic block as required by the const-pool.
BB->setAlignment(MaxAlign);
@@ -501,7 +508,8 @@ ARMConstantIslands::doInitialConstPlacement(std::vector<MachineInstr*> &CPEMIs)
// alignment of all entries as long as BB is sufficiently aligned. Keep
// track of the insertion point for each alignment. We are going to bucket
// sort the entries as they are created.
- SmallVector<MachineBasicBlock::iterator, 8> InsPoint(MaxAlign + 1, BB->end());
+ SmallVector<MachineBasicBlock::iterator, 8> InsPoint(MaxLogAlign + 1,
+ BB->end());
// Add all of the constants from the constant pool to the end block, use an
// identity mapping of CPI's to CPE's.
@@ -526,7 +534,7 @@ ARMConstantIslands::doInitialConstPlacement(std::vector<MachineInstr*> &CPEMIs)
// Ensure that future entries with higher alignment get inserted before
// CPEMI. This is bucket sort with iterators.
- for (unsigned a = LogAlign + 1; a <= MaxAlign; ++a)
+ for (unsigned a = LogAlign + 1; a <= MaxLogAlign; ++a)
if (InsPoint[a] == InsAt)
InsPoint[a] = CPEMI;
@@ -640,29 +648,27 @@ ARMConstantIslands::findConstPoolEntry(unsigned CPI,
return nullptr;
}
-/// getCPELogAlign - Returns the required alignment of the constant pool entry
-/// represented by CPEMI. Alignment is measured in log2(bytes) units.
-unsigned ARMConstantIslands::getCPELogAlign(const MachineInstr *CPEMI) {
+/// getCPEAlign - Returns the required alignment of the constant pool entry
+/// represented by CPEMI.
+Align ARMConstantIslands::getCPEAlign(const MachineInstr *CPEMI) {
switch (CPEMI->getOpcode()) {
case ARM::CONSTPOOL_ENTRY:
break;
case ARM::JUMPTABLE_TBB:
- return isThumb1 ? 2 : 0;
+ return isThumb1 ? Align(4) : Align(1);
case ARM::JUMPTABLE_TBH:
- return isThumb1 ? 2 : 1;
+ return isThumb1 ? Align(4) : Align(2);
case ARM::JUMPTABLE_INSTS:
- return 1;
+ return Align(2);
case ARM::JUMPTABLE_ADDRS:
- return 2;
+ return Align(4);
default:
llvm_unreachable("unknown constpool entry kind");
}
unsigned CPI = getCombinedIndex(CPEMI);
assert(CPI < MCP->getConstants().size() && "Invalid constant pool index.");
- unsigned Align = MCP->getConstants()[CPI].getAlignment();
- assert(isPowerOf2_32(Align) && "Invalid CPE alignment");
- return Log2_32(Align);
+ return Align(MCP->getConstants()[CPI].getAlignment());
}
/// scanFunctionJumpTables - Do a scan of the function, building up
@@ -687,7 +693,7 @@ initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) {
BBInfoVector &BBInfo = BBUtils->getBBInfo();
// The known bits of the entry block offset are determined by the function
// alignment.
- BBInfo.front().KnownBits = MF->getAlignment();
+ BBInfo.front().KnownBits = Log2(MF->getAlignment());
// Compute block offsets and known bits.
BBUtils->adjustBBOffsetsAfter(&MF->front());
@@ -824,11 +830,6 @@ initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) {
Scale = 2; // +-(offset_8*2)
NegOk = true;
break;
-
- case ARM::tLDRHi:
- Bits = 5;
- Scale = 2; // +(offset_5*2)
- break;
}
// Remember that this is a user of a CP entry.
@@ -885,6 +886,13 @@ void ARMConstantIslands::updateForInsertedWaterBlock(MachineBasicBlock *NewBB) {
MachineBasicBlock *ARMConstantIslands::splitBlockBeforeInstr(MachineInstr *MI) {
MachineBasicBlock *OrigBB = MI->getParent();
+ // Collect liveness information at MI.
+ LivePhysRegs LRs(*MF->getSubtarget().getRegisterInfo());
+ LRs.addLiveOuts(*OrigBB);
+ auto LivenessEnd = ++MachineBasicBlock::iterator(MI).getReverse();
+ for (MachineInstr &LiveMI : make_range(OrigBB->rbegin(), LivenessEnd))
+ LRs.stepBackward(LiveMI);
+
// Create a new MBB for the code after the OrigBB.
MachineBasicBlock *NewBB =
MF->CreateMachineBasicBlock(OrigBB->getBasicBlock());
@@ -913,6 +921,12 @@ MachineBasicBlock *ARMConstantIslands::splitBlockBeforeInstr(MachineInstr *MI) {
// OrigBB branches to NewBB.
OrigBB->addSuccessor(NewBB);
+ // Update live-in information in the new block.
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ for (MCPhysReg L : LRs)
+ if (!MRI.isReserved(L))
+ NewBB->addLiveIn(L);
+
// Update internal data structures to account for the newly inserted MBB.
// This is almost the same as updateForInsertedWaterBlock, except that
// the Water goes after OrigBB, not NewBB.
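The liveness bookkeeping added by these two hunks is the standard backward LivePhysRegs walk; an annotated restatement (same names as the patch, with the surrounding function context assumed):

    // Seed with the registers live out of the block being split...
    llvm::LivePhysRegs LRs(*MF->getSubtarget().getRegisterInfo());
    LRs.addLiveOuts(*OrigBB);
    // ...then step backward from the last instruction down to and including
    // MI. getReverse() points at MI itself, so the pre-increment makes the
    // reverse range stop right after MI is processed. LRs then holds exactly
    // the registers live immediately before MI -- the live-ins (reserved
    // registers excluded) of the new block that will start at MI.
    auto End = ++llvm::MachineBasicBlock::iterator(MI).getReverse();
    for (llvm::MachineInstr &LiveMI : llvm::make_range(OrigBB->rbegin(), End))
      LRs.stepBackward(LiveMI);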
@@ -1007,13 +1021,13 @@ bool ARMConstantIslands::isWaterInRange(unsigned UserOffset,
MachineBasicBlock* Water, CPUser &U,
unsigned &Growth) {
BBInfoVector &BBInfo = BBUtils->getBBInfo();
- unsigned CPELogAlign = getCPELogAlign(U.CPEMI);
- unsigned CPEOffset = BBInfo[Water->getNumber()].postOffset(CPELogAlign);
- unsigned NextBlockOffset, NextBlockAlignment;
+ const Align CPEAlign = getCPEAlign(U.CPEMI);
+ const unsigned CPEOffset = BBInfo[Water->getNumber()].postOffset(CPEAlign);
+ unsigned NextBlockOffset;
+ Align NextBlockAlignment;
MachineFunction::const_iterator NextBlock = Water->getIterator();
if (++NextBlock == MF->end()) {
NextBlockOffset = BBInfo[Water->getNumber()].postOffset();
- NextBlockAlignment = 0;
} else {
NextBlockOffset = BBInfo[NextBlock->getNumber()].Offset;
NextBlockAlignment = NextBlock->getAlignment();
@@ -1028,13 +1042,13 @@ bool ARMConstantIslands::isWaterInRange(unsigned UserOffset,
Growth = CPEEnd - NextBlockOffset;
// Compute the padding that would go at the end of the CPE to align the next
// block.
- Growth += OffsetToAlignment(CPEEnd, 1ULL << NextBlockAlignment);
+ Growth += offsetToAlignment(CPEEnd, NextBlockAlignment);
// If the CPE is to be inserted before the instruction, that will raise
// the offset of the instruction. Also account for unknown alignment padding
// in blocks between CPE and the user.
if (CPEOffset < UserOffset)
- UserOffset += Growth + UnknownPadding(MF->getAlignment(), CPELogAlign);
+ UserOffset += Growth + UnknownPadding(MF->getAlignment(), Log2(CPEAlign));
} else
// CPE fits in existing padding.
Growth = 0;
@@ -1200,8 +1214,8 @@ bool ARMConstantIslands::findAvailableWater(CPUser &U, unsigned UserOffset,
// inserting islands between BB0 and BB1 makes other accesses out of range.
MachineBasicBlock *UserBB = U.MI->getParent();
BBInfoVector &BBInfo = BBUtils->getBBInfo();
- unsigned MinNoSplitDisp =
- BBInfo[UserBB->getNumber()].postOffset(getCPELogAlign(U.CPEMI));
+ const Align CPEAlign = getCPEAlign(U.CPEMI);
+ unsigned MinNoSplitDisp = BBInfo[UserBB->getNumber()].postOffset(CPEAlign);
if (CloserWater && MinNoSplitDisp > U.getMaxDisp() / 2)
return false;
for (water_iterator IP = std::prev(WaterList.end()), B = WaterList.begin();;
@@ -1254,7 +1268,7 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex,
CPUser &U = CPUsers[CPUserIndex];
MachineInstr *UserMI = U.MI;
MachineInstr *CPEMI = U.CPEMI;
- unsigned CPELogAlign = getCPELogAlign(CPEMI);
+ const Align CPEAlign = getCPEAlign(CPEMI);
MachineBasicBlock *UserMBB = UserMI->getParent();
BBInfoVector &BBInfo = BBUtils->getBBInfo();
const BasicBlockInfo &UserBBI = BBInfo[UserMBB->getNumber()];
@@ -1267,7 +1281,7 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex,
// Size of branch to insert.
unsigned Delta = isThumb1 ? 2 : 4;
// Compute the offset where the CPE will begin.
- unsigned CPEOffset = UserBBI.postOffset(CPELogAlign) + Delta;
+ unsigned CPEOffset = UserBBI.postOffset(CPEAlign) + Delta;
if (isOffsetInRange(UserOffset, CPEOffset, U)) {
LLVM_DEBUG(dbgs() << "Split at end of " << printMBBReference(*UserMBB)
@@ -1308,11 +1322,11 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex,
// Try to split the block so it's fully aligned. Compute the latest split
// point where we can add a 4-byte branch instruction, and then align to
- // LogAlign which is the largest possible alignment in the function.
- unsigned LogAlign = MF->getAlignment();
- assert(LogAlign >= CPELogAlign && "Over-aligned constant pool entry");
+ // Align which is the largest possible alignment in the function.
+ const Align Align = MF->getAlignment();
+ assert(Align >= CPEAlign && "Over-aligned constant pool entry");
unsigned KnownBits = UserBBI.internalKnownBits();
- unsigned UPad = UnknownPadding(LogAlign, KnownBits);
+ unsigned UPad = UnknownPadding(Align, KnownBits);
unsigned BaseInsertOffset = UserOffset + U.getMaxDisp() - UPad;
LLVM_DEBUG(dbgs() << format("Split in middle of big block before %#x",
BaseInsertOffset));
@@ -1323,7 +1337,7 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex,
BaseInsertOffset -= 4;
LLVM_DEBUG(dbgs() << format(", adjusted to %#x", BaseInsertOffset)
- << " la=" << LogAlign << " kb=" << KnownBits
+ << " la=" << Log2(Align) << " kb=" << KnownBits
<< " up=" << UPad << '\n');
// This could point off the end of the block if we've already got constant
@@ -1337,6 +1351,28 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex,
BaseInsertOffset =
std::max(UserBBI.postOffset() - UPad - 8,
UserOffset + TII->getInstSizeInBytes(*UserMI) + 1);
+      // If the CP entry referenced by UserOffset is within the first four
+      // instructions after an IT, the recalculated BaseInsertOffset could
+      // land in the middle of an IT block. If it does, move BaseInsertOffset
+      // to just after the IT block. The CP entry is still in range because
+      // of the following reasons:
+      // 1. The initial BaseInsertOffset calculated is (UserOffset +
+      // U.getMaxDisp() - UPad).
+      // 2. An IT block is at most 4 instructions plus the "it" itself (18
+      // bytes).
+      // 3. All the relevant instructions support a much larger maximum
+      // displacement.
+ MachineBasicBlock::iterator I = UserMI;
+ ++I;
+ for (unsigned Offset = UserOffset + TII->getInstSizeInBytes(*UserMI),
+ PredReg = 0;
+ I->getOpcode() != ARM::t2IT &&
+ getITInstrPredicate(*I, PredReg) != ARMCC::AL;
+ Offset += TII->getInstSizeInBytes(*I), I = std::next(I)) {
+ BaseInsertOffset =
+ std::max(BaseInsertOffset, Offset + TII->getInstSizeInBytes(*I) + 1);
+ assert(I != UserMBB->end() && "Fell off end of block");
+ }
LLVM_DEBUG(dbgs() << format("Move inside block: %#x\n", BaseInsertOffset));
}
unsigned EndInsertOffset = BaseInsertOffset + 4 + UPad +
@@ -1354,8 +1390,8 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex,
CPUser &U = CPUsers[CPUIndex];
if (!isOffsetInRange(Offset, EndInsertOffset, U)) {
      // Shift the insertion point by one unit of alignment so it is within reach.
- BaseInsertOffset -= 1u << LogAlign;
- EndInsertOffset -= 1u << LogAlign;
+ BaseInsertOffset -= Align.value();
+ EndInsertOffset -= Align.value();
}
// This is overly conservative, as we don't account for CPEMIs being
// reused within the block, but it doesn't matter much. Also assume CPEs
@@ -1397,9 +1433,10 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex,
}
// We really must not split an IT block.
- LLVM_DEBUG(unsigned PredReg; assert(
- !isThumb || getITInstrPredicate(*MI, PredReg) == ARMCC::AL));
-
+#ifndef NDEBUG
+ unsigned PredReg;
+ assert(!isThumb || getITInstrPredicate(*MI, PredReg) == ARMCC::AL);
+#endif
NewMBB = splitBlockBeforeInstr(&*MI);
}
@@ -1464,9 +1501,9 @@ bool ARMConstantIslands::handleConstantPoolUser(unsigned CPUserIndex,
// Always align the new block because CP entries can be smaller than 4
// bytes. Be careful not to decrease the existing alignment, e.g. NewMBB may
// be an already aligned constant pool block.
- const unsigned Align = isThumb ? 1 : 2;
- if (NewMBB->getAlignment() < Align)
- NewMBB->setAlignment(Align);
+ const Align Alignment = isThumb ? Align(2) : Align(4);
+ if (NewMBB->getAlignment() < Alignment)
+ NewMBB->setAlignment(Alignment);
// Remove the original WaterList entry; we want subsequent insertions in
// this vicinity to go after the one we're about to insert. This
@@ -1495,7 +1532,7 @@ bool ARMConstantIslands::handleConstantPoolUser(unsigned CPUserIndex,
decrementCPEReferenceCount(CPI, CPEMI);
// Mark the basic block as aligned as required by the const-pool entry.
- NewIsland->setAlignment(getCPELogAlign(U.CPEMI));
+ NewIsland->setAlignment(getCPEAlign(U.CPEMI));
// Increase the size of the island block to account for the new entry.
BBUtils->adjustBBSize(NewIsland, Size);
@@ -1529,10 +1566,11 @@ void ARMConstantIslands::removeDeadCPEMI(MachineInstr *CPEMI) {
BBInfo[CPEBB->getNumber()].Size = 0;
// This block no longer needs to be aligned.
- CPEBB->setAlignment(0);
- } else
+ CPEBB->setAlignment(Align::None());
+ } else {
// Entries are sorted by descending alignment, so realign from the front.
- CPEBB->setAlignment(getCPELogAlign(&*CPEBB->begin()));
+ CPEBB->setAlignment(getCPEAlign(&*CPEBB->begin()));
+ }
BBUtils->adjustBBOffsetsAfter(CPEBB);
// An island has only one predecessor BB and one successor BB. Check if
@@ -1620,7 +1658,7 @@ ARMConstantIslands::fixupConditionalBr(ImmBranch &Br) {
// L2:
ARMCC::CondCodes CC = (ARMCC::CondCodes)MI->getOperand(1).getImm();
CC = ARMCC::getOppositeCondition(CC);
- unsigned CCReg = MI->getOperand(2).getReg();
+ Register CCReg = MI->getOperand(2).getReg();
// If the branch is at the end of its MBB and that has a fall-through block,
// direct the updated conditional branch to the fall-through block. Otherwise,
@@ -1778,16 +1816,10 @@ bool ARMConstantIslands::optimizeThumb2Instructions() {
return MadeChange;
}
+
bool ARMConstantIslands::optimizeThumb2Branches() {
- bool MadeChange = false;
- // The order in which branches appear in ImmBranches is approximately their
- // order within the function body. By visiting later branches first, we reduce
- // the distance between earlier forward branches and their targets, making it
- // more likely that the cbn?z optimization, which can only apply to forward
- // branches, will succeed.
- for (unsigned i = ImmBranches.size(); i != 0; --i) {
- ImmBranch &Br = ImmBranches[i-1];
+ auto TryShrinkBranch = [this](ImmBranch &Br) {
unsigned Opcode = Br.MI->getOpcode();
unsigned NewOpc = 0;
unsigned Scale = 1;
@@ -1815,47 +1847,115 @@ bool ARMConstantIslands::optimizeThumb2Branches() {
BBUtils->adjustBBSize(MBB, -2);
BBUtils->adjustBBOffsetsAfter(MBB);
++NumT2BrShrunk;
- MadeChange = true;
+ return true;
}
}
+ return false;
+ };
- Opcode = Br.MI->getOpcode();
- if (Opcode != ARM::tBcc)
- continue;
+ struct ImmCompare {
+ MachineInstr* MI = nullptr;
+ unsigned NewOpc = 0;
+ };
+
+ auto FindCmpForCBZ = [this](ImmBranch &Br, ImmCompare &ImmCmp,
+ MachineBasicBlock *DestBB) {
+ ImmCmp.MI = nullptr;
+ ImmCmp.NewOpc = 0;
// If the conditional branch doesn't kill CPSR, then CPSR can be liveout
// so this transformation is not safe.
if (!Br.MI->killsRegister(ARM::CPSR))
- continue;
+ return false;
- NewOpc = 0;
unsigned PredReg = 0;
+ unsigned NewOpc = 0;
ARMCC::CondCodes Pred = getInstrPredicate(*Br.MI, PredReg);
if (Pred == ARMCC::EQ)
NewOpc = ARM::tCBZ;
else if (Pred == ARMCC::NE)
NewOpc = ARM::tCBNZ;
- if (!NewOpc)
- continue;
- MachineBasicBlock *DestBB = Br.MI->getOperand(0).getMBB();
+ else
+ return false;
+
    // Check that the distance is within 126 bytes. Subtract 2 from the
    // starting offset because the cmp will be eliminated.
unsigned BrOffset = BBUtils->getOffsetOf(Br.MI) + 4 - 2;
BBInfoVector &BBInfo = BBUtils->getBBInfo();
unsigned DestOffset = BBInfo[DestBB->getNumber()].Offset;
if (BrOffset >= DestOffset || (DestOffset - BrOffset) > 126)
- continue;
+ return false;
// Search backwards to find a tCMPi8
auto *TRI = STI->getRegisterInfo();
MachineInstr *CmpMI = findCMPToFoldIntoCBZ(Br.MI, TRI);
if (!CmpMI || CmpMI->getOpcode() != ARM::tCMPi8)
+ return false;
+
+ ImmCmp.MI = CmpMI;
+ ImmCmp.NewOpc = NewOpc;
+ return true;
+ };
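The 126-byte limit reflects the Thumb CB(N)Z encoding, whose i:imm5 immediate scaled by 2 reaches only 0 to 126 bytes forward. A small check mirroring the arithmetic above (the offsets are illustrative):

    #include <cassert>

    void cbzRangeExample() {
      // Branch offset + 4 for the Thumb PC read-ahead, - 2 because removing
      // the cmp pulls all later code 2 bytes closer.
      unsigned BrOffset = 0x1000 + 4 - 2;
      unsigned DestOffset = 0x1070; // hypothetical forward target
      assert(BrOffset < DestOffset && (DestOffset - BrOffset) <= 126);
    }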
+
+ auto TryConvertToLE = [this](ImmBranch &Br, ImmCompare &Cmp) {
+ if (Br.MI->getOpcode() != ARM::t2Bcc || !STI->hasLOB() ||
+ STI->hasMinSize())
+ return false;
+
+ MachineBasicBlock *MBB = Br.MI->getParent();
+ MachineBasicBlock *DestBB = Br.MI->getOperand(0).getMBB();
+ if (BBUtils->getOffsetOf(MBB) < BBUtils->getOffsetOf(DestBB) ||
+ !BBUtils->isBBInRange(Br.MI, DestBB, 4094))
+ return false;
+
+ if (!DT->dominates(DestBB, MBB))
+ return false;
+
+ // We queried for the CBN?Z opcode based upon the 'ExitBB', the opposite
+ // target of Br. So now we need to reverse the condition.
+ Cmp.NewOpc = Cmp.NewOpc == ARM::tCBZ ? ARM::tCBNZ : ARM::tCBZ;
+
+ MachineInstrBuilder MIB = BuildMI(*MBB, Br.MI, Br.MI->getDebugLoc(),
+ TII->get(ARM::t2LE));
+ MIB.add(Br.MI->getOperand(0));
+ Br.MI->eraseFromParent();
+ Br.MI = MIB;
+ ++NumLEInserted;
+ return true;
+ };
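Combined with the rewrite loop below, a successful conversion turns a conditional backward branch into a low-overhead loop end; schematically, in ARM assembly (registers and labels are ours):

    @ before:                     @ after:
    @   cmp   r0, #0              @   cbnz  r0, exit   ; condition inverted
    @   beq.w loop_head           @   le    loop_head  ; t2LE backward branch
    @ exit:                       @ exit: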
+
+ bool MadeChange = false;
+
+ // The order in which branches appear in ImmBranches is approximately their
+ // order within the function body. By visiting later branches first, we reduce
+ // the distance between earlier forward branches and their targets, making it
+ // more likely that the cbn?z optimization, which can only apply to forward
+ // branches, will succeed.
+ for (ImmBranch &Br : reverse(ImmBranches)) {
+ MachineBasicBlock *DestBB = Br.MI->getOperand(0).getMBB();
+ MachineBasicBlock *MBB = Br.MI->getParent();
+ MachineBasicBlock *ExitBB = &MBB->back() == Br.MI ?
+ MBB->getFallThrough() :
+ MBB->back().getOperand(0).getMBB();
+
+ ImmCompare Cmp;
+ if (FindCmpForCBZ(Br, Cmp, ExitBB) && TryConvertToLE(Br, Cmp)) {
+ DestBB = ExitBB;
+ MadeChange = true;
+ } else {
+ FindCmpForCBZ(Br, Cmp, DestBB);
+ MadeChange |= TryShrinkBranch(Br);
+ }
+
+ unsigned Opcode = Br.MI->getOpcode();
+ if ((Opcode != ARM::tBcc && Opcode != ARM::t2LE) || !Cmp.NewOpc)
continue;
- unsigned Reg = CmpMI->getOperand(0).getReg();
+ Register Reg = Cmp.MI->getOperand(0).getReg();
// Check for Kill flags on Reg. If they are present remove them and set kill
// on the new CBZ.
+ auto *TRI = STI->getRegisterInfo();
MachineBasicBlock::iterator KillMI = Br.MI;
bool RegKilled = false;
do {
@@ -1865,19 +1965,32 @@ bool ARMConstantIslands::optimizeThumb2Branches() {
RegKilled = true;
break;
}
- } while (KillMI != CmpMI);
+ } while (KillMI != Cmp.MI);
// Create the new CBZ/CBNZ
- MachineBasicBlock *MBB = Br.MI->getParent();
- LLVM_DEBUG(dbgs() << "Fold: " << *CmpMI << " and: " << *Br.MI);
+ LLVM_DEBUG(dbgs() << "Fold: " << *Cmp.MI << " and: " << *Br.MI);
MachineInstr *NewBR =
- BuildMI(*MBB, Br.MI, Br.MI->getDebugLoc(), TII->get(NewOpc))
+ BuildMI(*MBB, Br.MI, Br.MI->getDebugLoc(), TII->get(Cmp.NewOpc))
.addReg(Reg, getKillRegState(RegKilled))
.addMBB(DestBB, Br.MI->getOperand(0).getTargetFlags());
- CmpMI->eraseFromParent();
- Br.MI->eraseFromParent();
- Br.MI = NewBR;
+
+ Cmp.MI->eraseFromParent();
+ BBInfoVector &BBInfo = BBUtils->getBBInfo();
BBInfo[MBB->getNumber()].Size -= 2;
+
+ if (Br.MI->getOpcode() == ARM::tBcc) {
+ Br.MI->eraseFromParent();
+ Br.MI = NewBR;
+ } else if (&MBB->back() != Br.MI) {
+ // We've generated an LE and already erased the original conditional
+ // branch. The CBN?Z is now used to branch to the other successor, so an
+ // unconditional branch terminator is now redundant.
+ MachineInstr *LastMI = &MBB->back();
+ if (LastMI != Br.MI) {
+ BBInfo[MBB->getNumber()].Size -= LastMI->getDesc().getSize();
+ LastMI->eraseFromParent();
+ }
+ }
BBUtils->adjustBBOffsetsAfter(MBB);
++NumCBZ;
MadeChange = true;
@@ -1931,8 +2044,8 @@ bool ARMConstantIslands::preserveBaseRegister(MachineInstr *JumpMI,
// of BaseReg, but only if the t2ADDrs can be removed.
// + Some instruction other than t2ADDrs computing the entry. Not seen in
// the wild, but we should be careful.
- unsigned EntryReg = JumpMI->getOperand(0).getReg();
- unsigned BaseReg = LEAMI->getOperand(0).getReg();
+ Register EntryReg = JumpMI->getOperand(0).getReg();
+ Register BaseReg = LEAMI->getOperand(0).getReg();
CanDeleteLEA = true;
BaseRegKill = false;
@@ -2009,7 +2122,7 @@ static void RemoveDeadAddBetweenLEAAndJT(MachineInstr *LEAMI,
// but the JT now uses PC. Finds the last ADD (if any) that def's EntryReg
// and is not clobbered / used.
MachineInstr *RemovableAdd = nullptr;
- unsigned EntryReg = JumpMI->getOperand(0).getReg();
+ Register EntryReg = JumpMI->getOperand(0).getReg();
// Find the last ADD to set EntryReg
MachineBasicBlock::iterator I(LEAMI);
@@ -2106,7 +2219,7 @@ bool ARMConstantIslands::optimizeThumb2JumpTables() {
// %idx = tLSLri %idx, 2
// %base = tLEApcrelJT
// %t = tLDRr %base, %idx
- unsigned BaseReg = User.MI->getOperand(0).getReg();
+ Register BaseReg = User.MI->getOperand(0).getReg();
if (User.MI->getIterator() == User.MI->getParent()->begin())
continue;
@@ -2116,7 +2229,7 @@ bool ARMConstantIslands::optimizeThumb2JumpTables() {
!Shift->getOperand(2).isKill())
continue;
IdxReg = Shift->getOperand(2).getReg();
- unsigned ShiftedIdxReg = Shift->getOperand(0).getReg();
+ Register ShiftedIdxReg = Shift->getOperand(0).getReg();
// It's important that IdxReg is live until the actual TBB/TBH. Most of
// the range is checked later, but the LEA might still clobber it and not
@@ -2313,6 +2426,10 @@ adjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB) {
MachineFunction::iterator MBBI = ++JTBB->getIterator();
MF->insert(MBBI, NewBB);
+ // Copy live-in information to new block.
+ for (const MachineBasicBlock::RegisterMaskPair &RegMaskPair : BB->liveins())
+ NewBB->addLiveIn(RegMaskPair);
+
// Add an unconditional branch from NewBB to BB.
// There doesn't seem to be meaningful DebugInfo available; this doesn't
// correspond directly to anything in the source.
diff --git a/lib/Target/ARM/ARMConstantPoolValue.cpp b/lib/Target/ARM/ARMConstantPoolValue.cpp
index 3bdb0e1ef62d..72c95f441265 100644
--- a/lib/Target/ARM/ARMConstantPoolValue.cpp
+++ b/lib/Target/ARM/ARMConstantPoolValue.cpp
@@ -17,6 +17,7 @@
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Compiler.h"
diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index b32ba3eeea18..563fdda56104 100644
--- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -481,7 +481,7 @@ void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI) {
unsigned OpIdx = 0;
bool DstIsDead = MI.getOperand(OpIdx).isDead();
- unsigned DstReg = MI.getOperand(OpIdx++).getReg();
+ Register DstReg = MI.getOperand(OpIdx++).getReg();
if(TableEntry->RealOpc == ARM::VLD2DUPd8x2 ||
TableEntry->RealOpc == ARM::VLD2DUPd16x2 ||
TableEntry->RealOpc == ARM::VLD2DUPd32x2) {
@@ -492,7 +492,7 @@ void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI) {
assert(RegSpc == OddDblSpc && "Unexpected spacing!");
SubRegIndex = ARM::dsub_1;
}
- unsigned SubReg = TRI->getSubReg(DstReg, SubRegIndex);
+ Register SubReg = TRI->getSubReg(DstReg, SubRegIndex);
unsigned DstRegPair = TRI->getMatchingSuperReg(SubReg, ARM::dsub_0,
&ARM::DPairSpcRegClass);
MIB.addReg(DstRegPair, RegState::Define | getDeadRegState(DstIsDead));
@@ -624,7 +624,7 @@ void ARMExpandPseudo::ExpandVST(MachineBasicBlock::iterator &MBBI) {
bool SrcIsKill = MI.getOperand(OpIdx).isKill();
bool SrcIsUndef = MI.getOperand(OpIdx).isUndef();
- unsigned SrcReg = MI.getOperand(OpIdx++).getReg();
+ Register SrcReg = MI.getOperand(OpIdx++).getReg();
unsigned D0, D1, D2, D3;
GetDSubRegs(SrcReg, RegSpc, TRI, D0, D1, D2, D3);
MIB.addReg(D0, getUndefRegState(SrcIsUndef));
@@ -760,7 +760,7 @@ void ARMExpandPseudo::ExpandVTBL(MachineBasicBlock::iterator &MBBI,
}
bool SrcIsKill = MI.getOperand(OpIdx).isKill();
- unsigned SrcReg = MI.getOperand(OpIdx++).getReg();
+ Register SrcReg = MI.getOperand(OpIdx++).getReg();
unsigned D0, D1, D2, D3;
GetDSubRegs(SrcReg, SingleSpc, TRI, D0, D1, D2, D3);
MIB.addReg(D0);
@@ -789,6 +789,7 @@ static bool IsAnAddressOperand(const MachineOperand &MO) {
case MachineOperand::MO_Immediate:
case MachineOperand::MO_CImmediate:
case MachineOperand::MO_FPImmediate:
+ case MachineOperand::MO_ShuffleMask:
return false;
case MachineOperand::MO_MachineBasicBlock:
return true;
@@ -828,7 +829,7 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB,
unsigned Opcode = MI.getOpcode();
unsigned PredReg = 0;
ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
- unsigned DstReg = MI.getOperand(0).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
bool DstIsDead = MI.getOperand(0).isDead();
bool isCC = Opcode == ARM::MOVCCi32imm || Opcode == ARM::t2MOVCCi32imm;
const MachineOperand &MO = MI.getOperand(isCC ? 2 : 1);
@@ -932,13 +933,13 @@ bool ARMExpandPseudo::ExpandCMP_SWAP(MachineBasicBlock &MBB,
MachineInstr &MI = *MBBI;
DebugLoc DL = MI.getDebugLoc();
const MachineOperand &Dest = MI.getOperand(0);
- unsigned TempReg = MI.getOperand(1).getReg();
+ Register TempReg = MI.getOperand(1).getReg();
// Duplicating undef operands into 2 instructions does not guarantee the same
  // value on both; however, undef should be replaced by xzr anyway.
assert(!MI.getOperand(2).isUndef() && "cannot handle undef");
- unsigned AddrReg = MI.getOperand(2).getReg();
- unsigned DesiredReg = MI.getOperand(3).getReg();
- unsigned NewReg = MI.getOperand(4).getReg();
+ Register AddrReg = MI.getOperand(2).getReg();
+ Register DesiredReg = MI.getOperand(3).getReg();
+ Register NewReg = MI.getOperand(4).getReg();
MachineFunction *MF = MBB.getParent();
auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
@@ -1035,8 +1036,8 @@ static void addExclusiveRegPair(MachineInstrBuilder &MIB, MachineOperand &Reg,
unsigned Flags, bool IsThumb,
const TargetRegisterInfo *TRI) {
if (IsThumb) {
- unsigned RegLo = TRI->getSubReg(Reg.getReg(), ARM::gsub_0);
- unsigned RegHi = TRI->getSubReg(Reg.getReg(), ARM::gsub_1);
+ Register RegLo = TRI->getSubReg(Reg.getReg(), ARM::gsub_0);
+ Register RegHi = TRI->getSubReg(Reg.getReg(), ARM::gsub_1);
MIB.addReg(RegLo, Flags);
MIB.addReg(RegHi, Flags);
} else
@@ -1051,19 +1052,19 @@ bool ARMExpandPseudo::ExpandCMP_SWAP_64(MachineBasicBlock &MBB,
MachineInstr &MI = *MBBI;
DebugLoc DL = MI.getDebugLoc();
MachineOperand &Dest = MI.getOperand(0);
- unsigned TempReg = MI.getOperand(1).getReg();
+ Register TempReg = MI.getOperand(1).getReg();
// Duplicating undef operands into 2 instructions does not guarantee the same
  // value on both; however, undef should be replaced by xzr anyway.
assert(!MI.getOperand(2).isUndef() && "cannot handle undef");
- unsigned AddrReg = MI.getOperand(2).getReg();
- unsigned DesiredReg = MI.getOperand(3).getReg();
+ Register AddrReg = MI.getOperand(2).getReg();
+ Register DesiredReg = MI.getOperand(3).getReg();
MachineOperand New = MI.getOperand(4);
New.setIsKill(false);
- unsigned DestLo = TRI->getSubReg(Dest.getReg(), ARM::gsub_0);
- unsigned DestHi = TRI->getSubReg(Dest.getReg(), ARM::gsub_1);
- unsigned DesiredLo = TRI->getSubReg(DesiredReg, ARM::gsub_0);
- unsigned DesiredHi = TRI->getSubReg(DesiredReg, ARM::gsub_1);
+ Register DestLo = TRI->getSubReg(Dest.getReg(), ARM::gsub_0);
+ Register DestHi = TRI->getSubReg(Dest.getReg(), ARM::gsub_1);
+ Register DesiredLo = TRI->getSubReg(DesiredReg, ARM::gsub_0);
+ Register DesiredHi = TRI->getSubReg(DesiredReg, ARM::gsub_1);
MachineFunction *MF = MBB.getParent();
auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
@@ -1204,8 +1205,11 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
for (unsigned i = 1, e = MBBI->getNumOperands(); i != e; ++i)
NewMI->addOperand(MBBI->getOperand(i));
- // Delete the pseudo instruction TCRETURN.
+
+ // Update call site info and delete the pseudo instruction TCRETURN.
+ MBB.getParent()->moveCallSiteInfo(&MI, &*NewMI);
MBB.erase(MBBI);
+
MBBI = NewMI;
return true;
}
@@ -1336,7 +1340,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
// for us. Otherwise, expand to nothing.
if (RI.hasBasePointer(MF)) {
int32_t NumBytes = AFI->getFramePtrSpillOffset();
- unsigned FramePtr = RI.getFrameRegister(MF);
+ Register FramePtr = RI.getFrameRegister(MF);
assert(MF.getSubtarget().getFrameLowering()->hasFP(MF) &&
"base pointer without frame pointer?");
@@ -1412,7 +1416,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
MachineConstantPoolValue *CPV =
ARMConstantPoolSymbol::Create(MF->getFunction().getContext(),
"__aeabi_read_tp", PCLabelID, 0);
- unsigned Reg = MI.getOperand(0).getReg();
+ Register Reg = MI.getOperand(0).getReg();
MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(),
TII->get(Thumb ? ARM::tLDRpci : ARM::LDRi12), Reg)
.addConstantPoolIndex(MCP->getConstantPoolIndex(CPV, 4));
@@ -1435,6 +1439,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
MIB.cloneMemRefs(MI);
TransferImpOps(MI, MIB, MIB);
+ MI.getMF()->moveCallSiteInfo(&MI, &*MIB);
MI.eraseFromParent();
return true;
}
@@ -1442,7 +1447,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
case ARM::t2LDRpci_pic: {
unsigned NewLdOpc = (Opcode == ARM::tLDRpci_pic)
? ARM::tLDRpci : ARM::t2LDRpci;
- unsigned DstReg = MI.getOperand(0).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
bool DstIsDead = MI.getOperand(0).isDead();
MachineInstrBuilder MIB1 =
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewLdOpc), DstReg)
@@ -1464,7 +1469,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
case ARM::LDRLIT_ga_pcrel_ldr:
case ARM::tLDRLIT_ga_abs:
case ARM::tLDRLIT_ga_pcrel: {
- unsigned DstReg = MI.getOperand(0).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
bool DstIsDead = MI.getOperand(0).isDead();
const MachineOperand &MO1 = MI.getOperand(1);
auto Flags = MO1.getTargetFlags();
@@ -1522,7 +1527,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
case ARM::t2MOV_ga_pcrel: {
// Expand into movw + movw. Also "add pc" / ldr [pc] in PIC mode.
unsigned LabelId = AFI->createPICLabelUId();
- unsigned DstReg = MI.getOperand(0).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
bool DstIsDead = MI.getOperand(0).isDead();
const MachineOperand &MO1 = MI.getOperand(1);
const GlobalValue *GV = MO1.getGlobal();
@@ -1586,7 +1591,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
// Grab the Q register destination.
bool DstIsDead = MI.getOperand(OpIdx).isDead();
- unsigned DstReg = MI.getOperand(OpIdx++).getReg();
+ Register DstReg = MI.getOperand(OpIdx++).getReg();
// Copy the source register.
MIB.add(MI.getOperand(OpIdx++));
@@ -1596,8 +1601,8 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
MIB.add(MI.getOperand(OpIdx++));
// Add the destination operands (D subregs).
- unsigned D0 = TRI->getSubReg(DstReg, ARM::dsub_0);
- unsigned D1 = TRI->getSubReg(DstReg, ARM::dsub_1);
+ Register D0 = TRI->getSubReg(DstReg, ARM::dsub_0);
+ Register D1 = TRI->getSubReg(DstReg, ARM::dsub_1);
MIB.addReg(D0, RegState::Define | getDeadRegState(DstIsDead))
.addReg(D1, RegState::Define | getDeadRegState(DstIsDead));
@@ -1617,7 +1622,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
// Grab the Q register source.
bool SrcIsKill = MI.getOperand(OpIdx).isKill();
- unsigned SrcReg = MI.getOperand(OpIdx++).getReg();
+ Register SrcReg = MI.getOperand(OpIdx++).getReg();
// Copy the destination register.
MachineOperand Dst(MI.getOperand(OpIdx++));
@@ -1628,8 +1633,8 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
MIB.add(MI.getOperand(OpIdx++));
// Add the source operands (D subregs).
- unsigned D0 = TRI->getSubReg(SrcReg, ARM::dsub_0);
- unsigned D1 = TRI->getSubReg(SrcReg, ARM::dsub_1);
+ Register D0 = TRI->getSubReg(SrcReg, ARM::dsub_0);
+ Register D1 = TRI->getSubReg(SrcReg, ARM::dsub_1);
MIB.addReg(D0, SrcIsKill ? RegState::Kill : 0)
.addReg(D1, SrcIsKill ? RegState::Kill : 0);
@@ -1915,6 +1920,37 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
case ARM::CMP_SWAP_64:
return ExpandCMP_SWAP_64(MBB, MBBI, NextMBBI);
+
+ case ARM::tBL_PUSHLR:
+ case ARM::BL_PUSHLR: {
+ const bool Thumb = Opcode == ARM::tBL_PUSHLR;
+ Register Reg = MI.getOperand(0).getReg();
+ assert(Reg == ARM::LR && "expect LR register!");
+ MachineInstrBuilder MIB;
+ if (Thumb) {
+ // push {lr}
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::tPUSH))
+ .add(predOps(ARMCC::AL))
+ .addReg(Reg);
+
+ // bl __gnu_mcount_nc
+ MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::tBL));
+ } else {
+ // stmdb sp!, {lr}
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::STMDB_UPD))
+ .addReg(ARM::SP, RegState::Define)
+ .addReg(ARM::SP)
+ .add(predOps(ARMCC::AL))
+ .addReg(Reg);
+
+ // bl __gnu_mcount_nc
+ MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::BL));
+ }
+ MIB.cloneMemRefs(MI);
+  for (unsigned i = 1; i < MI.getNumOperands(); ++i)
+    MIB.add(MI.getOperand(i));
+ MI.eraseFromParent();
+ return true;
+ }
}
}
diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp
index 6e274d269bf2..1fc5ff6921c6 100644
--- a/lib/Target/ARM/ARMFastISel.cpp
+++ b/lib/Target/ARM/ARMFastISel.cpp
@@ -191,8 +191,8 @@ class ARMFastISel final : public FastISel {
bool isTypeLegal(Type *Ty, MVT &VT);
bool isLoadTypeLegal(Type *Ty, MVT &VT);
bool ARMEmitCmp(const Value *Src1Value, const Value *Src2Value,
- bool isZExt, bool isEquality);
- bool ARMEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
+ bool isZExt);
+ bool ARMEmitLoad(MVT VT, Register &ResultReg, Address &Addr,
unsigned Alignment = 0, bool isZExt = true,
bool allocReg = true);
bool ARMEmitStore(MVT VT, unsigned SrcReg, Address &Addr,
@@ -219,15 +219,15 @@ class ARMFastISel final : public FastISel {
bool Return,
bool isVarArg);
bool ProcessCallArgs(SmallVectorImpl<Value*> &Args,
- SmallVectorImpl<unsigned> &ArgRegs,
+ SmallVectorImpl<Register> &ArgRegs,
SmallVectorImpl<MVT> &ArgVTs,
SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags,
- SmallVectorImpl<unsigned> &RegArgs,
+ SmallVectorImpl<Register> &RegArgs,
CallingConv::ID CC,
unsigned &NumBytes,
bool isVarArg);
unsigned getLibcallReg(const Twine &Name);
- bool FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
+ bool FinishCall(MVT RetVT, SmallVectorImpl<Register> &UsedRegs,
const Instruction *I, CallingConv::ID CC,
unsigned &NumBytes, bool isVarArg);
bool ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call);
@@ -301,7 +301,7 @@ ARMFastISel::AddOptionalDefs(const MachineInstrBuilder &MIB) {
unsigned ARMFastISel::fastEmitInst_r(unsigned MachineInstOpcode,
const TargetRegisterClass *RC,
unsigned Op0, bool Op0IsKill) {
- unsigned ResultReg = createResultReg(RC);
+ Register ResultReg = createResultReg(RC);
const MCInstrDesc &II = TII.get(MachineInstOpcode);
// Make sure the input operand is sufficiently constrained to be legal
@@ -913,7 +913,7 @@ void ARMFastISel::AddLoadStoreOperands(MVT VT, Address &Addr,
AddOptionalDefs(MIB);
}
-bool ARMFastISel::ARMEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
+bool ARMFastISel::ARMEmitLoad(MVT VT, Register &ResultReg, Address &Addr,
unsigned Alignment, bool isZExt, bool allocReg) {
unsigned Opc;
bool useAM3 = false;
@@ -1045,7 +1045,7 @@ bool ARMFastISel::SelectLoad(const Instruction *I) {
Address Addr;
if (!ARMComputeAddress(I->getOperand(0), Addr)) return false;
- unsigned ResultReg;
+ Register ResultReg;
if (!ARMEmitLoad(VT, ResultReg, Addr, cast<LoadInst>(I)->getAlignment()))
return false;
updateValueMap(I, ResultReg);
@@ -1259,8 +1259,7 @@ bool ARMFastISel::SelectBranch(const Instruction *I) {
if (ARMPred == ARMCC::AL) return false;
// Emit the compare.
- if (!ARMEmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned(),
- CI->isEquality()))
+ if (!ARMEmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned()))
return false;
unsigned BrOpc = isThumb2 ? ARM::t2Bcc : ARM::Bcc;
@@ -1349,7 +1348,7 @@ bool ARMFastISel::SelectIndirectBr(const Instruction *I) {
}
bool ARMFastISel::ARMEmitCmp(const Value *Src1Value, const Value *Src2Value,
- bool isZExt, bool isEquality) {
+ bool isZExt) {
Type *Ty = Src1Value->getType();
EVT SrcEVT = TLI.getValueType(DL, Ty, true);
if (!SrcEVT.isSimple()) return false;
@@ -1397,19 +1396,11 @@ bool ARMFastISel::ARMEmitCmp(const Value *Src1Value, const Value *Src2Value,
// TODO: Verify compares.
case MVT::f32:
isICmp = false;
- // Equality comparisons shouldn't raise Invalid on uordered inputs.
- if (isEquality)
- CmpOpc = UseImm ? ARM::VCMPZS : ARM::VCMPS;
- else
- CmpOpc = UseImm ? ARM::VCMPEZS : ARM::VCMPES;
+ CmpOpc = UseImm ? ARM::VCMPZS : ARM::VCMPS;
break;
case MVT::f64:
isICmp = false;
- // Equality comparisons shouldn't raise Invalid on uordered inputs.
- if (isEquality)
- CmpOpc = UseImm ? ARM::VCMPZD : ARM::VCMPD;
- else
- CmpOpc = UseImm ? ARM::VCMPEZD : ARM::VCMPED;
+ CmpOpc = UseImm ? ARM::VCMPZD : ARM::VCMPD;
break;
case MVT::i1:
case MVT::i8:
@@ -1485,8 +1476,7 @@ bool ARMFastISel::SelectCmp(const Instruction *I) {
if (ARMPred == ARMCC::AL) return false;
// Emit the compare.
- if (!ARMEmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned(),
- CI->isEquality()))
+ if (!ARMEmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned()))
return false;
// Now set a register based on the comparison. Explicitly set the predicates
@@ -1893,10 +1883,10 @@ CCAssignFn *ARMFastISel::CCAssignFnForCall(CallingConv::ID CC,
}
bool ARMFastISel::ProcessCallArgs(SmallVectorImpl<Value*> &Args,
- SmallVectorImpl<unsigned> &ArgRegs,
+ SmallVectorImpl<Register> &ArgRegs,
SmallVectorImpl<MVT> &ArgVTs,
SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags,
- SmallVectorImpl<unsigned> &RegArgs,
+ SmallVectorImpl<Register> &RegArgs,
CallingConv::ID CC,
unsigned &NumBytes,
bool isVarArg) {
@@ -1960,7 +1950,7 @@ bool ARMFastISel::ProcessCallArgs(SmallVectorImpl<Value*> &Args,
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
const Value *ArgVal = Args[VA.getValNo()];
- unsigned Arg = ArgRegs[VA.getValNo()];
+ Register Arg = ArgRegs[VA.getValNo()];
MVT ArgVT = ArgVTs[VA.getValNo()];
assert((!ArgVT.isVector() && ArgVT.getSizeInBits() <= 64) &&
@@ -2039,7 +2029,7 @@ bool ARMFastISel::ProcessCallArgs(SmallVectorImpl<Value*> &Args,
return true;
}
-bool ARMFastISel::FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
+bool ARMFastISel::FinishCall(MVT RetVT, SmallVectorImpl<Register> &UsedRegs,
const Instruction *I, CallingConv::ID CC,
unsigned &NumBytes, bool isVarArg) {
// Issue CALLSEQ_END
@@ -2060,7 +2050,7 @@ bool ARMFastISel::FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
// double fp reg we want.
MVT DestVT = RVLocs[0].getValVT();
const TargetRegisterClass* DstRC = TLI.getRegClassFor(DestVT);
- unsigned ResultReg = createResultReg(DstRC);
+ Register ResultReg = createResultReg(DstRC);
AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(ARM::VMOVDRR), ResultReg)
.addReg(RVLocs[0].getLocReg())
@@ -2081,7 +2071,7 @@ bool ARMFastISel::FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
const TargetRegisterClass* DstRC = TLI.getRegClassFor(CopyVT);
- unsigned ResultReg = createResultReg(DstRC);
+ Register ResultReg = createResultReg(DstRC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY),
ResultReg).addReg(RVLocs[0].getLocReg());
@@ -2162,7 +2152,7 @@ bool ARMFastISel::SelectRet(const Instruction *I) {
}
// Make the copy.
- unsigned DstReg = VA.getLocReg();
+ Register DstReg = VA.getLocReg();
const TargetRegisterClass* SrcRC = MRI.getRegClass(SrcReg);
// Avoid a cross-class copy. This is very unlikely.
if (!SrcRC->contains(DstReg))
@@ -2231,7 +2221,7 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) {
// Set up the argument vectors.
SmallVector<Value*, 8> Args;
- SmallVector<unsigned, 8> ArgRegs;
+ SmallVector<Register, 8> ArgRegs;
SmallVector<MVT, 8> ArgVTs;
SmallVector<ISD::ArgFlagsTy, 8> ArgFlags;
Args.reserve(I->getNumOperands());
@@ -2247,8 +2237,7 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) {
if (!isTypeLegal(ArgTy, ArgVT)) return false;
ISD::ArgFlagsTy Flags;
- unsigned OriginalAlignment = DL.getABITypeAlignment(ArgTy);
- Flags.setOrigAlign(OriginalAlignment);
+ Flags.setOrigAlign(Align(DL.getABITypeAlignment(ArgTy)));
Args.push_back(Op);
ArgRegs.push_back(Arg);
@@ -2257,13 +2246,13 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) {
}
// Handle the arguments now that we've gotten them.
- SmallVector<unsigned, 4> RegArgs;
+ SmallVector<Register, 4> RegArgs;
unsigned NumBytes;
if (!ProcessCallArgs(Args, ArgRegs, ArgVTs, ArgFlags,
RegArgs, CC, NumBytes, false))
return false;
- unsigned CalleeReg = 0;
+ Register CalleeReg;
if (Subtarget->genLongCalls()) {
CalleeReg = getLibcallReg(TLI.getLibcallName(Call));
if (CalleeReg == 0) return false;
@@ -2282,7 +2271,7 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) {
MIB.addExternalSymbol(TLI.getLibcallName(Call));
// Add implicit physical register uses to the call.
- for (unsigned R : RegArgs)
+ for (Register R : RegArgs)
MIB.addReg(R, RegState::Implicit);
// Add a register mask with the call-preserved registers.
@@ -2290,7 +2279,7 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) {
MIB.addRegMask(TRI.getCallPreservedMask(*FuncInfo.MF, CC));
// Finish off the call including any return values.
- SmallVector<unsigned, 4> UsedRegs;
+ SmallVector<Register, 4> UsedRegs;
if (!FinishCall(RetVT, UsedRegs, I, CC, NumBytes, false)) return false;
// Set all unused physreg defs as dead.
@@ -2340,7 +2329,7 @@ bool ARMFastISel::SelectCall(const Instruction *I,
// Set up the argument vectors.
SmallVector<Value*, 8> Args;
- SmallVector<unsigned, 8> ArgRegs;
+ SmallVector<Register, 8> ArgRegs;
SmallVector<MVT, 8> ArgVTs;
SmallVector<ISD::ArgFlagsTy, 8> ArgFlags;
unsigned arg_size = CS.arg_size();
@@ -2377,12 +2366,11 @@ bool ARMFastISel::SelectCall(const Instruction *I,
ArgVT != MVT::i1)
return false;
- unsigned Arg = getRegForValue(*i);
- if (Arg == 0)
+ Register Arg = getRegForValue(*i);
+ if (!Arg.isValid())
return false;
- unsigned OriginalAlignment = DL.getABITypeAlignment(ArgTy);
- Flags.setOrigAlign(OriginalAlignment);
+ Flags.setOrigAlign(Align(DL.getABITypeAlignment(ArgTy)));
Args.push_back(*i);
ArgRegs.push_back(Arg);
@@ -2391,7 +2379,7 @@ bool ARMFastISel::SelectCall(const Instruction *I,
}
// Handle the arguments now that we've gotten them.
- SmallVector<unsigned, 4> RegArgs;
+ SmallVector<Register, 4> RegArgs;
unsigned NumBytes;
if (!ProcessCallArgs(Args, ArgRegs, ArgVTs, ArgFlags,
RegArgs, CC, NumBytes, isVarArg))
@@ -2401,7 +2389,7 @@ bool ARMFastISel::SelectCall(const Instruction *I,
const GlobalValue *GV = dyn_cast<GlobalValue>(Callee);
if (!GV || Subtarget->genLongCalls()) UseReg = true;
- unsigned CalleeReg = 0;
+ Register CalleeReg;
if (UseReg) {
if (IntrMemName)
CalleeReg = getLibcallReg(IntrMemName);
@@ -2427,7 +2415,7 @@ bool ARMFastISel::SelectCall(const Instruction *I,
MIB.addExternalSymbol(IntrMemName, 0);
// Add implicit physical register uses to the call.
- for (unsigned R : RegArgs)
+ for (Register R : RegArgs)
MIB.addReg(R, RegState::Implicit);
// Add a register mask with the call-preserved registers.
@@ -2435,7 +2423,7 @@ bool ARMFastISel::SelectCall(const Instruction *I,
MIB.addRegMask(TRI.getCallPreservedMask(*FuncInfo.MF, CC));
// Finish off the call including any return values.
- SmallVector<unsigned, 4> UsedRegs;
+ SmallVector<Register, 4> UsedRegs;
if (!FinishCall(RetVT, UsedRegs, I, CC, NumBytes, isVarArg))
return false;
@@ -2476,7 +2464,7 @@ bool ARMFastISel::ARMTryEmitSmallMemCpy(Address Dest, Address Src,
}
bool RV;
- unsigned ResultReg;
+ Register ResultReg;
RV = ARMEmitLoad(VT, ResultReg, Src);
assert(RV && "Should be able to handle this load.");
RV = ARMEmitStore(VT, ResultReg, Dest);
@@ -2506,7 +2494,7 @@ bool ARMFastISel::SelectIntrinsicCall(const IntrinsicInst &I) {
const ARMBaseRegisterInfo *RegInfo =
static_cast<const ARMBaseRegisterInfo *>(Subtarget->getRegisterInfo());
- unsigned FramePtr = RegInfo->getFrameRegister(*(FuncInfo.MF));
+ Register FramePtr = RegInfo->getFrameRegister(*(FuncInfo.MF));
unsigned SrcReg = FramePtr;
// Recursively load frame address
@@ -2947,7 +2935,7 @@ bool ARMFastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
Address Addr;
if (!ARMComputeAddress(LI->getOperand(0), Addr)) return false;
- unsigned ResultReg = MI->getOperand(0).getReg();
+ Register ResultReg = MI->getOperand(0).getReg();
if (!ARMEmitLoad(VT, ResultReg, Addr, LI->getAlignment(), isZExt, false))
return false;
MachineBasicBlock::iterator I(MI);
@@ -2974,7 +2962,7 @@ unsigned ARMFastISel::ARMLowerPICELF(const GlobalValue *GV,
MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF),
MachineMemOperand::MOLoad, 4, 4);
- unsigned TempReg = MF->getRegInfo().createVirtualRegister(&ARM::rGPRRegClass);
+ Register TempReg = MF->getRegInfo().createVirtualRegister(&ARM::rGPRRegClass);
unsigned Opc = isThumb2 ? ARM::t2LDRpci : ARM::LDRcp;
MachineInstrBuilder MIB =
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), TempReg)
diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp
index bedb779bcba0..01ae93086dcb 100644
--- a/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/lib/Target/ARM/ARMFrameLowering.cpp
@@ -76,7 +76,7 @@ skipAlignedDPRCS2Spills(MachineBasicBlock::iterator MI,
unsigned NumAlignedDPRCS2Regs);
ARMFrameLowering::ARMFrameLowering(const ARMSubtarget &sti)
- : TargetFrameLowering(StackGrowsDown, sti.getStackAlignment(), 0, 4),
+ : TargetFrameLowering(StackGrowsDown, sti.getStackAlignment(), 0, Align(4)),
STI(sti) {}
bool ARMFrameLowering::keepFramePointer(const MachineFunction &MF) const {
@@ -376,7 +376,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
// to determine the end of the prologue.
DebugLoc dl;
- unsigned FramePtr = RegInfo->getFrameRegister(MF);
+ Register FramePtr = RegInfo->getFrameRegister(MF);
// Determine the sizes of each callee-save spill areas and record which frame
// belongs to which callee-save spill areas.
@@ -780,7 +780,7 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize();
int NumBytes = (int)MFI.getStackSize();
- unsigned FramePtr = RegInfo->getFrameRegister(MF);
+ Register FramePtr = RegInfo->getFrameRegister(MF);
// All calls are tail calls in GHC calling conv, and functions have no
// prologue/epilogue.
@@ -1503,11 +1503,17 @@ static unsigned EstimateFunctionSizeInBytes(const MachineFunction &MF,
/// instructions will require a scratch register during their expansion later.
// FIXME: Move to TII?
static unsigned estimateRSStackSizeLimit(MachineFunction &MF,
- const TargetFrameLowering *TFI) {
+ const TargetFrameLowering *TFI,
+ bool &HasNonSPFrameIndex) {
const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+ const ARMBaseInstrInfo &TII =
+ *static_cast<const ARMBaseInstrInfo *>(MF.getSubtarget().getInstrInfo());
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
unsigned Limit = (1 << 12) - 1;
for (auto &MBB : MF) {
for (auto &MI : MBB) {
+ if (MI.isDebugInstr())
+ continue;
for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
if (!MI.getOperand(i).isFI())
continue;
@@ -1518,13 +1524,29 @@ static unsigned estimateRSStackSizeLimit(MachineFunction &MF,
Limit = std::min(Limit, (1U << 8) - 1);
break;
}
+ // t2ADDri will not require an extra register; it can reuse the
+ // destination.
+ if (MI.getOpcode() == ARM::t2ADDri || MI.getOpcode() == ARM::t2ADDri12)
+ break;
+
+ const MCInstrDesc &MCID = MI.getDesc();
+ const TargetRegisterClass *RegClass = TII.getRegClass(MCID, i, TRI, MF);
+ if (RegClass && !RegClass->contains(ARM::SP))
+ HasNonSPFrameIndex = true;
// Otherwise check the addressing mode.
switch (MI.getDesc().TSFlags & ARMII::AddrModeMask) {
+ case ARMII::AddrMode_i12:
+ case ARMII::AddrMode2:
+ // Default 12 bit limit.
+ break;
case ARMII::AddrMode3:
case ARMII::AddrModeT2_i8:
Limit = std::min(Limit, (1U << 8) - 1);
break;
+ case ARMII::AddrMode5FP16:
+ Limit = std::min(Limit, ((1U << 8) - 1) * 2);
+ break;
case ARMII::AddrMode5:
case ARMII::AddrModeT2_i8s4:
case ARMII::AddrModeT2_ldrex:
@@ -1541,8 +1563,17 @@ static unsigned estimateRSStackSizeLimit(MachineFunction &MF,
// Addressing modes 4 & 6 (load/store) instructions can't encode an
// immediate offset for stack references.
return 0;
- default:
+ case ARMII::AddrModeT2_i7:
+ Limit = std::min(Limit, ((1U << 7) - 1) * 1);
+ break;
+ case ARMII::AddrModeT2_i7s2:
+ Limit = std::min(Limit, ((1U << 7) - 1) * 2);
break;
+ case ARMII::AddrModeT2_i7s4:
+ Limit = std::min(Limit, ((1U << 7) - 1) * 4);
+ break;
+ default:
+ llvm_unreachable("Unhandled addressing mode in stack size limit calculation");
}
break; // At most one FI per instruction
}
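// Illustrative sketch (not from the patch): the new T2 imm7 addressing modes
// cap the register-scavenging offset estimate at a 7-bit field scaled by the
// access size, giving byte limits of 127, 254 and 508 for the *1/*2/*4
// variants. A standalone check of that arithmetic:
#include <algorithm>
#include <cassert>
static unsigned imm7Limit(unsigned Scale) { return ((1U << 7) - 1) * Scale; }
int main() {
  assert(imm7Limit(1) == 127 && imm7Limit(2) == 254 && imm7Limit(4) == 508);
  unsigned Limit = (1U << 12) - 1;       // default 12-bit starting point
  Limit = std::min(Limit, imm7Limit(4)); // tightest limit seen wins
  assert(Limit == 508);
}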
@@ -1623,7 +1654,7 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
MachineRegisterInfo &MRI = MF.getRegInfo();
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
(void)TRI; // Silence unused warning in non-assert builds.
- unsigned FramePtr = RegInfo->getFrameRegister(MF);
+ Register FramePtr = RegInfo->getFrameRegister(MF);
// Spill R4 if Thumb2 function requires stack realignment - it will be used as
// scratch register. Also spill R4 if Thumb2 function has varsized objects,
@@ -1784,6 +1815,7 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
EstimatedStackSize += 16; // For possible paddings.
unsigned EstimatedRSStackSizeLimit, EstimatedRSFixedSizeLimit;
+ bool HasNonSPFrameIndex = false;
if (AFI->isThumb1OnlyFunction()) {
// For Thumb1, don't bother to iterate over the function. The only
// instruction that requires an emergency spill slot is a store to a
@@ -1804,7 +1836,8 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
EstimatedRSStackSizeLimit = (1U << 8) * 4;
EstimatedRSFixedSizeLimit = (1U << 5) * 4;
} else {
- EstimatedRSStackSizeLimit = estimateRSStackSizeLimit(MF, this);
+ EstimatedRSStackSizeLimit =
+ estimateRSStackSizeLimit(MF, this, HasNonSPFrameIndex);
EstimatedRSFixedSizeLimit = EstimatedRSStackSizeLimit;
}
// Final estimate of whether sp or bp-relative accesses might require
@@ -1830,12 +1863,11 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
HasFP && (MaxFixedOffset - MaxFPOffset) > (int)EstimatedRSFixedSizeLimit;
bool BigFrameOffsets = HasLargeStack || !HasBPOrFixedSP ||
- HasLargeArgumentList;
+ HasLargeArgumentList || HasNonSPFrameIndex;
LLVM_DEBUG(dbgs() << "EstimatedLimit: " << EstimatedRSStackSizeLimit
- << "; EstimatedStack" << EstimatedStackSize
- << "; EstimatedFPStack" << MaxFixedOffset - MaxFPOffset
- << "; BigFrameOffsets: " << BigFrameOffsets
- << "\n");
+ << "; EstimatedStack: " << EstimatedStackSize
+ << "; EstimatedFPStack: " << MaxFixedOffset - MaxFPOffset
+ << "; BigFrameOffsets: " << BigFrameOffsets << "\n");
if (BigFrameOffsets ||
!CanEliminateFrame || RegInfo->cannotEliminateFrame(MF)) {
AFI->setHasStackFrame(true);
@@ -2080,9 +2112,8 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
ExtraCSSpill = true;
}
}
- if (!ExtraCSSpill) {
+ if (!ExtraCSSpill && RS) {
// Reserve a slot closest to SP or frame pointer.
- assert(RS && "Register scavenging not provided");
LLVM_DEBUG(dbgs() << "Reserving emergency spill slot\n");
const TargetRegisterClass &RC = ARM::GPRRegClass;
unsigned Size = TRI->getSpillSize(RC);
@@ -2097,6 +2128,12 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
AFI->setLRIsSpilledForFarJump(true);
}
AFI->setLRIsSpilled(SavedRegs.test(ARM::LR));
+
+ // If we have the "returned" parameter attribute which guarantees that we
+ // return the value which was passed in r0 unmodified (e.g. C++ 'structors),
+ // record that fact for IPRA.
+ if (AFI->getPreservesR0())
+ SavedRegs.set(ARM::R0);
}
MachineBasicBlock::iterator ARMFrameLowering::eliminateCallFramePseudoInstr(
diff --git a/lib/Target/ARM/ARMFrameLowering.h b/lib/Target/ARM/ARMFrameLowering.h
index 7544ca3c38d6..6d8aee597945 100644
--- a/lib/Target/ARM/ARMFrameLowering.h
+++ b/lib/Target/ARM/ARMFrameLowering.h
@@ -63,6 +63,11 @@ public:
bool enableShrinkWrapping(const MachineFunction &MF) const override {
return true;
}
+ bool isProfitableForNoCSROpt(const Function &F) const override {
+ // The no-CSR optimisation is bad for code size on ARM, because we can save
+ // many registers with a single PUSH/POP pair.
+ return false;
+ }
private:
void emitPushInst(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp
index b349627b67b1..8f6515c423eb 100644
--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -139,6 +139,8 @@ public:
bool SelectThumbAddrModeImm5S4(SDValue N, SDValue &Base,
SDValue &OffImm);
bool SelectThumbAddrModeSP(SDValue N, SDValue &Base, SDValue &OffImm);
+ template <unsigned Shift>
+ bool SelectTAddrModeImm7(SDValue N, SDValue &Base, SDValue &OffImm);
// Thumb 2 Addressing Modes:
bool SelectT2AddrModeImm12(SDValue N, SDValue &Base, SDValue &OffImm);
@@ -146,9 +148,12 @@ public:
SDValue &OffImm);
bool SelectT2AddrModeImm8Offset(SDNode *Op, SDValue N,
SDValue &OffImm);
- template<unsigned Shift>
- bool SelectT2AddrModeImm7(SDValue N, SDValue &Base,
- SDValue &OffImm);
+ template <unsigned Shift>
+ bool SelectT2AddrModeImm7Offset(SDNode *Op, SDValue N, SDValue &OffImm);
+ bool SelectT2AddrModeImm7Offset(SDNode *Op, SDValue N, SDValue &OffImm,
+ unsigned Shift);
+ template <unsigned Shift>
+ bool SelectT2AddrModeImm7(SDValue N, SDValue &Base, SDValue &OffImm);
bool SelectT2AddrModeSoReg(SDValue N, SDValue &Base,
SDValue &OffReg, SDValue &ShImm);
bool SelectT2AddrModeExclusive(SDValue N, SDValue &Base, SDValue &OffImm);
@@ -179,6 +184,7 @@ private:
bool tryARMIndexedLoad(SDNode *N);
bool tryT1IndexedLoad(SDNode *N);
bool tryT2IndexedLoad(SDNode *N);
+ bool tryMVEIndexedLoad(SDNode *N);
/// SelectVLD - Select NEON load intrinsics. NumVecs should be
/// 1, 2, 3 or 4. The opcode arrays specify the instructions used for
@@ -246,10 +252,6 @@ private:
SDValue GetVLDSTAlign(SDValue Align, const SDLoc &dl, unsigned NumVecs,
bool is64BitVector);
- /// Returns the number of instructions required to materialize the given
- /// constant in a register, or 3 if a literal pool load is needed.
- unsigned ConstantMaterializationCost(unsigned Val) const;
-
/// Checks if N is a multiplication by a constant where we can extract out a
/// power of two from the constant so that it can be used in a shift, but only
/// if it simplifies the materialization of the constant. Returns true if it
@@ -450,27 +452,6 @@ bool ARMDAGToDAGISel::isShifterOpProfitable(const SDValue &Shift,
(ShAmt == 2 || (Subtarget->isSwift() && ShAmt == 1));
}
-unsigned ARMDAGToDAGISel::ConstantMaterializationCost(unsigned Val) const {
- if (Subtarget->isThumb()) {
- if (Val <= 255) return 1; // MOV
- if (Subtarget->hasV6T2Ops() &&
- (Val <= 0xffff || // MOV
- ARM_AM::getT2SOImmVal(Val) != -1 || // MOVW
- ARM_AM::getT2SOImmVal(~Val) != -1)) // MVN
- return 1;
- if (Val <= 510) return 2; // MOV + ADDi8
- if (~Val <= 255) return 2; // MOV + MVN
- if (ARM_AM::isThumbImmShiftedVal(Val)) return 2; // MOV + LSL
- } else {
- if (ARM_AM::getSOImmVal(Val) != -1) return 1; // MOV
- if (ARM_AM::getSOImmVal(~Val) != -1) return 1; // MVN
- if (Subtarget->hasV6T2Ops() && Val <= 0xffff) return 1; // MOVW
- if (ARM_AM::isSOImmTwoPartVal(Val)) return 2; // two instrs
- }
- if (Subtarget->useMovt()) return 2; // MOVW + MOVT
- return 3; // Literal pool load
-}
-
bool ARMDAGToDAGISel::canExtractShiftFromMul(const SDValue &N,
unsigned MaxShift,
unsigned &PowerOfTwo,
@@ -500,8 +481,8 @@ bool ARMDAGToDAGISel::canExtractShiftFromMul(const SDValue &N,
// Only optimise if the new cost is better
unsigned NewMulConstVal = MulConstVal / (1 << PowerOfTwo);
NewMulConst = CurDAG->getConstant(NewMulConstVal, SDLoc(N), MVT::i32);
- unsigned OldCost = ConstantMaterializationCost(MulConstVal);
- unsigned NewCost = ConstantMaterializationCost(NewMulConstVal);
+ unsigned OldCost = ConstantMaterializationCost(MulConstVal, Subtarget);
+ unsigned NewCost = ConstantMaterializationCost(NewMulConstVal, Subtarget);
return NewCost < OldCost;
}
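// Worked example (illustrative): canExtractShiftFromMul splits a multiply so
// the power of two moves into a free shifted operand, e.g. x * 40 becomes
// (x * 5) << 3, and the shared ConstantMaterializationCost(Val, Subtarget)
// helper decides whether 5 is actually cheaper to build than 40.
#include <cassert>
int main() {
  const unsigned MulConst = 40, PowerOfTwo = 3;
  const unsigned NewMulConst = MulConst / (1u << PowerOfTwo); // 5, exact
  assert((NewMulConst << PowerOfTwo) == MulConst);
  const int X = 7;
  assert(((X * (int)NewMulConst) << PowerOfTwo) == X * 40);
}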
@@ -1172,6 +1153,28 @@ bool ARMDAGToDAGISel::SelectThumbAddrModeSP(SDValue N,
return false;
}
+template <unsigned Shift>
+bool ARMDAGToDAGISel::SelectTAddrModeImm7(SDValue N, SDValue &Base,
+ SDValue &OffImm) {
+ if (N.getOpcode() == ISD::SUB || CurDAG->isBaseWithConstantOffset(N)) {
+ int RHSC;
+ if (isScaledConstantInRange(N.getOperand(1), 1 << Shift, -0x7f, 0x80,
+ RHSC)) {
+ Base = N.getOperand(0);
+ if (N.getOpcode() == ISD::SUB)
+ RHSC = -RHSC;
+ OffImm =
+ CurDAG->getTargetConstant(RHSC * (1 << Shift), SDLoc(N), MVT::i32);
+ return true;
+ }
+ }
+
+ // Base only.
+ Base = N;
+ OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32);
+ return true;
+}
+
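// Minimal model (assumption: mirrors the -0x7f..0x80 range check above): the
// Thumb imm7 mode encodes a signed, scaled offset, and an ISD::SUB base
// negates it, so word accesses (Shift == 2) reach [-508, +508] in steps of 4.
#include <cassert>
static bool fitsTImm7(int ByteOffset, unsigned Shift) {
  const int Scale = 1 << Shift;
  if (ByteOffset % Scale != 0)
    return false; // must be an exact multiple of the access size
  const int Scaled = ByteOffset / Scale;
  return Scaled >= -0x7f && Scaled <= 0x7f;
}
int main() {
  assert(fitsTImm7(-508, 2) && fitsTImm7(508, 2));
  assert(!fitsTImm7(512, 2) && !fitsTImm7(-2, 2));
}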
//===----------------------------------------------------------------------===//
// Thumb 2 Addressing Modes
@@ -1278,35 +1281,59 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm8Offset(SDNode *Op, SDValue N,
return false;
}
-template<unsigned Shift>
-bool ARMDAGToDAGISel::SelectT2AddrModeImm7(SDValue N,
- SDValue &Base, SDValue &OffImm) {
- if (N.getOpcode() == ISD::SUB ||
- CurDAG->isBaseWithConstantOffset(N)) {
- if (auto RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
- int RHSC = (int)RHS->getZExtValue();
- if (N.getOpcode() == ISD::SUB)
- RHSC = -RHSC;
-
- if (isShiftedInt<7, Shift>(RHSC)) {
- Base = N.getOperand(0);
- if (Base.getOpcode() == ISD::FrameIndex) {
- int FI = cast<FrameIndexSDNode>(Base)->getIndex();
- Base = CurDAG->getTargetFrameIndex(
+template <unsigned Shift>
+bool ARMDAGToDAGISel::SelectT2AddrModeImm7(SDValue N, SDValue &Base,
+ SDValue &OffImm) {
+ if (N.getOpcode() == ISD::SUB || CurDAG->isBaseWithConstantOffset(N)) {
+ int RHSC;
+ if (isScaledConstantInRange(N.getOperand(1), 1 << Shift, -0x7f, 0x80,
+ RHSC)) {
+ Base = N.getOperand(0);
+ if (Base.getOpcode() == ISD::FrameIndex) {
+ int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+ Base = CurDAG->getTargetFrameIndex(
FI, TLI->getPointerTy(CurDAG->getDataLayout()));
- }
- OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i32);
- return true;
}
+
+ if (N.getOpcode() == ISD::SUB)
+ RHSC = -RHSC;
+ OffImm =
+ CurDAG->getTargetConstant(RHSC * (1 << Shift), SDLoc(N), MVT::i32);
+ return true;
}
}
// Base only.
Base = N;
- OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32);
+ OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32);
return true;
}
+template <unsigned Shift>
+bool ARMDAGToDAGISel::SelectT2AddrModeImm7Offset(SDNode *Op, SDValue N,
+ SDValue &OffImm) {
+ return SelectT2AddrModeImm7Offset(Op, N, OffImm, Shift);
+}
+
+bool ARMDAGToDAGISel::SelectT2AddrModeImm7Offset(SDNode *Op, SDValue N,
+ SDValue &OffImm,
+ unsigned Shift) {
+ unsigned Opcode = Op->getOpcode();
+ ISD::MemIndexedMode AM = (Opcode == ISD::LOAD)
+ ? cast<LoadSDNode>(Op)->getAddressingMode()
+ : cast<StoreSDNode>(Op)->getAddressingMode();
+ int RHSC;
+ if (isScaledConstantInRange(N, 1 << Shift, 0, 0x80, RHSC)) { // 7 bits.
+ OffImm =
+ ((AM == ISD::PRE_INC) || (AM == ISD::POST_INC))
+ ? CurDAG->getTargetConstant(RHSC * (1 << Shift), SDLoc(N), MVT::i32)
+ : CurDAG->getTargetConstant(-RHSC * (1 << Shift), SDLoc(N),
+ MVT::i32);
+ return true;
+ }
+ return false;
+}
+
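// Sketch (my reading of the code above): in the pre/post-indexed form the
// immediate is validated against the unsigned scaled range 0..127, and its
// sign comes from the addressing mode -- increment modes keep it positive,
// decrement modes negate it.
#include <cassert>
static int selectImm7Offset(bool IsIncrement, int Scaled, unsigned Shift) {
  assert(Scaled >= 0 && Scaled < 0x80); // 7-bit field
  const int Bytes = Scaled * (1 << Shift);
  return IsIncrement ? Bytes : -Bytes;
}
int main() {
  assert(selectImm7Offset(true, 8, 1) == 16);   // post-increment by 16 bytes
  assert(selectImm7Offset(false, 8, 1) == -16); // post-decrement by 16 bytes
}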
bool ARMDAGToDAGISel::SelectT2AddrModeSoReg(SDValue N,
SDValue &Base,
SDValue &OffReg, SDValue &ShImm) {
@@ -1565,6 +1592,68 @@ bool ARMDAGToDAGISel::tryT2IndexedLoad(SDNode *N) {
return false;
}
+bool ARMDAGToDAGISel::tryMVEIndexedLoad(SDNode *N) {
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ ISD::MemIndexedMode AM = LD->getAddressingMode();
+ if (AM == ISD::UNINDEXED)
+ return false;
+ EVT LoadedVT = LD->getMemoryVT();
+ if (!LoadedVT.isVector())
+ return false;
+ bool isSExtLd = LD->getExtensionType() == ISD::SEXTLOAD;
+ SDValue Offset;
+ bool isPre = (AM == ISD::PRE_INC) || (AM == ISD::PRE_DEC);
+ unsigned Opcode = 0;
+ unsigned Align = LD->getAlignment();
+ bool IsLE = Subtarget->isLittle();
+
+ if (Align >= 2 && LoadedVT == MVT::v4i16 &&
+ SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 1)) {
+ if (isSExtLd)
+ Opcode = isPre ? ARM::MVE_VLDRHS32_pre : ARM::MVE_VLDRHS32_post;
+ else
+ Opcode = isPre ? ARM::MVE_VLDRHU32_pre : ARM::MVE_VLDRHU32_post;
+ } else if (LoadedVT == MVT::v8i8 &&
+ SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 0)) {
+ if (isSExtLd)
+ Opcode = isPre ? ARM::MVE_VLDRBS16_pre : ARM::MVE_VLDRBS16_post;
+ else
+ Opcode = isPre ? ARM::MVE_VLDRBU16_pre : ARM::MVE_VLDRBU16_post;
+ } else if (LoadedVT == MVT::v4i8 &&
+ SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 0)) {
+ if (isSExtLd)
+ Opcode = isPre ? ARM::MVE_VLDRBS32_pre : ARM::MVE_VLDRBS32_post;
+ else
+ Opcode = isPre ? ARM::MVE_VLDRBU32_pre : ARM::MVE_VLDRBU32_post;
+ } else if (Align >= 4 &&
+ (IsLE || LoadedVT == MVT::v4i32 || LoadedVT == MVT::v4f32) &&
+ SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 2))
+ Opcode = isPre ? ARM::MVE_VLDRWU32_pre : ARM::MVE_VLDRWU32_post;
+ else if (Align >= 2 &&
+ (IsLE || LoadedVT == MVT::v8i16 || LoadedVT == MVT::v8f16) &&
+ SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 1))
+ Opcode = isPre ? ARM::MVE_VLDRHU16_pre : ARM::MVE_VLDRHU16_post;
+ else if ((IsLE || LoadedVT == MVT::v16i8) &&
+ SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 0))
+ Opcode = isPre ? ARM::MVE_VLDRBU8_pre : ARM::MVE_VLDRBU8_post;
+ else
+ return false;
+
+ SDValue Chain = LD->getChain();
+ SDValue Base = LD->getBasePtr();
+ SDValue Ops[] = {Base, Offset,
+ CurDAG->getTargetConstant(ARMVCC::None, SDLoc(N), MVT::i32),
+ CurDAG->getRegister(0, MVT::i32), Chain};
+ SDNode *New = CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i32,
+ LD->getValueType(0), MVT::Other, Ops);
+ transferMemOperands(N, New);
+ ReplaceUses(SDValue(N, 0), SDValue(New, 1));
+ ReplaceUses(SDValue(N, 1), SDValue(New, 0));
+ ReplaceUses(SDValue(N, 2), SDValue(New, 2));
+ CurDAG->RemoveDeadNode(N);
+ return true;
+}
+
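// Sketch (my reading of the ReplaceUses calls above): an indexed LoadSDNode
// produces (0: loaded value, 1: written-back address, 2: chain), while the
// MVE machine node is built as (0: i32 writeback, 1: vector value, 2: chain),
// so results 0 and 1 cross over. A standalone model of the permutation:
#include <cassert>
enum LoadResult { Value = 0, Writeback = 1, Chain = 2 };
static int mveResultFor(int LoadRes) {
  switch (LoadRes) {
  case Value:     return 1; // ReplaceUses(SDValue(N, 0), SDValue(New, 1))
  case Writeback: return 0; // ReplaceUses(SDValue(N, 1), SDValue(New, 0))
  default:        return 2; // the chain maps straight across
  }
}
int main() {
  assert(mveResultFor(Value) == 1 && mveResultFor(Writeback) == 0 &&
         mveResultFor(Chain) == 2);
}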
/// Form a GPRPair pseudo register from a pair of GPR regs.
SDNode *ARMDAGToDAGISel::createGPRPairNode(EVT VT, SDValue V0, SDValue V1) {
SDLoc dl(V0.getNode());
@@ -2701,7 +2790,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
case ISD::Constant: {
unsigned Val = cast<ConstantSDNode>(N)->getZExtValue();
// If we can't materialize the constant we need to use a literal pool
- if (ConstantMaterializationCost(Val) > 2) {
+ if (ConstantMaterializationCost(Val, Subtarget) > 2) {
SDValue CPIdx = CurDAG->getTargetConstantPool(
ConstantInt::get(Type::getInt32Ty(*CurDAG->getContext()), Val),
TLI->getPointerTy(CurDAG->getDataLayout()));
@@ -2842,8 +2931,8 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
bool PreferImmediateEncoding =
Subtarget->hasThumb2() && (is_t2_so_imm(Imm) || is_t2_so_imm_not(Imm));
if (!PreferImmediateEncoding &&
- ConstantMaterializationCost(Imm) >
- ConstantMaterializationCost(~Imm)) {
+ ConstantMaterializationCost(Imm, Subtarget) >
+ ConstantMaterializationCost(~Imm, Subtarget)) {
// The current immediate costs more to materialize than a negated
// immediate, so negate the immediate and use a BIC.
SDValue NewImm =
@@ -2987,6 +3076,8 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
return;
}
case ISD::LOAD: {
+ if (Subtarget->hasMVEIntegerOps() && tryMVEIndexedLoad(N))
+ return;
if (Subtarget->isThumb() && Subtarget->hasThumb2()) {
if (tryT2IndexedLoad(N))
return;
@@ -2998,13 +3089,26 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
// Other cases are autogenerated.
break;
}
- case ARMISD::WLS: {
- SDValue Ops[] = { N->getOperand(1), // Loop count
- N->getOperand(2), // Exit target
+ case ARMISD::WLS:
+ case ARMISD::LE: {
+ SDValue Ops[] = { N->getOperand(1),
+ N->getOperand(2),
+ N->getOperand(0) };
+ unsigned Opc = N->getOpcode() == ARMISD::WLS ?
+ ARM::t2WhileLoopStart : ARM::t2LoopEnd;
+ SDNode *New = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
+ ReplaceUses(N, New);
+ CurDAG->RemoveDeadNode(N);
+ return;
+ }
+ case ARMISD::LOOP_DEC: {
+ SDValue Ops[] = { N->getOperand(1),
+ N->getOperand(2),
N->getOperand(0) };
- SDNode *LoopStart =
- CurDAG->getMachineNode(ARM::t2WhileLoopStart, dl, MVT::Other, Ops);
- ReplaceUses(N, LoopStart);
+ SDNode *Dec =
+ CurDAG->getMachineNode(ARM::t2LoopDec, dl,
+ CurDAG->getVTList(MVT::i32, MVT::Other), Ops);
+ ReplaceUses(N, Dec);
CurDAG->RemoveDeadNode(N);
return;
}
@@ -4365,7 +4469,7 @@ bool ARMDAGToDAGISel::tryInlineAsm(SDNode *N){
// Replace the two GPRs with 1 GPRPair and copy values from GPRPair to
// the original GPRs.
- unsigned GPVR = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
+ Register GPVR = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
PairedReg = CurDAG->getRegister(GPVR, MVT::Untyped);
SDValue Chain = SDValue(N,0);
@@ -4401,7 +4505,7 @@ bool ARMDAGToDAGISel::tryInlineAsm(SDNode *N){
// Copy REG_SEQ into a GPRPair-typed VR and replace the original two
// i32 VRs of inline asm with it.
- unsigned GPVR = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
+ Register GPVR = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
PairedReg = CurDAG->getRegister(GPVR, MVT::Untyped);
Chain = CurDAG->getCopyToReg(T1, dl, GPVR, Pair, T1.getValue(1));
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 18bb9bf3eccc..db26feb57010 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -245,7 +245,7 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
const MVT IntTypes[] = { MVT::v16i8, MVT::v8i16, MVT::v4i32 };
for (auto VT : IntTypes) {
- addRegisterClass(VT, &ARM::QPRRegClass);
+ addRegisterClass(VT, &ARM::MQPRRegClass);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
@@ -258,12 +258,31 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
setOperationAction(ISD::UMIN, VT, Legal);
setOperationAction(ISD::UMAX, VT, Legal);
setOperationAction(ISD::ABS, VT, Legal);
+ setOperationAction(ISD::SETCC, VT, Custom);
+ setOperationAction(ISD::MLOAD, VT, Custom);
+ setOperationAction(ISD::MSTORE, VT, Legal);
+ setOperationAction(ISD::CTLZ, VT, Legal);
+ setOperationAction(ISD::CTTZ, VT, Custom);
+ setOperationAction(ISD::BITREVERSE, VT, Legal);
+ setOperationAction(ISD::BSWAP, VT, Legal);
+ setOperationAction(ISD::SADDSAT, VT, Legal);
+ setOperationAction(ISD::UADDSAT, VT, Legal);
+ setOperationAction(ISD::SSUBSAT, VT, Legal);
+ setOperationAction(ISD::USUBSAT, VT, Legal);
// No native support for these.
setOperationAction(ISD::UDIV, VT, Expand);
setOperationAction(ISD::SDIV, VT, Expand);
setOperationAction(ISD::UREM, VT, Expand);
setOperationAction(ISD::SREM, VT, Expand);
+ setOperationAction(ISD::CTPOP, VT, Expand);
+
+ // Vector reductions
+ setOperationAction(ISD::VECREDUCE_ADD, VT, Legal);
+ setOperationAction(ISD::VECREDUCE_SMAX, VT, Legal);
+ setOperationAction(ISD::VECREDUCE_UMAX, VT, Legal);
+ setOperationAction(ISD::VECREDUCE_SMIN, VT, Legal);
+ setOperationAction(ISD::VECREDUCE_UMIN, VT, Legal);
if (!HasMVEFP) {
setOperationAction(ISD::SINT_TO_FP, VT, Expand);
@@ -271,11 +290,18 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
setOperationAction(ISD::FP_TO_SINT, VT, Expand);
setOperationAction(ISD::FP_TO_UINT, VT, Expand);
}
+
+ // Pre- and post-indexed forms are supported on loads and stores
+ for (unsigned im = (unsigned)ISD::PRE_INC;
+ im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
+ setIndexedLoadAction(im, VT, Legal);
+ setIndexedStoreAction(im, VT, Legal);
+ }
}
const MVT FloatTypes[] = { MVT::v8f16, MVT::v4f32 };
for (auto VT : FloatTypes) {
- addRegisterClass(VT, &ARM::QPRRegClass);
+ addRegisterClass(VT, &ARM::MQPRRegClass);
if (!HasMVEFP)
setAllExpand(VT);
@@ -287,6 +313,16 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT.getVectorElementType(), Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Legal);
+ setOperationAction(ISD::SETCC, VT, Custom);
+ setOperationAction(ISD::MLOAD, VT, Custom);
+ setOperationAction(ISD::MSTORE, VT, Legal);
+
+ // Pre- and post-indexed forms are supported on loads and stores
+ for (unsigned im = (unsigned)ISD::PRE_INC;
+ im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
+ setIndexedLoadAction(im, VT, Legal);
+ setIndexedStoreAction(im, VT, Legal);
+ }
if (HasMVEFP) {
setOperationAction(ISD::FMINNUM, VT, Legal);
@@ -314,7 +350,7 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
// vector types is inhibited at integer-only level.
const MVT LongTypes[] = { MVT::v2i64, MVT::v2f64 };
for (auto VT : LongTypes) {
- addRegisterClass(VT, &ARM::QPRRegClass);
+ addRegisterClass(VT, &ARM::MQPRRegClass);
setAllExpand(VT);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
@@ -334,6 +370,33 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
+
+ // Pre- and post-indexed forms on these are legal, given the correct extends
+ for (unsigned im = (unsigned)ISD::PRE_INC;
+ im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
+ setIndexedLoadAction(im, MVT::v8i8, Legal);
+ setIndexedStoreAction(im, MVT::v8i8, Legal);
+ setIndexedLoadAction(im, MVT::v4i8, Legal);
+ setIndexedStoreAction(im, MVT::v4i8, Legal);
+ setIndexedLoadAction(im, MVT::v4i16, Legal);
+ setIndexedStoreAction(im, MVT::v4i16, Legal);
+ }
+
+ // Predicate types
+ const MVT pTypes[] = {MVT::v16i1, MVT::v8i1, MVT::v4i1};
+ for (auto VT : pTypes) {
+ addRegisterClass(VT, &ARM::VCCRRegClass);
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::SETCC, VT, Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand);
+ setOperationAction(ISD::LOAD, VT, Custom);
+ setOperationAction(ISD::STORE, VT, Custom);
+ }
}
ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
@@ -645,8 +708,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
}
- for (MVT VT : MVT::vector_valuetypes()) {
- for (MVT InnerVT : MVT::vector_valuetypes()) {
+ for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
+ for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
setTruncStoreAction(VT, InnerVT, Expand);
addAllExtLoads(VT, InnerVT, Expand);
}
@@ -669,8 +732,10 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
addMVEVectorTypes(Subtarget->hasMVEFloatOps());
// Combine low-overhead loop intrinsics so that we can lower i1 types.
- if (Subtarget->hasLOB())
+ if (Subtarget->hasLOB()) {
setTargetDAGCombine(ISD::BRCOND);
+ setTargetDAGCombine(ISD::BR_CC);
+ }
if (Subtarget->hasNEON()) {
addDRTypeForNEON(MVT::v2f32);
@@ -837,10 +902,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::SHL);
setTargetDAGCombine(ISD::SRL);
setTargetDAGCombine(ISD::SRA);
- setTargetDAGCombine(ISD::SIGN_EXTEND);
- setTargetDAGCombine(ISD::ZERO_EXTEND);
- setTargetDAGCombine(ISD::ANY_EXTEND);
- setTargetDAGCombine(ISD::STORE);
setTargetDAGCombine(ISD::FP_TO_SINT);
setTargetDAGCombine(ISD::FP_TO_UINT);
setTargetDAGCombine(ISD::FDIV);
@@ -849,7 +910,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
// It is legal to extload from v4i8 to v4i16 or v4i32.
for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16,
MVT::v2i32}) {
- for (MVT VT : MVT::integer_vector_valuetypes()) {
+ for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) {
setLoadExtAction(ISD::EXTLOAD, VT, Ty, Legal);
setLoadExtAction(ISD::ZEXTLOAD, VT, Ty, Legal);
setLoadExtAction(ISD::SEXTLOAD, VT, Ty, Legal);
@@ -861,6 +922,10 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::BUILD_VECTOR);
setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
+ setTargetDAGCombine(ISD::STORE);
+ setTargetDAGCombine(ISD::SIGN_EXTEND);
+ setTargetDAGCombine(ISD::ZERO_EXTEND);
+ setTargetDAGCombine(ISD::ANY_EXTEND);
}
if (!Subtarget->hasFP64()) {
@@ -901,9 +966,10 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
}
- if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()){
+ if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) {
setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
- setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
+ if (Subtarget->hasFullFP16())
+ setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
}
if (!Subtarget->hasFP16())
@@ -955,6 +1021,16 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::ADDCARRY, MVT::i32, Custom);
setOperationAction(ISD::SUBCARRY, MVT::i32, Custom);
+ if (Subtarget->hasDSP()) {
+ setOperationAction(ISD::SADDSAT, MVT::i8, Custom);
+ setOperationAction(ISD::SSUBSAT, MVT::i8, Custom);
+ setOperationAction(ISD::SADDSAT, MVT::i16, Custom);
+ setOperationAction(ISD::SSUBSAT, MVT::i16, Custom);
+ }
+ if (Subtarget->hasBaseDSP()) {
+ setOperationAction(ISD::SADDSAT, MVT::i32, Legal);
+ setOperationAction(ISD::SSUBSAT, MVT::i32, Legal);
+ }
// i64 operation support.
setOperationAction(ISD::MUL, MVT::i64, Expand);
@@ -972,6 +1048,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
setOperationAction(ISD::SRL, MVT::i64, Custom);
setOperationAction(ISD::SRA, MVT::i64, Custom);
+ setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
// MVE lowers 64 bit shifts to lsll and lsrl
@@ -991,7 +1068,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
// ARM does not have ROTL.
setOperationAction(ISD::ROTL, MVT::i32, Expand);
- for (MVT VT : MVT::vector_valuetypes()) {
+ for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
setOperationAction(ISD::ROTL, VT, Expand);
setOperationAction(ISD::ROTR, VT, Expand);
}
@@ -1365,14 +1442,14 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
// On ARM arguments smaller than 4 bytes are extended, so all arguments
// are at least 4 bytes aligned.
- setMinStackArgumentAlignment(4);
+ setMinStackArgumentAlignment(Align(4));
// Prefer likely predicted branches to selects on out-of-order cores.
PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder();
- setPrefLoopAlignment(Subtarget->getPrefLoopAlignment());
+ setPrefLoopAlignment(Align(1ULL << Subtarget->getPrefLoopLogAlignment()));
- setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2);
+ setMinFunctionAlignment(Subtarget->isThumb() ? Align(2) : Align(4));
if (Subtarget->isThumb() || Subtarget->isThumb2())
setTargetDAGCombine(ISD::ABS);
@@ -1472,6 +1549,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::ADDE: return "ARMISD::ADDE";
case ARMISD::SUBC: return "ARMISD::SUBC";
case ARMISD::SUBE: return "ARMISD::SUBE";
+ case ARMISD::LSLS: return "ARMISD::LSLS";
case ARMISD::VMOVRRD: return "ARMISD::VMOVRRD";
case ARMISD::VMOVDRR: return "ARMISD::VMOVDRR";
@@ -1496,16 +1574,9 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::WIN__CHKSTK: return "ARMISD::WIN__CHKSTK";
case ARMISD::WIN__DBZCHK: return "ARMISD::WIN__DBZCHK";
- case ARMISD::VCEQ: return "ARMISD::VCEQ";
- case ARMISD::VCEQZ: return "ARMISD::VCEQZ";
- case ARMISD::VCGE: return "ARMISD::VCGE";
- case ARMISD::VCGEZ: return "ARMISD::VCGEZ";
- case ARMISD::VCLEZ: return "ARMISD::VCLEZ";
- case ARMISD::VCGEU: return "ARMISD::VCGEU";
- case ARMISD::VCGT: return "ARMISD::VCGT";
- case ARMISD::VCGTZ: return "ARMISD::VCGTZ";
- case ARMISD::VCLTZ: return "ARMISD::VCLTZ";
- case ARMISD::VCGTU: return "ARMISD::VCGTU";
+ case ARMISD::PREDICATE_CAST: return "ARMISD::PREDICATE_CAST";
+ case ARMISD::VCMP: return "ARMISD::VCMP";
+ case ARMISD::VCMPZ: return "ARMISD::VCMPZ";
case ARMISD::VTST: return "ARMISD::VTST";
case ARMISD::VSHLs: return "ARMISD::VSHLs";
@@ -1543,6 +1614,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::VTRN: return "ARMISD::VTRN";
case ARMISD::VTBL1: return "ARMISD::VTBL1";
case ARMISD::VTBL2: return "ARMISD::VTBL2";
+ case ARMISD::VMOVN: return "ARMISD::VMOVN";
case ARMISD::VMULLs: return "ARMISD::VMULLs";
case ARMISD::VMULLu: return "ARMISD::VMULLu";
case ARMISD::UMAAL: return "ARMISD::UMAAL";
@@ -1560,6 +1632,10 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::SMLSLDX: return "ARMISD::SMLSLDX";
case ARMISD::SMMLAR: return "ARMISD::SMMLAR";
case ARMISD::SMMLSR: return "ARMISD::SMMLSR";
+ case ARMISD::QADD16b: return "ARMISD::QADD16b";
+ case ARMISD::QSUB16b: return "ARMISD::QSUB16b";
+ case ARMISD::QADD8b: return "ARMISD::QADD8b";
+ case ARMISD::QSUB8b: return "ARMISD::QSUB8b";
case ARMISD::BUILD_VECTOR: return "ARMISD::BUILD_VECTOR";
case ARMISD::BFI: return "ARMISD::BFI";
case ARMISD::VORRIMM: return "ARMISD::VORRIMM";
@@ -1589,6 +1665,11 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::VST3LN_UPD: return "ARMISD::VST3LN_UPD";
case ARMISD::VST4LN_UPD: return "ARMISD::VST4LN_UPD";
case ARMISD::WLS: return "ARMISD::WLS";
+ case ARMISD::LE: return "ARMISD::LE";
+ case ARMISD::LOOP_DEC: return "ARMISD::LOOP_DEC";
+ case ARMISD::CSINV: return "ARMISD::CSINV";
+ case ARMISD::CSNEG: return "ARMISD::CSNEG";
+ case ARMISD::CSINC: return "ARMISD::CSINC";
}
return nullptr;
}
@@ -1597,6 +1678,11 @@ EVT ARMTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
EVT VT) const {
if (!VT.isVector())
return getPointerTy(DL);
+
+ // MVE has a predicate register.
+ if (Subtarget->hasMVEIntegerOps() &&
+ (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8))
+ return MVT::getVectorVT(MVT::i1, VT.getVectorElementCount());
return VT.changeVectorElementTypeToInteger();
}
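// Sketch (illustrative): with MVE integer ops the native 128-bit vector types
// now produce i1 predicate vectors from compares -- v16i8 -> v16i1,
// v8i16 -> v8i1, v4i32 -> v4i1 -- i.e. the lane count is kept and each lane
// becomes a single predicate bit:
#include <cassert>
struct SimpleVT { unsigned NumElts, EltBits; };
static SimpleVT mveSetCCResultType(SimpleVT VT) {
  return {VT.NumElts, 1}; // lane count preserved, element type becomes i1
}
int main() {
  const SimpleVT R = mveSetCCResultType({4, 32}); // v4i32 -> v4i1
  assert(R.NumElts == 4 && R.EltBits == 1);
}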
@@ -1726,34 +1812,22 @@ static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) {
/// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
- ARMCC::CondCodes &CondCode2, bool &InvalidOnQNaN) {
+ ARMCC::CondCodes &CondCode2) {
CondCode2 = ARMCC::AL;
- InvalidOnQNaN = true;
switch (CC) {
default: llvm_unreachable("Unknown FP condition!");
case ISD::SETEQ:
- case ISD::SETOEQ:
- CondCode = ARMCC::EQ;
- InvalidOnQNaN = false;
- break;
+ case ISD::SETOEQ: CondCode = ARMCC::EQ; break;
case ISD::SETGT:
case ISD::SETOGT: CondCode = ARMCC::GT; break;
case ISD::SETGE:
case ISD::SETOGE: CondCode = ARMCC::GE; break;
case ISD::SETOLT: CondCode = ARMCC::MI; break;
case ISD::SETOLE: CondCode = ARMCC::LS; break;
- case ISD::SETONE:
- CondCode = ARMCC::MI;
- CondCode2 = ARMCC::GT;
- InvalidOnQNaN = false;
- break;
+ case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break;
case ISD::SETO: CondCode = ARMCC::VC; break;
case ISD::SETUO: CondCode = ARMCC::VS; break;
- case ISD::SETUEQ:
- CondCode = ARMCC::EQ;
- CondCode2 = ARMCC::VS;
- InvalidOnQNaN = false;
- break;
+ case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break;
case ISD::SETUGT: CondCode = ARMCC::HI; break;
case ISD::SETUGE: CondCode = ARMCC::PL; break;
case ISD::SETLT:
@@ -1761,10 +1835,7 @@ static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
case ISD::SETLE:
case ISD::SETULE: CondCode = ARMCC::LE; break;
case ISD::SETNE:
- case ISD::SETUNE:
- CondCode = ARMCC::NE;
- InvalidOnQNaN = false;
- break;
+ case ISD::SETUNE: CondCode = ARMCC::NE; break;
}
}
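// Worked example (illustrative): SETONE ("ordered and not equal") has no
// single ARM condition, so it maps to CondCode = MI with CondCode2 = GT and
// the callers emit a second conditional op. In plain C terms the predicate
// is "(a < b) || (a > b)", which is false whenever a NaN makes the compare
// unordered:
#include <cassert>
#include <cmath>
static bool setONE(double A, double B) { return (A < B) || (A > B); }
int main() {
  assert(setONE(1.0, 2.0) && setONE(2.0, 1.0));
  assert(!setONE(1.0, 1.0));
  assert(!setONE(NAN, 1.0)); // unordered: both the MI and GT checks fail
}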
@@ -1988,6 +2059,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
bool isVarArg = CLI.IsVarArg;
MachineFunction &MF = DAG.getMachineFunction();
+ MachineFunction::CallSiteInfo CSInfo;
bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
bool isThisReturn = false;
auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls");
@@ -2112,6 +2184,9 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
"unexpected use of 'returned'");
isThisReturn = true;
}
+ const TargetOptions &Options = DAG.getTarget().Options;
+ if (Options.EnableDebugEntryValues)
+ CSInfo.emplace_back(VA.getLocReg(), i);
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
} else if (isByVal) {
assert(VA.isMemLoc());
@@ -2347,12 +2422,15 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
if (isTailCall) {
MF.getFrameInfo().setHasTailCall();
- return DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, Ops);
+ SDValue Ret = DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, Ops);
+ DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
+ return Ret;
}
// Returns a chain and a flag for retval copy to use.
Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops);
InFlag = Chain.getValue(1);
+ DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
@@ -2431,7 +2509,7 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
int FI = std::numeric_limits<int>::max();
if (Arg.getOpcode() == ISD::CopyFromReg) {
unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
- if (!TargetRegisterInfo::isVirtualRegister(VR))
+ if (!Register::isVirtualRegister(VR))
return false;
MachineInstr *Def = MRI->getVRegDef(VR);
if (!Def)
@@ -3047,12 +3125,12 @@ ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op,
// Load the current TEB (thread environment block)
SDValue Ops[] = {Chain,
- DAG.getConstant(Intrinsic::arm_mrc, DL, MVT::i32),
- DAG.getConstant(15, DL, MVT::i32),
- DAG.getConstant(0, DL, MVT::i32),
- DAG.getConstant(13, DL, MVT::i32),
- DAG.getConstant(0, DL, MVT::i32),
- DAG.getConstant(2, DL, MVT::i32)};
+ DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32),
+ DAG.getTargetConstant(15, DL, MVT::i32),
+ DAG.getTargetConstant(0, DL, MVT::i32),
+ DAG.getTargetConstant(13, DL, MVT::i32),
+ DAG.getTargetConstant(0, DL, MVT::i32),
+ DAG.getTargetConstant(2, DL, MVT::i32)};
SDValue CurrentTEB = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
DAG.getVTList(MVT::i32, MVT::Other), Ops);
@@ -3498,6 +3576,48 @@ SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
Op.getOperand(0));
}
+SDValue ARMTargetLowering::LowerINTRINSIC_VOID(
+ SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const {
+ unsigned IntNo =
+ cast<ConstantSDNode>(
+ Op.getOperand(Op.getOperand(0).getValueType() == MVT::Other))
+ ->getZExtValue();
+ switch (IntNo) {
+ default:
+ return SDValue(); // Don't custom lower most intrinsics.
+ case Intrinsic::arm_gnu_eabi_mcount: {
+ MachineFunction &MF = DAG.getMachineFunction();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+ SDLoc dl(Op);
+ SDValue Chain = Op.getOperand(0);
+ // call "\01__gnu_mcount_nc"
+ const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
+ const uint32_t *Mask =
+ ARI->getCallPreservedMask(DAG.getMachineFunction(), CallingConv::C);
+ assert(Mask && "Missing call preserved mask for calling convention");
+ // Mark LR an implicit live-in.
+ unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
+ SDValue ReturnAddress =
+ DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, PtrVT);
+ std::vector<EVT> ResultTys = {MVT::Other, MVT::Glue};
+ SDValue Callee =
+ DAG.getTargetExternalSymbol("\01__gnu_mcount_nc", PtrVT, 0);
+ SDValue RegisterMask = DAG.getRegisterMask(Mask);
+ if (Subtarget->isThumb())
+ return SDValue(
+ DAG.getMachineNode(
+ ARM::tBL_PUSHLR, dl, ResultTys,
+ {ReturnAddress, DAG.getTargetConstant(ARMCC::AL, dl, PtrVT),
+ DAG.getRegister(0, PtrVT), Callee, RegisterMask, Chain}),
+ 0);
+ return SDValue(
+ DAG.getMachineNode(ARM::BL_PUSHLR, dl, ResultTys,
+ {ReturnAddress, Callee, RegisterMask, Chain}),
+ 0);
+ }
+ }
+}
+
SDValue
ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
const ARMSubtarget *Subtarget) const {
@@ -3898,6 +4018,12 @@ SDValue ARMTargetLowering::LowerFormalArguments(
// Transform the arguments in physical registers into virtual ones.
unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
+
+ // If this value is passed in r0 and has the returned attribute (e.g.
+ // C++ 'structors), record this fact for later use.
+ if (VA.getLocReg() == ARM::R0 && Ins[VA.getValNo()].Flags.isReturned()) {
+ AFI->setPreservesR0();
+ }
}
// If this is an 8 or 16-bit value, it is really passed promoted
@@ -4049,6 +4175,67 @@ SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
std::swap(LHS, RHS);
}
+ // Thumb1 has very limited immediate modes, so turning an "and" into a
+ // shift can save multiple instructions.
+ //
+ // If we have (x & C1), and C1 is an appropriate mask, we can transform it
+ // into "((x << n) >> n)". But that isn't necessarily profitable on its
+ // own. If it's the operand to an unsigned comparison with an immediate,
+ // we can eliminate one of the shifts: we transform
+ // "((x << n) >> n) == C2" to "(x << n) == (C2 << n)".
+ //
+ // We avoid transforming cases which aren't profitable due to encoding
+ // details:
+ //
+ // 1. C2 fits into the immediate field of a cmp, and the transformed version
+ // would not; in that case, we're essentially trading one immediate load for
+ // another.
+ // 2. C1 is 255 or 65535, so we can use uxtb or uxth.
+ // 3. C2 is zero; we have other code for this special case.
+ //
+ // FIXME: Figure out profitability for Thumb2; we usually can't save an
+ // instruction, since the AND is always one instruction anyway, but we could
+ // use narrow instructions in some cases.
+ if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::AND &&
+ LHS->hasOneUse() && isa<ConstantSDNode>(LHS.getOperand(1)) &&
+ LHS.getValueType() == MVT::i32 && isa<ConstantSDNode>(RHS) &&
+ !isSignedIntSetCC(CC)) {
+ unsigned Mask = cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue();
+ auto *RHSC = cast<ConstantSDNode>(RHS.getNode());
+ uint64_t RHSV = RHSC->getZExtValue();
+ if (isMask_32(Mask) && (RHSV & ~Mask) == 0 && Mask != 255 && Mask != 65535) {
+ unsigned ShiftBits = countLeadingZeros(Mask);
+ if (RHSV && (RHSV > 255 || (RHSV << ShiftBits) <= 255)) {
+ SDValue ShiftAmt = DAG.getConstant(ShiftBits, dl, MVT::i32);
+ LHS = DAG.getNode(ISD::SHL, dl, MVT::i32, LHS.getOperand(0), ShiftAmt);
+ RHS = DAG.getConstant(RHSV << ShiftBits, dl, MVT::i32);
+ }
+ }
+ }
+
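// Worked example (illustrative numbers): with Mask = 0x3ffff (an 18-bit mask,
// so ShiftBits = 14) and C2 = 0x1234, Thumb1 cannot encode either constant in
// a cmp, but "(x & 0x3ffff) == 0x1234" is equivalent to
// "(x << 14) == (0x1234 << 14)", which drops the AND entirely:
#include <cassert>
#include <cstdint>
int main() {
  const uint32_t Mask = 0x3ffff;  // isMask_32(Mask), and not 255 or 65535
  const unsigned ShiftBits = 14;  // countLeadingZeros(Mask)
  const uint32_t C2 = 0x1234;     // (C2 & ~Mask) == 0 and C2 > 255
  const uint32_t Xs[] = {0x1234u, 0x00011234u, 0xdead1234u, 0u};
  for (uint32_t X : Xs)
    assert(((X & Mask) == C2) ==
           ((uint32_t)(X << ShiftBits) == (uint32_t)(C2 << ShiftBits)));
}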
+ // The specific comparison "(x<<c) > 0x80000000U" can be optimized to a
+ // single "lsls x, c+1". The shift sets the "C" and "Z" flags the same
+ // way a cmp would.
+ // FIXME: Add support for ARM/Thumb2; this would need isel patterns, and
+ // some tweaks to the heuristics for the previous and->shift transform.
+ // FIXME: Optimize cases where the LHS isn't a shift.
+ if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::SHL &&
+ isa<ConstantSDNode>(RHS) &&
+ cast<ConstantSDNode>(RHS)->getZExtValue() == 0x80000000U &&
+ CC == ISD::SETUGT && isa<ConstantSDNode>(LHS.getOperand(1)) &&
+ cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() < 31) {
+ unsigned ShiftAmt =
+ cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() + 1;
+ SDValue Shift = DAG.getNode(ARMISD::LSLS, dl,
+ DAG.getVTList(MVT::i32, MVT::i32),
+ LHS.getOperand(0),
+ DAG.getConstant(ShiftAmt, dl, MVT::i32));
+ SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR,
+ Shift.getValue(1), SDValue());
+ ARMcc = DAG.getConstant(ARMCC::HI, dl, MVT::i32);
+ return Chain.getValue(1);
+ }
+
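// Worked example (illustrative): for c == 2, "(x << 2) > 0x80000000u" holds
// exactly when "lsls x, 3" leaves C set (the last bit shifted out) and Z
// clear (the 32-bit result is non-zero), i.e. the HI condition:
#include <cassert>
#include <cstdint>
int main() {
  const unsigned c = 2;
  const uint32_t Tests[] = {0u, 1u, 0x1fffffffu, 0x20000000u, 0x20000001u,
                            0x3fffffffu, 0x40000000u, 0x80000000u, 0xffffffffu};
  for (uint32_t X : Tests) {
    const bool Cmp = (uint32_t)((uint64_t)X << c) > 0x80000000u;
    const uint64_t Wide = (uint64_t)X << (c + 1);
    const bool Carry = (Wide >> 32) & 1;   // flag C after "lsls x, c+1"
    const bool Zero = (uint32_t)Wide == 0; // flag Z after "lsls x, c+1"
    assert(Cmp == (Carry && !Zero));       // HI: C set and Z clear
  }
}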
ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
// If the RHS is a constant zero then the V (overflow) flag will never be
@@ -4083,15 +4270,13 @@ SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
/// Returns an appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS,
- SelectionDAG &DAG, const SDLoc &dl,
- bool InvalidOnQNaN) const {
+ SelectionDAG &DAG, const SDLoc &dl) const {
assert(Subtarget->hasFP64() || RHS.getValueType() != MVT::f64);
SDValue Cmp;
- SDValue C = DAG.getConstant(InvalidOnQNaN, dl, MVT::i32);
if (!isFloatingPointZero(RHS))
- Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Glue, LHS, RHS, C);
+ Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Glue, LHS, RHS);
else
- Cmp = DAG.getNode(ARMISD::CMPFPw0, dl, MVT::Glue, LHS, C);
+ Cmp = DAG.getNode(ARMISD::CMPFPw0, dl, MVT::Glue, LHS);
return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Cmp);
}
@@ -4108,12 +4293,10 @@ ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const {
Cmp = Cmp.getOperand(0);
Opc = Cmp.getOpcode();
if (Opc == ARMISD::CMPFP)
- Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),
- Cmp.getOperand(1), Cmp.getOperand(2));
+ Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1));
else {
assert(Opc == ARMISD::CMPFPw0 && "unexpected operand of FMSTAT");
- Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),
- Cmp.getOperand(1));
+ Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0));
}
return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp);
}
@@ -4276,6 +4459,35 @@ SDValue ARMTargetLowering::LowerUnsignedALUO(SDValue Op,
return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
}
+static SDValue LowerSADDSUBSAT(SDValue Op, SelectionDAG &DAG,
+ const ARMSubtarget *Subtarget) {
+ EVT VT = Op.getValueType();
+ if (!Subtarget->hasDSP())
+ return SDValue();
+ if (!VT.isSimple())
+ return SDValue();
+
+ unsigned NewOpcode;
+ bool IsAdd = Op->getOpcode() == ISD::SADDSAT;
+ switch (VT.getSimpleVT().SimpleTy) {
+ default:
+ return SDValue();
+ case MVT::i8:
+ NewOpcode = IsAdd ? ARMISD::QADD8b : ARMISD::QSUB8b;
+ break;
+ case MVT::i16:
+ NewOpcode = IsAdd ? ARMISD::QADD16b : ARMISD::QSUB16b;
+ break;
+ }
+
+ SDLoc dl(Op);
+ SDValue Add =
+ DAG.getNode(NewOpcode, dl, MVT::i32,
+ DAG.getSExtOrTrunc(Op->getOperand(0), dl, MVT::i32),
+ DAG.getSExtOrTrunc(Op->getOperand(1), dl, MVT::i32));
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, Add);
+}
+
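// Scalar model (assumption: plain C arithmetic, not the real DAG nodes): the
// lowering sign-extends the narrow operands to i32, performs the saturating
// QADD16b/QSUB16b-style operation there, and truncates the result back.
#include <cassert>
#include <cstdint>
static int16_t saddsat16(int16_t A, int16_t B) {
  int32_t Wide = (int32_t)A + (int32_t)B; // cannot overflow in i32
  if (Wide > INT16_MAX) Wide = INT16_MAX; // saturate towards +inf
  if (Wide < INT16_MIN) Wide = INT16_MIN; // saturate towards -inf
  return (int16_t)Wide;                   // the final ISD::TRUNCATE
}
int main() {
  assert(saddsat16(32000, 1000) == INT16_MAX);
  assert(saddsat16(-32000, -1000) == INT16_MIN);
  assert(saddsat16(100, 23) == 123);
}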
SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
SDValue Cond = Op.getOperand(0);
SDValue SelectTrue = Op.getOperand(1);
@@ -4656,10 +4868,62 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
SDValue TrueVal = Op.getOperand(2);
SDValue FalseVal = Op.getOperand(3);
+ ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FalseVal);
+ ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TrueVal);
+
+ if (Subtarget->hasV8_1MMainlineOps() && CFVal && CTVal &&
+ LHS.getValueType() == MVT::i32 && RHS.getValueType() == MVT::i32) {
+ unsigned TVal = CTVal->getZExtValue();
+ unsigned FVal = CFVal->getZExtValue();
+ unsigned Opcode = 0;
+
+ if (TVal == ~FVal) {
+ Opcode = ARMISD::CSINV;
+ } else if (TVal == ~FVal + 1) {
+ Opcode = ARMISD::CSNEG;
+ } else if (TVal + 1 == FVal) {
+ Opcode = ARMISD::CSINC;
+ } else if (TVal == FVal + 1) {
+ Opcode = ARMISD::CSINC;
+ std::swap(TrueVal, FalseVal);
+ std::swap(TVal, FVal);
+ CC = ISD::getSetCCInverse(CC, true);
+ }
+
+ if (Opcode) {
+ // If one of the constants is cheaper than another, materialise the
+ // cheaper one and let the csel generate the other.
+ if (Opcode != ARMISD::CSINC &&
+ HasLowerConstantMaterializationCost(FVal, TVal, Subtarget)) {
+ std::swap(TrueVal, FalseVal);
+ std::swap(TVal, FVal);
+ CC = ISD::getSetCCInverse(CC, true);
+ }
+
+ // Attempt to use ZR by checking whether TVal is 0, possibly inverting the
+ // condition to get there. CSINC is not invertible like the other two
+ // (~(~a) == a and -(-a) == a, but (a+1)+1 != a).
+ if (FVal == 0 && Opcode != ARMISD::CSINC) {
+ std::swap(TrueVal, FalseVal);
+ std::swap(TVal, FVal);
+ CC = ISD::getSetCCInverse(CC, true);
+ }
+ if (TVal == 0)
+ TrueVal = DAG.getRegister(ARM::ZR, MVT::i32);
+
+ // Drop FVal's value; we can recover it by inverting/negating TVal.
+ FalseVal = TrueVal;
+
+ SDValue ARMcc;
+ SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
+ EVT VT = TrueVal.getValueType();
+ return DAG.getNode(Opcode, dl, VT, TrueVal, FalseVal, ARMcc, Cmp);
+ }
+ }
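// Sketch (illustrative): only TrueVal needs materialising because each
// instruction rebuilds FalseVal from it -- CSINV when FVal == ~TVal,
// CSNEG when FVal == -TVal, CSINC when FVal == TVal + 1:
#include <cassert>
#include <cstdint>
static uint32_t csinv(bool P, uint32_t V) { return P ? V : ~V; }
static uint32_t csneg(bool P, uint32_t V) { return P ? V : 0u - V; }
static uint32_t csinc(bool P, uint32_t V) { return P ? V : V + 1; }
int main() {
  // select(p, 5, ~5) / select(p, 5, -5) / select(p, 5, 6), with both operands
  // wired to the single materialised constant 5:
  assert(csinv(false, 5) == ~5u && csinv(true, 5) == 5u);
  assert(csneg(false, 5) == (uint32_t)-5);
  assert(csinc(false, 5) == 6u);
}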
if (isUnsupportedFloatingType(LHS.getValueType())) {
DAG.getTargetLoweringInfo().softenSetCCOperands(
- DAG, LHS.getValueType(), LHS, RHS, CC, dl);
+ DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);
// If softenSetCCOperands only returned one value, we should compare it to
// zero.
@@ -4701,8 +4965,7 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
}
ARMCC::CondCodes CondCode, CondCode2;
- bool InvalidOnQNaN;
- FPCCToARMCC(CC, CondCode, CondCode2, InvalidOnQNaN);
+ FPCCToARMCC(CC, CondCode, CondCode2);
// Normalize the fp compare. If RHS is zero we prefer to keep it there so we
// match CMPFPw0 instead of CMPFP, though we don't do this for f16 because we
@@ -4727,13 +4990,13 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
}
SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
- SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, InvalidOnQNaN);
+ SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
if (CondCode2 != ARMCC::AL) {
SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32);
// FIXME: Needs another CMP because flag can have but one use.
- SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl, InvalidOnQNaN);
+ SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl);
Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, CCR, Cmp2, DAG);
}
return Result;
@@ -4903,7 +5166,7 @@ SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
if (isUnsupportedFloatingType(LHS.getValueType())) {
DAG.getTargetLoweringInfo().softenSetCCOperands(
- DAG, LHS.getValueType(), LHS, RHS, CC, dl);
+ DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);
// If softenSetCCOperands only returned one value, we should compare it to
// zero.
@@ -4960,11 +5223,10 @@ SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
}
ARMCC::CondCodes CondCode, CondCode2;
- bool InvalidOnQNaN;
- FPCCToARMCC(CC, CondCode, CondCode2, InvalidOnQNaN);
+ FPCCToARMCC(CC, CondCode, CondCode2);
SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
- SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, InvalidOnQNaN);
+ SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp };
@@ -5056,8 +5318,9 @@ SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
else
LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(),
Op.getValueType());
+ MakeLibCallOptions CallOptions;
return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0),
- /*isSigned*/ false, SDLoc(Op)).first;
+ CallOptions, SDLoc(Op)).first;
}
return Op;
@@ -5120,8 +5383,9 @@ SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const {
else
LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(),
Op.getValueType());
+ MakeLibCallOptions CallOptions;
return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0),
- /*isSigned*/ false, SDLoc(Op)).first;
+ CallOptions, SDLoc(Op)).first;
}
return Op;
@@ -5140,7 +5404,7 @@ SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
if (UseNEON) {
// Use VBSL to copy the sign bit.
- unsigned EncodedVal = ARM_AM::createNEONModImm(0x6, 0x80);
+ unsigned EncodedVal = ARM_AM::createVMOVModImm(0x6, 0x80);
SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32,
DAG.getTargetConstant(EncodedVal, dl, MVT::i32));
EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64;
@@ -5163,7 +5427,7 @@ SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0);
Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1);
- SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createNEONModImm(0xe, 0xff),
+ SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff),
dl, MVT::i32);
AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes);
SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask,
@@ -5243,7 +5507,7 @@ SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
SDLoc dl(Op); // FIXME probably not meaningful
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
- unsigned FrameReg = ARI.getFrameRegister(MF);
+ Register FrameReg = ARI.getFrameRegister(MF);
SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
while (Depth--)
FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
@@ -5253,9 +5517,9 @@ SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
-unsigned ARMTargetLowering::getRegisterByName(const char* RegName, EVT VT,
- SelectionDAG &DAG) const {
- unsigned Reg = StringSwitch<unsigned>(RegName)
+Register ARMTargetLowering::getRegisterByName(const char* RegName, EVT VT,
+ const MachineFunction &MF) const {
+ Register Reg = StringSwitch<unsigned>(RegName)
.Case("sp", ARM::SP)
.Default(0);
if (Reg)
@@ -5576,8 +5840,7 @@ static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
const ARMSubtarget *ST) {
SDLoc dl(N);
EVT VT = N->getValueType(0);
- if (VT.isVector()) {
- assert(ST->hasNEON());
+ if (VT.isVector() && ST->hasNEON()) {
// Compute the least significant set bit: LSB = X & -X
SDValue X = N->getOperand(0);
@@ -5777,14 +6040,15 @@ static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG,
unsigned ShPartsOpc = ARMISD::LSLL;
ConstantSDNode *Con = dyn_cast<ConstantSDNode>(ShAmt);
- // If the shift amount is greater than 32 then do the default optimisation
- if (Con && Con->getZExtValue() > 32)
+  // If the shift amount is zero, at least 32, or its type is wider than 64
+  // bits then do the default optimisation
+ if (ShAmt->getValueType(0).getSizeInBits() > 64 ||
+ (Con && (Con->getZExtValue() == 0 || Con->getZExtValue() >= 32)))
return SDValue();
- // Extract the lower 32 bits of the shift amount if it's an i64
- if (ShAmt->getValueType(0) == MVT::i64)
- ShAmt = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, ShAmt,
- DAG.getConstant(0, dl, MVT::i32));
+ // Extract the lower 32 bits of the shift amount if it's not an i32
+ if (ShAmt->getValueType(0) != MVT::i32)
+ ShAmt = DAG.getZExtOrTrunc(ShAmt, dl, MVT::i32);
if (ShOpc == ISD::SRL) {
if (!Con)
@@ -5839,20 +6103,37 @@ static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
}
-static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
- SDValue TmpOp0, TmpOp1;
+static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG,
+ const ARMSubtarget *ST) {
bool Invert = false;
bool Swap = false;
- unsigned Opc = 0;
+ unsigned Opc = ARMCC::AL;
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
SDValue CC = Op.getOperand(2);
- EVT CmpVT = Op0.getValueType().changeVectorElementTypeToInteger();
EVT VT = Op.getValueType();
ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
SDLoc dl(Op);
+ EVT CmpVT;
+ if (ST->hasNEON())
+ CmpVT = Op0.getValueType().changeVectorElementTypeToInteger();
+ else {
+ assert(ST->hasMVEIntegerOps() &&
+ "No hardware support for integer vector comparison!");
+
+ if (Op.getValueType().getVectorElementType() != MVT::i1)
+ return SDValue();
+
+ // Make sure we expand floating point setcc to scalar if we do not have
+ // mve.fp, so that we can handle them from there.
+ if (Op0.getValueType().isFloatingPoint() && !ST->hasMVEFloatOps())
+ return SDValue();
+
+ CmpVT = VT;
+ }
+
if (Op0.getValueType().getVectorElementType() == MVT::i64 &&
(SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE)) {
// Special-case integer 64-bit equality comparisons. They aren't legal,
@@ -5880,60 +6161,74 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
switch (SetCCOpcode) {
default: llvm_unreachable("Illegal FP comparison");
case ISD::SETUNE:
- case ISD::SETNE: Invert = true; LLVM_FALLTHROUGH;
+ case ISD::SETNE:
+ if (ST->hasMVEFloatOps()) {
+ Opc = ARMCC::NE; break;
+ } else {
+ Invert = true; LLVM_FALLTHROUGH;
+ }
case ISD::SETOEQ:
- case ISD::SETEQ: Opc = ARMISD::VCEQ; break;
+ case ISD::SETEQ: Opc = ARMCC::EQ; break;
case ISD::SETOLT:
case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH;
case ISD::SETOGT:
- case ISD::SETGT: Opc = ARMISD::VCGT; break;
+ case ISD::SETGT: Opc = ARMCC::GT; break;
case ISD::SETOLE:
case ISD::SETLE: Swap = true; LLVM_FALLTHROUGH;
case ISD::SETOGE:
- case ISD::SETGE: Opc = ARMISD::VCGE; break;
+ case ISD::SETGE: Opc = ARMCC::GE; break;
case ISD::SETUGE: Swap = true; LLVM_FALLTHROUGH;
- case ISD::SETULE: Invert = true; Opc = ARMISD::VCGT; break;
+ case ISD::SETULE: Invert = true; Opc = ARMCC::GT; break;
case ISD::SETUGT: Swap = true; LLVM_FALLTHROUGH;
- case ISD::SETULT: Invert = true; Opc = ARMISD::VCGE; break;
+ case ISD::SETULT: Invert = true; Opc = ARMCC::GE; break;
case ISD::SETUEQ: Invert = true; LLVM_FALLTHROUGH;
- case ISD::SETONE:
+ case ISD::SETONE: {
// Expand this to (OLT | OGT).
- TmpOp0 = Op0;
- TmpOp1 = Op1;
- Opc = ISD::OR;
- Op0 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp1, TmpOp0);
- Op1 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp0, TmpOp1);
- break;
- case ISD::SETUO:
- Invert = true;
- LLVM_FALLTHROUGH;
- case ISD::SETO:
+ SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0,
+ DAG.getConstant(ARMCC::GT, dl, MVT::i32));
+ SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
+ DAG.getConstant(ARMCC::GT, dl, MVT::i32));
+ SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1);
+ if (Invert)
+ Result = DAG.getNOT(dl, Result, VT);
+ return Result;
+ }
+ case ISD::SETUO: Invert = true; LLVM_FALLTHROUGH;
+ case ISD::SETO: {
// Expand this to (OLT | OGE).
- TmpOp0 = Op0;
- TmpOp1 = Op1;
- Opc = ISD::OR;
- Op0 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp1, TmpOp0);
- Op1 = DAG.getNode(ARMISD::VCGE, dl, CmpVT, TmpOp0, TmpOp1);
- break;
+ SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0,
+ DAG.getConstant(ARMCC::GT, dl, MVT::i32));
+ SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
+ DAG.getConstant(ARMCC::GE, dl, MVT::i32));
+ SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1);
+ if (Invert)
+ Result = DAG.getNOT(dl, Result, VT);
+ return Result;
+ }
}
} else {
// Integer comparisons.
switch (SetCCOpcode) {
default: llvm_unreachable("Illegal integer comparison");
- case ISD::SETNE: Invert = true; LLVM_FALLTHROUGH;
- case ISD::SETEQ: Opc = ARMISD::VCEQ; break;
+ case ISD::SETNE:
+ if (ST->hasMVEIntegerOps()) {
+ Opc = ARMCC::NE; break;
+ } else {
+ Invert = true; LLVM_FALLTHROUGH;
+ }
+ case ISD::SETEQ: Opc = ARMCC::EQ; break;
case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH;
- case ISD::SETGT: Opc = ARMISD::VCGT; break;
+ case ISD::SETGT: Opc = ARMCC::GT; break;
case ISD::SETLE: Swap = true; LLVM_FALLTHROUGH;
- case ISD::SETGE: Opc = ARMISD::VCGE; break;
+ case ISD::SETGE: Opc = ARMCC::GE; break;
case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
- case ISD::SETUGT: Opc = ARMISD::VCGTU; break;
+ case ISD::SETUGT: Opc = ARMCC::HI; break;
case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
- case ISD::SETUGE: Opc = ARMISD::VCGEU; break;
+ case ISD::SETUGE: Opc = ARMCC::HS; break;
}
// Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero).
- if (Opc == ARMISD::VCEQ) {
+ if (ST->hasNEON() && Opc == ARMCC::EQ) {
SDValue AndOp;
if (ISD::isBuildVectorAllZeros(Op1.getNode()))
AndOp = Op0;
@@ -5945,10 +6240,12 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
AndOp = AndOp.getOperand(0);
if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) {
- Opc = ARMISD::VTST;
Op0 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(0));
Op1 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(1));
- Invert = !Invert;
+ SDValue Result = DAG.getNode(ARMISD::VTST, dl, CmpVT, Op0, Op1);
+ if (!Invert)
+ Result = DAG.getNOT(dl, Result, VT);
+ return Result;
}
}
}
@@ -5962,31 +6259,20 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
if (ISD::isBuildVectorAllZeros(Op1.getNode()))
SingleOp = Op0;
else if (ISD::isBuildVectorAllZeros(Op0.getNode())) {
- if (Opc == ARMISD::VCGE)
- Opc = ARMISD::VCLEZ;
- else if (Opc == ARMISD::VCGT)
- Opc = ARMISD::VCLTZ;
+ if (Opc == ARMCC::GE)
+ Opc = ARMCC::LE;
+ else if (Opc == ARMCC::GT)
+ Opc = ARMCC::LT;
SingleOp = Op1;
}
SDValue Result;
if (SingleOp.getNode()) {
- switch (Opc) {
- case ARMISD::VCEQ:
- Result = DAG.getNode(ARMISD::VCEQZ, dl, CmpVT, SingleOp); break;
- case ARMISD::VCGE:
- Result = DAG.getNode(ARMISD::VCGEZ, dl, CmpVT, SingleOp); break;
- case ARMISD::VCLEZ:
- Result = DAG.getNode(ARMISD::VCLEZ, dl, CmpVT, SingleOp); break;
- case ARMISD::VCGT:
- Result = DAG.getNode(ARMISD::VCGTZ, dl, CmpVT, SingleOp); break;
- case ARMISD::VCLTZ:
- Result = DAG.getNode(ARMISD::VCLTZ, dl, CmpVT, SingleOp); break;
- default:
- Result = DAG.getNode(Opc, dl, CmpVT, Op0, Op1);
- }
+ Result = DAG.getNode(ARMISD::VCMPZ, dl, CmpVT, SingleOp,
+ DAG.getConstant(Opc, dl, MVT::i32));
} else {
- Result = DAG.getNode(Opc, dl, CmpVT, Op0, Op1);
+ Result = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
+ DAG.getConstant(Opc, dl, MVT::i32));
}
Result = DAG.getSExtOrTrunc(Result, dl, VT);
@@ -6027,13 +6313,13 @@ static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) {
CCR, Chain.getValue(1));
}
-/// isNEONModifiedImm - Check if the specified splat value corresponds to a
-/// valid vector constant for a NEON or MVE instruction with a "modified immediate"
-/// operand (e.g., VMOV). If so, return the encoded value.
-static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
+/// isVMOVModifiedImm - Check if the specified splat value corresponds to a
+/// valid vector constant for a NEON or MVE instruction with a "modified
+/// immediate" operand (e.g., VMOV). If so, return the encoded value.
+static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
unsigned SplatBitSize, SelectionDAG &DAG,
const SDLoc &dl, EVT &VT, bool is128Bits,
- NEONModImmType type) {
+ VMOVModImmType type) {
unsigned OpCmode, Imm;
// SplatBitSize is set to the smallest size that splats the vector, so a
@@ -6163,10 +6449,10 @@ static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
}
default:
- llvm_unreachable("unexpected size for isNEONModifiedImm");
+ llvm_unreachable("unexpected size for isVMOVModifiedImm");
}
- unsigned EncodedVal = ARM_AM::createNEONModImm(OpCmode, Imm);
+ unsigned EncodedVal = ARM_AM::createVMOVModImm(OpCmode, Imm);
return DAG.getTargetConstant(EncodedVal, dl, MVT::i32);
}
@@ -6246,7 +6532,7 @@ SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
return SDValue();
// Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too).
- SDValue NewVal = isNEONModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op),
+ SDValue NewVal = isVMOVModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op),
VMovVT, false, VMOVModImm);
if (NewVal != SDValue()) {
SDLoc DL(Op);
@@ -6263,7 +6549,7 @@ SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
}
// Finally, try a VMVN.i32
- NewVal = isNEONModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT,
+ NewVal = isVMOVModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT,
false, VMVNModImm);
if (NewVal != SDValue()) {
SDLoc DL(Op);
@@ -6649,6 +6935,29 @@ static bool isReverseMask(ArrayRef<int> M, EVT VT) {
return true;
}
+static bool isVMOVNMask(ArrayRef<int> M, EVT VT, bool Top) {
+ unsigned NumElts = VT.getVectorNumElements();
+ // Make sure the mask has the right size.
+ if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8))
+ return false;
+
+ // If Top
+ // Look for <0, N, 2, N+2, 4, N+4, ..>.
+ // This inserts Input2 into Input1
+ // else if not Top
+ // Look for <0, N+1, 2, N+3, 4, N+5, ..>
+ // This inserts Input1 into Input2
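+  // For example, with v8i16 (so N == 8) a matching mask would be
+  //   Top:  <0, 8, 2, 10, 4, 12, 6, 14>
+  //   !Top: <0, 9, 2, 11, 4, 13, 6, 15>
+  // with undef entries accepted in either position.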
+ unsigned Offset = Top ? 0 : 1;
+ for (unsigned i = 0; i < NumElts; i+=2) {
+ if (M[i] >= 0 && M[i] != (int)i)
+ return false;
+ if (M[i+1] >= 0 && M[i+1] != (int)(NumElts + i + Offset))
+ return false;
+ }
+
+ return true;
+}
+
// If N is an integer constant that can be moved into a register in one
// instruction, return an SDValue of such a constant (will become a MOV
// instruction). Otherwise return null.
@@ -6669,6 +6978,66 @@ static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG,
return SDValue();
}
+static SDValue LowerBUILD_VECTOR_i1(SDValue Op, SelectionDAG &DAG,
+ const ARMSubtarget *ST) {
+ SDLoc dl(Op);
+ EVT VT = Op.getValueType();
+
+ assert(ST->hasMVEIntegerOps() && "LowerBUILD_VECTOR_i1 called without MVE!");
+
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned BoolMask;
+ unsigned BitsPerBool;
+ if (NumElts == 4) {
+ BitsPerBool = 4;
+ BoolMask = 0xf;
+ } else if (NumElts == 8) {
+ BitsPerBool = 2;
+ BoolMask = 0x3;
+ } else if (NumElts == 16) {
+ BitsPerBool = 1;
+ BoolMask = 0x1;
+ } else
+ return SDValue();
+
+ // If this is a single value copied into all lanes (a splat), we can just sign
+ // extend that single value
+ SDValue FirstOp = Op.getOperand(0);
+ if (!isa<ConstantSDNode>(FirstOp) &&
+ std::all_of(std::next(Op->op_begin()), Op->op_end(),
+ [&FirstOp](SDUse &U) {
+ return U.get().isUndef() || U.get() == FirstOp;
+ })) {
+ SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32, FirstOp,
+ DAG.getValueType(MVT::i1));
+ return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), Ext);
+ }
+
+ // First create base with bits set where known
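+  // For example, a constant v4i1 of <1, 0, 1, 1> packs Bits32 as 0xff0f, each
+  // bool being replicated across its BitsPerBool predicate bits.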
+ unsigned Bits32 = 0;
+ for (unsigned i = 0; i < NumElts; ++i) {
+ SDValue V = Op.getOperand(i);
+ if (!isa<ConstantSDNode>(V) && !V.isUndef())
+ continue;
+ bool BitSet = V.isUndef() ? false : cast<ConstantSDNode>(V)->getZExtValue();
+ if (BitSet)
+ Bits32 |= BoolMask << (i * BitsPerBool);
+ }
+
+ // Add in unknown nodes
+ SDValue Base = DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT,
+ DAG.getConstant(Bits32, dl, MVT::i32));
+ for (unsigned i = 0; i < NumElts; ++i) {
+ SDValue V = Op.getOperand(i);
+ if (isa<ConstantSDNode>(V) || V.isUndef())
+ continue;
+ Base = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Base, V,
+ DAG.getConstant(i, dl, MVT::i32));
+ }
+
+ return Base;
+}
+
// If this is a case we can't handle, return null and let the default
// expansion code take care of it.
SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
@@ -6677,6 +7046,9 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
SDLoc dl(Op);
EVT VT = Op.getValueType();
+ if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
+ return LowerBUILD_VECTOR_i1(Op, DAG, ST);
+
APInt SplatBits, SplatUndef;
unsigned SplatBitSize;
bool HasAnyUndefs;
@@ -6688,7 +7060,7 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
(ST->hasMVEIntegerOps() && SplatBitSize <= 32)) {
// Check if an immediate VMOV works.
EVT VmovVT;
- SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(),
+ SDValue Val = isVMOVModifiedImm(SplatBits.getZExtValue(),
SplatUndef.getZExtValue(), SplatBitSize,
DAG, dl, VmovVT, VT.is128BitVector(),
VMOVModImm);
@@ -6700,7 +7072,7 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
// Try an immediate VMVN.
uint64_t NegatedImm = (~SplatBits).getZExtValue();
- Val = isNEONModifiedImm(
+ Val = isVMOVModifiedImm(
NegatedImm, SplatUndef.getZExtValue(), SplatBitSize,
DAG, dl, VmovVT, VT.is128BitVector(),
ST->hasMVEIntegerOps() ? MVEVMVNModImm : VMVNModImm);
@@ -7088,9 +7460,6 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
LaneMask[j] = ExtractBase + j;
}
- // Final check before we try to produce nonsense...
- if (!isShuffleMaskLegal(Mask, ShuffleVT))
- return SDValue();
// We can't handle more than two sources. This should have already
// been checked before this point.
@@ -7100,8 +7469,10 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
for (unsigned i = 0; i < Sources.size(); ++i)
ShuffleOps[i] = Sources[i].ShuffleVec;
- SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
- ShuffleOps[1], Mask);
+ SDValue Shuffle = buildLegalVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
+ ShuffleOps[1], Mask, DAG);
+ if (!Shuffle)
+ return SDValue();
return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
}
@@ -7168,6 +7539,7 @@ bool ARMTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
unsigned EltSize = VT.getScalarSizeInBits();
if (EltSize >= 32 ||
ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
+ ShuffleVectorInst::isIdentityMask(M) ||
isVREVMask(M, VT, 64) ||
isVREVMask(M, VT, 32) ||
isVREVMask(M, VT, 16))
@@ -7180,6 +7552,9 @@ bool ARMTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
else if (Subtarget->hasNEON() && (VT == MVT::v8i16 || VT == MVT::v16i8) &&
isReverseMask(M, VT))
return true;
+ else if (Subtarget->hasMVEIntegerOps() &&
+ (isVMOVNMask(M, VT, 0) || isVMOVNMask(M, VT, 1)))
+ return true;
else
return false;
}
@@ -7282,6 +7657,94 @@ static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op,
DAG.getConstant(ExtractNum, DL, MVT::i32));
}
+static EVT getVectorTyFromPredicateVector(EVT VT) {
+ switch (VT.getSimpleVT().SimpleTy) {
+ case MVT::v4i1:
+ return MVT::v4i32;
+ case MVT::v8i1:
+ return MVT::v8i16;
+ case MVT::v16i1:
+ return MVT::v16i8;
+ default:
+ llvm_unreachable("Unexpected vector predicate type");
+ }
+}
+
+static SDValue PromoteMVEPredVector(SDLoc dl, SDValue Pred, EVT VT,
+ SelectionDAG &DAG) {
+ // Converting from boolean predicates to integers involves creating a vector
+ // of all ones or all zeroes and selecting the lanes based upon the real
+ // predicate.
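+  // For example, a v4i1 predicate of <1, 0, 1, 0> becomes the v4i32 vector
+  // <-1, 0, -1, 0> after the select and bitcast below.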
+ SDValue AllOnes =
+ DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff), dl, MVT::i32);
+ AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllOnes);
+
+ SDValue AllZeroes =
+ DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0x0), dl, MVT::i32);
+ AllZeroes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllZeroes);
+
+ // Get full vector type from predicate type
+ EVT NewVT = getVectorTyFromPredicateVector(VT);
+
+ SDValue RecastV1;
+  // If the real predicate is a v8i1 or v4i1 (not v16i1) then we need to recast
+  // this to a v16i1. This cannot be done with an ordinary bitcast because the
+  // sizes are not the same. We have to use an MVE-specific PREDICATE_CAST node,
+ // since we know in hardware the sizes are really the same.
+ if (VT != MVT::v16i1)
+ RecastV1 = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Pred);
+ else
+ RecastV1 = Pred;
+
+ // Select either all ones or zeroes depending upon the real predicate bits.
+ SDValue PredAsVector =
+ DAG.getNode(ISD::VSELECT, dl, MVT::v16i8, RecastV1, AllOnes, AllZeroes);
+
+ // Recast our new predicate-as-integer v16i8 vector into something
+ // appropriate for the shuffle, i.e. v4i32 for a real v4i1 predicate.
+ return DAG.getNode(ISD::BITCAST, dl, NewVT, PredAsVector);
+}
+
+static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG,
+ const ARMSubtarget *ST) {
+ EVT VT = Op.getValueType();
+ ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
+ ArrayRef<int> ShuffleMask = SVN->getMask();
+
+ assert(ST->hasMVEIntegerOps() &&
+ "No support for vector shuffle of boolean predicates");
+
+ SDValue V1 = Op.getOperand(0);
+ SDLoc dl(Op);
+ if (isReverseMask(ShuffleMask, VT)) {
+ SDValue cast = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, V1);
+ SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, cast);
+ SDValue srl = DAG.getNode(ISD::SRL, dl, MVT::i32, rbit,
+ DAG.getConstant(16, dl, MVT::i32));
+ return DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, srl);
+ }
+
+ // Until we can come up with optimised cases for every single vector
+ // shuffle in existence we have chosen the least painful strategy. This is
+  // to essentially promote the boolean predicate to an 8-bit integer, where
+ // each predicate represents a byte. Then we fall back on a normal integer
+ // vector shuffle and convert the result back into a predicate vector. In
+ // many cases the generated code might be even better than scalar code
+ // operating on bits. Just imagine trying to shuffle 8 arbitrary 2-bit
+ // fields in a register into 8 other arbitrary 2-bit fields!
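+  // As an illustration, a v8i1 shuffle is performed as a v8i16 shuffle of
+  // 0/0xffff lanes, and the result is compared against zero to recover a v8i1.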
+ SDValue PredAsVector = PromoteMVEPredVector(dl, V1, VT, DAG);
+ EVT NewVT = PredAsVector.getValueType();
+
+ // Do the shuffle!
+ SDValue Shuffled = DAG.getVectorShuffle(NewVT, dl, PredAsVector,
+ DAG.getUNDEF(NewVT), ShuffleMask);
+
+ // Now return the result of comparing the shuffled vector with zero,
+ // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1.
+ return DAG.getNode(ARMISD::VCMPZ, dl, VT, Shuffled,
+ DAG.getConstant(ARMCC::NE, dl, MVT::i32));
+}
+
static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
const ARMSubtarget *ST) {
SDValue V1 = Op.getOperand(0);
@@ -7289,6 +7752,10 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
SDLoc dl(Op);
EVT VT = Op.getValueType();
ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
+ unsigned EltSize = VT.getScalarSizeInBits();
+
+ if (ST->hasMVEIntegerOps() && EltSize == 1)
+ return LowerVECTOR_SHUFFLE_i1(Op, DAG, ST);
// Convert shuffles that are directly supported on NEON to target-specific
// DAG nodes, instead of keeping them as shuffles and matching them again
@@ -7298,7 +7765,6 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
  // of the same type so that they get CSEd properly.
ArrayRef<int> ShuffleMask = SVN->getMask();
- unsigned EltSize = VT.getScalarSizeInBits();
if (EltSize <= 32) {
if (SVN->isSplat()) {
int Lane = SVN->getSplatIndex();
@@ -7364,6 +7830,14 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
.getValue(WhichResult);
}
}
+ if (ST->hasMVEIntegerOps()) {
+ if (isVMOVNMask(ShuffleMask, VT, 0))
+ return DAG.getNode(ARMISD::VMOVN, dl, VT, V2, V1,
+ DAG.getConstant(0, dl, MVT::i32));
+ if (isVMOVNMask(ShuffleMask, VT, 1))
+ return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V2,
+ DAG.getConstant(1, dl, MVT::i32));
+ }
// Also check for these shuffles through CONCAT_VECTORS: we canonicalize
// shuffles that produce a result larger than their operands with:
@@ -7468,8 +7942,29 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
return SDValue();
}
-SDValue ARMTargetLowering::
-LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerINSERT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG,
+ const ARMSubtarget *ST) {
+ EVT VecVT = Op.getOperand(0).getValueType();
+ SDLoc dl(Op);
+
+ assert(ST->hasMVEIntegerOps() &&
+ "LowerINSERT_VECTOR_ELT_i1 called without MVE!");
+
+ SDValue Conv =
+ DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));
+ unsigned Lane = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+ unsigned LaneWidth =
+ getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8;
+ unsigned Mask = ((1 << LaneWidth) - 1) << Lane * LaneWidth;
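+  // For example, inserting into lane 2 of a v4i1 gives LaneWidth == 4 and
+  // Mask == 0xf00, so the BFI below updates predicate bits [11:8].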
+ SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32,
+ Op.getOperand(1), DAG.getValueType(MVT::i1));
+ SDValue BFI = DAG.getNode(ARMISD::BFI, dl, MVT::i32, Conv, Ext,
+ DAG.getConstant(~Mask, dl, MVT::i32));
+ return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), BFI);
+}
+
+SDValue ARMTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
+ SelectionDAG &DAG) const {
// INSERT_VECTOR_ELT is legal only for immediate indexes.
SDValue Lane = Op.getOperand(2);
if (!isa<ConstantSDNode>(Lane))
@@ -7477,6 +7972,11 @@ LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
SDValue Elt = Op.getOperand(1);
EVT EltVT = Elt.getValueType();
+
+ if (Subtarget->hasMVEIntegerOps() &&
+ Op.getValueType().getScalarSizeInBits() == 1)
+ return LowerINSERT_VECTOR_ELT_i1(Op, DAG, Subtarget);
+
if (getTypeAction(*DAG.getContext(), EltVT) ==
TargetLowering::TypePromoteFloat) {
// INSERT_VECTOR_ELT doesn't want f16 operands promoting to f32,
@@ -7505,13 +8005,37 @@ LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
return Op;
}
-static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
+static SDValue LowerEXTRACT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG,
+ const ARMSubtarget *ST) {
+ EVT VecVT = Op.getOperand(0).getValueType();
+ SDLoc dl(Op);
+
+ assert(ST->hasMVEIntegerOps() &&
+ "LowerINSERT_VECTOR_ELT_i1 called without MVE!");
+
+ SDValue Conv =
+ DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));
+ unsigned Lane = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+ unsigned LaneWidth =
+ getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8;
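+  // For example, extracting lane 1 of a v8i1 gives LaneWidth == 2, so the
+  // predicate word is shifted right by 2 and the bool lands in the low bits.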
+ SDValue Shift = DAG.getNode(ISD::SRL, dl, MVT::i32, Conv,
+ DAG.getConstant(Lane * LaneWidth, dl, MVT::i32));
+ return Shift;
+}
+
+static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG,
+ const ARMSubtarget *ST) {
// EXTRACT_VECTOR_ELT is legal only for immediate indexes.
SDValue Lane = Op.getOperand(1);
if (!isa<ConstantSDNode>(Lane))
return SDValue();
SDValue Vec = Op.getOperand(0);
+ EVT VT = Vec.getValueType();
+
+ if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
+ return LowerEXTRACT_VECTOR_ELT_i1(Op, DAG, ST);
+
if (Op.getValueType() == MVT::i32 && Vec.getScalarValueSizeInBits() < 32) {
SDLoc dl(Op);
return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane);
@@ -7520,7 +8044,64 @@ static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
return Op;
}
-static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
+static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG,
+ const ARMSubtarget *ST) {
+ SDValue V1 = Op.getOperand(0);
+ SDValue V2 = Op.getOperand(1);
+ SDLoc dl(Op);
+ EVT VT = Op.getValueType();
+ EVT Op1VT = V1.getValueType();
+ EVT Op2VT = V2.getValueType();
+ unsigned NumElts = VT.getVectorNumElements();
+
+ assert(Op1VT == Op2VT && "Operand types don't match!");
+ assert(VT.getScalarSizeInBits() == 1 &&
+ "Unexpected custom CONCAT_VECTORS lowering");
+ assert(ST->hasMVEIntegerOps() &&
+ "CONCAT_VECTORS lowering only supported for MVE");
+
+ SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG);
+ SDValue NewV2 = PromoteMVEPredVector(dl, V2, Op2VT, DAG);
+
+ // We now have Op1 + Op2 promoted to vectors of integers, where v8i1 gets
+ // promoted to v8i16, etc.
+
+ MVT ElType = getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT();
+
+ // Extract the vector elements from Op1 and Op2 one by one and truncate them
+ // to be the right size for the destination. For example, if Op1 is v4i1 then
+  // the promoted vector is v4i32. The result of concatenation gives a v8i1,
+ // which when promoted is v8i16. That means each i32 element from Op1 needs
+ // truncating to i16 and inserting in the result.
+ EVT ConcatVT = MVT::getVectorVT(ElType, NumElts);
+ SDValue ConVec = DAG.getNode(ISD::UNDEF, dl, ConcatVT);
+ auto ExractInto = [&DAG, &dl](SDValue NewV, SDValue ConVec, unsigned &j) {
+ EVT NewVT = NewV.getValueType();
+ EVT ConcatVT = ConVec.getValueType();
+ for (unsigned i = 0, e = NewVT.getVectorNumElements(); i < e; i++, j++) {
+ SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV,
+ DAG.getIntPtrConstant(i, dl));
+ ConVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ConcatVT, ConVec, Elt,
+ DAG.getConstant(j, dl, MVT::i32));
+ }
+ return ConVec;
+ };
+ unsigned j = 0;
+  ConVec = ExtractInto(NewV1, ConVec, j);
+  ConVec = ExtractInto(NewV2, ConVec, j);
+
+  // Now return the result of comparing the concatenated vector with zero,
+ // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1.
+ return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec,
+ DAG.getConstant(ARMCC::NE, dl, MVT::i32));
+}
+
+static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
+ const ARMSubtarget *ST) {
+ EVT VT = Op->getValueType(0);
+ if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
+ return LowerCONCAT_VECTORS_i1(Op, DAG, ST);
+
// The only time a CONCAT_VECTORS operation can have legal types is when
// two 64-bit vectors are concatenated to a 128-bit vector.
assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 &&
@@ -7540,6 +8121,43 @@ static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val);
}
+static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG,
+ const ARMSubtarget *ST) {
+ SDValue V1 = Op.getOperand(0);
+ SDValue V2 = Op.getOperand(1);
+ SDLoc dl(Op);
+ EVT VT = Op.getValueType();
+ EVT Op1VT = V1.getValueType();
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned Index = cast<ConstantSDNode>(V2)->getZExtValue();
+
+ assert(VT.getScalarSizeInBits() == 1 &&
+ "Unexpected custom EXTRACT_SUBVECTOR lowering");
+ assert(ST->hasMVEIntegerOps() &&
+ "EXTRACT_SUBVECTOR lowering only supported for MVE");
+
+ SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG);
+
+ // We now have Op1 promoted to a vector of integers, where v8i1 gets
+ // promoted to v8i16, etc.
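+  // For example, extracting the high v4i1 half of a v8i1 (Index == 4) pulls
+  // lanes 4..7 out of the promoted v8i16, rebuilds them as a v4i32 and then
+  // compares that against zero below to produce the v4i1 result.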
+
+ MVT ElType = getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT();
+
+ EVT SubVT = MVT::getVectorVT(ElType, NumElts);
+ SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT);
+ for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j++) {
+ SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1,
+ DAG.getIntPtrConstant(i, dl));
+ SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
+ DAG.getConstant(j, dl, MVT::i32));
+ }
+
+ // Now return the result of comparing the subvector with zero,
+ // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1.
+ return DAG.getNode(ARMISD::VCMPZ, dl, VT, SubVec,
+ DAG.getConstant(ARMCC::NE, dl, MVT::i32));
+}
+
/// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each
/// element has been zero/sign-extended, depending on the isSigned parameter,
/// from an integer type half its size.
@@ -7897,7 +8515,8 @@ static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl,
return N0;
}
-static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) {
+static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG,
+ const ARMSubtarget *ST) {
EVT VT = Op.getValueType();
assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
"unexpected type for custom-lowering ISD::SDIV");
@@ -7924,7 +8543,7 @@ static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) {
N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16
N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
- N0 = LowerCONCAT_VECTORS(N0, DAG);
+ N0 = LowerCONCAT_VECTORS(N0, DAG, ST);
N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0);
return N0;
@@ -7932,7 +8551,8 @@ static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) {
return LowerSDIV_v4i16(N0, N1, dl, DAG);
}
-static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) {
+static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG,
+ const ARMSubtarget *ST) {
// TODO: Should this propagate fast-math-flags?
EVT VT = Op.getValueType();
assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
@@ -7960,7 +8580,7 @@ static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) {
N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16
N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
- N0 = LowerCONCAT_VECTORS(N0, DAG);
+ N0 = LowerCONCAT_VECTORS(N0, DAG, ST);
N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8,
DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, dl,
@@ -8255,6 +8875,96 @@ void ARMTargetLowering::ExpandDIV_Windows(
Results.push_back(Upper);
}
+static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG) {
+ LoadSDNode *LD = cast<LoadSDNode>(Op.getNode());
+ EVT MemVT = LD->getMemoryVT();
+ assert((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || MemVT == MVT::v16i1) &&
+ "Expected a predicate type!");
+ assert(MemVT == Op.getValueType());
+ assert(LD->getExtensionType() == ISD::NON_EXTLOAD &&
+ "Expected a non-extending load");
+  assert(LD->isUnindexed() && "Expected an unindexed load");
+
+  // The basic MVE VLDR on a v4i1/v8i1 actually loads the entire 16-bit
+ // predicate, with the "v4i1" bits spread out over the 16 bits loaded. We
+ // need to make sure that 8/4 bits are actually loaded into the correct
+ // place, which means loading the value and then shuffling the values into
+ // the bottom bits of the predicate.
+  // Equally, VLDR for a v16i1 will actually load 32 bits (so will be incorrect
+ // for BE).
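+  // In short: reload the stored predicate bits with a scalar extending load
+  // into an i32, PREDICATE_CAST that to a v16i1 and, for v4i1/v8i1, extract
+  // the low subvector.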
+
+ SDLoc dl(Op);
+ SDValue Load = DAG.getExtLoad(
+ ISD::EXTLOAD, dl, MVT::i32, LD->getChain(), LD->getBasePtr(),
+ EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()),
+ LD->getMemOperand());
+ SDValue Pred = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Load);
+ if (MemVT != MVT::v16i1)
+ Pred = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MemVT, Pred,
+ DAG.getConstant(0, dl, MVT::i32));
+ return DAG.getMergeValues({Pred, Load.getValue(1)}, dl);
+}
+
+static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG) {
+ StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
+ EVT MemVT = ST->getMemoryVT();
+ assert((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || MemVT == MVT::v16i1) &&
+ "Expected a predicate type!");
+ assert(MemVT == ST->getValue().getValueType());
+  assert(!ST->isTruncatingStore() && "Expected a non-truncating store");
+  assert(ST->isUnindexed() && "Expected an unindexed store");
+
+ // Only store the v4i1 or v8i1 worth of bits, via a buildvector with top bits
+ // unset and a scalar store.
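+  // For example, a v4i1 store widens the value to a v16i1 build_vector (upper
+  // lanes undef), casts it to an i32 with PREDICATE_CAST and emits a
+  // truncating scalar store of the low bits.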
+ SDLoc dl(Op);
+ SDValue Build = ST->getValue();
+ if (MemVT != MVT::v16i1) {
+ SmallVector<SDValue, 16> Ops;
+ for (unsigned I = 0; I < MemVT.getVectorNumElements(); I++)
+ Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Build,
+ DAG.getConstant(I, dl, MVT::i32)));
+ for (unsigned I = MemVT.getVectorNumElements(); I < 16; I++)
+ Ops.push_back(DAG.getUNDEF(MVT::i32));
+ Build = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i1, Ops);
+ }
+ SDValue GRP = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Build);
+ return DAG.getTruncStore(
+ ST->getChain(), dl, GRP, ST->getBasePtr(),
+ EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()),
+ ST->getMemOperand());
+}
+
+static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) {
+ MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
+ MVT VT = Op.getSimpleValueType();
+ SDValue Mask = N->getMask();
+ SDValue PassThru = N->getPassThru();
+ SDLoc dl(Op);
+
+ auto IsZero = [](SDValue PassThru) {
+ return (ISD::isBuildVectorAllZeros(PassThru.getNode()) ||
+ (PassThru->getOpcode() == ARMISD::VMOVIMM &&
+ isNullConstant(PassThru->getOperand(0))));
+ };
+
+ if (IsZero(PassThru))
+ return Op;
+
+ // MVE Masked loads use zero as the passthru value. Here we convert undef to
+ // zero too, and other values are lowered to a select.
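+  // That is, masked.load(ptr, mask, passthru) becomes
+  //   select(mask, masked.load(ptr, mask, zeroes), passthru)
+  // unless the passthru is already zeroes (or undef, which is treated as
+  // zero).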
+ SDValue ZeroVec = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
+ DAG.getTargetConstant(0, dl, MVT::i32));
+ SDValue NewLoad = DAG.getMaskedLoad(
+ VT, dl, N->getChain(), N->getBasePtr(), Mask, ZeroVec, N->getMemoryVT(),
+ N->getMemOperand(), N->getExtensionType(), N->isExpandingLoad());
+ SDValue Combo = NewLoad;
+ if (!PassThru.isUndef() &&
+ (PassThru.getOpcode() != ISD::BITCAST ||
+ !IsZero(PassThru->getOperand(0))))
+ Combo = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
+ return DAG.getMergeValues({Combo, NewLoad.getValue(1)}, dl);
+}
+
static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getOrdering()))
// Acquire/Release load/store is not legal for targets without a dmb or
@@ -8273,12 +8983,12 @@ static void ReplaceREADCYCLECOUNTER(SDNode *N,
// Under Power Management extensions, the cycle-count is:
// mrc p15, #0, <Rt>, c9, c13, #0
SDValue Ops[] = { N->getOperand(0), // Chain
- DAG.getConstant(Intrinsic::arm_mrc, DL, MVT::i32),
- DAG.getConstant(15, DL, MVT::i32),
- DAG.getConstant(0, DL, MVT::i32),
- DAG.getConstant(9, DL, MVT::i32),
- DAG.getConstant(13, DL, MVT::i32),
- DAG.getConstant(0, DL, MVT::i32)
+ DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32),
+ DAG.getTargetConstant(15, DL, MVT::i32),
+ DAG.getTargetConstant(0, DL, MVT::i32),
+ DAG.getTargetConstant(9, DL, MVT::i32),
+ DAG.getTargetConstant(13, DL, MVT::i32),
+ DAG.getTargetConstant(0, DL, MVT::i32)
};
SDValue Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
@@ -8412,6 +9122,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG);
case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG);
case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
+ case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG, Subtarget);
case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG,
Subtarget);
case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG, Subtarget);
@@ -8426,24 +9137,25 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::CTTZ:
case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget);
case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget);
- case ISD::SETCC: return LowerVSETCC(Op, DAG);
+ case ISD::SETCC: return LowerVSETCC(Op, DAG, Subtarget);
case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget);
case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget);
case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG, Subtarget);
+ case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG, Subtarget);
case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
- case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
- case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
+ case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG, Subtarget);
+ case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG, Subtarget);
case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
case ISD::MUL: return LowerMUL(Op, DAG);
case ISD::SDIV:
if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
return LowerDIV_Windows(Op, DAG, /* Signed */ true);
- return LowerSDIV(Op, DAG);
+ return LowerSDIV(Op, DAG, Subtarget);
case ISD::UDIV:
if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
return LowerDIV_Windows(Op, DAG, /* Signed */ false);
- return LowerUDIV(Op, DAG);
+ return LowerUDIV(Op, DAG, Subtarget);
case ISD::ADDCARRY:
case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);
case ISD::SADDO:
@@ -8452,6 +9164,15 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::UADDO:
case ISD::USUBO:
return LowerUnsignedALUO(Op, DAG);
+ case ISD::SADDSAT:
+ case ISD::SSUBSAT:
+ return LowerSADDSUBSAT(Op, DAG, Subtarget);
+ case ISD::LOAD:
+ return LowerPredicateLoad(Op, DAG);
+ case ISD::STORE:
+ return LowerPredicateStore(Op, DAG);
+ case ISD::MLOAD:
+ return LowerMLOAD(Op, DAG);
case ISD::ATOMIC_LOAD:
case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG);
case ISD::FSINCOS: return LowerFSINCOS(Op, DAG);
@@ -8530,6 +9251,10 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
Results.push_back(Res.getValue(0));
Results.push_back(Res.getValue(1));
return;
+ case ISD::SADDSAT:
+ case ISD::SSUBSAT:
+ Res = LowerSADDSUBSAT(SDValue(N, 0), DAG, Subtarget);
+ break;
case ISD::READCYCLECOUNTER:
ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget);
return;
@@ -8600,19 +9325,19 @@ void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
// orr r5, r5, #1
// add r5, pc
// str r5, [$jbuf, #+4] ; &jbuf[1]
- unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
+ Register NewVReg1 = MRI->createVirtualRegister(TRC);
BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1)
.addConstantPoolIndex(CPI)
.addMemOperand(CPMMO)
.add(predOps(ARMCC::AL));
// Set the low bit because of thumb mode.
- unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
+ Register NewVReg2 = MRI->createVirtualRegister(TRC);
BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2)
.addReg(NewVReg1, RegState::Kill)
.addImm(0x01)
.add(predOps(ARMCC::AL))
.add(condCodeOp());
- unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
+ Register NewVReg3 = MRI->createVirtualRegister(TRC);
BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3)
.addReg(NewVReg2, RegState::Kill)
.addImm(PCLabelId);
@@ -8630,28 +9355,28 @@ void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
// orrs r1, r2
// add r2, $jbuf, #+4 ; &jbuf[1]
// str r1, [r2]
- unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
+ Register NewVReg1 = MRI->createVirtualRegister(TRC);
BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1)
.addConstantPoolIndex(CPI)
.addMemOperand(CPMMO)
.add(predOps(ARMCC::AL));
- unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
+ Register NewVReg2 = MRI->createVirtualRegister(TRC);
BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2)
.addReg(NewVReg1, RegState::Kill)
.addImm(PCLabelId);
// Set the low bit because of thumb mode.
- unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
+ Register NewVReg3 = MRI->createVirtualRegister(TRC);
BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3)
.addReg(ARM::CPSR, RegState::Define)
.addImm(1)
.add(predOps(ARMCC::AL));
- unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
+ Register NewVReg4 = MRI->createVirtualRegister(TRC);
BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4)
.addReg(ARM::CPSR, RegState::Define)
.addReg(NewVReg2, RegState::Kill)
.addReg(NewVReg3, RegState::Kill)
.add(predOps(ARMCC::AL));
- unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
+ Register NewVReg5 = MRI->createVirtualRegister(TRC);
BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5)
.addFrameIndex(FI)
.addImm(36); // &jbuf[1] :: pc
@@ -8666,13 +9391,13 @@ void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
// ldr r1, LCPI1_1
// add r1, pc, r1
// str r1, [$jbuf, #+4] ; &jbuf[1]
- unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
+ Register NewVReg1 = MRI->createVirtualRegister(TRC);
BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1)
.addConstantPoolIndex(CPI)
.addImm(0)
.addMemOperand(CPMMO)
.add(predOps(ARMCC::AL));
- unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
+ Register NewVReg2 = MRI->createVirtualRegister(TRC);
BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2)
.addReg(NewVReg1, RegState::Kill)
.addImm(PCLabelId)
@@ -8794,7 +9519,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
bool IsPositionIndependent = isPositionIndependent();
unsigned NumLPads = LPadList.size();
if (Subtarget->isThumb2()) {
- unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
+ Register NewVReg1 = MRI->createVirtualRegister(TRC);
BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1)
.addFrameIndex(FI)
.addImm(4)
@@ -8807,7 +9532,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
.addImm(LPadList.size())
.add(predOps(ARMCC::AL));
} else {
- unsigned VReg1 = MRI->createVirtualRegister(TRC);
+ Register VReg1 = MRI->createVirtualRegister(TRC);
BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1)
.addImm(NumLPads & 0xFFFF)
.add(predOps(ARMCC::AL));
@@ -8832,12 +9557,12 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
.addImm(ARMCC::HI)
.addReg(ARM::CPSR);
- unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
+ Register NewVReg3 = MRI->createVirtualRegister(TRC);
BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT), NewVReg3)
.addJumpTableIndex(MJTI)
.add(predOps(ARMCC::AL));
- unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
+ Register NewVReg4 = MRI->createVirtualRegister(TRC);
BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4)
.addReg(NewVReg3, RegState::Kill)
.addReg(NewVReg1)
@@ -8850,7 +9575,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
.addReg(NewVReg1)
.addJumpTableIndex(MJTI);
} else if (Subtarget->isThumb()) {
- unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
+ Register NewVReg1 = MRI->createVirtualRegister(TRC);
BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1)
.addFrameIndex(FI)
.addImm(1)
@@ -8873,7 +9598,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
Align = MF->getDataLayout().getTypeAllocSize(C->getType());
unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
- unsigned VReg1 = MRI->createVirtualRegister(TRC);
+ Register VReg1 = MRI->createVirtualRegister(TRC);
BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci))
.addReg(VReg1, RegState::Define)
.addConstantPoolIndex(Idx)
@@ -8889,19 +9614,19 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
.addImm(ARMCC::HI)
.addReg(ARM::CPSR);
- unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
+ Register NewVReg2 = MRI->createVirtualRegister(TRC);
BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2)
.addReg(ARM::CPSR, RegState::Define)
.addReg(NewVReg1)
.addImm(2)
.add(predOps(ARMCC::AL));
- unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
+ Register NewVReg3 = MRI->createVirtualRegister(TRC);
BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3)
.addJumpTableIndex(MJTI)
.add(predOps(ARMCC::AL));
- unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
+ Register NewVReg4 = MRI->createVirtualRegister(TRC);
BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4)
.addReg(ARM::CPSR, RegState::Define)
.addReg(NewVReg2, RegState::Kill)
@@ -8911,7 +9636,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
MachineMemOperand *JTMMOLd = MF->getMachineMemOperand(
MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4);
- unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
+ Register NewVReg5 = MRI->createVirtualRegister(TRC);
BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5)
.addReg(NewVReg4, RegState::Kill)
.addImm(0)
@@ -8932,7 +9657,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
.addReg(NewVReg6, RegState::Kill)
.addJumpTableIndex(MJTI);
} else {
- unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
+ Register NewVReg1 = MRI->createVirtualRegister(TRC);
BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1)
.addFrameIndex(FI)
.addImm(4)
@@ -8945,7 +9670,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
.addImm(NumLPads)
.add(predOps(ARMCC::AL));
} else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) {
- unsigned VReg1 = MRI->createVirtualRegister(TRC);
+ Register VReg1 = MRI->createVirtualRegister(TRC);
BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1)
.addImm(NumLPads & 0xFFFF)
.add(predOps(ARMCC::AL));
@@ -8974,7 +9699,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
Align = MF->getDataLayout().getTypeAllocSize(C->getType());
unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
- unsigned VReg1 = MRI->createVirtualRegister(TRC);
+ Register VReg1 = MRI->createVirtualRegister(TRC);
BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp))
.addReg(VReg1, RegState::Define)
.addConstantPoolIndex(Idx)
@@ -8991,20 +9716,20 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
.addImm(ARMCC::HI)
.addReg(ARM::CPSR);
- unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
+ Register NewVReg3 = MRI->createVirtualRegister(TRC);
BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3)
.addReg(NewVReg1)
.addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))
.add(predOps(ARMCC::AL))
.add(condCodeOp());
- unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
+ Register NewVReg4 = MRI->createVirtualRegister(TRC);
BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4)
.addJumpTableIndex(MJTI)
.add(predOps(ARMCC::AL));
MachineMemOperand *JTMMOLd = MF->getMachineMemOperand(
MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4);
- unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
+ Register NewVReg5 = MRI->createVirtualRegister(TRC);
BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5)
.addReg(NewVReg3, RegState::Kill)
.addReg(NewVReg4)
@@ -9239,8 +9964,8 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI,
const BasicBlock *LLVM_BB = BB->getBasicBlock();
MachineFunction::iterator It = ++BB->getIterator();
- unsigned dest = MI.getOperand(0).getReg();
- unsigned src = MI.getOperand(1).getReg();
+ Register dest = MI.getOperand(0).getReg();
+ Register src = MI.getOperand(1).getReg();
unsigned SizeVal = MI.getOperand(2).getImm();
unsigned Align = MI.getOperand(3).getImm();
DebugLoc dl = MI.getDebugLoc();
@@ -9291,9 +10016,9 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI,
unsigned srcIn = src;
unsigned destIn = dest;
for (unsigned i = 0; i < LoopSize; i+=UnitSize) {
- unsigned srcOut = MRI.createVirtualRegister(TRC);
- unsigned destOut = MRI.createVirtualRegister(TRC);
- unsigned scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
+ Register srcOut = MRI.createVirtualRegister(TRC);
+ Register destOut = MRI.createVirtualRegister(TRC);
+ Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut,
IsThumb1, IsThumb2);
emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut,
@@ -9306,9 +10031,9 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI,
// [scratch, srcOut] = LDRB_POST(srcIn, 1)
// [destOut] = STRB_POST(scratch, destIn, 1)
for (unsigned i = 0; i < BytesLeft; i++) {
- unsigned srcOut = MRI.createVirtualRegister(TRC);
- unsigned destOut = MRI.createVirtualRegister(TRC);
- unsigned scratch = MRI.createVirtualRegister(TRC);
+ Register srcOut = MRI.createVirtualRegister(TRC);
+ Register destOut = MRI.createVirtualRegister(TRC);
+ Register scratch = MRI.createVirtualRegister(TRC);
emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut,
IsThumb1, IsThumb2);
emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut,
@@ -9351,7 +10076,7 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI,
exitMBB->transferSuccessorsAndUpdatePHIs(BB);
// Load an immediate to varEnd.
- unsigned varEnd = MRI.createVirtualRegister(TRC);
+ Register varEnd = MRI.createVirtualRegister(TRC);
if (Subtarget->useMovt()) {
unsigned Vtmp = varEnd;
if ((LoopSize & 0xFFFF0000) != 0)
@@ -9401,12 +10126,12 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI,
// destPhi = PHI(destLoop, dst)
MachineBasicBlock *entryBB = BB;
BB = loopMBB;
- unsigned varLoop = MRI.createVirtualRegister(TRC);
- unsigned varPhi = MRI.createVirtualRegister(TRC);
- unsigned srcLoop = MRI.createVirtualRegister(TRC);
- unsigned srcPhi = MRI.createVirtualRegister(TRC);
- unsigned destLoop = MRI.createVirtualRegister(TRC);
- unsigned destPhi = MRI.createVirtualRegister(TRC);
+ Register varLoop = MRI.createVirtualRegister(TRC);
+ Register varPhi = MRI.createVirtualRegister(TRC);
+ Register srcLoop = MRI.createVirtualRegister(TRC);
+ Register srcPhi = MRI.createVirtualRegister(TRC);
+ Register destLoop = MRI.createVirtualRegister(TRC);
+ Register destPhi = MRI.createVirtualRegister(TRC);
BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi)
.addReg(varLoop).addMBB(loopMBB)
@@ -9420,7 +10145,7 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI,
// [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
// [destLoop] = STR_POST(scratch, destPhi, UnitSiz)
- unsigned scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
+ Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop,
IsThumb1, IsThumb2);
emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop,
@@ -9461,9 +10186,9 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI,
unsigned srcIn = srcLoop;
unsigned destIn = destLoop;
for (unsigned i = 0; i < BytesLeft; i++) {
- unsigned srcOut = MRI.createVirtualRegister(TRC);
- unsigned destOut = MRI.createVirtualRegister(TRC);
- unsigned scratch = MRI.createVirtualRegister(TRC);
+ Register srcOut = MRI.createVirtualRegister(TRC);
+ Register destOut = MRI.createVirtualRegister(TRC);
+ Register scratch = MRI.createVirtualRegister(TRC);
emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut,
IsThumb1, IsThumb2);
emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut,
@@ -9523,7 +10248,7 @@ ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI,
break;
case CodeModel::Large: {
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
- unsigned Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
+ Register Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg)
.addExternalSymbol("__chkstk");
@@ -9771,8 +10496,8 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
// equality.
bool RHSisZero = MI.getOpcode() == ARM::BCCZi64;
- unsigned LHS1 = MI.getOperand(1).getReg();
- unsigned LHS2 = MI.getOperand(2).getReg();
+ Register LHS1 = MI.getOperand(1).getReg();
+ Register LHS2 = MI.getOperand(2).getReg();
if (RHSisZero) {
BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
.addReg(LHS1)
@@ -9782,8 +10507,8 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
.addReg(LHS2).addImm(0)
.addImm(ARMCC::EQ).addReg(ARM::CPSR);
} else {
- unsigned RHS1 = MI.getOperand(3).getReg();
- unsigned RHS2 = MI.getOperand(4).getReg();
+ Register RHS1 = MI.getOperand(3).getReg();
+ Register RHS2 = MI.getOperand(4).getReg();
BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
.addReg(LHS1)
.addReg(RHS1)
@@ -9844,15 +10569,15 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
Fn->insert(BBI, RSBBB);
Fn->insert(BBI, SinkBB);
- unsigned int ABSSrcReg = MI.getOperand(1).getReg();
- unsigned int ABSDstReg = MI.getOperand(0).getReg();
+ Register ABSSrcReg = MI.getOperand(1).getReg();
+ Register ABSDstReg = MI.getOperand(0).getReg();
bool ABSSrcKIll = MI.getOperand(1).isKill();
bool isThumb2 = Subtarget->isThumb2();
MachineRegisterInfo &MRI = Fn->getRegInfo();
// In Thumb mode S must not be specified if source register is the SP or
// PC and if destination register is the SP, so restrict register class
- unsigned NewRsbDstReg =
- MRI.createVirtualRegister(isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass);
+ Register NewRsbDstReg = MRI.createVirtualRegister(
+ isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass);
// Transfer the remainder of BB and its successor edges to sinkMBB.
SinkBB->splice(SinkBB->begin(), BB,
@@ -9931,7 +10656,7 @@ static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget,
// The MEMCPY both defines and kills the scratch registers.
for (unsigned I = 0; I != MI.getOperand(4).getImm(); ++I) {
- unsigned TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass
+ Register TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass
: &ARM::GPRRegClass);
MIB.addReg(TmpReg, RegState::Define|RegState::Dead);
}
@@ -10369,10 +11094,7 @@ static SDValue findMUL_LOHI(SDValue V) {
static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode,
TargetLowering::DAGCombinerInfo &DCI,
const ARMSubtarget *Subtarget) {
- if (Subtarget->isThumb()) {
- if (!Subtarget->hasDSP())
- return SDValue();
- } else if (!Subtarget->hasV5TEOps())
+ if (!Subtarget->hasBaseDSP())
return SDValue();
// SMLALBB, SMLALBT, SMLALTB, SMLALTT multiply two 16-bit values and
@@ -11253,7 +11975,7 @@ static SDValue PerformANDCombine(SDNode *N,
BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
if (SplatBitSize <= 64) {
EVT VbicVT;
- SDValue Val = isNEONModifiedImm((~SplatBits).getZExtValue(),
+ SDValue Val = isVMOVModifiedImm((~SplatBits).getZExtValue(),
SplatUndef.getZExtValue(), SplatBitSize,
DAG, dl, VbicVT, VT.is128BitVector(),
OtherModImm);
@@ -11469,6 +12191,77 @@ static SDValue PerformORCombineToBFI(SDNode *N,
return SDValue();
}
+static bool isValidMVECond(unsigned CC, bool IsFloat) {
+ switch (CC) {
+ case ARMCC::EQ:
+ case ARMCC::NE:
+ case ARMCC::LE:
+ case ARMCC::GT:
+ case ARMCC::GE:
+ case ARMCC::LT:
+ return true;
+ case ARMCC::HS:
+ case ARMCC::HI:
+ return !IsFloat;
+ default:
+ return false;
+ };
+}
+
+static SDValue PerformORCombine_i1(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const ARMSubtarget *Subtarget) {
+ // Try to invert "or A, B" -> "and ~A, ~B", as the "and" is easier to chain
+ // together with predicates
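+  // That is, or(VCMP(a, b, cc0), VCMP(c, d, cc1)) is rewritten, by De Morgan,
+  // as xor(and(VCMP(a, b, !cc0), VCMP(c, d, !cc1)), all-ones), where !cc is
+  // the opposite condition; VCMPZ compares are handled likewise.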
+ EVT VT = N->getValueType(0);
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ ARMCC::CondCodes CondCode0 = ARMCC::AL;
+ ARMCC::CondCodes CondCode1 = ARMCC::AL;
+ if (N0->getOpcode() == ARMISD::VCMP)
+ CondCode0 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N0->getOperand(2))
+ ->getZExtValue();
+ else if (N0->getOpcode() == ARMISD::VCMPZ)
+ CondCode0 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N0->getOperand(1))
+ ->getZExtValue();
+ if (N1->getOpcode() == ARMISD::VCMP)
+ CondCode1 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N1->getOperand(2))
+ ->getZExtValue();
+ else if (N1->getOpcode() == ARMISD::VCMPZ)
+ CondCode1 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N1->getOperand(1))
+ ->getZExtValue();
+
+ if (CondCode0 == ARMCC::AL || CondCode1 == ARMCC::AL)
+ return SDValue();
+
+ unsigned Opposite0 = ARMCC::getOppositeCondition(CondCode0);
+ unsigned Opposite1 = ARMCC::getOppositeCondition(CondCode1);
+
+ if (!isValidMVECond(Opposite0,
+ N0->getOperand(0)->getValueType(0).isFloatingPoint()) ||
+ !isValidMVECond(Opposite1,
+ N1->getOperand(0)->getValueType(0).isFloatingPoint()))
+ return SDValue();
+
+ SmallVector<SDValue, 4> Ops0;
+ Ops0.push_back(N0->getOperand(0));
+ if (N0->getOpcode() == ARMISD::VCMP)
+ Ops0.push_back(N0->getOperand(1));
+ Ops0.push_back(DCI.DAG.getConstant(Opposite0, SDLoc(N0), MVT::i32));
+ SmallVector<SDValue, 4> Ops1;
+ Ops1.push_back(N1->getOperand(0));
+ if (N1->getOpcode() == ARMISD::VCMP)
+ Ops1.push_back(N1->getOperand(1));
+ Ops1.push_back(DCI.DAG.getConstant(Opposite1, SDLoc(N1), MVT::i32));
+
+ SDValue NewN0 = DCI.DAG.getNode(N0->getOpcode(), SDLoc(N0), VT, Ops0);
+ SDValue NewN1 = DCI.DAG.getNode(N1->getOpcode(), SDLoc(N1), VT, Ops1);
+ SDValue And = DCI.DAG.getNode(ISD::AND, SDLoc(N), VT, NewN0, NewN1);
+ return DCI.DAG.getNode(ISD::XOR, SDLoc(N), VT, And,
+ DCI.DAG.getAllOnesConstant(SDLoc(N), VT));
+}
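The combine above is De Morgan's law on vector predicates: or(cmp0, cmp1) is rewritten as not(and(not cmp0, not cmp1)), with each inner not folded into the inverted compare condition and the outer not emitted as the final XOR against all-ones. A minimal standalone sketch of the same rewrite, separate from the patch itself and with illustrative helper names, modelling v4i1 predicates as 4-bit masks:

#include <cassert>
#include <cstdint>

// Each bit of the mask is one lane's compare result.
static uint8_t vcmpLT(const int *A, const int *B) {
  uint8_t M = 0;
  for (int I = 0; I < 4; ++I)
    if (A[I] < B[I])
      M |= 1u << I;
  return M;
}
static uint8_t vcmpGE(const int *A, const int *B) { // opposite condition of LT
  uint8_t M = 0;
  for (int I = 0; I < 4; ++I)
    if (A[I] >= B[I])
      M |= 1u << I;
  return M;
}
int main() {
  int A[4] = {1, 5, -2, 7}, B[4] = {3, 4, 0, 7};
  int C[4] = {2, 2, 2, 2}, D[4] = {1, 3, 2, 9};
  uint8_t Or = vcmpLT(A, B) | vcmpLT(C, D);
  // The rewrite: AND of the inverted compares, then XOR with all-ones,
  // which plays the role of the vector NOT in PerformORCombine_i1.
  uint8_t Rewritten = (vcmpGE(A, B) & vcmpGE(C, D)) ^ 0xF;
  assert(Or == Rewritten);
}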
+
/// PerformORCombine - Target-specific dag combine xforms for ISD::OR
static SDValue PerformORCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
@@ -11489,7 +12282,7 @@ static SDValue PerformORCombine(SDNode *N,
BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
if (SplatBitSize <= 64) {
EVT VorrVT;
- SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(),
+ SDValue Val = isVMOVModifiedImm(SplatBits.getZExtValue(),
SplatUndef.getZExtValue(), SplatBitSize,
DAG, dl, VorrVT, VT.is128BitVector(),
OtherModImm);
@@ -11553,6 +12346,10 @@ static SDValue PerformORCombine(SDNode *N,
}
}
+ if (Subtarget->hasMVEIntegerOps() &&
+ (VT == MVT::v4i1 || VT == MVT::v8i1 || VT == MVT::v16i1))
+ return PerformORCombine_i1(N, DCI, Subtarget);
+
// Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
// reasonable.
if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
@@ -11921,6 +12718,24 @@ PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
return Vec;
}
+static SDValue
+PerformPREDICATE_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
+ EVT VT = N->getValueType(0);
+ SDValue Op = N->getOperand(0);
+ SDLoc dl(N);
+
+ // PREDICATE_CAST(PREDICATE_CAST(x)) == PREDICATE_CAST(x)
+ if (Op->getOpcode() == ARMISD::PREDICATE_CAST) {
+ // If the valuetypes are the same, we can remove the cast entirely.
+ if (Op->getOperand(0).getValueType() == VT)
+ return Op->getOperand(0);
+ return DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl,
+ Op->getOperand(0).getValueType(), Op->getOperand(0));
+ }
+
+ return SDValue();
+}
+
/// PerformInsertEltCombine - Target-specific dag combine xforms for
/// ISD::INSERT_VECTOR_ELT.
static SDValue PerformInsertEltCombine(SDNode *N,
@@ -12332,7 +13147,7 @@ static SDValue PerformVDUPLANECombine(SDNode *N,
// The canonical VMOV for a zero vector uses a 32-bit element size.
unsigned Imm = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
unsigned EltBits;
- if (ARM_AM::decodeNEONModImm(Imm, EltBits) == 0)
+ if (ARM_AM::decodeVMOVModImm(Imm, EltBits) == 0)
EltSize = 8;
EVT VT = N->getValueType(0);
if (EltSize > VT.getScalarSizeInBits())
@@ -12382,95 +13197,163 @@ static SDValue PerformLOADCombine(SDNode *N,
return SDValue();
}
-/// PerformSTORECombine - Target-specific dag combine xforms for
-/// ISD::STORE.
-static SDValue PerformSTORECombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI) {
- StoreSDNode *St = cast<StoreSDNode>(N);
- if (St->isVolatile())
- return SDValue();
-
- // Optimize trunc store (of multiple scalars) to shuffle and store. First,
- // pack all of the elements in one place. Next, store to memory in fewer
- // chunks.
+// Optimize trunc store (of multiple scalars) to shuffle and store. First,
+// pack all of the elements in one place. Next, store to memory in fewer
+// chunks.
+static SDValue PerformTruncatingStoreCombine(StoreSDNode *St,
+ SelectionDAG &DAG) {
SDValue StVal = St->getValue();
EVT VT = StVal.getValueType();
- if (St->isTruncatingStore() && VT.isVector()) {
- SelectionDAG &DAG = DCI.DAG;
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- EVT StVT = St->getMemoryVT();
- unsigned NumElems = VT.getVectorNumElements();
- assert(StVT != VT && "Cannot truncate to the same type");
- unsigned FromEltSz = VT.getScalarSizeInBits();
- unsigned ToEltSz = StVT.getScalarSizeInBits();
+ if (!St->isTruncatingStore() || !VT.isVector())
+ return SDValue();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ EVT StVT = St->getMemoryVT();
+ unsigned NumElems = VT.getVectorNumElements();
+ assert(StVT != VT && "Cannot truncate to the same type");
+ unsigned FromEltSz = VT.getScalarSizeInBits();
+ unsigned ToEltSz = StVT.getScalarSizeInBits();
+
+ // From, To sizes and ElemCount must be pow of two
+ if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz))
+ return SDValue();
- // From, To sizes and ElemCount must be pow of two
- if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz)) return SDValue();
+ // We are going to use the original vector elt for storing.
+ // Accumulated smaller vector elements must be a multiple of the store size.
+ if (0 != (NumElems * FromEltSz) % ToEltSz)
+ return SDValue();
- // We are going to use the original vector elt for storing.
- // Accumulated smaller vector elements must be a multiple of the store size.
- if (0 != (NumElems * FromEltSz) % ToEltSz) return SDValue();
+ unsigned SizeRatio = FromEltSz / ToEltSz;
+ assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits());
- unsigned SizeRatio = FromEltSz / ToEltSz;
- assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits());
+ // Create a type on which we perform the shuffle.
+ EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(),
+ NumElems * SizeRatio);
+ assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
- // Create a type on which we perform the shuffle.
- EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(),
- NumElems*SizeRatio);
- assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
+ SDLoc DL(St);
+ SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
+ SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
+ for (unsigned i = 0; i < NumElems; ++i)
+ ShuffleVec[i] = DAG.getDataLayout().isBigEndian() ? (i + 1) * SizeRatio - 1
+ : i * SizeRatio;
- SDLoc DL(St);
- SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
- SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
- for (unsigned i = 0; i < NumElems; ++i)
- ShuffleVec[i] = DAG.getDataLayout().isBigEndian()
- ? (i + 1) * SizeRatio - 1
- : i * SizeRatio;
-
- // Can't shuffle using an illegal type.
- if (!TLI.isTypeLegal(WideVecVT)) return SDValue();
-
- SDValue Shuff = DAG.getVectorShuffle(WideVecVT, DL, WideVec,
- DAG.getUNDEF(WideVec.getValueType()),
- ShuffleVec);
- // At this point all of the data is stored at the bottom of the
- // register. We now need to save it to mem.
-
- // Find the largest store unit
- MVT StoreType = MVT::i8;
- for (MVT Tp : MVT::integer_valuetypes()) {
- if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz)
- StoreType = Tp;
- }
- // Didn't find a legal store type.
- if (!TLI.isTypeLegal(StoreType))
- return SDValue();
+ // Can't shuffle using an illegal type.
+ if (!TLI.isTypeLegal(WideVecVT))
+ return SDValue();
- // Bitcast the original vector into a vector of store-size units
- EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
- StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits());
- assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
- SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff);
- SmallVector<SDValue, 8> Chains;
- SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL,
- TLI.getPointerTy(DAG.getDataLayout()));
- SDValue BasePtr = St->getBasePtr();
+ SDValue Shuff = DAG.getVectorShuffle(
+ WideVecVT, DL, WideVec, DAG.getUNDEF(WideVec.getValueType()), ShuffleVec);
+ // At this point all of the data is stored at the bottom of the
+ // register. We now need to save it to mem.
- // Perform one or more big stores into memory.
- unsigned E = (ToEltSz*NumElems)/StoreType.getSizeInBits();
- for (unsigned I = 0; I < E; I++) {
- SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
- StoreType, ShuffWide,
- DAG.getIntPtrConstant(I, DL));
- SDValue Ch = DAG.getStore(St->getChain(), DL, SubVec, BasePtr,
- St->getPointerInfo(), St->getAlignment(),
- St->getMemOperand()->getFlags());
- BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr,
- Increment);
- Chains.push_back(Ch);
- }
- return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
+ // Find the largest store unit
+ MVT StoreType = MVT::i8;
+ for (MVT Tp : MVT::integer_valuetypes()) {
+ if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz)
+ StoreType = Tp;
}
+ // Didn't find a legal store type.
+ if (!TLI.isTypeLegal(StoreType))
+ return SDValue();
+
+ // Bitcast the original vector into a vector of store-size units
+ EVT StoreVecVT =
+ EVT::getVectorVT(*DAG.getContext(), StoreType,
+ VT.getSizeInBits() / EVT(StoreType).getSizeInBits());
+ assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
+ SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff);
+ SmallVector<SDValue, 8> Chains;
+ SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL,
+ TLI.getPointerTy(DAG.getDataLayout()));
+ SDValue BasePtr = St->getBasePtr();
+
+ // Perform one or more big stores into memory.
+ unsigned E = (ToEltSz * NumElems) / StoreType.getSizeInBits();
+ for (unsigned I = 0; I < E; I++) {
+ SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreType,
+ ShuffWide, DAG.getIntPtrConstant(I, DL));
+ SDValue Ch =
+ DAG.getStore(St->getChain(), DL, SubVec, BasePtr, St->getPointerInfo(),
+ St->getAlignment(), St->getMemOperand()->getFlags());
+ BasePtr =
+ DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, Increment);
+ Chains.push_back(Ch);
+ }
+ return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
+}
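Since PerformTruncatingStoreCombine is mostly the old PerformSTORECombine body hoisted into its own function, the transform is easiest to see in miniature. A standalone sketch, separate from the patch and assuming a little-endian host, of the shuffle-then-store-wide idea for a v4i32-to-v4i16 truncating store:

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  uint32_t Src[4] = {0x11112222, 0x33334444, 0x55556666, 0x77778888};
  const unsigned SizeRatio = 32 / 16; // FromEltSz / ToEltSz

  uint16_t Wide[8]; // the v8i16 view created by the BITCAST
  std::memcpy(Wide, Src, sizeof(Src));
  uint16_t Packed[4];
  for (unsigned I = 0; I < 4; ++I)
    Packed[I] = Wide[I * SizeRatio]; // ShuffleVec[i] = i * SizeRatio (LE)

  uint8_t Mem[8];
  std::memcpy(Mem, Packed, sizeof(Packed)); // one store-size chunk

  uint16_t Check[4];
  std::memcpy(Check, Mem, sizeof(Check));
  for (unsigned I = 0; I < 4; ++I)
    assert(Check[I] == (uint16_t)Src[I]); // same as four scalar trunc stores
}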
+
+// Try taking a single vector store from a truncate (which would otherwise turn
+// into an expensive buildvector) and splitting it into a series of narrowing
+// stores.
+static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St,
+ SelectionDAG &DAG) {
+ if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
+ return SDValue();
+ SDValue Trunc = St->getValue();
+ if (Trunc->getOpcode() != ISD::TRUNCATE)
+ return SDValue();
+ EVT FromVT = Trunc->getOperand(0).getValueType();
+ EVT ToVT = Trunc.getValueType();
+ if (!ToVT.isVector())
+ return SDValue();
+ assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements());
+ EVT ToEltVT = ToVT.getVectorElementType();
+ EVT FromEltVT = FromVT.getVectorElementType();
+
+ unsigned NumElements = 0;
+ if (FromEltVT == MVT::i32 && (ToEltVT == MVT::i16 || ToEltVT == MVT::i8))
+ NumElements = 4;
+ if (FromEltVT == MVT::i16 && ToEltVT == MVT::i8)
+ NumElements = 8;
+ if (NumElements == 0 || FromVT.getVectorNumElements() == NumElements ||
+ FromVT.getVectorNumElements() % NumElements != 0)
+ return SDValue();
+
+ SDLoc DL(St);
+ // Details about the old store
+ SDValue Ch = St->getChain();
+ SDValue BasePtr = St->getBasePtr();
+ unsigned Alignment = St->getOriginalAlignment();
+ MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();
+ AAMDNodes AAInfo = St->getAAInfo();
+
+ EVT NewFromVT = EVT::getVectorVT(*DAG.getContext(), FromEltVT, NumElements);
+ EVT NewToVT = EVT::getVectorVT(*DAG.getContext(), ToEltVT, NumElements);
+
+ SmallVector<SDValue, 4> Stores;
+ for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
+ unsigned NewOffset = i * NumElements * ToEltVT.getSizeInBits() / 8;
+ SDValue NewPtr = DAG.getObjectPtrOffset(DL, BasePtr, NewOffset);
+
+ SDValue Extract =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewFromVT, Trunc.getOperand(0),
+ DAG.getConstant(i * NumElements, DL, MVT::i32));
+ SDValue Store = DAG.getTruncStore(
+ Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
+ NewToVT, Alignment, MMOFlags, AAInfo);
+ Stores.push_back(Store);
+ }
+ return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
+}
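A standalone sketch of the split performed above, separate from the patch: one v8i16-to-v8i8 truncating store becomes two v4-sized truncating stores at the byte offsets NewOffset = i * NumElements * ToEltBits / 8:

#include <cassert>
#include <cstdint>

int main() {
  uint16_t Val[8] = {0x0101, 0x0202, 0x0303, 0x0404,
                     0x0505, 0x0606, 0x0707, 0x0808};
  uint8_t Mem[8] = {0};
  const unsigned NumElements = 4, ToEltBits = 8;
  for (unsigned I = 0; I < 8 / NumElements; ++I) {
    unsigned NewOffset = I * NumElements * ToEltBits / 8; // 0, then 4
    for (unsigned J = 0; J < NumElements; ++J)            // one narrowing store
      Mem[NewOffset + J] = (uint8_t)Val[I * NumElements + J];
  }
  for (unsigned I = 0; I < 8; ++I)
    assert(Mem[I] == (uint8_t)Val[I]); // identical to the single wide store
}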
+
+/// PerformSTORECombine - Target-specific dag combine xforms for
+/// ISD::STORE.
+static SDValue PerformSTORECombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const ARMSubtarget *Subtarget) {
+ StoreSDNode *St = cast<StoreSDNode>(N);
+ if (St->isVolatile())
+ return SDValue();
+ SDValue StVal = St->getValue();
+ EVT VT = StVal.getValueType();
+
+ if (Subtarget->hasNEON())
+ if (SDValue Store = PerformTruncatingStoreCombine(St, DCI.DAG))
+ return Store;
+
+ if (Subtarget->hasMVEIntegerOps())
+ if (SDValue NewToken = PerformSplittingToNarrowingStores(St, DCI.DAG))
+ return NewToken;
if (!ISD::isNormalStore(St))
return SDValue();
@@ -12522,7 +13405,7 @@ static SDValue PerformSTORECombine(SDNode *N,
}
// If this is a legal vector store, try to combine it into a VST1_UPD.
- if (ISD::isNormalStore(N) && VT.isVector() &&
+ if (Subtarget->hasNEON() && ISD::isNormalStore(N) && VT.isVector() &&
DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
return CombineBaseUpdate(N, DCI);
@@ -12890,6 +13773,71 @@ static SDValue PerformShiftCombine(SDNode *N,
return SDValue();
}
+// Look for a sign/zero extend of a larger-than-legal load. This can be split
+// into two extending loads, which are simpler to deal with than an arbitrary
+// sign extend.
+static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG) {
+ SDValue N0 = N->getOperand(0);
+ if (N0.getOpcode() != ISD::LOAD)
+ return SDValue();
+ LoadSDNode *LD = cast<LoadSDNode>(N0.getNode());
+ if (!LD->isSimple() || !N0.hasOneUse() || LD->isIndexed() ||
+ LD->getExtensionType() != ISD::NON_EXTLOAD)
+ return SDValue();
+ EVT FromVT = LD->getValueType(0);
+ EVT ToVT = N->getValueType(0);
+ if (!ToVT.isVector())
+ return SDValue();
+ assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements());
+ EVT ToEltVT = ToVT.getVectorElementType();
+ EVT FromEltVT = FromVT.getVectorElementType();
+
+ unsigned NumElements = 0;
+ if (ToEltVT == MVT::i32 && (FromEltVT == MVT::i16 || FromEltVT == MVT::i8))
+ NumElements = 4;
+ if (ToEltVT == MVT::i16 && FromEltVT == MVT::i8)
+ NumElements = 8;
+ if (NumElements == 0 ||
+ FromVT.getVectorNumElements() == NumElements ||
+ FromVT.getVectorNumElements() % NumElements != 0 ||
+ !isPowerOf2_32(NumElements))
+ return SDValue();
+
+ SDLoc DL(LD);
+ // Details about the old load
+ SDValue Ch = LD->getChain();
+ SDValue BasePtr = LD->getBasePtr();
+ unsigned Alignment = LD->getOriginalAlignment();
+ MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
+ AAMDNodes AAInfo = LD->getAAInfo();
+
+ ISD::LoadExtType NewExtType =
+ N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
+ SDValue Offset = DAG.getUNDEF(BasePtr.getValueType());
+ EVT NewFromVT = FromVT.getHalfNumVectorElementsVT(*DAG.getContext());
+ EVT NewToVT = ToVT.getHalfNumVectorElementsVT(*DAG.getContext());
+ unsigned NewOffset = NewFromVT.getSizeInBits() / 8;
+ SDValue NewPtr = DAG.getObjectPtrOffset(DL, BasePtr, NewOffset);
+
+ // Split the load in half, each side of which is extended separately. This
+ // is good enough, as legalisation will take it from there. They are either
+ // already legal or they will be split further into something that is
+ // legal.
+ SDValue NewLoad1 =
+ DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, BasePtr, Offset,
+ LD->getPointerInfo(), NewFromVT, Alignment, MMOFlags, AAInfo);
+ SDValue NewLoad2 =
+ DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset,
+ LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,
+ Alignment, MMOFlags, AAInfo);
+
+ SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
+ SDValue(NewLoad1.getNode(), 1),
+ SDValue(NewLoad2.getNode(), 1));
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain);
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, NewLoad1, NewLoad2);
+}
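The load-side counterpart, sketched standalone and separate from the patch: one sign-extending load of eight i8 elements done as two half-width extending loads, then concatenated:

#include <cassert>
#include <cstdint>

int main() {
  int8_t Mem[8] = {-1, 2, -3, 4, -5, 6, -7, 8};
  int16_t Out[8];
  const unsigned Half = 4;
  const unsigned NewOffset = 4; // NewFromVT.getSizeInBits() / 8 bytes
  for (unsigned I = 0; I < Half; ++I)
    Out[I] = Mem[I];                     // NewLoad1: extends while loading
  for (unsigned I = 0; I < Half; ++I)
    Out[Half + I] = Mem[NewOffset + I];  // NewLoad2, from BasePtr + NewOffset
  for (unsigned I = 0; I < 8; ++I)
    assert(Out[I] == Mem[I]);            // CONCAT_VECTORS of the two halves
}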
+
/// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND,
/// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND.
static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
@@ -12927,6 +13875,10 @@ static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
}
}
+ if (ST->hasMVEIntegerOps())
+ if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG))
+ return NewLoad;
+
return SDValue();
}
@@ -13028,43 +13980,169 @@ SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &D
return V;
}
+// Given N, the value controlling the conditional branch, search for the loop
+// intrinsic, returning it, along with how the value is used. We need to handle
+// patterns such as the following:
+// (brcond (xor (setcc (loop.decrement), 0, ne), 1), exit)
+// (brcond (setcc (loop.decrement), 0, eq), exit)
+// (brcond (setcc (loop.decrement), 0, ne), header)
+static SDValue SearchLoopIntrinsic(SDValue N, ISD::CondCode &CC, int &Imm,
+ bool &Negate) {
+ switch (N->getOpcode()) {
+ default:
+ break;
+ case ISD::XOR: {
+ if (!isa<ConstantSDNode>(N.getOperand(1)))
+ return SDValue();
+ if (!cast<ConstantSDNode>(N.getOperand(1))->isOne())
+ return SDValue();
+ Negate = !Negate;
+ return SearchLoopIntrinsic(N.getOperand(0), CC, Imm, Negate);
+ }
+ case ISD::SETCC: {
+ auto *Const = dyn_cast<ConstantSDNode>(N.getOperand(1));
+ if (!Const)
+ return SDValue();
+ if (Const->isNullValue())
+ Imm = 0;
+ else if (Const->isOne())
+ Imm = 1;
+ else
+ return SDValue();
+ CC = cast<CondCodeSDNode>(N.getOperand(2))->get();
+ return SearchLoopIntrinsic(N->getOperand(0), CC, Imm, Negate);
+ }
+ case ISD::INTRINSIC_W_CHAIN: {
+ unsigned IntOp = cast<ConstantSDNode>(N.getOperand(1))->getZExtValue();
+ if (IntOp != Intrinsic::test_set_loop_iterations &&
+ IntOp != Intrinsic::loop_decrement_reg)
+ return SDValue();
+ return N;
+ }
+ }
+ return SDValue();
+}
+
static SDValue PerformHWLoopCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
const ARMSubtarget *ST) {
- // Look for (brcond (xor test.set.loop.iterations, -1)
- SDValue CC = N->getOperand(1);
- unsigned Opc = CC->getOpcode();
- SDValue Int;
- if ((Opc == ISD::XOR || Opc == ISD::SETCC) &&
- (CC->getOperand(0)->getOpcode() == ISD::INTRINSIC_W_CHAIN)) {
+ // The hwloop intrinsics that we're interested in are used for control-flow,
+ // either for entering or exiting the loop:
+ // - test.set.loop.iterations will test whether its operand is zero. If it
+ //   is zero, the following branch should not enter the loop.
+ // - loop.decrement.reg also tests whether its operand is zero. If it is
+ //   zero, the following branch should not branch back to the beginning of
+ //   the loop.
+ // So here, we need to check how the brcond is using the result of each of
+ // the intrinsics to ensure that we're branching to the right place at the
+ // right time.
+
+ ISD::CondCode CC;
+ SDValue Cond;
+ int Imm = 1;
+ bool Negate = false;
+ SDValue Chain = N->getOperand(0);
+ SDValue Dest;
- assert((isa<ConstantSDNode>(CC->getOperand(1)) &&
- cast<ConstantSDNode>(CC->getOperand(1))->isOne()) &&
- "Expected to compare against 1");
+ if (N->getOpcode() == ISD::BRCOND) {
+ CC = ISD::SETEQ;
+ Cond = N->getOperand(1);
+ Dest = N->getOperand(2);
+ } else {
+ assert(N->getOpcode() == ISD::BR_CC && "Expected BRCOND or BR_CC!");
+ CC = cast<CondCodeSDNode>(N->getOperand(1))->get();
+ Cond = N->getOperand(2);
+ Dest = N->getOperand(4);
+ if (auto *Const = dyn_cast<ConstantSDNode>(N->getOperand(3))) {
+ if (!Const->isOne() && !Const->isNullValue())
+ return SDValue();
+ Imm = Const->getZExtValue();
+ } else
+ return SDValue();
+ }
- Int = CC->getOperand(0);
- } else if (CC->getOpcode() == ISD::INTRINSIC_W_CHAIN)
- Int = CC;
- else
+ SDValue Int = SearchLoopIntrinsic(Cond, CC, Imm, Negate);
+ if (!Int)
return SDValue();
- unsigned IntOp = cast<ConstantSDNode>(Int.getOperand(1))->getZExtValue();
- if (IntOp != Intrinsic::test_set_loop_iterations)
- return SDValue();
+ if (Negate)
+ CC = ISD::getSetCCInverse(CC, true);
+
+ auto IsTrueIfZero = [](ISD::CondCode CC, int Imm) {
+ return (CC == ISD::SETEQ && Imm == 0) ||
+ (CC == ISD::SETNE && Imm == 1) ||
+ (CC == ISD::SETLT && Imm == 1) ||
+ (CC == ISD::SETULT && Imm == 1);
+ };
+
+ auto IsFalseIfZero = [](ISD::CondCode CC, int Imm) {
+ return (CC == ISD::SETEQ && Imm == 1) ||
+ (CC == ISD::SETNE && Imm == 0) ||
+ (CC == ISD::SETGT && Imm == 0) ||
+ (CC == ISD::SETUGT && Imm == 0) ||
+ (CC == ISD::SETGE && Imm == 1) ||
+ (CC == ISD::SETUGE && Imm == 1);
+ };
+
+ assert((IsTrueIfZero(CC, Imm) || IsFalseIfZero(CC, Imm)) &&
+ "unsupported condition");
SDLoc dl(Int);
- SDValue Chain = N->getOperand(0);
+ SelectionDAG &DAG = DCI.DAG;
SDValue Elements = Int.getOperand(2);
- SDValue ExitBlock = N->getOperand(2);
+ unsigned IntOp = cast<ConstantSDNode>(Int->getOperand(1))->getZExtValue();
+ assert((N->hasOneUse() && N->use_begin()->getOpcode() == ISD::BR)
+ && "expected single br user");
+ SDNode *Br = *N->use_begin();
+ SDValue OtherTarget = Br->getOperand(1);
+
+ // Update the unconditional branch to branch to the given Dest.
+ auto UpdateUncondBr = [](SDNode *Br, SDValue Dest, SelectionDAG &DAG) {
+ SDValue NewBrOps[] = { Br->getOperand(0), Dest };
+ SDValue NewBr = DAG.getNode(ISD::BR, SDLoc(Br), MVT::Other, NewBrOps);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Br, 0), NewBr);
+ };
- // TODO: Once we start supporting tail predication, we can add another
- // operand to WLS for the number of elements processed in a vector loop.
+ if (IntOp == Intrinsic::test_set_loop_iterations) {
+ SDValue Res;
+ // We expect this 'instruction' to branch when the counter is zero.
+ if (IsTrueIfZero(CC, Imm)) {
+ SDValue Ops[] = { Chain, Elements, Dest };
+ Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
+ } else {
+ // The logic is the reverse of what we need for WLS, so find the other
+ // basic block target: the target of the following br.
+ UpdateUncondBr(Br, Dest, DAG);
- SDValue Ops[] = { Chain, Elements, ExitBlock };
- SDValue Res = DCI.DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
- DCI.DAG.ReplaceAllUsesOfValueWith(Int.getValue(1), Int.getOperand(0));
- return Res;
+ SDValue Ops[] = { Chain, Elements, OtherTarget };
+ Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
+ }
+ DAG.ReplaceAllUsesOfValueWith(Int.getValue(1), Int.getOperand(0));
+ return Res;
+ } else {
+ SDValue Size = DAG.getTargetConstant(
+ cast<ConstantSDNode>(Int.getOperand(3))->getZExtValue(), dl, MVT::i32);
+ SDValue Args[] = { Int.getOperand(0), Elements, Size, };
+ SDValue LoopDec = DAG.getNode(ARMISD::LOOP_DEC, dl,
+ DAG.getVTList(MVT::i32, MVT::Other), Args);
+ DAG.ReplaceAllUsesWith(Int.getNode(), LoopDec.getNode());
+
+ // We expect this instruction to branch when the count is not zero.
+ SDValue Target = IsFalseIfZero(CC, Imm) ? Dest : OtherTarget;
+
+ // Update the unconditional branch to target the loop preheader if we've
+ // found the condition has been reversed.
+ if (Target == OtherTarget)
+ UpdateUncondBr(Br, Dest, DAG);
+
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ SDValue(LoopDec.getNode(), 1), Chain);
+
+ SDValue EndArgs[] = { Chain, SDValue(LoopDec.getNode(), 0), Target };
+ return DAG.getNode(ARMISD::LE, dl, MVT::Other, EndArgs);
+ }
+ return SDValue();
}
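The WLS and LOOP_DEC/LE nodes built above describe the usual low-overhead-loop shape: WLS branches over the loop when the trip count is zero, and LE subtracts one and branches back while the count remains non-zero. A standalone scalar sketch of that control flow, separate from the patch (runLoop is an illustrative name):

#include <cassert>

static int runLoop(unsigned Count) {
  int Iterations = 0;
  if (Count == 0)           // WLS: skip the loop entirely on a zero count
    return Iterations;
  do {
    ++Iterations;           // loop body
  } while (--Count != 0);   // LOOP_DEC + LE: sub, then branch back if non-zero
  return Iterations;
}

int main() {
  assert(runLoop(0) == 0);
  assert(runLoop(5) == 5);
}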
/// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
@@ -13298,14 +14376,15 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::OR: return PerformORCombine(N, DCI, Subtarget);
case ISD::XOR: return PerformXORCombine(N, DCI, Subtarget);
case ISD::AND: return PerformANDCombine(N, DCI, Subtarget);
- case ISD::BRCOND: return PerformHWLoopCombine(N, DCI, Subtarget);
+ case ISD::BRCOND:
+ case ISD::BR_CC: return PerformHWLoopCombine(N, DCI, Subtarget);
case ARMISD::ADDC:
case ARMISD::SUBC: return PerformAddcSubcCombine(N, DCI, Subtarget);
case ARMISD::SUBE: return PerformAddeSubeCombine(N, DCI, Subtarget);
case ARMISD::BFI: return PerformBFICombine(N, DCI);
case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget);
case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);
- case ISD::STORE: return PerformSTORECombine(N, DCI);
+ case ISD::STORE: return PerformSTORECombine(N, DCI, Subtarget);
case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget);
case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI);
case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG);
@@ -13334,6 +14413,8 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
return PerformVLDCombine(N, DCI);
case ARMISD::BUILD_VECTOR:
return PerformARMBUILD_VECTORCombine(N, DCI);
+ case ARMISD::PREDICATE_CAST:
+ return PerformPREDICATE_CASTCombine(N, DCI);
case ARMISD::SMULWB: {
unsigned BitWidth = N->getValueType(0).getSizeInBits();
APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
@@ -13348,7 +14429,9 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
return SDValue();
break;
}
- case ARMISD::SMLALBB: {
+ case ARMISD::SMLALBB:
+ case ARMISD::QADD16b:
+ case ARMISD::QSUB16b: {
unsigned BitWidth = N->getValueType(0).getSizeInBits();
APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
@@ -13384,6 +14467,15 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
return SDValue();
break;
}
+ case ARMISD::QADD8b:
+ case ARMISD::QSUB8b: {
+ unsigned BitWidth = N->getValueType(0).getSizeInBits();
+ APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 8);
+ if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
+ (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
+ return SDValue();
+ break;
+ }
case ISD::INTRINSIC_VOID:
case ISD::INTRINSIC_W_CHAIN:
switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
@@ -13457,47 +14549,38 @@ bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned,
if (!Subtarget->hasMVEIntegerOps())
return false;
- if (Ty != MVT::v16i8 && Ty != MVT::v8i16 && Ty != MVT::v8f16 &&
- Ty != MVT::v4i32 && Ty != MVT::v4f32 && Ty != MVT::v2i64 &&
- Ty != MVT::v2f64 &&
- // These are for truncated stores
- Ty != MVT::v4i8 && Ty != MVT::v8i8 && Ty != MVT::v4i16)
- return false;
- if (Subtarget->isLittle()) {
- // In little-endian MVE, the store instructions VSTRB.U8,
- // VSTRH.U16 and VSTRW.U32 all store the vector register in
- // exactly the same format, and differ only in the range of
- // their immediate offset field and the required alignment.
- //
- // In particular, VSTRB.U8 can store a vector at byte alignment.
- // So at this stage we can simply say that loads/stores of all
- // 128-bit wide vector types are permitted at any alignment,
- // because we know at least _one_ instruction can manage that.
- //
- // Later on we might find that some of those loads are better
- // generated as VLDRW.U32 if alignment permits, to take
- // advantage of the larger immediate range. But for the moment,
- // all that matters is that if we don't lower the load then
- // _some_ instruction can handle it.
+ // These are for predicates
+ if ((Ty == MVT::v16i1 || Ty == MVT::v8i1 || Ty == MVT::v4i1)) {
+ if (Fast)
+ *Fast = true;
+ return true;
+ }
+
+ // These are for truncated stores/narrowing loads. They are fine so long as
+ // the alignment is at least the size of the item being loaded
+ if ((Ty == MVT::v4i8 || Ty == MVT::v8i8 || Ty == MVT::v4i16) &&
+ Alignment >= VT.getScalarSizeInBits() / 8) {
+ if (Fast)
+ *Fast = true;
+ return true;
+ }
+
+ // In little-endian MVE, the store instructions VSTRB.U8, VSTRH.U16 and
+ // VSTRW.U32 all store the vector register in exactly the same format, and
+ // differ only in the range of their immediate offset field and the required
+ // alignment. So there is always a store that can be used, regardless of
+ // actual type.
+ //
+ // For big endian, that is not the case. But we can still emit a (VSTRB.U8;
+ // VREV64.8) pair and get the same effect. This will likely be better than
+ // aligning the vector through the stack.
+ if (Ty == MVT::v16i8 || Ty == MVT::v8i16 || Ty == MVT::v8f16 ||
+ Ty == MVT::v4i32 || Ty == MVT::v4f32 || Ty == MVT::v2i64 ||
+ Ty == MVT::v2f64) {
if (Fast)
*Fast = true;
return true;
- } else {
- // In big-endian MVE, those instructions aren't so similar
- // after all, because they reorder the bytes of the vector
- // differently. So this time we can only store a particular
- // kind of vector if its alignment is at least the element
- // type. And we can't store vectors of i64 or f64 at all
- // without having to do some postprocessing, because there's
- // no VSTRD.U64.
- if (Ty == MVT::v16i8 ||
- ((Ty == MVT::v8i16 || Ty == MVT::v8f16) && Alignment >= 2) ||
- ((Ty == MVT::v4i32 || Ty == MVT::v4f32) && Alignment >= 4)) {
- if (Fast)
- *Fast = true;
- return true;
- }
}
return false;
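For the MVE half of this hook, the rewrite above sorts types into three buckets: predicate vectors (always fine), sub-128-bit truncating/narrowing types (fine when the alignment covers one element), and full 128-bit vectors (always fine, by the VSTRB.U8 argument in the comment). A standalone sketch of that decision, separate from the patch and with an illustrative helper name:

#include <cassert>

// (element bits, total bits, alignment in bytes, is-a-predicate-type)
static bool allowsMisaligned(unsigned EltBits, unsigned TotalBits,
                             unsigned AlignBytes, bool IsPredicate) {
  if (IsPredicate)
    return true;                        // v16i1 / v8i1 / v4i1
  if (TotalBits < 128)                  // truncating / narrowing types
    return AlignBytes >= EltBits / 8;
  return TotalBits == 128;              // full vectors: some VSTR always fits
}

int main() {
  assert(allowsMisaligned(16, 64, 2, false));  // v4i16 at align 2: ok
  assert(!allowsMisaligned(16, 64, 1, false)); // v4i16 at align 1: rejected
  assert(allowsMisaligned(8, 128, 1, false));  // v16i8 at any alignment
}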
@@ -13617,22 +14700,60 @@ static bool areExtractExts(Value *Ext1, Value *Ext2) {
/// sext/zext can be folded into vsubl.
bool ARMTargetLowering::shouldSinkOperands(Instruction *I,
SmallVectorImpl<Use *> &Ops) const {
- if (!Subtarget->hasNEON() || !I->getType()->isVectorTy())
+ if (!I->getType()->isVectorTy())
return false;
- switch (I->getOpcode()) {
- case Instruction::Sub:
- case Instruction::Add: {
- if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
+ if (Subtarget->hasNEON()) {
+ switch (I->getOpcode()) {
+ case Instruction::Sub:
+ case Instruction::Add: {
+ if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
+ return false;
+ Ops.push_back(&I->getOperandUse(0));
+ Ops.push_back(&I->getOperandUse(1));
+ return true;
+ }
+ default:
return false;
- Ops.push_back(&I->getOperandUse(0));
- Ops.push_back(&I->getOperandUse(1));
- return true;
+ }
}
- default:
+
+ if (!Subtarget->hasMVEIntegerOps())
+ return false;
+
+ auto IsSinker = [](Instruction *I, int Operand) {
+ switch (I->getOpcode()) {
+ case Instruction::Add:
+ case Instruction::Mul:
+ return true;
+ case Instruction::Sub:
+ return Operand == 1;
+ default:
+ return false;
+ }
+ };
+
+ int Op = 0;
+ if (!isa<ShuffleVectorInst>(I->getOperand(Op)))
+ Op = 1;
+ if (!IsSinker(I, Op))
+ return false;
+ if (!match(I->getOperand(Op),
+ m_ShuffleVector(m_InsertElement(m_Undef(), m_Value(), m_ZeroInt()),
+ m_Undef(), m_Zero()))) {
return false;
}
- return false;
+ Instruction *Shuffle = cast<Instruction>(I->getOperand(Op));
+ // All uses of the shuffle should be sunk to avoid duplicating it across gpr
+ // and vector registers
+ for (Use &U : Shuffle->uses()) {
+ Instruction *Insn = cast<Instruction>(U.getUser());
+ if (!IsSinker(Insn, U.getOperandNo()))
+ return false;
+ }
+ Ops.push_back(&Shuffle->getOperandUse(0));
+ Ops.push_back(&I->getOperandUse(Op));
+ return true;
}
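The MVE half of shouldSinkOperands matches a splat: a shufflevector of an insertelement into lane zero, broadcast with a zero mask. Sinking it next to its add/mul/sub users pays off because MVE instructions such as VADD.i32 can take the scalar operand straight from a GPR, so the broadcast never has to occupy a vector register. A standalone scalar sketch of the pattern, separate from the patch:

#include <cassert>

int main() {
  int V[4] = {1, 2, 3, 4}, Out[4];
  int S = 10;
  // shufflevector(insertelement(undef, S, 0), undef, zeroinitializer)
  int Splat[4] = {S, S, S, S};
  for (int I = 0; I < 4; ++I)
    Out[I] = V[I] + Splat[I];  // candidate user: Instruction::Add
  for (int I = 0; I < 4; ++I)
    assert(Out[I] == V[I] + S);
}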
bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
@@ -13641,6 +14762,11 @@ bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
if (!isTypeLegal(VT))
return false;
+ if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal.getOperand(0))) {
+ if (Ld->isExpandingLoad())
+ return false;
+ }
+
// Don't create a loadext if we can fold the extension into a wide/long
// instruction.
// If there's more than one user instruction, the loadext is desirable no
@@ -14028,6 +15154,52 @@ static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT,
return false;
}
+static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, unsigned Align,
+ bool isSEXTLoad, bool isLE, SDValue &Base,
+ SDValue &Offset, bool &isInc,
+ SelectionDAG &DAG) {
+ if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
+ return false;
+ if (!isa<ConstantSDNode>(Ptr->getOperand(1)))
+ return false;
+
+ ConstantSDNode *RHS = cast<ConstantSDNode>(Ptr->getOperand(1));
+ int RHSC = (int)RHS->getZExtValue();
+
+ auto IsInRange = [&](int RHSC, int Limit, int Scale) {
+ if (RHSC < 0 && RHSC > -Limit * Scale && RHSC % Scale == 0) {
+ assert(Ptr->getOpcode() == ISD::ADD);
+ isInc = false;
+ Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
+ return true;
+ } else if (RHSC > 0 && RHSC < Limit * Scale && RHSC % Scale == 0) {
+ isInc = Ptr->getOpcode() == ISD::ADD;
+ Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
+ return true;
+ }
+ return false;
+ };
+
+ // Try to find a matching instruction based on s/zext, Alignment, Offset and
+ // (in BE) type.
+ Base = Ptr->getOperand(0);
+ if (VT == MVT::v4i16) {
+ if (Align >= 2 && IsInRange(RHSC, 0x80, 2))
+ return true;
+ } else if (VT == MVT::v4i8 || VT == MVT::v8i8) {
+ if (IsInRange(RHSC, 0x80, 1))
+ return true;
+ } else if (Align >= 4 && (isLE || VT == MVT::v4i32 || VT == MVT::v4f32) &&
+ IsInRange(RHSC, 0x80, 4))
+ return true;
+ else if (Align >= 2 && (isLE || VT == MVT::v8i16 || VT == MVT::v8f16) &&
+ IsInRange(RHSC, 0x80, 2))
+ return true;
+ else if ((isLE || VT == MVT::v16i8) && IsInRange(RHSC, 0x80, 1))
+ return true;
+ return false;
+}
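The IsInRange lambda above encodes MVE's scaled 7-bit immediate offset field. A standalone sketch of the legality test on its own, separate from the patch and with an illustrative helper name (zero offsets are deliberately not folded):

#include <cassert>

static bool isLegalMVEOffset(int Off, int Limit, int Scale) {
  return Off != 0 && Off % Scale == 0 &&
         Off > -Limit * Scale && Off < Limit * Scale;
}

int main() {
  assert(isLegalMVEOffset(254, 0x80, 2));  // fits a halfword-scaled form
  assert(!isLegalMVEOffset(255, 0x80, 2)); // not a multiple of the scale
  assert(!isLegalMVEOffset(256, 0x80, 2)); // one past the 7-bit range
  assert(isLegalMVEOffset(-2, 0x80, 2));   // negative offsets fold too
}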
+
/// getPreIndexedAddressParts - returns true by value, base pointer and
/// offset pointer and addressing mode by reference if the node's address
/// can be legally represented as pre-indexed load / store address.
@@ -14041,25 +15213,35 @@ ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
EVT VT;
SDValue Ptr;
+ unsigned Align;
bool isSEXTLoad = false;
if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
Ptr = LD->getBasePtr();
- VT = LD->getMemoryVT();
+ VT = LD->getMemoryVT();
+ Align = LD->getAlignment();
isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
} else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
Ptr = ST->getBasePtr();
- VT = ST->getMemoryVT();
+ VT = ST->getMemoryVT();
+ Align = ST->getAlignment();
} else
return false;
bool isInc;
bool isLegal = false;
- if (Subtarget->isThumb2())
- isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
- Offset, isInc, DAG);
- else
- isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
- Offset, isInc, DAG);
+ if (VT.isVector())
+ isLegal = Subtarget->hasMVEIntegerOps() &&
+ getMVEIndexedAddressParts(Ptr.getNode(), VT, Align, isSEXTLoad,
+ Subtarget->isLittle(), Base, Offset,
+ isInc, DAG);
+ else {
+ if (Subtarget->isThumb2())
+ isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
+ Offset, isInc, DAG);
+ else
+ isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
+ Offset, isInc, DAG);
+ }
if (!isLegal)
return false;
@@ -14077,15 +15259,18 @@ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
SelectionDAG &DAG) const {
EVT VT;
SDValue Ptr;
+ unsigned Align;
bool isSEXTLoad = false, isNonExt;
if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
- VT = LD->getMemoryVT();
+ VT = LD->getMemoryVT();
Ptr = LD->getBasePtr();
+ Align = LD->getAlignment();
isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
} else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
- VT = ST->getMemoryVT();
+ VT = ST->getMemoryVT();
Ptr = ST->getBasePtr();
+ Align = ST->getAlignment();
isNonExt = !ST->isTruncatingStore();
} else
return false;
@@ -14108,12 +15293,19 @@ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
bool isInc;
bool isLegal = false;
- if (Subtarget->isThumb2())
- isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
- isInc, DAG);
- else
- isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
+ if (VT.isVector())
+ isLegal = Subtarget->hasMVEIntegerOps() &&
+ getMVEIndexedAddressParts(Op, VT, Align, isSEXTLoad,
+ Subtarget->isLittle(), Base, Offset,
isInc, DAG);
+ else {
+ if (Subtarget->isThumb2())
+ isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
+ isInc, DAG);
+ else
+ isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
+ isInc, DAG);
+ }
if (!isLegal)
return false;
@@ -14369,7 +15561,8 @@ const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const {
/// constraint it is for this target.
ARMTargetLowering::ConstraintType
ARMTargetLowering::getConstraintType(StringRef Constraint) const {
- if (Constraint.size() == 1) {
+ unsigned S = Constraint.size();
+ if (S == 1) {
switch (Constraint[0]) {
default: break;
case 'l': return C_RegisterClass;
@@ -14377,12 +15570,12 @@ ARMTargetLowering::getConstraintType(StringRef Constraint) const {
case 'h': return C_RegisterClass;
case 'x': return C_RegisterClass;
case 't': return C_RegisterClass;
- case 'j': return C_Other; // Constant for movw.
- // An address with a single base register. Due to the way we
- // currently handle addresses it is the same as an 'r' memory constraint.
+ case 'j': return C_Immediate; // Constant for movw.
+ // An address with a single base register. Due to the way we
+ // currently handle addresses it is the same as an 'r' memory constraint.
case 'Q': return C_Memory;
}
- } else if (Constraint.size() == 2) {
+ } else if (S == 2) {
switch (Constraint[0]) {
default: break;
case 'T': return C_RegisterClass;
@@ -14535,7 +15728,7 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
case 'j':
// Constant suitable for movw, must be between 0 and
// 65535.
- if (Subtarget->hasV6T2Ops())
+ if (Subtarget->hasV6T2Ops() || (Subtarget->hasV8MBaselineOps()))
if (CVal >= 0 && CVal <= 65535)
break;
return;
@@ -14643,7 +15836,7 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
return;
case 'N':
- if (Subtarget->isThumb()) { // FIXME thumb2
+ if (Subtarget->isThumb1Only()) {
// This must be a constant between 0 and 31, for shift amounts.
if (CVal >= 0 && CVal <= 31)
break;
@@ -14651,7 +15844,7 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
return;
case 'O':
- if (Subtarget->isThumb()) { // FIXME thumb2
+ if (Subtarget->isThumb1Only()) {
// This must be a multiple of 4 between -508 and 508, for
// ADD/SUB sp = sp + immediate.
if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0))
@@ -14874,6 +16067,7 @@ SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
// without FP16. So we must do a function call.
SDLoc Loc(Op);
RTLIB::Libcall LC;
+ MakeLibCallOptions CallOptions;
if (SrcSz == 16) {
// Instruction from 16 -> 32
if (Subtarget->hasFP16())
@@ -14884,7 +16078,7 @@ SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
assert(LC != RTLIB::UNKNOWN_LIBCALL &&
"Unexpected type for custom-lowering FP_EXTEND");
SrcVal =
- makeLibCall(DAG, LC, MVT::f32, SrcVal, /*isSigned*/ false, Loc).first;
+ makeLibCall(DAG, LC, MVT::f32, SrcVal, CallOptions, Loc).first;
}
}
@@ -14897,7 +16091,7 @@ SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
LC = RTLIB::getFPEXT(MVT::f32, MVT::f64);
assert(LC != RTLIB::UNKNOWN_LIBCALL &&
"Unexpected type for custom-lowering FP_EXTEND");
- return makeLibCall(DAG, LC, MVT::f64, SrcVal, /*isSigned*/ false, Loc).first;
+ return makeLibCall(DAG, LC, MVT::f64, SrcVal, CallOptions, Loc).first;
}
SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
@@ -14923,7 +16117,8 @@ SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
RTLIB::Libcall LC = RTLIB::getFPROUND(SrcVT, DstVT);
assert(LC != RTLIB::UNKNOWN_LIBCALL &&
"Unexpected type for custom-lowering FP_ROUND");
- return makeLibCall(DAG, LC, DstVT, SrcVal, /*isSigned*/ false, Loc).first;
+ MakeLibCallOptions CallOptions;
+ return makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions, Loc).first;
}
void ARMTargetLowering::lowerABS(SDNode *N, SmallVectorImpl<SDValue> &Results,
@@ -15015,7 +16210,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
- Info.align = cast<ConstantInt>(AlignArg)->getZExtValue();
+ Info.align = MaybeAlign(cast<ConstantInt>(AlignArg)->getZExtValue());
// volatile loads with NEON intrinsics not supported
Info.flags = MachineMemOperand::MOLoad;
return true;
@@ -15030,7 +16225,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
Info.offset = 0;
- Info.align = 0;
+ Info.align.reset();
// volatile loads with NEON intrinsics not supported
Info.flags = MachineMemOperand::MOLoad;
return true;
@@ -15056,7 +16251,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
- Info.align = cast<ConstantInt>(AlignArg)->getZExtValue();
+ Info.align = MaybeAlign(cast<ConstantInt>(AlignArg)->getZExtValue());
// volatile stores with NEON intrinsics not supported
Info.flags = MachineMemOperand::MOStore;
return true;
@@ -15077,7 +16272,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
- Info.align = 0;
+ Info.align.reset();
// volatile stores with NEON intrinsics not supported
Info.flags = MachineMemOperand::MOStore;
return true;
@@ -15090,7 +16285,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.memVT = MVT::getVT(PtrTy->getElementType());
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
- Info.align = DL.getABITypeAlignment(PtrTy->getElementType());
+ Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType()));
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
return true;
}
@@ -15102,7 +16297,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.memVT = MVT::getVT(PtrTy->getElementType());
Info.ptrVal = I.getArgOperand(1);
Info.offset = 0;
- Info.align = DL.getABITypeAlignment(PtrTy->getElementType());
+ Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType()));
Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
return true;
}
@@ -15112,7 +16307,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.memVT = MVT::i64;
Info.ptrVal = I.getArgOperand(2);
Info.offset = 0;
- Info.align = 8;
+ Info.align = Align(8);
Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
return true;
@@ -15122,7 +16317,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.memVT = MVT::i64;
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
- Info.align = 8;
+ Info.align = Align(8);
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
return true;
@@ -15473,6 +16668,12 @@ bool ARMTargetLowering::isLegalInterleavedAccessType(
return VecSize == 64 || VecSize % 128 == 0;
}
+unsigned ARMTargetLowering::getMaxSupportedInterleaveFactor() const {
+ if (Subtarget->hasNEON())
+ return 4;
+ return TargetLoweringBase::getMaxSupportedInterleaveFactor();
+}
+
/// Lower an interleaved load into a vldN intrinsic.
///
/// E.g. Lower an interleaved load (Factor = 2):
@@ -15792,15 +16993,15 @@ static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base,
}
/// Return the correct alignment for the current calling convention.
-unsigned
-ARMTargetLowering::getABIAlignmentForCallingConv(Type *ArgTy,
- DataLayout DL) const {
+Align ARMTargetLowering::getABIAlignmentForCallingConv(Type *ArgTy,
+ DataLayout DL) const {
+ const Align ABITypeAlign(DL.getABITypeAlignment(ArgTy));
if (!ArgTy->isVectorTy())
- return DL.getABITypeAlignment(ArgTy);
+ return ABITypeAlign;
// Avoid over-aligning vector parameters. It would require realigning the
// stack and waste space for no real benefit.
- return std::min(DL.getABITypeAlignment(ArgTy), DL.getStackAlignment());
+ return std::min(ABITypeAlign, DL.getStackAlignment());
}
/// Return true if a type is an AAPCS-VFP homogeneous aggregate or one of
@@ -15861,7 +17062,7 @@ void ARMTargetLowering::insertCopiesSplitCSR(
else
llvm_unreachable("Unexpected register class in CSRsViaCopy!");
- unsigned NewVR = MRI->createVirtualRegister(RC);
+ Register NewVR = MRI->createVirtualRegister(RC);
// Create copy from CSR to a virtual register.
// FIXME: this currently does not emit CFI pseudo-instructions, it works
// fine for CXX_FAST_TLS since the C++-style TLS access functions should be
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
index 1675ec59a354..53813fad5afd 100644
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -103,6 +103,7 @@ class VectorType;
ADDE, // Add using carry
SUBC, // Sub with carry
SUBE, // Sub using carry
+ LSLS, // Shift left producing carry
VMOVRRD, // double to two gprs.
VMOVDRR, // Two gprs to double.
@@ -126,17 +127,13 @@ class VectorType;
WIN__DBZCHK, // Windows' divide by zero check
WLS, // Low-overhead loops, While Loop Start
+ LOOP_DEC, // Really a part of LE, performs the sub
+ LE, // Low-overhead loops, Loop End
- VCEQ, // Vector compare equal.
- VCEQZ, // Vector compare equal to zero.
- VCGE, // Vector compare greater than or equal.
- VCGEZ, // Vector compare greater than or equal to zero.
- VCLEZ, // Vector compare less than or equal to zero.
- VCGEU, // Vector compare unsigned greater than or equal.
- VCGT, // Vector compare greater than.
- VCGTZ, // Vector compare greater than zero.
- VCLTZ, // Vector compare less than zero.
- VCGTU, // Vector compare unsigned greater than.
+ PREDICATE_CAST, // Predicate cast for MVE i1 types
+
+ VCMP, // Vector compare.
+ VCMPZ, // Vector compare to zero.
VTST, // Vector test bits.
// Vector shift by vector
@@ -200,6 +197,7 @@ class VectorType;
VTRN, // transpose
VTBL1, // 1-register shuffle with mask
VTBL2, // 2-register shuffle with mask
+ VMOVN, // MVE vmovn
// Vector multiply long:
VMULLs, // ...signed
@@ -221,6 +219,12 @@ class VectorType;
SMMLAR, // Signed multiply long, round and add
SMMLSR, // Signed multiply long, subtract and round
+ // Single-lane QADD8 and QADD16: only the bottom lane (hence the "b").
+ QADD8b,
+ QSUB8b,
+ QADD16b,
+ QSUB16b,
+
// Operands of the standard BUILD_VECTOR node are not legalized, which
// is fine if BUILD_VECTORs are always lowered to shuffles or other
// operations, but for ARM some BUILD_VECTORs are legal as-is and their
@@ -243,6 +247,11 @@ class VectorType;
// instructions.
MEMCPY,
+ // V8.1-M Mainline condition select
+ CSINV, // Conditional select invert.
+ CSNEG, // Conditional select negate.
+ CSINC, // Conditional select increment.
+
// Vector load N-element structure to all lanes:
VLD1DUP = ISD::FIRST_TARGET_MEMORY_OPCODE,
VLD2DUP,
@@ -539,7 +548,7 @@ class VectorType;
Instruction *emitTrailingFence(IRBuilder<> &Builder, Instruction *Inst,
AtomicOrdering Ord) const override;
- unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
+ unsigned getMaxSupportedInterleaveFactor() const override;
bool lowerInterleavedLoad(LoadInst *LI,
ArrayRef<ShuffleVectorInst *> Shuffles,
@@ -608,8 +617,8 @@ class VectorType;
void finalizeLowering(MachineFunction &MF) const override;
/// Return the correct alignment for the current calling convention.
- unsigned getABIAlignmentForCallingConv(Type *ArgTy,
- DataLayout DL) const override;
+ Align getABIAlignmentForCallingConv(Type *ArgTy,
+ DataLayout DL) const override;
bool isDesirableToCommuteWithShift(const SDNode *N,
CombineLevel Level) const override;
@@ -670,6 +679,8 @@ class VectorType;
SDValue LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG,
+ const ARMSubtarget *Subtarget) const;
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
const ARMSubtarget *Subtarget) const;
SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
@@ -721,8 +732,8 @@ class VectorType;
void lowerABS(SDNode *N, SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const;
- unsigned getRegisterByName(const char* RegName, EVT VT,
- SelectionDAG &DAG) const override;
+ Register getRegisterByName(const char* RegName, EVT VT,
+ const MachineFunction &MF) const override;
SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
SmallVectorImpl<SDNode *> &Created) const override;
@@ -814,7 +825,7 @@ class VectorType;
SDValue getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
SDValue &ARMcc, SelectionDAG &DAG, const SDLoc &dl) const;
SDValue getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG,
- const SDLoc &dl, bool InvalidOnQNaN) const;
+ const SDLoc &dl) const;
SDValue duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const;
SDValue OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const;
@@ -838,7 +849,7 @@ class VectorType;
void setAllExpand(MVT VT);
};
- enum NEONModImmType {
+ enum VMOVModImmType {
VMOVModImm,
VMVNModImm,
MVEVMVNModImm,
diff --git a/lib/Target/ARM/ARMInstrFormats.td b/lib/Target/ARM/ARMInstrFormats.td
index bc93a058720c..1da32ad2af6c 100644
--- a/lib/Target/ARM/ARMInstrFormats.td
+++ b/lib/Target/ARM/ARMInstrFormats.td
@@ -188,6 +188,13 @@ def s_cc_out : OptionalDefOperand<OtherVT, (ops CCR), (ops (i32 CPSR))> {
let DecoderMethod = "DecodeCCOutOperand";
}
+// Transform to generate the inverse of a condition code during ISel
+def inv_cond_XFORM : SDNodeXForm<imm, [{
+ ARMCC::CondCodes CC = static_cast<ARMCC::CondCodes>(N->getZExtValue());
+ return CurDAG->getTargetConstant(ARMCC::getOppositeCondition(CC), SDLoc(N),
+ MVT::i32);
+}]>;
+
// VPT predicate
def VPTPredNOperand : AsmOperandClass {
@@ -401,6 +408,8 @@ class InstTemplate<AddrMode am, int sz, IndexMode im,
// mnemonic (when not in an IT block) or preclude it (when in an IT block).
bit thumbArithFlagSetting = 0;
+ bit validForTailPredication = 0;
+
// If this is a pseudo instruction, mark it isCodeGenOnly.
let isCodeGenOnly = !eq(!cast<string>(f), "Pseudo");
@@ -412,6 +421,7 @@ class InstTemplate<AddrMode am, int sz, IndexMode im,
let TSFlags{14} = canXformTo16Bit;
let TSFlags{18-15} = D.Value;
let TSFlags{19} = thumbArithFlagSetting;
+ let TSFlags{20} = validForTailPredication;
let Constraints = cstr;
let Itinerary = itin;
@@ -455,6 +465,7 @@ class AsmPseudoInst<string asm, dag iops, dag oops = (outs)>
let isCodeGenOnly = 0; // So we get asm matcher for it.
let AsmString = asm;
let isPseudo = 1;
+ let hasNoSchedulingInfo = 1;
}
class ARMAsmPseudo<string asm, dag iops, dag oops = (outs)>
@@ -2282,7 +2293,7 @@ class N1ModImm<bit op23, bits<3> op21_19, bits<4> op11_8, bit op7, bit op6,
let Inst{24} = SIMM{7};
let Inst{18-16} = SIMM{6-4};
let Inst{3-0} = SIMM{3-0};
- let DecoderMethod = "DecodeNEONModImmInstruction";
+ let DecoderMethod = "DecodeVMOVModImmInstruction";
}
// NEON 2 vector register format.
@@ -2724,6 +2735,16 @@ def complexrotateopodd : Operand<i32> {
let PrintMethod = "printComplexRotationOp<180, 90>";
}
+def MveSaturateOperand : AsmOperandClass {
+ let PredicateMethod = "isMveSaturateOp";
+ let DiagnosticString = "saturate operand must be 48 or 64";
+ let Name = "MveSaturate";
+}
+def saturateop : Operand<i32> {
+ let ParserMatchClass = MveSaturateOperand;
+ let PrintMethod = "printMveSaturateOp";
+}
+
// Data type suffix token aliases. Implements Table A7-3 in the ARM ARM.
def : TokenAlias<".s8", ".i8">;
def : TokenAlias<".u8", ".i8">;
diff --git a/lib/Target/ARM/ARMInstrInfo.cpp b/lib/Target/ARM/ARMInstrInfo.cpp
index 388c889349b7..a802d5a06f07 100644
--- a/lib/Target/ARM/ARMInstrInfo.cpp
+++ b/lib/Target/ARM/ARMInstrInfo.cpp
@@ -117,7 +117,7 @@ void ARMInstrInfo::expandLoadStackGuard(MachineBasicBlock::iterator MI) const {
MachineBasicBlock &MBB = *MI->getParent();
DebugLoc DL = MI->getDebugLoc();
- unsigned Reg = MI->getOperand(0).getReg();
+ Register Reg = MI->getOperand(0).getReg();
MachineInstrBuilder MIB;
MIB = BuildMI(MBB, MI, DL, get(ARM::MOV_ga_pcrel_ldr), Reg)
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index e35145463852..fe696222ec70 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -51,8 +51,6 @@ def SDT_ARMAnd : SDTypeProfile<1, 2,
SDTCisVT<2, i32>]>;
def SDT_ARMCmp : SDTypeProfile<0, 2, [SDTCisSameAs<0, 1>]>;
-def SDT_ARMFCmp : SDTypeProfile<0, 3, [SDTCisSameAs<0, 1>,
- SDTCisVT<2, i32>]>;
def SDT_ARMPICAdd : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>,
SDTCisPtrTy<1>, SDTCisVT<2, i32>]>;
@@ -108,14 +106,24 @@ def SDT_ARMIntShiftParts : SDTypeProfile<2, 3, [SDTCisSameAs<0, 1>,
// TODO Add another operand for 'Size' so that we can re-use this node when we
// start supporting *TP versions.
-def SDT_ARMWhileLoop : SDTypeProfile<0, 2, [SDTCisVT<0, i32>,
- SDTCisVT<1, OtherVT>]>;
+def SDT_ARMLoLoop : SDTypeProfile<0, 2, [SDTCisVT<0, i32>,
+ SDTCisVT<1, OtherVT>]>;
def ARMSmlald : SDNode<"ARMISD::SMLALD", SDT_LongMac>;
def ARMSmlaldx : SDNode<"ARMISD::SMLALDX", SDT_LongMac>;
def ARMSmlsld : SDNode<"ARMISD::SMLSLD", SDT_LongMac>;
def ARMSmlsldx : SDNode<"ARMISD::SMLSLDX", SDT_LongMac>;
+def SDT_ARMCSel : SDTypeProfile<1, 3,
+ [SDTCisSameAs<0, 1>,
+ SDTCisSameAs<0, 2>,
+ SDTCisInt<3>,
+ SDTCisVT<3, i32>]>;
+
+def ARMcsinv : SDNode<"ARMISD::CSINV", SDT_ARMCSel, [SDNPOptInGlue]>;
+def ARMcsneg : SDNode<"ARMISD::CSNEG", SDT_ARMCSel, [SDNPOptInGlue]>;
+def ARMcsinc : SDNode<"ARMISD::CSINC", SDT_ARMCSel, [SDNPOptInGlue]>;
+
def SDT_MulHSR : SDTypeProfile<1, 3, [SDTCisVT<0,i32>,
SDTCisSameAs<0, 1>,
SDTCisSameAs<0, 2>,
@@ -194,6 +202,7 @@ def ARMrrx : SDNode<"ARMISD::RRX" , SDTIntUnaryOp, [SDNPInGlue ]>;
def ARMaddc : SDNode<"ARMISD::ADDC", SDTBinaryArithWithFlags,
[SDNPCommutative]>;
def ARMsubc : SDNode<"ARMISD::SUBC", SDTBinaryArithWithFlags>;
+def ARMlsls : SDNode<"ARMISD::LSLS", SDTBinaryArithWithFlags>;
def ARMadde : SDNode<"ARMISD::ADDE", SDTBinaryArithWithFlagsInOut>;
def ARMsube : SDNode<"ARMISD::SUBE", SDTBinaryArithWithFlagsInOut>;
@@ -229,6 +238,11 @@ def ARMsmlalbt : SDNode<"ARMISD::SMLALBT", SDT_LongMac, []>;
def ARMsmlaltb : SDNode<"ARMISD::SMLALTB", SDT_LongMac, []>;
def ARMsmlaltt : SDNode<"ARMISD::SMLALTT", SDT_LongMac, []>;
+def ARMqadd8b : SDNode<"ARMISD::QADD8b", SDT_ARMAnd, []>;
+def ARMqsub8b : SDNode<"ARMISD::QSUB8b", SDT_ARMAnd, []>;
+def ARMqadd16b : SDNode<"ARMISD::QADD16b", SDT_ARMAnd, []>;
+def ARMqsub16b : SDNode<"ARMISD::QSUB16b", SDT_ARMAnd, []>;
+
// Vector operations shared between NEON and MVE
def ARMvdup : SDNode<"ARMISD::VDUP", SDTypeProfile<1, 1, [SDTCisVec<0>]>>;
@@ -265,8 +279,16 @@ def ARMvshruImm : SDNode<"ARMISD::VSHRuIMM", SDTARMVSHIMM>;
def ARMvshls : SDNode<"ARMISD::VSHLs", SDTARMVSH>;
def ARMvshlu : SDNode<"ARMISD::VSHLu", SDTARMVSH>;
-def ARMWLS : SDNode<"ARMISD::WLS", SDT_ARMWhileLoop,
- [SDNPHasChain]>;
+def SDTARMVCMP : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<1, 2>,
+ SDTCisInt<3>]>;
+def SDTARMVCMPZ : SDTypeProfile<1, 2, [SDTCisInt<2>]>;
+
+def ARMvcmp : SDNode<"ARMISD::VCMP", SDTARMVCMP>;
+def ARMvcmpz : SDNode<"ARMISD::VCMPZ", SDTARMVCMPZ>;
+
+def ARMWLS : SDNode<"ARMISD::WLS", SDT_ARMLoLoop, [SDNPHasChain]>;
+def ARMLE : SDNode<"ARMISD::LE", SDT_ARMLoLoop, [SDNPHasChain]>;
+def ARMLoopDec : SDNode<"ARMISD::LOOP_DEC", SDTIntBinOp, [SDNPHasChain]>;
//===----------------------------------------------------------------------===//
// ARM Flag Definitions.
@@ -1948,7 +1970,7 @@ multiclass AI_str1nopc<bit isByte, string opc, InstrItinClass iii,
/// the function. The first operand is the ID# for this instruction, the second
/// is the index into the MachineConstantPool that this is, the third is the
/// size in bytes of this constant pool entry.
-let hasSideEffects = 0, isNotDuplicable = 1 in
+let hasSideEffects = 0, isNotDuplicable = 1, hasNoSchedulingInfo = 1 in
def CONSTPOOL_ENTRY :
PseudoInst<(outs), (ins cpinst_operand:$instid, cpinst_operand:$cpidx,
i32imm:$size), NoItinerary, []>;
@@ -2361,6 +2383,12 @@ let isCall = 1,
def BMOVPCB_CALL : ARMPseudoInst<(outs), (ins arm_bl_target:$func),
8, IIC_Br, [(ARMcall_nolink tglobaladdr:$func)]>,
Requires<[IsARM]>, Sched<[WriteBr]>;
+
+ // push lr before the call
+ def BL_PUSHLR : ARMPseudoInst<(outs), (ins GPRlr:$ra, arm_bl_target:$func),
+ 4, IIC_Br,
+ []>,
+ Requires<[IsARM]>, Sched<[WriteBr]>;
}
let isBranch = 1, isTerminator = 1 in {
@@ -3727,6 +3755,23 @@ let DecoderMethod = "DecodeQADDInstruction" in
[(set GPRnopc:$Rd, (int_arm_qadd GPRnopc:$Rm, GPRnopc:$Rn))]>;
}
+def : ARMV5TEPat<(saddsat GPR:$a, GPR:$b),
+ (QADD GPR:$a, GPR:$b)>;
+def : ARMV5TEPat<(ssubsat GPR:$a, GPR:$b),
+ (QSUB GPR:$a, GPR:$b)>;
+def : ARMV5TEPat<(saddsat (saddsat rGPR:$Rm, rGPR:$Rm), rGPR:$Rn),
+ (QDADD rGPR:$Rm, rGPR:$Rn)>;
+def : ARMV5TEPat<(ssubsat rGPR:$Rm, (saddsat rGPR:$Rn, rGPR:$Rn)),
+ (QDSUB rGPR:$Rm, rGPR:$Rn)>;
+def : ARMV6Pat<(ARMqadd8b rGPR:$Rm, rGPR:$Rn),
+ (QADD8 rGPR:$Rm, rGPR:$Rn)>;
+def : ARMV6Pat<(ARMqsub8b rGPR:$Rm, rGPR:$Rn),
+ (QSUB8 rGPR:$Rm, rGPR:$Rn)>;
+def : ARMV6Pat<(ARMqadd16b rGPR:$Rm, rGPR:$Rn),
+ (QADD16 rGPR:$Rm, rGPR:$Rn)>;
+def : ARMV6Pat<(ARMqsub16b rGPR:$Rm, rGPR:$Rn),
+ (QSUB16 rGPR:$Rm, rGPR:$Rn)>;
+
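+// For example (illustrative IR, assuming a v5TE target):
+//   %r = call i32 @llvm.sadd.sat.i32(i32 %a, i32 %b)
+// now lowers via ISD::SADDSAT and selects a single QADD; the QDADD/QDSUB
+// patterns additionally fold the saturating doubling of one operand.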
def UQADD16 : AAIIntrinsic<0b01100110, 0b11110001, "uqadd16", int_arm_uqadd16>;
def UQADD8 : AAIIntrinsic<0b01100110, 0b11111001, "uqadd8", int_arm_uqadd8>;
def UQSUB16 : AAIIntrinsic<0b01100110, 0b11110111, "uqsub16", int_arm_uqsub16>;
@@ -4870,14 +4915,13 @@ def SB : AInoP<(outs), (ins), MiscFrm, NoItinerary, "sb", "", []>,
let hasSideEffects = 1;
}
-let usesCustomInserter = 1, Defs = [CPSR] in {
-
-// Pseudo instruction that combines movs + predicated rsbmi
-// to implement integer ABS
+let usesCustomInserter = 1, Defs = [CPSR], hasNoSchedulingInfo = 1 in {
+ // Pseudo instruction that combines movs + predicated rsbmi
+ // to implement integer ABS
def ABS : ARMPseudoInst<(outs GPR:$dst), (ins GPR:$src), 8, NoItinerary, []>;
}
-let usesCustomInserter = 1, Defs = [CPSR] in {
+let usesCustomInserter = 1, Defs = [CPSR], hasNoSchedulingInfo = 1 in {
def COPY_STRUCT_BYVAL_I32 : PseudoInst<
(outs), (ins GPR:$dst, GPR:$src, i32imm:$size, i32imm:$alignment),
NoItinerary,
@@ -5085,8 +5129,8 @@ def SWPB: AIswp<1, (outs GPRnopc:$Rt),
def CDP : ABI<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1,
c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2),
NoItinerary, "cdp", "\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2",
- [(int_arm_cdp imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn,
- imm:$CRm, imm:$opc2)]>,
+ [(int_arm_cdp timm:$cop, timm:$opc1, timm:$CRd, timm:$CRn,
+ timm:$CRm, timm:$opc2)]>,
Requires<[IsARM,PreV8]> {
bits<4> opc1;
bits<4> CRn;
@@ -5109,8 +5153,8 @@ def CDP : ABI<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1,
def CDP2 : ABXI<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1,
c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2),
NoItinerary, "cdp2\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2",
- [(int_arm_cdp2 imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn,
- imm:$CRm, imm:$opc2)]>,
+ [(int_arm_cdp2 timm:$cop, timm:$opc1, timm:$CRd, timm:$CRn,
+ timm:$CRm, timm:$opc2)]>,
Requires<[IsARM,PreV8]> {
let Inst{31-28} = 0b1111;
bits<4> opc1;
@@ -5289,15 +5333,15 @@ multiclass LdSt2Cop<bit load, bit Dbit, string asm, list<dag> pattern> {
}
}
-defm LDC : LdStCop <1, 0, "ldc", [(int_arm_ldc imm:$cop, imm:$CRd, addrmode5:$addr)]>;
-defm LDCL : LdStCop <1, 1, "ldcl", [(int_arm_ldcl imm:$cop, imm:$CRd, addrmode5:$addr)]>;
-defm LDC2 : LdSt2Cop<1, 0, "ldc2", [(int_arm_ldc2 imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[IsARM,PreV8]>;
-defm LDC2L : LdSt2Cop<1, 1, "ldc2l", [(int_arm_ldc2l imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[IsARM,PreV8]>;
+defm LDC : LdStCop <1, 0, "ldc", [(int_arm_ldc timm:$cop, timm:$CRd, addrmode5:$addr)]>;
+defm LDCL : LdStCop <1, 1, "ldcl", [(int_arm_ldcl timm:$cop, timm:$CRd, addrmode5:$addr)]>;
+defm LDC2 : LdSt2Cop<1, 0, "ldc2", [(int_arm_ldc2 timm:$cop, timm:$CRd, addrmode5:$addr)]>, Requires<[IsARM,PreV8]>;
+defm LDC2L : LdSt2Cop<1, 1, "ldc2l", [(int_arm_ldc2l timm:$cop, timm:$CRd, addrmode5:$addr)]>, Requires<[IsARM,PreV8]>;
-defm STC : LdStCop <0, 0, "stc", [(int_arm_stc imm:$cop, imm:$CRd, addrmode5:$addr)]>;
-defm STCL : LdStCop <0, 1, "stcl", [(int_arm_stcl imm:$cop, imm:$CRd, addrmode5:$addr)]>;
-defm STC2 : LdSt2Cop<0, 0, "stc2", [(int_arm_stc2 imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[IsARM,PreV8]>;
-defm STC2L : LdSt2Cop<0, 1, "stc2l", [(int_arm_stc2l imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[IsARM,PreV8]>;
+defm STC : LdStCop <0, 0, "stc", [(int_arm_stc timm:$cop, timm:$CRd, addrmode5:$addr)]>;
+defm STCL : LdStCop <0, 1, "stcl", [(int_arm_stcl timm:$cop, timm:$CRd, addrmode5:$addr)]>;
+defm STC2 : LdSt2Cop<0, 0, "stc2", [(int_arm_stc2 timm:$cop, timm:$CRd, addrmode5:$addr)]>, Requires<[IsARM,PreV8]>;
+defm STC2L : LdSt2Cop<0, 1, "stc2l", [(int_arm_stc2l timm:$cop, timm:$CRd, addrmode5:$addr)]>, Requires<[IsARM,PreV8]>;
} // DecoderNamespace = "CoProc"
@@ -5333,8 +5377,8 @@ def MCR : MovRCopro<"mcr", 0 /* from ARM core register to coprocessor */,
(outs),
(ins p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
c_imm:$CRm, imm0_7:$opc2),
- [(int_arm_mcr imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn,
- imm:$CRm, imm:$opc2)]>,
+ [(int_arm_mcr timm:$cop, timm:$opc1, GPR:$Rt, timm:$CRn,
+ timm:$CRm, timm:$opc2)]>,
ComplexDeprecationPredicate<"MCR">;
def : ARMInstAlias<"mcr${p} $cop, $opc1, $Rt, $CRn, $CRm",
(MCR p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
@@ -5347,8 +5391,8 @@ def : ARMInstAlias<"mrc${p} $cop, $opc1, $Rt, $CRn, $CRm",
(MRC GPRwithAPSR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
c_imm:$CRm, 0, pred:$p)>;
-def : ARMPat<(int_arm_mrc imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2),
- (MRC imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>;
+def : ARMPat<(int_arm_mrc timm:$cop, timm:$opc1, timm:$CRn, timm:$CRm, timm:$opc2),
+ (MRC p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2)>;
class MovRCopro2<string opc, bit direction, dag oops, dag iops,
list<dag> pattern>
@@ -5379,8 +5423,8 @@ def MCR2 : MovRCopro2<"mcr2", 0 /* from ARM core register to coprocessor */,
(outs),
(ins p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
c_imm:$CRm, imm0_7:$opc2),
- [(int_arm_mcr2 imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn,
- imm:$CRm, imm:$opc2)]>,
+ [(int_arm_mcr2 timm:$cop, timm:$opc1, GPR:$Rt, timm:$CRn,
+ timm:$CRm, timm:$opc2)]>,
Requires<[IsARM,PreV8]>;
def : ARMInstAlias<"mcr2 $cop, $opc1, $Rt, $CRn, $CRm",
(MCR2 p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
@@ -5394,9 +5438,9 @@ def : ARMInstAlias<"mrc2 $cop, $opc1, $Rt, $CRn, $CRm",
(MRC2 GPRwithAPSR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
c_imm:$CRm, 0)>;
-def : ARMV5TPat<(int_arm_mrc2 imm:$cop, imm:$opc1, imm:$CRn,
- imm:$CRm, imm:$opc2),
- (MRC2 imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>;
+def : ARMV5TPat<(int_arm_mrc2 timm:$cop, timm:$opc1, timm:$CRn,
+ timm:$CRm, timm:$opc2),
+ (MRC2 p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2)>;
class MovRRCopro<string opc, bit direction, dag oops, dag iops, list<dag>
pattern = []>
@@ -5422,8 +5466,8 @@ class MovRRCopro<string opc, bit direction, dag oops, dag iops, list<dag>
def MCRR : MovRRCopro<"mcrr", 0 /* from ARM core register to coprocessor */,
(outs), (ins p_imm:$cop, imm0_15:$opc1, GPRnopc:$Rt,
GPRnopc:$Rt2, c_imm:$CRm),
- [(int_arm_mcrr imm:$cop, imm:$opc1, GPRnopc:$Rt,
- GPRnopc:$Rt2, imm:$CRm)]>;
+ [(int_arm_mcrr timm:$cop, timm:$opc1, GPRnopc:$Rt,
+ GPRnopc:$Rt2, timm:$CRm)]>;
def MRRC : MovRRCopro<"mrrc", 1 /* from coprocessor to ARM core register */,
(outs GPRnopc:$Rt, GPRnopc:$Rt2),
(ins p_imm:$cop, imm0_15:$opc1, c_imm:$CRm), []>;
@@ -5455,8 +5499,8 @@ class MovRRCopro2<string opc, bit direction, dag oops, dag iops,
def MCRR2 : MovRRCopro2<"mcrr2", 0 /* from ARM core register to coprocessor */,
(outs), (ins p_imm:$cop, imm0_15:$opc1, GPRnopc:$Rt,
GPRnopc:$Rt2, c_imm:$CRm),
- [(int_arm_mcrr2 imm:$cop, imm:$opc1, GPRnopc:$Rt,
- GPRnopc:$Rt2, imm:$CRm)]>;
+ [(int_arm_mcrr2 timm:$cop, timm:$opc1, GPRnopc:$Rt,
+ GPRnopc:$Rt2, timm:$CRm)]>;
def MRRC2 : MovRRCopro2<"mrrc2", 1 /* from coprocessor to ARM core register */,
(outs GPRnopc:$Rt, GPRnopc:$Rt2),
@@ -5579,12 +5623,12 @@ def MSRbanked : ABI<0b0001, (outs), (ins banked_reg:$banked, GPRnopc:$Rn),
def win__chkstk : SDNode<"ARMISD::WIN__CHKSTK", SDTNone,
[SDNPHasChain, SDNPSideEffect]>;
-let usesCustomInserter = 1, Uses = [R4], Defs = [R4, SP] in
+let usesCustomInserter = 1, Uses = [R4], Defs = [R4, SP], hasNoSchedulingInfo = 1 in
def WIN__CHKSTK : PseudoInst<(outs), (ins), NoItinerary, [(win__chkstk)]>;
def win__dbzchk : SDNode<"ARMISD::WIN__DBZCHK", SDT_WIN__DBZCHK,
[SDNPHasChain, SDNPSideEffect, SDNPOutGlue]>;
-let usesCustomInserter = 1, Defs = [CPSR] in
+let usesCustomInserter = 1, Defs = [CPSR], hasNoSchedulingInfo = 1 in
def WIN__DBZCHK : PseudoInst<(outs), (ins tGPR:$divisor), NoItinerary,
[(win__dbzchk tGPR:$divisor)]>;
@@ -6131,10 +6175,10 @@ def : InstAlias<"umull${s}${p} $RdLo, $RdHi, $Rn, $Rm",
def ITasm : ARMAsmPseudo<"it$mask $cc", (ins it_pred:$cc, it_mask:$mask)>,
ComplexDeprecationPredicate<"IT">;
-let mayLoad = 1, mayStore =1, hasSideEffects = 1 in
+let mayLoad = 1, mayStore = 1, hasSideEffects = 1, hasNoSchedulingInfo = 1 in
def SPACE : PseudoInst<(outs GPR:$Rd), (ins i32imm:$size, GPR:$Rn),
NoItinerary,
- [(set GPR:$Rd, (int_arm_space imm:$size, GPR:$Rn))]>;
+ [(set GPR:$Rd, (int_arm_space timm:$size, GPR:$Rn))]>;
//===----------------------------------
// Atomic cmpxchg for -O0
@@ -6174,4 +6218,5 @@ def CompilerBarrier : PseudoInst<(outs), (ins i32imm:$ordering), NoItinerary,
let hasSideEffects = 1;
let Size = 0;
let AsmString = "@ COMPILER BARRIER";
+ let hasNoSchedulingInfo = 1;
}
diff --git a/lib/Target/ARM/ARMInstrMVE.td b/lib/Target/ARM/ARMInstrMVE.td
index 3e7ae55c7fc8..4f67cd6e47cc 100644
--- a/lib/Target/ARM/ARMInstrMVE.td
+++ b/lib/Target/ARM/ARMInstrMVE.td
@@ -160,7 +160,8 @@ class TMemImm7ShiftOffsetAsmOperand<int shift> : AsmOperandClass {
let RenderMethod = "addMemImmOffsetOperands";
}
-class taddrmode_imm7<int shift> : MemOperand {
+class taddrmode_imm7<int shift> : MemOperand,
+ ComplexPattern<i32, 2, "SelectTAddrModeImm7<"#shift#">", []> {
let ParserMatchClass = TMemImm7ShiftOffsetAsmOperand<shift>;
// They are printed the same way as the T2 imm8 version
let PrintMethod = "printT2AddrModeImm8Operand<false>";
@@ -221,7 +222,9 @@ def t2am_imm7shift0OffsetAsmOperand : t2am_imm7shiftOffsetAsmOperand<0>;
def t2am_imm7shift1OffsetAsmOperand : t2am_imm7shiftOffsetAsmOperand<1>;
def t2am_imm7shift2OffsetAsmOperand : t2am_imm7shiftOffsetAsmOperand<2>;
-class t2am_imm7_offset<int shift> : MemOperand {
+class t2am_imm7_offset<int shift> : MemOperand,
+ ComplexPattern<i32, 1, "SelectT2AddrModeImm7Offset<"#shift#">",
+ [], [SDNPWantRoot]> {
// They are printed the same way as the imm8 version
let PrintMethod = "printT2AddrModeImm8OffsetOperand";
let ParserMatchClass =
@@ -371,6 +374,8 @@ class MVE_ScalarShiftSRegReg<string iname, bits<2> op5_4, list<dag> pattern=[]>
let Inst{7-6} = 0b00;
let Inst{5-4} = op5_4{1-0};
let Inst{3-0} = 0b1101;
+
+ let Unpredictable{8-6} = 0b111;
}
def MVE_SQRSHR : MVE_ScalarShiftSRegReg<"sqrshr", 0b10>;
@@ -403,18 +408,17 @@ class MVE_ScalarShiftDRegImm<string iname, bits<2> op5_4, bit op16,
let Inst{3-0} = 0b1111;
}
-class MVE_ScalarShiftDRegReg<string iname, bit op5, bit op16,
- list<dag> pattern=[]>
+class MVE_ScalarShiftDRegRegBase<string iname, dag iops, string asm,
+ bit op5, bit op16, list<dag> pattern=[]>
: MVE_ScalarShiftDoubleReg<
- iname, (ins tGPREven:$RdaLo_src, tGPROdd:$RdaHi_src, rGPR:$Rm),
- "$RdaLo, $RdaHi, $Rm", "@earlyclobber $RdaHi,@earlyclobber $RdaLo,"
- "$RdaLo = $RdaLo_src,$RdaHi = $RdaHi_src",
+ iname, iops, asm, "@earlyclobber $RdaHi,@earlyclobber $RdaLo,"
+ "$RdaLo = $RdaLo_src,$RdaHi = $RdaHi_src",
pattern> {
bits<4> Rm;
let Inst{16} = op16;
let Inst{15-12} = Rm{3-0};
- let Inst{7-6} = 0b00;
+ let Inst{6} = 0b0;
let Inst{5} = op5;
let Inst{4} = 0b0;
let Inst{3-0} = 0b1101;
@@ -427,27 +431,44 @@ class MVE_ScalarShiftDRegReg<string iname, bit op5, bit op16,
let DecoderMethod = "DecodeMVEOverlappingLongShift";
}
-def MVE_ASRLr : MVE_ScalarShiftDRegReg<"asrl", 0b1, 0b0, [(set tGPREven:$RdaLo, tGPROdd:$RdaHi,
+class MVE_ScalarShiftDRegReg<string iname, bit op5, list<dag> pattern=[]>
+ : MVE_ScalarShiftDRegRegBase<
+ iname, (ins tGPREven:$RdaLo_src, tGPROdd:$RdaHi_src, rGPR:$Rm),
+ "$RdaLo, $RdaHi, $Rm", op5, 0b0, pattern> {
+
+ let Inst{7} = 0b0;
+}
+
+class MVE_ScalarShiftDRegRegWithSat<string iname, bit op5, list<dag> pattern=[]>
+ : MVE_ScalarShiftDRegRegBase<
+ iname, (ins tGPREven:$RdaLo_src, tGPROdd:$RdaHi_src, rGPR:$Rm, saturateop:$sat),
+ "$RdaLo, $RdaHi, $sat, $Rm", op5, 0b1, pattern> {
+ bit sat;
+
+ let Inst{7} = sat;
+}
+
+def MVE_ASRLr : MVE_ScalarShiftDRegReg<"asrl", 0b1, [(set tGPREven:$RdaLo, tGPROdd:$RdaHi,
(ARMasrl tGPREven:$RdaLo_src,
tGPROdd:$RdaHi_src, rGPR:$Rm))]>;
def MVE_ASRLi : MVE_ScalarShiftDRegImm<"asrl", 0b10, ?, [(set tGPREven:$RdaLo, tGPROdd:$RdaHi,
(ARMasrl tGPREven:$RdaLo_src,
- tGPROdd:$RdaHi_src, (i32 imm:$imm)))]>;
-def MVE_LSLLr : MVE_ScalarShiftDRegReg<"lsll", 0b0, 0b0, [(set tGPREven:$RdaLo, tGPROdd:$RdaHi,
+ tGPROdd:$RdaHi_src, (i32 long_shift:$imm)))]>;
+def MVE_LSLLr : MVE_ScalarShiftDRegReg<"lsll", 0b0, [(set tGPREven:$RdaLo, tGPROdd:$RdaHi,
(ARMlsll tGPREven:$RdaLo_src,
tGPROdd:$RdaHi_src, rGPR:$Rm))]>;
def MVE_LSLLi : MVE_ScalarShiftDRegImm<"lsll", 0b00, ?, [(set tGPREven:$RdaLo, tGPROdd:$RdaHi,
(ARMlsll tGPREven:$RdaLo_src,
- tGPROdd:$RdaHi_src, (i32 imm:$imm)))]>;
+ tGPROdd:$RdaHi_src, (i32 long_shift:$imm)))]>;
def MVE_LSRL : MVE_ScalarShiftDRegImm<"lsrl", 0b01, ?, [(set tGPREven:$RdaLo, tGPROdd:$RdaHi,
(ARMlsrl tGPREven:$RdaLo_src,
- tGPROdd:$RdaHi_src, (i32 imm:$imm)))]>;
+ tGPROdd:$RdaHi_src, (i32 long_shift:$imm)))]>;
-def MVE_SQRSHRL : MVE_ScalarShiftDRegReg<"sqrshrl", 0b1, 0b1>;
+def MVE_SQRSHRL : MVE_ScalarShiftDRegRegWithSat<"sqrshrl", 0b1>;
def MVE_SQSHLL : MVE_ScalarShiftDRegImm<"sqshll", 0b11, 0b1>;
def MVE_SRSHRL : MVE_ScalarShiftDRegImm<"srshrl", 0b10, 0b1>;
-def MVE_UQRSHLL : MVE_ScalarShiftDRegReg<"uqrshll", 0b0, 0b1>;
+def MVE_UQRSHLL : MVE_ScalarShiftDRegRegWithSat<"uqrshll", 0b0>;
def MVE_UQSHLL : MVE_ScalarShiftDRegImm<"uqshll", 0b00, 0b1>;
def MVE_URSHRL : MVE_ScalarShiftDRegImm<"urshrl", 0b01, 0b1>;
@@ -531,6 +552,19 @@ defm MVE_VADDVu8 : MVE_VADDV_A<"u8", 0b1, 0b00>;
defm MVE_VADDVu16 : MVE_VADDV_A<"u16", 0b1, 0b01>;
defm MVE_VADDVu32 : MVE_VADDV_A<"u32", 0b1, 0b10>;
+let Predicates = [HasMVEInt] in {
+ def : Pat<(i32 (vecreduce_add (v4i32 MQPR:$src))), (i32 (MVE_VADDVu32no_acc $src))>;
+ def : Pat<(i32 (vecreduce_add (v8i16 MQPR:$src))), (i32 (MVE_VADDVu16no_acc $src))>;
+ def : Pat<(i32 (vecreduce_add (v16i8 MQPR:$src))), (i32 (MVE_VADDVu8no_acc $src))>;
+ def : Pat<(i32 (add (i32 (vecreduce_add (v4i32 MQPR:$src1))), (i32 tGPR:$src2))),
+ (i32 (MVE_VADDVu32acc $src2, $src1))>;
+ def : Pat<(i32 (add (i32 (vecreduce_add (v8i16 MQPR:$src1))), (i32 tGPR:$src2))),
+ (i32 (MVE_VADDVu16acc $src2, $src1))>;
+ def : Pat<(i32 (add (i32 (vecreduce_add (v16i8 MQPR:$src1))), (i32 tGPR:$src2))),
+ (i32 (MVE_VADDVu8acc $src2, $src1))>;
+
+}
+
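+// A plain vecreduce_add selects the non-accumulating VADDV forms, and an
+// enclosing scalar add folds into the accumulating VADDVA forms; the
+// unsigned opcodes serve both signednesses, since integer add is
+// sign-agnostic.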
class MVE_VADDLV<string iname, string suffix, dag iops, string cstr,
bit A, bit U, list<dag> pattern=[]>
: MVE_rDest<(outs tGPREven:$RdaLo, tGPROdd:$RdaHi), iops, NoItinerary, iname,
@@ -636,6 +670,35 @@ multiclass MVE_VMINMAXV_ty<string iname, bit bit_7, list<dag> pattern=[]> {
defm MVE_VMINV : MVE_VMINMAXV_ty<"vminv", 0b1>;
defm MVE_VMAXV : MVE_VMINMAXV_ty<"vmaxv", 0b0>;
+let Predicates = [HasMVEInt] in {
+ def : Pat<(i32 (vecreduce_smax (v16i8 MQPR:$src))),
+ (i32 (MVE_VMAXVs8 (t2MVNi (i32 127)), $src))>;
+ def : Pat<(i32 (vecreduce_smax (v8i16 MQPR:$src))),
+ (i32 (MVE_VMAXVs16 (t2MOVi32imm (i32 -32768)), $src))>;
+ def : Pat<(i32 (vecreduce_smax (v4i32 MQPR:$src))),
+ (i32 (MVE_VMAXVs32 (t2MOVi (i32 -2147483648)), $src))>;
+ def : Pat<(i32 (vecreduce_umax (v16i8 MQPR:$src))),
+ (i32 (MVE_VMAXVu8 (t2MOVi (i32 0)), $src))>;
+ def : Pat<(i32 (vecreduce_umax (v8i16 MQPR:$src))),
+ (i32 (MVE_VMAXVu16 (t2MOVi (i32 0)), $src))>;
+ def : Pat<(i32 (vecreduce_umax (v4i32 MQPR:$src))),
+ (i32 (MVE_VMAXVu32 (t2MOVi (i32 0)), $src))>;
+
+ def : Pat<(i32 (vecreduce_smin (v16i8 MQPR:$src))),
+ (i32 (MVE_VMINVs8 (t2MOVi (i32 127)), $src))>;
+ def : Pat<(i32 (vecreduce_smin (v8i16 MQPR:$src))),
+ (i32 (MVE_VMINVs16 (t2MOVi16 (i32 32767)), $src))>;
+ def : Pat<(i32 (vecreduce_smin (v4i32 MQPR:$src))),
+ (i32 (MVE_VMINVs32 (t2MVNi (i32 -2147483648)), $src))>;
+ def : Pat<(i32 (vecreduce_umin (v16i8 MQPR:$src))),
+ (i32 (MVE_VMINVu8 (t2MOVi (i32 255)), $src))>;
+ def : Pat<(i32 (vecreduce_umin (v8i16 MQPR:$src))),
+ (i32 (MVE_VMINVu16 (t2MOVi16 (i32 65535)), $src))>;
+ def : Pat<(i32 (vecreduce_umin (v4i32 MQPR:$src))),
+ (i32 (MVE_VMINVu32 (t2MOVi (i32 4294967295)), $src))>;
+
+}
+
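+// VMINV/VMAXV reduce into a scalar that must be seeded with the identity
+// for the reduction, so each pattern materialises the type's extreme value
+// first: e.g. (t2MVNi 127) = -128 seeds the s8 max, and 255 seeds the u8 min.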
multiclass MVE_VMINMAXAV_ty<string iname, bit bit_7, list<dag> pattern=[]> {
def s8 : MVE_VMINMAXV<iname, "s8", 0b0, 0b00, 0b0, bit_7>;
def s16 : MVE_VMINMAXV<iname, "s16", 0b0, 0b01, 0b0, bit_7>;
@@ -667,57 +730,57 @@ class MVE_VMLAMLSDAV<string iname, string suffix, dag iops, string cstr,
let Inst{0} = bit_0;
}
-multiclass MVE_VMLAMLSDAV_X<string iname, string suffix, dag iops, string cstr,
- bit sz, bit bit_28, bit A, bit bit_8, bit bit_0,
- list<dag> pattern=[]> {
- def _noexch : MVE_VMLAMLSDAV<iname, suffix, iops, cstr, sz,
- bit_28, A, 0b0, bit_8, bit_0, pattern>;
- def _exch : MVE_VMLAMLSDAV<iname # "x", suffix, iops, cstr, sz,
- bit_28, A, 0b1, bit_8, bit_0, pattern>;
+multiclass MVE_VMLAMLSDAV_A<string iname, string x, string suffix,
+ bit sz, bit bit_28, bit X, bit bit_8, bit bit_0,
+ list<dag> pattern=[]> {
+ def ""#x#suffix : MVE_VMLAMLSDAV<iname # x, suffix,
+ (ins MQPR:$Qn, MQPR:$Qm), "",
+ sz, bit_28, 0b0, X, bit_8, bit_0, pattern>;
+ def "a"#x#suffix : MVE_VMLAMLSDAV<iname # "a" # x, suffix,
+ (ins tGPREven:$RdaSrc, MQPR:$Qn, MQPR:$Qm),
+ "$RdaDest = $RdaSrc",
+ sz, bit_28, 0b1, X, bit_8, bit_0, pattern>;
+}
+
+multiclass MVE_VMLAMLSDAV_AX<string iname, string suffix, bit sz, bit bit_28,
+ bit bit_8, bit bit_0, list<dag> pattern=[]> {
+ defm "" : MVE_VMLAMLSDAV_A<iname, "", suffix, sz, bit_28,
+ 0b0, bit_8, bit_0, pattern>;
+ defm "" : MVE_VMLAMLSDAV_A<iname, "x", suffix, sz, bit_28,
+ 0b1, bit_8, bit_0, pattern>;
}
-multiclass MVE_VMLAMLSDAV_XA<string iname, string suffix, bit sz, bit bit_28,
- bit bit_8, bit bit_0, list<dag> pattern=[]> {
- defm _noacc : MVE_VMLAMLSDAV_X<iname, suffix, (ins MQPR:$Qn, MQPR:$Qm), "",
- sz, bit_28, 0b0, bit_8, bit_0, pattern>;
- defm _acc : MVE_VMLAMLSDAV_X<iname # "a", suffix,
- (ins tGPREven:$RdaSrc, MQPR:$Qn, MQPR:$Qm),
- "$RdaDest = $RdaSrc",
- sz, bit_28, 0b1, bit_8, bit_0, pattern>;
+multiclass MVE_VMLADAV_multi<string suffix, bit sz, bit bit_8,
+ list<dag> pattern=[]> {
+ defm "" : MVE_VMLAMLSDAV_AX<"vmladav", "s"#suffix,
+ sz, 0b0, bit_8, 0b0, pattern>;
+ defm "" : MVE_VMLAMLSDAV_A<"vmladav", "", "u"#suffix,
+ sz, 0b1, 0b0, bit_8, 0b0, pattern>;
}
-multiclass MVE_VMLADAV_multi<string suffix, bit sz, bit U, bit bit_8,
- list<dag> pattern=[]> {
- defm "" : MVE_VMLAMLSDAV_XA<"vmladav", suffix, sz, U, bit_8, 0b0, pattern>;
+multiclass MVE_VMLSDAV_multi<string suffix, bit sz, bit bit_28,
+ list<dag> pattern=[]> {
+ defm "" : MVE_VMLAMLSDAV_AX<"vmlsdav", "s"#suffix,
+ sz, bit_28, 0b0, 0b1, pattern>;
}
-defm MVE_VMLADAVs16 : MVE_VMLADAV_multi<"s16", 0b0, 0b0, 0b0>;
-defm MVE_VMLADAVs32 : MVE_VMLADAV_multi<"s32", 0b1, 0b0, 0b0>;
-defm MVE_VMLADAVu16 : MVE_VMLADAV_multi<"u16", 0b0, 0b1, 0b0>;
-defm MVE_VMLADAVu32 : MVE_VMLADAV_multi<"u32", 0b1, 0b1, 0b0>;
+defm MVE_VMLADAV : MVE_VMLADAV_multi< "8", 0b0, 0b1>;
+defm MVE_VMLADAV : MVE_VMLADAV_multi<"16", 0b0, 0b0>;
+defm MVE_VMLADAV : MVE_VMLADAV_multi<"32", 0b1, 0b0>;
-defm MVE_VMLADAVs8 : MVE_VMLADAV_multi<"s8", 0b0, 0b0, 0b1>;
-defm MVE_VMLADAVu8 : MVE_VMLADAV_multi<"u8", 0b0, 0b1, 0b1>;
+defm MVE_VMLSDAV : MVE_VMLSDAV_multi< "8", 0b0, 0b1>;
+defm MVE_VMLSDAV : MVE_VMLSDAV_multi<"16", 0b0, 0b0>;
+defm MVE_VMLSDAV : MVE_VMLSDAV_multi<"32", 0b1, 0b0>;
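+// Under the new naming scheme each defm above expands by suffix alone: e.g.
+// MVE_VMLADAV_multi<"16", ...> yields MVE_VMLADAVs16, MVE_VMLADAVas16,
+// MVE_VMLADAVxs16 and MVE_VMLADAVaxs16, plus the u16 pair (no exchange
+// forms for unsigned), which is what the aliases below cast to by name.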
// vmlav aliases vmladav
-foreach acc = ["_acc", "_noacc"] in {
+foreach acc = ["", "a"] in {
foreach suffix = ["s8", "s16", "s32", "u8", "u16", "u32"] in {
- def : MVEInstAlias<!strconcat("vmlav", !if(!eq(acc, "_acc"), "a", ""),
- "${vp}.", suffix, "\t$RdaDest, $Qn, $Qm"),
- (!cast<Instruction>("MVE_VMLADAV"#suffix#acc#"_noexch")
+ def : MVEInstAlias<"vmlav"#acc#"${vp}."#suffix#"\t$RdaDest, $Qn, $Qm",
+ (!cast<Instruction>("MVE_VMLADAV"#acc#suffix)
tGPREven:$RdaDest, MQPR:$Qn, MQPR:$Qm, vpred_n:$vp)>;
}
}
-multiclass MVE_VMLSDAV_multi<string suffix, bit sz, bit bit_28,
- list<dag> pattern=[]> {
- defm "" : MVE_VMLAMLSDAV_XA<"vmlsdav", suffix, sz, bit_28, 0b0, 0b1, pattern>;
-}
-
-defm MVE_VMLSDAVs8 : MVE_VMLSDAV_multi<"s8", 0, 0b1>;
-defm MVE_VMLSDAVs16 : MVE_VMLSDAV_multi<"s16", 0, 0b0>;
-defm MVE_VMLSDAVs32 : MVE_VMLSDAV_multi<"s32", 1, 0b0>;
-
// Base class for VMLALDAV and VMLSLDAV, VRMLALDAVH, VRMLSLDAVH
class MVE_VMLALDAVBase<string iname, string suffix, dag iops, string cstr,
bit sz, bit bit_28, bit A, bit X, bit bit_8, bit bit_0,
@@ -742,82 +805,83 @@ class MVE_VMLALDAVBase<string iname, string suffix, dag iops, string cstr,
let Inst{0} = bit_0;
}
-multiclass MVE_VMLALDAVBase_X<string iname, string suffix, dag iops,
- string cstr, bit sz, bit bit_28, bit A,
- bit bit_8, bit bit_0, list<dag> pattern=[]> {
- def _noexch : MVE_VMLALDAVBase<iname, suffix, iops, cstr, sz,
- bit_28, A, 0b0, bit_8, bit_0, pattern>;
- def _exch : MVE_VMLALDAVBase<iname # "x", suffix, iops, cstr, sz,
- bit_28, A, 0b1, bit_8, bit_0, pattern>;
+multiclass MVE_VMLALDAVBase_A<string iname, string x, string suffix,
+ bit sz, bit bit_28, bit X, bit bit_8, bit bit_0,
+ list<dag> pattern=[]> {
+ def ""#x#suffix : MVE_VMLALDAVBase<
+ iname # x, suffix, (ins MQPR:$Qn, MQPR:$Qm), "",
+ sz, bit_28, 0b0, X, bit_8, bit_0, pattern>;
+ def "a"#x#suffix : MVE_VMLALDAVBase<
+ iname # "a" # x, suffix,
+ (ins tGPREven:$RdaLoSrc, tGPROdd:$RdaHiSrc, MQPR:$Qn, MQPR:$Qm),
+ "$RdaLoDest = $RdaLoSrc,$RdaHiDest = $RdaHiSrc",
+ sz, bit_28, 0b1, X, bit_8, bit_0, pattern>;
}
-multiclass MVE_VMLALDAVBase_XA<string iname, string suffix, bit sz, bit bit_28,
- bit bit_8, bit bit_0, list<dag> pattern=[]> {
- defm _noacc : MVE_VMLALDAVBase_X<
- iname, suffix, (ins MQPR:$Qn, MQPR:$Qm), "",
- sz, bit_28, 0b0, bit_8, bit_0, pattern>;
- defm _acc : MVE_VMLALDAVBase_X<
- iname # "a", suffix, (ins tGPREven:$RdaLoSrc, tGPROdd:$RdaHiSrc,
- MQPR:$Qn, MQPR:$Qm),
- "$RdaLoDest = $RdaLoSrc,$RdaHiDest = $RdaHiSrc",
- sz, bit_28, 0b1, bit_8, bit_0, pattern>;
+
+multiclass MVE_VMLALDAVBase_AX<string iname, string suffix, bit sz, bit bit_28,
+ bit bit_8, bit bit_0, list<dag> pattern=[]> {
+ defm "" : MVE_VMLALDAVBase_A<iname, "", suffix, sz,
+ bit_28, 0b0, bit_8, bit_0, pattern>;
+ defm "" : MVE_VMLALDAVBase_A<iname, "x", suffix, sz,
+ bit_28, 0b1, bit_8, bit_0, pattern>;
}
-multiclass MVE_VRMLALDAVH_multi<string suffix, bit U, list<dag> pattern=[]> {
- defm "" : MVE_VMLALDAVBase_XA<
- "vrmlaldavh", suffix, 0b0, U, 0b1, 0b0, pattern>;
+multiclass MVE_VRMLALDAVH_multi<string suffix, list<dag> pattern=[]> {
+ defm "" : MVE_VMLALDAVBase_AX<"vrmlaldavh", "s"#suffix,
+ 0b0, 0b0, 0b1, 0b0, pattern>;
+ defm "" : MVE_VMLALDAVBase_A<"vrmlaldavh", "", "u"#suffix,
+ 0b0, 0b1, 0b0, 0b1, 0b0, pattern>;
}
-defm MVE_VRMLALDAVHs32 : MVE_VRMLALDAVH_multi<"s32", 0>;
-defm MVE_VRMLALDAVHu32 : MVE_VRMLALDAVH_multi<"u32", 1>;
+defm MVE_VRMLALDAVH : MVE_VRMLALDAVH_multi<"32">;
// vrmlalvh aliases for vrmlaldavh
def : MVEInstAlias<"vrmlalvh${vp}.s32\t$RdaLo, $RdaHi, $Qn, $Qm",
- (MVE_VRMLALDAVHs32_noacc_noexch
+ (MVE_VRMLALDAVHs32
tGPREven:$RdaLo, tGPROdd:$RdaHi,
MQPR:$Qn, MQPR:$Qm, vpred_n:$vp)>;
def : MVEInstAlias<"vrmlalvha${vp}.s32\t$RdaLo, $RdaHi, $Qn, $Qm",
- (MVE_VRMLALDAVHs32_acc_noexch
+ (MVE_VRMLALDAVHas32
tGPREven:$RdaLo, tGPROdd:$RdaHi,
MQPR:$Qn, MQPR:$Qm, vpred_n:$vp)>;
def : MVEInstAlias<"vrmlalvh${vp}.u32\t$RdaLo, $RdaHi, $Qn, $Qm",
- (MVE_VRMLALDAVHu32_noacc_noexch
+ (MVE_VRMLALDAVHu32
tGPREven:$RdaLo, tGPROdd:$RdaHi,
MQPR:$Qn, MQPR:$Qm, vpred_n:$vp)>;
def : MVEInstAlias<"vrmlalvha${vp}.u32\t$RdaLo, $RdaHi, $Qn, $Qm",
- (MVE_VRMLALDAVHu32_acc_noexch
+ (MVE_VRMLALDAVHau32
tGPREven:$RdaLo, tGPROdd:$RdaHi,
MQPR:$Qn, MQPR:$Qm, vpred_n:$vp)>;
-multiclass MVE_VMLALDAV_multi<string suffix, bit sz, bit U,
- list<dag> pattern=[]> {
- defm "" : MVE_VMLALDAVBase_XA<"vmlaldav", suffix, sz, U, 0b0, 0b0, pattern>;
+multiclass MVE_VMLALDAV_multi<string suffix, bit sz, list<dag> pattern=[]> {
+ defm "" : MVE_VMLALDAVBase_AX<"vmlaldav", "s"#suffix, sz, 0b0, 0b0, 0b0, pattern>;
+ defm "" : MVE_VMLALDAVBase_A<"vmlaldav", "", "u"#suffix,
+ sz, 0b1, 0b0, 0b0, 0b0, pattern>;
}
-defm MVE_VMLALDAVs16 : MVE_VMLALDAV_multi<"s16", 0b0, 0b0>;
-defm MVE_VMLALDAVs32 : MVE_VMLALDAV_multi<"s32", 0b1, 0b0>;
-defm MVE_VMLALDAVu16 : MVE_VMLALDAV_multi<"u16", 0b0, 0b1>;
-defm MVE_VMLALDAVu32 : MVE_VMLALDAV_multi<"u32", 0b1, 0b1>;
+defm MVE_VMLALDAV : MVE_VMLALDAV_multi<"16", 0b0>;
+defm MVE_VMLALDAV : MVE_VMLALDAV_multi<"32", 0b1>;
// vmlalv aliases vmlaldav
-foreach acc = ["_acc", "_noacc"] in {
+foreach acc = ["", "a"] in {
foreach suffix = ["s16", "s32", "u16", "u32"] in {
- def : MVEInstAlias<!strconcat("vmlalv", !if(!eq(acc, "_acc"), "a", ""),
- "${vp}.", suffix, "\t$RdaLoDest, $RdaHiDest, $Qn, $Qm"),
- (!cast<Instruction>("MVE_VMLALDAV"#suffix#acc#"_noexch")
+ def : MVEInstAlias<"vmlalv" # acc # "${vp}." # suffix #
+ "\t$RdaLoDest, $RdaHiDest, $Qn, $Qm",
+ (!cast<Instruction>("MVE_VMLALDAV"#acc#suffix)
tGPREven:$RdaLoDest, tGPROdd:$RdaHiDest,
MQPR:$Qn, MQPR:$Qm, vpred_n:$vp)>;
}
}
multiclass MVE_VMLSLDAV_multi<string iname, string suffix, bit sz,
- bit bit_28, list<dag> pattern=[]> {
- defm "" : MVE_VMLALDAVBase_XA<iname, suffix, sz, bit_28, 0b0, 0b1, pattern>;
+ bit bit_28, list<dag> pattern=[]> {
+ defm "" : MVE_VMLALDAVBase_AX<iname, suffix, sz, bit_28, 0b0, 0b1, pattern>;
}
-defm MVE_VMLSLDAVs16 : MVE_VMLSLDAV_multi<"vmlsldav", "s16", 0b0, 0b0>;
-defm MVE_VMLSLDAVs32 : MVE_VMLSLDAV_multi<"vmlsldav", "s32", 0b1, 0b0>;
-defm MVE_VRMLSLDAVHs32 : MVE_VMLSLDAV_multi<"vrmlsldavh", "s32", 0b0, 0b1>;
+defm MVE_VMLSLDAV : MVE_VMLSLDAV_multi<"vmlsldav", "s16", 0b0, 0b0>;
+defm MVE_VMLSLDAV : MVE_VMLSLDAV_multi<"vmlsldav", "s32", 0b1, 0b0>;
+defm MVE_VRMLSLDAVH : MVE_VMLSLDAV_multi<"vrmlsldavh", "s32", 0b0, 0b1>;
// end of mve_rDest instructions
@@ -967,11 +1031,12 @@ def MVE_VBIC : MVE_bit_arith<(outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm),
let Inst{6} = 0b1;
let Inst{4} = 0b1;
let Inst{0} = 0b0;
+ let validForTailPredication = 1;
}
-class MVE_VREV<string iname, string suffix, bits<2> size, bits<2> bit_8_7>
+class MVE_VREV<string iname, string suffix, bits<2> size, bits<2> bit_8_7, string cstr="">
: MVE_bit_arith<(outs MQPR:$Qd), (ins MQPR:$Qm), iname,
- suffix, "$Qd, $Qm", ""> {
+ suffix, "$Qd, $Qm", cstr> {
let Inst{28} = 0b1;
let Inst{25-23} = 0b111;
@@ -985,9 +1050,9 @@ class MVE_VREV<string iname, string suffix, bits<2> size, bits<2> bit_8_7>
let Inst{0} = 0b0;
}
-def MVE_VREV64_8 : MVE_VREV<"vrev64", "8", 0b00, 0b00>;
-def MVE_VREV64_16 : MVE_VREV<"vrev64", "16", 0b01, 0b00>;
-def MVE_VREV64_32 : MVE_VREV<"vrev64", "32", 0b10, 0b00>;
+def MVE_VREV64_8 : MVE_VREV<"vrev64", "8", 0b00, 0b00, "@earlyclobber $Qd">;
+def MVE_VREV64_16 : MVE_VREV<"vrev64", "16", 0b01, 0b00, "@earlyclobber $Qd">;
+def MVE_VREV64_32 : MVE_VREV<"vrev64", "32", 0b10, 0b00, "@earlyclobber $Qd">;
def MVE_VREV32_8 : MVE_VREV<"vrev32", "8", 0b00, 0b01>;
def MVE_VREV32_16 : MVE_VREV<"vrev32", "16", 0b01, 0b01>;
@@ -995,6 +1060,13 @@ def MVE_VREV32_16 : MVE_VREV<"vrev32", "16", 0b01, 0b01>;
def MVE_VREV16_8 : MVE_VREV<"vrev16", "8", 0b00, 0b10>;
let Predicates = [HasMVEInt] in {
+ def : Pat<(v8i16 (bswap (v8i16 MQPR:$src))),
+ (v8i16 (MVE_VREV16_8 (v8i16 MQPR:$src)))>;
+ def : Pat<(v4i32 (bswap (v4i32 MQPR:$src))),
+ (v4i32 (MVE_VREV32_8 (v4i32 MQPR:$src)))>;
+}
+
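+// bswap reverses the bytes within each element, which is exactly what the
+// element-reversing VREV forms do one size down: vrev16.8 for v8i16 and
+// vrev32.8 for v4i32.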
+let Predicates = [HasMVEInt] in {
def : Pat<(v4i32 (ARMvrev64 (v4i32 MQPR:$src))),
(v4i32 (MVE_VREV64_32 (v4i32 MQPR:$src)))>;
def : Pat<(v8i16 (ARMvrev64 (v8i16 MQPR:$src))),
@@ -1026,6 +1098,7 @@ def MVE_VMVN : MVE_bit_arith<(outs MQPR:$Qd), (ins MQPR:$Qm),
let Inst{12-6} = 0b0010111;
let Inst{4} = 0b0;
let Inst{0} = 0b0;
+ let validForTailPredication = 1;
}
let Predicates = [HasMVEInt] in {
@@ -1054,6 +1127,7 @@ class MVE_bit_ops<string iname, bits<2> bit_21_20, bit bit_28>
let Inst{6} = 0b1;
let Inst{4} = 0b1;
let Inst{0} = 0b0;
+ let validForTailPredication = 1;
}
def MVE_VEOR : MVE_bit_ops<"veor", 0b00, 0b1>;
@@ -1145,6 +1219,7 @@ class MVE_bit_cmode<string iname, string suffix, bits<4> cmode, dag inOps>
class MVE_VORR<string suffix, bits<4> cmode, ExpandImm imm_type>
: MVE_bit_cmode<"vorr", suffix, cmode, (ins MQPR:$Qd_src, imm_type:$imm)> {
let Inst{5} = 0b0;
+ let validForTailPredication = 1;
}
def MVE_VORRIZ0v4i32 : MVE_VORR<"i32", 0b0001, expzero00>;
@@ -1173,6 +1248,7 @@ def MVE_VMOV : MVEInstAlias<"vmov${vp}\t$Qd, $Qm",
class MVE_VBIC<string suffix, bits<4> cmode, ExpandImm imm_type>
: MVE_bit_cmode<"vbic", suffix, cmode, (ins MQPR:$Qd_src, imm_type:$imm)> {
let Inst{5} = 0b1;
+ let validForTailPredication = 1;
}
def MVE_VBICIZ0v4i32 : MVE_VBIC<"i32", 0b0001, expzero00>;
@@ -1315,8 +1391,12 @@ let Predicates = [HasMVEInt] in {
def : Pat<(insertelt (v8f16 MQPR:$src1), HPR:$src2, imm:$lane),
(MVE_VMOV_to_lane_16 MQPR:$src1, (COPY_TO_REGCLASS HPR:$src2, rGPR), imm:$lane)>;
- def : Pat<(extractelt (v8f16 MQPR:$src), imm:$lane),
- (COPY_TO_REGCLASS (MVE_VMOV_from_lane_u16 MQPR:$src, imm:$lane), HPR)>;
+ def : Pat<(extractelt (v8f16 MQPR:$src), imm_even:$lane),
+ (EXTRACT_SUBREG MQPR:$src, (SSubReg_f16_reg imm_even:$lane))>;
+ def : Pat<(extractelt (v8f16 MQPR:$src), imm_odd:$lane),
+ (COPY_TO_REGCLASS
+ (VMOVH (EXTRACT_SUBREG MQPR:$src, (SSubReg_f16_reg imm_odd:$lane))),
+ HPR)>;
def : Pat<(v4f32 (scalar_to_vector SPR:$src)),
(INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), SPR:$src, ssub_0)>;
@@ -1408,6 +1488,7 @@ class MVE_VADDSUB<string iname, string suffix, bits<2> size, bit subtract,
let Inst{12-8} = 0b01000;
let Inst{4} = 0b0;
let Inst{0} = 0b0;
+ let validForTailPredication = 1;
}
class MVE_VADD<string suffix, bits<2> size, list<dag> pattern=[]>
@@ -1442,8 +1523,8 @@ let Predicates = [HasMVEInt] in {
}
class MVE_VQADDSUB<string iname, string suffix, bit U, bit subtract,
- bits<2> size, list<dag> pattern=[]>
- : MVE_int<iname, suffix, size, pattern> {
+ bits<2> size, ValueType vt>
+ : MVE_int<iname, suffix, size, []> {
let Inst{28} = U;
let Inst{25-23} = 0b110;
@@ -1453,26 +1534,49 @@ class MVE_VQADDSUB<string iname, string suffix, bit U, bit subtract,
let Inst{8} = 0b0;
let Inst{4} = 0b1;
let Inst{0} = 0b0;
+ let validForTailPredication = 1;
+
+ ValueType VT = vt;
}
-class MVE_VQADD<string suffix, bit U, bits<2> size, list<dag> pattern=[]>
- : MVE_VQADDSUB<"vqadd", suffix, U, 0b0, size, pattern>;
-class MVE_VQSUB<string suffix, bit U, bits<2> size, list<dag> pattern=[]>
- : MVE_VQADDSUB<"vqsub", suffix, U, 0b1, size, pattern>;
+class MVE_VQADD<string suffix, bit U, bits<2> size, ValueType VT>
+ : MVE_VQADDSUB<"vqadd", suffix, U, 0b0, size, VT>;
+class MVE_VQSUB<string suffix, bit U, bits<2> size, ValueType VT>
+ : MVE_VQADDSUB<"vqsub", suffix, U, 0b1, size, VT>;
-def MVE_VQADDs8 : MVE_VQADD<"s8", 0b0, 0b00>;
-def MVE_VQADDs16 : MVE_VQADD<"s16", 0b0, 0b01>;
-def MVE_VQADDs32 : MVE_VQADD<"s32", 0b0, 0b10>;
-def MVE_VQADDu8 : MVE_VQADD<"u8", 0b1, 0b00>;
-def MVE_VQADDu16 : MVE_VQADD<"u16", 0b1, 0b01>;
-def MVE_VQADDu32 : MVE_VQADD<"u32", 0b1, 0b10>;
+def MVE_VQADDs8 : MVE_VQADD<"s8", 0b0, 0b00, v16i8>;
+def MVE_VQADDs16 : MVE_VQADD<"s16", 0b0, 0b01, v8i16>;
+def MVE_VQADDs32 : MVE_VQADD<"s32", 0b0, 0b10, v4i32>;
+def MVE_VQADDu8 : MVE_VQADD<"u8", 0b1, 0b00, v16i8>;
+def MVE_VQADDu16 : MVE_VQADD<"u16", 0b1, 0b01, v8i16>;
+def MVE_VQADDu32 : MVE_VQADD<"u32", 0b1, 0b10, v4i32>;
+
+def MVE_VQSUBs8 : MVE_VQSUB<"s8", 0b0, 0b00, v16i8>;
+def MVE_VQSUBs16 : MVE_VQSUB<"s16", 0b0, 0b01, v8i16>;
+def MVE_VQSUBs32 : MVE_VQSUB<"s32", 0b0, 0b10, v4i32>;
+def MVE_VQSUBu8 : MVE_VQSUB<"u8", 0b1, 0b00, v16i8>;
+def MVE_VQSUBu16 : MVE_VQSUB<"u16", 0b1, 0b01, v8i16>;
+def MVE_VQSUBu32 : MVE_VQSUB<"u32", 0b1, 0b10, v4i32>;
+
+let Predicates = [HasMVEInt] in {
+ foreach instr = [MVE_VQADDu8, MVE_VQADDu16, MVE_VQADDu32] in
+ foreach VT = [instr.VT] in
+ def : Pat<(VT (uaddsat (VT MQPR:$Qm), (VT MQPR:$Qn))),
+ (VT (instr (VT MQPR:$Qm), (VT MQPR:$Qn)))>;
+ foreach instr = [MVE_VQADDs8, MVE_VQADDs16, MVE_VQADDs32] in
+ foreach VT = [instr.VT] in
+ def : Pat<(VT (saddsat (VT MQPR:$Qm), (VT MQPR:$Qn))),
+ (VT (instr (VT MQPR:$Qm), (VT MQPR:$Qn)))>;
+ foreach instr = [MVE_VQSUBu8, MVE_VQSUBu16, MVE_VQSUBu32] in
+ foreach VT = [instr.VT] in
+ def : Pat<(VT (usubsat (VT MQPR:$Qm), (VT MQPR:$Qn))),
+ (VT (instr (VT MQPR:$Qm), (VT MQPR:$Qn)))>;
+ foreach instr = [MVE_VQSUBs8, MVE_VQSUBs16, MVE_VQSUBs32] in
+ foreach VT = [instr.VT] in
+ def : Pat<(VT (ssubsat (VT MQPR:$Qm), (VT MQPR:$Qn))),
+ (VT (instr (VT MQPR:$Qm), (VT MQPR:$Qn)))>;
+}
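+// Keeping the ValueType on each record lets a single foreach-driven Pat
+// template cover all six variants of each saturating operation, instead of
+// six hand-written patterns apiece.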
-def MVE_VQSUBs8 : MVE_VQSUB<"s8", 0b0, 0b00>;
-def MVE_VQSUBs16 : MVE_VQSUB<"s16", 0b0, 0b01>;
-def MVE_VQSUBs32 : MVE_VQSUB<"s32", 0b0, 0b10>;
-def MVE_VQSUBu8 : MVE_VQSUB<"u8", 0b1, 0b00>;
-def MVE_VQSUBu16 : MVE_VQSUB<"u16", 0b1, 0b01>;
-def MVE_VQSUBu32 : MVE_VQSUB<"u32", 0b1, 0b10>;
class MVE_VABD_int<string suffix, bit U, bits<2> size, list<dag> pattern=[]>
: MVE_int<"vabd", suffix, size, pattern> {
@@ -1483,6 +1587,7 @@ class MVE_VABD_int<string suffix, bit U, bits<2> size, list<dag> pattern=[]>
let Inst{12-8} = 0b00111;
let Inst{4} = 0b0;
let Inst{0} = 0b0;
+ let validForTailPredication = 1;
}
def MVE_VABDs8 : MVE_VABD_int<"s8", 0b0, 0b00>;
@@ -1501,6 +1606,7 @@ class MVE_VRHADD<string suffix, bit U, bits<2> size, list<dag> pattern=[]>
let Inst{12-8} = 0b00001;
let Inst{4} = 0b0;
let Inst{0} = 0b0;
+ let validForTailPredication = 1;
}
def MVE_VRHADDs8 : MVE_VRHADD<"s8", 0b0, 0b00>;
@@ -1522,6 +1628,7 @@ class MVE_VHADDSUB<string iname, string suffix, bit U, bit subtract,
let Inst{8} = 0b0;
let Inst{4} = 0b0;
let Inst{0} = 0b0;
+ let validForTailPredication = 1;
}
class MVE_VHADD<string suffix, bit U, bits<2> size,
@@ -1545,6 +1652,60 @@ def MVE_VHSUBu8 : MVE_VHSUB<"u8", 0b1, 0b00>;
def MVE_VHSUBu16 : MVE_VHSUB<"u16", 0b1, 0b01>;
def MVE_VHSUBu32 : MVE_VHSUB<"u32", 0b1, 0b10>;
+let Predicates = [HasMVEInt] in {
+ def : Pat<(v16i8 (ARMvshrsImm
+ (v16i8 (add (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))), 1)),
+ (v16i8 (MVE_VHADDs8
+ (v16i8 MQPR:$v1), (v16i8 MQPR:$v2)))>;
+ def : Pat<(v8i16 (ARMvshrsImm
+ (v8i16 (add (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))), 1)),
+ (v8i16 (MVE_VHADDs16
+ (v8i16 MQPR:$v1), (v8i16 MQPR:$v2)))>;
+ def : Pat<(v4i32 (ARMvshrsImm
+ (v4i32 (add (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))), 1)),
+ (v4i32 (MVE_VHADDs32
+ (v4i32 MQPR:$v1), (v4i32 MQPR:$v2)))>;
+
+ def : Pat<(v16i8 (ARMvshruImm
+ (v16i8 (add (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))), 1)),
+ (v16i8 (MVE_VHADDu8
+ (v16i8 MQPR:$v1), (v16i8 MQPR:$v2)))>;
+ def : Pat<(v8i16 (ARMvshruImm
+ (v8i16 (add (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))), 1)),
+ (v8i16 (MVE_VHADDu16
+ (v8i16 MQPR:$v1), (v8i16 MQPR:$v2)))>;
+ def : Pat<(v4i32 (ARMvshruImm
+ (v4i32 (add (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))), 1)),
+ (v4i32 (MVE_VHADDu32
+ (v4i32 MQPR:$v1), (v4i32 MQPR:$v2)))>;
+
+ def : Pat<(v16i8 (ARMvshrsImm
+ (v16i8 (sub (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))), 1)),
+ (v16i8 (MVE_VHSUBs8
+ (v16i8 MQPR:$v1), (v16i8 MQPR:$v2)))>;
+ def : Pat<(v8i16 (ARMvshrsImm
+ (v8i16 (sub (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))), 1)),
+ (v8i16 (MVE_VHSUBs16
+ (v8i16 MQPR:$v1), (v8i16 MQPR:$v2)))>;
+ def : Pat<(v4i32 (ARMvshrsImm
+ (v4i32 (sub (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))), 1)),
+ (v4i32 (MVE_VHSUBs32
+ (v4i32 MQPR:$v1), (v4i32 MQPR:$v2)))>;
+
+ def : Pat<(v16i8 (ARMvshruImm
+ (v16i8 (sub (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))), 1)),
+ (v16i8 (MVE_VHSUBu8
+ (v16i8 MQPR:$v1), (v16i8 MQPR:$v2)))>;
+ def : Pat<(v8i16 (ARMvshruImm
+ (v8i16 (sub (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))), 1)),
+ (v8i16 (MVE_VHSUBu16
+ (v8i16 MQPR:$v1), (v8i16 MQPR:$v2)))>;
+ def : Pat<(v4i32 (ARMvshruImm
+ (v4i32 (sub (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))), 1)),
+ (v4i32 (MVE_VHSUBu32
+ (v4i32 MQPR:$v1), (v4i32 MQPR:$v2)))>;
+}
+
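+// These fold a whole-width vector add/sub followed by a shift right by one
+// (arithmetic for the signed forms, logical for unsigned) into the single
+// halving instruction, e.g. vhadd.s8 for an s8 add then asr #1.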
class MVE_VDUP<string suffix, bit B, bit E, list<dag> pattern=[]>
: MVE_p<(outs MQPR:$Qd), (ins rGPR:$Rt), NoItinerary,
"vdup", suffix, "$Qd, $Rt", vpred_r, "", pattern> {
@@ -1563,6 +1724,7 @@ class MVE_VDUP<string suffix, bit B, bit E, list<dag> pattern=[]>
let Inst{6} = 0b0;
let Inst{5} = E;
let Inst{4-0} = 0b10000;
+ let validForTailPredication = 1;
}
def MVE_VDUP32 : MVE_VDUP<"32", 0b0, 0b0>;
@@ -1625,6 +1787,7 @@ class MVE_VCLSCLZ<string iname, string suffix, bits<2> size,
let Inst{6} = 0b1;
let Inst{4} = 0b0;
let Inst{0} = 0b0;
+ let validForTailPredication = 1;
}
def MVE_VCLSs8 : MVE_VCLSCLZ<"vcls", "s8", 0b00, 0b0>;
@@ -1635,6 +1798,15 @@ def MVE_VCLZs8 : MVE_VCLSCLZ<"vclz", "i8", 0b00, 0b1>;
def MVE_VCLZs16 : MVE_VCLSCLZ<"vclz", "i16", 0b01, 0b1>;
def MVE_VCLZs32 : MVE_VCLSCLZ<"vclz", "i32", 0b10, 0b1>;
+let Predicates = [HasMVEInt] in {
+ def : Pat<(v16i8 ( ctlz (v16i8 MQPR:$val1))),
+ (v16i8 ( MVE_VCLZs8 (v16i8 MQPR:$val1)))>;
+ def : Pat<(v4i32 ( ctlz (v4i32 MQPR:$val1))),
+ (v4i32 ( MVE_VCLZs32 (v4i32 MQPR:$val1)))>;
+ def : Pat<(v8i16 ( ctlz (v8i16 MQPR:$val1))),
+ (v8i16 ( MVE_VCLZs16 (v8i16 MQPR:$val1)))>;
+}
+
class MVE_VABSNEG_int<string iname, string suffix, bits<2> size, bit negate,
list<dag> pattern=[]>
: MVEIntSingleSrc<iname, suffix, size, pattern> {
@@ -1648,6 +1820,7 @@ class MVE_VABSNEG_int<string iname, string suffix, bits<2> size, bit negate,
let Inst{6} = 0b1;
let Inst{4} = 0b0;
let Inst{0} = 0b0;
+ let validForTailPredication = 1;
}
def MVE_VABSs8 : MVE_VABSNEG_int<"vabs", "s8", 0b00, 0b0>;
@@ -1689,6 +1862,7 @@ class MVE_VQABSNEG<string iname, string suffix, bits<2> size,
let Inst{6} = 0b1;
let Inst{4} = 0b0;
let Inst{0} = 0b0;
+ let validForTailPredication = 1;
}
def MVE_VQABSs8 : MVE_VQABSNEG<"vqabs", "s8", 0b00, 0b0>;
@@ -1720,6 +1894,7 @@ class MVE_mod_imm<string iname, string suffix, bits<4> cmode, bit op,
let Inst{3-0} = imm{3-0};
let DecoderMethod = "DecodeMVEModImmInstruction";
+ let validForTailPredication = 1;
}
let isReMaterializable = 1 in {
@@ -2115,6 +2290,7 @@ class MVE_shift_by_vec<string iname, string suffix, bit U,
let Inst{4} = bit_4;
let Inst{3-1} = Qm{2-0};
let Inst{0} = 0b0;
+ let validForTailPredication = 1;
}
multiclass mve_shift_by_vec_multi<string iname, bit bit_4, bit bit_8> {
@@ -2163,6 +2339,7 @@ class MVE_shift_with_imm<string iname, string suffix, dag oops, dag iops,
let Inst{4} = 0b1;
let Inst{3-1} = Qm{2-0};
let Inst{0} = 0b0;
+ let validForTailPredication = 1;
}
class MVE_VSxI_imm<string iname, string suffix, bit bit_8, dag imm>
@@ -2175,6 +2352,7 @@ class MVE_VSxI_imm<string iname, string suffix, bit bit_8, dag imm>
let Inst{21-16} = imm;
let Inst{10-9} = 0b10;
let Inst{8} = bit_8;
+ let validForTailPredication = 1;
}
def MVE_VSRIimm8 : MVE_VSxI_imm<"vsri", "8", 0b0, (ins shr_imm8:$imm)> {
@@ -2427,6 +2605,7 @@ class MVE_VRINT<string rmode, bits<3> op, string suffix, bits<2> size,
let Inst{11-10} = 0b01;
let Inst{9-7} = op{2-0};
let Inst{4} = 0b0;
+ let validForTailPredication = 1;
}
@@ -2489,6 +2668,7 @@ class MVE_VMUL_fp<string suffix, bit size, list<dag> pattern=[]>
let Inst{12-8} = 0b01101;
let Inst{7} = Qn{3};
let Inst{4} = 0b1;
+ let validForTailPredication = 1;
}
def MVE_VMULf32 : MVE_VMUL_fp<"f32", 0b0>;
@@ -2556,8 +2736,38 @@ def MVE_VFMSf32 : MVE_VADDSUBFMA_fp<"vfms", "f32", 0b0, 0b1, 0b0, 0b1,
def MVE_VFMSf16 : MVE_VADDSUBFMA_fp<"vfms", "f16", 0b1, 0b1, 0b0, 0b1,
(ins MQPR:$Qd_src), vpred_n, "$Qd = $Qd_src">;
-def MVE_VADDf32 : MVE_VADDSUBFMA_fp<"vadd", "f32", 0b0, 0b0, 0b1, 0b0>;
-def MVE_VADDf16 : MVE_VADDSUBFMA_fp<"vadd", "f16", 0b1, 0b0, 0b1, 0b0>;
+let Predicates = [HasMVEFloat, UseFusedMAC] in {
+ def : Pat<(v8f16 (fadd (v8f16 MQPR:$src1),
+ (fmul (v8f16 MQPR:$src2),
+ (v8f16 MQPR:$src3)))),
+ (v8f16 (MVE_VFMAf16 $src1, $src2, $src3))>;
+ def : Pat<(v4f32 (fadd (v4f32 MQPR:$src1),
+ (fmul (v4f32 MQPR:$src2),
+ (v4f32 MQPR:$src3)))),
+ (v4f32 (MVE_VFMAf32 $src1, $src2, $src3))>;
+
+ def : Pat<(v8f16 (fsub (v8f16 MQPR:$src1),
+ (fmul (v8f16 MQPR:$src2),
+ (v8f16 MQPR:$src3)))),
+ (v8f16 (MVE_VFMSf16 $src1, $src2, $src3))>;
+ def : Pat<(v4f32 (fsub (v4f32 MQPR:$src1),
+ (fmul (v4f32 MQPR:$src2),
+ (v4f32 MQPR:$src3)))),
+ (v4f32 (MVE_VFMSf32 $src1, $src2, $src3))>;
+}
+
+let Predicates = [HasMVEFloat] in {
+ def : Pat<(v8f16 (fma (v8f16 MQPR:$src1), (v8f16 MQPR:$src2), (v8f16 MQPR:$src3))),
+ (v8f16 (MVE_VFMAf16 $src3, $src1, $src2))>;
+ def : Pat<(v4f32 (fma (v4f32 MQPR:$src1), (v4f32 MQPR:$src2), (v4f32 MQPR:$src3))),
+ (v4f32 (MVE_VFMAf32 $src3, $src1, $src2))>;
+}
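+// Note the operand rotation: ISD::fma is fma(a, b, c) = a*b + c, while
+// VFMA takes its accumulator first, hence (MVE_VFMAf* $src3, $src1, $src2).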
+
+
+let validForTailPredication = 1 in {
+ def MVE_VADDf32 : MVE_VADDSUBFMA_fp<"vadd", "f32", 0b0, 0b0, 0b1, 0b0>;
+ def MVE_VADDf16 : MVE_VADDSUBFMA_fp<"vadd", "f16", 0b1, 0b0, 0b1, 0b0>;
+}
let Predicates = [HasMVEFloat] in {
def : Pat<(v4f32 (fadd (v4f32 MQPR:$val1), (v4f32 MQPR:$val2))),
@@ -2566,8 +2776,11 @@ let Predicates = [HasMVEFloat] in {
(v8f16 (MVE_VADDf16 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2)))>;
}
-def MVE_VSUBf32 : MVE_VADDSUBFMA_fp<"vsub", "f32", 0b0, 0b0, 0b1, 0b1>;
-def MVE_VSUBf16 : MVE_VADDSUBFMA_fp<"vsub", "f16", 0b1, 0b0, 0b1, 0b1>;
+
+let validForTailPredication = 1 in {
+ def MVE_VSUBf32 : MVE_VADDSUBFMA_fp<"vsub", "f32", 0b0, 0b0, 0b1, 0b1>;
+ def MVE_VSUBf16 : MVE_VADDSUBFMA_fp<"vsub", "f16", 0b1, 0b0, 0b1, 0b1>;
+}
let Predicates = [HasMVEFloat] in {
def : Pat<(v4f32 (fsub (v4f32 MQPR:$val1), (v4f32 MQPR:$val2))),
@@ -2576,10 +2789,10 @@ let Predicates = [HasMVEFloat] in {
(v8f16 (MVE_VSUBf16 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2)))>;
}
-class MVE_VCADD<string suffix, bit size, list<dag> pattern=[]>
+class MVE_VCADD<string suffix, bit size, string cstr="", list<dag> pattern=[]>
: MVEFloatArithNeon<"vcadd", suffix, size, (outs MQPR:$Qd),
(ins MQPR:$Qn, MQPR:$Qm, complexrotateopodd:$rot),
- "$Qd, $Qn, $Qm, $rot", vpred_r, "", pattern> {
+ "$Qd, $Qn, $Qm, $rot", vpred_r, cstr, pattern> {
bits<4> Qd;
bits<4> Qn;
bit rot;
@@ -2598,7 +2811,7 @@ class MVE_VCADD<string suffix, bit size, list<dag> pattern=[]>
}
def MVE_VCADDf16 : MVE_VCADD<"f16", 0b0>;
-def MVE_VCADDf32 : MVE_VCADD<"f32", 0b1>;
+def MVE_VCADDf32 : MVE_VCADD<"f32", 0b1, "@earlyclobber $Qd">;
class MVE_VABD_fp<string suffix, bit size>
: MVE_float<"vabd", suffix, (outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm),
@@ -2617,6 +2830,7 @@ class MVE_VABD_fp<string suffix, bit size>
let Inst{11-8} = 0b1101;
let Inst{7} = Qn{3};
let Inst{4} = 0b0;
+ let validForTailPredication = 1;
}
def MVE_VABDf32 : MVE_VABD_fp<"f32", 0b0>;
@@ -2643,6 +2857,7 @@ class MVE_VCVT_fix<string suffix, bit fsi, bit U, bit op,
let Inst{4} = 0b1;
let DecoderMethod = "DecodeMVEVCVTt1fp";
+ let validForTailPredication = 1;
}
class MVE_VCVT_imm_asmop<int Bits> : AsmOperandClass {
@@ -2693,6 +2908,7 @@ class MVE_VCVT_fp_int_anpm<string suffix, bits<2> size, bit op, string anpm,
let Inst{9-8} = rm;
let Inst{7} = op;
let Inst{4} = 0b0;
+ let validForTailPredication = 1;
}
multiclass MVE_VCVT_fp_int_anpm_multi<string suffix, bits<2> size, bit op,
@@ -2727,6 +2943,7 @@ class MVE_VCVT_fp_int<string suffix, bits<2> size, bits<2> op,
let Inst{12-9} = 0b0011;
let Inst{8-7} = op;
let Inst{4} = 0b0;
+ let validForTailPredication = 1;
}
// The unsuffixed VCVT for float->int implicitly rounds toward zero,
@@ -2776,6 +2993,7 @@ class MVE_VABSNEG_fp<string iname, string suffix, bits<2> size, bit negate,
let Inst{11-8} = 0b0111;
let Inst{7} = negate;
let Inst{4} = 0b0;
+ let validForTailPredication = 1;
}
def MVE_VABSf16 : MVE_VABSNEG_fp<"vabs", "f16", 0b01, 0b0>;
@@ -2863,6 +3081,7 @@ class MVE_VCMPqq<string suffix, bit bit_28, bits<2> bits_21_20,
// decoder to emit an operand that isn't affected by any instruction
// bit.
let DecoderMethod = "DecodeMVEVCMP<false," # predtype.DecoderMethod # ">";
+ let validForTailPredication = 1;
}
class MVE_VCMPqqf<string suffix, bit size>
@@ -2927,6 +3146,7 @@ class MVE_VCMPqr<string suffix, bit bit_28, bits<2> bits_21_20,
let Constraints = "";
// Custom decoder method, for the same reason as MVE_VCMPqq
let DecoderMethod = "DecodeMVEVCMP<true," # predtype.DecoderMethod # ">";
+ let validForTailPredication = 1;
}
class MVE_VCMPqrf<string suffix, bit size>
@@ -2966,6 +3186,168 @@ def MVE_VCMPs8r : MVE_VCMPqrs<"s8", 0b00>;
def MVE_VCMPs16r : MVE_VCMPqrs<"s16", 0b01>;
def MVE_VCMPs32r : MVE_VCMPqrs<"s32", 0b10>;
+multiclass unpred_vcmp_z<string suffix, int fc> {
+ def i8 : Pat<(v16i1 (ARMvcmpz (v16i8 MQPR:$v1), (i32 fc))),
+ (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), ZR, fc))>;
+ def i16 : Pat<(v8i1 (ARMvcmpz (v8i16 MQPR:$v1), (i32 fc))),
+ (v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), ZR, fc))>;
+ def i32 : Pat<(v4i1 (ARMvcmpz (v4i32 MQPR:$v1), (i32 fc))),
+ (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), ZR, fc))>;
+
+ def : Pat<(v16i1 (and (v16i1 VCCR:$p1), (v16i1 (ARMvcmpz (v16i8 MQPR:$v1), (i32 fc))))),
+ (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), ZR, fc, 1, VCCR:$p1))>;
+ def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmpz (v8i16 MQPR:$v1), (i32 fc))))),
+ (v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), ZR, fc, 1, VCCR:$p1))>;
+ def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmpz (v4i32 MQPR:$v1), (i32 fc))))),
+ (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), ZR, fc, 1, VCCR:$p1))>;
+}
+
+multiclass unpred_vcmp_r<string suffix, int fc> {
+ def i8 : Pat<(v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), (i32 fc))),
+ (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8") (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), fc))>;
+ def i16 : Pat<(v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 MQPR:$v2), (i32 fc))),
+ (v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16") (v8i16 MQPR:$v1), (v8i16 MQPR:$v2), fc))>;
+ def i32 : Pat<(v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), (i32 fc))),
+ (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32") (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), fc))>;
+
+ def i8r : Pat<(v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 (ARMvdup GPR:$v2)), (i32 fc))),
+ (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), (i32 GPR:$v2), fc))>;
+ def i16r : Pat<(v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 (ARMvdup GPR:$v2)), (i32 fc))),
+ (v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), (i32 GPR:$v2), fc))>;
+ def i32r : Pat<(v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 (ARMvdup GPR:$v2)), (i32 fc))),
+ (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), (i32 GPR:$v2), fc))>;
+
+ def : Pat<(v16i1 (and (v16i1 VCCR:$p1), (v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), (i32 fc))))),
+ (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8") (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), fc, 1, VCCR:$p1))>;
+ def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 MQPR:$v2), (i32 fc))))),
+ (v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16") (v8i16 MQPR:$v1), (v8i16 MQPR:$v2), fc, 1, VCCR:$p1))>;
+ def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), (i32 fc))))),
+ (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32") (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), fc, 1, VCCR:$p1))>;
+
+ def : Pat<(v16i1 (and (v16i1 VCCR:$p1), (v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 (ARMvdup GPR:$v2)), (i32 fc))))),
+ (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), (i32 GPR:$v2), fc, 1, VCCR:$p1))>;
+ def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 (ARMvdup GPR:$v2)), (i32 fc))))),
+ (v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), (i32 GPR:$v2), fc, 1, VCCR:$p1))>;
+ def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 (ARMvdup GPR:$v2)), (i32 fc))))),
+ (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), (i32 GPR:$v2), fc, 1, VCCR:$p1))>;
+}
+
+multiclass unpred_vcmpf_z<int fc> {
+ def f16 : Pat<(v8i1 (ARMvcmpz (v8f16 MQPR:$v1), (i32 fc))),
+ (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), ZR, fc))>;
+ def f32 : Pat<(v4i1 (ARMvcmpz (v4f32 MQPR:$v1), (i32 fc))),
+ (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), ZR, fc))>;
+
+  def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmpz (v8f16 MQPR:$v1), (i32 fc))))),
+            (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), ZR, fc, 1, VCCR:$p1))>;
+ def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmpz (v4f32 MQPR:$v1), (i32 fc))))),
+ (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), ZR, fc, 1, VCCR:$p1))>;
+}
+
+multiclass unpred_vcmpf_r<int fc> {
+ def f16 : Pat<(v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), (i32 fc))),
+ (v8i1 (MVE_VCMPf16 (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc))>;
+ def f32 : Pat<(v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), (i32 fc))),
+ (v4i1 (MVE_VCMPf32 (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc))>;
+
+ def f16r : Pat<(v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 (ARMvdup HPR:$v2)), (i32 fc))),
+ (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f16 HPR:$v2), rGPR)), fc))>;
+ def f32r : Pat<(v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 (ARMvdup SPR:$v2)), (i32 fc))),
+ (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f32 SPR:$v2), rGPR)), fc))>;
+
+ def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), (i32 fc))))),
+ (v8i1 (MVE_VCMPf16 (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc, 1, VCCR:$p1))>;
+ def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), (i32 fc))))),
+ (v4i1 (MVE_VCMPf32 (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc, 1, VCCR:$p1))>;
+
+ def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 (ARMvdup HPR:$v2)), (i32 fc))))),
+ (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f16 HPR:$v2), rGPR)), fc, 1, VCCR:$p1))>;
+ def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 (ARMvdup SPR:$v2)), (i32 fc))))),
+ (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f32 SPR:$v2), rGPR)), fc, 1, VCCR:$p1))>;
+}
+
+let Predicates = [HasMVEInt] in {
+ defm MVE_VCEQZ : unpred_vcmp_z<"i", 0>;
+ defm MVE_VCNEZ : unpred_vcmp_z<"i", 1>;
+ defm MVE_VCGEZ : unpred_vcmp_z<"s", 10>;
+ defm MVE_VCLTZ : unpred_vcmp_z<"s", 11>;
+ defm MVE_VCGTZ : unpred_vcmp_z<"s", 12>;
+ defm MVE_VCLEZ : unpred_vcmp_z<"s", 13>;
+ defm MVE_VCGTUZ : unpred_vcmp_z<"u", 8>;
+ defm MVE_VCGEUZ : unpred_vcmp_z<"u", 2>;
+
+ defm MVE_VCEQ : unpred_vcmp_r<"i", 0>;
+ defm MVE_VCNE : unpred_vcmp_r<"i", 1>;
+ defm MVE_VCGE : unpred_vcmp_r<"s", 10>;
+ defm MVE_VCLT : unpred_vcmp_r<"s", 11>;
+ defm MVE_VCGT : unpred_vcmp_r<"s", 12>;
+ defm MVE_VCLE : unpred_vcmp_r<"s", 13>;
+ defm MVE_VCGTU : unpred_vcmp_r<"u", 8>;
+ defm MVE_VCGEU : unpred_vcmp_r<"u", 2>;
+}
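+// The raw fc integers are ARMCC condition codes: 0=EQ, 1=NE, 2=HS, 8=HI,
+// 10=GE, 11=LT, 12=GT, 13=LE, so e.g. MVE_VCGTU is an unsigned > via HI.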
+
+let Predicates = [HasMVEFloat] in {
+ defm MVE_VFCEQZ : unpred_vcmpf_z<0>;
+ defm MVE_VFCNEZ : unpred_vcmpf_z<1>;
+ defm MVE_VFCGEZ : unpred_vcmpf_z<10>;
+ defm MVE_VFCLTZ : unpred_vcmpf_z<11>;
+ defm MVE_VFCGTZ : unpred_vcmpf_z<12>;
+ defm MVE_VFCLEZ : unpred_vcmpf_z<13>;
+
+ defm MVE_VFCEQ : unpred_vcmpf_r<0>;
+ defm MVE_VFCNE : unpred_vcmpf_r<1>;
+ defm MVE_VFCGE : unpred_vcmpf_r<10>;
+ defm MVE_VFCLT : unpred_vcmpf_r<11>;
+ defm MVE_VFCGT : unpred_vcmpf_r<12>;
+ defm MVE_VFCLE : unpred_vcmpf_r<13>;
+}
+
+
+// Extra "worst case" and/or/xor partterns, going into and out of GRP
+multiclass two_predops<SDPatternOperator opnode, Instruction insn> {
+ def v16i1 : Pat<(v16i1 (opnode (v16i1 VCCR:$p1), (v16i1 VCCR:$p2))),
+ (v16i1 (COPY_TO_REGCLASS
+ (insn (i32 (COPY_TO_REGCLASS (v16i1 VCCR:$p1), rGPR)),
+ (i32 (COPY_TO_REGCLASS (v16i1 VCCR:$p2), rGPR))),
+ VCCR))>;
+ def v8i1 : Pat<(v8i1 (opnode (v8i1 VCCR:$p1), (v8i1 VCCR:$p2))),
+ (v8i1 (COPY_TO_REGCLASS
+ (insn (i32 (COPY_TO_REGCLASS (v8i1 VCCR:$p1), rGPR)),
+ (i32 (COPY_TO_REGCLASS (v8i1 VCCR:$p2), rGPR))),
+ VCCR))>;
+ def v4i1 : Pat<(v4i1 (opnode (v4i1 VCCR:$p1), (v4i1 VCCR:$p2))),
+ (v4i1 (COPY_TO_REGCLASS
+ (insn (i32 (COPY_TO_REGCLASS (v4i1 VCCR:$p1), rGPR)),
+ (i32 (COPY_TO_REGCLASS (v4i1 VCCR:$p2), rGPR))),
+ VCCR))>;
+}
+
+let Predicates = [HasMVEInt] in {
+ defm POR : two_predops<or, t2ORRrr>;
+ defm PAND : two_predops<and, t2ANDrr>;
+ defm PEOR : two_predops<xor, t2EORrr>;
+}
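+// i.e. the "worst case" path: move both predicates from VCCR into rGPRs,
+// combine them with one scalar t2ANDrr/t2ORRrr/t2EORrr, then copy the
+// result back into VCCR.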
+
+// Occasionally we need to cast between an i32 and a boolean vector, for
+// example when moving between rGPR and VPR.P0 as part of predicate vector
+// shuffles. We also sometimes need to cast between different predicate
+// vector types (v4i1 <-> v8i1, etc.), likewise as part of lowering vector
+// shuffles.
+
+def predicate_cast : SDNode<"ARMISD::PREDICATE_CAST", SDTUnaryOp>;
+
+let Predicates = [HasMVEInt] in {
+ foreach VT = [ v4i1, v8i1, v16i1 ] in {
+ def : Pat<(i32 (predicate_cast (VT VCCR:$src))),
+ (i32 (COPY_TO_REGCLASS (VT VCCR:$src), VCCR))>;
+ def : Pat<(VT (predicate_cast (i32 VCCR:$src))),
+ (VT (COPY_TO_REGCLASS (i32 VCCR:$src), VCCR))>;
+
+ foreach VT2 = [ v4i1, v8i1, v16i1 ] in
+ def : Pat<(VT (predicate_cast (VT2 VCCR:$src))),
+ (VT (COPY_TO_REGCLASS (VT2 VCCR:$src), VCCR))>;
+ }
+}
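+// Because every MVE predicate type is carried in the same VCCR register
+// class (VPR.P0), each of these casts is just a register-class copy and
+// normally folds away to nothing.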
+
// end of MVE compares
// start of MVE_qDest_qSrc
@@ -2989,10 +3371,10 @@ class MVE_qDest_qSrc<string iname, string suffix, dag oops, dag iops,
}
class MVE_VQxDMLxDH<string iname, bit exch, bit round, bit subtract,
- string suffix, bits<2> size, list<dag> pattern=[]>
+ string suffix, bits<2> size, string cstr="", list<dag> pattern=[]>
: MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd),
(ins MQPR:$Qd_src, MQPR:$Qn, MQPR:$Qm), "$Qd, $Qn, $Qm",
- vpred_n, "$Qd = $Qd_src", pattern> {
+ vpred_n, "$Qd = $Qd_src"#cstr, pattern> {
bits<4> Qn;
let Inst{28} = subtract;
@@ -3009,7 +3391,7 @@ multiclass MVE_VQxDMLxDH_multi<string iname, bit exch,
bit round, bit subtract> {
def s8 : MVE_VQxDMLxDH<iname, exch, round, subtract, "s8", 0b00>;
def s16 : MVE_VQxDMLxDH<iname, exch, round, subtract, "s16", 0b01>;
- def s32 : MVE_VQxDMLxDH<iname, exch, round, subtract, "s32", 0b10>;
+ def s32 : MVE_VQxDMLxDH<iname, exch, round, subtract, "s32", 0b10, ",@earlyclobber $Qd">;
}
defm MVE_VQDMLADH : MVE_VQxDMLxDH_multi<"vqdmladh", 0b0, 0b0, 0b0>;
@@ -3021,10 +3403,10 @@ defm MVE_VQDMLSDHX : MVE_VQxDMLxDH_multi<"vqdmlsdhx", 0b1, 0b0, 0b1>;
defm MVE_VQRDMLSDH : MVE_VQxDMLxDH_multi<"vqrdmlsdh", 0b0, 0b1, 0b1>;
defm MVE_VQRDMLSDHX : MVE_VQxDMLxDH_multi<"vqrdmlsdhx", 0b1, 0b1, 0b1>;
-class MVE_VCMUL<string iname, string suffix, bit size, list<dag> pattern=[]>
+class MVE_VCMUL<string iname, string suffix, bit size, string cstr="", list<dag> pattern=[]>
: MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd),
(ins MQPR:$Qn, MQPR:$Qm, complexrotateop:$rot),
- "$Qd, $Qn, $Qm, $rot", vpred_r, "", pattern> {
+ "$Qd, $Qn, $Qm, $rot", vpred_r, cstr, pattern> {
bits<4> Qn;
bits<2> rot;
@@ -3041,13 +3423,13 @@ class MVE_VCMUL<string iname, string suffix, bit size, list<dag> pattern=[]>
}
def MVE_VCMULf16 : MVE_VCMUL<"vcmul", "f16", 0b0>;
-def MVE_VCMULf32 : MVE_VCMUL<"vcmul", "f32", 0b1>;
+def MVE_VCMULf32 : MVE_VCMUL<"vcmul", "f32", 0b1, "@earlyclobber $Qd">;
class MVE_VMULL<string iname, string suffix, bit bit_28, bits<2> bits_21_20,
- bit T, list<dag> pattern=[]>
+ bit T, string cstr, list<dag> pattern=[]>
: MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd),
(ins MQPR:$Qn, MQPR:$Qm), "$Qd, $Qn, $Qm",
- vpred_r, "", pattern> {
+ vpred_r, cstr, pattern> {
bits<4> Qd;
bits<4> Qn;
bits<4> Qm;
@@ -3063,9 +3445,9 @@ class MVE_VMULL<string iname, string suffix, bit bit_28, bits<2> bits_21_20,
}
multiclass MVE_VMULL_multi<string iname, string suffix,
- bit bit_28, bits<2> bits_21_20> {
- def bh : MVE_VMULL<iname # "b", suffix, bit_28, bits_21_20, 0b0>;
- def th : MVE_VMULL<iname # "t", suffix, bit_28, bits_21_20, 0b1>;
+ bit bit_28, bits<2> bits_21_20, string cstr=""> {
+ def bh : MVE_VMULL<iname # "b", suffix, bit_28, bits_21_20, 0b0, cstr>;
+ def th : MVE_VMULL<iname # "t", suffix, bit_28, bits_21_20, 0b1, cstr>;
}
// For integer multiplies, bits 21:20 encode size, and bit 28 signedness.
@@ -3074,10 +3456,10 @@ multiclass MVE_VMULL_multi<string iname, string suffix,
defm MVE_VMULLs8 : MVE_VMULL_multi<"vmull", "s8", 0b0, 0b00>;
defm MVE_VMULLs16 : MVE_VMULL_multi<"vmull", "s16", 0b0, 0b01>;
-defm MVE_VMULLs32 : MVE_VMULL_multi<"vmull", "s32", 0b0, 0b10>;
+defm MVE_VMULLs32 : MVE_VMULL_multi<"vmull", "s32", 0b0, 0b10, "@earlyclobber $Qd">;
defm MVE_VMULLu8 : MVE_VMULL_multi<"vmull", "u8", 0b1, 0b00>;
defm MVE_VMULLu16 : MVE_VMULL_multi<"vmull", "u16", 0b1, 0b01>;
-defm MVE_VMULLu32 : MVE_VMULL_multi<"vmull", "u32", 0b1, 0b10>;
+defm MVE_VMULLu32 : MVE_VMULL_multi<"vmull", "u32", 0b1, 0b10, "@earlyclobber $Qd">;
defm MVE_VMULLp8 : MVE_VMULL_multi<"vmull", "p8", 0b0, 0b11>;
defm MVE_VMULLp16 : MVE_VMULL_multi<"vmull", "p16", 0b1, 0b11>;
@@ -3144,6 +3526,18 @@ defm MVE_VQMOVNu32 : MVE_VxMOVxN_halves<"vqmovn", "u32", 0b1, 0b1, 0b01>;
defm MVE_VQMOVUNs16 : MVE_VxMOVxN_halves<"vqmovun", "s16", 0b0, 0b0, 0b00>;
defm MVE_VQMOVUNs32 : MVE_VxMOVxN_halves<"vqmovun", "s32", 0b0, 0b0, 0b01>;
+def MVEvmovn : SDNode<"ARMISD::VMOVN", SDTARMVEXT>;
+let Predicates = [HasMVEInt] in {
+ def : Pat<(v8i16 (MVEvmovn (v8i16 MQPR:$Qd_src), (v8i16 MQPR:$Qm), (i32 0))),
+ (v8i16 (MVE_VMOVNi32bh (v8i16 MQPR:$Qd_src), (v8i16 MQPR:$Qm)))>;
+ def : Pat<(v8i16 (MVEvmovn (v8i16 MQPR:$Qd_src), (v8i16 MQPR:$Qm), (i32 1))),
+ (v8i16 (MVE_VMOVNi32th (v8i16 MQPR:$Qd_src), (v8i16 MQPR:$Qm)))>;
+ def : Pat<(v16i8 (MVEvmovn (v16i8 MQPR:$Qd_src), (v16i8 MQPR:$Qm), (i32 0))),
+ (v16i8 (MVE_VMOVNi16bh (v16i8 MQPR:$Qd_src), (v16i8 MQPR:$Qm)))>;
+ def : Pat<(v16i8 (MVEvmovn (v16i8 MQPR:$Qd_src), (v16i8 MQPR:$Qm), (i32 1))),
+ (v16i8 (MVE_VMOVNi16th (v16i8 MQPR:$Qd_src), (v16i8 MQPR:$Qm)))>;
+}
+
class MVE_VCVT_ff<string iname, string suffix, bit op, bit T,
list<dag> pattern=[]>
: MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd), (ins MQPR:$Qd_src, MQPR:$Qm),
@@ -3166,11 +3560,10 @@ defm MVE_VCVTf16f32 : MVE_VCVT_ff_halves<"f16.f32", 0b0>;
defm MVE_VCVTf32f16 : MVE_VCVT_ff_halves<"f32.f16", 0b1>;
class MVE_VxCADD<string iname, string suffix, bits<2> size, bit halve,
- list<dag> pattern=[]>
+ string cstr="", list<dag> pattern=[]>
: MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd),
(ins MQPR:$Qn, MQPR:$Qm, complexrotateopodd:$rot),
- "$Qd, $Qn, $Qm, $rot", vpred_r, "",
- pattern> {
+ "$Qd, $Qn, $Qm, $rot", vpred_r, cstr, pattern> {
bits<4> Qn;
bit rot;
@@ -3186,11 +3579,11 @@ class MVE_VxCADD<string iname, string suffix, bits<2> size, bit halve,
def MVE_VCADDi8 : MVE_VxCADD<"vcadd", "i8", 0b00, 0b1>;
def MVE_VCADDi16 : MVE_VxCADD<"vcadd", "i16", 0b01, 0b1>;
-def MVE_VCADDi32 : MVE_VxCADD<"vcadd", "i32", 0b10, 0b1>;
+def MVE_VCADDi32 : MVE_VxCADD<"vcadd", "i32", 0b10, 0b1, "@earlyclobber $Qd">;
def MVE_VHCADDs8 : MVE_VxCADD<"vhcadd", "s8", 0b00, 0b0>;
def MVE_VHCADDs16 : MVE_VxCADD<"vhcadd", "s16", 0b01, 0b0>;
-def MVE_VHCADDs32 : MVE_VxCADD<"vhcadd", "s32", 0b10, 0b0>;
+def MVE_VHCADDs32 : MVE_VxCADD<"vhcadd", "s32", 0b10, 0b0, "@earlyclobber $Qd">;
class MVE_VADCSBC<string iname, bit I, bit subtract,
dag carryin, list<dag> pattern=[]>
@@ -3220,10 +3613,10 @@ def MVE_VSBC : MVE_VADCSBC<"vsbc", 0b0, 0b1, (ins cl_FPSCR_NZCV:$carryin)>;
def MVE_VSBCI : MVE_VADCSBC<"vsbci", 0b1, 0b1, (ins)>;
class MVE_VQDMULL<string iname, string suffix, bit size, bit T,
- list<dag> pattern=[]>
+ string cstr="", list<dag> pattern=[]>
: MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd),
(ins MQPR:$Qn, MQPR:$Qm), "$Qd, $Qn, $Qm",
- vpred_r, "", pattern> {
+ vpred_r, cstr, pattern> {
bits<4> Qn;
let Inst{28} = size;
@@ -3236,13 +3629,13 @@ class MVE_VQDMULL<string iname, string suffix, bit size, bit T,
let Inst{0} = 0b1;
}
-multiclass MVE_VQDMULL_halves<string suffix, bit size> {
- def bh : MVE_VQDMULL<"vqdmullb", suffix, size, 0b0>;
- def th : MVE_VQDMULL<"vqdmullt", suffix, size, 0b1>;
+multiclass MVE_VQDMULL_halves<string suffix, bit size, string cstr=""> {
+ def bh : MVE_VQDMULL<"vqdmullb", suffix, size, 0b0, cstr>;
+ def th : MVE_VQDMULL<"vqdmullt", suffix, size, 0b1, cstr>;
}
defm MVE_VQDMULLs16 : MVE_VQDMULL_halves<"s16", 0b0>;
-defm MVE_VQDMULLs32 : MVE_VQDMULL_halves<"s32", 0b1>;
+defm MVE_VQDMULLs32 : MVE_VQDMULL_halves<"s32", 0b1, "@earlyclobber $Qd">;
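+// Informal note on the constraint (rationale, not text from this patch): for
+// the 32-bit (size=0b10) variants of these cross-lane multiplies the
+// destination register may not overlap a source, so the defs above pass
+// "@earlyclobber $Qd" to stop the register allocator from assigning Qd to
+// Qn or Qm.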
// end of mve_qDest_qSrc
@@ -3267,9 +3660,9 @@ class MVE_qr_base<dag oops, dag iops, InstrItinClass itin, string iname,
let Inst{3-0} = Rm{3-0};
}
-class MVE_qDest_rSrc<string iname, string suffix, list<dag> pattern=[]>
+class MVE_qDest_rSrc<string iname, string suffix, string cstr="", list<dag> pattern=[]>
: MVE_qr_base<(outs MQPR:$Qd), (ins MQPR:$Qn, rGPR:$Rm),
- NoItinerary, iname, suffix, "$Qd, $Qn, $Rm", vpred_r, "",
+ NoItinerary, iname, suffix, "$Qd, $Qn, $Rm", vpred_r, cstr,
pattern>;
class MVE_qDestSrc_rSrc<string iname, string suffix, list<dag> pattern=[]>
@@ -3291,7 +3684,7 @@ class MVE_qDest_single_rSrc<string iname, string suffix, list<dag> pattern=[]>
class MVE_VADDSUB_qr<string iname, string suffix, bits<2> size,
bit bit_5, bit bit_12, bit bit_16,
bit bit_28, list<dag> pattern=[]>
- : MVE_qDest_rSrc<iname, suffix, pattern> {
+ : MVE_qDest_rSrc<iname, suffix, "", pattern> {
let Inst{28} = bit_28;
let Inst{21-20} = size;
@@ -3299,6 +3692,7 @@ class MVE_VADDSUB_qr<string iname, string suffix, bits<2> size,
let Inst{12} = bit_12;
let Inst{8} = 0b1;
let Inst{5} = bit_5;
+ let validForTailPredication = 1;
}
multiclass MVE_VADDSUB_qr_sizes<string iname, string suffix,
@@ -3320,9 +3714,27 @@ defm MVE_VSUB_qr_i : MVE_VADDSUB_qr_sizes<"vsub", "i", 0b0, 0b1, 0b1, 0b0>;
defm MVE_VQSUB_qr_s : MVE_VADDSUB_qr_sizes<"vqsub", "s", 0b1, 0b1, 0b0, 0b0>;
defm MVE_VQSUB_qr_u : MVE_VADDSUB_qr_sizes<"vqsub", "u", 0b1, 0b1, 0b0, 0b1>;
+let Predicates = [HasMVEInt] in {
+ def : Pat<(v16i8 (add (v16i8 MQPR:$val1), (v16i8 (ARMvdup GPR:$val2)))),
+ (v16i8 (MVE_VADD_qr_i8 (v16i8 MQPR:$val1), (i32 GPR:$val2)))>;
+ def : Pat<(v8i16 (add (v8i16 MQPR:$val1), (v8i16 (ARMvdup GPR:$val2)))),
+ (v8i16 (MVE_VADD_qr_i16 (v8i16 MQPR:$val1), (i32 GPR:$val2)))>;
+ def : Pat<(v4i32 (add (v4i32 MQPR:$val1), (v4i32 (ARMvdup GPR:$val2)))),
+ (v4i32 (MVE_VADD_qr_i32 (v4i32 MQPR:$val1), (i32 GPR:$val2)))>;
+}
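+// Illustrative IR for what these splat patterns (and the vsub ones below)
+// match: a scalar broadcast feeding an element-wise op, e.g.
+//   %dup = shufflevector (insertelement undef, i32 %s, i32 0), undef,
+//                        zeroinitializer
+//   %r   = add <4 x i32> %v, %dup
+// which now selects the vector-plus-scalar form (vadd.i32 q0, q0, r0)
+// instead of first materialising the splat in a Q register.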
+
+let Predicates = [HasMVEInt] in {
+ def : Pat<(v16i8 (sub (v16i8 MQPR:$val1), (v16i8 (ARMvdup GPR:$val2)))),
+ (v16i8 (MVE_VSUB_qr_i8 (v16i8 MQPR:$val1), (i32 GPR:$val2)))>;
+ def : Pat<(v8i16 (sub (v8i16 MQPR:$val1), (v8i16 (ARMvdup GPR:$val2)))),
+ (v8i16 (MVE_VSUB_qr_i16 (v8i16 MQPR:$val1), (i32 GPR:$val2)))>;
+ def : Pat<(v4i32 (sub (v4i32 MQPR:$val1), (v4i32 (ARMvdup GPR:$val2)))),
+ (v4i32 (MVE_VSUB_qr_i32 (v4i32 MQPR:$val1), (i32 GPR:$val2)))>;
+}
+
class MVE_VQDMULL_qr<string iname, string suffix, bit size,
- bit T, list<dag> pattern=[]>
- : MVE_qDest_rSrc<iname, suffix, pattern> {
+ bit T, string cstr="", list<dag> pattern=[]>
+ : MVE_qDest_rSrc<iname, suffix, cstr, pattern> {
let Inst{28} = size;
let Inst{21-20} = 0b11;
@@ -3332,18 +3744,18 @@ class MVE_VQDMULL_qr<string iname, string suffix, bit size,
let Inst{5} = 0b1;
}
-multiclass MVE_VQDMULL_qr_halves<string suffix, bit size> {
- def bh : MVE_VQDMULL_qr<"vqdmullb", suffix, size, 0b0>;
- def th : MVE_VQDMULL_qr<"vqdmullt", suffix, size, 0b1>;
+multiclass MVE_VQDMULL_qr_halves<string suffix, bit size, string cstr=""> {
+ def bh : MVE_VQDMULL_qr<"vqdmullb", suffix, size, 0b0, cstr>;
+ def th : MVE_VQDMULL_qr<"vqdmullt", suffix, size, 0b1, cstr>;
}
defm MVE_VQDMULL_qr_s16 : MVE_VQDMULL_qr_halves<"s16", 0b0>;
-defm MVE_VQDMULL_qr_s32 : MVE_VQDMULL_qr_halves<"s32", 0b1>;
+defm MVE_VQDMULL_qr_s32 : MVE_VQDMULL_qr_halves<"s32", 0b1, "@earlyclobber $Qd">;
class MVE_VxADDSUB_qr<string iname, string suffix,
bit bit_28, bits<2> bits_21_20, bit subtract,
list<dag> pattern=[]>
- : MVE_qDest_rSrc<iname, suffix, pattern> {
+ : MVE_qDest_rSrc<iname, suffix, "", pattern> {
let Inst{28} = bit_28;
let Inst{21-20} = bits_21_20;
@@ -3351,6 +3763,7 @@ class MVE_VxADDSUB_qr<string iname, string suffix,
let Inst{12} = subtract;
let Inst{8} = 0b1;
let Inst{5} = 0b0;
+ let validForTailPredication = 1;
}
def MVE_VHADD_qr_s8 : MVE_VxADDSUB_qr<"vhadd", "s8", 0b0, 0b00, 0b0>;
@@ -3388,6 +3801,7 @@ class MVE_VxSHL_qr<string iname, string suffix, bit U, bits<2> size,
let Inst{12-8} = 0b11110;
let Inst{7} = bit_7;
let Inst{6-4} = 0b110;
+ let validForTailPredication = 1;
}
multiclass MVE_VxSHL_qr_types<string iname, bit bit_7, bit bit_17> {
@@ -3421,7 +3835,7 @@ let Predicates = [HasMVEInt] in {
}
class MVE_VBRSR<string iname, string suffix, bits<2> size, list<dag> pattern=[]>
- : MVE_qDest_rSrc<iname, suffix, pattern> {
+ : MVE_qDest_rSrc<iname, suffix, "", pattern> {
let Inst{28} = 0b1;
let Inst{21-20} = size;
@@ -3429,15 +3843,27 @@ class MVE_VBRSR<string iname, string suffix, bits<2> size, list<dag> pattern=[]>
let Inst{12} = 0b1;
let Inst{8} = 0b0;
let Inst{5} = 0b1;
+ let validForTailPredication = 1;
}
def MVE_VBRSR8 : MVE_VBRSR<"vbrsr", "8", 0b00>;
def MVE_VBRSR16 : MVE_VBRSR<"vbrsr", "16", 0b01>;
def MVE_VBRSR32 : MVE_VBRSR<"vbrsr", "32", 0b10>;
+let Predicates = [HasMVEInt] in {
+ def : Pat<(v16i8 ( bitreverse (v16i8 MQPR:$val1))),
+ (v16i8 ( MVE_VBRSR8 (v16i8 MQPR:$val1), (t2MOVi (i32 8)) ))>;
+
+ def : Pat<(v4i32 ( bitreverse (v4i32 MQPR:$val1))),
+ (v4i32 ( MVE_VBRSR32 (v4i32 MQPR:$val1), (t2MOVi (i32 32)) ))>;
+
+ def : Pat<(v8i16 ( bitreverse (v8i16 MQPR:$val1))),
+ (v8i16 ( MVE_VBRSR16 (v8i16 MQPR:$val1), (t2MOVi (i32 16)) ))>;
+}
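+// VBRSR reverses the bottom Rm bits of each element and clears the rest, so
+// moving the element width into a GPR first (the t2MOVi above) yields a full
+// per-lane reversal: bitreverse of v8i16 becomes vbrsr.16 with Rm = 16.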
+
class MVE_VMUL_qr_int<string iname, string suffix,
bits<2> size, list<dag> pattern=[]>
- : MVE_qDest_rSrc<iname, suffix, pattern> {
+ : MVE_qDest_rSrc<iname, suffix, "", pattern> {
let Inst{28} = 0b0;
let Inst{21-20} = size;
@@ -3445,15 +3871,25 @@ class MVE_VMUL_qr_int<string iname, string suffix,
let Inst{12} = 0b1;
let Inst{8} = 0b0;
let Inst{5} = 0b1;
+ let validForTailPredication = 1;
}
def MVE_VMUL_qr_i8 : MVE_VMUL_qr_int<"vmul", "i8", 0b00>;
def MVE_VMUL_qr_i16 : MVE_VMUL_qr_int<"vmul", "i16", 0b01>;
def MVE_VMUL_qr_i32 : MVE_VMUL_qr_int<"vmul", "i32", 0b10>;
+let Predicates = [HasMVEInt] in {
+ def : Pat<(v16i8 (mul (v16i8 MQPR:$val1), (v16i8 (ARMvdup GPR:$val2)))),
+ (v16i8 (MVE_VMUL_qr_i8 (v16i8 MQPR:$val1), (i32 GPR:$val2)))>;
+ def : Pat<(v8i16 (mul (v8i16 MQPR:$val1), (v8i16 (ARMvdup GPR:$val2)))),
+ (v8i16 (MVE_VMUL_qr_i16 (v8i16 MQPR:$val1), (i32 GPR:$val2)))>;
+ def : Pat<(v4i32 (mul (v4i32 MQPR:$val1), (v4i32 (ARMvdup GPR:$val2)))),
+ (v4i32 (MVE_VMUL_qr_i32 (v4i32 MQPR:$val1), (i32 GPR:$val2)))>;
+}
+
class MVE_VxxMUL_qr<string iname, string suffix,
bit bit_28, bits<2> bits_21_20, list<dag> pattern=[]>
- : MVE_qDest_rSrc<iname, suffix, pattern> {
+ : MVE_qDest_rSrc<iname, suffix, "", pattern> {
let Inst{28} = bit_28;
let Inst{21-20} = bits_21_20;
@@ -3471,14 +3907,14 @@ def MVE_VQRDMULH_qr_s8 : MVE_VxxMUL_qr<"vqrdmulh", "s8", 0b1, 0b00>;
def MVE_VQRDMULH_qr_s16 : MVE_VxxMUL_qr<"vqrdmulh", "s16", 0b1, 0b01>;
def MVE_VQRDMULH_qr_s32 : MVE_VxxMUL_qr<"vqrdmulh", "s32", 0b1, 0b10>;
-let Predicates = [HasMVEFloat] in {
+let Predicates = [HasMVEFloat], validForTailPredication = 1 in {
def MVE_VMUL_qr_f16 : MVE_VxxMUL_qr<"vmul", "f16", 0b1, 0b11>;
def MVE_VMUL_qr_f32 : MVE_VxxMUL_qr<"vmul", "f32", 0b0, 0b11>;
}
class MVE_VFMAMLA_qr<string iname, string suffix,
- bit bit_28, bits<2> bits_21_20, bit S,
- list<dag> pattern=[]>
+ bit bit_28, bits<2> bits_21_20, bit S,
+ list<dag> pattern=[]>
: MVE_qDestSrc_rSrc<iname, suffix, pattern> {
let Inst{28} = bit_28;
@@ -3487,6 +3923,7 @@ class MVE_VFMAMLA_qr<string iname, string suffix,
let Inst{12} = S;
let Inst{8} = 0b0;
let Inst{5} = 0b0;
+ let validForTailPredication = 1;
}
def MVE_VMLA_qr_s8 : MVE_VFMAMLA_qr<"vmla", "s8", 0b0, 0b00, 0b0>;
@@ -3503,6 +3940,21 @@ def MVE_VMLAS_qr_u8 : MVE_VFMAMLA_qr<"vmlas", "u8", 0b1, 0b00, 0b1>;
def MVE_VMLAS_qr_u16 : MVE_VFMAMLA_qr<"vmlas", "u16", 0b1, 0b01, 0b1>;
def MVE_VMLAS_qr_u32 : MVE_VFMAMLA_qr<"vmlas", "u32", 0b1, 0b10, 0b1>;
+let Predicates = [HasMVEInt] in {
+ def : Pat<(v4i32 (add (v4i32 MQPR:$src1),
+ (v4i32 (mul (v4i32 MQPR:$src2),
+ (v4i32 (ARMvdup (i32 rGPR:$x))))))),
+ (v4i32 (MVE_VMLA_qr_u32 $src1, $src2, $x))>;
+ def : Pat<(v8i16 (add (v8i16 MQPR:$src1),
+ (v8i16 (mul (v8i16 MQPR:$src2),
+ (v8i16 (ARMvdup (i32 rGPR:$x))))))),
+ (v8i16 (MVE_VMLA_qr_u16 $src1, $src2, $x))>;
+ def : Pat<(v16i8 (add (v16i8 MQPR:$src1),
+ (v16i8 (mul (v16i8 MQPR:$src2),
+ (v16i8 (ARMvdup (i32 rGPR:$x))))))),
+ (v16i8 (MVE_VMLA_qr_u8 $src1, $src2, $x))>;
+}
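+// These catch the "acc += x * s" idiom with a splatted scalar operand, e.g.
+//   %r = add <8 x i16> %acc, (mul <8 x i16> %x, %dup_of_s)
+// selecting vmla.u16 q0, q1, r0. Only the u-suffixed defs are needed for
+// selection, since the non-saturating integer MLA is sign-agnostic.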
+
let Predicates = [HasMVEFloat] in {
def MVE_VFMA_qr_f16 : MVE_VFMAMLA_qr<"vfma", "f16", 0b1, 0b11, 0b0>;
def MVE_VFMA_qr_f32 : MVE_VFMAMLA_qr<"vfma", "f32", 0b0, 0b11, 0b0>;
@@ -3555,6 +4007,7 @@ class MVE_VxDUP<string iname, string suffix, bits<2> size, bit bit_12,
let Inst{7} = imm{1};
let Inst{6-1} = 0b110111;
let Inst{0} = imm{0};
+ let validForTailPredication = 1;
}
def MVE_VIDUPu8 : MVE_VxDUP<"vidup", "u8", 0b00, 0b0>;
@@ -3589,6 +4042,7 @@ class MVE_VxWDUP<string iname, string suffix, bits<2> size, bit bit_12,
let Inst{6-4} = 0b110;
let Inst{3-1} = Rm{3-1};
let Inst{0} = imm{0};
+ let validForTailPredication = 1;
}
def MVE_VIWDUPu8 : MVE_VxWDUP<"viwdup", "u8", 0b00, 0b0>;
@@ -3599,6 +4053,7 @@ def MVE_VDWDUPu8 : MVE_VxWDUP<"vdwdup", "u8", 0b00, 0b1>;
def MVE_VDWDUPu16 : MVE_VxWDUP<"vdwdup", "u16", 0b01, 0b1>;
def MVE_VDWDUPu32 : MVE_VxWDUP<"vdwdup", "u32", 0b10, 0b1>;
+let hasSideEffects = 1 in
class MVE_VCTP<string suffix, bits<2> size, list<dag> pattern=[]>
: MVE_p<(outs VCCR:$P0), (ins rGPR:$Rn), NoItinerary, "vctp", suffix,
"$Rn", vpred_n, "", pattern> {
@@ -3614,6 +4069,7 @@ class MVE_VCTP<string suffix, bits<2> size, list<dag> pattern=[]>
let Constraints = "";
let DecoderMethod = "DecodeMveVCTP";
+ let validForTailPredication = 1;
}
def MVE_VCTP8 : MVE_VCTP<"8", 0b00>;
@@ -3621,6 +4077,15 @@ def MVE_VCTP16 : MVE_VCTP<"16", 0b01>;
def MVE_VCTP32 : MVE_VCTP<"32", 0b10>;
def MVE_VCTP64 : MVE_VCTP<"64", 0b11>;
+let Predicates = [HasMVEInt] in {
+ def : Pat<(int_arm_vctp8 rGPR:$Rn),
+ (v16i1 (MVE_VCTP8 rGPR:$Rn))>;
+ def : Pat<(int_arm_vctp16 rGPR:$Rn),
+ (v8i1 (MVE_VCTP16 rGPR:$Rn))>;
+ def : Pat<(int_arm_vctp32 rGPR:$Rn),
+ (v4i1 (MVE_VCTP32 rGPR:$Rn))>;
+}
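+// VCTP turns a remaining-element count into a tail predicate: for vctp.32
+// with Rn = n, lane i of the result is true iff i < n, so the final
+// iteration of a tail-predicated loop only enables the leftover lanes.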
+
// end of mve_qDest_rSrc
// start of coproc mov
@@ -3863,6 +4328,7 @@ class MVE_VLDRSTR_base<MVE_ldst_direction dir, bit U, bit P, bit W, bit opc,
let mayLoad = dir.load;
let mayStore = !eq(dir.load,0);
+ let validForTailPredication = 1;
}
// Contiguous load and store instructions. These come in two main
@@ -4165,7 +4631,8 @@ class MVE_VPT<string suffix, bits<2> size, dag iops, string asm, list<dag> patte
let Inst{7} = fc{0};
let Inst{4} = 0b0;
- let Defs = [VPR, P0];
+ let Defs = [VPR];
+ let validForTailPredication = 1;
}
class MVE_VPTt1<string suffix, bits<2> size, dag iops>
@@ -4177,11 +4644,12 @@ class MVE_VPTt1<string suffix, bits<2> size, dag iops>
let Inst{5} = Qm{3};
let Inst{3-1} = Qm{2-0};
let Inst{0} = fc{1};
+ let validForTailPredication = 1;
}
class MVE_VPTt1i<string suffix, bits<2> size>
: MVE_VPTt1<suffix, size,
- (ins vpt_mask:$Mk, pred_basic_i:$fc, MQPR:$Qn, MQPR:$Qm)> {
+ (ins vpt_mask:$Mk, MQPR:$Qn, MQPR:$Qm, pred_basic_i:$fc)> {
let Inst{12} = 0b0;
let Inst{0} = 0b0;
}
@@ -4192,7 +4660,7 @@ def MVE_VPTv16i8 : MVE_VPTt1i<"i8", 0b00>;
class MVE_VPTt1u<string suffix, bits<2> size>
: MVE_VPTt1<suffix, size,
- (ins vpt_mask:$Mk, pred_basic_u:$fc, MQPR:$Qn, MQPR:$Qm)> {
+ (ins vpt_mask:$Mk, MQPR:$Qn, MQPR:$Qm, pred_basic_u:$fc)> {
let Inst{12} = 0b0;
let Inst{0} = 0b1;
}
@@ -4203,7 +4671,7 @@ def MVE_VPTv16u8 : MVE_VPTt1u<"u8", 0b00>;
class MVE_VPTt1s<string suffix, bits<2> size>
: MVE_VPTt1<suffix, size,
- (ins vpt_mask:$Mk, pred_basic_s:$fc, MQPR:$Qn, MQPR:$Qm)> {
+ (ins vpt_mask:$Mk, MQPR:$Qn, MQPR:$Qm, pred_basic_s:$fc)> {
let Inst{12} = 0b1;
}
@@ -4225,7 +4693,7 @@ class MVE_VPTt2<string suffix, bits<2> size, dag iops>
class MVE_VPTt2i<string suffix, bits<2> size>
: MVE_VPTt2<suffix, size,
- (ins vpt_mask:$Mk, pred_basic_i:$fc, MQPR:$Qn, GPRwithZR:$Rm)> {
+ (ins vpt_mask:$Mk, MQPR:$Qn, GPRwithZR:$Rm, pred_basic_i:$fc)> {
let Inst{12} = 0b0;
let Inst{5} = 0b0;
}
@@ -4236,7 +4704,7 @@ def MVE_VPTv16i8r : MVE_VPTt2i<"i8", 0b00>;
class MVE_VPTt2u<string suffix, bits<2> size>
: MVE_VPTt2<suffix, size,
- (ins vpt_mask:$Mk, pred_basic_u:$fc, MQPR:$Qn, GPRwithZR:$Rm)> {
+ (ins vpt_mask:$Mk, MQPR:$Qn, GPRwithZR:$Rm, pred_basic_u:$fc)> {
let Inst{12} = 0b0;
let Inst{5} = 0b1;
}
@@ -4247,7 +4715,7 @@ def MVE_VPTv16u8r : MVE_VPTt2u<"u8", 0b00>;
class MVE_VPTt2s<string suffix, bits<2> size>
: MVE_VPTt2<suffix, size,
- (ins vpt_mask:$Mk, pred_basic_s:$fc, MQPR:$Qn, GPRwithZR:$Rm)> {
+ (ins vpt_mask:$Mk, MQPR:$Qn, GPRwithZR:$Rm, pred_basic_s:$fc)> {
let Inst{12} = 0b1;
}
@@ -4276,12 +4744,13 @@ class MVE_VPTf<string suffix, bit size, dag iops, string asm, list<dag> pattern=
let Inst{7} = fc{0};
let Inst{4} = 0b0;
- let Defs = [P0];
+ let Defs = [VPR];
let Predicates = [HasMVEFloat];
+ let validForTailPredication = 1;
}
class MVE_VPTft1<string suffix, bit size>
- : MVE_VPTf<suffix, size, (ins vpt_mask:$Mk, pred_basic_fp:$fc, MQPR:$Qn, MQPR:$Qm),
+ : MVE_VPTf<suffix, size, (ins vpt_mask:$Mk, MQPR:$Qn, MQPR:$Qm, pred_basic_fp:$fc),
"$fc, $Qn, $Qm"> {
bits<3> fc;
bits<4> Qm;
@@ -4296,7 +4765,7 @@ def MVE_VPTv4f32 : MVE_VPTft1<"f32", 0b0>;
def MVE_VPTv8f16 : MVE_VPTft1<"f16", 0b1>;
class MVE_VPTft2<string suffix, bit size>
- : MVE_VPTf<suffix, size, (ins vpt_mask:$Mk, pred_basic_fp:$fc, MQPR:$Qn, GPRwithZR:$Rm),
+ : MVE_VPTf<suffix, size, (ins vpt_mask:$Mk, MQPR:$Qn, GPRwithZR:$Rm, pred_basic_fp:$fc),
"$fc, $Qn, $Rm"> {
bits<3> fc;
bits<4> Rm;
@@ -4322,7 +4791,8 @@ def MVE_VPST : MVE_MI<(outs ), (ins vpt_mask:$Mk), NoItinerary,
let Unpredictable{7} = 0b1;
let Unpredictable{5} = 0b1;
- let Defs = [P0];
+ let Uses = [VPR];
+ let validForTailPredication = 1;
}
def MVE_VPSEL : MVE_p<(outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm), NoItinerary,
@@ -4346,6 +4816,7 @@ def MVE_VPSEL : MVE_p<(outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm), NoItinerary,
let Inst{4} = 0b0;
let Inst{3-1} = Qm{2-0};
let Inst{0} = 0b1;
+ let validForTailPredication = 1;
}
foreach suffix = ["s8", "s16", "s32", "u8", "u16", "u32",
@@ -4353,19 +4824,113 @@ foreach suffix = ["s8", "s16", "s32", "u8", "u16", "u32",
def : MVEInstAlias<"vpsel${vp}." # suffix # "\t$Qd, $Qn, $Qm",
(MVE_VPSEL MQPR:$Qd, MQPR:$Qn, MQPR:$Qm, vpred_n:$vp)>;
-def MVE_VPNOT : MVE_p<(outs), (ins), NoItinerary,
+let Predicates = [HasMVEInt] in {
+ def : Pat<(v16i8 (vselect (v16i1 VCCR:$pred), (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))),
+ (v16i8 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, VCCR:$pred))>;
+ def : Pat<(v8i16 (vselect (v8i1 VCCR:$pred), (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))),
+ (v8i16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, VCCR:$pred))>;
+ def : Pat<(v4i32 (vselect (v4i1 VCCR:$pred), (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))),
+ (v4i32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, VCCR:$pred))>;
+
+ def : Pat<(v8f16 (vselect (v8i1 VCCR:$pred), (v8f16 MQPR:$v1), (v8f16 MQPR:$v2))),
+ (v8f16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, VCCR:$pred))>;
+ def : Pat<(v4f32 (vselect (v4i1 VCCR:$pred), (v4f32 MQPR:$v1), (v4f32 MQPR:$v2))),
+ (v4f32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, VCCR:$pred))>;
+
+ def : Pat<(v16i8 (vselect (v16i8 MQPR:$pred), (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))),
+ (v16i8 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0,
+ (MVE_VCMPi8 (v16i8 MQPR:$pred), (MVE_VMOVimmi8 0), 1)))>;
+ def : Pat<(v8i16 (vselect (v8i16 MQPR:$pred), (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))),
+ (v8i16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0,
+ (MVE_VCMPi16 (v8i16 MQPR:$pred), (MVE_VMOVimmi16 0), 1)))>;
+ def : Pat<(v4i32 (vselect (v4i32 MQPR:$pred), (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))),
+ (v4i32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0,
+ (MVE_VCMPi32 (v4i32 MQPR:$pred), (MVE_VMOVimmi32 0), 1)))>;
+
+ def : Pat<(v8f16 (vselect (v8i16 MQPR:$pred), (v8f16 MQPR:$v1), (v8f16 MQPR:$v2))),
+ (v8f16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0,
+ (MVE_VCMPi16 (v8i16 MQPR:$pred), (MVE_VMOVimmi16 0), 1)))>;
+ def : Pat<(v4f32 (vselect (v4i32 MQPR:$pred), (v4f32 MQPR:$v1), (v4f32 MQPR:$v2))),
+ (v4f32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0,
+ (MVE_VCMPi32 (v4i32 MQPR:$pred), (MVE_VMOVimmi32 0), 1)))>;
+
+ // Pred <-> Int
+ def : Pat<(v16i8 (zext (v16i1 VCCR:$pred))),
+ (v16i8 (MVE_VPSEL (MVE_VMOVimmi8 1), (MVE_VMOVimmi8 0), 0, VCCR:$pred))>;
+ def : Pat<(v8i16 (zext (v8i1 VCCR:$pred))),
+ (v8i16 (MVE_VPSEL (MVE_VMOVimmi16 1), (MVE_VMOVimmi16 0), 0, VCCR:$pred))>;
+ def : Pat<(v4i32 (zext (v4i1 VCCR:$pred))),
+ (v4i32 (MVE_VPSEL (MVE_VMOVimmi32 1), (MVE_VMOVimmi32 0), 0, VCCR:$pred))>;
+
+ def : Pat<(v16i8 (sext (v16i1 VCCR:$pred))),
+ (v16i8 (MVE_VPSEL (MVE_VMOVimmi8 255), (MVE_VMOVimmi8 0), 0, VCCR:$pred))>;
+ def : Pat<(v8i16 (sext (v8i1 VCCR:$pred))),
+ (v8i16 (MVE_VPSEL (MVE_VMOVimmi8 255), (MVE_VMOVimmi16 0), 0, VCCR:$pred))>;
+ def : Pat<(v4i32 (sext (v4i1 VCCR:$pred))),
+ (v4i32 (MVE_VPSEL (MVE_VMOVimmi8 255), (MVE_VMOVimmi32 0), 0, VCCR:$pred))>;
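+  // Note the reuse of MVE_VMOVimmi8 255 at the wider element sizes above:
+  // vmov.i8 0xff sets every byte of the vector, which gives all-ones lanes
+  // at any element width, exactly what sign-extending a true lane needs.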
+
+ def : Pat<(v16i8 (anyext (v16i1 VCCR:$pred))),
+ (v16i8 (MVE_VPSEL (MVE_VMOVimmi8 1), (MVE_VMOVimmi8 0), 0, VCCR:$pred))>;
+ def : Pat<(v8i16 (anyext (v8i1 VCCR:$pred))),
+ (v8i16 (MVE_VPSEL (MVE_VMOVimmi16 1), (MVE_VMOVimmi16 0), 0, VCCR:$pred))>;
+ def : Pat<(v4i32 (anyext (v4i1 VCCR:$pred))),
+ (v4i32 (MVE_VPSEL (MVE_VMOVimmi32 1), (MVE_VMOVimmi32 0), 0, VCCR:$pred))>;
+
+ def : Pat<(v16i1 (trunc (v16i8 MQPR:$v1))),
+ (v16i1 (MVE_VCMPi32r (v16i8 MQPR:$v1), ZR, 1))>;
+ def : Pat<(v8i1 (trunc (v8i16 MQPR:$v1))),
+ (v8i1 (MVE_VCMPi32r (v8i16 MQPR:$v1), ZR, 1))>;
+ def : Pat<(v4i1 (trunc (v4i32 MQPR:$v1))),
+ (v4i1 (MVE_VCMPi32r (v4i32 MQPR:$v1), ZR, 1))>;
+}
+
+let Predicates = [HasMVEFloat] in {
+ // Pred <-> Float
+  // 112 encodes 1.0 in the f32 VMOV immediate format
+ def : Pat<(v4f32 (uint_to_fp (v4i1 VCCR:$pred))),
+ (v4f32 (MVE_VPSEL (v4f32 (MVE_VMOVimmf32 112)), (v4f32 (MVE_VMOVimmi32 0)), 0, VCCR:$pred))>;
+  // 2620 encodes 0x3c00, which is 1.0 in half
+ def : Pat<(v8f16 (uint_to_fp (v8i1 VCCR:$pred))),
+ (v8f16 (MVE_VPSEL (v8f16 (MVE_VMOVimmi16 2620)), (v8f16 (MVE_VMOVimmi16 0)), 0, VCCR:$pred))>;
+  // 240 encodes -1.0 in the f32 VMOV immediate format
+ def : Pat<(v4f32 (sint_to_fp (v4i1 VCCR:$pred))),
+ (v4f32 (MVE_VPSEL (v4f32 (MVE_VMOVimmf32 240)), (v4f32 (MVE_VMOVimmi32 0)), 0, VCCR:$pred))>;
+  // 2748 encodes 0xbc00, which is -1.0 in half
+ def : Pat<(v8f16 (sint_to_fp (v8i1 VCCR:$pred))),
+ (v8f16 (MVE_VPSEL (v8f16 (MVE_VMOVimmi16 2748)), (v8f16 (MVE_VMOVimmi16 0)), 0, VCCR:$pred))>;
+
+ def : Pat<(v4i1 (fp_to_uint (v4f32 MQPR:$v1))),
+ (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), ZR, 1))>;
+ def : Pat<(v8i1 (fp_to_uint (v8f16 MQPR:$v1))),
+ (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), ZR, 1))>;
+ def : Pat<(v4i1 (fp_to_sint (v4f32 MQPR:$v1))),
+ (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), ZR, 1))>;
+ def : Pat<(v8i1 (fp_to_sint (v8f16 MQPR:$v1))),
+ (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), ZR, 1))>;
+}
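+// In the fp_to_[su]int patterns above, converting a float vector to a
+// one-bit-per-lane predicate reduces to a compare-unequal against zero,
+// hence the single VCMPf*r-against-ZR form for both signednesses.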
+
+def MVE_VPNOT : MVE_p<(outs VCCR:$P0), (ins VCCR:$P0_in), NoItinerary,
"vpnot", "", "", vpred_n, "", []> {
let Inst{31-0} = 0b11111110001100010000111101001101;
let Unpredictable{19-17} = 0b111;
let Unpredictable{12} = 0b1;
let Unpredictable{7} = 0b1;
let Unpredictable{5} = 0b1;
- let Defs = [P0];
- let Uses = [P0];
let Constraints = "";
+ let DecoderMethod = "DecodeMVEVPNOT";
}
+let Predicates = [HasMVEInt] in {
+ def : Pat<(v4i1 (xor (v4i1 VCCR:$pred), (v4i1 (predicate_cast (i32 65535))))),
+ (v4i1 (MVE_VPNOT (v4i1 VCCR:$pred)))>;
+ def : Pat<(v8i1 (xor (v8i1 VCCR:$pred), (v8i1 (predicate_cast (i32 65535))))),
+ (v8i1 (MVE_VPNOT (v8i1 VCCR:$pred)))>;
+ def : Pat<(v16i1 (xor (v16i1 VCCR:$pred), (v16i1 (predicate_cast (i32 65535))))),
+ (v16i1 (MVE_VPNOT (v16i1 VCCR:$pred)))>;
+}
+
+
class MVE_loltp_start<dag iops, string asm, string ops, bits<2> size>
: t2LOL<(outs GPRlr:$LR), iops, asm, ops> {
bits<4> Rn;
@@ -4433,159 +4998,440 @@ def MVE_LCTP : MVE_loltp_end<(outs), (ins pred:$p), "lctp${p}", ""> {
// Patterns
//===----------------------------------------------------------------------===//
-class MVE_unpred_vector_store_typed<ValueType Ty, Instruction RegImmInst,
+class MVE_vector_store_typed<ValueType Ty, Instruction RegImmInst,
+ PatFrag StoreKind, int shift>
+ : Pat<(StoreKind (Ty MQPR:$val), t2addrmode_imm7<shift>:$addr),
+ (RegImmInst (Ty MQPR:$val), t2addrmode_imm7<shift>:$addr)>;
+class MVE_vector_maskedstore_typed<ValueType Ty, Instruction RegImmInst,
+ PatFrag StoreKind, int shift>
+ : Pat<(StoreKind (Ty MQPR:$val), t2addrmode_imm7<shift>:$addr, VCCR:$pred),
+ (RegImmInst (Ty MQPR:$val), t2addrmode_imm7<shift>:$addr, (i32 1), VCCR:$pred)>;
+
+multiclass MVE_vector_store<Instruction RegImmInst, PatFrag StoreKind,
+ int shift> {
+ def : MVE_vector_store_typed<v16i8, RegImmInst, StoreKind, shift>;
+ def : MVE_vector_store_typed<v8i16, RegImmInst, StoreKind, shift>;
+ def : MVE_vector_store_typed<v8f16, RegImmInst, StoreKind, shift>;
+ def : MVE_vector_store_typed<v4i32, RegImmInst, StoreKind, shift>;
+ def : MVE_vector_store_typed<v4f32, RegImmInst, StoreKind, shift>;
+ def : MVE_vector_store_typed<v2i64, RegImmInst, StoreKind, shift>;
+ def : MVE_vector_store_typed<v2f64, RegImmInst, StoreKind, shift>;
+}
+
+class MVE_vector_load_typed<ValueType Ty, Instruction RegImmInst,
+ PatFrag LoadKind, int shift>
+ : Pat<(Ty (LoadKind t2addrmode_imm7<shift>:$addr)),
+ (Ty (RegImmInst t2addrmode_imm7<shift>:$addr))>;
+class MVE_vector_maskedload_typed<ValueType Ty, Instruction RegImmInst,
+ PatFrag LoadKind, int shift>
+ : Pat<(Ty (LoadKind t2addrmode_imm7<shift>:$addr, VCCR:$pred, (Ty NEONimmAllZerosV))),
+ (Ty (RegImmInst t2addrmode_imm7<shift>:$addr, (i32 1), VCCR:$pred))>;
+
+multiclass MVE_vector_load<Instruction RegImmInst, PatFrag LoadKind,
+ int shift> {
+ def : MVE_vector_load_typed<v16i8, RegImmInst, LoadKind, shift>;
+ def : MVE_vector_load_typed<v8i16, RegImmInst, LoadKind, shift>;
+ def : MVE_vector_load_typed<v8f16, RegImmInst, LoadKind, shift>;
+ def : MVE_vector_load_typed<v4i32, RegImmInst, LoadKind, shift>;
+ def : MVE_vector_load_typed<v4f32, RegImmInst, LoadKind, shift>;
+ def : MVE_vector_load_typed<v2i64, RegImmInst, LoadKind, shift>;
+ def : MVE_vector_load_typed<v2f64, RegImmInst, LoadKind, shift>;
+}
+
+class MVE_vector_offset_store_typed<ValueType Ty, Instruction Opcode,
PatFrag StoreKind, int shift>
- : Pat<(StoreKind (Ty MQPR:$val), t2addrmode_imm7<shift>:$addr),
- (RegImmInst (Ty MQPR:$val), t2addrmode_imm7<shift>:$addr)>;
+ : Pat<(StoreKind (Ty MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<shift>:$addr),
+ (Opcode MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<shift>:$addr)>;
-multiclass MVE_unpred_vector_store<Instruction RegImmInst, PatFrag StoreKind,
+multiclass MVE_vector_offset_store<Instruction RegImmInst, PatFrag StoreKind,
int shift> {
- def : MVE_unpred_vector_store_typed<v16i8, RegImmInst, StoreKind, shift>;
- def : MVE_unpred_vector_store_typed<v8i16, RegImmInst, StoreKind, shift>;
- def : MVE_unpred_vector_store_typed<v8f16, RegImmInst, StoreKind, shift>;
- def : MVE_unpred_vector_store_typed<v4i32, RegImmInst, StoreKind, shift>;
- def : MVE_unpred_vector_store_typed<v4f32, RegImmInst, StoreKind, shift>;
- def : MVE_unpred_vector_store_typed<v2i64, RegImmInst, StoreKind, shift>;
- def : MVE_unpred_vector_store_typed<v2f64, RegImmInst, StoreKind, shift>;
-}
-
-class MVE_unpred_vector_load_typed<ValueType Ty, Instruction RegImmInst,
- PatFrag LoadKind, int shift>
- : Pat<(Ty (LoadKind t2addrmode_imm7<shift>:$addr)),
- (Ty (RegImmInst t2addrmode_imm7<shift>:$addr))>;
-
-multiclass MVE_unpred_vector_load<Instruction RegImmInst, PatFrag LoadKind,
- int shift> {
- def : MVE_unpred_vector_load_typed<v16i8, RegImmInst, LoadKind, shift>;
- def : MVE_unpred_vector_load_typed<v8i16, RegImmInst, LoadKind, shift>;
- def : MVE_unpred_vector_load_typed<v8f16, RegImmInst, LoadKind, shift>;
- def : MVE_unpred_vector_load_typed<v4i32, RegImmInst, LoadKind, shift>;
- def : MVE_unpred_vector_load_typed<v4f32, RegImmInst, LoadKind, shift>;
- def : MVE_unpred_vector_load_typed<v2i64, RegImmInst, LoadKind, shift>;
- def : MVE_unpred_vector_load_typed<v2f64, RegImmInst, LoadKind, shift>;
-}
+ def : MVE_vector_offset_store_typed<v16i8, RegImmInst, StoreKind, shift>;
+ def : MVE_vector_offset_store_typed<v8i16, RegImmInst, StoreKind, shift>;
+ def : MVE_vector_offset_store_typed<v8f16, RegImmInst, StoreKind, shift>;
+ def : MVE_vector_offset_store_typed<v4i32, RegImmInst, StoreKind, shift>;
+ def : MVE_vector_offset_store_typed<v4f32, RegImmInst, StoreKind, shift>;
+ def : MVE_vector_offset_store_typed<v2i64, RegImmInst, StoreKind, shift>;
+ def : MVE_vector_offset_store_typed<v2f64, RegImmInst, StoreKind, shift>;
+}
+
+def aligned32_pre_store : PatFrag<(ops node:$val, node:$ptr, node:$offset),
+ (pre_store node:$val, node:$ptr, node:$offset), [{
+ return cast<StoreSDNode>(N)->getAlignment() >= 4;
+}]>;
+def aligned32_post_store : PatFrag<(ops node:$val, node:$ptr, node:$offset),
+ (post_store node:$val, node:$ptr, node:$offset), [{
+ return cast<StoreSDNode>(N)->getAlignment() >= 4;
+}]>;
+def aligned16_pre_store : PatFrag<(ops node:$val, node:$ptr, node:$offset),
+ (pre_store node:$val, node:$ptr, node:$offset), [{
+ return cast<StoreSDNode>(N)->getAlignment() >= 2;
+}]>;
+def aligned16_post_store : PatFrag<(ops node:$val, node:$ptr, node:$offset),
+ (post_store node:$val, node:$ptr, node:$offset), [{
+ return cast<StoreSDNode>(N)->getAlignment() >= 2;
+}]>;
+
+
+def maskedload8 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
+ (masked_ld node:$ptr, node:$pred, node:$passthru), [{
+ auto *Ld = cast<MaskedLoadSDNode>(N);
+ return Ld->getMemoryVT().getScalarType() == MVT::i8;
+}]>;
+def sextmaskedload8 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
+ (maskedload8 node:$ptr, node:$pred, node:$passthru), [{
+ return cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::SEXTLOAD;
+}]>;
+def zextmaskedload8 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
+ (maskedload8 node:$ptr, node:$pred, node:$passthru), [{
+ return cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::ZEXTLOAD;
+}]>;
+def extmaskedload8 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
+ (maskedload8 node:$ptr, node:$pred, node:$passthru), [{
+ auto *Ld = cast<MaskedLoadSDNode>(N);
+ EVT ScalarVT = Ld->getMemoryVT().getScalarType();
+ return ScalarVT.isInteger() && Ld->getExtensionType() == ISD::EXTLOAD;
+}]>;
+def alignedmaskedload16: PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
+ (masked_ld node:$ptr, node:$pred, node:$passthru), [{
+ auto *Ld = cast<MaskedLoadSDNode>(N);
+ EVT ScalarVT = Ld->getMemoryVT().getScalarType();
+ return (ScalarVT == MVT::i16 || ScalarVT == MVT::f16) && Ld->getAlignment() >= 2;
+}]>;
+def sextmaskedload16 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
+ (alignedmaskedload16 node:$ptr, node:$pred, node:$passthru), [{
+ return cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::SEXTLOAD;
+}]>;
+def zextmaskedload16 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
+ (alignedmaskedload16 node:$ptr, node:$pred, node:$passthru), [{
+ return cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::ZEXTLOAD;
+}]>;
+def extmaskedload16 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
+ (alignedmaskedload16 node:$ptr, node:$pred, node:$passthru), [{
+ auto *Ld = cast<MaskedLoadSDNode>(N);
+ EVT ScalarVT = Ld->getMemoryVT().getScalarType();
+ return ScalarVT.isInteger() && Ld->getExtensionType() == ISD::EXTLOAD;
+}]>;
+def alignedmaskedload32: PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
+ (masked_ld node:$ptr, node:$pred, node:$passthru), [{
+ auto *Ld = cast<MaskedLoadSDNode>(N);
+ EVT ScalarVT = Ld->getMemoryVT().getScalarType();
+ return (ScalarVT == MVT::i32 || ScalarVT == MVT::f32) && Ld->getAlignment() >= 4;
+}]>;
+
+def maskedstore8 : PatFrag<(ops node:$val, node:$ptr, node:$pred),
+ (masked_st node:$val, node:$ptr, node:$pred), [{
+ return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
+}]>;
+def truncatingmaskedstore8 : PatFrag<(ops node:$val, node:$ptr, node:$pred),
+ (maskedstore8 node:$val, node:$ptr, node:$pred), [{
+ return cast<MaskedStoreSDNode>(N)->isTruncatingStore();
+}]>;
+def maskedstore16 : PatFrag<(ops node:$val, node:$ptr, node:$pred),
+ (masked_st node:$val, node:$ptr, node:$pred), [{
+ auto *St = cast<MaskedStoreSDNode>(N);
+ EVT ScalarVT = St->getMemoryVT().getScalarType();
+ return (ScalarVT == MVT::i16 || ScalarVT == MVT::f16) && St->getAlignment() >= 2;
+}]>;
+
+def truncatingmaskedstore16 : PatFrag<(ops node:$val, node:$ptr, node:$pred),
+ (maskedstore16 node:$val, node:$ptr, node:$pred), [{
+ return cast<MaskedStoreSDNode>(N)->isTruncatingStore();
+}]>;
+def maskedstore32 : PatFrag<(ops node:$val, node:$ptr, node:$pred),
+ (masked_st node:$val, node:$ptr, node:$pred), [{
+ auto *St = cast<MaskedStoreSDNode>(N);
+ EVT ScalarVT = St->getMemoryVT().getScalarType();
+ return (ScalarVT == MVT::i32 || ScalarVT == MVT::f32) && St->getAlignment() >= 4;
+}]>;
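+// These fragments key on the memory VT and extension/truncation kind rather
+// than the result VT, so that e.g. a v4i32 masked load from i8 memory can
+// pick the right widening instruction (VLDRBS32/VLDRBU32) in the patterns
+// that follow.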
let Predicates = [HasMVEInt, IsLE] in {
- defm : MVE_unpred_vector_store<MVE_VSTRBU8, byte_alignedstore, 0>;
- defm : MVE_unpred_vector_store<MVE_VSTRHU16, hword_alignedstore, 1>;
- defm : MVE_unpred_vector_store<MVE_VSTRWU32, alignedstore32, 2>;
+ // Stores
+ defm : MVE_vector_store<MVE_VSTRBU8, byte_alignedstore, 0>;
+ defm : MVE_vector_store<MVE_VSTRHU16, hword_alignedstore, 1>;
+ defm : MVE_vector_store<MVE_VSTRWU32, alignedstore32, 2>;
- defm : MVE_unpred_vector_load<MVE_VLDRBU8, byte_alignedload, 0>;
- defm : MVE_unpred_vector_load<MVE_VLDRHU16, hword_alignedload, 1>;
- defm : MVE_unpred_vector_load<MVE_VLDRWU32, alignedload32, 2>;
+ // Loads
+ defm : MVE_vector_load<MVE_VLDRBU8, byte_alignedload, 0>;
+ defm : MVE_vector_load<MVE_VLDRHU16, hword_alignedload, 1>;
+ defm : MVE_vector_load<MVE_VLDRWU32, alignedload32, 2>;
- def : Pat<(v16i1 (load t2addrmode_imm7<2>:$addr)),
- (v16i1 (VLDR_P0_off t2addrmode_imm7<2>:$addr))>;
- def : Pat<(v8i1 (load t2addrmode_imm7<2>:$addr)),
- (v8i1 (VLDR_P0_off t2addrmode_imm7<2>:$addr))>;
- def : Pat<(v4i1 (load t2addrmode_imm7<2>:$addr)),
- (v4i1 (VLDR_P0_off t2addrmode_imm7<2>:$addr))>;
+ // Pre/post inc stores
+ defm : MVE_vector_offset_store<MVE_VSTRBU8_pre, pre_store, 0>;
+ defm : MVE_vector_offset_store<MVE_VSTRBU8_post, post_store, 0>;
+ defm : MVE_vector_offset_store<MVE_VSTRHU16_pre, aligned16_pre_store, 1>;
+ defm : MVE_vector_offset_store<MVE_VSTRHU16_post, aligned16_post_store, 1>;
+ defm : MVE_vector_offset_store<MVE_VSTRWU32_pre, aligned32_pre_store, 2>;
+ defm : MVE_vector_offset_store<MVE_VSTRWU32_post, aligned32_post_store, 2>;
}
let Predicates = [HasMVEInt, IsBE] in {
- def : MVE_unpred_vector_store_typed<v16i8, MVE_VSTRBU8, store, 0>;
- def : MVE_unpred_vector_store_typed<v8i16, MVE_VSTRHU16, alignedstore16, 1>;
- def : MVE_unpred_vector_store_typed<v8f16, MVE_VSTRHU16, alignedstore16, 1>;
- def : MVE_unpred_vector_store_typed<v4i32, MVE_VSTRWU32, alignedstore32, 2>;
- def : MVE_unpred_vector_store_typed<v4f32, MVE_VSTRWU32, alignedstore32, 2>;
-
- def : MVE_unpred_vector_load_typed<v16i8, MVE_VLDRBU8, load, 0>;
- def : MVE_unpred_vector_load_typed<v8i16, MVE_VLDRHU16, alignedload16, 1>;
- def : MVE_unpred_vector_load_typed<v8f16, MVE_VLDRHU16, alignedload16, 1>;
- def : MVE_unpred_vector_load_typed<v4i32, MVE_VLDRWU32, alignedload32, 2>;
- def : MVE_unpred_vector_load_typed<v4f32, MVE_VLDRWU32, alignedload32, 2>;
+ // Aligned Stores
+ def : MVE_vector_store_typed<v16i8, MVE_VSTRBU8, store, 0>;
+ def : MVE_vector_store_typed<v8i16, MVE_VSTRHU16, alignedstore16, 1>;
+ def : MVE_vector_store_typed<v8f16, MVE_VSTRHU16, alignedstore16, 1>;
+ def : MVE_vector_store_typed<v4i32, MVE_VSTRWU32, alignedstore32, 2>;
+ def : MVE_vector_store_typed<v4f32, MVE_VSTRWU32, alignedstore32, 2>;
+
+ // Aligned Loads
+ def : MVE_vector_load_typed<v16i8, MVE_VLDRBU8, load, 0>;
+ def : MVE_vector_load_typed<v8i16, MVE_VLDRHU16, alignedload16, 1>;
+ def : MVE_vector_load_typed<v8f16, MVE_VLDRHU16, alignedload16, 1>;
+ def : MVE_vector_load_typed<v4i32, MVE_VLDRWU32, alignedload32, 2>;
+ def : MVE_vector_load_typed<v4f32, MVE_VLDRWU32, alignedload32, 2>;
+
+  // Other unaligned loads/stores need to go through a VREV
+ def : Pat<(v2f64 (load t2addrmode_imm7<0>:$addr)),
+ (v2f64 (MVE_VREV64_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr)))>;
+ def : Pat<(v2i64 (load t2addrmode_imm7<0>:$addr)),
+ (v2i64 (MVE_VREV64_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr)))>;
+ def : Pat<(v4i32 (load t2addrmode_imm7<0>:$addr)),
+ (v4i32 (MVE_VREV32_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr)))>;
+ def : Pat<(v4f32 (load t2addrmode_imm7<0>:$addr)),
+ (v4f32 (MVE_VREV32_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr)))>;
+ def : Pat<(v8i16 (load t2addrmode_imm7<0>:$addr)),
+ (v8i16 (MVE_VREV16_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr)))>;
+ def : Pat<(v8f16 (load t2addrmode_imm7<0>:$addr)),
+ (v8f16 (MVE_VREV16_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr)))>;
+ def : Pat<(store (v2f64 MQPR:$val), t2addrmode_imm7<0>:$addr),
+ (MVE_VSTRBU8 (MVE_VREV64_8 MQPR:$val), t2addrmode_imm7<0>:$addr)>;
+ def : Pat<(store (v2i64 MQPR:$val), t2addrmode_imm7<0>:$addr),
+ (MVE_VSTRBU8 (MVE_VREV64_8 MQPR:$val), t2addrmode_imm7<0>:$addr)>;
+ def : Pat<(store (v4i32 MQPR:$val), t2addrmode_imm7<0>:$addr),
+ (MVE_VSTRBU8 (MVE_VREV32_8 MQPR:$val), t2addrmode_imm7<0>:$addr)>;
+ def : Pat<(store (v4f32 MQPR:$val), t2addrmode_imm7<0>:$addr),
+ (MVE_VSTRBU8 (MVE_VREV32_8 MQPR:$val), t2addrmode_imm7<0>:$addr)>;
+ def : Pat<(store (v8i16 MQPR:$val), t2addrmode_imm7<0>:$addr),
+ (MVE_VSTRBU8 (MVE_VREV16_8 MQPR:$val), t2addrmode_imm7<0>:$addr)>;
+ def : Pat<(store (v8f16 MQPR:$val), t2addrmode_imm7<0>:$addr),
+ (MVE_VSTRBU8 (MVE_VREV16_8 MQPR:$val), t2addrmode_imm7<0>:$addr)>;
+
+ // Pre/Post inc stores
+ def : MVE_vector_offset_store_typed<v16i8, MVE_VSTRBU8_pre, pre_store, 0>;
+ def : MVE_vector_offset_store_typed<v16i8, MVE_VSTRBU8_post, post_store, 0>;
+ def : MVE_vector_offset_store_typed<v8i16, MVE_VSTRHU16_pre, aligned16_pre_store, 1>;
+ def : MVE_vector_offset_store_typed<v8i16, MVE_VSTRHU16_post, aligned16_post_store, 1>;
+ def : MVE_vector_offset_store_typed<v8f16, MVE_VSTRHU16_pre, aligned16_pre_store, 1>;
+ def : MVE_vector_offset_store_typed<v8f16, MVE_VSTRHU16_post, aligned16_post_store, 1>;
+ def : MVE_vector_offset_store_typed<v4i32, MVE_VSTRWU32_pre, aligned32_pre_store, 2>;
+ def : MVE_vector_offset_store_typed<v4i32, MVE_VSTRWU32_post, aligned32_post_store, 2>;
+ def : MVE_vector_offset_store_typed<v4f32, MVE_VSTRWU32_pre, aligned32_pre_store, 2>;
+ def : MVE_vector_offset_store_typed<v4f32, MVE_VSTRWU32_post, aligned32_post_store, 2>;
}
+let Predicates = [HasMVEInt] in {
+  // Aligned masked stores, shared between LE and BE
+ def : MVE_vector_maskedstore_typed<v16i8, MVE_VSTRBU8, maskedstore8, 0>;
+ def : MVE_vector_maskedstore_typed<v8i16, MVE_VSTRHU16, maskedstore16, 1>;
+ def : MVE_vector_maskedstore_typed<v8f16, MVE_VSTRHU16, maskedstore16, 1>;
+ def : MVE_vector_maskedstore_typed<v4i32, MVE_VSTRWU32, maskedstore32, 2>;
+ def : MVE_vector_maskedstore_typed<v4f32, MVE_VSTRWU32, maskedstore32, 2>;
+ // Truncating stores
+ def : Pat<(truncatingmaskedstore8 (v8i16 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
+ (MVE_VSTRB16 MQPR:$val, t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
+ def : Pat<(truncatingmaskedstore8 (v4i32 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
+ (MVE_VSTRB32 MQPR:$val, t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
+ def : Pat<(truncatingmaskedstore16 (v4i32 MQPR:$val), t2addrmode_imm7<1>:$addr, VCCR:$pred),
+ (MVE_VSTRH32 MQPR:$val, t2addrmode_imm7<1>:$addr, (i32 1), VCCR:$pred)>;
+ // Aligned masked loads
+ def : MVE_vector_maskedload_typed<v16i8, MVE_VLDRBU8, maskedload8, 0>;
+ def : MVE_vector_maskedload_typed<v8i16, MVE_VLDRHU16, alignedmaskedload16, 1>;
+ def : MVE_vector_maskedload_typed<v8f16, MVE_VLDRHU16, alignedmaskedload16, 1>;
+ def : MVE_vector_maskedload_typed<v4i32, MVE_VLDRWU32, alignedmaskedload32, 2>;
+ def : MVE_vector_maskedload_typed<v4f32, MVE_VLDRWU32, alignedmaskedload32, 2>;
+ // Extending masked loads.
+ def : Pat<(v8i16 (sextmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred,
+ (v8i16 NEONimmAllZerosV))),
+ (v8i16 (MVE_VLDRBS16 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
+ def : Pat<(v4i32 (sextmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred,
+ (v4i32 NEONimmAllZerosV))),
+ (v4i32 (MVE_VLDRBS32 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
+ def : Pat<(v8i16 (zextmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred,
+ (v8i16 NEONimmAllZerosV))),
+ (v8i16 (MVE_VLDRBU16 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
+ def : Pat<(v4i32 (zextmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred,
+ (v4i32 NEONimmAllZerosV))),
+ (v4i32 (MVE_VLDRBU32 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
+ def : Pat<(v8i16 (extmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred,
+ (v8i16 NEONimmAllZerosV))),
+ (v8i16 (MVE_VLDRBU16 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
+ def : Pat<(v4i32 (extmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred,
+ (v4i32 NEONimmAllZerosV))),
+ (v4i32 (MVE_VLDRBU32 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
+ def : Pat<(v4i32 (sextmaskedload16 t2addrmode_imm7<1>:$addr, VCCR:$pred,
+ (v4i32 NEONimmAllZerosV))),
+ (v4i32 (MVE_VLDRHS32 t2addrmode_imm7<1>:$addr, (i32 1), VCCR:$pred))>;
+ def : Pat<(v4i32 (zextmaskedload16 t2addrmode_imm7<1>:$addr, VCCR:$pred,
+ (v4i32 NEONimmAllZerosV))),
+ (v4i32 (MVE_VLDRHU32 t2addrmode_imm7<1>:$addr, (i32 1), VCCR:$pred))>;
+ def : Pat<(v4i32 (extmaskedload16 t2addrmode_imm7<1>:$addr, VCCR:$pred,
+ (v4i32 NEONimmAllZerosV))),
+ (v4i32 (MVE_VLDRHU32 t2addrmode_imm7<1>:$addr, (i32 1), VCCR:$pred))>;
+}
// Widening/Narrowing Loads/Stores
+let MinAlignment = 2 in {
+ def truncstorevi16_align2 : PatFrag<(ops node:$val, node:$ptr),
+ (truncstorevi16 node:$val, node:$ptr)>;
+ def post_truncstvi16_align2 : PatFrag<(ops node:$val, node:$base, node:$offset),
+ (post_truncstvi16 node:$val, node:$base, node:$offset)>;
+ def pre_truncstvi16_align2 : PatFrag<(ops node:$val, node:$base, node:$offset),
+ (pre_truncstvi16 node:$val, node:$base, node:$offset)>;
+}
+
let Predicates = [HasMVEInt] in {
- def : Pat<(truncstorevi8 (v8i16 MQPR:$val), t2addrmode_imm7<1>:$addr),
- (MVE_VSTRB16 MQPR:$val, t2addrmode_imm7<1>:$addr)>;
- def : Pat<(truncstorevi8 (v4i32 MQPR:$val), t2addrmode_imm7<1>:$addr),
- (MVE_VSTRB32 MQPR:$val, t2addrmode_imm7<1>:$addr)>;
- def : Pat<(truncstorevi16 (v4i32 MQPR:$val), t2addrmode_imm7<2>:$addr),
- (MVE_VSTRH32 MQPR:$val, t2addrmode_imm7<2>:$addr)>;
+ def : Pat<(truncstorevi8 (v8i16 MQPR:$val), taddrmode_imm7<0>:$addr),
+ (MVE_VSTRB16 MQPR:$val, taddrmode_imm7<0>:$addr)>;
+ def : Pat<(truncstorevi8 (v4i32 MQPR:$val), taddrmode_imm7<0>:$addr),
+ (MVE_VSTRB32 MQPR:$val, taddrmode_imm7<0>:$addr)>;
+ def : Pat<(truncstorevi16_align2 (v4i32 MQPR:$val), taddrmode_imm7<1>:$addr),
+ (MVE_VSTRH32 MQPR:$val, taddrmode_imm7<1>:$addr)>;
+
+ def : Pat<(post_truncstvi8 (v8i16 MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<0>:$addr),
+ (MVE_VSTRB16_post MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<0>:$addr)>;
+ def : Pat<(post_truncstvi8 (v4i32 MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<0>:$addr),
+ (MVE_VSTRB32_post MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<0>:$addr)>;
+ def : Pat<(post_truncstvi16_align2 (v4i32 MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<1>:$addr),
+ (MVE_VSTRH32_post MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<1>:$addr)>;
+
+ def : Pat<(pre_truncstvi8 (v8i16 MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<0>:$addr),
+ (MVE_VSTRB16_pre MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<0>:$addr)>;
+ def : Pat<(pre_truncstvi8 (v4i32 MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<0>:$addr),
+ (MVE_VSTRB32_pre MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<0>:$addr)>;
+ def : Pat<(pre_truncstvi16_align2 (v4i32 MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<1>:$addr),
+ (MVE_VSTRH32_pre MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<1>:$addr)>;
+}
+
+
+let MinAlignment = 2 in {
+ def extloadvi16_align2 : PatFrag<(ops node:$ptr), (extloadvi16 node:$ptr)>;
+ def sextloadvi16_align2 : PatFrag<(ops node:$ptr), (sextloadvi16 node:$ptr)>;
+ def zextloadvi16_align2 : PatFrag<(ops node:$ptr), (zextloadvi16 node:$ptr)>;
}
multiclass MVEExtLoad<string DestLanes, string DestElemBits,
string SrcElemBits, string SrcElemType,
- Operand am> {
+ string Align, Operand am> {
def _Any : Pat<(!cast<ValueType>("v" # DestLanes # "i" # DestElemBits)
- (!cast<PatFrag>("extloadvi" # SrcElemBits) am:$addr)),
+ (!cast<PatFrag>("extloadvi" # SrcElemBits # Align) am:$addr)),
(!cast<Instruction>("MVE_VLDR" # SrcElemType # "U" # DestElemBits)
am:$addr)>;
def _Z : Pat<(!cast<ValueType>("v" # DestLanes # "i" # DestElemBits)
- (!cast<PatFrag>("zextloadvi" # SrcElemBits) am:$addr)),
+ (!cast<PatFrag>("zextloadvi" # SrcElemBits # Align) am:$addr)),
(!cast<Instruction>("MVE_VLDR" # SrcElemType # "U" # DestElemBits)
am:$addr)>;
def _S : Pat<(!cast<ValueType>("v" # DestLanes # "i" # DestElemBits)
- (!cast<PatFrag>("sextloadvi" # SrcElemBits) am:$addr)),
+ (!cast<PatFrag>("sextloadvi" # SrcElemBits # Align) am:$addr)),
(!cast<Instruction>("MVE_VLDR" # SrcElemType # "S" # DestElemBits)
am:$addr)>;
}
let Predicates = [HasMVEInt] in {
- defm : MVEExtLoad<"4", "32", "8", "B", t2addrmode_imm7<1>>;
- defm : MVEExtLoad<"8", "16", "8", "B", t2addrmode_imm7<1>>;
- defm : MVEExtLoad<"4", "32", "16", "H", t2addrmode_imm7<2>>;
+ defm : MVEExtLoad<"4", "32", "8", "B", "", taddrmode_imm7<0>>;
+ defm : MVEExtLoad<"8", "16", "8", "B", "", taddrmode_imm7<0>>;
+ defm : MVEExtLoad<"4", "32", "16", "H", "_align2", taddrmode_imm7<1>>;
}
// Bit convert patterns
let Predicates = [HasMVEInt] in {
- def : Pat<(v2f64 (bitconvert (v2i64 QPR:$src))), (v2f64 QPR:$src)>;
- def : Pat<(v2i64 (bitconvert (v2f64 QPR:$src))), (v2i64 QPR:$src)>;
+ def : Pat<(v2f64 (bitconvert (v2i64 MQPR:$src))), (v2f64 MQPR:$src)>;
+ def : Pat<(v2i64 (bitconvert (v2f64 MQPR:$src))), (v2i64 MQPR:$src)>;
- def : Pat<(v4i32 (bitconvert (v4f32 QPR:$src))), (v4i32 QPR:$src)>;
- def : Pat<(v4f32 (bitconvert (v4i32 QPR:$src))), (v4f32 QPR:$src)>;
+ def : Pat<(v4i32 (bitconvert (v4f32 MQPR:$src))), (v4i32 MQPR:$src)>;
+ def : Pat<(v4f32 (bitconvert (v4i32 MQPR:$src))), (v4f32 MQPR:$src)>;
- def : Pat<(v8i16 (bitconvert (v8f16 QPR:$src))), (v8i16 QPR:$src)>;
- def : Pat<(v8f16 (bitconvert (v8i16 QPR:$src))), (v8f16 QPR:$src)>;
+ def : Pat<(v8i16 (bitconvert (v8f16 MQPR:$src))), (v8i16 MQPR:$src)>;
+ def : Pat<(v8f16 (bitconvert (v8i16 MQPR:$src))), (v8f16 MQPR:$src)>;
}
let Predicates = [IsLE,HasMVEInt] in {
- def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (v2f64 QPR:$src)>;
- def : Pat<(v2f64 (bitconvert (v4i32 QPR:$src))), (v2f64 QPR:$src)>;
- def : Pat<(v2f64 (bitconvert (v8f16 QPR:$src))), (v2f64 QPR:$src)>;
- def : Pat<(v2f64 (bitconvert (v8i16 QPR:$src))), (v2f64 QPR:$src)>;
- def : Pat<(v2f64 (bitconvert (v16i8 QPR:$src))), (v2f64 QPR:$src)>;
-
- def : Pat<(v2i64 (bitconvert (v4f32 QPR:$src))), (v2i64 QPR:$src)>;
- def : Pat<(v2i64 (bitconvert (v4i32 QPR:$src))), (v2i64 QPR:$src)>;
- def : Pat<(v2i64 (bitconvert (v8f16 QPR:$src))), (v2i64 QPR:$src)>;
- def : Pat<(v2i64 (bitconvert (v8i16 QPR:$src))), (v2i64 QPR:$src)>;
- def : Pat<(v2i64 (bitconvert (v16i8 QPR:$src))), (v2i64 QPR:$src)>;
-
- def : Pat<(v4f32 (bitconvert (v2f64 QPR:$src))), (v4f32 QPR:$src)>;
- def : Pat<(v4f32 (bitconvert (v2i64 QPR:$src))), (v4f32 QPR:$src)>;
- def : Pat<(v4f32 (bitconvert (v8f16 QPR:$src))), (v4f32 QPR:$src)>;
- def : Pat<(v4f32 (bitconvert (v8i16 QPR:$src))), (v4f32 QPR:$src)>;
- def : Pat<(v4f32 (bitconvert (v16i8 QPR:$src))), (v4f32 QPR:$src)>;
-
- def : Pat<(v4i32 (bitconvert (v2f64 QPR:$src))), (v4i32 QPR:$src)>;
- def : Pat<(v4i32 (bitconvert (v2i64 QPR:$src))), (v4i32 QPR:$src)>;
- def : Pat<(v4i32 (bitconvert (v8f16 QPR:$src))), (v4i32 QPR:$src)>;
- def : Pat<(v4i32 (bitconvert (v8i16 QPR:$src))), (v4i32 QPR:$src)>;
- def : Pat<(v4i32 (bitconvert (v16i8 QPR:$src))), (v4i32 QPR:$src)>;
-
- def : Pat<(v8f16 (bitconvert (v2f64 QPR:$src))), (v8f16 QPR:$src)>;
- def : Pat<(v8f16 (bitconvert (v2i64 QPR:$src))), (v8f16 QPR:$src)>;
- def : Pat<(v8f16 (bitconvert (v4f32 QPR:$src))), (v8f16 QPR:$src)>;
- def : Pat<(v8f16 (bitconvert (v4i32 QPR:$src))), (v8f16 QPR:$src)>;
- def : Pat<(v8f16 (bitconvert (v16i8 QPR:$src))), (v8f16 QPR:$src)>;
-
- def : Pat<(v8i16 (bitconvert (v2f64 QPR:$src))), (v8i16 QPR:$src)>;
- def : Pat<(v8i16 (bitconvert (v2i64 QPR:$src))), (v8i16 QPR:$src)>;
- def : Pat<(v8i16 (bitconvert (v4f32 QPR:$src))), (v8i16 QPR:$src)>;
- def : Pat<(v8i16 (bitconvert (v4i32 QPR:$src))), (v8i16 QPR:$src)>;
- def : Pat<(v8i16 (bitconvert (v16i8 QPR:$src))), (v8i16 QPR:$src)>;
-
- def : Pat<(v16i8 (bitconvert (v2f64 QPR:$src))), (v16i8 QPR:$src)>;
- def : Pat<(v16i8 (bitconvert (v2i64 QPR:$src))), (v16i8 QPR:$src)>;
- def : Pat<(v16i8 (bitconvert (v4f32 QPR:$src))), (v16i8 QPR:$src)>;
- def : Pat<(v16i8 (bitconvert (v4i32 QPR:$src))), (v16i8 QPR:$src)>;
- def : Pat<(v16i8 (bitconvert (v8f16 QPR:$src))), (v16i8 QPR:$src)>;
- def : Pat<(v16i8 (bitconvert (v8i16 QPR:$src))), (v16i8 QPR:$src)>;
+ def : Pat<(v2f64 (bitconvert (v4f32 MQPR:$src))), (v2f64 MQPR:$src)>;
+ def : Pat<(v2f64 (bitconvert (v4i32 MQPR:$src))), (v2f64 MQPR:$src)>;
+ def : Pat<(v2f64 (bitconvert (v8f16 MQPR:$src))), (v2f64 MQPR:$src)>;
+ def : Pat<(v2f64 (bitconvert (v8i16 MQPR:$src))), (v2f64 MQPR:$src)>;
+ def : Pat<(v2f64 (bitconvert (v16i8 MQPR:$src))), (v2f64 MQPR:$src)>;
+
+ def : Pat<(v2i64 (bitconvert (v4f32 MQPR:$src))), (v2i64 MQPR:$src)>;
+ def : Pat<(v2i64 (bitconvert (v4i32 MQPR:$src))), (v2i64 MQPR:$src)>;
+ def : Pat<(v2i64 (bitconvert (v8f16 MQPR:$src))), (v2i64 MQPR:$src)>;
+ def : Pat<(v2i64 (bitconvert (v8i16 MQPR:$src))), (v2i64 MQPR:$src)>;
+ def : Pat<(v2i64 (bitconvert (v16i8 MQPR:$src))), (v2i64 MQPR:$src)>;
+
+ def : Pat<(v4f32 (bitconvert (v2f64 MQPR:$src))), (v4f32 MQPR:$src)>;
+ def : Pat<(v4f32 (bitconvert (v2i64 MQPR:$src))), (v4f32 MQPR:$src)>;
+ def : Pat<(v4f32 (bitconvert (v8f16 MQPR:$src))), (v4f32 MQPR:$src)>;
+ def : Pat<(v4f32 (bitconvert (v8i16 MQPR:$src))), (v4f32 MQPR:$src)>;
+ def : Pat<(v4f32 (bitconvert (v16i8 MQPR:$src))), (v4f32 MQPR:$src)>;
+
+ def : Pat<(v4i32 (bitconvert (v2f64 MQPR:$src))), (v4i32 MQPR:$src)>;
+ def : Pat<(v4i32 (bitconvert (v2i64 MQPR:$src))), (v4i32 MQPR:$src)>;
+ def : Pat<(v4i32 (bitconvert (v8f16 MQPR:$src))), (v4i32 MQPR:$src)>;
+ def : Pat<(v4i32 (bitconvert (v8i16 MQPR:$src))), (v4i32 MQPR:$src)>;
+ def : Pat<(v4i32 (bitconvert (v16i8 MQPR:$src))), (v4i32 MQPR:$src)>;
+
+ def : Pat<(v8f16 (bitconvert (v2f64 MQPR:$src))), (v8f16 MQPR:$src)>;
+ def : Pat<(v8f16 (bitconvert (v2i64 MQPR:$src))), (v8f16 MQPR:$src)>;
+ def : Pat<(v8f16 (bitconvert (v4f32 MQPR:$src))), (v8f16 MQPR:$src)>;
+ def : Pat<(v8f16 (bitconvert (v4i32 MQPR:$src))), (v8f16 MQPR:$src)>;
+ def : Pat<(v8f16 (bitconvert (v16i8 MQPR:$src))), (v8f16 MQPR:$src)>;
+
+ def : Pat<(v8i16 (bitconvert (v2f64 MQPR:$src))), (v8i16 MQPR:$src)>;
+ def : Pat<(v8i16 (bitconvert (v2i64 MQPR:$src))), (v8i16 MQPR:$src)>;
+ def : Pat<(v8i16 (bitconvert (v4f32 MQPR:$src))), (v8i16 MQPR:$src)>;
+ def : Pat<(v8i16 (bitconvert (v4i32 MQPR:$src))), (v8i16 MQPR:$src)>;
+ def : Pat<(v8i16 (bitconvert (v16i8 MQPR:$src))), (v8i16 MQPR:$src)>;
+
+ def : Pat<(v16i8 (bitconvert (v2f64 MQPR:$src))), (v16i8 MQPR:$src)>;
+ def : Pat<(v16i8 (bitconvert (v2i64 MQPR:$src))), (v16i8 MQPR:$src)>;
+ def : Pat<(v16i8 (bitconvert (v4f32 MQPR:$src))), (v16i8 MQPR:$src)>;
+ def : Pat<(v16i8 (bitconvert (v4i32 MQPR:$src))), (v16i8 MQPR:$src)>;
+ def : Pat<(v16i8 (bitconvert (v8f16 MQPR:$src))), (v16i8 MQPR:$src)>;
+ def : Pat<(v16i8 (bitconvert (v8i16 MQPR:$src))), (v16i8 MQPR:$src)>;
+}
+
+let Predicates = [IsBE,HasMVEInt] in {
+ def : Pat<(v2f64 (bitconvert (v4f32 MQPR:$src))), (v2f64 (MVE_VREV64_32 MQPR:$src))>;
+ def : Pat<(v2f64 (bitconvert (v4i32 MQPR:$src))), (v2f64 (MVE_VREV64_32 MQPR:$src))>;
+ def : Pat<(v2f64 (bitconvert (v8f16 MQPR:$src))), (v2f64 (MVE_VREV64_16 MQPR:$src))>;
+ def : Pat<(v2f64 (bitconvert (v8i16 MQPR:$src))), (v2f64 (MVE_VREV64_16 MQPR:$src))>;
+ def : Pat<(v2f64 (bitconvert (v16i8 MQPR:$src))), (v2f64 (MVE_VREV64_8 MQPR:$src))>;
+
+ def : Pat<(v2i64 (bitconvert (v4f32 MQPR:$src))), (v2i64 (MVE_VREV64_32 MQPR:$src))>;
+ def : Pat<(v2i64 (bitconvert (v4i32 MQPR:$src))), (v2i64 (MVE_VREV64_32 MQPR:$src))>;
+ def : Pat<(v2i64 (bitconvert (v8f16 MQPR:$src))), (v2i64 (MVE_VREV64_16 MQPR:$src))>;
+ def : Pat<(v2i64 (bitconvert (v8i16 MQPR:$src))), (v2i64 (MVE_VREV64_16 MQPR:$src))>;
+ def : Pat<(v2i64 (bitconvert (v16i8 MQPR:$src))), (v2i64 (MVE_VREV64_8 MQPR:$src))>;
+
+ def : Pat<(v4f32 (bitconvert (v2f64 MQPR:$src))), (v4f32 (MVE_VREV64_32 MQPR:$src))>;
+ def : Pat<(v4f32 (bitconvert (v2i64 MQPR:$src))), (v4f32 (MVE_VREV64_32 MQPR:$src))>;
+ def : Pat<(v4f32 (bitconvert (v8f16 MQPR:$src))), (v4f32 (MVE_VREV32_16 MQPR:$src))>;
+ def : Pat<(v4f32 (bitconvert (v8i16 MQPR:$src))), (v4f32 (MVE_VREV32_16 MQPR:$src))>;
+ def : Pat<(v4f32 (bitconvert (v16i8 MQPR:$src))), (v4f32 (MVE_VREV32_8 MQPR:$src))>;
+
+ def : Pat<(v4i32 (bitconvert (v2f64 MQPR:$src))), (v4i32 (MVE_VREV64_32 MQPR:$src))>;
+ def : Pat<(v4i32 (bitconvert (v2i64 MQPR:$src))), (v4i32 (MVE_VREV64_32 MQPR:$src))>;
+ def : Pat<(v4i32 (bitconvert (v8f16 MQPR:$src))), (v4i32 (MVE_VREV32_16 MQPR:$src))>;
+ def : Pat<(v4i32 (bitconvert (v8i16 MQPR:$src))), (v4i32 (MVE_VREV32_16 MQPR:$src))>;
+ def : Pat<(v4i32 (bitconvert (v16i8 MQPR:$src))), (v4i32 (MVE_VREV32_8 MQPR:$src))>;
+
+ def : Pat<(v8f16 (bitconvert (v2f64 MQPR:$src))), (v8f16 (MVE_VREV64_16 MQPR:$src))>;
+ def : Pat<(v8f16 (bitconvert (v2i64 MQPR:$src))), (v8f16 (MVE_VREV64_16 MQPR:$src))>;
+ def : Pat<(v8f16 (bitconvert (v4f32 MQPR:$src))), (v8f16 (MVE_VREV32_16 MQPR:$src))>;
+ def : Pat<(v8f16 (bitconvert (v4i32 MQPR:$src))), (v8f16 (MVE_VREV32_16 MQPR:$src))>;
+ def : Pat<(v8f16 (bitconvert (v16i8 MQPR:$src))), (v8f16 (MVE_VREV16_8 MQPR:$src))>;
+
+ def : Pat<(v8i16 (bitconvert (v2f64 MQPR:$src))), (v8i16 (MVE_VREV64_16 MQPR:$src))>;
+ def : Pat<(v8i16 (bitconvert (v2i64 MQPR:$src))), (v8i16 (MVE_VREV64_16 MQPR:$src))>;
+ def : Pat<(v8i16 (bitconvert (v4f32 MQPR:$src))), (v8i16 (MVE_VREV32_16 MQPR:$src))>;
+ def : Pat<(v8i16 (bitconvert (v4i32 MQPR:$src))), (v8i16 (MVE_VREV32_16 MQPR:$src))>;
+ def : Pat<(v8i16 (bitconvert (v16i8 MQPR:$src))), (v8i16 (MVE_VREV16_8 MQPR:$src))>;
+
+ def : Pat<(v16i8 (bitconvert (v2f64 MQPR:$src))), (v16i8 (MVE_VREV64_8 MQPR:$src))>;
+ def : Pat<(v16i8 (bitconvert (v2i64 MQPR:$src))), (v16i8 (MVE_VREV64_8 MQPR:$src))>;
+ def : Pat<(v16i8 (bitconvert (v4f32 MQPR:$src))), (v16i8 (MVE_VREV32_8 MQPR:$src))>;
+ def : Pat<(v16i8 (bitconvert (v4i32 MQPR:$src))), (v16i8 (MVE_VREV32_8 MQPR:$src))>;
+ def : Pat<(v16i8 (bitconvert (v8f16 MQPR:$src))), (v16i8 (MVE_VREV16_8 MQPR:$src))>;
+ def : Pat<(v16i8 (bitconvert (v8i16 MQPR:$src))), (v16i8 (MVE_VREV16_8 MQPR:$src))>;
}
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td
index 806681df102c..60ca92e58041 100644
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -15,22 +15,22 @@
// NEON-specific Operands.
//===----------------------------------------------------------------------===//
def nModImm : Operand<i32> {
- let PrintMethod = "printNEONModImmOperand";
+ let PrintMethod = "printVMOVModImmOperand";
}
def nImmSplatI8AsmOperand : AsmOperandClass { let Name = "NEONi8splat"; }
def nImmSplatI8 : Operand<i32> {
- let PrintMethod = "printNEONModImmOperand";
+ let PrintMethod = "printVMOVModImmOperand";
let ParserMatchClass = nImmSplatI8AsmOperand;
}
def nImmSplatI16AsmOperand : AsmOperandClass { let Name = "NEONi16splat"; }
def nImmSplatI16 : Operand<i32> {
- let PrintMethod = "printNEONModImmOperand";
+ let PrintMethod = "printVMOVModImmOperand";
let ParserMatchClass = nImmSplatI16AsmOperand;
}
def nImmSplatI32AsmOperand : AsmOperandClass { let Name = "NEONi32splat"; }
def nImmSplatI32 : Operand<i32> {
- let PrintMethod = "printNEONModImmOperand";
+ let PrintMethod = "printVMOVModImmOperand";
let ParserMatchClass = nImmSplatI32AsmOperand;
}
def nImmSplatNotI16AsmOperand : AsmOperandClass { let Name = "NEONi16splatNot"; }
@@ -43,7 +43,7 @@ def nImmSplatNotI32 : Operand<i32> {
}
def nImmVMOVI32AsmOperand : AsmOperandClass { let Name = "NEONi32vmov"; }
def nImmVMOVI32 : Operand<i32> {
- let PrintMethod = "printNEONModImmOperand";
+ let PrintMethod = "printVMOVModImmOperand";
let ParserMatchClass = nImmVMOVI32AsmOperand;
}
@@ -62,18 +62,18 @@ class nImmVINVIAsmOperandReplicate<ValueType From, ValueType To>
}
class nImmVMOVIReplicate<ValueType From, ValueType To> : Operand<i32> {
- let PrintMethod = "printNEONModImmOperand";
+ let PrintMethod = "printVMOVModImmOperand";
let ParserMatchClass = nImmVMOVIAsmOperandReplicate<From, To>;
}
class nImmVINVIReplicate<ValueType From, ValueType To> : Operand<i32> {
- let PrintMethod = "printNEONModImmOperand";
+ let PrintMethod = "printVMOVModImmOperand";
let ParserMatchClass = nImmVINVIAsmOperandReplicate<From, To>;
}
def nImmVMOVI32NegAsmOperand : AsmOperandClass { let Name = "NEONi32vmovNeg"; }
def nImmVMOVI32Neg : Operand<i32> {
- let PrintMethod = "printNEONModImmOperand";
+ let PrintMethod = "printVMOVModImmOperand";
let ParserMatchClass = nImmVMOVI32NegAsmOperand;
}
def nImmVMOVF32 : Operand<i32> {
@@ -82,7 +82,7 @@ def nImmVMOVF32 : Operand<i32> {
}
def nImmSplatI64AsmOperand : AsmOperandClass { let Name = "NEONi64splat"; }
def nImmSplatI64 : Operand<i32> {
- let PrintMethod = "printNEONModImmOperand";
+ let PrintMethod = "printVMOVModImmOperand";
let ParserMatchClass = nImmSplatI64AsmOperand;
}
@@ -478,20 +478,8 @@ def non_word_alignedstore : PatFrag<(ops node:$val, node:$ptr),
// NEON-specific DAG Nodes.
//===----------------------------------------------------------------------===//
-def SDTARMVCMP : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisSameAs<1, 2>]>;
-def SDTARMVCMPZ : SDTypeProfile<1, 1, []>;
-
-def NEONvceq : SDNode<"ARMISD::VCEQ", SDTARMVCMP>;
-def NEONvceqz : SDNode<"ARMISD::VCEQZ", SDTARMVCMPZ>;
-def NEONvcge : SDNode<"ARMISD::VCGE", SDTARMVCMP>;
-def NEONvcgez : SDNode<"ARMISD::VCGEZ", SDTARMVCMPZ>;
-def NEONvclez : SDNode<"ARMISD::VCLEZ", SDTARMVCMPZ>;
-def NEONvcgeu : SDNode<"ARMISD::VCGEU", SDTARMVCMP>;
-def NEONvcgt : SDNode<"ARMISD::VCGT", SDTARMVCMP>;
-def NEONvcgtz : SDNode<"ARMISD::VCGTZ", SDTARMVCMPZ>;
-def NEONvcltz : SDNode<"ARMISD::VCLTZ", SDTARMVCMPZ>;
-def NEONvcgtu : SDNode<"ARMISD::VCGTU", SDTARMVCMP>;
-def NEONvtst : SDNode<"ARMISD::VTST", SDTARMVCMP>;
+def SDTARMVTST : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisSameAs<1, 2>]>;
+def NEONvtst : SDNode<"ARMISD::VTST", SDTARMVTST>;
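+// The per-condition VCxx nodes previously defined here are now expressed as
+// ARMvcmp/ARMvcmpz nodes carrying the condition code as an immediate operand
+// (the "(i32 fc)" uses in the patterns below); only VTST keeps its own node.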
// Types for vector shift by immediates. The "SHX" version is for long and
// narrow operations where the source and destination vectors have different
@@ -559,14 +547,14 @@ def NEONvtbl2 : SDNode<"ARMISD::VTBL2", SDTARMVTBL2>;
def NEONimmAllZerosV: PatLeaf<(ARMvmovImm (i32 timm)), [{
ConstantSDNode *ConstVal = cast<ConstantSDNode>(N->getOperand(0));
unsigned EltBits = 0;
- uint64_t EltVal = ARM_AM::decodeNEONModImm(ConstVal->getZExtValue(), EltBits);
+ uint64_t EltVal = ARM_AM::decodeVMOVModImm(ConstVal->getZExtValue(), EltBits);
return (EltBits == 32 && EltVal == 0);
}]>;
def NEONimmAllOnesV: PatLeaf<(ARMvmovImm (i32 timm)), [{
ConstantSDNode *ConstVal = cast<ConstantSDNode>(N->getOperand(0));
unsigned EltBits = 0;
- uint64_t EltVal = ARM_AM::decodeNEONModImm(ConstVal->getZExtValue(), EltBits);
+ uint64_t EltVal = ARM_AM::decodeVMOVModImm(ConstVal->getZExtValue(), EltBits);
return (EltBits == 8 && EltVal == 0xff);
}]>;
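// A minimal sketch (assuming the renamed helper keeps the old decodeNEONModImm
// signature) of what the two PatLeafs above compute from the modified immediate:
//   unsigned EltBits = 0;
//   uint64_t EltVal = ARM_AM::decodeVMOVModImm(ConstVal->getZExtValue(), EltBits);
//   bool AllZeros = (EltBits == 32 && EltVal == 0);     // splat of i32 0
//   bool AllOnes  = (EltBits == 8  && EltVal == 0xff);  // splat of i8 0xff
// i.e. an all-ones vector is matched as a byte splat rather than a 32-bit splat.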
@@ -3326,30 +3314,30 @@ class N2VCvtQ<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
// source operand element sizes of 8, 16 and 32 bits:
multiclass N2V_QHS_cmp<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
bits<5> op11_7, bit op4, string opc, string Dt,
- string asm, SDNode OpNode> {
+ string asm, int fc> {
// 64-bit vector types.
def v8i8 : N2V<op24_23, op21_20, 0b00, op17_16, op11_7, 0, op4,
(outs DPR:$Vd), (ins DPR:$Vm), NoItinerary,
opc, !strconcat(Dt, "8"), asm, "",
- [(set DPR:$Vd, (v8i8 (OpNode (v8i8 DPR:$Vm))))]>;
+ [(set DPR:$Vd, (v8i8 (ARMvcmpz (v8i8 DPR:$Vm), (i32 fc))))]>;
def v4i16 : N2V<op24_23, op21_20, 0b01, op17_16, op11_7, 0, op4,
(outs DPR:$Vd), (ins DPR:$Vm), NoItinerary,
opc, !strconcat(Dt, "16"), asm, "",
- [(set DPR:$Vd, (v4i16 (OpNode (v4i16 DPR:$Vm))))]>;
+ [(set DPR:$Vd, (v4i16 (ARMvcmpz (v4i16 DPR:$Vm), (i32 fc))))]>;
def v2i32 : N2V<op24_23, op21_20, 0b10, op17_16, op11_7, 0, op4,
(outs DPR:$Vd), (ins DPR:$Vm), NoItinerary,
opc, !strconcat(Dt, "32"), asm, "",
- [(set DPR:$Vd, (v2i32 (OpNode (v2i32 DPR:$Vm))))]>;
+ [(set DPR:$Vd, (v2i32 (ARMvcmpz (v2i32 DPR:$Vm), (i32 fc))))]>;
def v2f32 : N2V<op24_23, op21_20, 0b10, op17_16, op11_7, 0, op4,
(outs DPR:$Vd), (ins DPR:$Vm), NoItinerary,
opc, "f32", asm, "",
- [(set DPR:$Vd, (v2i32 (OpNode (v2f32 DPR:$Vm))))]> {
+ [(set DPR:$Vd, (v2i32 (ARMvcmpz (v2f32 DPR:$Vm), (i32 fc))))]> {
let Inst{10} = 1; // overwrite F = 1
}
def v4f16 : N2V<op24_23, op21_20, 0b01, op17_16, op11_7, 0, op4,
(outs DPR:$Vd), (ins DPR:$Vm), NoItinerary,
opc, "f16", asm, "",
- [(set DPR:$Vd, (v4i16 (OpNode (v4f16 DPR:$Vm))))]>,
+ [(set DPR:$Vd, (v4i16 (ARMvcmpz (v4f16 DPR:$Vm), (i32 fc))))]>,
Requires<[HasNEON,HasFullFP16]> {
let Inst{10} = 1; // overwrite F = 1
}
@@ -3358,30 +3346,83 @@ multiclass N2V_QHS_cmp<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
def v16i8 : N2V<op24_23, op21_20, 0b00, op17_16, op11_7, 1, op4,
(outs QPR:$Vd), (ins QPR:$Vm), NoItinerary,
opc, !strconcat(Dt, "8"), asm, "",
- [(set QPR:$Vd, (v16i8 (OpNode (v16i8 QPR:$Vm))))]>;
+ [(set QPR:$Vd, (v16i8 (ARMvcmpz (v16i8 QPR:$Vm), (i32 fc))))]>;
def v8i16 : N2V<op24_23, op21_20, 0b01, op17_16, op11_7, 1, op4,
(outs QPR:$Vd), (ins QPR:$Vm), NoItinerary,
opc, !strconcat(Dt, "16"), asm, "",
- [(set QPR:$Vd, (v8i16 (OpNode (v8i16 QPR:$Vm))))]>;
+ [(set QPR:$Vd, (v8i16 (ARMvcmpz (v8i16 QPR:$Vm), (i32 fc))))]>;
def v4i32 : N2V<op24_23, op21_20, 0b10, op17_16, op11_7, 1, op4,
(outs QPR:$Vd), (ins QPR:$Vm), NoItinerary,
opc, !strconcat(Dt, "32"), asm, "",
- [(set QPR:$Vd, (v4i32 (OpNode (v4i32 QPR:$Vm))))]>;
+ [(set QPR:$Vd, (v4i32 (ARMvcmpz (v4i32 QPR:$Vm), (i32 fc))))]>;
def v4f32 : N2V<op24_23, op21_20, 0b10, op17_16, op11_7, 1, op4,
(outs QPR:$Vd), (ins QPR:$Vm), NoItinerary,
opc, "f32", asm, "",
- [(set QPR:$Vd, (v4i32 (OpNode (v4f32 QPR:$Vm))))]> {
+ [(set QPR:$Vd, (v4i32 (ARMvcmpz (v4f32 QPR:$Vm), (i32 fc))))]> {
let Inst{10} = 1; // overwrite F = 1
}
def v8f16 : N2V<op24_23, op21_20, 0b01, op17_16, op11_7, 1, op4,
(outs QPR:$Vd), (ins QPR:$Vm), NoItinerary,
opc, "f16", asm, "",
- [(set QPR:$Vd, (v8i16 (OpNode (v8f16 QPR:$Vm))))]>,
+ [(set QPR:$Vd, (v8i16 (ARMvcmpz (v8f16 QPR:$Vm), (i32 fc))))]>,
Requires<[HasNEON,HasFullFP16]> {
let Inst{10} = 1; // overwrite F = 1
}
}
+// Neon 3-register comparisons.
+class N3VQ_cmp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+ InstrItinClass itin, string OpcodeStr, string Dt,
+ ValueType ResTy, ValueType OpTy, int fc, bit Commutable>
+ : N3V<op24, op23, op21_20, op11_8, 1, op4,
+ (outs QPR:$Vd), (ins QPR:$Vn, QPR:$Vm), N3RegFrm, itin,
+ OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
+ [(set QPR:$Vd, (ResTy (ARMvcmp (OpTy QPR:$Vn), (OpTy QPR:$Vm), (i32 fc))))]> {
+ // All of these have a two-operand InstAlias.
+ let TwoOperandAliasConstraint = "$Vn = $Vd";
+ let isCommutable = Commutable;
+}
+
+class N3VD_cmp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+ InstrItinClass itin, string OpcodeStr, string Dt,
+ ValueType ResTy, ValueType OpTy, int fc, bit Commutable>
+ : N3V<op24, op23, op21_20, op11_8, 0, op4,
+ (outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
+ OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
+ [(set DPR:$Vd, (ResTy (ARMvcmp (OpTy DPR:$Vn), (OpTy DPR:$Vm), (i32 fc))))]> {
+ // All of these have a two-operand InstAlias.
+ let TwoOperandAliasConstraint = "$Vn = $Vd";
+ let isCommutable = Commutable;
+}
+
+multiclass N3V_QHS_cmp<bit op24, bit op23, bits<4> op11_8, bit op4,
+ InstrItinClass itinD16, InstrItinClass itinD32,
+ InstrItinClass itinQ16, InstrItinClass itinQ32,
+ string OpcodeStr, string Dt,
+ int fc, bit Commutable = 0> {
+ // 64-bit vector types.
+ def v8i8 : N3VD_cmp<op24, op23, 0b00, op11_8, op4, itinD16,
+ OpcodeStr, !strconcat(Dt, "8"),
+ v8i8, v8i8, fc, Commutable>;
+ def v4i16 : N3VD_cmp<op24, op23, 0b01, op11_8, op4, itinD16,
+ OpcodeStr, !strconcat(Dt, "16"),
+ v4i16, v4i16, fc, Commutable>;
+ def v2i32 : N3VD_cmp<op24, op23, 0b10, op11_8, op4, itinD32,
+ OpcodeStr, !strconcat(Dt, "32"),
+ v2i32, v2i32, fc, Commutable>;
+
+ // 128-bit vector types.
+ def v16i8 : N3VQ_cmp<op24, op23, 0b00, op11_8, op4, itinQ16,
+ OpcodeStr, !strconcat(Dt, "8"),
+ v16i8, v16i8, fc, Commutable>;
+ def v8i16 : N3VQ_cmp<op24, op23, 0b01, op11_8, op4, itinQ16,
+ OpcodeStr, !strconcat(Dt, "16"),
+ v8i16, v8i16, fc, Commutable>;
+ def v4i32 : N3VQ_cmp<op24, op23, 0b10, op11_8, op4, itinQ32,
+ OpcodeStr, !strconcat(Dt, "32"),
+ v4i32, v4i32, fc, Commutable>;
+}
+
// Neon 2-register vector intrinsics,
// element sizes of 8, 16 and 32 bits:
@@ -5026,67 +5067,67 @@ def : Pat<(v2i32 (trunc (ARMvshruImm (sub (v2i64 QPR:$Vn), QPR:$Vm), 32))),
// Vector Comparisons.
// VCEQ : Vector Compare Equal
-defm VCEQ : N3V_QHS<1, 0, 0b1000, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q,
- IIC_VSUBi4Q, "vceq", "i", NEONvceq, 1>;
-def VCEQfd : N3VD<0,0,0b00,0b1110,0, IIC_VBIND, "vceq", "f32", v2i32, v2f32,
- NEONvceq, 1>;
-def VCEQfq : N3VQ<0,0,0b00,0b1110,0, IIC_VBINQ, "vceq", "f32", v4i32, v4f32,
- NEONvceq, 1>;
-def VCEQhd : N3VD<0,0,0b01,0b1110,0, IIC_VBIND, "vceq", "f16", v4i16, v4f16,
- NEONvceq, 1>,
+defm VCEQ : N3V_QHS_cmp<1, 0, 0b1000, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q,
+ IIC_VSUBi4Q, "vceq", "i", 0, 1>;
+def VCEQfd : N3VD_cmp<0,0,0b00,0b1110,0, IIC_VBIND, "vceq", "f32", v2i32, v2f32,
+ 0, 1>;
+def VCEQfq : N3VQ_cmp<0,0,0b00,0b1110,0, IIC_VBINQ, "vceq", "f32", v4i32, v4f32,
+ 0, 1>;
+def VCEQhd : N3VD_cmp<0,0,0b01,0b1110,0, IIC_VBIND, "vceq", "f16", v4i16, v4f16,
+ 0, 1>,
Requires<[HasNEON, HasFullFP16]>;
-def VCEQhq : N3VQ<0,0,0b01,0b1110,0, IIC_VBINQ, "vceq", "f16", v8i16, v8f16,
- NEONvceq, 1>,
+def VCEQhq : N3VQ_cmp<0,0,0b01,0b1110,0, IIC_VBINQ, "vceq", "f16", v8i16, v8f16,
+ 0, 1>,
Requires<[HasNEON, HasFullFP16]>;
let TwoOperandAliasConstraint = "$Vm = $Vd" in
defm VCEQz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00010, 0, "vceq", "i",
- "$Vd, $Vm, #0", NEONvceqz>;
+ "$Vd, $Vm, #0", 0>;
// VCGE : Vector Compare Greater Than or Equal
-defm VCGEs : N3V_QHS<0, 0, 0b0011, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q,
- IIC_VSUBi4Q, "vcge", "s", NEONvcge, 0>;
-defm VCGEu : N3V_QHS<1, 0, 0b0011, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q,
- IIC_VSUBi4Q, "vcge", "u", NEONvcgeu, 0>;
-def VCGEfd : N3VD<1,0,0b00,0b1110,0, IIC_VBIND, "vcge", "f32", v2i32, v2f32,
- NEONvcge, 0>;
-def VCGEfq : N3VQ<1,0,0b00,0b1110,0, IIC_VBINQ, "vcge", "f32", v4i32, v4f32,
- NEONvcge, 0>;
-def VCGEhd : N3VD<1,0,0b01,0b1110,0, IIC_VBIND, "vcge", "f16", v4i16, v4f16,
- NEONvcge, 0>,
+defm VCGEs : N3V_QHS_cmp<0, 0, 0b0011, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q,
+ IIC_VSUBi4Q, "vcge", "s", 10, 0>;
+defm VCGEu : N3V_QHS_cmp<1, 0, 0b0011, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q,
+ IIC_VSUBi4Q, "vcge", "u", 2, 0>;
+def VCGEfd : N3VD_cmp<1,0,0b00,0b1110,0, IIC_VBIND, "vcge", "f32", v2i32, v2f32,
+ 10, 0>;
+def VCGEfq : N3VQ_cmp<1,0,0b00,0b1110,0, IIC_VBINQ, "vcge", "f32", v4i32, v4f32,
+ 10, 0>;
+def VCGEhd : N3VD_cmp<1,0,0b01,0b1110,0, IIC_VBIND, "vcge", "f16", v4i16, v4f16,
+ 10, 0>,
Requires<[HasNEON, HasFullFP16]>;
-def VCGEhq : N3VQ<1,0,0b01,0b1110,0, IIC_VBINQ, "vcge", "f16", v8i16, v8f16,
- NEONvcge, 0>,
+def VCGEhq : N3VQ_cmp<1,0,0b01,0b1110,0, IIC_VBINQ, "vcge", "f16", v8i16, v8f16,
+ 10, 0>,
Requires<[HasNEON, HasFullFP16]>;
let TwoOperandAliasConstraint = "$Vm = $Vd" in {
defm VCGEz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00001, 0, "vcge", "s",
- "$Vd, $Vm, #0", NEONvcgez>;
+ "$Vd, $Vm, #0", 10>;
defm VCLEz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00011, 0, "vcle", "s",
- "$Vd, $Vm, #0", NEONvclez>;
+ "$Vd, $Vm, #0", 13>;
}
// VCGT : Vector Compare Greater Than
-defm VCGTs : N3V_QHS<0, 0, 0b0011, 0, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q,
- IIC_VSUBi4Q, "vcgt", "s", NEONvcgt, 0>;
-defm VCGTu : N3V_QHS<1, 0, 0b0011, 0, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q,
- IIC_VSUBi4Q, "vcgt", "u", NEONvcgtu, 0>;
-def VCGTfd : N3VD<1,0,0b10,0b1110,0, IIC_VBIND, "vcgt", "f32", v2i32, v2f32,
- NEONvcgt, 0>;
-def VCGTfq : N3VQ<1,0,0b10,0b1110,0, IIC_VBINQ, "vcgt", "f32", v4i32, v4f32,
- NEONvcgt, 0>;
-def VCGThd : N3VD<1,0,0b11,0b1110,0, IIC_VBIND, "vcgt", "f16", v4i16, v4f16,
- NEONvcgt, 0>,
+defm VCGTs : N3V_QHS_cmp<0, 0, 0b0011, 0, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q,
+ IIC_VSUBi4Q, "vcgt", "s", 12, 0>;
+defm VCGTu : N3V_QHS_cmp<1, 0, 0b0011, 0, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q,
+ IIC_VSUBi4Q, "vcgt", "u", 8, 0>;
+def VCGTfd : N3VD_cmp<1,0,0b10,0b1110,0, IIC_VBIND, "vcgt", "f32", v2i32, v2f32,
+ 12, 0>;
+def VCGTfq : N3VQ_cmp<1,0,0b10,0b1110,0, IIC_VBINQ, "vcgt", "f32", v4i32, v4f32,
+ 12, 0>;
+def VCGThd : N3VD_cmp<1,0,0b11,0b1110,0, IIC_VBIND, "vcgt", "f16", v4i16, v4f16,
+ 12, 0>,
Requires<[HasNEON, HasFullFP16]>;
-def VCGThq : N3VQ<1,0,0b11,0b1110,0, IIC_VBINQ, "vcgt", "f16", v8i16, v8f16,
- NEONvcgt, 0>,
+def VCGThq : N3VQ_cmp<1,0,0b11,0b1110,0, IIC_VBINQ, "vcgt", "f16", v8i16, v8f16,
+ 12, 0>,
Requires<[HasNEON, HasFullFP16]>;
let TwoOperandAliasConstraint = "$Vm = $Vd" in {
defm VCGTz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00000, 0, "vcgt", "s",
- "$Vd, $Vm, #0", NEONvcgtz>;
+ "$Vd, $Vm, #0", 12>;
defm VCLTz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00100, 0, "vclt", "s",
- "$Vd, $Vm, #0", NEONvcltz>;
+ "$Vd, $Vm, #0", 11>;
}
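// The integer fc operands threaded through these comparison patterns appear to
// be ARMCC::CondCodes values; a sketch of the assumed mapping used above:
//   vceq        ->  0 (EQ)    vcge.s / vcge #0 -> 10 (GE)    vcge.u -> 2 (HS)
//   vcgt.s / #0 -> 12 (GT)    vcgt.u           ->  8 (HI)
//   vcle #0     -> 13 (LE)    vclt #0          -> 11 (LT)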
// VACGE : Vector Absolute Compare Greater Than or Equal (aka VCAGE)
diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td
index cfeb13c6acb6..18bcbda44580 100644
--- a/lib/Target/ARM/ARMInstrThumb.td
+++ b/lib/Target/ARM/ARMInstrThumb.td
@@ -565,6 +565,13 @@ let isCall = 1,
4, IIC_Br,
[(ARMcall_nolink tGPR:$func)]>,
Requires<[IsThumb, IsThumb1Only]>, Sched<[WriteBr]>;
+
+ // Also used for Thumb2
+ // push lr before the call
+ def tBL_PUSHLR : tPseudoInst<(outs), (ins GPRlr:$ra, pred:$p, thumb_bl_target:$func),
+ 4, IIC_Br,
+ []>,
+ Requires<[IsThumb]>, Sched<[WriteBr]>;
}
let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
@@ -592,6 +599,7 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
[(ARMbrjt tGPR:$target, tjumptable:$jt)]>,
Sched<[WriteBrTbl]> {
let Size = 2;
+ let isNotDuplicable = 1;
list<Predicate> Predicates = [IsThumb, IsThumb1Only];
}
}
@@ -1362,6 +1370,12 @@ let hasPostISelHook = 1, Defs = [CPSR] in {
[(set tGPR:$Rd, CPSR, (ARMsubc 0, tGPR:$Rn))]>,
Requires<[IsThumb1Only]>,
Sched<[WriteALU]>;
+
+ def tLSLSri : tPseudoInst<(outs tGPR:$Rd), (ins tGPR:$Rn, imm0_31:$imm5),
+ 2, IIC_iALUr,
+ [(set tGPR:$Rd, CPSR, (ARMlsls tGPR:$Rn, imm0_31:$imm5))]>,
+ Requires<[IsThumb1Only]>,
+ Sched<[WriteALU]>;
}
@@ -1465,7 +1479,7 @@ def tLEApcrelJT : tPseudoInst<(outs tGPR:$Rd),
// Thumb-1 doesn't have the TBB or TBH instructions, but we can synthesize them
// and make use of the same compressed jump table format as Thumb-2.
let Size = 2, isBranch = 1, isTerminator = 1, isBarrier = 1,
- isIndirectBranch = 1 in {
+ isIndirectBranch = 1, isNotDuplicable = 1 in {
def tTBB_JT : tPseudoInst<(outs),
(ins tGPRwithpc:$base, tGPR:$index, i32imm:$jt, i32imm:$pclbl), 0,
IIC_Br, []>, Sched<[WriteBr]>;
diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td
index 7cbfaba7a8eb..25a45b39fa0c 100644
--- a/lib/Target/ARM/ARMInstrThumb2.td
+++ b/lib/Target/ARM/ARMInstrThumb2.td
@@ -45,7 +45,8 @@ def mve_shift_imm : AsmOperandClass {
let RenderMethod = "addImmOperands";
let DiagnosticString = "operand must be an immediate in the range [1,32]";
}
-def long_shift : Operand<i32> {
+def long_shift : Operand<i32>,
+ ImmLeaf<i32, [{ return Imm > 0 && Imm <= 32; }]> {
let ParserMatchClass = mve_shift_imm;
let DecoderMethod = "DecodeLongShiftOperand";
}
@@ -2394,6 +2395,23 @@ def : Thumb2DSPPat<(int_arm_qadd(int_arm_qadd rGPR:$Rm, rGPR:$Rm), rGPR:$Rn),
def : Thumb2DSPPat<(int_arm_qsub rGPR:$Rm, (int_arm_qadd rGPR:$Rn, rGPR:$Rn)),
(t2QDSUB rGPR:$Rm, rGPR:$Rn)>;
+def : Thumb2DSPPat<(saddsat rGPR:$Rm, rGPR:$Rn),
+ (t2QADD rGPR:$Rm, rGPR:$Rn)>;
+def : Thumb2DSPPat<(ssubsat rGPR:$Rm, rGPR:$Rn),
+ (t2QSUB rGPR:$Rm, rGPR:$Rn)>;
+def : Thumb2DSPPat<(saddsat(saddsat rGPR:$Rm, rGPR:$Rm), rGPR:$Rn),
+ (t2QDADD rGPR:$Rm, rGPR:$Rn)>;
+def : Thumb2DSPPat<(ssubsat rGPR:$Rm, (saddsat rGPR:$Rn, rGPR:$Rn)),
+ (t2QDSUB rGPR:$Rm, rGPR:$Rn)>;
+def : Thumb2DSPPat<(ARMqadd8b rGPR:$Rm, rGPR:$Rn),
+ (t2QADD8 rGPR:$Rm, rGPR:$Rn)>;
+def : Thumb2DSPPat<(ARMqsub8b rGPR:$Rm, rGPR:$Rn),
+ (t2QSUB8 rGPR:$Rm, rGPR:$Rn)>;
+def : Thumb2DSPPat<(ARMqadd16b rGPR:$Rm, rGPR:$Rn),
+ (t2QADD16 rGPR:$Rm, rGPR:$Rn)>;
+def : Thumb2DSPPat<(ARMqsub16b rGPR:$Rm, rGPR:$Rn),
+ (t2QSUB16 rGPR:$Rm, rGPR:$Rn)>;
+
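// A minimal sketch (not from this patch) of the scalar idiom behind the generic
// saddsat node, which the new t2QADD pattern above selects; it assumes the
// optimizer recognizes the clamp as a saturating add:
//   int32_t sat_add(int32_t a, int32_t b) {
//     int64_t s = (int64_t)a + b;          // widen so the sum cannot overflow
//     if (s > INT32_MAX) s = INT32_MAX;    // clamp high
//     if (s < INT32_MIN) s = INT32_MIN;    // clamp low
//     return (int32_t)s;
//   }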
// Signed/Unsigned add/subtract
def t2SASX : T2I_pam_intrinsics<0b010, 0b0000, "sasx", int_arm_sasx>;
@@ -4085,7 +4103,7 @@ def t2LDRpci_pic : PseudoInst<(outs rGPR:$dst), (ins i32imm:$addr, pclabel:$cp),
// Pseudo instruction that combines movs + predicated rsbmi
// to implement integer ABS
-let usesCustomInserter = 1, Defs = [CPSR] in {
+let usesCustomInserter = 1, Defs = [CPSR], hasNoSchedulingInfo = 1 in {
def t2ABS : PseudoInst<(outs rGPR:$dst), (ins rGPR:$src),
NoItinerary, []>, Requires<[IsThumb2]>;
}
@@ -4175,15 +4193,15 @@ multiclass t2LdStCop<bits<4> op31_28, bit load, bit Dbit, string asm, list<dag>
}
let DecoderNamespace = "Thumb2CoProc" in {
-defm t2LDC : t2LdStCop<0b1110, 1, 0, "ldc", [(int_arm_ldc imm:$cop, imm:$CRd, addrmode5:$addr)]>;
-defm t2LDCL : t2LdStCop<0b1110, 1, 1, "ldcl", [(int_arm_ldcl imm:$cop, imm:$CRd, addrmode5:$addr)]>;
-defm t2LDC2 : t2LdStCop<0b1111, 1, 0, "ldc2", [(int_arm_ldc2 imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8,IsThumb2]>;
-defm t2LDC2L : t2LdStCop<0b1111, 1, 1, "ldc2l", [(int_arm_ldc2l imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8,IsThumb2]>;
+defm t2LDC : t2LdStCop<0b1110, 1, 0, "ldc", [(int_arm_ldc timm:$cop, timm:$CRd, addrmode5:$addr)]>;
+defm t2LDCL : t2LdStCop<0b1110, 1, 1, "ldcl", [(int_arm_ldcl timm:$cop, timm:$CRd, addrmode5:$addr)]>;
+defm t2LDC2 : t2LdStCop<0b1111, 1, 0, "ldc2", [(int_arm_ldc2 timm:$cop, timm:$CRd, addrmode5:$addr)]>, Requires<[PreV8,IsThumb2]>;
+defm t2LDC2L : t2LdStCop<0b1111, 1, 1, "ldc2l", [(int_arm_ldc2l timm:$cop, timm:$CRd, addrmode5:$addr)]>, Requires<[PreV8,IsThumb2]>;
-defm t2STC : t2LdStCop<0b1110, 0, 0, "stc", [(int_arm_stc imm:$cop, imm:$CRd, addrmode5:$addr)]>;
-defm t2STCL : t2LdStCop<0b1110, 0, 1, "stcl", [(int_arm_stcl imm:$cop, imm:$CRd, addrmode5:$addr)]>;
-defm t2STC2 : t2LdStCop<0b1111, 0, 0, "stc2", [(int_arm_stc2 imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8,IsThumb2]>;
-defm t2STC2L : t2LdStCop<0b1111, 0, 1, "stc2l", [(int_arm_stc2l imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8,IsThumb2]>;
+defm t2STC : t2LdStCop<0b1110, 0, 0, "stc", [(int_arm_stc timm:$cop, timm:$CRd, addrmode5:$addr)]>;
+defm t2STCL : t2LdStCop<0b1110, 0, 1, "stcl", [(int_arm_stcl timm:$cop, timm:$CRd, addrmode5:$addr)]>;
+defm t2STC2 : t2LdStCop<0b1111, 0, 0, "stc2", [(int_arm_stc2 timm:$cop, timm:$CRd, addrmode5:$addr)]>, Requires<[PreV8,IsThumb2]>;
+defm t2STC2L : t2LdStCop<0b1111, 0, 1, "stc2l", [(int_arm_stc2l timm:$cop, timm:$CRd, addrmode5:$addr)]>, Requires<[PreV8,IsThumb2]>;
}
@@ -4368,8 +4386,8 @@ def t2MCR : t2MovRCopro<0b1110, "mcr", 0,
(outs),
(ins p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
c_imm:$CRm, imm0_7:$opc2),
- [(int_arm_mcr imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn,
- imm:$CRm, imm:$opc2)]>,
+ [(int_arm_mcr timm:$cop, timm:$opc1, GPR:$Rt, timm:$CRn,
+ timm:$CRm, timm:$opc2)]>,
ComplexDeprecationPredicate<"MCR">;
def : t2InstAlias<"mcr${p} $cop, $opc1, $Rt, $CRn, $CRm",
(t2MCR p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
@@ -4377,8 +4395,8 @@ def : t2InstAlias<"mcr${p} $cop, $opc1, $Rt, $CRn, $CRm",
def t2MCR2 : t2MovRCopro<0b1111, "mcr2", 0,
(outs), (ins p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
c_imm:$CRm, imm0_7:$opc2),
- [(int_arm_mcr2 imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn,
- imm:$CRm, imm:$opc2)]> {
+ [(int_arm_mcr2 timm:$cop, timm:$opc1, GPR:$Rt, timm:$CRn,
+ timm:$CRm, timm:$opc2)]> {
let Predicates = [IsThumb2, PreV8];
}
def : t2InstAlias<"mcr2${p} $cop, $opc1, $Rt, $CRn, $CRm",
@@ -4402,24 +4420,24 @@ def : t2InstAlias<"mrc2${p} $cop, $opc1, $Rt, $CRn, $CRm",
(t2MRC2 GPRwithAPSR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
c_imm:$CRm, 0, pred:$p)>;
-def : T2v6Pat<(int_arm_mrc imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2),
- (t2MRC imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>;
+def : T2v6Pat<(int_arm_mrc timm:$cop, timm:$opc1, timm:$CRn, timm:$CRm, timm:$opc2),
+ (t2MRC p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2)>;
-def : T2v6Pat<(int_arm_mrc2 imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2),
- (t2MRC2 imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>;
+def : T2v6Pat<(int_arm_mrc2 timm:$cop, timm:$opc1, timm:$CRn, timm:$CRm, timm:$opc2),
+ (t2MRC2 p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2)>;
/* from ARM core register to coprocessor */
def t2MCRR : t2MovRRCopro<0b1110, "mcrr", 0, (outs),
(ins p_imm:$cop, imm0_15:$opc1, GPR:$Rt, GPR:$Rt2,
c_imm:$CRm),
- [(int_arm_mcrr imm:$cop, imm:$opc1, GPR:$Rt, GPR:$Rt2,
- imm:$CRm)]>;
+ [(int_arm_mcrr timm:$cop, timm:$opc1, GPR:$Rt, GPR:$Rt2,
+ timm:$CRm)]>;
def t2MCRR2 : t2MovRRCopro<0b1111, "mcrr2", 0, (outs),
(ins p_imm:$cop, imm0_15:$opc1, GPR:$Rt, GPR:$Rt2,
c_imm:$CRm),
- [(int_arm_mcrr2 imm:$cop, imm:$opc1, GPR:$Rt,
- GPR:$Rt2, imm:$CRm)]> {
+ [(int_arm_mcrr2 timm:$cop, timm:$opc1, GPR:$Rt,
+ GPR:$Rt2, timm:$CRm)]> {
let Predicates = [IsThumb2, PreV8];
}
@@ -4439,8 +4457,8 @@ def t2MRRC2 : t2MovRRCopro<0b1111, "mrrc2", 1, (outs GPR:$Rt, GPR:$Rt2),
def t2CDP : T2Cop<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1,
c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2),
"cdp", "\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2",
- [(int_arm_cdp imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn,
- imm:$CRm, imm:$opc2)]> {
+ [(int_arm_cdp timm:$cop, timm:$opc1, timm:$CRd, timm:$CRn,
+ timm:$CRm, timm:$opc2)]> {
let Inst{27-24} = 0b1110;
bits<4> opc1;
@@ -4465,8 +4483,8 @@ def t2CDP : T2Cop<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1,
def t2CDP2 : T2Cop<0b1111, (outs), (ins p_imm:$cop, imm0_15:$opc1,
c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2),
"cdp2", "\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2",
- [(int_arm_cdp2 imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn,
- imm:$CRm, imm:$opc2)]> {
+ [(int_arm_cdp2 timm:$cop, timm:$opc1, timm:$CRd, timm:$CRn,
+ timm:$CRm, timm:$opc2)]> {
let Inst{27-24} = 0b1110;
bits<4> opc1;
@@ -5087,6 +5105,7 @@ def t2BF_LabelPseudo
: t2PseudoInst<(outs ), (ins pclabel:$cp), 0, NoItinerary, []> {
let isTerminator = 1;
let Predicates = [IsThumb2, HasV8_1MMainline, HasLOB];
+ let hasNoSchedulingInfo = 1;
}
def t2BFi : t2BF<(ins bflabel_u4:$b_label, bflabel_s16:$label, pred:$p),
@@ -5217,11 +5236,13 @@ def t2LoopDec :
t2PseudoInst<(outs GPRlr:$Rm), (ins GPRlr:$Rn, imm0_7:$size),
4, IIC_Br, []>, Sched<[WriteBr]>;
-let isBranch = 1, isTerminator = 1, hasSideEffects = 1 in {
+let isBranch = 1, isTerminator = 1, hasSideEffects = 1, Defs = [CPSR] in {
+// Set WhileLoopStart and LoopEnd to occupy 8 bytes because they may
+// get converted into t2CMP and t2Bcc.
def t2WhileLoopStart :
t2PseudoInst<(outs),
(ins rGPR:$elts, brtarget:$target),
- 4, IIC_Br, []>,
+ 8, IIC_Br, []>,
Sched<[WriteBr]>;
def t2LoopEnd :
@@ -5233,7 +5254,7 @@ def t2LoopEnd :
} // end isNotDuplicable
class CS<string iname, bits<4> opcode, list<dag> pattern=[]>
- : V8_1MI<(outs rGPR:$Rd), (ins GPRwithZR:$Rn, GPRwithZRnosp:$Rm, pred_noal:$fcond),
+ : V8_1MI<(outs rGPR:$Rd), (ins GPRwithZRnosp:$Rn, GPRwithZRnosp:$Rm, pred_noal:$fcond),
AddrModeNone, NoItinerary, iname, "$Rd, $Rn, $Rm, $fcond", "", pattern> {
bits<4> Rd;
bits<4> Rm;
@@ -5255,6 +5276,25 @@ def t2CSINC : CS<"csinc", 0b1001>;
def t2CSINV : CS<"csinv", 0b1010>;
def t2CSNEG : CS<"csneg", 0b1011>;
+let Predicates = [HasV8_1MMainline] in {
+ def : T2Pat<(ARMcsinc GPRwithZR:$tval, GPRwithZR:$fval, imm0_31:$imm),
+ (t2CSINC GPRwithZR:$tval, GPRwithZR:$fval, imm0_31:$imm)>;
+ def : T2Pat<(ARMcsinv GPRwithZR:$tval, GPRwithZR:$fval, imm0_31:$imm),
+ (t2CSINV GPRwithZR:$tval, GPRwithZR:$fval, imm0_31:$imm)>;
+ def : T2Pat<(ARMcsneg GPRwithZR:$tval, GPRwithZR:$fval, imm0_31:$imm),
+ (t2CSNEG GPRwithZR:$tval, GPRwithZR:$fval, imm0_31:$imm)>;
+
+ multiclass ModifiedV8_1CSEL<Instruction Insn, dag modvalue> {
+ def : T2Pat<(ARMcmov modvalue, GPRwithZR:$tval, cmovpred:$imm),
+ (Insn GPRwithZR:$tval, GPRwithZR:$fval, imm0_31:$imm)>;
+ def : T2Pat<(ARMcmov GPRwithZR:$tval, modvalue, cmovpred:$imm),
+ (Insn GPRwithZR:$tval, GPRwithZR:$fval,
+ (i32 (inv_cond_XFORM imm:$imm)))>;
+ }
+ defm : ModifiedV8_1CSEL<t2CSINC, (add rGPR:$fval, 1)>;
+ defm : ModifiedV8_1CSEL<t2CSINV, (xor rGPR:$fval, -1)>;
+ defm : ModifiedV8_1CSEL<t2CSNEG, (sub 0, rGPR:$fval)>;
+}
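// Roughly, the patterns above fold a conditional move whose untaken value is
// the other input modified (plus one, inverted, or negated) into the new
// v8.1-M conditional-select instructions, e.g. (a sketch, using the CSINC
// semantics Rd = cond ? Rn : Rm + 1):
//   r0 = cond ? r1 : (r2 + 1)   ==>   csinc r0, r1, r2, cond
// The second Pat in the multiclass covers the operands appearing in the
// opposite order by inverting the condition with inv_cond_XFORM.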
// CS aliases.
let Predicates = [HasV8_1MMainline] in {
diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td
index a0dd25de07ee..fdd961bfbb2f 100644
--- a/lib/Target/ARM/ARMInstrVFP.td
+++ b/lib/Target/ARM/ARMInstrVFP.td
@@ -10,7 +10,7 @@
//
//===----------------------------------------------------------------------===//
-def SDT_CMPFP0 : SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisVT<1, i32>]>;
+def SDT_CMPFP0 : SDTypeProfile<0, 1, [SDTCisFP<0>]>;
def SDT_VMOVDRR : SDTypeProfile<1, 2, [SDTCisVT<0, f64>, SDTCisVT<1, i32>,
SDTCisSameAs<1, 2>]>;
def SDT_VMOVRRD : SDTypeProfile<2, 1, [SDTCisVT<0, i32>, SDTCisSameAs<0, 1>,
@@ -19,7 +19,7 @@ def SDT_VMOVRRD : SDTypeProfile<2, 1, [SDTCisVT<0, i32>, SDTCisSameAs<0, 1>,
def SDT_VMOVSR : SDTypeProfile<1, 1, [SDTCisVT<0, f32>, SDTCisVT<1, i32>]>;
def arm_fmstat : SDNode<"ARMISD::FMSTAT", SDTNone, [SDNPInGlue, SDNPOutGlue]>;
-def arm_cmpfp : SDNode<"ARMISD::CMPFP", SDT_ARMFCmp, [SDNPOutGlue]>;
+def arm_cmpfp : SDNode<"ARMISD::CMPFP", SDT_ARMCmp, [SDNPOutGlue]>;
def arm_cmpfp0 : SDNode<"ARMISD::CMPFPw0", SDT_CMPFP0, [SDNPOutGlue]>;
def arm_fmdrr : SDNode<"ARMISD::VMOVDRR", SDT_VMOVDRR>;
def arm_fmrrd : SDNode<"ARMISD::VMOVRRD", SDT_VMOVRRD>;
@@ -324,7 +324,7 @@ defm : VFPDTAnyInstAlias<"vpop${p}", "$r",
// However, there is no UAL syntax for them, so we keep them around for
// (dis)assembly only.
multiclass vfp_ldstx_mult<string asm, bit L_bit> {
- let Predicates = [HasFPRegs] in {
+ let Predicates = [HasFPRegs], hasNoSchedulingInfo = 1 in {
// Unknown precision
def XIA :
AXXI4<(outs), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs, variable_ops),
@@ -548,12 +548,12 @@ let Defs = [FPSCR_NZCV] in {
def VCMPED : ADuI<0b11101, 0b11, 0b0100, 0b11, 0,
(outs), (ins DPR:$Dd, DPR:$Dm),
IIC_fpCMP64, "vcmpe", ".f64\t$Dd, $Dm",
- [(arm_cmpfp DPR:$Dd, (f64 DPR:$Dm), (i32 1))]>;
+ [/* For disassembly only; pattern left blank */]>;
def VCMPES : ASuI<0b11101, 0b11, 0b0100, 0b11, 0,
(outs), (ins SPR:$Sd, SPR:$Sm),
IIC_fpCMP32, "vcmpe", ".f32\t$Sd, $Sm",
- [(arm_cmpfp SPR:$Sd, SPR:$Sm, (i32 1))]> {
+ [/* For disassembly only; pattern left blank */]> {
// Some single precision VFP instructions may be executed on both NEON and
// VFP pipelines on A8.
let D = VFPNeonA8Domain;
@@ -562,17 +562,17 @@ def VCMPES : ASuI<0b11101, 0b11, 0b0100, 0b11, 0,
def VCMPEH : AHuI<0b11101, 0b11, 0b0100, 0b11, 0,
(outs), (ins HPR:$Sd, HPR:$Sm),
IIC_fpCMP16, "vcmpe", ".f16\t$Sd, $Sm",
- [(arm_cmpfp HPR:$Sd, HPR:$Sm, (i32 1))]>;
+ [/* For disassembly only; pattern left blank */]>;
def VCMPD : ADuI<0b11101, 0b11, 0b0100, 0b01, 0,
(outs), (ins DPR:$Dd, DPR:$Dm),
IIC_fpCMP64, "vcmp", ".f64\t$Dd, $Dm",
- [(arm_cmpfp DPR:$Dd, (f64 DPR:$Dm), (i32 0))]>;
+ [(arm_cmpfp DPR:$Dd, (f64 DPR:$Dm))]>;
def VCMPS : ASuI<0b11101, 0b11, 0b0100, 0b01, 0,
(outs), (ins SPR:$Sd, SPR:$Sm),
IIC_fpCMP32, "vcmp", ".f32\t$Sd, $Sm",
- [(arm_cmpfp SPR:$Sd, SPR:$Sm, (i32 0))]> {
+ [(arm_cmpfp SPR:$Sd, SPR:$Sm)]> {
// Some single precision VFP instructions may be executed on both NEON and
// VFP pipelines on A8.
let D = VFPNeonA8Domain;
@@ -581,7 +581,7 @@ def VCMPS : ASuI<0b11101, 0b11, 0b0100, 0b01, 0,
def VCMPH : AHuI<0b11101, 0b11, 0b0100, 0b01, 0,
(outs), (ins HPR:$Sd, HPR:$Sm),
IIC_fpCMP16, "vcmp", ".f16\t$Sd, $Sm",
- [(arm_cmpfp HPR:$Sd, HPR:$Sm, (i32 0))]>;
+ [(arm_cmpfp HPR:$Sd, HPR:$Sm)]>;
} // Defs = [FPSCR_NZCV]
//===----------------------------------------------------------------------===//
@@ -611,7 +611,7 @@ let Defs = [FPSCR_NZCV] in {
def VCMPEZD : ADuI<0b11101, 0b11, 0b0101, 0b11, 0,
(outs), (ins DPR:$Dd),
IIC_fpCMP64, "vcmpe", ".f64\t$Dd, #0",
- [(arm_cmpfp0 (f64 DPR:$Dd), (i32 1))]> {
+ [/* For disassembly only; pattern left blank */]> {
let Inst{3-0} = 0b0000;
let Inst{5} = 0;
}
@@ -619,7 +619,7 @@ def VCMPEZD : ADuI<0b11101, 0b11, 0b0101, 0b11, 0,
def VCMPEZS : ASuI<0b11101, 0b11, 0b0101, 0b11, 0,
(outs), (ins SPR:$Sd),
IIC_fpCMP32, "vcmpe", ".f32\t$Sd, #0",
- [(arm_cmpfp0 SPR:$Sd, (i32 1))]> {
+ [/* For disassembly only; pattern left blank */]> {
let Inst{3-0} = 0b0000;
let Inst{5} = 0;
@@ -631,7 +631,7 @@ def VCMPEZS : ASuI<0b11101, 0b11, 0b0101, 0b11, 0,
def VCMPEZH : AHuI<0b11101, 0b11, 0b0101, 0b11, 0,
(outs), (ins HPR:$Sd),
IIC_fpCMP16, "vcmpe", ".f16\t$Sd, #0",
- [(arm_cmpfp0 HPR:$Sd, (i32 1))]> {
+ [/* For disassembly only; pattern left blank */]> {
let Inst{3-0} = 0b0000;
let Inst{5} = 0;
}
@@ -639,7 +639,7 @@ def VCMPEZH : AHuI<0b11101, 0b11, 0b0101, 0b11, 0,
def VCMPZD : ADuI<0b11101, 0b11, 0b0101, 0b01, 0,
(outs), (ins DPR:$Dd),
IIC_fpCMP64, "vcmp", ".f64\t$Dd, #0",
- [(arm_cmpfp0 (f64 DPR:$Dd), (i32 0))]> {
+ [(arm_cmpfp0 (f64 DPR:$Dd))]> {
let Inst{3-0} = 0b0000;
let Inst{5} = 0;
}
@@ -647,7 +647,7 @@ def VCMPZD : ADuI<0b11101, 0b11, 0b0101, 0b01, 0,
def VCMPZS : ASuI<0b11101, 0b11, 0b0101, 0b01, 0,
(outs), (ins SPR:$Sd),
IIC_fpCMP32, "vcmp", ".f32\t$Sd, #0",
- [(arm_cmpfp0 SPR:$Sd, (i32 0))]> {
+ [(arm_cmpfp0 SPR:$Sd)]> {
let Inst{3-0} = 0b0000;
let Inst{5} = 0;
@@ -659,7 +659,7 @@ def VCMPZS : ASuI<0b11101, 0b11, 0b0101, 0b01, 0,
def VCMPZH : AHuI<0b11101, 0b11, 0b0101, 0b01, 0,
(outs), (ins HPR:$Sd),
IIC_fpCMP16, "vcmp", ".f16\t$Sd, #0",
- [(arm_cmpfp0 HPR:$Sd, (i32 0))]> {
+ [(arm_cmpfp0 HPR:$Sd)]> {
let Inst{3-0} = 0b0000;
let Inst{5} = 0;
}
@@ -1732,7 +1732,8 @@ def VTOSHS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1110, 0b1010, 0,
def VTOUHS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1111, 0b1010, 0,
(outs SPR:$dst), (ins SPR:$a, fbits16:$fbits),
- IIC_fpCVTSI, "vcvt", ".u16.f32\t$dst, $a, $fbits", []> {
+ IIC_fpCVTSI, "vcvt", ".u16.f32\t$dst, $a, $fbits", []>,
+ Sched<[WriteFPCVT]> {
// Some single precision VFP instructions may be executed on both NEON and
// VFP pipelines on A8.
let D = VFPNeonA8Domain;
@@ -1740,7 +1741,8 @@ def VTOUHS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1111, 0b1010, 0,
def VTOSLS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1110, 0b1010, 1,
(outs SPR:$dst), (ins SPR:$a, fbits32:$fbits),
- IIC_fpCVTSI, "vcvt", ".s32.f32\t$dst, $a, $fbits", []> {
+ IIC_fpCVTSI, "vcvt", ".s32.f32\t$dst, $a, $fbits", []>,
+ Sched<[WriteFPCVT]> {
// Some single precision VFP instructions may be executed on both NEON and
// VFP pipelines on A8.
let D = VFPNeonA8Domain;
@@ -1748,7 +1750,8 @@ def VTOSLS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1110, 0b1010, 1,
def VTOULS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1111, 0b1010, 1,
(outs SPR:$dst), (ins SPR:$a, fbits32:$fbits),
- IIC_fpCVTSI, "vcvt", ".u32.f32\t$dst, $a, $fbits", []> {
+ IIC_fpCVTSI, "vcvt", ".u32.f32\t$dst, $a, $fbits", []>,
+ Sched<[WriteFPCVT]> {
// Some single precision VFP instructions may be executed on both NEON and
// VFP pipelines on A8.
let D = VFPNeonA8Domain;
@@ -2297,6 +2300,8 @@ class MovFromVFP<bits<4> opc19_16, dag oops, dag iops, string opc, string asm,
let Inst{6-5} = 0b00;
let Inst{4} = 1;
let Inst{3-0} = 0b0000;
+ let Unpredictable{7-5} = 0b111;
+ let Unpredictable{3-0} = 0b1111;
}
let DecoderMethod = "DecodeForVMRSandVMSR" in {
@@ -2370,63 +2375,65 @@ class MovToVFP<bits<4> opc19_16, dag oops, dag iops, string opc, string asm,
VFPAI<oops, iops, VFPMiscFrm, IIC_fpSTAT, opc, asm, pattern> {
// Instruction operand.
- bits<4> src;
-
- // Encode instruction operand.
- let Inst{15-12} = src;
+ bits<4> Rt;
let Inst{27-20} = 0b11101110;
let Inst{19-16} = opc19_16;
+ let Inst{15-12} = Rt;
let Inst{11-8} = 0b1010;
let Inst{7} = 0;
+ let Inst{6-5} = 0b00;
let Inst{4} = 1;
+ let Inst{3-0} = 0b0000;
let Predicates = [HasVFP2];
+ let Unpredictable{7-5} = 0b111;
+ let Unpredictable{3-0} = 0b1111;
}
let DecoderMethod = "DecodeForVMRSandVMSR" in {
let Defs = [FPSCR] in {
let Predicates = [HasFPRegs] in
// Application level GPR -> FPSCR
- def VMSR : MovToVFP<0b0001 /* fpscr */, (outs), (ins GPRnopc:$src),
- "vmsr", "\tfpscr, $src",
- [(int_arm_set_fpscr GPRnopc:$src)]>;
+ def VMSR : MovToVFP<0b0001 /* fpscr */, (outs), (ins GPRnopc:$Rt),
+ "vmsr", "\tfpscr, $Rt",
+ [(int_arm_set_fpscr GPRnopc:$Rt)]>;
// System level GPR -> FPEXC
- def VMSR_FPEXC : MovToVFP<0b1000 /* fpexc */, (outs), (ins GPRnopc:$src),
- "vmsr", "\tfpexc, $src", []>;
+ def VMSR_FPEXC : MovToVFP<0b1000 /* fpexc */, (outs), (ins GPRnopc:$Rt),
+ "vmsr", "\tfpexc, $Rt", []>;
// System level GPR -> FPSID
- def VMSR_FPSID : MovToVFP<0b0000 /* fpsid */, (outs), (ins GPRnopc:$src),
- "vmsr", "\tfpsid, $src", []>;
- def VMSR_FPINST : MovToVFP<0b1001 /* fpinst */, (outs), (ins GPRnopc:$src),
- "vmsr", "\tfpinst, $src", []>;
- def VMSR_FPINST2 : MovToVFP<0b1010 /* fpinst2 */, (outs), (ins GPRnopc:$src),
- "vmsr", "\tfpinst2, $src", []>;
+ def VMSR_FPSID : MovToVFP<0b0000 /* fpsid */, (outs), (ins GPRnopc:$Rt),
+ "vmsr", "\tfpsid, $Rt", []>;
+ def VMSR_FPINST : MovToVFP<0b1001 /* fpinst */, (outs), (ins GPRnopc:$Rt),
+ "vmsr", "\tfpinst, $Rt", []>;
+ def VMSR_FPINST2 : MovToVFP<0b1010 /* fpinst2 */, (outs), (ins GPRnopc:$Rt),
+ "vmsr", "\tfpinst2, $Rt", []>;
}
let Predicates = [HasV8_1MMainline, Has8MSecExt] in {
// System level GPR -> FPSCR with context saving for security extensions
- def VMSR_FPCXTNS : MovToVFP<0b1110 /* fpcxtns */, (outs), (ins GPR:$src),
- "vmsr", "\tfpcxtns, $src", []>;
+ def VMSR_FPCXTNS : MovToVFP<0b1110 /* fpcxtns */, (outs), (ins GPR:$Rt),
+ "vmsr", "\tfpcxtns, $Rt", []>;
}
let Predicates = [HasV8_1MMainline, Has8MSecExt] in {
// System level GPR -> FPSCR with context saving for security extensions
- def VMSR_FPCXTS : MovToVFP<0b1111 /* fpcxts */, (outs), (ins GPR:$src),
- "vmsr", "\tfpcxts, $src", []>;
+ def VMSR_FPCXTS : MovToVFP<0b1111 /* fpcxts */, (outs), (ins GPR:$Rt),
+ "vmsr", "\tfpcxts, $Rt", []>;
}
let Predicates = [HasV8_1MMainline, HasFPRegs] in {
// System level GPR -> FPSCR_NZCVQC
def VMSR_FPSCR_NZCVQC
: MovToVFP<0b0010 /* fpscr_nzcvqc */,
- (outs cl_FPSCR_NZCV:$fpscr_out), (ins GPR:$src),
- "vmsr", "\tfpscr_nzcvqc, $src", []>;
+ (outs cl_FPSCR_NZCV:$fpscr_out), (ins GPR:$Rt),
+ "vmsr", "\tfpscr_nzcvqc, $Rt", []>;
}
let Predicates = [HasV8_1MMainline, HasMVEInt] in {
// System level GPR -> VPR/P0
let Defs = [VPR] in
- def VMSR_VPR : MovToVFP<0b1100 /* vpr */, (outs), (ins GPR:$src),
- "vmsr", "\tvpr, $src", []>;
+ def VMSR_VPR : MovToVFP<0b1100 /* vpr */, (outs), (ins GPR:$Rt),
+ "vmsr", "\tvpr, $Rt", []>;
- def VMSR_P0 : MovToVFP<0b1101 /* p0 */, (outs VCCR:$cond), (ins GPR:$src),
- "vmsr", "\tp0, $src", []>;
+ def VMSR_P0 : MovToVFP<0b1101 /* p0 */, (outs VCCR:$cond), (ins GPR:$Rt),
+ "vmsr", "\tp0, $Rt", []>;
}
}
@@ -2614,7 +2621,8 @@ def VSCCLRMD : VFPXI<(outs), (ins pred:$p, fp_dreglist_with_vpr:$regs, variable_
let Inst{21-16} = 0b011111;
let Inst{15-12} = regs{11-8};
let Inst{11-8} = 0b1011;
- let Inst{7-0} = regs{7-0};
+ let Inst{7-1} = regs{7-1};
+ let Inst{0} = 0;
let DecoderMethod = "DecodeVSCCLRM";
diff --git a/lib/Target/ARM/ARMInstructionSelector.cpp b/lib/Target/ARM/ARMInstructionSelector.cpp
index 4485a474a6df..8e5e474c0f59 100644
--- a/lib/Target/ARM/ARMInstructionSelector.cpp
+++ b/lib/Target/ARM/ARMInstructionSelector.cpp
@@ -34,7 +34,7 @@ public:
ARMInstructionSelector(const ARMBaseTargetMachine &TM, const ARMSubtarget &STI,
const ARMRegisterBankInfo &RBI);
- bool select(MachineInstr &I, CodeGenCoverage &CoverageInfo) const override;
+ bool select(MachineInstr &I) override;
static const char *getName() { return DEBUG_TYPE; }
private:
@@ -210,8 +210,8 @@ static const TargetRegisterClass *guessRegClass(unsigned Reg,
static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
const RegisterBankInfo &RBI) {
- unsigned DstReg = I.getOperand(0).getReg();
- if (TargetRegisterInfo::isPhysicalRegister(DstReg))
+ Register DstReg = I.getOperand(0).getReg();
+ if (Register::isPhysicalRegister(DstReg))
return true;
const TargetRegisterClass *RC = guessRegClass(DstReg, MRI, TRI, RBI);
@@ -236,17 +236,17 @@ static bool selectMergeValues(MachineInstrBuilder &MIB,
// We only support G_MERGE_VALUES as a way to stick together two scalar GPRs
// into one DPR.
- unsigned VReg0 = MIB->getOperand(0).getReg();
+ Register VReg0 = MIB->getOperand(0).getReg();
(void)VReg0;
assert(MRI.getType(VReg0).getSizeInBits() == 64 &&
RBI.getRegBank(VReg0, MRI, TRI)->getID() == ARM::FPRRegBankID &&
"Unsupported operand for G_MERGE_VALUES");
- unsigned VReg1 = MIB->getOperand(1).getReg();
+ Register VReg1 = MIB->getOperand(1).getReg();
(void)VReg1;
assert(MRI.getType(VReg1).getSizeInBits() == 32 &&
RBI.getRegBank(VReg1, MRI, TRI)->getID() == ARM::GPRRegBankID &&
"Unsupported operand for G_MERGE_VALUES");
- unsigned VReg2 = MIB->getOperand(2).getReg();
+ Register VReg2 = MIB->getOperand(2).getReg();
(void)VReg2;
assert(MRI.getType(VReg2).getSizeInBits() == 32 &&
RBI.getRegBank(VReg2, MRI, TRI)->getID() == ARM::GPRRegBankID &&
@@ -268,17 +268,17 @@ static bool selectUnmergeValues(MachineInstrBuilder &MIB,
// We only support G_UNMERGE_VALUES as a way to break up one DPR into two
// GPRs.
- unsigned VReg0 = MIB->getOperand(0).getReg();
+ Register VReg0 = MIB->getOperand(0).getReg();
(void)VReg0;
assert(MRI.getType(VReg0).getSizeInBits() == 32 &&
RBI.getRegBank(VReg0, MRI, TRI)->getID() == ARM::GPRRegBankID &&
"Unsupported operand for G_UNMERGE_VALUES");
- unsigned VReg1 = MIB->getOperand(1).getReg();
+ Register VReg1 = MIB->getOperand(1).getReg();
(void)VReg1;
assert(MRI.getType(VReg1).getSizeInBits() == 32 &&
RBI.getRegBank(VReg1, MRI, TRI)->getID() == ARM::GPRRegBankID &&
"Unsupported operand for G_UNMERGE_VALUES");
- unsigned VReg2 = MIB->getOperand(2).getReg();
+ Register VReg2 = MIB->getOperand(2).getReg();
(void)VReg2;
assert(MRI.getType(VReg2).getSizeInBits() == 64 &&
RBI.getRegBank(VReg2, MRI, TRI)->getID() == ARM::FPRRegBankID &&
@@ -833,8 +833,7 @@ void ARMInstructionSelector::renderVFPF64Imm(
NewInstBuilder.addImm(FPImmEncoding);
}
-bool ARMInstructionSelector::select(MachineInstr &I,
- CodeGenCoverage &CoverageInfo) const {
+bool ARMInstructionSelector::select(MachineInstr &I) {
assert(I.getParent() && "Instruction should be in a basic block!");
assert(I.getParent()->getParent() && "Instruction should be in a function!");
@@ -851,7 +850,7 @@ bool ARMInstructionSelector::select(MachineInstr &I,
using namespace TargetOpcode;
- if (selectImpl(I, CoverageInfo))
+ if (selectImpl(I, *CoverageInfo))
return true;
MachineInstrBuilder MIB{MF, I};
@@ -874,10 +873,10 @@ bool ARMInstructionSelector::select(MachineInstr &I,
MIB.addImm(1).add(predOps(ARMCC::AL)).add(condCodeOp());
if (isSExt) {
- unsigned SExtResult = I.getOperand(0).getReg();
+ Register SExtResult = I.getOperand(0).getReg();
// Use a new virtual register for the result of the AND
- unsigned AndResult = MRI.createVirtualRegister(&ARM::GPRRegClass);
+ Register AndResult = MRI.createVirtualRegister(&ARM::GPRRegClass);
I.getOperand(0).setReg(AndResult);
auto InsertBefore = std::next(I.getIterator());
@@ -928,7 +927,7 @@ bool ARMInstructionSelector::select(MachineInstr &I,
assert(MRI.getType(SrcReg).getSizeInBits() == 64 && "Unsupported size");
assert(MRI.getType(DstReg).getSizeInBits() <= 32 && "Unsupported size");
- unsigned IgnoredBits = MRI.createVirtualRegister(&ARM::GPRRegClass);
+ Register IgnoredBits = MRI.createVirtualRegister(&ARM::GPRRegClass);
auto InsertBefore = std::next(I.getIterator());
auto MovI =
BuildMI(MBB, InsertBefore, I.getDebugLoc(), TII.get(ARM::VMOVRRD))
@@ -1039,7 +1038,7 @@ bool ARMInstructionSelector::select(MachineInstr &I,
case G_FCMP: {
assert(STI.hasVFP2Base() && "Can't select fcmp without VFP");
- unsigned OpReg = I.getOperand(2).getReg();
+ Register OpReg = I.getOperand(2).getReg();
unsigned Size = MRI.getType(OpReg).getSizeInBits();
if (Size == 64 && !STI.hasFP64()) {
@@ -1077,12 +1076,12 @@ bool ARMInstructionSelector::select(MachineInstr &I,
case G_STORE:
case G_LOAD: {
const auto &MemOp = **I.memoperands_begin();
- if (MemOp.getOrdering() != AtomicOrdering::NotAtomic) {
+ if (MemOp.isAtomic()) {
LLVM_DEBUG(dbgs() << "Atomic load/store not supported yet\n");
return false;
}
- unsigned Reg = I.getOperand(0).getReg();
+ Register Reg = I.getOperand(0).getReg();
unsigned RegBank = RBI.getRegBank(Reg, MRI, TRI)->getID();
LLT ValTy = MRI.getType(Reg);
@@ -1097,9 +1096,9 @@ bool ARMInstructionSelector::select(MachineInstr &I,
if (ValSize == 1 && NewOpc == Opcodes.STORE8) {
// Before storing a 1-bit value, make sure to clear out any unneeded bits.
- unsigned OriginalValue = I.getOperand(0).getReg();
+ Register OriginalValue = I.getOperand(0).getReg();
- unsigned ValueToStore = MRI.createVirtualRegister(&ARM::GPRRegClass);
+ Register ValueToStore = MRI.createVirtualRegister(&ARM::GPRRegClass);
I.getOperand(0).setReg(ValueToStore);
auto InsertBefore = I.getIterator();
@@ -1159,7 +1158,7 @@ bool ARMInstructionSelector::select(MachineInstr &I,
case G_PHI: {
I.setDesc(TII.get(PHI));
- unsigned DstReg = I.getOperand(0).getReg();
+ Register DstReg = I.getOperand(0).getReg();
const TargetRegisterClass *RC = guessRegClass(DstReg, MRI, TRI, RBI);
if (!RBI.constrainGenericRegister(DstReg, *RC, MRI)) {
break;
diff --git a/lib/Target/ARM/ARMLegalizerInfo.cpp b/lib/Target/ARM/ARMLegalizerInfo.cpp
index 73a57b297ad6..81414e6d76fe 100644
--- a/lib/Target/ARM/ARMLegalizerInfo.cpp
+++ b/lib/Target/ARM/ARMLegalizerInfo.cpp
@@ -84,6 +84,8 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) {
getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
.legalForCartesianProduct({s8, s16, s32}, {s1, s8, s16});
+ getActionDefinitionsBuilder(G_SEXT_INREG).lower();
+
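  // The generic lower() action is expected to expand G_SEXT_INREG into a shift
  // pair, roughly (a sketch for a 32-bit value sign-extended from Bits bits):
  //   %t = G_SHL  %x, (32 - Bits)
  //   %r = G_ASHR %t, (32 - Bits)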
getActionDefinitionsBuilder({G_MUL, G_AND, G_OR, G_XOR})
.legalFor({s32})
.minScalar(0, s32);
diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
index 90a1ce238c3f..4a193fed04a3 100644
--- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -509,7 +509,7 @@ void ARMLoadStoreOpt::UpdateBaseRegUses(MachineBasicBlock &MBB,
Offset = MO.getImm() - WordOffset * getImmScale(Opc);
// If storing the base register, it needs to be reset first.
- unsigned InstrSrcReg = getLoadStoreRegOp(*MBBI).getReg();
+ Register InstrSrcReg = getLoadStoreRegOp(*MBBI).getReg();
if (Offset >= 0 && !(IsStore && InstrSrcReg == Base))
MO.setImm(Offset);
@@ -859,7 +859,7 @@ MachineInstr *ARMLoadStoreOpt::MergeOpsUpdate(const MergeCandidate &Cand) {
// Determine list of registers and list of implicit super-register defs.
for (const MachineInstr *MI : Cand.Instrs) {
const MachineOperand &MO = getLoadStoreRegOp(*MI);
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
bool IsKill = MO.isKill();
if (IsKill)
KilledRegs.insert(Reg);
@@ -874,7 +874,7 @@ MachineInstr *ARMLoadStoreOpt::MergeOpsUpdate(const MergeCandidate &Cand) {
if (!MO.isReg() || !MO.isDef() || MO.isDead())
continue;
assert(MO.isImplicit());
- unsigned DefReg = MO.getReg();
+ Register DefReg = MO.getReg();
if (is_contained(ImpDefs, DefReg))
continue;
@@ -893,7 +893,7 @@ MachineInstr *ARMLoadStoreOpt::MergeOpsUpdate(const MergeCandidate &Cand) {
iterator InsertBefore = std::next(iterator(LatestMI));
MachineBasicBlock &MBB = *LatestMI->getParent();
unsigned Offset = getMemoryOpOffset(*First);
- unsigned Base = getLoadStoreBaseOp(*First).getReg();
+ Register Base = getLoadStoreBaseOp(*First).getReg();
bool BaseKill = LatestMI->killsRegister(Base);
unsigned PredReg = 0;
ARMCC::CondCodes Pred = getInstrPredicate(*First, PredReg);
@@ -1005,7 +1005,7 @@ void ARMLoadStoreOpt::FormCandidates(const MemOpQueue &MemOps) {
const MachineInstr *MI = MemOps[SIndex].MI;
int Offset = MemOps[SIndex].Offset;
const MachineOperand &PMO = getLoadStoreRegOp(*MI);
- unsigned PReg = PMO.getReg();
+ Register PReg = PMO.getReg();
unsigned PRegNum = PMO.isUndef() ? std::numeric_limits<unsigned>::max()
: TRI->getEncodingValue(PReg);
unsigned Latest = SIndex;
@@ -1052,7 +1052,7 @@ void ARMLoadStoreOpt::FormCandidates(const MemOpQueue &MemOps) {
if (NewOffset != Offset + (int)Size)
break;
const MachineOperand &MO = getLoadStoreRegOp(*MemOps[I].MI);
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (Reg == ARM::SP || Reg == ARM::PC)
break;
if (Count == Limit)
@@ -1261,7 +1261,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineInstr *MI) {
if (isThumb1) return false;
const MachineOperand &BaseOP = MI->getOperand(0);
- unsigned Base = BaseOP.getReg();
+ Register Base = BaseOP.getReg();
bool BaseKill = BaseOP.isKill();
unsigned PredReg = 0;
ARMCC::CondCodes Pred = getInstrPredicate(*MI, PredReg);
@@ -1387,7 +1387,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) {
// FIXME: Use LDM/STM with single register instead.
if (isThumb1) return false;
- unsigned Base = getLoadStoreBaseOp(*MI).getReg();
+ Register Base = getLoadStoreBaseOp(*MI).getReg();
bool BaseKill = getLoadStoreBaseOp(*MI).isKill();
unsigned Opcode = MI->getOpcode();
DebugLoc DL = MI->getDebugLoc();
@@ -1512,7 +1512,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSDouble(MachineInstr &MI) const {
// Behaviour for writeback is undefined if base register is the same as one
// of the others.
const MachineOperand &BaseOp = MI.getOperand(2);
- unsigned Base = BaseOp.getReg();
+ Register Base = BaseOp.getReg();
const MachineOperand &Reg0Op = MI.getOperand(0);
const MachineOperand &Reg1Op = MI.getOperand(1);
if (Reg0Op.getReg() == Base || Reg1Op.getReg() == Base)
@@ -1655,9 +1655,9 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB,
return false;
const MachineOperand &BaseOp = MI->getOperand(2);
- unsigned BaseReg = BaseOp.getReg();
- unsigned EvenReg = MI->getOperand(0).getReg();
- unsigned OddReg = MI->getOperand(1).getReg();
+ Register BaseReg = BaseOp.getReg();
+ Register EvenReg = MI->getOperand(0).getReg();
+ Register OddReg = MI->getOperand(1).getReg();
unsigned EvenRegNum = TRI->getDwarfRegNum(EvenReg, false);
unsigned OddRegNum = TRI->getDwarfRegNum(OddReg, false);
@@ -1783,8 +1783,8 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
if (isMemoryOp(*MBBI)) {
unsigned Opcode = MBBI->getOpcode();
const MachineOperand &MO = MBBI->getOperand(0);
- unsigned Reg = MO.getReg();
- unsigned Base = getLoadStoreBaseOp(*MBBI).getReg();
+ Register Reg = MO.getReg();
+ Register Base = getLoadStoreBaseOp(*MBBI).getReg();
unsigned PredReg = 0;
ARMCC::CondCodes Pred = getInstrPredicate(*MBBI, PredReg);
int Offset = getMemoryOpOffset(*MBBI);
@@ -2121,7 +2121,7 @@ static bool IsSafeAndProfitableToMove(bool isLd, unsigned Base,
MachineOperand &MO = I->getOperand(j);
if (!MO.isReg())
continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (MO.isDef() && TRI->regsOverlap(Reg, Base))
return false;
if (Reg != Base && !MemRegs.count(Reg))
@@ -2415,7 +2415,7 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
int Opc = MI.getOpcode();
bool isLd = isLoadSingle(Opc);
- unsigned Base = MI.getOperand(1).getReg();
+ Register Base = MI.getOperand(1).getReg();
int Offset = getMemoryOpOffset(MI);
bool StopHere = false;
auto FindBases = [&] (Base2InstMap &Base2Ops, BaseVec &Bases) {
diff --git a/lib/Target/ARM/ARMLowOverheadLoops.cpp b/lib/Target/ARM/ARMLowOverheadLoops.cpp
index cedf3bd3c74e..e1c5a9c3e223 100644
--- a/lib/Target/ARM/ARMLowOverheadLoops.cpp
+++ b/lib/Target/ARM/ARMLowOverheadLoops.cpp
@@ -11,8 +11,7 @@
/// The expectation is that the loop contains three pseudo instructions:
/// - t2*LoopStart - placed in the preheader or pre-preheader. The do-loop
/// form should be in the preheader, whereas the while form should be in the
-/// preheaders only predecessor. TODO: Could DoLoopStart get moved into the
-/// pre-preheader?
+/// preheader's only predecessor.
/// - t2LoopDec - placed within in the loop body.
/// - t2LoopEnd - the loop latch terminator.
///
@@ -35,6 +34,7 @@ using namespace llvm;
namespace {
class ARMLowOverheadLoops : public MachineFunctionPass {
+ MachineFunction *MF = nullptr;
const ARMBaseInstrInfo *TII = nullptr;
MachineRegisterInfo *MRI = nullptr;
std::unique_ptr<ARMBasicBlockUtils> BBUtils = nullptr;
@@ -52,17 +52,6 @@ namespace {
bool runOnMachineFunction(MachineFunction &MF) override;
- bool ProcessLoop(MachineLoop *ML);
-
- void RevertWhile(MachineInstr *MI) const;
-
- void RevertLoopDec(MachineInstr *MI) const;
-
- void RevertLoopEnd(MachineInstr *MI) const;
-
- void Expand(MachineLoop *ML, MachineInstr *Start,
- MachineInstr *Dec, MachineInstr *End, bool Revert);
-
MachineFunctionProperties getRequiredProperties() const override {
return MachineFunctionProperties().set(
MachineFunctionProperties::Property::NoVRegs);
@@ -71,36 +60,156 @@ namespace {
StringRef getPassName() const override {
return ARM_LOW_OVERHEAD_LOOPS_NAME;
}
+
+ private:
+ bool ProcessLoop(MachineLoop *ML);
+
+ MachineInstr * IsSafeToDefineLR(MachineInstr *MI);
+
+ bool RevertNonLoops();
+
+ void RevertWhile(MachineInstr *MI) const;
+
+ bool RevertLoopDec(MachineInstr *MI, bool AllowFlags = false) const;
+
+ void RevertLoopEnd(MachineInstr *MI, bool SkipCmp = false) const;
+
+ void Expand(MachineLoop *ML, MachineInstr *Start,
+ MachineInstr *InsertPt, MachineInstr *Dec,
+ MachineInstr *End, bool Revert);
+
};
}
-
+
char ARMLowOverheadLoops::ID = 0;
INITIALIZE_PASS(ARMLowOverheadLoops, DEBUG_TYPE, ARM_LOW_OVERHEAD_LOOPS_NAME,
false, false)
-bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &MF) {
- if (!static_cast<const ARMSubtarget&>(MF.getSubtarget()).hasLOB())
+bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) {
+ const ARMSubtarget &ST = static_cast<const ARMSubtarget&>(mf.getSubtarget());
+ if (!ST.hasLOB())
return false;
- LLVM_DEBUG(dbgs() << "ARM Loops on " << MF.getName() << " ------------- \n");
+ MF = &mf;
+ LLVM_DEBUG(dbgs() << "ARM Loops on " << MF->getName() << " ------------- \n");
auto &MLI = getAnalysis<MachineLoopInfo>();
- MRI = &MF.getRegInfo();
- TII = static_cast<const ARMBaseInstrInfo*>(
- MF.getSubtarget().getInstrInfo());
- BBUtils = std::unique_ptr<ARMBasicBlockUtils>(new ARMBasicBlockUtils(MF));
+ MF->getProperties().set(MachineFunctionProperties::Property::TracksLiveness);
+ MRI = &MF->getRegInfo();
+ TII = static_cast<const ARMBaseInstrInfo*>(ST.getInstrInfo());
+ BBUtils = std::unique_ptr<ARMBasicBlockUtils>(new ARMBasicBlockUtils(*MF));
BBUtils->computeAllBlockSizes();
- BBUtils->adjustBBOffsetsAfter(&MF.front());
+ BBUtils->adjustBBOffsetsAfter(&MF->front());
bool Changed = false;
for (auto ML : MLI) {
if (!ML->getParentLoop())
Changed |= ProcessLoop(ML);
}
+ Changed |= RevertNonLoops();
return Changed;
}
+static bool IsLoopStart(MachineInstr &MI) {
+ return MI.getOpcode() == ARM::t2DoLoopStart ||
+ MI.getOpcode() == ARM::t2WhileLoopStart;
+}
+
+template<typename T>
+static MachineInstr* SearchForDef(MachineInstr *Begin, T End, unsigned Reg) {
+ for(auto &MI : make_range(T(Begin), End)) {
+ for (auto &MO : MI.operands()) {
+ if (!MO.isReg() || !MO.isDef() || MO.getReg() != Reg)
+ continue;
+ return &MI;
+ }
+ }
+ return nullptr;
+}
+
+static MachineInstr* SearchForUse(MachineInstr *Begin,
+ MachineBasicBlock::iterator End,
+ unsigned Reg) {
+ for(auto &MI : make_range(MachineBasicBlock::iterator(Begin), End)) {
+ for (auto &MO : MI.operands()) {
+ if (!MO.isReg() || !MO.isUse() || MO.getReg() != Reg)
+ continue;
+ return &MI;
+ }
+ }
+ return nullptr;
+}
+
+// Is it safe to define LR with DLS/WLS?
+// LR can be defined if it is the operand to Start, because it's the same value,
+// or if it's going to be equivalent to the operand to Start.
+MachineInstr *ARMLowOverheadLoops::IsSafeToDefineLR(MachineInstr *Start) {
+
+ auto IsMoveLR = [](MachineInstr *MI, unsigned Reg) {
+ return MI->getOpcode() == ARM::tMOVr &&
+ MI->getOperand(0).getReg() == ARM::LR &&
+ MI->getOperand(1).getReg() == Reg &&
+ MI->getOperand(2).getImm() == ARMCC::AL;
+ };
+
+ MachineBasicBlock *MBB = Start->getParent();
+ unsigned CountReg = Start->getOperand(0).getReg();
+ // Walk forward and backward in the block to find the closest instructions
+ // that define LR. Then also filter them out if they're not a mov lr.
+ MachineInstr *PredLRDef = SearchForDef(Start, MBB->rend(), ARM::LR);
+ if (PredLRDef && !IsMoveLR(PredLRDef, CountReg))
+ PredLRDef = nullptr;
+
+ MachineInstr *SuccLRDef = SearchForDef(Start, MBB->end(), ARM::LR);
+ if (SuccLRDef && !IsMoveLR(SuccLRDef, CountReg))
+ SuccLRDef = nullptr;
+
+  // We've either found zero, one or two mov lr instructions... Now figure out
+  // if they are performing the equivalent mov that the Start instruction will.
+ // Do this by scanning forward and backward to see if there's a def of the
+ // register holding the count value. If we find a suitable def, return it as
+ // the insert point. Later, if InsertPt != Start, then we can remove the
+ // redundant instruction.
+ if (SuccLRDef) {
+ MachineBasicBlock::iterator End(SuccLRDef);
+ if (!SearchForDef(Start, End, CountReg)) {
+ return SuccLRDef;
+ } else
+ SuccLRDef = nullptr;
+ }
+ if (PredLRDef) {
+ MachineBasicBlock::reverse_iterator End(PredLRDef);
+ if (!SearchForDef(Start, End, CountReg)) {
+ return PredLRDef;
+ } else
+ PredLRDef = nullptr;
+ }
+
+ // We can define LR because LR already contains the same value.
+ if (Start->getOperand(0).getReg() == ARM::LR)
+ return Start;
+
+ // We've found no suitable LR def and Start doesn't use LR directly. Can we
+ // just define LR anyway?
+ const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+ LivePhysRegs LiveRegs(*TRI);
+ LiveRegs.addLiveOuts(*MBB);
+
+  // Not if we haven't found a suitable mov and LR is live out.
+ if (LiveRegs.contains(ARM::LR))
+ return nullptr;
+
+ // If LR is not live out, we can insert the instruction if nothing else
+ // uses LR after it.
+ if (!SearchForUse(Start, MBB->end(), ARM::LR))
+ return Start;
+
+ LLVM_DEBUG(dbgs() << "ARM Loops: Failed to find suitable insertion point for"
+ << " LR\n");
+ return nullptr;
+}
+
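// A sketch of the redundancy the search above is meant to catch (register
// names are illustrative only):
//   %r = ...                 ; loop trip count
//   tMOVr lr, %r             ; the copy that IsMoveLR recognizes
//   t2DoLoopStart %r         ; Start: DLS lr, %r can be emitted at the tMOVr,
//                            ; making the explicit copy redundant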
bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) {
bool Changed = false;
@@ -111,15 +220,10 @@ bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) {
LLVM_DEBUG(dbgs() << "ARM Loops: Processing " << *ML);
- auto IsLoopStart = [](MachineInstr &MI) {
- return MI.getOpcode() == ARM::t2DoLoopStart ||
- MI.getOpcode() == ARM::t2WhileLoopStart;
- };
-
// Search the given block for a loop start instruction. If one isn't found,
// and there's only one predecessor block, search that one too.
std::function<MachineInstr*(MachineBasicBlock*)> SearchForStart =
- [&IsLoopStart, &SearchForStart](MachineBasicBlock *MBB) -> MachineInstr* {
+ [&SearchForStart](MachineBasicBlock *MBB) -> MachineInstr* {
for (auto &MI : *MBB) {
if (IsLoopStart(MI))
return &MI;
@@ -165,41 +269,62 @@ bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) {
Dec = &MI;
else if (MI.getOpcode() == ARM::t2LoopEnd)
End = &MI;
- else if (MI.getDesc().isCall())
+ else if (IsLoopStart(MI))
+ Start = &MI;
+ else if (MI.getDesc().isCall()) {
// TODO: Though the call will require LE to execute again, does this
// mean we should revert? Always executing LE hopefully should be
// faster than performing a sub,cmp,br or even subs,br.
Revert = true;
+ LLVM_DEBUG(dbgs() << "ARM Loops: Found call.\n");
+ }
- if (!Dec)
+ if (!Dec || End)
continue;
- // If we find that we load/store LR between LoopDec and LoopEnd, expect
- // that the decremented value has been spilled to the stack. Because
- // this value isn't actually going to be produced until the latch, by LE,
- // we would need to generate a real sub. The value is also likely to be
- // reloaded for use of LoopEnd - in which in case we'd need to perform
- // an add because it gets negated again by LE! The other option is to
- // then generate the other form of LE which doesn't perform the sub.
- if (MI.mayLoad() || MI.mayStore())
- Revert =
- MI.getOperand(0).isReg() && MI.getOperand(0).getReg() == ARM::LR;
+ // If we find that LR has been written or read between LoopDec and
+      // LoopEnd, expect that the decremented value is being used elsewhere.
+ // Because this value isn't actually going to be produced until the
+ // latch, by LE, we would need to generate a real sub. The value is also
+      // likely to be copied/reloaded for use of LoopEnd - in which case
+ // we'd need to perform an add because it gets subtracted again by LE!
+ // The other option is to then generate the other form of LE which doesn't
+ // perform the sub.
+ for (auto &MO : MI.operands()) {
+ if (MI.getOpcode() != ARM::t2LoopDec && MO.isReg() &&
+ MO.getReg() == ARM::LR) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Found LR Use/Def: " << MI);
+ Revert = true;
+ break;
+ }
+ }
}
if (Dec && End && Revert)
break;
}
+ LLVM_DEBUG(if (Start) dbgs() << "ARM Loops: Found Loop Start: " << *Start;
+ if (Dec) dbgs() << "ARM Loops: Found Loop Dec: " << *Dec;
+ if (End) dbgs() << "ARM Loops: Found Loop End: " << *End;);
+
if (!Start && !Dec && !End) {
LLVM_DEBUG(dbgs() << "ARM Loops: Not a low-overhead loop.\n");
return Changed;
- } if (!(Start && Dec && End)) {
- report_fatal_error("Failed to find all loop components");
+ } else if (!(Start && Dec && End)) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Failed to find all loop components.\n");
+ return false;
}
- if (!End->getOperand(1).isMBB() ||
- End->getOperand(1).getMBB() != ML->getHeader())
- report_fatal_error("Expected LoopEnd to target Loop Header");
+ if (!End->getOperand(1).isMBB())
+ report_fatal_error("Expected LoopEnd to target basic block");
+
+ // TODO Maybe there's cases where the target doesn't have to be the header,
+ // but for now be safe and revert.
+ if (End->getOperand(1).getMBB() != ML->getHeader()) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: LoopEnd is not targetting header.\n");
+ Revert = true;
+ }
// The WLS and LE instructions have 12-bits for the label offset. WLS
// requires a positive offset, while LE uses negative.
@@ -216,41 +341,57 @@ bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) {
Revert = true;
}
- LLVM_DEBUG(dbgs() << "ARM Loops:\n - Found Loop Start: " << *Start
- << " - Found Loop Dec: " << *Dec
- << " - Found Loop End: " << *End);
+ MachineInstr *InsertPt = Revert ? nullptr : IsSafeToDefineLR(Start);
+ if (!InsertPt) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Unable to find safe insertion point.\n");
+ Revert = true;
+ } else
+ LLVM_DEBUG(dbgs() << "ARM Loops: Start insertion point: " << *InsertPt);
- Expand(ML, Start, Dec, End, Revert);
+ Expand(ML, Start, InsertPt, Dec, End, Revert);
return true;
}
// WhileLoopStart holds the exit block, so produce a cmp lr, 0 and then a
// beq that branches to the exit branch.
-// FIXME: Need to check that we're not trashing the CPSR when generating the
-// cmp. We could also try to generate a cbz if the value in LR is also in
+// TODO: We could also try to generate a cbz if the value in LR is also in
// another low register.
void ARMLowOverheadLoops::RevertWhile(MachineInstr *MI) const {
LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp: " << *MI);
MachineBasicBlock *MBB = MI->getParent();
MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
TII->get(ARM::t2CMPri));
- MIB.addReg(ARM::LR);
+ MIB.add(MI->getOperand(0));
MIB.addImm(0);
MIB.addImm(ARMCC::AL);
- MIB.addReg(ARM::CPSR);
+ MIB.addReg(ARM::NoRegister);
+
+ MachineBasicBlock *DestBB = MI->getOperand(1).getMBB();
+ unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, 254) ?
+ ARM::tBcc : ARM::t2Bcc;
- // TODO: Try to use tBcc instead
- MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2Bcc));
+ MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(BrOpc));
MIB.add(MI->getOperand(1)); // branch target
MIB.addImm(ARMCC::EQ); // condition code
MIB.addReg(ARM::CPSR);
MI->eraseFromParent();
}
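The branch opcode chosen by RevertWhile (and by RevertLoopEnd below) follows a simple distance rule. A hypothetical standalone version: the 254-byte bound comes from the isBBInRange calls in this patch, while the negative bound is an assumption from the tBcc encoding:

enum class BranchOpc { tBcc, t2Bcc };

// The 16-bit tBcc encodes a signed, halfword-scaled 8-bit offset, so only
// nearby targets qualify; anything further needs the 32-bit t2Bcc.
BranchOpc pickCondBranch(int bytesToTarget) {
  if (bytesToTarget >= -256 && bytesToTarget <= 254)
    return BranchOpc::tBcc;
  return BranchOpc::t2Bcc;
}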
-// TODO: Check flags so that we can possibly generate a tSubs or tSub.
-void ARMLowOverheadLoops::RevertLoopDec(MachineInstr *MI) const {
+bool ARMLowOverheadLoops::RevertLoopDec(MachineInstr *MI,
+ bool AllowFlags) const {
LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to sub: " << *MI);
MachineBasicBlock *MBB = MI->getParent();
+
+ // If nothing uses or defines CPSR between LoopDec and LoopEnd, use a t2SUBS.
+ bool SetFlags = false;
+ if (AllowFlags) {
+ if (auto *Def = SearchForDef(MI, MBB->end(), ARM::CPSR)) {
+ if (!SearchForUse(MI, MBB->end(), ARM::CPSR) &&
+ Def->getOpcode() == ARM::t2LoopEnd)
+ SetFlags = true;
+ }
+ }
+
MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
TII->get(ARM::t2SUBri));
MIB.addDef(ARM::LR);
@@ -258,28 +399,39 @@ void ARMLowOverheadLoops::RevertLoopDec(MachineInstr *MI) const {
MIB.add(MI->getOperand(2));
MIB.addImm(ARMCC::AL);
MIB.addReg(0);
- MIB.addReg(0);
+
+ if (SetFlags) {
+ MIB.addReg(ARM::CPSR);
+ MIB->getOperand(5).setIsDef(true);
+ } else
+ MIB.addReg(0);
+
MI->eraseFromParent();
+ return SetFlags;
}
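The flag reuse above can be modelled in a few lines. The sketch below assumes, as the code checks, that nothing between the decrement and the loop end touches CPSR; all names are illustrative:

// If the decrement is emitted as a flag-setting SUBS, the Z flag already
// reflects whether LR reached zero, so RevertLoopEnd can skip its CMP and
// branch on the existing flags.
struct StatusFlags { bool Z = false; };

int subs(int lr, StatusFlags &F) {
  int r = lr - 1;
  F.Z = (r == 0);               // t2SUBS writes CPSR
  return r;
}

bool branchBackTaken(int &lr, StatusFlags &F) {
  lr = subs(lr, F);             // RevertLoopDec with SetFlags == true
  return !F.Z;                  // BNE without a preceding CMP
}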
// Generate a subs, or sub and cmp, and a branch instead of an LE.
-// FIXME: Need to check that we're not trashing the CPSR when generating
-// the cmp.
-void ARMLowOverheadLoops::RevertLoopEnd(MachineInstr *MI) const {
+void ARMLowOverheadLoops::RevertLoopEnd(MachineInstr *MI, bool SkipCmp) const {
LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp, br: " << *MI);
- // Create cmp
MachineBasicBlock *MBB = MI->getParent();
- MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
- TII->get(ARM::t2CMPri));
- MIB.addReg(ARM::LR);
- MIB.addImm(0);
- MIB.addImm(ARMCC::AL);
- MIB.addReg(ARM::CPSR);
+ // Create cmp
+ if (!SkipCmp) {
+ MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
+ TII->get(ARM::t2CMPri));
+ MIB.addReg(ARM::LR);
+ MIB.addImm(0);
+ MIB.addImm(ARMCC::AL);
+ MIB.addReg(ARM::NoRegister);
+ }
+
+ MachineBasicBlock *DestBB = MI->getOperand(1).getMBB();
+ unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, 254) ?
+ ARM::tBcc : ARM::t2Bcc;
- // TODO Try to use tBcc instead.
// Create bne
- MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2Bcc));
+ MachineInstrBuilder MIB =
+ BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(BrOpc));
MIB.add(MI->getOperand(1)); // branch target
MIB.addImm(ARMCC::NE); // condition code
MIB.addReg(ARM::CPSR);
@@ -287,33 +439,13 @@ void ARMLowOverheadLoops::RevertLoopEnd(MachineInstr *MI) const {
}
void ARMLowOverheadLoops::Expand(MachineLoop *ML, MachineInstr *Start,
+ MachineInstr *InsertPt,
MachineInstr *Dec, MachineInstr *End,
bool Revert) {
- auto ExpandLoopStart = [this](MachineLoop *ML, MachineInstr *Start) {
- // The trip count should already been held in LR since the instructions
- // within the loop can only read and write to LR. So, there should be a
- // mov to setup the count. WLS/DLS perform this move, so find the original
- // and delete it - inserting WLS/DLS in its place.
- MachineBasicBlock *MBB = Start->getParent();
- MachineInstr *InsertPt = Start;
- for (auto &I : MRI->def_instructions(ARM::LR)) {
- if (I.getParent() != MBB)
- continue;
-
- // Always execute.
- if (!I.getOperand(2).isImm() || I.getOperand(2).getImm() != ARMCC::AL)
- continue;
-
- // Only handle move reg, if the trip count it will need moving into a reg
- // before the setup instruction anyway.
- if (!I.getDesc().isMoveReg() ||
- !I.getOperand(1).isIdenticalTo(Start->getOperand(0)))
- continue;
- InsertPt = &I;
- break;
- }
-
+ auto ExpandLoopStart = [this](MachineLoop *ML, MachineInstr *Start,
+ MachineInstr *InsertPt) {
+ MachineBasicBlock *MBB = InsertPt->getParent();
unsigned Opc = Start->getOpcode() == ARM::t2DoLoopStart ?
ARM::t2DLS : ARM::t2WLS;
MachineInstrBuilder MIB =
@@ -369,16 +501,54 @@ void ARMLowOverheadLoops::Expand(MachineLoop *ML, MachineInstr *Start,
RevertWhile(Start);
else
Start->eraseFromParent();
- RevertLoopDec(Dec);
- RevertLoopEnd(End);
+ bool FlagsAlreadySet = RevertLoopDec(Dec, true);
+ RevertLoopEnd(End, FlagsAlreadySet);
} else {
- Start = ExpandLoopStart(ML, Start);
+ Start = ExpandLoopStart(ML, Start, InsertPt);
RemoveDeadBranch(Start);
End = ExpandLoopEnd(ML, Dec, End);
RemoveDeadBranch(End);
}
}
+bool ARMLowOverheadLoops::RevertNonLoops() {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Reverting any remaining pseudos...\n");
+ bool Changed = false;
+
+ for (auto &MBB : *MF) {
+ SmallVector<MachineInstr*, 4> Starts;
+ SmallVector<MachineInstr*, 4> Decs;
+ SmallVector<MachineInstr*, 4> Ends;
+
+ for (auto &I : MBB) {
+ if (IsLoopStart(I))
+ Starts.push_back(&I);
+ else if (I.getOpcode() == ARM::t2LoopDec)
+ Decs.push_back(&I);
+ else if (I.getOpcode() == ARM::t2LoopEnd)
+ Ends.push_back(&I);
+ }
+
+ if (Starts.empty() && Decs.empty() && Ends.empty())
+ continue;
+
+ Changed = true;
+
+ for (auto *Start : Starts) {
+ if (Start->getOpcode() == ARM::t2WhileLoopStart)
+ RevertWhile(Start);
+ else
+ Start->eraseFromParent();
+ }
+ for (auto *Dec : Decs)
+ RevertLoopDec(Dec);
+
+ for (auto *End : Ends)
+ RevertLoopEnd(End);
+ }
+ return Changed;
+}
+
FunctionPass *llvm::createARMLowOverheadLoopsPass() {
return new ARMLowOverheadLoops();
}
diff --git a/lib/Target/ARM/ARMMCInstLower.cpp b/lib/Target/ARM/ARMMCInstLower.cpp
index 90c5ad025e56..c92689f4942e 100644
--- a/lib/Target/ARM/ARMMCInstLower.cpp
+++ b/lib/Target/ARM/ARMMCInstLower.cpp
@@ -74,8 +74,8 @@ bool ARMAsmPrinter::lowerOperand(const MachineOperand &MO,
switch (MO.getType()) {
default: llvm_unreachable("unknown operand type");
case MachineOperand::MO_Register:
- // Ignore all non-CPSR implicit register operands.
- if (MO.isImplicit() && MO.getReg() != ARM::CPSR)
+ // Ignore all implicit register operands.
+ if (MO.isImplicit())
return false;
assert(!MO.getSubReg() && "Subregs should be eliminated!");
MCOp = MCOperand::createReg(MO.getReg());
diff --git a/lib/Target/ARM/ARMMachineFunctionInfo.h b/lib/Target/ARM/ARMMachineFunctionInfo.h
index 90d794cd27b1..bb136e92329b 100644
--- a/lib/Target/ARM/ARMMachineFunctionInfo.h
+++ b/lib/Target/ARM/ARMMachineFunctionInfo.h
@@ -16,6 +16,7 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/IR/GlobalVariable.h"
#include "llvm/Support/ErrorHandling.h"
#include <utility>
@@ -130,6 +131,10 @@ class ARMFunctionInfo : public MachineFunctionInfo {
/// The amount the literal pool has been increased by due to promoted globals.
int PromotedGlobalsIncrease = 0;
+ /// True if r0 will be preserved by a call to this function (e.g. C++
+ /// con/destructors).
+ bool PreservesR0 = false;
+
public:
ARMFunctionInfo() = default;
@@ -247,6 +252,9 @@ public:
}
DenseMap<unsigned, unsigned> EHPrologueRemappedRegs;
+
+ void setPreservesR0() { PreservesR0 = true; }
+ bool getPreservesR0() const { return PreservesR0; }
};
} // end namespace llvm
diff --git a/lib/Target/ARM/ARMParallelDSP.cpp b/lib/Target/ARM/ARMParallelDSP.cpp
index 5389d09bf7d7..ae5657a0a2c1 100644
--- a/lib/Target/ARM/ARMParallelDSP.cpp
+++ b/lib/Target/ARM/ARMParallelDSP.cpp
@@ -1,4 +1,4 @@
-//===- ParallelDSP.cpp - Parallel DSP Pass --------------------------------===//
+//===- ARMParallelDSP.cpp - Parallel DSP Pass -----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -18,13 +18,11 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
-#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/OrderedBasicBlock.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/NoFolder.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Pass.h"
#include "llvm/PassRegistry.h"
#include "llvm/PassSupport.h"
@@ -45,54 +43,39 @@ static cl::opt<bool>
DisableParallelDSP("disable-arm-parallel-dsp", cl::Hidden, cl::init(false),
cl::desc("Disable the ARM Parallel DSP pass"));
+static cl::opt<unsigned>
+NumLoadLimit("arm-parallel-dsp-load-limit", cl::Hidden, cl::init(16),
+ cl::desc("Limit the number of loads analysed"));
+
namespace {
- struct OpChain;
- struct BinOpChain;
+ struct MulCandidate;
class Reduction;
- using OpChainList = SmallVector<std::unique_ptr<OpChain>, 8>;
- using ReductionList = SmallVector<Reduction, 8>;
- using ValueList = SmallVector<Value*, 8>;
- using MemInstList = SmallVector<LoadInst*, 8>;
- using PMACPair = std::pair<BinOpChain*,BinOpChain*>;
- using PMACPairList = SmallVector<PMACPair, 8>;
- using Instructions = SmallVector<Instruction*,16>;
- using MemLocList = SmallVector<MemoryLocation, 4>;
+ using MulCandList = SmallVector<std::unique_ptr<MulCandidate>, 8>;
+ using MemInstList = SmallVectorImpl<LoadInst*>;
+ using MulPairList = SmallVector<std::pair<MulCandidate*, MulCandidate*>, 8>;
- struct OpChain {
+ // 'MulCandidate' holds the multiplication instructions that are candidates
+ // for parallel execution.
+ struct MulCandidate {
Instruction *Root;
- ValueList AllValues;
- MemInstList VecLd; // List of all load instructions.
- MemInstList Loads;
+ Value* LHS;
+ Value* RHS;
+ bool Exchange = false;
bool ReadOnly = true;
+ bool Paired = false;
+ SmallVector<LoadInst*, 2> VecLd; // Container for loads to widen.
- OpChain(Instruction *I, ValueList &vl) : Root(I), AllValues(vl) { }
- virtual ~OpChain() = default;
+ MulCandidate(Instruction *I, Value *lhs, Value *rhs) :
+ Root(I), LHS(lhs), RHS(rhs) { }
- void PopulateLoads() {
- for (auto *V : AllValues) {
- if (auto *Ld = dyn_cast<LoadInst>(V))
- Loads.push_back(Ld);
- }
+ bool HasTwoLoadInputs() const {
+ return isa<LoadInst>(LHS) && isa<LoadInst>(RHS);
}
- unsigned size() const { return AllValues.size(); }
- };
-
- // 'BinOpChain' holds the multiplication instructions that are candidates
- // for parallel execution.
- struct BinOpChain : public OpChain {
- ValueList LHS; // List of all (narrow) left hand operands.
- ValueList RHS; // List of all (narrow) right hand operands.
- bool Exchange = false;
-
- BinOpChain(Instruction *I, ValueList &lhs, ValueList &rhs) :
- OpChain(I, lhs), LHS(lhs), RHS(rhs) {
- for (auto *V : RHS)
- AllValues.push_back(V);
- }
-
- bool AreSymmetrical(BinOpChain *Other);
+ LoadInst *getBaseLoad() const {
+ return VecLd.front();
+ }
};
/// Represent a sequence of multiply-accumulate operations with the aim to
@@ -100,9 +83,9 @@ namespace {
class Reduction {
Instruction *Root = nullptr;
Value *Acc = nullptr;
- OpChainList Muls;
- PMACPairList MulPairs;
- SmallPtrSet<Instruction*, 4> Adds;
+ MulCandList Muls;
+ MulPairList MulPairs;
+ SetVector<Instruction*> Adds;
public:
Reduction() = delete;
@@ -112,10 +95,35 @@ namespace {
/// Record an Add instruction that is a part of this reduction.
void InsertAdd(Instruction *I) { Adds.insert(I); }
- /// Record a BinOpChain, rooted at a Mul instruction, that is a part of
- /// this reduction.
- void InsertMul(Instruction *I, ValueList &LHS, ValueList &RHS) {
- Muls.push_back(make_unique<BinOpChain>(I, LHS, RHS));
+ /// Create MulCandidates, each rooted at a Mul instruction, that are part
+ /// of this reduction.
+ void InsertMuls() {
+ auto GetMulOperand = [](Value *V) -> Instruction* {
+ if (auto *SExt = dyn_cast<SExtInst>(V)) {
+ if (auto *I = dyn_cast<Instruction>(SExt->getOperand(0)))
+ if (I->getOpcode() == Instruction::Mul)
+ return I;
+ } else if (auto *I = dyn_cast<Instruction>(V)) {
+ if (I->getOpcode() == Instruction::Mul)
+ return I;
+ }
+ return nullptr;
+ };
+
+ auto InsertMul = [this](Instruction *I) {
+ Value *LHS = cast<Instruction>(I->getOperand(0))->getOperand(0);
+ Value *RHS = cast<Instruction>(I->getOperand(1))->getOperand(0);
+ Muls.push_back(std::make_unique<MulCandidate>(I, LHS, RHS));
+ };
+
+ for (auto *Add : Adds) {
+ if (Add == Acc)
+ continue;
+ if (auto *Mul = GetMulOperand(Add->getOperand(0)))
+ InsertMul(Mul);
+ if (auto *Mul = GetMulOperand(Add->getOperand(1)))
+ InsertMul(Mul);
+ }
}
/// Add the incoming accumulator value, returns true if a value had not
@@ -128,9 +136,17 @@ namespace {
return true;
}
- /// Set two BinOpChains, rooted at muls, that can be executed as a single
+ /// Set two MulCandidates, rooted at muls, that can be executed as a single
/// parallel operation.
- void AddMulPair(BinOpChain *Mul0, BinOpChain *Mul1) {
+ void AddMulPair(MulCandidate *Mul0, MulCandidate *Mul1,
+ bool Exchange = false) {
+ LLVM_DEBUG(dbgs() << "Pairing:\n"
+ << *Mul0->Root << "\n"
+ << *Mul1->Root << "\n");
+ Mul0->Paired = true;
+ Mul1->Paired = true;
+ if (Exchange)
+ Mul1->Exchange = true;
MulPairs.push_back(std::make_pair(Mul0, Mul1));
}
@@ -141,24 +157,40 @@ namespace {
/// Return the add instruction which is the root of the reduction.
Instruction *getRoot() { return Root; }
+ bool is64Bit() const { return Root->getType()->isIntegerTy(64); }
+
+ Type *getType() const { return Root->getType(); }
+
/// Return the incoming value to be accumulated. This may be null.
Value *getAccumulator() { return Acc; }
/// Return the set of adds that comprise the reduction.
- SmallPtrSetImpl<Instruction*> &getAdds() { return Adds; }
+ SetVector<Instruction*> &getAdds() { return Adds; }
- /// Return the BinOpChain, rooted at mul instruction, that comprise the
+ /// Return the MulCandidates, rooted at mul instructions, that comprise
/// the reduction.
- OpChainList &getMuls() { return Muls; }
+ MulCandList &getMuls() { return Muls; }
- /// Return the BinOpChain, rooted at mul instructions, that have been
+ /// Return the MulCandidates, rooted at mul instructions, that have been
/// paired for parallel execution.
- PMACPairList &getMulPairs() { return MulPairs; }
+ MulPairList &getMulPairs() { return MulPairs; }
/// To finalise, replace the uses of the root with the intrinsic call.
void UpdateRoot(Instruction *SMLAD) {
Root->replaceAllUsesWith(SMLAD);
}
+
+ void dump() {
+ LLVM_DEBUG(dbgs() << "Reduction:\n";
+ for (auto *Add : Adds)
+ LLVM_DEBUG(dbgs() << *Add << "\n");
+ for (auto &Mul : Muls)
+ LLVM_DEBUG(dbgs() << *Mul->Root << "\n"
+ << " " << *Mul->LHS << "\n"
+ << " " << *Mul->RHS << "\n");
+ LLVM_DEBUG(if (Acc) dbgs() << "Acc in: " << *Acc << "\n")
+ );
+ }
};
class WidenedLoad {
@@ -176,13 +208,11 @@ namespace {
}
};
- class ARMParallelDSP : public LoopPass {
+ class ARMParallelDSP : public FunctionPass {
ScalarEvolution *SE;
AliasAnalysis *AA;
TargetLibraryInfo *TLI;
DominatorTree *DT;
- LoopInfo *LI;
- Loop *L;
const DataLayout *DL;
Module *M;
std::map<LoadInst*, LoadInst*> LoadPairs;
@@ -190,13 +220,12 @@ namespace {
std::map<LoadInst*, std::unique_ptr<WidenedLoad>> WideLoads;
template<unsigned>
- bool IsNarrowSequence(Value *V, ValueList &VL);
-
+ bool IsNarrowSequence(Value *V);
+ bool Search(Value *V, BasicBlock *BB, Reduction &R);
bool RecordMemoryOps(BasicBlock *BB);
void InsertParallelMACs(Reduction &Reduction);
bool AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1, MemInstList &VecMem);
- LoadInst* CreateWideLoad(SmallVectorImpl<LoadInst*> &Loads,
- IntegerType *LoadTy);
+ LoadInst* CreateWideLoad(MemInstList &Loads, IntegerType *LoadTy);
bool CreateParallelPairs(Reduction &R);
/// Try to match and generate: SMLAD, SMLADX - Signed Multiply Accumulate
@@ -204,60 +233,38 @@ namespace {
/// products to a 32-bit accumulate operand. Optionally, the instruction can
/// exchange the halfwords of the second operand before performing the
/// arithmetic.
- bool MatchSMLAD(Loop *L);
+ bool MatchSMLAD(Function &F);
public:
static char ID;
- ARMParallelDSP() : LoopPass(ID) { }
-
- bool doInitialization(Loop *L, LPPassManager &LPM) override {
- LoadPairs.clear();
- WideLoads.clear();
- return true;
- }
+ ARMParallelDSP() : FunctionPass(ID) { }
void getAnalysisUsage(AnalysisUsage &AU) const override {
- LoopPass::getAnalysisUsage(AU);
+ FunctionPass::getAnalysisUsage(AU);
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<ScalarEvolutionWrapperPass>();
AU.addRequired<AAResultsWrapperPass>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.addRequired<LoopInfoWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<TargetPassConfig>();
- AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addPreserved<ScalarEvolutionWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
AU.setPreservesCFG();
}
- bool runOnLoop(Loop *TheLoop, LPPassManager &) override {
+ bool runOnFunction(Function &F) override {
if (DisableParallelDSP)
return false;
- L = TheLoop;
+ if (skipFunction(F))
+ return false;
+
SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
- TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
auto &TPC = getAnalysis<TargetPassConfig>();
- BasicBlock *Header = TheLoop->getHeader();
- if (!Header)
- return false;
-
- // TODO: We assume the loop header and latch to be the same block.
- // This is not a fundamental restriction, but lifting this would just
- // require more work to do the transformation and then patch up the CFG.
- if (Header != TheLoop->getLoopLatch()) {
- LLVM_DEBUG(dbgs() << "The loop header is not the loop latch: not "
- "running pass ARMParallelDSP\n");
- return false;
- }
-
- if (!TheLoop->getLoopPreheader())
- InsertPreheaderForLoop(L, DT, LI, nullptr, true);
-
- Function &F = *Header->getParent();
M = F.getParent();
DL = &M->getDataLayout();
@@ -282,17 +289,10 @@ namespace {
return false;
}
- LoopAccessInfo LAI(L, SE, TLI, AA, DT, LI);
-
LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n");
LLVM_DEBUG(dbgs() << " - " << F.getName() << "\n\n");
- if (!RecordMemoryOps(Header)) {
- LLVM_DEBUG(dbgs() << " - No sequential loads found.\n");
- return false;
- }
-
- bool Changes = MatchSMLAD(L);
+ bool Changes = MatchSMLAD(F);
return Changes;
}
};
@@ -331,40 +331,14 @@ bool ARMParallelDSP::AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1,
// TODO: we currently only collect i16, and will support i8 later, so that's
// why we check that types are equal to MaxBitWidth, and not <= MaxBitWidth.
template<unsigned MaxBitWidth>
-bool ARMParallelDSP::IsNarrowSequence(Value *V, ValueList &VL) {
- ConstantInt *CInt;
-
- if (match(V, m_ConstantInt(CInt))) {
- // TODO: if a constant is used, it needs to fit within the bit width.
- return false;
- }
-
- auto *I = dyn_cast<Instruction>(V);
- if (!I)
- return false;
-
- Value *Val, *LHS, *RHS;
- if (match(V, m_Trunc(m_Value(Val)))) {
- if (cast<TruncInst>(I)->getDestTy()->getIntegerBitWidth() == MaxBitWidth)
- return IsNarrowSequence<MaxBitWidth>(Val, VL);
- } else if (match(V, m_Add(m_Value(LHS), m_Value(RHS)))) {
- // TODO: we need to implement sadd16/sadd8 for this, which enables to
- // also do the rewrite for smlad8.ll, but it is unsupported for now.
- return false;
- } else if (match(V, m_ZExtOrSExt(m_Value(Val)))) {
- if (cast<CastInst>(I)->getSrcTy()->getIntegerBitWidth() != MaxBitWidth)
+bool ARMParallelDSP::IsNarrowSequence(Value *V) {
+ if (auto *SExt = dyn_cast<SExtInst>(V)) {
+ if (SExt->getSrcTy()->getIntegerBitWidth() != MaxBitWidth)
return false;
- if (match(Val, m_Load(m_Value()))) {
- auto *Ld = cast<LoadInst>(Val);
-
- // Check that these load could be paired.
- if (!LoadPairs.count(Ld) && !OffsetLoads.count(Ld))
- return false;
-
- VL.push_back(Val);
- VL.push_back(I);
- return true;
+ if (auto *Ld = dyn_cast<LoadInst>(SExt->getOperand(0))) {
+ // Check that this load could be paired.
+ return LoadPairs.count(Ld) || OffsetLoads.count(Ld);
}
}
return false;
@@ -375,6 +349,9 @@ bool ARMParallelDSP::IsNarrowSequence(Value *V, ValueList &VL) {
bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {
SmallVector<LoadInst*, 8> Loads;
SmallVector<Instruction*, 8> Writes;
+ LoadPairs.clear();
+ WideLoads.clear();
+ OrderedBasicBlock OrderedBB(BB);
// Collect loads and instructions that may write to memory. For now we only
// record loads which are simple, sign-extended and have a single user.
@@ -389,21 +366,24 @@ bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {
Loads.push_back(Ld);
}
+ if (Loads.empty() || Loads.size() > NumLoadLimit)
+ return false;
+
using InstSet = std::set<Instruction*>;
using DepMap = std::map<Instruction*, InstSet>;
DepMap RAWDeps;
// Record any writes that may alias a load.
const auto Size = LocationSize::unknown();
- for (auto Read : Loads) {
- for (auto Write : Writes) {
+ for (auto Write : Writes) {
+ for (auto Read : Loads) {
MemoryLocation ReadLoc =
MemoryLocation(Read->getPointerOperand(), Size);
if (!isModOrRefSet(intersectModRef(AA->getModRefInfo(Write, ReadLoc),
ModRefInfo::ModRef)))
continue;
- if (DT->dominates(Write, Read))
+ if (OrderedBB.dominates(Write, Read))
RAWDeps[Read].insert(Write);
}
}
@@ -411,17 +391,16 @@ bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {
// Check whether there's not a write between the two loads which would
// prevent them from being safely merged.
auto SafeToPair = [&](LoadInst *Base, LoadInst *Offset) {
- LoadInst *Dominator = DT->dominates(Base, Offset) ? Base : Offset;
- LoadInst *Dominated = DT->dominates(Base, Offset) ? Offset : Base;
+ LoadInst *Dominator = OrderedBB.dominates(Base, Offset) ? Base : Offset;
+ LoadInst *Dominated = OrderedBB.dominates(Base, Offset) ? Offset : Base;
if (RAWDeps.count(Dominated)) {
InstSet &WritesBefore = RAWDeps[Dominated];
for (auto Before : WritesBefore) {
-
// We can't move the second load backward, past a write, to merge
// with the first load.
- if (DT->dominates(Dominator, Before))
+ if (OrderedBB.dominates(Dominator, Before))
return false;
}
}
@@ -431,7 +410,7 @@ bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {
// Record base, offset load pairs.
for (auto *Base : Loads) {
for (auto *Offset : Loads) {
- if (Base == Offset)
+ if (Base == Offset || OffsetLoads.count(Offset))
continue;
if (AreSequentialAccesses<LoadInst>(Base, Offset, *DL, *SE) &&
@@ -453,7 +432,54 @@ bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {
return LoadPairs.size() > 1;
}
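Since every load and write considered here lives in a single basic block, the dominance queries above reduce to instruction order. A hypothetical model of what OrderedBasicBlock provides (the real class computes positions lazily; the types here are stand-ins):

#include <unordered_map>
#include <vector>

struct Instr {};  // stand-in for llvm::Instruction

class OrderedBlock {
  std::unordered_map<const Instr *, unsigned> Pos;

public:
  explicit OrderedBlock(const std::vector<Instr *> &Insts) {
    unsigned Idx = 0;
    for (const Instr *I : Insts)
      Pos[I] = Idx++;   // record program order once
  }

  // Within one block, "A dominates B" is just "A appears before B".
  bool dominates(const Instr *A, const Instr *B) const {
    return Pos.at(A) < Pos.at(B);
  }
};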
-// Loop Pass that needs to identify integer add/sub reductions of 16-bit vector
+// Search recursively back through the operands to find a tree of values that
+// form a multiply-accumulate chain. The search records the Add and Mul
+// instructions that form the reduction and allows us to find a single value
+// to be used as the initial input to the accumulator.
+bool ARMParallelDSP::Search(Value *V, BasicBlock *BB, Reduction &R) {
+ // If we find a non-instruction, try to use it as the initial accumulator
+ // value. This may have already been found during the search, in which case
+ // this function will return false, signaling a search failure.
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I)
+ return R.InsertAcc(V);
+
+ if (I->getParent() != BB)
+ return false;
+
+ switch (I->getOpcode()) {
+ default:
+ break;
+ case Instruction::PHI:
+ // Could be the accumulator value.
+ return R.InsertAcc(V);
+ case Instruction::Add: {
+ // Adds should be adding together two muls, or another add and a mul to
+ // be within the mac chain. One of the operands may also be the
+ // accumulator value, at which point we should stop searching.
+ R.InsertAdd(I);
+ Value *LHS = I->getOperand(0);
+ Value *RHS = I->getOperand(1);
+ bool ValidLHS = Search(LHS, BB, R);
+ bool ValidRHS = Search(RHS, BB, R);
+
+ if (ValidLHS && ValidRHS)
+ return true;
+
+ return R.InsertAcc(I);
+ }
+ case Instruction::Mul: {
+ Value *MulOp0 = I->getOperand(0);
+ Value *MulOp1 = I->getOperand(1);
+ return IsNarrowSequence<16>(MulOp0) && IsNarrowSequence<16>(MulOp1);
+ }
+ case Instruction::SExt:
+ return Search(I->getOperand(0), BB, R);
+ }
+ return false;
+}
+
+// The pass needs to identify integer add/sub reductions of 16-bit vector
// multiplications.
// To use SMLAD:
// 1) we first need to find integer add then look for this pattern:
@@ -484,88 +510,39 @@ bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {
// If loop invariants are used instead of loads, these need to be packed
// before the loop begins.
//
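For reference, the scalar semantics of the smlad intrinsic being matched here can be written as below; this sketch ignores the real instruction's Q-flag saturation behaviour:

#include <cstdint>

// Dual signed 16x16->32 multiply with accumulate: both halfword products are
// added to the accumulator. smladx is the same with the halfwords of the
// second operand exchanged first.
int32_t smlad(uint32_t a, uint32_t b, int32_t acc) {
  int16_t a0 = (int16_t)(a & 0xffff), a1 = (int16_t)(a >> 16);
  int16_t b0 = (int16_t)(b & 0xffff), b1 = (int16_t)(b >> 16);
  return acc + (int32_t)a0 * b0 + (int32_t)a1 * b1;
}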
-bool ARMParallelDSP::MatchSMLAD(Loop *L) {
- // Search recursively back through the operands to find a tree of values that
- // form a multiply-accumulate chain. The search records the Add and Mul
- // instructions that form the reduction and allows us to find a single value
- // to be used as the initial input to the accumlator.
- std::function<bool(Value*, Reduction&)> Search = [&]
- (Value *V, Reduction &R) -> bool {
-
- // If we find a non-instruction, try to use it as the initial accumulator
- // value. This may have already been found during the search in which case
- // this function will return false, signaling a search fail.
- auto *I = dyn_cast<Instruction>(V);
- if (!I)
- return R.InsertAcc(V);
-
- switch (I->getOpcode()) {
- default:
- break;
- case Instruction::PHI:
- // Could be the accumulator value.
- return R.InsertAcc(V);
- case Instruction::Add: {
- // Adds should be adding together two muls, or another add and a mul to
- // be within the mac chain. One of the operands may also be the
- // accumulator value at which point we should stop searching.
- bool ValidLHS = Search(I->getOperand(0), R);
- bool ValidRHS = Search(I->getOperand(1), R);
- if (!ValidLHS && !ValidLHS)
- return false;
- else if (ValidLHS && ValidRHS) {
- R.InsertAdd(I);
- return true;
- } else {
- R.InsertAdd(I);
- return R.InsertAcc(I);
- }
- }
- case Instruction::Mul: {
- Value *MulOp0 = I->getOperand(0);
- Value *MulOp1 = I->getOperand(1);
- if (isa<SExtInst>(MulOp0) && isa<SExtInst>(MulOp1)) {
- ValueList LHS;
- ValueList RHS;
- if (IsNarrowSequence<16>(MulOp0, LHS) &&
- IsNarrowSequence<16>(MulOp1, RHS)) {
- R.InsertMul(I, LHS, RHS);
- return true;
- }
- }
- return false;
- }
- case Instruction::SExt:
- return Search(I->getOperand(0), R);
- }
- return false;
- };
-
+bool ARMParallelDSP::MatchSMLAD(Function &F) {
bool Changed = false;
- SmallPtrSet<Instruction*, 4> AllAdds;
- BasicBlock *Latch = L->getLoopLatch();
- for (Instruction &I : reverse(*Latch)) {
- if (I.getOpcode() != Instruction::Add)
+ for (auto &BB : F) {
+ SmallPtrSet<Instruction*, 4> AllAdds;
+ if (!RecordMemoryOps(&BB))
continue;
- if (AllAdds.count(&I))
- continue;
+ for (Instruction &I : reverse(BB)) {
+ if (I.getOpcode() != Instruction::Add)
+ continue;
- const auto *Ty = I.getType();
- if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64))
- continue;
+ if (AllAdds.count(&I))
+ continue;
- Reduction R(&I);
- if (!Search(&I, R))
- continue;
+ const auto *Ty = I.getType();
+ if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64))
+ continue;
- if (!CreateParallelPairs(R))
- continue;
+ Reduction R(&I);
+ if (!Search(&I, &BB, R))
+ continue;
- InsertParallelMACs(R);
- Changed = true;
- AllAdds.insert(R.getAdds().begin(), R.getAdds().end());
+ R.InsertMuls();
+ LLVM_DEBUG(dbgs() << "After search, Reduction:\n"; R.dump());
+
+ if (!CreateParallelPairs(R))
+ continue;
+
+ InsertParallelMACs(R);
+ Changed = true;
+ AllAdds.insert(R.getAdds().begin(), R.getAdds().end());
+ }
}
return Changed;
@@ -578,87 +555,57 @@ bool ARMParallelDSP::CreateParallelPairs(Reduction &R) {
return false;
// Check that the muls operate directly upon sign extended loads.
- for (auto &MulChain : R.getMuls()) {
- // A mul has 2 operands, and a narrow op consist of sext and a load; thus
- // we expect at least 4 items in this operand value list.
- if (MulChain->size() < 4) {
- LLVM_DEBUG(dbgs() << "Operand list too short.\n");
+ for (auto &MulCand : R.getMuls()) {
+ if (!MulCand->HasTwoLoadInputs())
return false;
- }
- MulChain->PopulateLoads();
- ValueList &LHS = static_cast<BinOpChain*>(MulChain.get())->LHS;
- ValueList &RHS = static_cast<BinOpChain*>(MulChain.get())->RHS;
-
- // Use +=2 to skip over the expected extend instructions.
- for (unsigned i = 0, e = LHS.size(); i < e; i += 2) {
- if (!isa<LoadInst>(LHS[i]) || !isa<LoadInst>(RHS[i]))
- return false;
- }
}
- auto CanPair = [&](Reduction &R, BinOpChain *PMul0, BinOpChain *PMul1) {
- if (!PMul0->AreSymmetrical(PMul1))
- return false;
-
+ auto CanPair = [&](Reduction &R, MulCandidate *PMul0, MulCandidate *PMul1) {
// The first elements of each vector should be loads with sexts. If we
// find that it's two pairs of consecutive loads, then these can be
// transformed into two wider loads and the users can be replaced with
// DSP intrinsics.
- for (unsigned x = 0; x < PMul0->LHS.size(); x += 2) {
- auto *Ld0 = dyn_cast<LoadInst>(PMul0->LHS[x]);
- auto *Ld1 = dyn_cast<LoadInst>(PMul1->LHS[x]);
- auto *Ld2 = dyn_cast<LoadInst>(PMul0->RHS[x]);
- auto *Ld3 = dyn_cast<LoadInst>(PMul1->RHS[x]);
-
- if (!Ld0 || !Ld1 || !Ld2 || !Ld3)
- return false;
+ auto Ld0 = static_cast<LoadInst*>(PMul0->LHS);
+ auto Ld1 = static_cast<LoadInst*>(PMul1->LHS);
+ auto Ld2 = static_cast<LoadInst*>(PMul0->RHS);
+ auto Ld3 = static_cast<LoadInst*>(PMul1->RHS);
- LLVM_DEBUG(dbgs() << "Loads:\n"
- << " - " << *Ld0 << "\n"
- << " - " << *Ld1 << "\n"
- << " - " << *Ld2 << "\n"
- << " - " << *Ld3 << "\n");
-
- if (AreSequentialLoads(Ld0, Ld1, PMul0->VecLd)) {
- if (AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
- LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
- R.AddMulPair(PMul0, PMul1);
- return true;
- } else if (AreSequentialLoads(Ld3, Ld2, PMul1->VecLd)) {
- LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
- LLVM_DEBUG(dbgs() << " exchanging Ld2 and Ld3\n");
- PMul1->Exchange = true;
- R.AddMulPair(PMul0, PMul1);
- return true;
- }
- } else if (AreSequentialLoads(Ld1, Ld0, PMul0->VecLd) &&
- AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
+ if (AreSequentialLoads(Ld0, Ld1, PMul0->VecLd)) {
+ if (AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
- LLVM_DEBUG(dbgs() << " exchanging Ld0 and Ld1\n");
- LLVM_DEBUG(dbgs() << " and swapping muls\n");
- PMul0->Exchange = true;
- // Only the second operand can be exchanged, so swap the muls.
- R.AddMulPair(PMul1, PMul0);
+ R.AddMulPair(PMul0, PMul1);
+ return true;
+ } else if (AreSequentialLoads(Ld3, Ld2, PMul1->VecLd)) {
+ LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
+ LLVM_DEBUG(dbgs() << " exchanging Ld2 and Ld3\n");
+ R.AddMulPair(PMul0, PMul1, true);
return true;
}
+ } else if (AreSequentialLoads(Ld1, Ld0, PMul0->VecLd) &&
+ AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
+ LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
+ LLVM_DEBUG(dbgs() << " exchanging Ld0 and Ld1\n");
+ LLVM_DEBUG(dbgs() << " and swapping muls\n");
+ // Only the second operand can be exchanged, so swap the muls.
+ R.AddMulPair(PMul1, PMul0, true);
+ return true;
}
return false;
};
- OpChainList &Muls = R.getMuls();
+ MulCandList &Muls = R.getMuls();
const unsigned Elems = Muls.size();
- SmallPtrSet<const Instruction*, 4> Paired;
for (unsigned i = 0; i < Elems; ++i) {
- BinOpChain *PMul0 = static_cast<BinOpChain*>(Muls[i].get());
- if (Paired.count(PMul0->Root))
+ MulCandidate *PMul0 = static_cast<MulCandidate*>(Muls[i].get());
+ if (PMul0->Paired)
continue;
for (unsigned j = 0; j < Elems; ++j) {
if (i == j)
continue;
- BinOpChain *PMul1 = static_cast<BinOpChain*>(Muls[j].get());
- if (Paired.count(PMul1->Root))
+ MulCandidate *PMul1 = static_cast<MulCandidate*>(Muls[j].get());
+ if (PMul1->Paired)
continue;
const Instruction *Mul0 = PMul0->Root;
@@ -668,29 +615,19 @@ bool ARMParallelDSP::CreateParallelPairs(Reduction &R) {
assert(PMul0 != PMul1 && "expected different chains");
- if (CanPair(R, PMul0, PMul1)) {
- Paired.insert(Mul0);
- Paired.insert(Mul1);
+ if (CanPair(R, PMul0, PMul1))
break;
- }
}
}
return !R.getMulPairs().empty();
}
-
void ARMParallelDSP::InsertParallelMACs(Reduction &R) {
- auto CreateSMLADCall = [&](SmallVectorImpl<LoadInst*> &VecLd0,
- SmallVectorImpl<LoadInst*> &VecLd1,
- Value *Acc, bool Exchange,
- Instruction *InsertAfter) {
+ auto CreateSMLAD = [&](LoadInst* WideLd0, LoadInst *WideLd1,
+ Value *Acc, bool Exchange,
+ Instruction *InsertAfter) {
// Replace the reduction chain with an intrinsic call
- IntegerType *Ty = IntegerType::get(M->getContext(), 32);
- LoadInst *WideLd0 = WideLoads.count(VecLd0[0]) ?
- WideLoads[VecLd0[0]]->getLoad() : CreateWideLoad(VecLd0, Ty);
- LoadInst *WideLd1 = WideLoads.count(VecLd1[0]) ?
- WideLoads[VecLd1[0]]->getLoad() : CreateWideLoad(VecLd1, Ty);
Value* Args[] = { WideLd0, WideLd1, Acc };
Function *SMLAD = nullptr;
@@ -704,34 +641,95 @@ void ARMParallelDSP::InsertParallelMACs(Reduction &R) {
Intrinsic::getDeclaration(M, Intrinsic::arm_smlald);
IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
- ++BasicBlock::iterator(InsertAfter));
+ BasicBlock::iterator(InsertAfter));
Instruction *Call = Builder.CreateCall(SMLAD, Args);
NumSMLAD++;
return Call;
};
- Instruction *InsertAfter = R.getRoot();
+ // Return the instruction after the dominated instruction.
+ auto GetInsertPoint = [this](Value *A, Value *B) {
+ assert((isa<Instruction>(A) || isa<Instruction>(B)) &&
+ "expected at least one instruction");
+
+ Value *V = nullptr;
+ if (!isa<Instruction>(A))
+ V = B;
+ else if (!isa<Instruction>(B))
+ V = A;
+ else
+ V = DT->dominates(cast<Instruction>(A), cast<Instruction>(B)) ? B : A;
+
+ return &*++BasicBlock::iterator(cast<Instruction>(V));
+ };
+
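A standalone restatement of GetInsertPoint, with hypothetical types, may help: at most one operand is a non-instruction, and the insertion point is just after whichever instruction comes later.

#include <cassert>

struct Inst { unsigned Pos; };  // position stands in for dominance order

// Return the instruction after which a new add can be inserted so that it
// sees both operands; a null pointer models a constant or function argument.
const Inst *insertAfter(const Inst *A, const Inst *B) {
  assert((A || B) && "expected at least one instruction");
  if (!A)
    return B;
  if (!B)
    return A;
  return (A->Pos < B->Pos) ? B : A;   // A dominates B => insert after B
}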
Value *Acc = R.getAccumulator();
- if (!Acc)
- Acc = ConstantInt::get(IntegerType::get(M->getContext(), 32), 0);
- LLVM_DEBUG(dbgs() << "Root: " << *InsertAfter << "\n"
- << "Acc: " << *Acc << "\n");
+ // For any muls that were discovered but not paired, accumulate their values
+ // as before.
+ IRBuilder<NoFolder> Builder(R.getRoot()->getParent());
+ MulCandList &MulCands = R.getMuls();
+ for (auto &MulCand : MulCands) {
+ if (MulCand->Paired)
+ continue;
+
+ Instruction *Mul = cast<Instruction>(MulCand->Root);
+ LLVM_DEBUG(dbgs() << "Accumulating unpaired mul: " << *Mul << "\n");
+
+ if (R.getType() != Mul->getType()) {
+ assert(R.is64Bit() && "expected 64-bit result");
+ Builder.SetInsertPoint(&*++BasicBlock::iterator(Mul));
+ Mul = cast<Instruction>(Builder.CreateSExt(Mul, R.getRoot()->getType()));
+ }
+
+ if (!Acc) {
+ Acc = Mul;
+ continue;
+ }
+
+ // If Acc is the original incoming value to the reduction, it could be a
+ // phi. But the phi will dominate Mul, meaning that Mul will be the
+ // insertion point.
+ Builder.SetInsertPoint(GetInsertPoint(Mul, Acc));
+ Acc = Builder.CreateAdd(Mul, Acc);
+ }
+
+ if (!Acc) {
+ Acc = R.is64Bit() ?
+ ConstantInt::get(IntegerType::get(M->getContext(), 64), 0) :
+ ConstantInt::get(IntegerType::get(M->getContext(), 32), 0);
+ } else if (Acc->getType() != R.getType()) {
+ Builder.SetInsertPoint(R.getRoot());
+ Acc = Builder.CreateSExt(Acc, R.getType());
+ }
+
+ // Roughly sort the mul pairs in their program order.
+ OrderedBasicBlock OrderedBB(R.getRoot()->getParent());
+ llvm::sort(R.getMulPairs(), [&OrderedBB](auto &PairA, auto &PairB) {
+ const Instruction *A = PairA.first->Root;
+ const Instruction *B = PairB.first->Root;
+ return OrderedBB.dominates(A, B);
+ });
+
+ IntegerType *Ty = IntegerType::get(M->getContext(), 32);
for (auto &Pair : R.getMulPairs()) {
- BinOpChain *PMul0 = Pair.first;
- BinOpChain *PMul1 = Pair.second;
- LLVM_DEBUG(dbgs() << "Muls:\n"
- << "- " << *PMul0->Root << "\n"
- << "- " << *PMul1->Root << "\n");
-
- Acc = CreateSMLADCall(PMul0->VecLd, PMul1->VecLd, Acc, PMul1->Exchange,
- InsertAfter);
- InsertAfter = cast<Instruction>(Acc);
+ MulCandidate *LHSMul = Pair.first;
+ MulCandidate *RHSMul = Pair.second;
+ LoadInst *BaseLHS = LHSMul->getBaseLoad();
+ LoadInst *BaseRHS = RHSMul->getBaseLoad();
+ LoadInst *WideLHS = WideLoads.count(BaseLHS) ?
+ WideLoads[BaseLHS]->getLoad() : CreateWideLoad(LHSMul->VecLd, Ty);
+ LoadInst *WideRHS = WideLoads.count(BaseRHS) ?
+ WideLoads[BaseRHS]->getLoad() : CreateWideLoad(RHSMul->VecLd, Ty);
+
+ Instruction *InsertAfter = GetInsertPoint(WideLHS, WideRHS);
+ InsertAfter = GetInsertPoint(InsertAfter, Acc);
+ Acc = CreateSMLAD(WideLHS, WideRHS, Acc, RHSMul->Exchange, InsertAfter);
}
R.UpdateRoot(cast<Instruction>(Acc));
}
-LoadInst* ARMParallelDSP::CreateWideLoad(SmallVectorImpl<LoadInst*> &Loads,
+LoadInst* ARMParallelDSP::CreateWideLoad(MemInstList &Loads,
IntegerType *LoadTy) {
assert(Loads.size() == 2 && "currently only support widening two loads");
@@ -758,8 +756,8 @@ LoadInst* ARMParallelDSP::CreateWideLoad(SmallVectorImpl<LoadInst*> &Loads,
return;
Source->moveBefore(Sink);
- for (auto &U : Source->uses())
- MoveBefore(Source, U.getUser());
+ for (auto &Op : Source->operands())
+ MoveBefore(Op, Source);
};
// Insert the load at the point of the original dominating load.
@@ -784,57 +782,30 @@ LoadInst* ARMParallelDSP::CreateWideLoad(SmallVectorImpl<LoadInst*> &Loads,
// Loads[0] needs trunc while Loads[1] needs a lshr and trunc.
// TODO: Support big-endian as well.
Value *Bottom = IRB.CreateTrunc(WideLoad, Base->getType());
- BaseSExt->setOperand(0, Bottom);
+ Value *NewBaseSExt = IRB.CreateSExt(Bottom, BaseSExt->getType());
+ BaseSExt->replaceAllUsesWith(NewBaseSExt);
IntegerType *OffsetTy = cast<IntegerType>(Offset->getType());
Value *ShiftVal = ConstantInt::get(LoadTy, OffsetTy->getBitWidth());
Value *Top = IRB.CreateLShr(WideLoad, ShiftVal);
Value *Trunc = IRB.CreateTrunc(Top, OffsetTy);
- OffsetSExt->setOperand(0, Trunc);
-
+ Value *NewOffsetSExt = IRB.CreateSExt(Trunc, OffsetSExt->getType());
+ OffsetSExt->replaceAllUsesWith(NewOffsetSExt);
+
+ LLVM_DEBUG(dbgs() << "From Base and Offset:\n"
+ << *Base << "\n" << *Offset << "\n"
+ << "Created Wide Load:\n"
+ << *WideLoad << "\n"
+ << *Bottom << "\n"
+ << *NewBaseSExt << "\n"
+ << *Top << "\n"
+ << *Trunc << "\n"
+ << *NewOffsetSExt << "\n");
WideLoads.emplace(std::make_pair(Base,
- make_unique<WidenedLoad>(Loads, WideLoad)));
+ std::make_unique<WidenedLoad>(Loads, WideLoad)));
return WideLoad;
}
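In scalar terms, the little-endian splitting performed by CreateWideLoad reads roughly like this sketch (16-bit halves of a 32-bit load; big-endian remains a TODO above, and the names here are illustrative):

#include <cstdint>

// One i32 load replaces two adjacent i16 loads: the base half is a trunc,
// the offset half an lshr by 16 followed by a trunc, and fresh sexts replace
// the users of the original sign extensions.
void splitWideLoad(uint32_t Wide, int32_t &BaseSExt, int32_t &OffsetSExt) {
  int16_t Bottom = (int16_t)(Wide & 0xffff); // trunc        -> Loads[0]
  int16_t Top = (int16_t)(Wide >> 16);       // lshr + trunc -> Loads[1]
  BaseSExt = Bottom;                         // replaces BaseSExt's users
  OffsetSExt = Top;                          // replaces OffsetSExt's users
}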
-// Compare the value lists in Other to this chain.
-bool BinOpChain::AreSymmetrical(BinOpChain *Other) {
- // Element-by-element comparison of Value lists returning true if they are
- // instructions with the same opcode or constants with the same value.
- auto CompareValueList = [](const ValueList &VL0,
- const ValueList &VL1) {
- if (VL0.size() != VL1.size()) {
- LLVM_DEBUG(dbgs() << "Muls are mismatching operand list lengths: "
- << VL0.size() << " != " << VL1.size() << "\n");
- return false;
- }
-
- const unsigned Pairs = VL0.size();
-
- for (unsigned i = 0; i < Pairs; ++i) {
- const Value *V0 = VL0[i];
- const Value *V1 = VL1[i];
- const auto *Inst0 = dyn_cast<Instruction>(V0);
- const auto *Inst1 = dyn_cast<Instruction>(V1);
-
- if (!Inst0 || !Inst1)
- return false;
-
- if (Inst0->isSameOperationAs(Inst1))
- continue;
-
- const APInt *C0, *C1;
- if (!(match(V0, m_APInt(C0)) && match(V1, m_APInt(C1)) && C0 == C1))
- return false;
- }
-
- return true;
- };
-
- return CompareValueList(LHS, Other->LHS) &&
- CompareValueList(RHS, Other->RHS);
-}
-
Pass *llvm::createARMParallelDSPPass() {
return new ARMParallelDSP();
}
@@ -842,6 +813,6 @@ Pass *llvm::createARMParallelDSPPass() {
char ARMParallelDSP::ID = 0;
INITIALIZE_PASS_BEGIN(ARMParallelDSP, "arm-parallel-dsp",
- "Transform loops to use DSP intrinsics", false, false)
+ "Transform functions to use DSP intrinsics", false, false)
INITIALIZE_PASS_END(ARMParallelDSP, "arm-parallel-dsp",
- "Transform loops to use DSP intrinsics", false, false)
+ "Transform functions to use DSP intrinsics", false, false)
diff --git a/lib/Target/ARM/ARMPredicates.td b/lib/Target/ARM/ARMPredicates.td
index 0b6b40de80dd..b008d3e2e296 100644
--- a/lib/Target/ARM/ARMPredicates.td
+++ b/lib/Target/ARM/ARMPredicates.td
@@ -71,7 +71,7 @@ def HasV8_5a : Predicate<"Subtarget->hasV8_5aOps()">,
AssemblerPredicate<"HasV8_5aOps", "armv8.5a">;
def NoVFP : Predicate<"!Subtarget->hasVFP2Base()">;
def HasVFP2 : Predicate<"Subtarget->hasVFP2Base()">,
- AssemblerPredicate<"FeatureVFP2_D16_SP", "VFP2">;
+ AssemblerPredicate<"FeatureVFP2_SP", "VFP2">;
def HasVFP3 : Predicate<"Subtarget->hasVFP3Base()">,
AssemblerPredicate<"FeatureVFP3_D16_SP", "VFP3">;
def HasVFP4 : Predicate<"Subtarget->hasVFP4Base()">,
diff --git a/lib/Target/ARM/ARMRegisterInfo.td b/lib/Target/ARM/ARMRegisterInfo.td
index 92ae26b3729d..56055a15483a 100644
--- a/lib/Target/ARM/ARMRegisterInfo.td
+++ b/lib/Target/ARM/ARMRegisterInfo.td
@@ -180,7 +180,7 @@ def Q15 : ARMReg<15, "q15", [D30, D31]>;
// models the APSR when it's accessed by some special instructions. In such cases
// it has the same encoding as PC.
def CPSR : ARMReg<0, "cpsr">;
-def APSR : ARMReg<1, "apsr">;
+def APSR : ARMReg<15, "apsr">;
def APSR_NZCV : ARMReg<15, "apsr_nzcv">;
def SPSR : ARMReg<2, "spsr">;
def FPSCR : ARMReg<3, "fpscr">;
@@ -486,12 +486,20 @@ def DPair : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
// Pseudo-registers representing even-odd pairs of GPRs from R1 to R13/SP.
// These are needed by instructions (e.g. ldrexd/strexd) requiring even-odd GPRs.
-def Tuples2R : RegisterTuples<[gsub_0, gsub_1],
- [(add R0, R2, R4, R6, R8, R10, R12),
- (add R1, R3, R5, R7, R9, R11, SP)]>;
+def Tuples2Rnosp : RegisterTuples<[gsub_0, gsub_1],
+ [(add R0, R2, R4, R6, R8, R10),
+ (add R1, R3, R5, R7, R9, R11)]>;
+
+def Tuples2Rsp : RegisterTuples<[gsub_0, gsub_1],
+ [(add R12), (add SP)]>;
// Register class representing a pair of even-odd GPRs.
-def GPRPair : RegisterClass<"ARM", [untyped], 64, (add Tuples2R)> {
+def GPRPair : RegisterClass<"ARM", [untyped], 64, (add Tuples2Rnosp, Tuples2Rsp)> {
+ let Size = 64; // 2 x 32 bits, we have no predefined type of that size.
+}
+
+// Register class representing a pair of even-odd GPRs, except (R12, SP).
+def GPRPairnosp : RegisterClass<"ARM", [untyped], 64, (add Tuples2Rnosp)> {
let Size = 64; // 2 x 32 bits, we have no predefined type of that size.
}
diff --git a/lib/Target/ARM/ARMScheduleA9.td b/lib/Target/ARM/ARMScheduleA9.td
index 21d32bde4710..3f0b71afd977 100644
--- a/lib/Target/ARM/ARMScheduleA9.td
+++ b/lib/Target/ARM/ARMScheduleA9.td
@@ -2239,9 +2239,9 @@ def A9WriteLMfpPostRA : SchedWriteVariant<[
// Distinguish between our multiple MI-level forms of the same
// VLDM/VSTM instructions.
def A9PreRA : SchedPredicate<
- "TargetRegisterInfo::isVirtualRegister(MI->getOperand(0).getReg())">;
+ "Register::isVirtualRegister(MI->getOperand(0).getReg())">;
def A9PostRA : SchedPredicate<
- "TargetRegisterInfo::isPhysicalRegister(MI->getOperand(0).getReg())">;
+ "Register::isPhysicalRegister(MI->getOperand(0).getReg())">;
// VLDM represents all destination registers as a single register
// tuple, unlike LDM. So the number of write operands is not variadic.
diff --git a/lib/Target/ARM/ARMScheduleM4.td b/lib/Target/ARM/ARMScheduleM4.td
index 38c8ea2b4f35..bfa5fc0d7131 100644
--- a/lib/Target/ARM/ARMScheduleM4.td
+++ b/lib/Target/ARM/ARMScheduleM4.td
@@ -18,6 +18,9 @@ def CortexM4Model : SchedMachineModel {
let PostRAScheduler = 1;
let CompleteModel = 0;
+ let UnsupportedFeatures = [IsARM, HasNEON, HasDotProd, HasZCZ, HasMVEInt,
+ IsNotMClass, HasDPVFP, HasFPARMv8, HasFullFP16, Has8MSecExt, HasV8,
+ HasV8_3a, HasTrustZone, HasDFB, IsWindows];
}
@@ -50,6 +53,7 @@ def : M4UnitL2<WriteMAC16>;
def : M4UnitL2<WriteDIV>;
def : M4UnitL2I<(instregex "(t|t2)LDM")>;
+def : M4UnitL2I<(instregex "(t|t2)LDR")>;
// For stores we use a latency of 1, as they have no outputs
@@ -78,9 +82,20 @@ def : M4UnitL1<WriteNoop>;
def : M4UnitL1<WritePreLd>;
def : M4UnitL1I<(instregex "(t|t2)MOV")>;
def : M4UnitL1I<(instrs COPY)>;
-def : M4UnitL1I<(instregex "t2IT")>;
-def : M4UnitL1I<(instregex "t2SEL", "t2USAD8",
- "t2(S|Q|SH|U|UQ|UH)(ADD16|ASX|SAX|SUB16|ADD8|SUB8)", "t2USADA8", "(t|t2)REV")>;
+def : M4UnitL1I<(instregex "t2IT", "t2MSR", "t2MRS")>;
+def : M4UnitL1I<(instregex "t2CLREX")>;
+def : M4UnitL1I<(instregex "t2SEL", "t2USAD8", "t2SML[AS]",
+ "t2(S|Q|SH|U|UQ|UH|QD)(ADD|ASX|SAX|SUB)", "t2USADA8", "(t|t2)REV")>;
+
+// These instructions are not of much interest to scheduling: they will not
+// be generated, or it is not very useful to schedule them. They are here to
+// make the model more complete.
+def : M4UnitL1I<(instregex "t2CDP", "t2LDC", "t2MCR", "t2MRC", "t2MRRC", "t2STC")>;
+def : M4UnitL1I<(instregex "tCPS", "t2ISB", "t2DSB", "t2DMB", "t2?HINT$")>;
+def : M4UnitL1I<(instregex "t2?UDF$", "tBKPT", "t2DBG")>;
+def : M4UnitL1I<(instregex "t?2?Int_eh_sjlj_", "tADDframe", "t?ADJCALL")>;
+def : M4UnitL1I<(instregex "CMP_SWAP", "JUMPTABLE", "MEMCPY")>;
+def : M4UnitL1I<(instregex "VSETLNi32", "VGETLNi32")>;
def : ReadAdvance<ReadALU, 0>;
def : ReadAdvance<ReadALUsr, 0>;
@@ -112,6 +127,9 @@ def : M4UnitL1<WriteVST1>;
def : M4UnitL1<WriteVST2>;
def : M4UnitL1<WriteVST3>;
def : M4UnitL1<WriteVST4>;
+def : M4UnitL1I<(instregex "VMOVS", "FCONSTS", "VCMP", "VNEG", "VABS")>;
+def : M4UnitL2I<(instregex "VMOVD")>;
+def : M4UnitL1I<(instregex "VMRS", "VMSR", "FMSTAT")>;
def : ReadAdvance<ReadFPMUL, 0>;
def : ReadAdvance<ReadFPMAC, 0>;
diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp
index 978faed776b0..09603057b2c8 100644
--- a/lib/Target/ARM/ARMSubtarget.cpp
+++ b/lib/Target/ARM/ARMSubtarget.cpp
@@ -125,7 +125,7 @@ const CallLowering *ARMSubtarget::getCallLowering() const {
return CallLoweringInfo.get();
}
-const InstructionSelector *ARMSubtarget::getInstructionSelector() const {
+InstructionSelector *ARMSubtarget::getInstructionSelector() const {
return InstSelector.get();
}
@@ -205,9 +205,9 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
NoARM = true;
if (isAAPCS_ABI())
- stackAlignment = 8;
+ stackAlignment = Align(8);
if (isTargetNaCl() || isAAPCS16_ABI())
- stackAlignment = 16;
+ stackAlignment = Align(16);
// FIXME: Completely disable sibcall for Thumb1 since ThumbRegisterInfo::
// emitEpilogue is not ready for them. Thumb tail calls also use t2B, as
@@ -253,6 +253,10 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
if (isRWPI())
ReserveR9 = true;
+ // If MVEVectorCostFactor is still 0 (has not been set to anything else),
+ // default it to 2.
+ if (MVEVectorCostFactor == 0)
+ MVEVectorCostFactor = 2;
+
// FIXME: Teach TableGen to deal with these instead of doing it manually here.
switch (ARMProcFamily) {
case Others:
@@ -296,13 +300,15 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
LdStMultipleTiming = SingleIssuePlusExtras;
MaxInterleaveFactor = 4;
if (!isThumb())
- PrefLoopAlignment = 3;
+ PrefLoopLogAlignment = 3;
break;
case Kryo:
break;
case Krait:
PreISelOperandLatencyAdjustment = 1;
break;
+ case NeoverseN1:
+ break;
case Swift:
MaxInterleaveFactor = 2;
LdStMultipleTiming = SingleIssuePlusExtras;
diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h
index c2b0f052b843..ef460342a69e 100644
--- a/lib/Target/ARM/ARMSubtarget.h
+++ b/lib/Target/ARM/ARMSubtarget.h
@@ -71,6 +71,7 @@ protected:
Exynos,
Krait,
Kryo,
+ NeoverseN1,
Swift
};
enum ARMProcClassEnum {
@@ -179,11 +180,9 @@ protected:
bool HasVFPv3SP = false;
bool HasVFPv4SP = false;
bool HasFPARMv8SP = false;
- bool HasVFPv2D16 = false;
bool HasVFPv3D16 = false;
bool HasVFPv4D16 = false;
bool HasFPARMv8D16 = false;
- bool HasVFPv2D16SP = false;
bool HasVFPv3D16SP = false;
bool HasVFPv4D16SP = false;
bool HasFPARMv8D16SP = false;
@@ -450,7 +449,7 @@ protected:
/// stackAlignment - The minimum alignment known to hold of the stack frame on
/// entry to the function and which must be maintained by every function.
- unsigned stackAlignment = 4;
+ Align stackAlignment = Align(4);
/// CPUString - String name of used CPU.
std::string CPUString;
@@ -469,7 +468,12 @@ protected:
int PreISelOperandLatencyAdjustment = 2;
/// What alignment is preferred for loop bodies, in log2(bytes).
- unsigned PrefLoopAlignment = 0;
+ unsigned PrefLoopLogAlignment = 0;
+
+ /// The cost factor for MVE instructions, representing the number of beats
+ /// an instruction can take. The default is 2 (set in initSubtargetFeatures
+ /// so that subtarget features can set it to less than 2).
+ unsigned MVEVectorCostFactor = 0;
/// OptMinSize - True if we're optimising for minimum code size, equal to
/// the function attribute.
@@ -535,7 +539,7 @@ public:
}
const CallLowering *getCallLowering() const override;
- const InstructionSelector *getInstructionSelector() const override;
+ InstructionSelector *getInstructionSelector() const override;
const LegalizerInfo *getLegalizerInfo() const override;
const RegisterBankInfo *getRegBankInfo() const override;
@@ -600,7 +604,7 @@ public:
bool hasARMOps() const { return !NoARM; }
- bool hasVFP2Base() const { return HasVFPv2D16SP; }
+ bool hasVFP2Base() const { return HasVFPv2SP; }
bool hasVFP3Base() const { return HasVFPv3D16SP; }
bool hasVFP4Base() const { return HasVFPv4D16SP; }
bool hasFPARMv8Base() const { return HasFPARMv8D16SP; }
@@ -668,6 +672,12 @@ public:
bool hasSB() const { return HasSB; }
bool genLongCalls() const { return GenLongCalls; }
bool genExecuteOnly() const { return GenExecuteOnly; }
+ bool hasBaseDSP() const {
+ if (isThumb())
+ return hasDSP();
+ else
+ return hasV5TEOps();
+ }
bool hasFP16() const { return HasFP16; }
bool hasD32() const { return HasD32; }
@@ -812,7 +822,7 @@ public:
/// getStackAlignment - Returns the minimum alignment known to hold of the
/// stack frame on entry to the function and which must be maintained by every
/// function for this subtarget.
- unsigned getStackAlignment() const { return stackAlignment; }
+ Align getStackAlignment() const { return stackAlignment; }
unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; }
@@ -853,9 +863,9 @@ public:
return isROPI() || !isTargetELF();
}
- unsigned getPrefLoopAlignment() const {
- return PrefLoopAlignment;
- }
+ unsigned getPrefLoopLogAlignment() const { return PrefLoopLogAlignment; }
+
+ unsigned getMVEVectorCostFactor() const { return MVEVectorCostFactor; }
bool ignoreCSRForAllocationOrder(const MachineFunction &MF,
unsigned PhysReg) const override;
diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp
index 7f0aae1739b3..5c8007f101d9 100644
--- a/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/lib/Target/ARM/ARMTargetMachine.cpp
@@ -96,15 +96,16 @@ extern "C" void LLVMInitializeARMTarget() {
initializeARMExpandPseudoPass(Registry);
initializeThumb2SizeReducePass(Registry);
initializeMVEVPTBlockPass(Registry);
+ initializeMVETailPredicationPass(Registry);
initializeARMLowOverheadLoopsPass(Registry);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
if (TT.isOSBinFormatMachO())
- return llvm::make_unique<TargetLoweringObjectFileMachO>();
+ return std::make_unique<TargetLoweringObjectFileMachO>();
if (TT.isOSWindows())
- return llvm::make_unique<TargetLoweringObjectFileCOFF>();
- return llvm::make_unique<ARMElfTargetObjectFile>();
+ return std::make_unique<TargetLoweringObjectFileCOFF>();
+ return std::make_unique<ARMElfTargetObjectFile>();
}
static ARMBaseTargetMachine::ARMABI
@@ -282,7 +283,7 @@ ARMBaseTargetMachine::getSubtargetImpl(const Function &F) const {
// creation will depend on the TM and the code generation flags on the
// function that reside in TargetOptions.
resetTargetOptions(F);
- I = llvm::make_unique<ARMSubtarget>(TargetTriple, CPU, FS, *this, isLittle,
+ I = std::make_unique<ARMSubtarget>(TargetTriple, CPU, FS, *this, isLittle,
F.hasMinSize());
if (!I->isThumb() && !I->hasARMOps())
@@ -447,8 +448,10 @@ bool ARMPassConfig::addPreISel() {
MergeExternalByDefault));
}
- if (TM->getOptLevel() != CodeGenOpt::None)
+ if (TM->getOptLevel() != CodeGenOpt::None) {
addPass(createHardwareLoopsPass());
+ addPass(createMVETailPredicationPass());
+ }
return false;
}
diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 2a8ec734a05f..86c8684d14dc 100644
--- a/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -36,8 +36,12 @@ using namespace llvm;
#define DEBUG_TYPE "armtti"
+static cl::opt<bool> EnableMaskedLoadStores(
+ "enable-arm-maskedldst", cl::Hidden, cl::init(false),
+ cl::desc("Enable the generation of masked loads and stores"));
+
static cl::opt<bool> DisableLowOverheadLoops(
- "disable-arm-loloops", cl::Hidden, cl::init(true),
+ "disable-arm-loloops", cl::Hidden, cl::init(false),
cl::desc("Disable the generation of low-overhead loops"));
bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
@@ -167,6 +171,42 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
if (!SrcTy.isSimple() || !DstTy.isSimple())
return BaseT::getCastInstrCost(Opcode, Dst, Src);
+ // The extend of a load is free
+ if (I && isa<LoadInst>(I->getOperand(0))) {
+ static const TypeConversionCostTblEntry LoadConversionTbl[] = {
+ {ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0},
+ {ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0},
+ {ISD::SIGN_EXTEND, MVT::i32, MVT::i8, 0},
+ {ISD::ZERO_EXTEND, MVT::i32, MVT::i8, 0},
+ {ISD::SIGN_EXTEND, MVT::i16, MVT::i8, 0},
+ {ISD::ZERO_EXTEND, MVT::i16, MVT::i8, 0},
+ {ISD::SIGN_EXTEND, MVT::i64, MVT::i32, 1},
+ {ISD::ZERO_EXTEND, MVT::i64, MVT::i32, 1},
+ {ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 1},
+ {ISD::ZERO_EXTEND, MVT::i64, MVT::i16, 1},
+ {ISD::SIGN_EXTEND, MVT::i64, MVT::i8, 1},
+ {ISD::ZERO_EXTEND, MVT::i64, MVT::i8, 1},
+ };
+ if (const auto *Entry = ConvertCostTableLookup(
+ LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
+ return Entry->Cost;
+
+ static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
+ {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0},
+ {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0},
+ {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 0},
+ {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 0},
+ {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 0},
+ {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 0},
+ };
+ if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
+ if (const auto *Entry =
+ ConvertCostTableLookup(MVELoadConversionTbl, ISD,
+ DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
+ return Entry->Cost;
+ }
+ }
+
// Some arithmetic, load and store operations have specific instructions
// to cast up/down their types automatically at no extra cost.
// TODO: Get these tables to know at least what the related operations are.
@@ -313,6 +353,31 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
return Entry->Cost;
}
+ // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one
+ // instruction, i8->i32 is two. i64 zexts are a VAND with a constant; sexts
+ // are linearised, so they cost more.
+ static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = {
+ { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
+ { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
+ { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
+ { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 10 },
+ { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 2 },
+ { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
+ { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
+ { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 10 },
+ { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
+ { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 8 },
+ { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 2 },
+ };
+
+ if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
+ if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl,
+ ISD, DstTy.getSimpleVT(),
+ SrcTy.getSimpleVT()))
+ return Entry->Cost * ST->getMVEVectorCostFactor();
+ }
+
// Scalar integer conversion costs.
static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
// i16 -> i64 requires two dependent operations.
@@ -332,7 +397,10 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
return Entry->Cost;
}
- return BaseT::getCastInstrCost(Opcode, Dst, Src);
+ int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
+ ? ST->getMVEVectorCostFactor()
+ : 1;
+ return BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src);
}
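The factor applied here is easiest to see numerically. A sketch of the scaling, assuming the default factor of 2 described in ARMSubtarget.h (function name is made up):

// An MVE operation executes over multiple beats, so its base cost is scaled
// by MVEVectorCostFactor; scalar types are left untouched.
int scaledCost(bool IsMVEVectorOp, int BaseCost, int MVEFactor = 2) {
  return (IsMVEVectorOp ? MVEFactor : 1) * BaseCost;
}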
int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
@@ -343,8 +411,8 @@ int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
return 3;
- if ((Opcode == Instruction::InsertElement ||
- Opcode == Instruction::ExtractElement)) {
+ if (ST->hasNEON() && (Opcode == Instruction::InsertElement ||
+ Opcode == Instruction::ExtractElement)) {
// Cross-class copies are expensive on many microarchitectures,
// so assume they are expensive by default.
if (ValTy->getVectorElementType()->isIntegerTy())
@@ -357,6 +425,17 @@ int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
return std::max(BaseT::getVectorInstrCost(Opcode, ValTy, Index), 2U);
}
+ if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
+ Opcode == Instruction::ExtractElement)) {
+    // We say MVE moves cost at least the MVEVectorCostFactor, even though
+    // they are scalar instructions. This discourages mixing scalar and
+    // vector code, so we do not vectorise where we would end up just
+    // scalarising the result anyway.
+ return std::max(BaseT::getVectorInstrCost(Opcode, ValTy, Index),
+ ST->getMVEVectorCostFactor()) *
+ ValTy->getVectorNumElements() / 2;
+ }
+
return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
}
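
A minimal sketch of the MVE insert/extract formula above, with the cost factor left as a parameter rather than assuming its value:

#include <algorithm>

unsigned mveInsExtCost(unsigned BaseCost, unsigned CostFactor,
                       unsigned NumElements) {
  // Clamp the per-element move cost up to the vector cost factor, then scale
  // by half the number of lanes, mirroring the expression in the patch.
  return std::max(BaseCost, CostFactor) * NumElements / 2;
}
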
@@ -385,7 +464,10 @@ int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
return LT.first;
}
- return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
+ int BaseCost = ST->hasMVEIntegerOps() && ValTy->isVectorTy()
+ ? ST->getMVEVectorCostFactor()
+ : 1;
+ return BaseCost * BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
}
int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
@@ -397,13 +479,37 @@ int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
unsigned NumVectorInstToHideOverhead = 10;
int MaxMergeDistance = 64;
- if (Ty->isVectorTy() && SE &&
- !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
- return NumVectorInstToHideOverhead;
+ if (ST->hasNEON()) {
+ if (Ty->isVectorTy() && SE &&
+ !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
+ return NumVectorInstToHideOverhead;
- // In many cases the address computation is not merged into the instruction
- // addressing mode.
- return 1;
+ // In many cases the address computation is not merged into the instruction
+ // addressing mode.
+ return 1;
+ }
+ return BaseT::getAddressComputationCost(Ty, SE, Ptr);
+}
+
+bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, MaybeAlign Alignment) {
+ if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
+ return false;
+
+ if (auto *VecTy = dyn_cast<VectorType>(DataTy)) {
+ // Don't support v2i1 yet.
+ if (VecTy->getNumElements() == 2)
+ return false;
+
+ // We don't support extending fp types.
+ unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
+ if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy())
+ return false;
+ }
+
+ unsigned EltWidth = DataTy->getScalarSizeInBits();
+ return (EltWidth == 32 && (!Alignment || Alignment >= 4)) ||
+ (EltWidth == 16 && (!Alignment || Alignment >= 2)) ||
+ (EltWidth == 8);
}
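
Restating the masked load/store legality rule above as a small standalone predicate (an alignment of 0 stands in for "unspecified"):

bool mveLegalMaskedEltWidth(unsigned EltBits, unsigned AlignBytes) {
  // 32-bit elements need 4-byte alignment, 16-bit elements need 2-byte
  // alignment, and 8-bit elements are always acceptable.
  return (EltBits == 32 && (AlignBytes == 0 || AlignBytes >= 4)) ||
         (EltBits == 16 && (AlignBytes == 0 || AlignBytes >= 2)) ||
         EltBits == 8;
}
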
int ARMTTIImpl::getMemcpyCost(const Instruction *I) {
@@ -442,78 +548,96 @@ int ARMTTIImpl::getMemcpyCost(const Instruction *I) {
int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
Type *SubTp) {
- if (Kind == TTI::SK_Broadcast) {
- static const CostTblEntry NEONDupTbl[] = {
- // VDUP handles these cases.
- {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
- {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
- {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
- {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
- {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
- {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
-
- {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
- {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
- {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
- {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}};
-
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
-
- if (const auto *Entry = CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE,
- LT.second))
- return LT.first * Entry->Cost;
-
- return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
- }
- if (Kind == TTI::SK_Reverse) {
- static const CostTblEntry NEONShuffleTbl[] = {
- // Reverse shuffle cost one instruction if we are shuffling within a
- // double word (vrev) or two if we shuffle a quad word (vrev, vext).
- {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
- {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
- {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
- {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
- {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
- {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
-
- {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
- {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
- {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
- {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};
-
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
-
- if (const auto *Entry = CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE,
- LT.second))
- return LT.first * Entry->Cost;
-
- return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
- }
- if (Kind == TTI::SK_Select) {
- static const CostTblEntry NEONSelShuffleTbl[] = {
- // Select shuffle cost table for ARM. Cost is the number of instructions
- // required to create the shuffled vector.
+ if (ST->hasNEON()) {
+ if (Kind == TTI::SK_Broadcast) {
+ static const CostTblEntry NEONDupTbl[] = {
+ // VDUP handles these cases.
+ {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
+
+ {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}};
+
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
+
+ if (const auto *Entry =
+ CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second))
+ return LT.first * Entry->Cost;
+ }
+ if (Kind == TTI::SK_Reverse) {
+ static const CostTblEntry NEONShuffleTbl[] = {
+        // Reverse shuffles cost one instruction if we are shuffling within a
+ // double word (vrev) or two if we shuffle a quad word (vrev, vext).
+ {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
+
+ {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
+ {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
+ {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
+ {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};
+
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
+
+ if (const auto *Entry =
+ CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
+ return LT.first * Entry->Cost;
+ }
+ if (Kind == TTI::SK_Select) {
+ static const CostTblEntry NEONSelShuffleTbl[] = {
+        // Select shuffle cost table for ARM. Cost is the number of
+        // instructions required to create the shuffled vector.
- {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
- {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
- {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
- {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
- {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
- {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
- {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2},
+ {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
+ {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
+ {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2},
- {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16},
+ {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16},
- {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};
+ {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
- if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
- ISD::VECTOR_SHUFFLE, LT.second))
- return LT.first * Entry->Cost;
- return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
+ if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
+ ISD::VECTOR_SHUFFLE, LT.second))
+ return LT.first * Entry->Cost;
+ }
+ }
+ if (ST->hasMVEIntegerOps()) {
+ if (Kind == TTI::SK_Broadcast) {
+ static const CostTblEntry MVEDupTbl[] = {
+ // VDUP handles these cases.
+ {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v8f16, 1}};
+
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
+
+ if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE,
+ LT.second))
+ return LT.first * Entry->Cost * ST->getMVEVectorCostFactor();
+ }
}
- return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
+ int BaseCost = ST->hasMVEIntegerOps() && Tp->isVectorTy()
+ ? ST->getMVEVectorCostFactor()
+ : 1;
+ return BaseCost * BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
}
int ARMTTIImpl::getArithmeticInstrCost(
@@ -567,38 +691,64 @@ int ARMTTIImpl::getArithmeticInstrCost(
// Multiplication.
};
- if (ST->hasNEON())
+ if (ST->hasNEON()) {
if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
return LT.first * Entry->Cost;
- int Cost = BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
- Opd1PropInfo, Opd2PropInfo);
-
- // This is somewhat of a hack. The problem that we are facing is that SROA
- // creates a sequence of shift, and, or instructions to construct values.
- // These sequences are recognized by the ISel and have zero-cost. Not so for
- // the vectorized code. Because we have support for v2i64 but not i64 those
- // sequences look particularly beneficial to vectorize.
- // To work around this we increase the cost of v2i64 operations to make them
- // seem less beneficial.
- if (LT.second == MVT::v2i64 &&
- Op2Info == TargetTransformInfo::OK_UniformConstantValue)
- Cost += 4;
-
- return Cost;
+ int Cost = BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
+ Opd1PropInfo, Opd2PropInfo);
+
+ // This is somewhat of a hack. The problem that we are facing is that SROA
+ // creates a sequence of shift, and, or instructions to construct values.
+ // These sequences are recognized by the ISel and have zero-cost. Not so for
+ // the vectorized code. Because we have support for v2i64 but not i64 those
+ // sequences look particularly beneficial to vectorize.
+ // To work around this we increase the cost of v2i64 operations to make them
+ // seem less beneficial.
+ if (LT.second == MVT::v2i64 &&
+ Op2Info == TargetTransformInfo::OK_UniformConstantValue)
+ Cost += 4;
+
+ return Cost;
+ }
+
+ int BaseCost = ST->hasMVEIntegerOps() && Ty->isVectorTy()
+ ? ST->getMVEVectorCostFactor()
+ : 1;
+
+  // The rest of this mostly follows what is done in
+  // BaseT::getArithmeticInstrCost, without treating floats as more expensive
+  // than scalars or increasing the costs for custom operations. The result is
+  // also multiplied by the MVEVectorCostFactor where appropriate.
+ if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second))
+ return LT.first * BaseCost;
+
+ // Else this is expand, assume that we need to scalarize this op.
+ if (Ty->isVectorTy()) {
+ unsigned Num = Ty->getVectorNumElements();
+ unsigned Cost = getArithmeticInstrCost(Opcode, Ty->getScalarType());
+    // Return the cost of multiple scalar invocations plus the cost of
+ // inserting and extracting the values.
+ return BaseT::getScalarizationOverhead(Ty, Args) + Num * Cost;
+ }
+
+ return BaseCost;
}
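
The expand path above scalarises the operation; as a sketch, the cost reduces to one scalar op per lane plus the insert/extract overhead:

unsigned scalarizedArithCost(unsigned NumElts, unsigned ScalarOpCost,
                             unsigned ScalarizationOverhead) {
  // One scalar operation per vector lane, plus the cost of moving values
  // into and out of the vector (the scalarization overhead).
  return ScalarizationOverhead + NumElts * ScalarOpCost;
}
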
int ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
unsigned AddressSpace, const Instruction *I) {
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
- if (Src->isVectorTy() && Alignment != 16 &&
+ if (ST->hasNEON() && Src->isVectorTy() && Alignment != 16 &&
Src->getVectorElementType()->isDoubleTy()) {
// Unaligned loads/stores are extremely inefficient.
// We need 4 uops for vst.1/vld.1 vs 1uop for vldr/vstr.
return LT.first * 4;
}
- return LT.first;
+ int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
+ ? ST->getMVEVectorCostFactor()
+ : 1;
+ return BaseCost * LT.first;
}
int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
@@ -893,6 +1043,11 @@ void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
}
return;
}
+      // Don't unroll vectorised loops. MVE does not benefit from unrolling
+      // as much as scalar code does.
+ if (I.getType()->isVectorTy())
+ return;
+
SmallVector<const Value*, 4> Operands(I.value_op_begin(),
I.value_op_end());
Cost += getUserCost(&I, Operands);
@@ -914,3 +1069,28 @@ void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
if (Cost < 12)
UP.Force = true;
}
+
+bool ARMTTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty,
+ TTI::ReductionFlags Flags) const {
+ assert(isa<VectorType>(Ty) && "Expected Ty to be a vector type");
+ unsigned ScalarBits = Ty->getScalarSizeInBits();
+ if (!ST->hasMVEIntegerOps())
+ return false;
+
+ switch (Opcode) {
+ case Instruction::FAdd:
+ case Instruction::FMul:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::Mul:
+ case Instruction::FCmp:
+ return false;
+ case Instruction::ICmp:
+ case Instruction::Add:
+ return ScalarBits < 64 && ScalarBits * Ty->getVectorNumElements() == 128;
+ default:
+ llvm_unreachable("Unhandled reduction opcode");
+ }
+ return false;
+}
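
The rule above keeps a reduction as an intrinsic only for integer add/icmp on full 128-bit MVE vectors with lanes narrower than 64 bits; as a standalone check:

bool mveKeepsReduction(unsigned ScalarBits, unsigned NumElts) {
  // v4i32 and v8i16 qualify (4 * 32 == 8 * 16 == 128); v2i64 does not.
  return ScalarBits < 64 && ScalarBits * NumElts == 128;
}
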
diff --git a/lib/Target/ARM/ARMTargetTransformInfo.h b/lib/Target/ARM/ARMTargetTransformInfo.h
index 52f6ea4a6e2f..a878fdcfe3c7 100644
--- a/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -101,9 +101,9 @@ public:
/// Floating-point computation using ARMv8 AArch32 Advanced
/// SIMD instructions remains unchanged from ARMv7. Only AArch64 SIMD
- /// is IEEE-754 compliant, but it's not covered in this target.
+ /// and Arm MVE are IEEE-754 compliant.
bool isFPVectorizationPotentiallyUnsafe() {
- return !ST->isTargetDarwin();
+ return !ST->isTargetDarwin() && !ST->hasMVEFloatOps();
}
/// \name Scalar TTI Implementations
@@ -122,10 +122,13 @@ public:
/// \name Vector TTI Implementations
/// @{
- unsigned getNumberOfRegisters(bool Vector) {
+ unsigned getNumberOfRegisters(unsigned ClassID) const {
+ bool Vector = (ClassID == 1);
if (Vector) {
if (ST->hasNEON())
return 16;
+ if (ST->hasMVEIntegerOps())
+ return 8;
return 0;
}
@@ -138,6 +141,8 @@ public:
if (Vector) {
if (ST->hasNEON())
return 128;
+ if (ST->hasMVEIntegerOps())
+ return 128;
return 0;
}
@@ -148,10 +153,23 @@ public:
return ST->getMaxInterleaveFactor();
}
+ bool isLegalMaskedLoad(Type *DataTy, MaybeAlign Alignment);
+
+ bool isLegalMaskedStore(Type *DataTy, MaybeAlign Alignment) {
+ return isLegalMaskedLoad(DataTy, Alignment);
+ }
+
int getMemcpyCost(const Instruction *I);
int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
+ bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
+ TTI::ReductionFlags Flags) const;
+
+ bool shouldExpandReduction(const IntrinsicInst *II) const {
+ return false;
+ }
+
int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
const Instruction *I = nullptr);
diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index 1da9452f1d22..d2c355c1da75 100644
--- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -2275,6 +2275,14 @@ public:
return Value >= 1 && Value <= 32;
}
+ bool isMveSaturateOp() const {
+ if (!isImm()) return false;
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ if (!CE) return false;
+ uint64_t Value = CE->getValue();
+ return Value == 48 || Value == 64;
+ }
+
bool isITCondCodeNoAL() const {
if (!isITCondCode()) return false;
ARMCC::CondCodes CC = getCondCode();
@@ -2479,28 +2487,28 @@ public:
void addModImmNotOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
uint32_t Enc = ARM_AM::getSOImmVal(~CE->getValue());
Inst.addOperand(MCOperand::createImm(Enc));
}
void addModImmNegOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
uint32_t Enc = ARM_AM::getSOImmVal(-CE->getValue());
Inst.addOperand(MCOperand::createImm(Enc));
}
void addThumbModImmNeg8_255Operands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
uint32_t Val = -CE->getValue();
Inst.addOperand(MCOperand::createImm(Val));
}
void addThumbModImmNeg1_7Operands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
uint32_t Val = -CE->getValue();
Inst.addOperand(MCOperand::createImm(Val));
}
@@ -2523,19 +2531,19 @@ public:
void addFBits16Operands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
Inst.addOperand(MCOperand::createImm(16 - CE->getValue()));
}
void addFBits32Operands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
Inst.addOperand(MCOperand::createImm(32 - CE->getValue()));
}
void addFPImmOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
int Val = ARM_AM::getFP32Imm(APInt(32, CE->getValue()));
Inst.addOperand(MCOperand::createImm(Val));
}
@@ -2544,7 +2552,7 @@ public:
assert(N == 1 && "Invalid number of operands!");
// FIXME: We really want to scale the value here, but the LDRD/STRD
// instruction don't encode operands that way yet.
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
Inst.addOperand(MCOperand::createImm(CE->getValue()));
}
@@ -2552,35 +2560,31 @@ public:
assert(N == 1 && "Invalid number of operands!");
// FIXME: We really want to scale the value here, but the VSTR/VLDR_VSYSR
// instruction don't encode operands that way yet.
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
Inst.addOperand(MCOperand::createImm(CE->getValue()));
}
void addImm7Shift0Operands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- assert(CE != nullptr && "Invalid operand type!");
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
Inst.addOperand(MCOperand::createImm(CE->getValue()));
}
void addImm7Shift1Operands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- assert(CE != nullptr && "Invalid operand type!");
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
Inst.addOperand(MCOperand::createImm(CE->getValue()));
}
void addImm7Shift2Operands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- assert(CE != nullptr && "Invalid operand type!");
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
Inst.addOperand(MCOperand::createImm(CE->getValue()));
}
void addImm7Operands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- assert(CE != nullptr && "Invalid operand type!");
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
Inst.addOperand(MCOperand::createImm(CE->getValue()));
}
@@ -2588,7 +2592,7 @@ public:
assert(N == 1 && "Invalid number of operands!");
// The immediate is scaled by four in the encoding and is stored
// in the MCInst as such. Lop off the low two bits here.
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
Inst.addOperand(MCOperand::createImm(CE->getValue() / 4));
}
@@ -2596,7 +2600,7 @@ public:
assert(N == 1 && "Invalid number of operands!");
// The immediate is scaled by four in the encoding and is stored
// in the MCInst as such. Lop off the low two bits here.
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
Inst.addOperand(MCOperand::createImm(-(CE->getValue() / 4)));
}
@@ -2604,7 +2608,7 @@ public:
assert(N == 1 && "Invalid number of operands!");
// The immediate is scaled by four in the encoding and is stored
// in the MCInst as such. Lop off the low two bits here.
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
Inst.addOperand(MCOperand::createImm(CE->getValue() / 4));
}
@@ -2612,7 +2616,7 @@ public:
assert(N == 1 && "Invalid number of operands!");
// The constant encodes as the immediate-1, and we store in the instruction
// the bits as encoded, so subtract off one here.
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
Inst.addOperand(MCOperand::createImm(CE->getValue() - 1));
}
@@ -2620,7 +2624,7 @@ public:
assert(N == 1 && "Invalid number of operands!");
// The constant encodes as the immediate-1, and we store in the instruction
// the bits as encoded, so subtract off one here.
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
Inst.addOperand(MCOperand::createImm(CE->getValue() - 1));
}
@@ -2628,7 +2632,7 @@ public:
assert(N == 1 && "Invalid number of operands!");
// The constant encodes as the immediate, except for 32, which encodes as
// zero.
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
unsigned Imm = CE->getValue();
Inst.addOperand(MCOperand::createImm((Imm == 32 ? 0 : Imm)));
}
@@ -2637,7 +2641,7 @@ public:
assert(N == 1 && "Invalid number of operands!");
// An ASR value of 32 encodes as 0, so that's how we want to add it to
// the instruction as well.
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
int Val = CE->getValue();
Inst.addOperand(MCOperand::createImm(Val == 32 ? 0 : Val));
}
@@ -2646,7 +2650,7 @@ public:
assert(N == 1 && "Invalid number of operands!");
// The operand is actually a t2_so_imm, but we have its bitwise
// negation in the assembly source, so twiddle it here.
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
Inst.addOperand(MCOperand::createImm(~(uint32_t)CE->getValue()));
}
@@ -2654,7 +2658,7 @@ public:
assert(N == 1 && "Invalid number of operands!");
// The operand is actually a t2_so_imm, but we have its
// negation in the assembly source, so twiddle it here.
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
Inst.addOperand(MCOperand::createImm(-(uint32_t)CE->getValue()));
}
@@ -2662,7 +2666,7 @@ public:
assert(N == 1 && "Invalid number of operands!");
// The operand is actually an imm0_4095, but we have its
// negation in the assembly source, so twiddle it here.
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
Inst.addOperand(MCOperand::createImm(-(uint32_t)CE->getValue()));
}
@@ -2671,9 +2675,7 @@ public:
Inst.addOperand(MCOperand::createImm(CE->getValue() >> 2));
return;
}
-
- const MCSymbolRefExpr *SR = dyn_cast<MCSymbolRefExpr>(Imm.Val);
- assert(SR && "Unknown value type!");
+ const MCSymbolRefExpr *SR = cast<MCSymbolRefExpr>(Imm.Val);
Inst.addOperand(MCOperand::createExpr(SR));
}
@@ -2685,10 +2687,7 @@ public:
Inst.addOperand(MCOperand::createImm(CE->getValue()));
return;
}
-
- const MCSymbolRefExpr *SR = dyn_cast<MCSymbolRefExpr>(Imm.Val);
-
- assert(SR && "Unknown value type!");
+ const MCSymbolRefExpr *SR = cast<MCSymbolRefExpr>(Imm.Val);
Inst.addOperand(MCOperand::createExpr(SR));
return;
}
@@ -2750,7 +2749,7 @@ public:
return;
}
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
int Val = CE->getValue();
Inst.addOperand(MCOperand::createImm(Val));
}
@@ -3130,7 +3129,7 @@ public:
void addPowerTwoOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
Inst.addOperand(MCOperand::createImm(CE->getValue()));
}
@@ -3225,14 +3224,14 @@ public:
assert(N == 1 && "Invalid number of operands!");
// The immediate encodes the type of constant as well as the value.
// Mask in that this is an i8 splat.
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
Inst.addOperand(MCOperand::createImm(CE->getValue() | 0xe00));
}
void addNEONi16splatOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
// The immediate encodes the type of constant as well as the value.
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
unsigned Value = CE->getValue();
Value = ARM_AM::encodeNEONi16splat(Value);
Inst.addOperand(MCOperand::createImm(Value));
@@ -3241,7 +3240,7 @@ public:
void addNEONi16splatNotOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
// The immediate encodes the type of constant as well as the value.
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
unsigned Value = CE->getValue();
Value = ARM_AM::encodeNEONi16splat(~Value & 0xffff);
Inst.addOperand(MCOperand::createImm(Value));
@@ -3250,7 +3249,7 @@ public:
void addNEONi32splatOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
// The immediate encodes the type of constant as well as the value.
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
unsigned Value = CE->getValue();
Value = ARM_AM::encodeNEONi32splat(Value);
Inst.addOperand(MCOperand::createImm(Value));
@@ -3259,7 +3258,7 @@ public:
void addNEONi32splatNotOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
// The immediate encodes the type of constant as well as the value.
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
unsigned Value = CE->getValue();
Value = ARM_AM::encodeNEONi32splat(~Value);
Inst.addOperand(MCOperand::createImm(Value));
@@ -3267,7 +3266,7 @@ public:
void addNEONi8ReplicateOperands(MCInst &Inst, bool Inv) const {
// The immediate encodes the type of constant as well as the value.
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
assert((Inst.getOpcode() == ARM::VMOVv8i8 ||
Inst.getOpcode() == ARM::VMOVv16i8) &&
"All instructions that wants to replicate non-zero byte "
@@ -3298,7 +3297,7 @@ public:
void addNEONi32vmovOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
// The immediate encodes the type of constant as well as the value.
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
unsigned Value = encodeNeonVMOVImmediate(CE->getValue());
Inst.addOperand(MCOperand::createImm(Value));
}
@@ -3310,7 +3309,7 @@ public:
void addNEONvmovi16ReplicateOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
assert((Inst.getOpcode() == ARM::VMOVv4i16 ||
Inst.getOpcode() == ARM::VMOVv8i16 ||
Inst.getOpcode() == ARM::VMVNv4i16 ||
@@ -3327,14 +3326,14 @@ public:
void addNEONi32vmovNegOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
// The immediate encodes the type of constant as well as the value.
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
unsigned Value = encodeNeonVMOVImmediate(~CE->getValue());
Inst.addOperand(MCOperand::createImm(Value));
}
void addNEONvmovi32ReplicateOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
assert((Inst.getOpcode() == ARM::VMOVv2i32 ||
Inst.getOpcode() == ARM::VMOVv4i32 ||
Inst.getOpcode() == ARM::VMVNv2i32 ||
@@ -3349,7 +3348,7 @@ public:
void addNEONi64splatOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
// The immediate encodes the type of constant as well as the value.
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
uint64_t Value = CE->getValue();
unsigned Imm = 0;
for (unsigned i = 0; i < 8; ++i, Value >>= 8) {
@@ -3360,20 +3359,28 @@ public:
void addComplexRotationEvenOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
Inst.addOperand(MCOperand::createImm(CE->getValue() / 90));
}
void addComplexRotationOddOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
Inst.addOperand(MCOperand::createImm((CE->getValue() - 90) / 180));
}
+ void addMveSaturateOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
+ unsigned Imm = CE->getValue();
+ assert((Imm == 48 || Imm == 64) && "Invalid saturate operand");
+ Inst.addOperand(MCOperand::createImm(Imm == 48 ? 1 : 0));
+ }
+
void print(raw_ostream &OS) const override;
static std::unique_ptr<ARMOperand> CreateITMask(unsigned Mask, SMLoc S) {
- auto Op = make_unique<ARMOperand>(k_ITCondMask);
+ auto Op = std::make_unique<ARMOperand>(k_ITCondMask);
Op->ITMask.Mask = Mask;
Op->StartLoc = S;
Op->EndLoc = S;
@@ -3382,7 +3389,7 @@ public:
static std::unique_ptr<ARMOperand> CreateCondCode(ARMCC::CondCodes CC,
SMLoc S) {
- auto Op = make_unique<ARMOperand>(k_CondCode);
+ auto Op = std::make_unique<ARMOperand>(k_CondCode);
Op->CC.Val = CC;
Op->StartLoc = S;
Op->EndLoc = S;
@@ -3391,7 +3398,7 @@ public:
static std::unique_ptr<ARMOperand> CreateVPTPred(ARMVCC::VPTCodes CC,
SMLoc S) {
- auto Op = make_unique<ARMOperand>(k_VPTPred);
+ auto Op = std::make_unique<ARMOperand>(k_VPTPred);
Op->VCC.Val = CC;
Op->StartLoc = S;
Op->EndLoc = S;
@@ -3399,7 +3406,7 @@ public:
}
static std::unique_ptr<ARMOperand> CreateCoprocNum(unsigned CopVal, SMLoc S) {
- auto Op = make_unique<ARMOperand>(k_CoprocNum);
+ auto Op = std::make_unique<ARMOperand>(k_CoprocNum);
Op->Cop.Val = CopVal;
Op->StartLoc = S;
Op->EndLoc = S;
@@ -3407,7 +3414,7 @@ public:
}
static std::unique_ptr<ARMOperand> CreateCoprocReg(unsigned CopVal, SMLoc S) {
- auto Op = make_unique<ARMOperand>(k_CoprocReg);
+ auto Op = std::make_unique<ARMOperand>(k_CoprocReg);
Op->Cop.Val = CopVal;
Op->StartLoc = S;
Op->EndLoc = S;
@@ -3416,7 +3423,7 @@ public:
static std::unique_ptr<ARMOperand> CreateCoprocOption(unsigned Val, SMLoc S,
SMLoc E) {
- auto Op = make_unique<ARMOperand>(k_CoprocOption);
+ auto Op = std::make_unique<ARMOperand>(k_CoprocOption);
Op->Cop.Val = Val;
Op->StartLoc = S;
Op->EndLoc = E;
@@ -3424,7 +3431,7 @@ public:
}
static std::unique_ptr<ARMOperand> CreateCCOut(unsigned RegNum, SMLoc S) {
- auto Op = make_unique<ARMOperand>(k_CCOut);
+ auto Op = std::make_unique<ARMOperand>(k_CCOut);
Op->Reg.RegNum = RegNum;
Op->StartLoc = S;
Op->EndLoc = S;
@@ -3432,7 +3439,7 @@ public:
}
static std::unique_ptr<ARMOperand> CreateToken(StringRef Str, SMLoc S) {
- auto Op = make_unique<ARMOperand>(k_Token);
+ auto Op = std::make_unique<ARMOperand>(k_Token);
Op->Tok.Data = Str.data();
Op->Tok.Length = Str.size();
Op->StartLoc = S;
@@ -3442,7 +3449,7 @@ public:
static std::unique_ptr<ARMOperand> CreateReg(unsigned RegNum, SMLoc S,
SMLoc E) {
- auto Op = make_unique<ARMOperand>(k_Register);
+ auto Op = std::make_unique<ARMOperand>(k_Register);
Op->Reg.RegNum = RegNum;
Op->StartLoc = S;
Op->EndLoc = E;
@@ -3453,7 +3460,7 @@ public:
CreateShiftedRegister(ARM_AM::ShiftOpc ShTy, unsigned SrcReg,
unsigned ShiftReg, unsigned ShiftImm, SMLoc S,
SMLoc E) {
- auto Op = make_unique<ARMOperand>(k_ShiftedRegister);
+ auto Op = std::make_unique<ARMOperand>(k_ShiftedRegister);
Op->RegShiftedReg.ShiftTy = ShTy;
Op->RegShiftedReg.SrcReg = SrcReg;
Op->RegShiftedReg.ShiftReg = ShiftReg;
@@ -3466,7 +3473,7 @@ public:
static std::unique_ptr<ARMOperand>
CreateShiftedImmediate(ARM_AM::ShiftOpc ShTy, unsigned SrcReg,
unsigned ShiftImm, SMLoc S, SMLoc E) {
- auto Op = make_unique<ARMOperand>(k_ShiftedImmediate);
+ auto Op = std::make_unique<ARMOperand>(k_ShiftedImmediate);
Op->RegShiftedImm.ShiftTy = ShTy;
Op->RegShiftedImm.SrcReg = SrcReg;
Op->RegShiftedImm.ShiftImm = ShiftImm;
@@ -3477,7 +3484,7 @@ public:
static std::unique_ptr<ARMOperand> CreateShifterImm(bool isASR, unsigned Imm,
SMLoc S, SMLoc E) {
- auto Op = make_unique<ARMOperand>(k_ShifterImmediate);
+ auto Op = std::make_unique<ARMOperand>(k_ShifterImmediate);
Op->ShifterImm.isASR = isASR;
Op->ShifterImm.Imm = Imm;
Op->StartLoc = S;
@@ -3487,7 +3494,7 @@ public:
static std::unique_ptr<ARMOperand> CreateRotImm(unsigned Imm, SMLoc S,
SMLoc E) {
- auto Op = make_unique<ARMOperand>(k_RotateImmediate);
+ auto Op = std::make_unique<ARMOperand>(k_RotateImmediate);
Op->RotImm.Imm = Imm;
Op->StartLoc = S;
Op->EndLoc = E;
@@ -3496,7 +3503,7 @@ public:
static std::unique_ptr<ARMOperand> CreateModImm(unsigned Bits, unsigned Rot,
SMLoc S, SMLoc E) {
- auto Op = make_unique<ARMOperand>(k_ModifiedImmediate);
+ auto Op = std::make_unique<ARMOperand>(k_ModifiedImmediate);
Op->ModImm.Bits = Bits;
Op->ModImm.Rot = Rot;
Op->StartLoc = S;
@@ -3506,7 +3513,7 @@ public:
static std::unique_ptr<ARMOperand>
CreateConstantPoolImm(const MCExpr *Val, SMLoc S, SMLoc E) {
- auto Op = make_unique<ARMOperand>(k_ConstantPoolImmediate);
+ auto Op = std::make_unique<ARMOperand>(k_ConstantPoolImmediate);
Op->Imm.Val = Val;
Op->StartLoc = S;
Op->EndLoc = E;
@@ -3515,7 +3522,7 @@ public:
static std::unique_ptr<ARMOperand>
CreateBitfield(unsigned LSB, unsigned Width, SMLoc S, SMLoc E) {
- auto Op = make_unique<ARMOperand>(k_BitfieldDescriptor);
+ auto Op = std::make_unique<ARMOperand>(k_BitfieldDescriptor);
Op->Bitfield.LSB = LSB;
Op->Bitfield.Width = Width;
Op->StartLoc = S;
@@ -3543,16 +3550,15 @@ public:
Kind = k_SPRRegisterList;
}
- // Sort based on the register encoding values.
- array_pod_sort(Regs.begin(), Regs.end());
-
if (Kind == k_RegisterList && Regs.back().second == ARM::APSR)
Kind = k_RegisterListWithAPSR;
- auto Op = make_unique<ARMOperand>(Kind);
- for (SmallVectorImpl<std::pair<unsigned, unsigned>>::const_iterator
- I = Regs.begin(), E = Regs.end(); I != E; ++I)
- Op->Registers.push_back(I->second);
+ assert(std::is_sorted(Regs.begin(), Regs.end()) &&
+ "Register list must be sorted by encoding");
+
+ auto Op = std::make_unique<ARMOperand>(Kind);
+ for (const auto &P : Regs)
+ Op->Registers.push_back(P.second);
Op->StartLoc = StartLoc;
Op->EndLoc = EndLoc;
@@ -3563,7 +3569,7 @@ public:
unsigned Count,
bool isDoubleSpaced,
SMLoc S, SMLoc E) {
- auto Op = make_unique<ARMOperand>(k_VectorList);
+ auto Op = std::make_unique<ARMOperand>(k_VectorList);
Op->VectorList.RegNum = RegNum;
Op->VectorList.Count = Count;
Op->VectorList.isDoubleSpaced = isDoubleSpaced;
@@ -3575,7 +3581,7 @@ public:
static std::unique_ptr<ARMOperand>
CreateVectorListAllLanes(unsigned RegNum, unsigned Count, bool isDoubleSpaced,
SMLoc S, SMLoc E) {
- auto Op = make_unique<ARMOperand>(k_VectorListAllLanes);
+ auto Op = std::make_unique<ARMOperand>(k_VectorListAllLanes);
Op->VectorList.RegNum = RegNum;
Op->VectorList.Count = Count;
Op->VectorList.isDoubleSpaced = isDoubleSpaced;
@@ -3587,7 +3593,7 @@ public:
static std::unique_ptr<ARMOperand>
CreateVectorListIndexed(unsigned RegNum, unsigned Count, unsigned Index,
bool isDoubleSpaced, SMLoc S, SMLoc E) {
- auto Op = make_unique<ARMOperand>(k_VectorListIndexed);
+ auto Op = std::make_unique<ARMOperand>(k_VectorListIndexed);
Op->VectorList.RegNum = RegNum;
Op->VectorList.Count = Count;
Op->VectorList.LaneIndex = Index;
@@ -3599,7 +3605,7 @@ public:
static std::unique_ptr<ARMOperand>
CreateVectorIndex(unsigned Idx, SMLoc S, SMLoc E, MCContext &Ctx) {
- auto Op = make_unique<ARMOperand>(k_VectorIndex);
+ auto Op = std::make_unique<ARMOperand>(k_VectorIndex);
Op->VectorIndex.Val = Idx;
Op->StartLoc = S;
Op->EndLoc = E;
@@ -3608,7 +3614,7 @@ public:
static std::unique_ptr<ARMOperand> CreateImm(const MCExpr *Val, SMLoc S,
SMLoc E) {
- auto Op = make_unique<ARMOperand>(k_Immediate);
+ auto Op = std::make_unique<ARMOperand>(k_Immediate);
Op->Imm.Val = Val;
Op->StartLoc = S;
Op->EndLoc = E;
@@ -3620,7 +3626,7 @@ public:
unsigned OffsetRegNum, ARM_AM::ShiftOpc ShiftType,
unsigned ShiftImm, unsigned Alignment, bool isNegative, SMLoc S,
SMLoc E, SMLoc AlignmentLoc = SMLoc()) {
- auto Op = make_unique<ARMOperand>(k_Memory);
+ auto Op = std::make_unique<ARMOperand>(k_Memory);
Op->Memory.BaseRegNum = BaseRegNum;
Op->Memory.OffsetImm = OffsetImm;
Op->Memory.OffsetRegNum = OffsetRegNum;
@@ -3637,7 +3643,7 @@ public:
static std::unique_ptr<ARMOperand>
CreatePostIdxReg(unsigned RegNum, bool isAdd, ARM_AM::ShiftOpc ShiftTy,
unsigned ShiftImm, SMLoc S, SMLoc E) {
- auto Op = make_unique<ARMOperand>(k_PostIndexRegister);
+ auto Op = std::make_unique<ARMOperand>(k_PostIndexRegister);
Op->PostIdxReg.RegNum = RegNum;
Op->PostIdxReg.isAdd = isAdd;
Op->PostIdxReg.ShiftTy = ShiftTy;
@@ -3649,7 +3655,7 @@ public:
static std::unique_ptr<ARMOperand> CreateMemBarrierOpt(ARM_MB::MemBOpt Opt,
SMLoc S) {
- auto Op = make_unique<ARMOperand>(k_MemBarrierOpt);
+ auto Op = std::make_unique<ARMOperand>(k_MemBarrierOpt);
Op->MBOpt.Val = Opt;
Op->StartLoc = S;
Op->EndLoc = S;
@@ -3658,7 +3664,7 @@ public:
static std::unique_ptr<ARMOperand>
CreateInstSyncBarrierOpt(ARM_ISB::InstSyncBOpt Opt, SMLoc S) {
- auto Op = make_unique<ARMOperand>(k_InstSyncBarrierOpt);
+ auto Op = std::make_unique<ARMOperand>(k_InstSyncBarrierOpt);
Op->ISBOpt.Val = Opt;
Op->StartLoc = S;
Op->EndLoc = S;
@@ -3667,7 +3673,7 @@ public:
static std::unique_ptr<ARMOperand>
CreateTraceSyncBarrierOpt(ARM_TSB::TraceSyncBOpt Opt, SMLoc S) {
- auto Op = make_unique<ARMOperand>(k_TraceSyncBarrierOpt);
+ auto Op = std::make_unique<ARMOperand>(k_TraceSyncBarrierOpt);
Op->TSBOpt.Val = Opt;
Op->StartLoc = S;
Op->EndLoc = S;
@@ -3676,7 +3682,7 @@ public:
static std::unique_ptr<ARMOperand> CreateProcIFlags(ARM_PROC::IFlags IFlags,
SMLoc S) {
- auto Op = make_unique<ARMOperand>(k_ProcIFlags);
+ auto Op = std::make_unique<ARMOperand>(k_ProcIFlags);
Op->IFlags.Val = IFlags;
Op->StartLoc = S;
Op->EndLoc = S;
@@ -3684,7 +3690,7 @@ public:
}
static std::unique_ptr<ARMOperand> CreateMSRMask(unsigned MMask, SMLoc S) {
- auto Op = make_unique<ARMOperand>(k_MSRMask);
+ auto Op = std::make_unique<ARMOperand>(k_MSRMask);
Op->MMask.Val = MMask;
Op->StartLoc = S;
Op->EndLoc = S;
@@ -3692,7 +3698,7 @@ public:
}
static std::unique_ptr<ARMOperand> CreateBankedReg(unsigned Reg, SMLoc S) {
- auto Op = make_unique<ARMOperand>(k_BankedReg);
+ auto Op = std::make_unique<ARMOperand>(k_BankedReg);
Op->BankedReg.Val = Reg;
Op->StartLoc = S;
Op->EndLoc = S;
@@ -4253,6 +4259,24 @@ static unsigned getNextRegister(unsigned Reg) {
}
}
+// Insert an <Encoding, Register> pair in an ordered vector. Return true on
+// success, or false if a duplicate encoding is found.
+static bool
+insertNoDuplicates(SmallVectorImpl<std::pair<unsigned, unsigned>> &Regs,
+ unsigned Enc, unsigned Reg) {
+ Regs.emplace_back(Enc, Reg);
+ for (auto I = Regs.rbegin(), J = I + 1, E = Regs.rend(); J != E; ++I, ++J) {
+ if (J->first == Enc) {
+ Regs.erase(J.base());
+ return false;
+ }
+ if (J->first < Enc)
+ break;
+ std::swap(*I, *J);
+ }
+ return true;
+}
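
A standalone equivalent of insertNoDuplicates using std::vector, to illustrate the ordering behaviour (the encodings and register numbers here are made-up values, not MC register encodings):

#include <cassert>
#include <utility>
#include <vector>

static bool insertNoDup(std::vector<std::pair<unsigned, unsigned>> &Regs,
                        unsigned Enc, unsigned Reg) {
  // Append the new pair, then bubble it towards the front until the list is
  // ordered by encoding; drop it again if an equal encoding already exists.
  Regs.emplace_back(Enc, Reg);
  for (auto I = Regs.rbegin(), J = I + 1, E = Regs.rend(); J != E; ++I, ++J) {
    if (J->first == Enc) {
      Regs.erase(J.base());
      return false;
    }
    if (J->first < Enc)
      break;
    std::swap(*I, *J);
  }
  return true;
}

int main() {
  std::vector<std::pair<unsigned, unsigned>> Regs;
  assert(insertNoDup(Regs, 0, 100));  // {0}
  assert(insertNoDup(Regs, 2, 102));  // {0, 2}
  assert(insertNoDup(Regs, 1, 101));  // {0, 1, 2}, kept sorted
  assert(!insertNoDup(Regs, 2, 102)); // duplicate encoding rejected
  return 0;
}
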
+
/// Parse a register list.
bool ARMAsmParser::parseRegisterList(OperandVector &Operands,
bool EnforceOrder) {
@@ -4278,7 +4302,7 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands,
if (ARMMCRegisterClasses[ARM::QPRRegClassID].contains(Reg)) {
Reg = getDRegFromQReg(Reg);
EReg = MRI->getEncodingValue(Reg);
- Registers.push_back(std::pair<unsigned, unsigned>(EReg, Reg));
+ Registers.emplace_back(EReg, Reg);
++Reg;
}
const MCRegisterClass *RC;
@@ -4295,7 +4319,7 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands,
// Store the register.
EReg = MRI->getEncodingValue(Reg);
- Registers.push_back(std::pair<unsigned, unsigned>(EReg, Reg));
+ Registers.emplace_back(EReg, Reg);
// This starts immediately after the first register token in the list,
// so we can see either a comma or a minus (range separator) as a legal
@@ -4326,7 +4350,11 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands,
while (Reg != EndReg) {
Reg = getNextRegister(Reg);
EReg = MRI->getEncodingValue(Reg);
- Registers.push_back(std::pair<unsigned, unsigned>(EReg, Reg));
+ if (!insertNoDuplicates(Registers, EReg, Reg)) {
+ Warning(AfterMinusLoc, StringRef("duplicated register (") +
+ ARMInstPrinter::getRegisterName(Reg) +
+ ") in register list");
+ }
}
continue;
}
@@ -4350,11 +4378,16 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands,
// subset of GPRRegClassId except it contains APSR as well.
RC = &ARMMCRegisterClasses[ARM::GPRwithAPSRnospRegClassID];
}
- if (Reg == ARM::VPR && (RC == &ARMMCRegisterClasses[ARM::SPRRegClassID] ||
- RC == &ARMMCRegisterClasses[ARM::DPRRegClassID])) {
+ if (Reg == ARM::VPR &&
+ (RC == &ARMMCRegisterClasses[ARM::SPRRegClassID] ||
+ RC == &ARMMCRegisterClasses[ARM::DPRRegClassID] ||
+ RC == &ARMMCRegisterClasses[ARM::FPWithVPRRegClassID])) {
RC = &ARMMCRegisterClasses[ARM::FPWithVPRRegClassID];
EReg = MRI->getEncodingValue(Reg);
- Registers.push_back(std::pair<unsigned, unsigned>(EReg, Reg));
+ if (!insertNoDuplicates(Registers, EReg, Reg)) {
+ Warning(RegLoc, "duplicated register (" + RegTok.getString() +
+ ") in register list");
+ }
continue;
}
// The register must be in the same register class as the first.
@@ -4371,21 +4404,19 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands,
else if (!ARMMCRegisterClasses[ARM::GPRwithAPSRnospRegClassID].contains(Reg))
return Error(RegLoc, "register list not in ascending order");
}
- if (MRI->getEncodingValue(Reg) == MRI->getEncodingValue(OldReg)) {
- Warning(RegLoc, "duplicated register (" + RegTok.getString() +
- ") in register list");
- continue;
- }
// VFP register lists must also be contiguous.
if (RC != &ARMMCRegisterClasses[ARM::GPRRegClassID] &&
RC != &ARMMCRegisterClasses[ARM::GPRwithAPSRnospRegClassID] &&
Reg != OldReg + 1)
return Error(RegLoc, "non-contiguous register range");
EReg = MRI->getEncodingValue(Reg);
- Registers.push_back(std::pair<unsigned, unsigned>(EReg, Reg));
+ if (!insertNoDuplicates(Registers, EReg, Reg)) {
+ Warning(RegLoc, "duplicated register (" + RegTok.getString() +
+ ") in register list");
+ }
if (isQReg) {
EReg = MRI->getEncodingValue(++Reg);
- Registers.push_back(std::pair<unsigned, unsigned>(EReg, Reg));
+ Registers.emplace_back(EReg, Reg);
}
}
@@ -5702,14 +5733,16 @@ bool ARMAsmParser::parseMemory(OperandVector &Operands) {
return false;
}
- // If we have a '#', it's an immediate offset, else assume it's a register
- // offset. Be friendly and also accept a plain integer (without a leading
- // hash) for gas compatibility.
+ // If we have a '#' or '$', it's an immediate offset, else assume it's a
+ // register offset. Be friendly and also accept a plain integer or expression
+ // (without a leading hash) for gas compatibility.
if (Parser.getTok().is(AsmToken::Hash) ||
Parser.getTok().is(AsmToken::Dollar) ||
+ Parser.getTok().is(AsmToken::LParen) ||
Parser.getTok().is(AsmToken::Integer)) {
- if (Parser.getTok().isNot(AsmToken::Integer))
- Parser.Lex(); // Eat '#' or '$'.
+ if (Parser.getTok().is(AsmToken::Hash) ||
+ Parser.getTok().is(AsmToken::Dollar))
+ Parser.Lex(); // Eat '#' or '$'
E = Parser.getTok().getLoc();
bool isNegative = getParser().getTok().is(AsmToken::Minus);
@@ -11308,7 +11341,7 @@ bool ARMAsmParser::parseDirectiveUnwindRaw(SMLoc L) {
SmallVector<uint8_t, 16> Opcodes;
auto parseOne = [&]() -> bool {
- const MCExpr *OE;
+ const MCExpr *OE = nullptr;
SMLoc OpcodeLoc = getLexer().getLoc();
if (check(getLexer().is(AsmToken::EndOfStatement) ||
Parser.parseExpression(OE),
@@ -11694,14 +11727,14 @@ bool ARMAsmParser::parseDirectiveArchExtension(SMLoc L) {
{ ARM::AEK_CRYPTO, {Feature_HasV8Bit},
{ARM::FeatureCrypto, ARM::FeatureNEON, ARM::FeatureFPARMv8} },
{ ARM::AEK_FP, {Feature_HasV8Bit},
- {ARM::FeatureVFP2_D16_SP, ARM::FeatureFPARMv8} },
+ {ARM::FeatureVFP2_SP, ARM::FeatureFPARMv8} },
{ (ARM::AEK_HWDIVTHUMB | ARM::AEK_HWDIVARM),
{Feature_HasV7Bit, Feature_IsNotMClassBit},
{ARM::FeatureHWDivThumb, ARM::FeatureHWDivARM} },
{ ARM::AEK_MP, {Feature_HasV7Bit, Feature_IsNotMClassBit},
{ARM::FeatureMP} },
{ ARM::AEK_SIMD, {Feature_HasV8Bit},
- {ARM::FeatureNEON, ARM::FeatureVFP2_D16_SP, ARM::FeatureFPARMv8} },
+ {ARM::FeatureNEON, ARM::FeatureVFP2_SP, ARM::FeatureFPARMv8} },
{ ARM::AEK_SEC, {Feature_HasV6KBit}, {ARM::FeatureTrustZone} },
// FIXME: Only available in A-class, isel not predicated
{ ARM::AEK_VIRT, {Feature_HasV7Bit}, {ARM::FeatureVirtualization} },
@@ -11775,19 +11808,19 @@ unsigned ARMAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp,
// immediate in the syntax.
switch (Kind) {
default: break;
- case MCK__35_0:
+ case MCK__HASH_0:
if (Op.isImm())
if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op.getImm()))
if (CE->getValue() == 0)
return Match_Success;
break;
- case MCK__35_8:
+ case MCK__HASH_8:
if (Op.isImm())
if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op.getImm()))
if (CE->getValue() == 8)
return Match_Success;
break;
- case MCK__35_16:
+ case MCK__HASH_16:
if (Op.isImm())
if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op.getImm()))
if (CE->getValue() == 16)
diff --git a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
index 673691ebd93e..eabc26d05f47 100644
--- a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
+++ b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
@@ -314,7 +314,7 @@ static DecodeStatus DecodeVLD3DupInstruction(MCInst &Inst, unsigned Val,
uint64_t Address, const void *Decoder);
static DecodeStatus DecodeVLD4DupInstruction(MCInst &Inst, unsigned Val,
uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeNEONModImmInstruction(MCInst &Inst,unsigned Val,
+static DecodeStatus DecodeVMOVModImmInstruction(MCInst &Inst,unsigned Val,
uint64_t Address, const void *Decoder);
static DecodeStatus DecodeMVEModImmInstruction(MCInst &Inst,unsigned Val,
uint64_t Address, const void *Decoder);
@@ -561,6 +561,8 @@ static DecodeStatus DecodeMVEVCMP(MCInst &Inst, unsigned Insn,
uint64_t Address, const void *Decoder);
static DecodeStatus DecodeMveVCTP(MCInst &Inst, unsigned Insn,
uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeMVEVPNOT(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
static DecodeStatus DecodeMVEOverlappingLongShift(MCInst &Inst, unsigned Insn,
uint64_t Address,
const void *Decoder);
@@ -3445,7 +3447,7 @@ static DecodeStatus DecodeVLD4DupInstruction(MCInst &Inst, unsigned Insn,
}
static DecodeStatus
-DecodeNEONModImmInstruction(MCInst &Inst, unsigned Insn,
+DecodeVMOVModImmInstruction(MCInst &Inst, unsigned Insn,
uint64_t Address, const void *Decoder) {
DecodeStatus S = MCDisassembler::Success;
@@ -5679,7 +5681,7 @@ static DecodeStatus DecodeVCVTD(MCInst &Inst, unsigned Insn,
}
}
}
- return DecodeNEONModImmInstruction(Inst, Insn, Address, Decoder);
+ return DecodeVMOVModImmInstruction(Inst, Insn, Address, Decoder);
}
if (!(imm & 0x20)) return MCDisassembler::Fail;
@@ -5738,7 +5740,7 @@ static DecodeStatus DecodeVCVTQ(MCInst &Inst, unsigned Insn,
}
}
}
- return DecodeNEONModImmInstruction(Inst, Insn, Address, Decoder);
+ return DecodeVMOVModImmInstruction(Inst, Insn, Address, Decoder);
}
if (!(imm & 0x20)) return MCDisassembler::Fail;
@@ -6481,6 +6483,12 @@ static DecodeStatus DecodeMVEOverlappingLongShift(
if (!Check(S, DecoderGPRRegisterClass(Inst, Rm, Address, Decoder)))
return MCDisassembler::Fail;
+ if (fieldFromInstruction (Insn, 6, 3) != 4)
+ return MCDisassembler::SoftFail;
+
+ if (Rda == Rm)
+ return MCDisassembler::SoftFail;
+
return S;
}
@@ -6503,6 +6511,13 @@ static DecodeStatus DecodeMVEOverlappingLongShift(
if (!Check(S, DecoderGPRRegisterClass(Inst, Rm, Address, Decoder)))
return MCDisassembler::Fail;
+ if (Inst.getOpcode() == ARM::MVE_SQRSHRL ||
+ Inst.getOpcode() == ARM::MVE_UQRSHLL) {
+ unsigned Saturate = fieldFromInstruction(Insn, 7, 1);
+    // The Saturate bit selects the saturation position: 1 means 48 bits,
+    // 0 means 64 bits.
+ Inst.addOperand(MCOperand::createImm(Saturate));
+ }
+
return S;
}
@@ -6572,3 +6587,11 @@ static DecodeStatus DecodeMveVCTP(MCInst &Inst, unsigned Insn, uint64_t Address,
return MCDisassembler::Fail;
return S;
}
+
+static DecodeStatus DecodeMVEVPNOT(MCInst &Inst, unsigned Insn, uint64_t Address,
+ const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+ Inst.addOperand(MCOperand::createReg(ARM::VPR));
+ Inst.addOperand(MCOperand::createReg(ARM::VPR));
+ return S;
+}
diff --git a/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h b/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h
index 7732a6485a85..24a9fabf0979 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h
@@ -518,10 +518,10 @@ namespace ARM_AM {
// Valid alignments depend on the specific instruction.
//===--------------------------------------------------------------------===//
- // NEON Modified Immediates
+ // NEON/MVE Modified Immediates
//===--------------------------------------------------------------------===//
//
- // Several NEON instructions (e.g., VMOV) take a "modified immediate"
+ // Several NEON and MVE instructions (e.g., VMOV) take a "modified immediate"
// vector operand, where a small immediate encoded in the instruction
// specifies a full NEON vector value. These modified immediates are
// represented here as encoded integers. The low 8 bits hold the immediate
@@ -529,20 +529,20 @@ namespace ARM_AM {
// the "Cmode" field of the instruction. The interfaces below treat the
// Op and Cmode values as a single 5-bit value.
- inline unsigned createNEONModImm(unsigned OpCmode, unsigned Val) {
+ inline unsigned createVMOVModImm(unsigned OpCmode, unsigned Val) {
return (OpCmode << 8) | Val;
}
- inline unsigned getNEONModImmOpCmode(unsigned ModImm) {
+ inline unsigned getVMOVModImmOpCmode(unsigned ModImm) {
return (ModImm >> 8) & 0x1f;
}
- inline unsigned getNEONModImmVal(unsigned ModImm) { return ModImm & 0xff; }
+ inline unsigned getVMOVModImmVal(unsigned ModImm) { return ModImm & 0xff; }
- /// decodeNEONModImm - Decode a NEON modified immediate value into the
+ /// decodeVMOVModImm - Decode a NEON/MVE modified immediate value into the
/// element value and the element size in bits. (If the element size is
/// smaller than the vector, it is splatted into all the elements.)
- inline uint64_t decodeNEONModImm(unsigned ModImm, unsigned &EltBits) {
- unsigned OpCmode = getNEONModImmOpCmode(ModImm);
- unsigned Imm8 = getNEONModImmVal(ModImm);
+ inline uint64_t decodeVMOVModImm(unsigned ModImm, unsigned &EltBits) {
+ unsigned OpCmode = getVMOVModImmOpCmode(ModImm);
+ unsigned Imm8 = getVMOVModImmVal(ModImm);
uint64_t Val = 0;
if (OpCmode == 0xe) {
@@ -572,7 +572,7 @@ namespace ARM_AM {
}
EltBits = 64;
} else {
- llvm_unreachable("Unsupported NEON immediate");
+ llvm_unreachable("Unsupported VMOV immediate");
}
return Val;
}
diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
index aeab5be78ab4..6196881a9b8f 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
@@ -233,7 +233,7 @@ static const char *checkPCRelOffset(uint64_t Value, int64_t Min, int64_t Max) {
const char *ARMAsmBackend::reasonForFixupRelaxation(const MCFixup &Fixup,
uint64_t Value) const {
- switch ((unsigned)Fixup.getKind()) {
+ switch (Fixup.getTargetKind()) {
case ARM::fixup_arm_thumb_br: {
// Relaxing tB to t2B. tB has a signed 12-bit displacement with the
// low bit being an implied zero. There's an implied +4 offset for the
@@ -870,7 +870,7 @@ bool ARMAsmBackend::shouldForceRelocation(const MCAssembler &Asm,
const MCValue &Target) {
const MCSymbolRefExpr *A = Target.getSymA();
const MCSymbol *Sym = A ? &A->getSymbol() : nullptr;
- const unsigned FixupKind = Fixup.getKind() ;
+ const unsigned FixupKind = Fixup.getKind();
if (FixupKind == FK_NONE)
return true;
if (FixupKind == ARM::fixup_arm_thumb_bl) {
@@ -1105,28 +1105,28 @@ uint32_t ARMAsmBackendDarwin::generateCompactUnwindEncoding(
if (Instrs.empty())
return 0;
// Start off assuming CFA is at SP+0.
- int CFARegister = ARM::SP;
+ unsigned CFARegister = ARM::SP;
int CFARegisterOffset = 0;
// Mark savable registers as initially unsaved
DenseMap<unsigned, int> RegOffsets;
int FloatRegCount = 0;
// Process each .cfi directive and build up compact unwind info.
for (size_t i = 0, e = Instrs.size(); i != e; ++i) {
- int Reg;
+ unsigned Reg;
const MCCFIInstruction &Inst = Instrs[i];
switch (Inst.getOperation()) {
case MCCFIInstruction::OpDefCfa: // DW_CFA_def_cfa
CFARegisterOffset = -Inst.getOffset();
- CFARegister = MRI.getLLVMRegNum(Inst.getRegister(), true);
+ CFARegister = *MRI.getLLVMRegNum(Inst.getRegister(), true);
break;
case MCCFIInstruction::OpDefCfaOffset: // DW_CFA_def_cfa_offset
CFARegisterOffset = -Inst.getOffset();
break;
case MCCFIInstruction::OpDefCfaRegister: // DW_CFA_def_cfa_register
- CFARegister = MRI.getLLVMRegNum(Inst.getRegister(), true);
+ CFARegister = *MRI.getLLVMRegNum(Inst.getRegister(), true);
break;
case MCCFIInstruction::OpOffset: // DW_CFA_offset
- Reg = MRI.getLLVMRegNum(Inst.getRegister(), true);
+ Reg = *MRI.getLLVMRegNum(Inst.getRegister(), true);
if (ARMMCRegisterClasses[ARM::GPRRegClassID].contains(Reg))
RegOffsets[Reg] = Inst.getOffset();
else if (ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Reg)) {
diff --git a/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h b/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
index c4daafe8ee97..6293a2462306 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
@@ -393,6 +393,9 @@ namespace ARMII {
// in an IT block).
ThumbArithFlagSetting = 1 << 19,
+ // Whether an instruction can be included in an MVE tail-predicated loop.
+ ValidForTailPredication = 1 << 20,
+
//===------------------------------------------------------------------===//
// Code domain.
DomainShift = 15,
diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
index fda19eea1de6..1fee38821a49 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
@@ -82,7 +82,7 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
MCSymbolRefExpr::VariantKind Modifier = Target.getAccessVariant();
if (IsPCRel) {
- switch ((unsigned)Fixup.getKind()) {
+ switch (Fixup.getTargetKind()) {
default:
Ctx.reportFatalError(Fixup.getLoc(), "unsupported relocation on symbol");
return ELF::R_ARM_NONE;
@@ -145,7 +145,7 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
return ELF::R_ARM_THM_BF18;
}
}
- switch ((unsigned)Fixup.getKind()) {
+ switch (Fixup.getTargetKind()) {
default:
Ctx.reportFatalError(Fixup.getLoc(), "unsupported relocation on symbol");
return ELF::R_ARM_NONE;
@@ -263,5 +263,5 @@ void ARMELFObjectWriter::addTargetSectionFlags(MCContext &Ctx,
std::unique_ptr<MCObjectTargetWriter>
llvm::createARMELFObjectWriter(uint8_t OSABI) {
- return llvm::make_unique<ARMELFObjectWriter>(OSABI);
+ return std::make_unique<ARMELFObjectWriter>(OSABI);
}
diff --git a/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp b/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp
index 45be1ee96342..a1def61b58d9 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp
@@ -1334,12 +1334,12 @@ void ARMInstPrinter::printFPImmOperand(const MCInst *MI, unsigned OpNum,
<< markup(">");
}
-void ARMInstPrinter::printNEONModImmOperand(const MCInst *MI, unsigned OpNum,
+void ARMInstPrinter::printVMOVModImmOperand(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI,
raw_ostream &O) {
unsigned EncodedImm = MI->getOperand(OpNum).getImm();
unsigned EltBits;
- uint64_t Val = ARM_AM::decodeNEONModImm(EncodedImm, EltBits);
+ uint64_t Val = ARM_AM::decodeVMOVModImm(EncodedImm, EltBits);
O << markup("<imm:") << "#0x";
O.write_hex(Val);
O << markup(">");
@@ -1676,3 +1676,11 @@ void ARMInstPrinter::printExpandedImmOperand(const MCInst *MI, unsigned OpNum,
O.write_hex(Val);
O << markup(">");
}
+
+void ARMInstPrinter::printMveSaturateOp(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
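+  // The operand is a single bit selecting the saturation width printed below:
+  // 1 prints #48 and 0 prints #64, the two saturation points of the MVE long
+  // shift instructions.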
+ uint32_t Val = MI->getOperand(OpNum).getImm();
+ assert(Val <= 1 && "Invalid MVE saturate operand");
+ O << "#" << (Val == 1 ? 48 : 64);
+}
diff --git a/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h b/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h
index 69026956b60e..eeb811e216fc 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h
@@ -191,7 +191,7 @@ public:
const MCSubtargetInfo &STI, raw_ostream &O);
void printFPImmOperand(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
- void printNEONModImmOperand(const MCInst *MI, unsigned OpNum,
+ void printVMOVModImmOperand(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
void printImmPlusOneOperand(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
@@ -262,7 +262,8 @@ public:
const MCSubtargetInfo &STI, raw_ostream &O);
void printExpandedImmOperand(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
-
+ void printMveSaturateOp(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
private:
unsigned DefaultAltIdx = ARM::NoRegAltName;
};
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
index dca6fe37d49a..268fe7efd9ce 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
@@ -1720,7 +1720,6 @@ getRegisterListOpValue(const MCInst &MI, unsigned Op,
unsigned Reg = MI.getOperand(Op).getReg();
bool SPRRegs = ARMMCRegisterClasses[ARM::SPRRegClassID].contains(Reg);
bool DPRRegs = ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Reg);
- bool CLRMRegs = MI.getOpcode() == ARM::t2CLRM;
unsigned Binary = 0;
@@ -1739,21 +1738,13 @@ getRegisterListOpValue(const MCInst &MI, unsigned Op,
Binary |= NumRegs * 2;
} else {
const MCRegisterInfo &MRI = *CTX.getRegisterInfo();
- if (!CLRMRegs) {
- assert(std::is_sorted(MI.begin() + Op, MI.end(),
- [&](const MCOperand &LHS, const MCOperand &RHS) {
- return MRI.getEncodingValue(LHS.getReg()) <
- MRI.getEncodingValue(RHS.getReg());
- }));
- }
-
+ assert(std::is_sorted(MI.begin() + Op, MI.end(),
+ [&](const MCOperand &LHS, const MCOperand &RHS) {
+ return MRI.getEncodingValue(LHS.getReg()) <
+ MRI.getEncodingValue(RHS.getReg());
+ }));
for (unsigned I = Op, E = MI.getNumOperands(); I < E; ++I) {
- unsigned RegNo;
- if (CLRMRegs && MI.getOperand(I).getReg() == ARM::APSR) {
- RegNo = 15;
- } else {
- RegNo = MRI.getEncodingValue(MI.getOperand(I).getReg());
- }
+ unsigned RegNo = MRI.getEncodingValue(MI.getOperand(I).getReg());
Binary |= 1 << RegNo;
}
}
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
index c49885023cb2..ed4000c7e5be 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
@@ -204,7 +204,7 @@ RecordARMScatteredHalfRelocation(MachObjectWriter *Writer,
// relocation entry in the low 16 bits of r_address field.
unsigned ThumbBit = 0;
unsigned MovtBit = 0;
- switch ((unsigned)Fixup.getKind()) {
+ switch (Fixup.getTargetKind()) {
default: break;
case ARM::fixup_arm_movt_hi16:
MovtBit = 1;
@@ -480,7 +480,7 @@ void ARMMachObjectWriter::recordRelocation(MachObjectWriter *Writer,
// PAIR. I.e. it's correct that we insert the high bits of the addend in the
// MOVW case here. relocation entries.
uint32_t Value = 0;
- switch ((unsigned)Fixup.getKind()) {
+ switch (Fixup.getTargetKind()) {
default: break;
case ARM::fixup_arm_movw_lo16:
case ARM::fixup_t2_movw_lo16:
@@ -506,5 +506,5 @@ void ARMMachObjectWriter::recordRelocation(MachObjectWriter *Writer,
std::unique_ptr<MCObjectTargetWriter>
llvm::createARMMachObjectWriter(bool Is64Bit, uint32_t CPUType,
uint32_t CPUSubtype) {
- return llvm::make_unique<ARMMachObjectWriter>(Is64Bit, CPUType, CPUSubtype);
+ return std::make_unique<ARMMachObjectWriter>(Is64Bit, CPUType, CPUSubtype);
}
diff --git a/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp
index b863517c0cca..7b30a61e8ccb 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp
@@ -249,12 +249,12 @@ void ARMTargetStreamer::emitTargetAttributes(const MCSubtargetInfo &STI) {
: ARM::FK_VFPV3_D16)
: (STI.hasFeature(ARM::FeatureFP16) ? ARM::FK_VFPV3XD_FP16
: ARM::FK_VFPV3XD)));
- else if (STI.hasFeature(ARM::FeatureVFP2_D16_SP))
+ else if (STI.hasFeature(ARM::FeatureVFP2_SP))
emitFPU(ARM::FK_VFPV2);
}
// ABI_HardFP_use attribute to indicate single precision FP.
- if (STI.hasFeature(ARM::FeatureVFP2_D16_SP) && !STI.hasFeature(ARM::FeatureFP64))
+ if (STI.hasFeature(ARM::FeatureVFP2_SP) && !STI.hasFeature(ARM::FeatureFP64))
emitAttribute(ARMBuildAttrs::ABI_HardFP_use,
ARMBuildAttrs::HardFPSinglePrecision);
diff --git a/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp
index 054a95dd1e12..900c5fe30364 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp
@@ -92,7 +92,7 @@ namespace llvm {
std::unique_ptr<MCObjectTargetWriter>
createARMWinCOFFObjectWriter(bool Is64Bit) {
- return llvm::make_unique<ARMWinCOFFObjectWriter>(Is64Bit);
+ return std::make_unique<ARMWinCOFFObjectWriter>(Is64Bit);
}
} // end namespace llvm
diff --git a/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp
index 2e816bea5e91..b3c8146a9bde 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp
@@ -22,20 +22,10 @@ public:
std::unique_ptr<MCObjectWriter> OW)
: MCWinCOFFStreamer(C, std::move(AB), std::move(CE), std::move(OW)) {}
- void EmitAssemblerFlag(MCAssemblerFlag Flag) override;
void EmitThumbFunc(MCSymbol *Symbol) override;
void FinishImpl() override;
};
-void ARMWinCOFFStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) {
- switch (Flag) {
- default: llvm_unreachable("not implemented");
- case MCAF_SyntaxUnified:
- case MCAF_Code16:
- break;
- }
-}
-
void ARMWinCOFFStreamer::EmitThumbFunc(MCSymbol *Symbol) {
getAssembler().setIsThumbFunc(Symbol);
}
diff --git a/lib/Target/ARM/MLxExpansionPass.cpp b/lib/Target/ARM/MLxExpansionPass.cpp
index 4b25986b90a7..cc31929899b4 100644
--- a/lib/Target/ARM/MLxExpansionPass.cpp
+++ b/lib/Target/ARM/MLxExpansionPass.cpp
@@ -86,8 +86,8 @@ void MLxExpansion::pushStack(MachineInstr *MI) {
MachineInstr *MLxExpansion::getAccDefMI(MachineInstr *MI) const {
// Look past COPY and INSERT_SUBREG instructions to find the
// real definition MI. This is important for _sfp instructions.
- unsigned Reg = MI->getOperand(1).getReg();
- if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ Register Reg = MI->getOperand(1).getReg();
+ if (Register::isPhysicalRegister(Reg))
return nullptr;
MachineBasicBlock *MBB = MI->getParent();
@@ -97,13 +97,13 @@ MachineInstr *MLxExpansion::getAccDefMI(MachineInstr *MI) const {
break;
if (DefMI->isCopyLike()) {
Reg = DefMI->getOperand(1).getReg();
- if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (Register::isVirtualRegister(Reg)) {
DefMI = MRI->getVRegDef(Reg);
continue;
}
} else if (DefMI->isInsertSubreg()) {
Reg = DefMI->getOperand(2).getReg();
- if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (Register::isVirtualRegister(Reg)) {
DefMI = MRI->getVRegDef(Reg);
continue;
}
@@ -114,9 +114,8 @@ MachineInstr *MLxExpansion::getAccDefMI(MachineInstr *MI) const {
}
unsigned MLxExpansion::getDefReg(MachineInstr *MI) const {
- unsigned Reg = MI->getOperand(0).getReg();
- if (TargetRegisterInfo::isPhysicalRegister(Reg) ||
- !MRI->hasOneNonDBGUse(Reg))
+ Register Reg = MI->getOperand(0).getReg();
+ if (Register::isPhysicalRegister(Reg) || !MRI->hasOneNonDBGUse(Reg))
return Reg;
MachineBasicBlock *MBB = MI->getParent();
@@ -126,8 +125,7 @@ unsigned MLxExpansion::getDefReg(MachineInstr *MI) const {
while (UseMI->isCopy() || UseMI->isInsertSubreg()) {
Reg = UseMI->getOperand(0).getReg();
- if (TargetRegisterInfo::isPhysicalRegister(Reg) ||
- !MRI->hasOneNonDBGUse(Reg))
+ if (Register::isPhysicalRegister(Reg) || !MRI->hasOneNonDBGUse(Reg))
return Reg;
UseMI = &*MRI->use_instr_nodbg_begin(Reg);
if (UseMI->getParent() != MBB)
@@ -140,8 +138,8 @@ unsigned MLxExpansion::getDefReg(MachineInstr *MI) const {
/// hasLoopHazard - Check whether an MLx instruction is chained to itself across
/// a single-MBB loop.
bool MLxExpansion::hasLoopHazard(MachineInstr *MI) const {
- unsigned Reg = MI->getOperand(1).getReg();
- if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ Register Reg = MI->getOperand(1).getReg();
+ if (Register::isPhysicalRegister(Reg))
return false;
MachineBasicBlock *MBB = MI->getParent();
@@ -154,8 +152,8 @@ outer_continue:
if (DefMI->isPHI()) {
for (unsigned i = 1, e = DefMI->getNumOperands(); i < e; i += 2) {
if (DefMI->getOperand(i + 1).getMBB() == MBB) {
- unsigned SrcReg = DefMI->getOperand(i).getReg();
- if (TargetRegisterInfo::isVirtualRegister(SrcReg)) {
+ Register SrcReg = DefMI->getOperand(i).getReg();
+ if (Register::isVirtualRegister(SrcReg)) {
DefMI = MRI->getVRegDef(SrcReg);
goto outer_continue;
}
@@ -163,13 +161,13 @@ outer_continue:
}
} else if (DefMI->isCopyLike()) {
Reg = DefMI->getOperand(1).getReg();
- if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (Register::isVirtualRegister(Reg)) {
DefMI = MRI->getVRegDef(Reg);
continue;
}
} else if (DefMI->isInsertSubreg()) {
Reg = DefMI->getOperand(2).getReg();
- if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (Register::isVirtualRegister(Reg)) {
DefMI = MRI->getVRegDef(Reg);
continue;
}
@@ -271,23 +269,23 @@ void
MLxExpansion::ExpandFPMLxInstruction(MachineBasicBlock &MBB, MachineInstr *MI,
unsigned MulOpc, unsigned AddSubOpc,
bool NegAcc, bool HasLane) {
- unsigned DstReg = MI->getOperand(0).getReg();
+ Register DstReg = MI->getOperand(0).getReg();
bool DstDead = MI->getOperand(0).isDead();
- unsigned AccReg = MI->getOperand(1).getReg();
- unsigned Src1Reg = MI->getOperand(2).getReg();
- unsigned Src2Reg = MI->getOperand(3).getReg();
+ Register AccReg = MI->getOperand(1).getReg();
+ Register Src1Reg = MI->getOperand(2).getReg();
+ Register Src2Reg = MI->getOperand(3).getReg();
bool Src1Kill = MI->getOperand(2).isKill();
bool Src2Kill = MI->getOperand(3).isKill();
unsigned LaneImm = HasLane ? MI->getOperand(4).getImm() : 0;
unsigned NextOp = HasLane ? 5 : 4;
ARMCC::CondCodes Pred = (ARMCC::CondCodes)MI->getOperand(NextOp).getImm();
- unsigned PredReg = MI->getOperand(++NextOp).getReg();
+ Register PredReg = MI->getOperand(++NextOp).getReg();
const MCInstrDesc &MCID1 = TII->get(MulOpc);
const MCInstrDesc &MCID2 = TII->get(AddSubOpc);
const MachineFunction &MF = *MI->getParent()->getParent();
- unsigned TmpReg = MRI->createVirtualRegister(
- TII->getRegClass(MCID1, 0, TRI, MF));
+ Register TmpReg =
+ MRI->createVirtualRegister(TII->getRegClass(MCID1, 0, TRI, MF));
MachineInstrBuilder MIB = BuildMI(MBB, MI, MI->getDebugLoc(), MCID1, TmpReg)
.addReg(Src1Reg, getKillRegState(Src1Kill))
diff --git a/lib/Target/ARM/MVETailPredication.cpp b/lib/Target/ARM/MVETailPredication.cpp
new file mode 100644
index 000000000000..4db8ab17c49b
--- /dev/null
+++ b/lib/Target/ARM/MVETailPredication.cpp
@@ -0,0 +1,519 @@
+//===- MVETailPredication.cpp - MVE Tail Predication ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Armv8.1m introduced MVE, M-Profile Vector Extension, and low-overhead
+/// branches to help accelerate DSP applications. These two extensions can be
+/// combined to provide implicit vector predication within a low-overhead loop.
+/// The HardwareLoops pass inserts intrinsics identifying loops that the
+/// backend will attempt to convert into a low-overhead loop. The vectorizer is
+/// responsible for generating a vectorized loop in which the lanes are
+/// predicated upon the iteration counter. This pass looks at these predicated
+/// vector loops, which are targets for low-overhead loops, and prepares them
+/// for code generation. Once the vectorizer has produced a masked loop, there
+/// are a couple of final forms:
+/// - A tail-predicated loop, with implicit predication.
+///   - A loop containing multiple VCTP instructions, predicating multiple VPT
+/// blocks of instructions operating on different vector types.
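+///
+/// As an illustrative sketch (the value names here are invented, not taken
+/// from real vectorizer output), the body of a tail-predicated loop computes
+/// its mask with the VCTP intrinsic from the number of elements still to be
+/// processed:
+///
+///   %pred = call <4 x i1> @llvm.arm.vctp32(i32 %elems.rem)
+///   %wide = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr,
+///                              i32 4, <4 x i1> %pred, <4 x i32> undef)
+///   %elems.rem.next = sub i32 %elems.rem, 4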
+
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "ARM.h"
+#include "ARMSubtarget.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mve-tail-predication"
+#define DESC "Transform predicated vector loops to use MVE tail predication"
+
+static cl::opt<bool>
+DisableTailPredication("disable-mve-tail-predication", cl::Hidden,
+ cl::init(true),
+ cl::desc("Disable MVE Tail Predication"));
+namespace {
+
+class MVETailPredication : public LoopPass {
+ SmallVector<IntrinsicInst*, 4> MaskedInsts;
+ Loop *L = nullptr;
+ ScalarEvolution *SE = nullptr;
+ TargetTransformInfo *TTI = nullptr;
+
+public:
+ static char ID;
+
+ MVETailPredication() : LoopPass(ID) { }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addRequired<TargetPassConfig>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.setPreservesCFG();
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager&) override;
+
+private:
+
+ /// Perform the relevant checks on the loop and convert if possible.
+ bool TryConvert(Value *TripCount);
+
+  /// Return whether this is a vectorized loop that contains masked
+  /// load/stores.
+ bool IsPredicatedVectorLoop();
+
+ /// Compute a value for the total number of elements that the predicated
+ /// loop will process.
+ Value *ComputeElements(Value *TripCount, VectorType *VecTy);
+
+  /// Return whether the icmp that generates an i1 vector is based upon a
+  /// loop counter and a limit that is defined outside the loop.
+ bool isTailPredicate(Instruction *Predicate, Value *NumElements);
+};
+
+} // end namespace
+
+static bool IsDecrement(Instruction &I) {
+ auto *Call = dyn_cast<IntrinsicInst>(&I);
+ if (!Call)
+ return false;
+
+ Intrinsic::ID ID = Call->getIntrinsicID();
+ return ID == Intrinsic::loop_decrement_reg;
+}
+
+static bool IsMasked(Instruction *I) {
+ auto *Call = dyn_cast<IntrinsicInst>(I);
+ if (!Call)
+ return false;
+
+ Intrinsic::ID ID = Call->getIntrinsicID();
+ // TODO: Support gather/scatter expand/compress operations.
+ return ID == Intrinsic::masked_store || ID == Intrinsic::masked_load;
+}
+
+bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) {
+ if (skipLoop(L) || DisableTailPredication)
+ return false;
+
+ Function &F = *L->getHeader()->getParent();
+ auto &TPC = getAnalysis<TargetPassConfig>();
+ auto &TM = TPC.getTM<TargetMachine>();
+ auto *ST = &TM.getSubtarget<ARMSubtarget>(F);
+ TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ this->L = L;
+
+ // The MVE and LOB extensions are combined to enable tail-predication, but
+ // there's nothing preventing us from generating VCTP instructions for v8.1m.
+ if (!ST->hasMVEIntegerOps() || !ST->hasV8_1MMainlineOps()) {
+ LLVM_DEBUG(dbgs() << "TP: Not a v8.1m.main+mve target.\n");
+ return false;
+ }
+
+ BasicBlock *Preheader = L->getLoopPreheader();
+ if (!Preheader)
+ return false;
+
+ auto FindLoopIterations = [](BasicBlock *BB) -> IntrinsicInst* {
+ for (auto &I : *BB) {
+ auto *Call = dyn_cast<IntrinsicInst>(&I);
+ if (!Call)
+ continue;
+
+ Intrinsic::ID ID = Call->getIntrinsicID();
+ if (ID == Intrinsic::set_loop_iterations ||
+ ID == Intrinsic::test_set_loop_iterations)
+ return cast<IntrinsicInst>(&I);
+ }
+ return nullptr;
+ };
+
+ // Look for the hardware loop intrinsic that sets the iteration count.
+ IntrinsicInst *Setup = FindLoopIterations(Preheader);
+
+  // The test.set intrinsic could live in the pre-preheader, i.e. the
+  // preheader's single predecessor.
+ if (!Setup) {
+ if (!Preheader->getSinglePredecessor())
+ return false;
+ Setup = FindLoopIterations(Preheader->getSinglePredecessor());
+ if (!Setup)
+ return false;
+ }
+
+  // Search for the hardware loop intrinsic that decrements the loop counter.
+ IntrinsicInst *Decrement = nullptr;
+ for (auto *BB : L->getBlocks()) {
+ for (auto &I : *BB) {
+ if (IsDecrement(I)) {
+ Decrement = cast<IntrinsicInst>(&I);
+ break;
+ }
+ }
+ }
+
+ if (!Decrement)
+ return false;
+
+ LLVM_DEBUG(dbgs() << "TP: Running on Loop: " << *L
+ << *Setup << "\n"
+ << *Decrement << "\n");
+ bool Changed = TryConvert(Setup->getArgOperand(0));
+ return Changed;
+}
+
+bool MVETailPredication::isTailPredicate(Instruction *I, Value *NumElements) {
+ // Look for the following:
+
+ // %trip.count.minus.1 = add i32 %N, -1
+ // %broadcast.splatinsert10 = insertelement <4 x i32> undef,
+ // i32 %trip.count.minus.1, i32 0
+ // %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10,
+ // <4 x i32> undef,
+ // <4 x i32> zeroinitializer
+ // ...
+ // ...
+ // %index = phi i32
+ // %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
+ // %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
+ // <4 x i32> undef,
+ // <4 x i32> zeroinitializer
+ // %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
+ // %pred = icmp ule <4 x i32> %induction, %broadcast.splat11
+
+  // And return whether I == %pred.
+
+ using namespace PatternMatch;
+
+ CmpInst::Predicate Pred;
+ Instruction *Shuffle = nullptr;
+ Instruction *Induction = nullptr;
+
+ // The vector icmp
+ if (!match(I, m_ICmp(Pred, m_Instruction(Induction),
+ m_Instruction(Shuffle))) ||
+ Pred != ICmpInst::ICMP_ULE || !L->isLoopInvariant(Shuffle))
+ return false;
+
+  // First find the code outside the loop which sets up the limit vector:
+  // the invariant shuffle that broadcasts the limit into a vector.
+ Instruction *Insert = nullptr;
+ if (!match(Shuffle, m_ShuffleVector(m_Instruction(Insert), m_Undef(),
+ m_Zero())))
+ return false;
+
+ // Insert the limit into a vector.
+ Instruction *BECount = nullptr;
+ if (!match(Insert, m_InsertElement(m_Undef(), m_Instruction(BECount),
+ m_Zero())))
+ return false;
+
+  // The limit calculation: the backedge count, i.e. the trip count minus one.
+ Value *TripCount = nullptr;
+ if (!match(BECount, m_Add(m_Value(TripCount), m_AllOnes())))
+ return false;
+
+ if (TripCount != NumElements)
+ return false;
+
+ // Now back to searching inside the loop body...
+  // Find the add that takes the index iv and adds a constant vector to it.
+ Instruction *BroadcastSplat = nullptr;
+ Constant *Const = nullptr;
+ if (!match(Induction, m_Add(m_Instruction(BroadcastSplat),
+ m_Constant(Const))))
+ return false;
+
+  // Check that we're adding <0, 1, 2, 3, ...>.
+ if (auto *CDS = dyn_cast<ConstantDataSequential>(Const)) {
+ for (unsigned i = 0; i < CDS->getNumElements(); ++i) {
+ if (CDS->getElementAsInteger(i) != i)
+ return false;
+ }
+ } else
+ return false;
+
+ // The shuffle which broadcasts the index iv into a vector.
+ if (!match(BroadcastSplat, m_ShuffleVector(m_Instruction(Insert), m_Undef(),
+ m_Zero())))
+ return false;
+
+ // The insert element which initialises a vector with the index iv.
+ Instruction *IV = nullptr;
+ if (!match(Insert, m_InsertElement(m_Undef(), m_Instruction(IV), m_Zero())))
+ return false;
+
+ // The index iv.
+ auto *Phi = dyn_cast<PHINode>(IV);
+ if (!Phi)
+ return false;
+
+ // TODO: Don't think we need to check the entry value.
+ Value *OnEntry = Phi->getIncomingValueForBlock(L->getLoopPreheader());
+ if (!match(OnEntry, m_Zero()))
+ return false;
+
+ Value *InLoop = Phi->getIncomingValueForBlock(L->getLoopLatch());
+ unsigned Lanes = cast<VectorType>(Insert->getType())->getNumElements();
+
+ Instruction *LHS = nullptr;
+ if (!match(InLoop, m_Add(m_Instruction(LHS), m_SpecificInt(Lanes))))
+ return false;
+
+ return LHS == Phi;
+}
+
+static VectorType* getVectorType(IntrinsicInst *I) {
+ unsigned TypeOp = I->getIntrinsicID() == Intrinsic::masked_load ? 0 : 1;
+ auto *PtrTy = cast<PointerType>(I->getOperand(TypeOp)->getType());
+ return cast<VectorType>(PtrTy->getElementType());
+}
+
+bool MVETailPredication::IsPredicatedVectorLoop() {
+ // Check that the loop contains at least one masked load/store intrinsic.
+  // Masked loads/stores are the only vector instructions we support; any
+  // other intrinsic operating on vector types rejects the loop.
+ for (auto *BB : L->getBlocks()) {
+ for (auto &I : *BB) {
+ if (IsMasked(&I)) {
+ VectorType *VecTy = getVectorType(cast<IntrinsicInst>(&I));
+ unsigned Lanes = VecTy->getNumElements();
+ unsigned ElementWidth = VecTy->getScalarSizeInBits();
+ // MVE vectors are 128-bit, but don't support 128 x i1.
+ // TODO: Can we support vectors larger than 128-bits?
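+        // For example v16i8, v8i16 and v4i32 all pass this check, while a
+        // 32-bit v4i8 access or a 128 x i1 vector would be rejected.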
+ unsigned MaxWidth = TTI->getRegisterBitWidth(true);
+ if (Lanes * ElementWidth != MaxWidth || Lanes == MaxWidth)
+ return false;
+ MaskedInsts.push_back(cast<IntrinsicInst>(&I));
+ } else if (auto *Int = dyn_cast<IntrinsicInst>(&I)) {
+ for (auto &U : Int->args()) {
+ if (isa<VectorType>(U->getType()))
+ return false;
+ }
+ }
+ }
+ }
+
+ return !MaskedInsts.empty();
+}
+
+Value* MVETailPredication::ComputeElements(Value *TripCount,
+ VectorType *VecTy) {
+ const SCEV *TripCountSE = SE->getSCEV(TripCount);
+ ConstantInt *VF = ConstantInt::get(cast<IntegerType>(TripCount->getType()),
+ VecTy->getNumElements());
+
+ if (VF->equalsInt(1))
+ return nullptr;
+
+ // TODO: Support constant trip counts.
+ auto VisitAdd = [&](const SCEVAddExpr *S) -> const SCEVMulExpr* {
+ if (auto *Const = dyn_cast<SCEVConstant>(S->getOperand(0))) {
+ if (Const->getAPInt() != -VF->getValue())
+ return nullptr;
+ } else
+ return nullptr;
+ return dyn_cast<SCEVMulExpr>(S->getOperand(1));
+ };
+
+ auto VisitMul = [&](const SCEVMulExpr *S) -> const SCEVUDivExpr* {
+ if (auto *Const = dyn_cast<SCEVConstant>(S->getOperand(0))) {
+ if (Const->getValue() != VF)
+ return nullptr;
+ } else
+ return nullptr;
+ return dyn_cast<SCEVUDivExpr>(S->getOperand(1));
+ };
+
+ auto VisitDiv = [&](const SCEVUDivExpr *S) -> const SCEV* {
+ if (auto *Const = dyn_cast<SCEVConstant>(S->getRHS())) {
+ if (Const->getValue() != VF)
+ return nullptr;
+ } else
+ return nullptr;
+
+ if (auto *RoundUp = dyn_cast<SCEVAddExpr>(S->getLHS())) {
+ if (auto *Const = dyn_cast<SCEVConstant>(RoundUp->getOperand(0))) {
+ if (Const->getAPInt() != (VF->getValue() - 1))
+ return nullptr;
+ } else
+ return nullptr;
+
+ return RoundUp->getOperand(1);
+ }
+ return nullptr;
+ };
+
+ // TODO: Can we use SCEV helpers, such as findArrayDimensions, and friends to
+ // determine the numbers of elements instead? Looks like this is what is used
+ // for delinearization, but I'm not sure if it can be applied to the
+ // vectorized form - at least not without a bit more work than I feel
+ // comfortable with.
+
+ // Search for Elems in the following SCEV:
+ // (1 + ((-VF + (VF * (((VF - 1) + %Elems) /u VF))<nuw>) /u VF))<nuw><nsw>
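+  //
+  // As a worked example (illustrative numbers only): with VF = 4 and
+  // %Elems = 10, the rounded-up division gives 3, then 4 * 3 = 12,
+  // (-4 + 12) /u 4 = 2, and 1 + 2 = 3 iterations, i.e. ceil(10 / 4).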
+ const SCEV *Elems = nullptr;
+ if (auto *TC = dyn_cast<SCEVAddExpr>(TripCountSE))
+ if (auto *Div = dyn_cast<SCEVUDivExpr>(TC->getOperand(1)))
+ if (auto *Add = dyn_cast<SCEVAddExpr>(Div->getLHS()))
+ if (auto *Mul = VisitAdd(Add))
+ if (auto *Div = VisitMul(Mul))
+ if (auto *Res = VisitDiv(Div))
+ Elems = Res;
+
+ if (!Elems)
+ return nullptr;
+
+ Instruction *InsertPt = L->getLoopPreheader()->getTerminator();
+ if (!isSafeToExpandAt(Elems, InsertPt, *SE))
+ return nullptr;
+
+ auto DL = L->getHeader()->getModule()->getDataLayout();
+ SCEVExpander Expander(*SE, DL, "elements");
+ return Expander.expandCodeFor(Elems, Elems->getType(), InsertPt);
+}
+
+// Look through the exit block to see whether there's a duplicate predicate
+// instruction. This can happen when we need to perform a select on values
+// from the last and previous iteration. Instead of doing a straight
+// replacement of that predicate with the vctp, clone the vctp and place it
+// in the block. This means that the VPR doesn't have to be live into the
+// exit block which should make it easier to convert this loop into a proper
+// tail predicated loop.
+static void Cleanup(DenseMap<Instruction*, Instruction*> &NewPredicates,
+ SetVector<Instruction*> &MaybeDead, Loop *L) {
+ if (BasicBlock *Exit = L->getUniqueExitBlock()) {
+ for (auto &Pair : NewPredicates) {
+ Instruction *OldPred = Pair.first;
+ Instruction *NewPred = Pair.second;
+
+ for (auto &I : *Exit) {
+ if (I.isSameOperationAs(OldPred)) {
+ Instruction *PredClone = NewPred->clone();
+ PredClone->insertBefore(&I);
+ I.replaceAllUsesWith(PredClone);
+ MaybeDead.insert(&I);
+ break;
+ }
+ }
+ }
+ }
+
+ // Drop references and add operands to check for dead.
+ SmallPtrSet<Instruction*, 4> Dead;
+ while (!MaybeDead.empty()) {
+ auto *I = MaybeDead.front();
+ MaybeDead.remove(I);
+ if (I->hasNUsesOrMore(1))
+ continue;
+
+ for (auto &U : I->operands()) {
+ if (auto *OpI = dyn_cast<Instruction>(U))
+ MaybeDead.insert(OpI);
+ }
+ I->dropAllReferences();
+ Dead.insert(I);
+ }
+
+ for (auto *I : Dead)
+ I->eraseFromParent();
+
+ for (auto I : L->blocks())
+ DeleteDeadPHIs(I);
+}
+
+bool MVETailPredication::TryConvert(Value *TripCount) {
+ if (!IsPredicatedVectorLoop())
+ return false;
+
+ LLVM_DEBUG(dbgs() << "TP: Found predicated vector loop.\n");
+
+ // Walk through the masked intrinsics and try to find whether the predicate
+ // operand is generated from an induction variable.
+ Module *M = L->getHeader()->getModule();
+ Type *Ty = IntegerType::get(M->getContext(), 32);
+ SetVector<Instruction*> Predicates;
+ DenseMap<Instruction*, Instruction*> NewPredicates;
+
+ for (auto *I : MaskedInsts) {
+ Intrinsic::ID ID = I->getIntrinsicID();
+ unsigned PredOp = ID == Intrinsic::masked_load ? 2 : 3;
+ auto *Predicate = dyn_cast<Instruction>(I->getArgOperand(PredOp));
+ if (!Predicate || Predicates.count(Predicate))
+ continue;
+
+ VectorType *VecTy = getVectorType(I);
+ Value *NumElements = ComputeElements(TripCount, VecTy);
+ if (!NumElements)
+ continue;
+
+ if (!isTailPredicate(Predicate, NumElements)) {
+ LLVM_DEBUG(dbgs() << "TP: Not tail predicate: " << *Predicate << "\n");
+ continue;
+ }
+
+ LLVM_DEBUG(dbgs() << "TP: Found tail predicate: " << *Predicate << "\n");
+ Predicates.insert(Predicate);
+
+ // Insert a phi to count the number of elements processed by the loop.
+ IRBuilder<> Builder(L->getHeader()->getFirstNonPHI());
+ PHINode *Processed = Builder.CreatePHI(Ty, 2);
+ Processed->addIncoming(NumElements, L->getLoopPreheader());
+
+ // Insert the intrinsic to represent the effect of tail predication.
+ Builder.SetInsertPoint(cast<Instruction>(Predicate));
+ ConstantInt *Factor =
+ ConstantInt::get(cast<IntegerType>(Ty), VecTy->getNumElements());
+ Intrinsic::ID VCTPID;
+ switch (VecTy->getNumElements()) {
+ default:
+ llvm_unreachable("unexpected number of lanes");
+ case 2: VCTPID = Intrinsic::arm_vctp64; break;
+ case 4: VCTPID = Intrinsic::arm_vctp32; break;
+ case 8: VCTPID = Intrinsic::arm_vctp16; break;
+ case 16: VCTPID = Intrinsic::arm_vctp8; break;
+ }
+ Function *VCTP = Intrinsic::getDeclaration(M, VCTPID);
+ Value *TailPredicate = Builder.CreateCall(VCTP, Processed);
+ Predicate->replaceAllUsesWith(TailPredicate);
+ NewPredicates[Predicate] = cast<Instruction>(TailPredicate);
+
+ // Add the incoming value to the new phi.
+ // TODO: This add likely already exists in the loop.
+ Value *Remaining = Builder.CreateSub(Processed, Factor);
+ Processed->addIncoming(Remaining, L->getLoopLatch());
+ LLVM_DEBUG(dbgs() << "TP: Insert processed elements phi: "
+ << *Processed << "\n"
+ << "TP: Inserted VCTP: " << *TailPredicate << "\n");
+ }
+
+ // Now clean up.
+ Cleanup(NewPredicates, Predicates, L);
+ return true;
+}
+
+Pass *llvm::createMVETailPredicationPass() {
+ return new MVETailPredication();
+}
+
+char MVETailPredication::ID = 0;
+
+INITIALIZE_PASS_BEGIN(MVETailPredication, DEBUG_TYPE, DESC, false, false)
+INITIALIZE_PASS_END(MVETailPredication, DEBUG_TYPE, DESC, false, false)
diff --git a/lib/Target/ARM/MVEVPTBlockPass.cpp b/lib/Target/ARM/MVEVPTBlockPass.cpp
new file mode 100644
index 000000000000..bc0a80b177ed
--- /dev/null
+++ b/lib/Target/ARM/MVEVPTBlockPass.cpp
@@ -0,0 +1,278 @@
+//===-- MVEVPTBlockPass.cpp - Insert MVE VPT blocks -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMMachineFunctionInfo.h"
+#include "ARMSubtarget.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "Thumb2InstrInfo.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineInstrBundle.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include <cassert>
+#include <new>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "arm-mve-vpt"
+
+namespace {
+ class MVEVPTBlock : public MachineFunctionPass {
+ public:
+ static char ID;
+ const Thumb2InstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+
+ MVEVPTBlock() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &Fn) override;
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs);
+ }
+
+ StringRef getPassName() const override {
+ return "MVE VPT block insertion pass";
+ }
+
+ private:
+ bool InsertVPTBlocks(MachineBasicBlock &MBB);
+ };
+
+ char MVEVPTBlock::ID = 0;
+
+} // end anonymous namespace
+
+INITIALIZE_PASS(MVEVPTBlock, DEBUG_TYPE, "ARM MVE VPT block pass", false, false)
+
+enum VPTMaskValue {
+ T = 8, // 0b1000
+ TT = 4, // 0b0100
+ TE = 12, // 0b1100
+ TTT = 2, // 0b0010
+ TTE = 6, // 0b0110
+ TEE = 10, // 0b1010
+ TET = 14, // 0b1110
+ TTTT = 1, // 0b0001
+ TTTE = 3, // 0b0011
+ TTEE = 5, // 0b0101
+ TTET = 7, // 0b0111
+ TEEE = 9, // 0b1001
+ TEET = 11, // 0b1011
+ TETT = 13, // 0b1101
+ TETE = 15 // 0b1111
+};
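+
+// Reading the table above: the mask is consumed MSB-first, each bit flipping
+// the predicate relative to the previous instruction in the block (starting
+// from the initial Then), with the lowest set bit terminating the block. This
+// pass only ever produces the all-Then masks T, TT, TTT and TTTT.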
+
+static unsigned VCMPOpcodeToVPT(unsigned Opcode) {
+ switch (Opcode) {
+ case ARM::MVE_VCMPf32:
+ return ARM::MVE_VPTv4f32;
+ case ARM::MVE_VCMPf16:
+ return ARM::MVE_VPTv8f16;
+ case ARM::MVE_VCMPi8:
+ return ARM::MVE_VPTv16i8;
+ case ARM::MVE_VCMPi16:
+ return ARM::MVE_VPTv8i16;
+ case ARM::MVE_VCMPi32:
+ return ARM::MVE_VPTv4i32;
+ case ARM::MVE_VCMPu8:
+ return ARM::MVE_VPTv16u8;
+ case ARM::MVE_VCMPu16:
+ return ARM::MVE_VPTv8u16;
+ case ARM::MVE_VCMPu32:
+ return ARM::MVE_VPTv4u32;
+ case ARM::MVE_VCMPs8:
+ return ARM::MVE_VPTv16s8;
+ case ARM::MVE_VCMPs16:
+ return ARM::MVE_VPTv8s16;
+ case ARM::MVE_VCMPs32:
+ return ARM::MVE_VPTv4s32;
+
+ case ARM::MVE_VCMPf32r:
+ return ARM::MVE_VPTv4f32r;
+ case ARM::MVE_VCMPf16r:
+ return ARM::MVE_VPTv8f16r;
+ case ARM::MVE_VCMPi8r:
+ return ARM::MVE_VPTv16i8r;
+ case ARM::MVE_VCMPi16r:
+ return ARM::MVE_VPTv8i16r;
+ case ARM::MVE_VCMPi32r:
+ return ARM::MVE_VPTv4i32r;
+ case ARM::MVE_VCMPu8r:
+ return ARM::MVE_VPTv16u8r;
+ case ARM::MVE_VCMPu16r:
+ return ARM::MVE_VPTv8u16r;
+ case ARM::MVE_VCMPu32r:
+ return ARM::MVE_VPTv4u32r;
+ case ARM::MVE_VCMPs8r:
+ return ARM::MVE_VPTv16s8r;
+ case ARM::MVE_VCMPs16r:
+ return ARM::MVE_VPTv8s16r;
+ case ARM::MVE_VCMPs32r:
+ return ARM::MVE_VPTv4s32r;
+
+ default:
+ return 0;
+ }
+}
+
+static MachineInstr *findVCMPToFoldIntoVPST(MachineBasicBlock::iterator MI,
+ const TargetRegisterInfo *TRI,
+ unsigned &NewOpcode) {
+  // Search backwards to the instruction that defines VPR. This may or may
+  // not be a VCMP; we check that after this loop. If we find another
+  // instruction that reads VPR first, we return nullptr.
+ MachineBasicBlock::iterator CmpMI = MI;
+ while (CmpMI != MI->getParent()->begin()) {
+ --CmpMI;
+ if (CmpMI->modifiesRegister(ARM::VPR, TRI))
+ break;
+ if (CmpMI->readsRegister(ARM::VPR, TRI))
+ break;
+ }
+
+ if (CmpMI == MI)
+ return nullptr;
+ NewOpcode = VCMPOpcodeToVPT(CmpMI->getOpcode());
+ if (NewOpcode == 0)
+ return nullptr;
+
+ // Search forward from CmpMI to MI, checking if either register was def'd
+ if (registerDefinedBetween(CmpMI->getOperand(1).getReg(), std::next(CmpMI),
+ MI, TRI))
+ return nullptr;
+ if (registerDefinedBetween(CmpMI->getOperand(2).getReg(), std::next(CmpMI),
+ MI, TRI))
+ return nullptr;
+ return &*CmpMI;
+}
+
+bool MVEVPTBlock::InsertVPTBlocks(MachineBasicBlock &Block) {
+ bool Modified = false;
+ MachineBasicBlock::instr_iterator MBIter = Block.instr_begin();
+ MachineBasicBlock::instr_iterator EndIter = Block.instr_end();
+
+ while (MBIter != EndIter) {
+ MachineInstr *MI = &*MBIter;
+ unsigned PredReg = 0;
+ DebugLoc dl = MI->getDebugLoc();
+
+ ARMVCC::VPTCodes Pred = getVPTInstrPredicate(*MI, PredReg);
+
+ // The idea of the predicate is that None, Then and Else are for use when
+ // handling assembly language: they correspond to the three possible
+ // suffixes "", "t" and "e" on the mnemonic. So when instructions are read
+ // from assembly source or disassembled from object code, you expect to see
+ // a mixture whenever there's a long VPT block. But in code generation, we
+ // hope we'll never generate an Else as input to this pass.
+ assert(Pred != ARMVCC::Else && "VPT block pass does not expect Else preds");
+
+ if (Pred == ARMVCC::None) {
+ ++MBIter;
+ continue;
+ }
+
+ LLVM_DEBUG(dbgs() << "VPT block created for: "; MI->dump());
+ int VPTInstCnt = 1;
+ ARMVCC::VPTCodes NextPred;
+
+ // Look at subsequent instructions, checking if they can be in the same VPT
+ // block.
+ ++MBIter;
+ while (MBIter != EndIter && VPTInstCnt < 4) {
+ NextPred = getVPTInstrPredicate(*MBIter, PredReg);
+ assert(NextPred != ARMVCC::Else &&
+ "VPT block pass does not expect Else preds");
+ if (NextPred != Pred)
+ break;
+ LLVM_DEBUG(dbgs() << " adding : "; MBIter->dump());
+ ++VPTInstCnt;
+ ++MBIter;
+ };
+
+ unsigned BlockMask = 0;
+ switch (VPTInstCnt) {
+ case 1:
+ BlockMask = VPTMaskValue::T;
+ break;
+ case 2:
+ BlockMask = VPTMaskValue::TT;
+ break;
+ case 3:
+ BlockMask = VPTMaskValue::TTT;
+ break;
+ case 4:
+ BlockMask = VPTMaskValue::TTTT;
+ break;
+ default:
+ llvm_unreachable("Unexpected number of instruction in a VPT block");
+ };
+
+ // Search back for a VCMP that can be folded to create a VPT, or else create
+ // a VPST directly
+ MachineInstrBuilder MIBuilder;
+ unsigned NewOpcode;
+ MachineInstr *VCMP = findVCMPToFoldIntoVPST(MI, TRI, NewOpcode);
+ if (VCMP) {
+ LLVM_DEBUG(dbgs() << " folding VCMP into VPST: "; VCMP->dump());
+ MIBuilder = BuildMI(Block, MI, dl, TII->get(NewOpcode));
+ MIBuilder.addImm(BlockMask);
+ MIBuilder.add(VCMP->getOperand(1));
+ MIBuilder.add(VCMP->getOperand(2));
+ MIBuilder.add(VCMP->getOperand(3));
+ VCMP->eraseFromParent();
+ } else {
+ MIBuilder = BuildMI(Block, MI, dl, TII->get(ARM::MVE_VPST));
+ MIBuilder.addImm(BlockMask);
+ }
+
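+    // Bundle the VPST/VPT together with the instructions it predicates so
+    // that later passes treat the whole block as a single unit.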
+ finalizeBundle(
+ Block, MachineBasicBlock::instr_iterator(MIBuilder.getInstr()), MBIter);
+
+ Modified = true;
+ }
+ return Modified;
+}
+
+bool MVEVPTBlock::runOnMachineFunction(MachineFunction &Fn) {
+ const ARMSubtarget &STI =
+ static_cast<const ARMSubtarget &>(Fn.getSubtarget());
+
+ if (!STI.isThumb2() || !STI.hasMVEIntegerOps())
+ return false;
+
+ TII = static_cast<const Thumb2InstrInfo *>(STI.getInstrInfo());
+ TRI = STI.getRegisterInfo();
+
+ LLVM_DEBUG(dbgs() << "********** ARM MVE VPT BLOCKS **********\n"
+ << "********** Function: " << Fn.getName() << '\n');
+
+ bool Modified = false;
+ for (MachineBasicBlock &MBB : Fn)
+ Modified |= InsertVPTBlocks(MBB);
+
+ LLVM_DEBUG(dbgs() << "**************************************\n");
+ return Modified;
+}
+
+/// createMVEVPTBlock - Returns an instance of the MVE VPT block
+/// insertion pass.
+FunctionPass *llvm::createMVEVPTBlockPass() { return new MVEVPTBlock(); }
diff --git a/lib/Target/ARM/Thumb1FrameLowering.cpp b/lib/Target/ARM/Thumb1FrameLowering.cpp
index 426e9a0ed9b8..956d474f1d79 100644
--- a/lib/Target/ARM/Thumb1FrameLowering.cpp
+++ b/lib/Target/ARM/Thumb1FrameLowering.cpp
@@ -164,7 +164,7 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF,
// to determine the end of the prologue.
DebugLoc dl;
- unsigned FramePtr = RegInfo->getFrameRegister(MF);
+ Register FramePtr = RegInfo->getFrameRegister(MF);
unsigned BasePtr = RegInfo->getBaseRegister();
int CFAOffset = 0;
@@ -459,8 +459,8 @@ static bool isCSRestore(MachineInstr &MI, const MCPhysReg *CSRegs) {
else if (MI.getOpcode() == ARM::tPOP) {
return true;
} else if (MI.getOpcode() == ARM::tMOVr) {
- unsigned Dst = MI.getOperand(0).getReg();
- unsigned Src = MI.getOperand(1).getReg();
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
return ((ARM::tGPRRegClass.contains(Src) || Src == ARM::LR) &&
ARM::hGPRRegClass.contains(Dst));
}
@@ -483,7 +483,7 @@ void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF,
assert((unsigned)NumBytes >= ArgRegsSaveSize &&
"ArgRegsSaveSize is included in NumBytes");
const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
- unsigned FramePtr = RegInfo->getFrameRegister(MF);
+ Register FramePtr = RegInfo->getFrameRegister(MF);
if (!AFI->hasStackFrame()) {
if (NumBytes - ArgRegsSaveSize != 0)
diff --git a/lib/Target/ARM/Thumb1InstrInfo.cpp b/lib/Target/ARM/Thumb1InstrInfo.cpp
index f57d93a2e83d..fccaa4c9cc8a 100644
--- a/lib/Target/ARM/Thumb1InstrInfo.cpp
+++ b/lib/Target/ARM/Thumb1InstrInfo.cpp
@@ -80,12 +80,11 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
assert((RC == &ARM::tGPRRegClass ||
- (TargetRegisterInfo::isPhysicalRegister(SrcReg) &&
- isARMLowRegister(SrcReg))) && "Unknown regclass!");
+ (Register::isPhysicalRegister(SrcReg) && isARMLowRegister(SrcReg))) &&
+ "Unknown regclass!");
if (RC == &ARM::tGPRRegClass ||
- (TargetRegisterInfo::isPhysicalRegister(SrcReg) &&
- isARMLowRegister(SrcReg))) {
+ (Register::isPhysicalRegister(SrcReg) && isARMLowRegister(SrcReg))) {
DebugLoc DL;
if (I != MBB.end()) DL = I->getDebugLoc();
@@ -108,13 +107,13 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
unsigned DestReg, int FI,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
- assert((RC->hasSuperClassEq(&ARM::tGPRRegClass) ||
- (TargetRegisterInfo::isPhysicalRegister(DestReg) &&
- isARMLowRegister(DestReg))) && "Unknown regclass!");
+ assert(
+ (RC->hasSuperClassEq(&ARM::tGPRRegClass) ||
+ (Register::isPhysicalRegister(DestReg) && isARMLowRegister(DestReg))) &&
+ "Unknown regclass!");
if (RC->hasSuperClassEq(&ARM::tGPRRegClass) ||
- (TargetRegisterInfo::isPhysicalRegister(DestReg) &&
- isARMLowRegister(DestReg))) {
+ (Register::isPhysicalRegister(DestReg) && isARMLowRegister(DestReg))) {
DebugLoc DL;
if (I != MBB.end()) DL = I->getDebugLoc();
diff --git a/lib/Target/ARM/Thumb2ITBlockPass.cpp b/lib/Target/ARM/Thumb2ITBlockPass.cpp
index 3143eb9840ed..786fc78d0233 100644
--- a/lib/Target/ARM/Thumb2ITBlockPass.cpp
+++ b/lib/Target/ARM/Thumb2ITBlockPass.cpp
@@ -87,7 +87,7 @@ static void TrackDefUses(MachineInstr *MI, RegisterSet &Defs, RegisterSet &Uses,
for (auto &MO : MI->operands()) {
if (!MO.isReg())
continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (!Reg || Reg == ARM::ITSTATE || Reg == ARM::SP)
continue;
if (MO.isUse())
@@ -145,8 +145,8 @@ Thumb2ITBlock::MoveCopyOutOfITBlock(MachineInstr *MI,
MI->getOperand(1).getSubReg() == 0 &&
"Sub-register indices still around?");
- unsigned DstReg = MI->getOperand(0).getReg();
- unsigned SrcReg = MI->getOperand(1).getReg();
+ Register DstReg = MI->getOperand(0).getReg();
+ Register SrcReg = MI->getOperand(1).getReg();
// First check if it's safe to move it.
if (Uses.count(DstReg) || Defs.count(SrcReg))
@@ -308,131 +308,3 @@ bool Thumb2ITBlock::runOnMachineFunction(MachineFunction &Fn) {
/// createThumb2ITBlockPass - Returns an instance of the Thumb2 IT blocks
/// insertion pass.
FunctionPass *llvm::createThumb2ITBlockPass() { return new Thumb2ITBlock(); }
-
-#undef DEBUG_TYPE
-#define DEBUG_TYPE "arm-mve-vpt"
-
-namespace {
- class MVEVPTBlock : public MachineFunctionPass {
- public:
- static char ID;
- const Thumb2InstrInfo *TII;
- const TargetRegisterInfo *TRI;
-
- MVEVPTBlock() : MachineFunctionPass(ID) {}
-
- bool runOnMachineFunction(MachineFunction &Fn) override;
-
- MachineFunctionProperties getRequiredProperties() const override {
- return MachineFunctionProperties().set(
- MachineFunctionProperties::Property::NoVRegs);
- }
-
- StringRef getPassName() const override {
- return "MVE VPT block insertion pass";
- }
-
- private:
- bool InsertVPTBlocks(MachineBasicBlock &MBB);
- };
-
- char MVEVPTBlock::ID = 0;
-
-} // end anonymous namespace
-
-INITIALIZE_PASS(MVEVPTBlock, DEBUG_TYPE, "ARM MVE VPT block pass", false, false)
-
-enum VPTMaskValue {
- T = 8, // 0b1000
- TT = 4, // 0b0100
- TE = 12, // 0b1100
- TTT = 2, // 0b0010
- TTE = 6, // 0b0110
- TEE = 10, // 0b1010
- TET = 14, // 0b1110
- TTTT = 1, // 0b0001
- TTTE = 3, // 0b0011
- TTEE = 5, // 0b0101
- TTET = 7, // 0b0111
- TEEE = 9, // 0b1001
- TEET = 11, // 0b1011
- TETT = 13, // 0b1101
- TETE = 15 // 0b1111
-};
-
-bool MVEVPTBlock::InsertVPTBlocks(MachineBasicBlock &Block) {
- bool Modified = false;
- MachineBasicBlock::iterator MBIter = Block.begin();
- MachineBasicBlock::iterator EndIter = Block.end();
-
- while (MBIter != EndIter) {
- MachineInstr *MI = &*MBIter;
- unsigned PredReg = 0;
- DebugLoc dl = MI->getDebugLoc();
-
- ARMVCC::VPTCodes Pred = getVPTInstrPredicate(*MI, PredReg);
-
- // The idea of the predicate is that None, Then and Else are for use when
- // handling assembly language: they correspond to the three possible
- // suffixes "", "t" and "e" on the mnemonic. So when instructions are read
- // from assembly source or disassembled from object code, you expect to see
- // a mixture whenever there's a long VPT block. But in code generation, we
- // hope we'll never generate an Else as input to this pass.
-
- assert(Pred != ARMVCC::Else && "VPT block pass does not expect Else preds");
-
- if (Pred == ARMVCC::None) {
- ++MBIter;
- continue;
- }
-
- MachineInstrBuilder MIBuilder =
- BuildMI(Block, MBIter, dl, TII->get(ARM::MVE_VPST));
- // The mask value for the VPST instruction is T = 0b1000 = 8
- MIBuilder.addImm(VPTMaskValue::T);
-
- MachineBasicBlock::iterator VPSTInsertPos = MIBuilder.getInstr();
- int VPTInstCnt = 1;
- ARMVCC::VPTCodes NextPred;
-
- do {
- ++MBIter;
- NextPred = getVPTInstrPredicate(*MBIter, PredReg);
- } while (NextPred != ARMVCC::None && NextPred == Pred && ++VPTInstCnt < 4);
-
- MachineInstr *LastMI = &*MBIter;
- finalizeBundle(Block, VPSTInsertPos.getInstrIterator(),
- ++LastMI->getIterator());
-
- Modified = true;
- LLVM_DEBUG(dbgs() << "VPT block created for: "; MI->dump(););
-
- ++MBIter;
- }
- return Modified;
-}
-
-bool MVEVPTBlock::runOnMachineFunction(MachineFunction &Fn) {
- const ARMSubtarget &STI =
- static_cast<const ARMSubtarget &>(Fn.getSubtarget());
-
- if (!STI.isThumb2() || !STI.hasMVEIntegerOps())
- return false;
-
- TII = static_cast<const Thumb2InstrInfo *>(STI.getInstrInfo());
- TRI = STI.getRegisterInfo();
-
- LLVM_DEBUG(dbgs() << "********** ARM MVE VPT BLOCKS **********\n"
- << "********** Function: " << Fn.getName() << '\n');
-
- bool Modified = false;
- for (MachineBasicBlock &MBB : Fn)
- Modified |= InsertVPTBlocks(MBB);
-
- LLVM_DEBUG(dbgs() << "**************************************\n");
- return Modified;
-}
-
-/// createMVEVPTBlock - Returns an instance of the MVE VPT block
-/// insertion pass.
-FunctionPass *llvm::createMVEVPTBlockPass() { return new MVEVPTBlock(); }
diff --git a/lib/Target/ARM/Thumb2InstrInfo.cpp b/lib/Target/ARM/Thumb2InstrInfo.cpp
index 5a965f7a6b9b..af1f0aeb27ba 100644
--- a/lib/Target/ARM/Thumb2InstrInfo.cpp
+++ b/lib/Target/ARM/Thumb2InstrInfo.cpp
@@ -159,9 +159,9 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
// Thumb2 STRD expects its dest-registers to be in rGPR. Not a problem for
// gsub_0, but needs an extra constraint for gsub_1 (which could be sp
// otherwise).
- if (TargetRegisterInfo::isVirtualRegister(SrcReg)) {
+ if (Register::isVirtualRegister(SrcReg)) {
MachineRegisterInfo *MRI = &MF.getRegInfo();
- MRI->constrainRegClass(SrcReg, &ARM::GPRPair_with_gsub_1_in_GPRwithAPSRnospRegClass);
+ MRI->constrainRegClass(SrcReg, &ARM::GPRPairnospRegClass);
}
MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::t2STRDi8));
@@ -200,10 +200,9 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
// Thumb2 LDRD expects its dest-registers to be in rGPR. Not a problem for
// gsub_0, but needs an extra constraint for gsub_1 (which could be sp
// otherwise).
- if (TargetRegisterInfo::isVirtualRegister(DestReg)) {
+ if (Register::isVirtualRegister(DestReg)) {
MachineRegisterInfo *MRI = &MF.getRegInfo();
- MRI->constrainRegClass(DestReg,
- &ARM::GPRPair_with_gsub_1_in_GPRwithAPSRnospRegClass);
+ MRI->constrainRegClass(DestReg, &ARM::GPRPairnospRegClass);
}
MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::t2LDRDi8));
@@ -211,7 +210,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
AddDReg(MIB, DestReg, ARM::gsub_1, RegState::DefineNoRead, TRI);
MIB.addFrameIndex(FI).addImm(0).addMemOperand(MMO).add(predOps(ARMCC::AL));
- if (TargetRegisterInfo::isPhysicalRegister(DestReg))
+ if (Register::isPhysicalRegister(DestReg))
MIB.addReg(DestReg, RegState::ImplicitDefine);
return;
}
@@ -470,12 +469,17 @@ immediateOffsetOpcode(unsigned opcode)
bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
unsigned FrameReg, int &Offset,
- const ARMBaseInstrInfo &TII) {
+ const ARMBaseInstrInfo &TII,
+ const TargetRegisterInfo *TRI) {
unsigned Opcode = MI.getOpcode();
const MCInstrDesc &Desc = MI.getDesc();
unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask);
bool isSub = false;
+ MachineFunction &MF = *MI.getParent()->getParent();
+ const TargetRegisterClass *RegClass =
+ TII.getRegClass(Desc, FrameRegIdx, TRI, MF);
+
// Memory operands in inline assembly always use AddrModeT2_i12.
if (Opcode == ARM::INLINEASM || Opcode == ARM::INLINEASM_BR)
AddrMode = ARMII::AddrModeT2_i12; // FIXME. mode for thumb2?
@@ -554,7 +558,7 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
// register then we change to an immediate version.
unsigned NewOpc = Opcode;
if (AddrMode == ARMII::AddrModeT2_so) {
- unsigned OffsetReg = MI.getOperand(FrameRegIdx+1).getReg();
+ Register OffsetReg = MI.getOperand(FrameRegIdx + 1).getReg();
if (OffsetReg != 0) {
MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
return Offset == 0;
@@ -645,10 +649,21 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
MachineOperand &ImmOp = MI.getOperand(FrameRegIdx+1);
// Attempt to fold address computation
- // Common case: small offset, fits into instruction.
+ // Common case: small offset, fits into instruction. We need to make sure
+ // the register class is correct too, for instructions like the MVE
+ // VLDRH.32, which only accepts low tGPR registers.
int ImmedOffset = Offset / Scale;
unsigned Mask = (1 << NumBits) - 1;
- if ((unsigned)Offset <= Mask * Scale) {
+ if ((unsigned)Offset <= Mask * Scale &&
+ (Register::isVirtualRegister(FrameReg) ||
+ RegClass->contains(FrameReg))) {
+ if (Register::isVirtualRegister(FrameReg)) {
+ // Make sure the register class for the virtual register is correct
+ MachineRegisterInfo *MRI = &MF.getRegInfo();
+ if (!MRI->constrainRegClass(FrameReg, RegClass))
+ llvm_unreachable("Unable to constrain virtual register class.");
+ }
+
// Replace the FrameIndex with fp/sp
MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
if (isSub) {
@@ -681,7 +696,8 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
}
Offset = (isSub) ? -Offset : Offset;
- return Offset == 0;
+ return Offset == 0 && (Register::isVirtualRegister(FrameReg) ||
+ RegClass->contains(FrameReg));
}
ARMCC::CondCodes llvm::getITInstrPredicate(const MachineInstr &MI,
diff --git a/lib/Target/ARM/Thumb2SizeReduction.cpp b/lib/Target/ARM/Thumb2SizeReduction.cpp
index 37a85fa38417..c5a62aa33990 100644
--- a/lib/Target/ARM/Thumb2SizeReduction.cpp
+++ b/lib/Target/ARM/Thumb2SizeReduction.cpp
@@ -300,7 +300,7 @@ Thumb2SizeReduce::canAddPseudoFlagDep(MachineInstr *Use, bool FirstInSelfLoop) {
for (const MachineOperand &MO : CPSRDef->operands()) {
if (!MO.isReg() || MO.isUndef() || MO.isUse())
continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (Reg == 0 || Reg == ARM::CPSR)
continue;
Defs.insert(Reg);
@@ -309,7 +309,7 @@ Thumb2SizeReduce::canAddPseudoFlagDep(MachineInstr *Use, bool FirstInSelfLoop) {
for (const MachineOperand &MO : Use->operands()) {
if (!MO.isReg() || MO.isUndef() || MO.isDef())
continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (Defs.count(Reg))
return false;
}
@@ -380,7 +380,7 @@ static bool VerifyLowRegs(MachineInstr *MI) {
const MachineOperand &MO = MI->getOperand(i);
if (!MO.isReg() || MO.isImplicit())
continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (Reg == 0 || Reg == ARM::CPSR)
continue;
if (isPCOk && Reg == ARM::PC)
@@ -464,11 +464,11 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI,
// For this reason we can't reuse the logic at the end of this function; we
// have to implement the MI building here.
bool IsStore = Entry.WideOpc == ARM::t2STR_POST;
- unsigned Rt = MI->getOperand(IsStore ? 1 : 0).getReg();
- unsigned Rn = MI->getOperand(IsStore ? 0 : 1).getReg();
+ Register Rt = MI->getOperand(IsStore ? 1 : 0).getReg();
+ Register Rn = MI->getOperand(IsStore ? 0 : 1).getReg();
unsigned Offset = MI->getOperand(3).getImm();
unsigned PredImm = MI->getOperand(4).getImm();
- unsigned PredReg = MI->getOperand(5).getReg();
+ Register PredReg = MI->getOperand(5).getReg();
assert(isARMLowRegister(Rt));
assert(isARMLowRegister(Rn));
@@ -496,7 +496,7 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI,
return true;
}
case ARM::t2LDMIA: {
- unsigned BaseReg = MI->getOperand(0).getReg();
+ Register BaseReg = MI->getOperand(0).getReg();
assert(isARMLowRegister(BaseReg));
// For the non-writeback version (this one), the base register must be
@@ -524,7 +524,7 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI,
break;
case ARM::t2LDMIA_RET: {
- unsigned BaseReg = MI->getOperand(1).getReg();
+ Register BaseReg = MI->getOperand(1).getReg();
if (BaseReg != ARM::SP)
return false;
Opc = Entry.NarrowOpc2; // tPOP_RET
@@ -537,7 +537,7 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI,
case ARM::t2STMDB_UPD: {
OpNum = 0;
- unsigned BaseReg = MI->getOperand(1).getReg();
+ Register BaseReg = MI->getOperand(1).getReg();
if (BaseReg == ARM::SP &&
(Entry.WideOpc == ARM::t2LDMIA_UPD ||
Entry.WideOpc == ARM::t2STMDB_UPD)) {
@@ -743,11 +743,11 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
// are optimizing for size.
return false;
- unsigned Reg0 = MI->getOperand(0).getReg();
- unsigned Reg1 = MI->getOperand(1).getReg();
+ Register Reg0 = MI->getOperand(0).getReg();
+ Register Reg1 = MI->getOperand(1).getReg();
// t2MUL is "special". The tied source operand is second, not first.
if (MI->getOpcode() == ARM::t2MUL) {
- unsigned Reg2 = MI->getOperand(2).getReg();
+ Register Reg2 = MI->getOperand(2).getReg();
// Early exit if the regs aren't all low regs.
if (!isARMLowRegister(Reg0) || !isARMLowRegister(Reg1)
|| !isARMLowRegister(Reg2))
@@ -782,7 +782,7 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
if (Imm > Limit)
return false;
} else {
- unsigned Reg2 = MI->getOperand(2).getReg();
+ Register Reg2 = MI->getOperand(2).getReg();
if (Entry.LowRegs2 && !isARMLowRegister(Reg2))
return false;
}
@@ -868,7 +868,7 @@ Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI,
continue;
const MachineOperand &MO = MI->getOperand(i);
if (MO.isReg()) {
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (!Reg || Reg == ARM::CPSR)
continue;
if (Entry.LowRegs1 && !isARMLowRegister(Reg))
diff --git a/lib/Target/ARM/ThumbRegisterInfo.cpp b/lib/Target/ARM/ThumbRegisterInfo.cpp
index a96417ffce4d..b0ba58d8dc4a 100644
--- a/lib/Target/ARM/ThumbRegisterInfo.cpp
+++ b/lib/Target/ARM/ThumbRegisterInfo.cpp
@@ -107,8 +107,9 @@ void ThumbRegisterInfo::emitLoadConstPool(
MachineFunction &MF = *MBB.getParent();
const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>();
if (STI.isThumb1Only()) {
- assert((isARMLowRegister(DestReg) || isVirtualRegister(DestReg)) &&
- "Thumb1 does not have ldr to high register");
+ assert(
+ (isARMLowRegister(DestReg) || Register::isVirtualRegister(DestReg)) &&
+ "Thumb1 does not have ldr to high register");
return emitThumb1LoadConstPool(MBB, MBBI, dl, DestReg, SubIdx, Val, Pred,
PredReg, MIFlags);
}
@@ -141,7 +142,7 @@ static void emitThumbRegPlusImmInReg(
unsigned LdReg = DestReg;
if (DestReg == ARM::SP)
assert(BaseReg == ARM::SP && "Unexpected!");
- if (!isARMLowRegister(DestReg) && !MRI.isVirtualRegister(DestReg))
+ if (!isARMLowRegister(DestReg) && !Register::isVirtualRegister(DestReg))
LdReg = MF.getRegInfo().createVirtualRegister(&ARM::tGPRRegClass);
if (NumBytes <= 255 && NumBytes >= 0 && CanChangeCC) {
@@ -371,7 +372,7 @@ bool ThumbRegisterInfo::rewriteFrameIndex(MachineBasicBlock::iterator II,
if (Opcode == ARM::tADDframe) {
Offset += MI.getOperand(FrameRegIdx+1).getImm();
- unsigned DestReg = MI.getOperand(0).getReg();
+ Register DestReg = MI.getOperand(0).getReg();
emitThumbRegPlusImmediate(MBB, II, dl, DestReg, FrameReg, Offset, TII,
*this);
@@ -509,7 +510,7 @@ void ThumbRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
if (MI.mayLoad()) {
// Use the destination register to materialize sp + offset.
- unsigned TmpReg = MI.getOperand(0).getReg();
+ Register TmpReg = MI.getOperand(0).getReg();
bool UseRR = false;
if (Opcode == ARM::tLDRspi) {
if (FrameReg == ARM::SP || STI.genExecuteOnly())