author     Dimitry Andric <dim@FreeBSD.org>    2019-01-20 11:41:25 +0000
committer  Dimitry Andric <dim@FreeBSD.org>    2019-01-20 11:41:25 +0000
commit     d9484dd61cc151c4f34c31e07f693fefa66316b5
tree       ab0560b3da293f1fafd3269c59692e929418f5c2  /contrib/llvm/lib/Target/ARM
parent     79e0962d4c3cf1f0acf359a9d69cb3ac68c414c4
parent     d8e91e46262bc44006913e6796843909f1ac7bcd
Diffstat (limited to 'contrib/llvm/lib/Target/ARM')
-rw-r--r--  contrib/llvm/lib/Target/ARM/ARM.td  |  93
-rw-r--r--  contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp  |  48
-rw-r--r--  contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp  |  116
-rw-r--r--  contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h  |  16
-rw-r--r--  contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp  |  5
-rw-r--r--  contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h  |  3
-rw-r--r--  contrib/llvm/lib/Target/ARM/ARMCallLowering.cpp  |  84
-rw-r--r--  contrib/llvm/lib/Target/ARM/ARMCallLowering.h  |  7
-rw-r--r--  contrib/llvm/lib/Target/ARM/ARMCodeGenPrepare.cpp  |  1022
-rw-r--r--  contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp  |  16
-rw-r--r--  contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp  |  33
-rw-r--r--  contrib/llvm/lib/Target/ARM/ARMFastISel.cpp  |  10
-rw-r--r--  contrib/llvm/lib/Target/ARM/ARMFrameLowering.cpp  |  32
-rw-r--r--  contrib/llvm/lib/Target/ARM/ARMFrameLowering.h  |  2
-rw-r--r--  contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp  |  66
-rw-r--r--  contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp  |  672
-rw-r--r--  contrib/llvm/lib/Target/ARM/ARMISelLowering.h  |  19
-rw-r--r--  contrib/llvm/lib/Target/ARM/ARMInstrFormats.td  |  31
-rw-r--r--  contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp  |  30
-rw-r--r--  contrib/llvm/lib/Target/ARM/ARMInstrInfo.h  |  7
-rw-r--r--  contrib/llvm/lib/Target/ARM/ARMInstrInfo.td  |  172
-rw-r--r--  contrib/llvm/lib/Target/ARM/ARMInstrNEON.td  |  145
-rw-r--r--  contrib/llvm/lib/Target/ARM/ARMInstrThumb.td  |  20
-rw-r--r--  contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td  |  89
-rw-r--r--  contrib/llvm/lib/Target/ARM/ARMInstrVFP.td  |  66
-rw-r--r--  contrib/llvm/lib/Target/ARM/ARMInstructionSelector.cpp  |  94
-rw-r--r--  contrib/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp  |  72
-rw-r--r--  contrib/llvm/lib/Target/ARM/ARMLegalizerInfo.h  |  4
-rw-r--r--  contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp  |  37
-rw-r--r--  contrib/llvm/lib/Target/ARM/ARMMacroFusion.cpp  |  24
-rw-r--r--  contrib/llvm/lib/Target/ARM/ARMMacroFusion.h  |  5
-rw-r--r--  contrib/llvm/lib/Target/ARM/ARMParallelDSP.cpp  |  414
-rw-r--r--  contrib/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp  |  1
-rw-r--r--  contrib/llvm/lib/Target/ARM/ARMSubtarget.cpp  |  17
-rw-r--r--  contrib/llvm/lib/Target/ARM/ARMSubtarget.h  |  24
-rw-r--r--  contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp  |  8
-rw-r--r--  contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp  |  14
-rw-r--r--  contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp  |  41
-rw-r--r--  contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.h  |  6
-rw-r--r--  contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp  |  136
-rw-r--r--  contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp  |  15
-rw-r--r--  contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h  |  22
-rw-r--r--  contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp  |  1
-rw-r--r--  contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h  |  5
-rw-r--r--  contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp  |  18
-rw-r--r--  contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp  |  8
-rw-r--r--  contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp  |  13
-rw-r--r--  contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp  |  20
-rw-r--r--  contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp  |  2
-rw-r--r--  contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp  |  2
-rw-r--r--  contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp  |  8
-rw-r--r--  contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp  |  36
52 files changed, 2587 insertions, 1264 deletions
diff --git a/contrib/llvm/lib/Target/ARM/ARM.td b/contrib/llvm/lib/Target/ARM/ARM.td
index 2e62a0790418..3db60f1c16d6 100644
--- a/contrib/llvm/lib/Target/ARM/ARM.td
+++ b/contrib/llvm/lib/Target/ARM/ARM.td
@@ -61,6 +61,11 @@ def FeatureFullFP16 : SubtargetFeature<"fullfp16", "HasFullFP16", "true",
"floating point",
[FeatureFPARMv8]>;
+def FeatureFP16FML : SubtargetFeature<"fp16fml", "HasFP16FML", "true",
+ "Enable full half-precision "
+ "floating point fml instructions",
+ [FeatureFullFP16]>;
+
def FeatureVFPOnlySP : SubtargetFeature<"fp-only-sp", "FPOnlySP", "true",
"Floating point unit supports "
"single precision only">;
@@ -194,6 +199,10 @@ def FeatureSlowLoadDSubreg : SubtargetFeature<"slow-load-D-subreg",
"SlowLoadDSubregister", "true",
"Loading into D subregs is slow">;
+def FeatureUseWideStrideVFP : SubtargetFeature<"wide-stride-vfp",
+ "UseWideStrideVFP", "true",
+ "Use a wide stride when allocating VFP registers">;
+
// Some targets (e.g. Cortex-A15) never want VMOVS to be widened to VMOVD.
def FeatureDontWidenVMOVS : SubtargetFeature<"dont-widen-vmovs",
"DontWidenVMOVS", "true",
@@ -256,6 +265,9 @@ def FeatureVMLxForwarding : SubtargetFeature<"vmlx-forwarding",
def FeaturePref32BitThumb : SubtargetFeature<"32bit", "Pref32BitThumb", "true",
"Prefer 32-bit Thumb instrs">;
+def FeaturePrefLoopAlign32 : SubtargetFeature<"loop-align", "PrefLoopAlignment","2",
+ "Prefer 32-bit alignment for loops">;
+
/// Some instructions update CPSR partially, which can add false dependency for
/// out-of-order implementation, e.g. Cortex-A9, unless each individual bit is
/// mapped to a separate physical register. Avoid partial CPSR update for these
@@ -351,6 +363,11 @@ def FeatureNoPostRASched : SubtargetFeature<"disable-postra-scheduler",
def FeatureUseAA : SubtargetFeature<"use-aa", "UseAA", "true",
"Use alias analysis during codegen">;
+// Armv8.5-A extensions
+
+def FeatureSB : SubtargetFeature<"sb", "HasSB", "true",
+ "Enable v8.5a Speculation Barrier" >;
+
//===----------------------------------------------------------------------===//
// ARM architecture class
//
@@ -440,6 +457,10 @@ def HasV8_4aOps : SubtargetFeature<"v8.4a", "HasV8_4aOps", "true",
"Support ARM v8.4a instructions",
[HasV8_3aOps, FeatureDotProd]>;
+def HasV8_5aOps : SubtargetFeature<"v8.5a", "HasV8_5aOps", "true",
+ "Support ARM v8.5a instructions",
+ [HasV8_4aOps, FeatureSB]>;
+
//===----------------------------------------------------------------------===//
// ARM Processor subtarget features.
//
@@ -482,8 +503,25 @@ def ProcKryo : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo",
def ProcSwift : SubtargetFeature<"swift", "ARMProcFamily", "Swift",
"Swift ARM processors", []>;
-def ProcExynosM1 : SubtargetFeature<"exynosm1", "ARMProcFamily", "ExynosM1",
- "Samsung Exynos-Mx processors", []>;
+def ProcExynos : SubtargetFeature<"exynos", "ARMProcFamily", "Exynos",
+ "Samsung Exynos processors",
+ [FeatureZCZeroing,
+ FeatureUseWideStrideVFP,
+ FeatureUseAA,
+ FeatureSplatVFPToNeon,
+ FeatureSlowVGETLNi32,
+ FeatureSlowVDUP32,
+ FeatureSlowFPBrcc,
+ FeatureProfUnpredicate,
+ FeatureHWDivThumb,
+ FeatureHWDivARM,
+ FeatureHasSlowFPVMLx,
+ FeatureHasRetAddrStack,
+ FeatureFuseLiterals,
+ FeatureFuseAES,
+ FeatureExpandMLx,
+ FeatureCrypto,
+ FeatureCRC]>;
def ProcR4 : SubtargetFeature<"r4", "ARMProcFamily", "CortexR4",
"Cortex-R4 ARM processors", []>;
@@ -659,6 +697,20 @@ def ARMv84a : Architecture<"armv8.4-a", "ARMv84a", [HasV8_4aOps,
FeatureRAS,
FeatureDotProd]>;
+def ARMv85a : Architecture<"armv8.5-a", "ARMv85a", [HasV8_5aOps,
+ FeatureAClass,
+ FeatureDB,
+ FeatureFPARMv8,
+ FeatureNEON,
+ FeatureDSP,
+ FeatureTrustZone,
+ FeatureMP,
+ FeatureVirtualization,
+ FeatureCrypto,
+ FeatureCRC,
+ FeatureRAS,
+ FeatureDotProd]>;
+
def ARMv8r : Architecture<"armv8-r", "ARMv8r", [HasV8Ops,
FeatureRClass,
FeatureDB,
@@ -865,6 +917,7 @@ def : ProcessorModel<"swift", SwiftModel, [ARMv7a, ProcSwift,
FeatureHasRetAddrStack,
FeatureNEONForFP,
FeatureVFP4,
+ FeatureUseWideStrideVFP,
FeatureMP,
FeatureHWDivThumb,
FeatureHWDivARM,
@@ -926,6 +979,7 @@ def : ProcessorModel<"cortex-r8", CortexA8Model, [ARMv7r,
def : ProcessorModel<"cortex-m3", CortexM3Model, [ARMv7m,
ProcM3,
+ FeaturePrefLoopAlign32,
FeatureHasNoBranchPredictor]>;
def : ProcessorModel<"sc300", CortexM3Model, [ARMv7m,
@@ -936,6 +990,8 @@ def : ProcessorModel<"cortex-m4", CortexM3Model, [ARMv7em,
FeatureVFP4,
FeatureVFPOnlySP,
FeatureD16,
+ FeaturePrefLoopAlign32,
+ FeatureHasSlowFPVMLx,
FeatureHasNoBranchPredictor]>;
def : ProcNoItin<"cortex-m7", [ARMv7em,
@@ -950,6 +1006,8 @@ def : ProcessorModel<"cortex-m33", CortexM3Model, [ARMv8mMainline,
FeatureFPARMv8,
FeatureD16,
FeatureVFPOnlySP,
+ FeaturePrefLoopAlign32,
+ FeatureHasSlowFPVMLx,
FeatureHasNoBranchPredictor]>;
def : ProcNoItin<"cortex-a32", [ARMv8a,
@@ -985,7 +1043,7 @@ def : ProcessorModel<"cortex-a57", CortexA57Model, [ARMv8a, ProcA57,
FeatureAvoidPartialCPSR,
FeatureCheapPredicableCPSR]>;
-def : ProcNoItin<"cortex-a72", [ARMv8a, ProcA72,
+def : ProcessorModel<"cortex-a72", CortexA57Model, [ARMv8a, ProcA72,
FeatureHWDivThumb,
FeatureHWDivARM,
FeatureCrypto,
@@ -1017,29 +1075,12 @@ def : ProcessorModel<"cyclone", SwiftModel, [ARMv8a, ProcSwift,
FeatureZCZeroing,
FeatureNoPostRASched]>;
-def : ProcNoItin<"exynos-m1", [ARMv8a, ProcExynosM1,
- FeatureHWDivThumb,
- FeatureHWDivARM,
- FeatureCrypto,
- FeatureCRC]>;
-
-def : ProcNoItin<"exynos-m2", [ARMv8a, ProcExynosM1,
- FeatureHWDivThumb,
- FeatureHWDivARM,
- FeatureCrypto,
- FeatureCRC]>;
-
-def : ProcNoItin<"exynos-m3", [ARMv8a, ProcExynosM1,
- FeatureHWDivThumb,
- FeatureHWDivARM,
- FeatureCrypto,
- FeatureCRC]>;
-
-def : ProcNoItin<"exynos-m4", [ARMv8a, ProcExynosM1,
- FeatureHWDivThumb,
- FeatureHWDivARM,
- FeatureCrypto,
- FeatureCRC]>;
+def : ProcNoItin<"exynos-m1", [ARMv8a, ProcExynos]>;
+def : ProcNoItin<"exynos-m2", [ARMv8a, ProcExynos]>;
+def : ProcNoItin<"exynos-m3", [ARMv8a, ProcExynos]>;
+def : ProcNoItin<"exynos-m4", [ARMv82a, ProcExynos,
+ FeatureFullFP16,
+ FeatureDotProd]>;
def : ProcNoItin<"kryo", [ARMv8a, ProcKryo,
FeatureHWDivThumb,
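
The new FeaturePrefLoopAlign32 feature above is defined with the value "2" even though its description says it prefers 32-bit alignment for loops; the value is a log2 byte count (an assumption about how preferred alignments were encoded in LLVM at the time, not something this diff shows). A tiny standalone C++ sketch of the arithmetic, with illustrative names:

    #include <cstdio>

    int main() {
      unsigned PrefLoopAlignment = 2;                 // value carried by the "loop-align" feature
      unsigned AlignBytes = 1u << PrefLoopAlignment;  // log2 value -> 4 bytes
      std::printf("preferred loop alignment: %u bytes (%u bits)\n",
                  AlignBytes, AlignBytes * 8);        // 4 bytes == 32 bits
      return 0;
    }
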
diff --git a/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
index b227eaed8d61..b7cd3a0c2dae 100644
--- a/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -367,6 +367,18 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
unsigned NumVals = InlineAsm::getNumOperandRegisters(Flags);
unsigned RC;
+ bool FirstHalf;
+ const ARMBaseTargetMachine &ATM =
+ static_cast<const ARMBaseTargetMachine &>(TM);
+
+ // 'Q' should correspond to the low order register and 'R' to the high
+ // order register. Whether this corresponds to the upper or lower half
+ // depends on the endianness mode.
+ if (ExtraCode[0] == 'Q')
+ FirstHalf = ATM.isLittleEndian();
+ else
+ // ExtraCode[0] == 'R'.
+ FirstHalf = !ATM.isLittleEndian();
const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
if (InlineAsm::hasRegClassConstraint(Flags, RC) &&
ARM::GPRPairRegClass.hasSubClassEq(TRI->getRegClass(RC))) {
@@ -376,14 +388,14 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
if (!MO.isReg())
return true;
const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
- unsigned Reg = TRI->getSubReg(MO.getReg(), ExtraCode[0] == 'Q' ?
+ unsigned Reg = TRI->getSubReg(MO.getReg(), FirstHalf ?
ARM::gsub_0 : ARM::gsub_1);
O << ARMInstPrinter::getRegisterName(Reg);
return false;
}
if (NumVals != 2)
return true;
- unsigned RegOp = ExtraCode[0] == 'Q' ? OpNum : OpNum + 1;
+ unsigned RegOp = FirstHalf ? OpNum : OpNum + 1;
if (RegOp >= MI->getNumOperands())
return true;
const MachineOperand &MO = MI->getOperand(RegOp);
@@ -815,15 +827,31 @@ MCSymbol *ARMAsmPrinter::GetARMGVSymbol(const GlobalValue *GV,
assert(Subtarget->isTargetWindows() &&
"Windows is the only supported COFF target");
- bool IsIndirect = (TargetFlags & ARMII::MO_DLLIMPORT);
+ bool IsIndirect =
+ (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB));
if (!IsIndirect)
return getSymbol(GV);
SmallString<128> Name;
- Name = "__imp_";
+ if (TargetFlags & ARMII::MO_DLLIMPORT)
+ Name = "__imp_";
+ else if (TargetFlags & ARMII::MO_COFFSTUB)
+ Name = ".refptr.";
getNameWithPrefix(Name, GV);
- return OutContext.getOrCreateSymbol(Name);
+ MCSymbol *MCSym = OutContext.getOrCreateSymbol(Name);
+
+ if (TargetFlags & ARMII::MO_COFFSTUB) {
+ MachineModuleInfoCOFF &MMICOFF =
+ MMI->getObjFileInfo<MachineModuleInfoCOFF>();
+ MachineModuleInfoImpl::StubValueTy &StubSym =
+ MMICOFF.getGVStubEntry(MCSym);
+
+ if (!StubSym.getPointer())
+ StubSym = MachineModuleInfoImpl::StubValueTy(getSymbol(GV), true);
+ }
+
+ return MCSym;
} else if (Subtarget->isTargetELF()) {
return getSymbol(GV);
}
@@ -1043,10 +1071,12 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
MCTargetStreamer &TS = *OutStreamer->getTargetStreamer();
ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS);
const MachineFunction &MF = *MI->getParent()->getParent();
- const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
+ const TargetRegisterInfo *TargetRegInfo =
+ MF.getSubtarget().getRegisterInfo();
+ const MachineRegisterInfo &MachineRegInfo = MF.getRegInfo();
const ARMFunctionInfo &AFI = *MF.getInfo<ARMFunctionInfo>();
- unsigned FramePtr = RegInfo->getFrameRegister(MF);
+ unsigned FramePtr = TargetRegInfo->getFrameRegister(MF);
unsigned Opc = MI->getOpcode();
unsigned SrcReg, DstReg;
@@ -1103,7 +1133,9 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
if (MO.isUndef()) {
assert(RegList.empty() &&
"Pad registers must come before restored ones");
- Pad += 4;
+ unsigned Width =
+ TargetRegInfo->getRegSizeInBits(MO.getReg(), MachineRegInfo) / 8;
+ Pad += Width;
continue;
}
RegList.push_back(MO.getReg());
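
The 'Q'/'R' hunks above make the inline-asm operand modifiers endianness-aware: 'Q' must always name the register holding the low-order word of a 64-bit GPR pair and 'R' the high-order word. A minimal C++ sketch of how those modifiers appear in user code (not part of the patch; it assumes the GCC/Clang ARM operand modifiers and only compiles for 32-bit ARM targets):

    #include <cstdint>

    // A 64-bit value in an "r" operand occupies a GPR pair on 32-bit ARM.
    // %Q1 selects the register holding the low-order half of operand 1;
    // %R1 would select the high-order half.
    uint32_t low_word(uint64_t v) {
      uint32_t lo;
      __asm__("mov %0, %Q1" : "=r"(lo) : "r"(v));
      return lo;
    }
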
diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
index b1c2031c7d7b..bbebed59c851 100644
--- a/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -708,8 +708,12 @@ unsigned ARMBaseInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
return MCID.getSize();
// If this machine instr is an inline asm, measure it.
- if (MI.getOpcode() == ARM::INLINEASM)
- return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI);
+ if (MI.getOpcode() == ARM::INLINEASM) {
+ unsigned Size = getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI);
+ if (!MF->getInfo<ARMFunctionInfo>()->isThumbFunction())
+ Size = alignTo(Size, 4);
+ return Size;
+ }
unsigned Opc = MI.getOpcode();
switch (Opc) {
default:
@@ -935,9 +939,9 @@ void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
Mov->addRegisterKilled(SrcReg, TRI);
}
-bool ARMBaseInstrInfo::isCopyInstr(const MachineInstr &MI,
- const MachineOperand *&Src,
- const MachineOperand *&Dest) const {
+bool ARMBaseInstrInfo::isCopyInstrImpl(const MachineInstr &MI,
+ const MachineOperand *&Src,
+ const MachineOperand *&Dest) const {
// VMOVRRD is also a copy instruction but it requires
// special way of handling. It is more complex copy version
// and since that we are not considering it. For recognition
@@ -971,8 +975,6 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
unsigned SrcReg, bool isKill, int FI,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
- DebugLoc DL;
- if (I != MBB.end()) DL = I->getDebugLoc();
MachineFunction &MF = *MBB.getParent();
MachineFrameInfo &MFI = MF.getFrameInfo();
unsigned Align = MFI.getObjectAlignment(FI);
@@ -984,7 +986,7 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
switch (TRI->getSpillSize(*RC)) {
case 2:
if (ARM::HPRRegClass.hasSubClassEq(RC)) {
- BuildMI(MBB, I, DL, get(ARM::VSTRH))
+ BuildMI(MBB, I, DebugLoc(), get(ARM::VSTRH))
.addReg(SrcReg, getKillRegState(isKill))
.addFrameIndex(FI)
.addImm(0)
@@ -995,14 +997,14 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
break;
case 4:
if (ARM::GPRRegClass.hasSubClassEq(RC)) {
- BuildMI(MBB, I, DL, get(ARM::STRi12))
+ BuildMI(MBB, I, DebugLoc(), get(ARM::STRi12))
.addReg(SrcReg, getKillRegState(isKill))
.addFrameIndex(FI)
.addImm(0)
.addMemOperand(MMO)
.add(predOps(ARMCC::AL));
} else if (ARM::SPRRegClass.hasSubClassEq(RC)) {
- BuildMI(MBB, I, DL, get(ARM::VSTRS))
+ BuildMI(MBB, I, DebugLoc(), get(ARM::VSTRS))
.addReg(SrcReg, getKillRegState(isKill))
.addFrameIndex(FI)
.addImm(0)
@@ -1013,7 +1015,7 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
break;
case 8:
if (ARM::DPRRegClass.hasSubClassEq(RC)) {
- BuildMI(MBB, I, DL, get(ARM::VSTRD))
+ BuildMI(MBB, I, DebugLoc(), get(ARM::VSTRD))
.addReg(SrcReg, getKillRegState(isKill))
.addFrameIndex(FI)
.addImm(0)
@@ -1021,7 +1023,7 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
.add(predOps(ARMCC::AL));
} else if (ARM::GPRPairRegClass.hasSubClassEq(RC)) {
if (Subtarget.hasV5TEOps()) {
- MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::STRD));
+ MachineInstrBuilder MIB = BuildMI(MBB, I, DebugLoc(), get(ARM::STRD));
AddDReg(MIB, SrcReg, ARM::gsub_0, getKillRegState(isKill), TRI);
AddDReg(MIB, SrcReg, ARM::gsub_1, 0, TRI);
MIB.addFrameIndex(FI).addReg(0).addImm(0).addMemOperand(MMO)
@@ -1029,7 +1031,7 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
} else {
// Fallback to STM instruction, which has existed since the dawn of
// time.
- MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::STMIA))
+ MachineInstrBuilder MIB = BuildMI(MBB, I, DebugLoc(), get(ARM::STMIA))
.addFrameIndex(FI)
.addMemOperand(MMO)
.add(predOps(ARMCC::AL));
@@ -1043,14 +1045,14 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
if (ARM::DPairRegClass.hasSubClassEq(RC)) {
// Use aligned spills if the stack can be realigned.
if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
- BuildMI(MBB, I, DL, get(ARM::VST1q64))
+ BuildMI(MBB, I, DebugLoc(), get(ARM::VST1q64))
.addFrameIndex(FI)
.addImm(16)
.addReg(SrcReg, getKillRegState(isKill))
.addMemOperand(MMO)
.add(predOps(ARMCC::AL));
} else {
- BuildMI(MBB, I, DL, get(ARM::VSTMQIA))
+ BuildMI(MBB, I, DebugLoc(), get(ARM::VSTMQIA))
.addReg(SrcReg, getKillRegState(isKill))
.addFrameIndex(FI)
.addMemOperand(MMO)
@@ -1063,14 +1065,15 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
if (ARM::DTripleRegClass.hasSubClassEq(RC)) {
// Use aligned spills if the stack can be realigned.
if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
- BuildMI(MBB, I, DL, get(ARM::VST1d64TPseudo))
+ BuildMI(MBB, I, DebugLoc(), get(ARM::VST1d64TPseudo))
.addFrameIndex(FI)
.addImm(16)
.addReg(SrcReg, getKillRegState(isKill))
.addMemOperand(MMO)
.add(predOps(ARMCC::AL));
} else {
- MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::VSTMDIA))
+ MachineInstrBuilder MIB = BuildMI(MBB, I, DebugLoc(),
+ get(ARM::VSTMDIA))
.addFrameIndex(FI)
.add(predOps(ARMCC::AL))
.addMemOperand(MMO);
@@ -1086,14 +1089,15 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
// FIXME: It's possible to only store part of the QQ register if the
// spilled def has a sub-register index.
- BuildMI(MBB, I, DL, get(ARM::VST1d64QPseudo))
+ BuildMI(MBB, I, DebugLoc(), get(ARM::VST1d64QPseudo))
.addFrameIndex(FI)
.addImm(16)
.addReg(SrcReg, getKillRegState(isKill))
.addMemOperand(MMO)
.add(predOps(ARMCC::AL));
} else {
- MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::VSTMDIA))
+ MachineInstrBuilder MIB = BuildMI(MBB, I, DebugLoc(),
+ get(ARM::VSTMDIA))
.addFrameIndex(FI)
.add(predOps(ARMCC::AL))
.addMemOperand(MMO);
@@ -1107,7 +1111,7 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
break;
case 64:
if (ARM::QQQQPRRegClass.hasSubClassEq(RC)) {
- MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::VSTMDIA))
+ MachineInstrBuilder MIB = BuildMI(MBB, I, DebugLoc(), get(ARM::VSTMDIA))
.addFrameIndex(FI)
.add(predOps(ARMCC::AL))
.addMemOperand(MMO);
@@ -1172,8 +1176,14 @@ unsigned ARMBaseInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
unsigned ARMBaseInstrInfo::isStoreToStackSlotPostFE(const MachineInstr &MI,
int &FrameIndex) const {
- const MachineMemOperand *Dummy;
- return MI.mayStore() && hasStoreToStackSlot(MI, Dummy, FrameIndex);
+ SmallVector<const MachineMemOperand *, 1> Accesses;
+ if (MI.mayStore() && hasStoreToStackSlot(MI, Accesses)) {
+ FrameIndex =
+ cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue())
+ ->getFrameIndex();
+ return true;
+ }
+ return false;
}
void ARMBaseInstrInfo::
@@ -1386,8 +1396,14 @@ unsigned ARMBaseInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
unsigned ARMBaseInstrInfo::isLoadFromStackSlotPostFE(const MachineInstr &MI,
int &FrameIndex) const {
- const MachineMemOperand *Dummy;
- return MI.mayLoad() && hasLoadFromStackSlot(MI, Dummy, FrameIndex);
+ SmallVector<const MachineMemOperand *, 1> Accesses;
+ if (MI.mayLoad() && hasLoadFromStackSlot(MI, Accesses)) {
+ FrameIndex =
+ cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue())
+ ->getFrameIndex();
+ return true;
+ }
+ return false;
}
/// Expands MEMCPY to either LDMIA/STMIA or LDMIA_UPD/STMID_UPD
@@ -1432,9 +1448,8 @@ void ARMBaseInstrInfo::expandMEMCPY(MachineBasicBlock::iterator MI) const {
SmallVector<unsigned, 6> ScratchRegs;
for(unsigned I = 5; I < MI->getNumOperands(); ++I)
ScratchRegs.push_back(MI->getOperand(I).getReg());
- llvm::sort(ScratchRegs.begin(), ScratchRegs.end(),
- [&TRI](const unsigned &Reg1,
- const unsigned &Reg2) -> bool {
+ llvm::sort(ScratchRegs,
+ [&TRI](const unsigned &Reg1, const unsigned &Reg2) -> bool {
return TRI.getEncodingValue(Reg1) <
TRI.getEncodingValue(Reg2);
});
@@ -1590,11 +1605,10 @@ void ARMBaseInstrInfo::reMaterialize(MachineBasicBlock &MBB,
MachineFunction &MF = *MBB.getParent();
unsigned CPI = Orig.getOperand(1).getIndex();
unsigned PCLabelId = duplicateCPV(MF, CPI);
- MachineInstrBuilder MIB =
- BuildMI(MBB, I, Orig.getDebugLoc(), get(Opcode), DestReg)
- .addConstantPoolIndex(CPI)
- .addImm(PCLabelId);
- MIB->setMemRefs(Orig.memoperands_begin(), Orig.memoperands_end());
+ BuildMI(MBB, I, Orig.getDebugLoc(), get(Opcode), DestReg)
+ .addConstantPoolIndex(CPI)
+ .addImm(PCLabelId)
+ .cloneMemRefs(Orig);
break;
}
}
@@ -2185,6 +2199,7 @@ static const AddSubFlagsOpcodePair AddSubFlagsOpcodeMap[] = {
{ARM::tSUBSi8, ARM::tSUBi8},
{ARM::tSUBSrr, ARM::tSUBrr},
{ARM::tSBCS, ARM::tSBC},
+ {ARM::tRSBS, ARM::tRSB},
{ARM::t2ADDSri, ARM::t2ADDri},
{ARM::t2ADDSrr, ARM::t2ADDrr},
@@ -2949,6 +2964,8 @@ bool ARMBaseInstrInfo::optimizeCompareInstr(
for (unsigned i = 0, e = OperandsToUpdate.size(); i < e; i++)
OperandsToUpdate[i].first->setImm(OperandsToUpdate[i].second);
+ MI->clearRegisterDeads(ARM::CPSR);
+
return true;
}
@@ -4534,9 +4551,9 @@ void ARMBaseInstrInfo::expandLoadStackGuardBase(MachineBasicBlock::iterator MI,
MIB = BuildMI(MBB, MI, DL, get(LoadOpc), Reg);
MIB.addReg(Reg, RegState::Kill)
- .addImm(0)
- .setMemRefs(MI->memoperands_begin(), MI->memoperands_end())
- .add(predOps(ARMCC::AL));
+ .addImm(0)
+ .cloneMemRefs(*MI)
+ .add(predOps(ARMCC::AL));
}
bool
@@ -5061,3 +5078,32 @@ bool ARMBaseInstrInfo::getInsertSubregLikeInputs(
}
llvm_unreachable("Target dependent opcode missing");
}
+
+std::pair<unsigned, unsigned>
+ARMBaseInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
+ const unsigned Mask = ARMII::MO_OPTION_MASK;
+ return std::make_pair(TF & Mask, TF & ~Mask);
+}
+
+ArrayRef<std::pair<unsigned, const char *>>
+ARMBaseInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
+ using namespace ARMII;
+
+ static const std::pair<unsigned, const char *> TargetFlags[] = {
+ {MO_LO16, "arm-lo16"}, {MO_HI16, "arm-hi16"}};
+ return makeArrayRef(TargetFlags);
+}
+
+ArrayRef<std::pair<unsigned, const char *>>
+ARMBaseInstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
+ using namespace ARMII;
+
+ static const std::pair<unsigned, const char *> TargetFlags[] = {
+ {MO_COFFSTUB, "arm-coffstub"},
+ {MO_GOT, "arm-got"},
+ {MO_SBREL, "arm-sbrel"},
+ {MO_DLLIMPORT, "arm-dllimport"},
+ {MO_SECREL, "arm-secrel"},
+ {MO_NONLAZY, "arm-nonlazy"}};
+ return makeArrayRef(TargetFlags);
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
index b54be15097b1..de1f307083ba 100644
--- a/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -101,6 +101,12 @@ protected:
unsigned OpIdx1,
unsigned OpIdx2) const override;
+ /// If the specific machine instruction is an instruction that moves/copies
+ /// a value from one register to another, return true along with the
+ /// @Source and @Destination machine operands.
+ bool isCopyInstrImpl(const MachineInstr &MI, const MachineOperand *&Source,
+ const MachineOperand *&Destination) const override;
+
public:
// Return whether the target has an explicit NOP encoding.
bool hasNOP() const;
@@ -201,9 +207,6 @@ public:
const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
bool KillSrc) const override;
- bool isCopyInstr(const MachineInstr &MI, const MachineOperand *&Src,
- const MachineOperand *&Dest) const override;
-
void storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
unsigned SrcReg, bool isKill, int FrameIndex,
@@ -331,6 +334,13 @@ public:
/// Get the number of addresses by LDM or VLDM or zero for unknown.
unsigned getNumLDMAddresses(const MachineInstr &MI) const;
+ std::pair<unsigned, unsigned>
+ decomposeMachineOperandsTargetFlags(unsigned TF) const override;
+ ArrayRef<std::pair<unsigned, const char *>>
+ getSerializableDirectMachineOperandTargetFlags() const override;
+ ArrayRef<std::pair<unsigned, const char *>>
+ getSerializableBitmaskMachineOperandTargetFlags() const override;
+
private:
unsigned getInstBundleLength(const MachineInstr &MI) const;
diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
index 5342e6e2cd13..02b3daf3c6fd 100644
--- a/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -209,6 +209,11 @@ getReservedRegs(const MachineFunction &MF) const {
return Reserved;
}
+bool ARMBaseRegisterInfo::
+isAsmClobberable(const MachineFunction &MF, unsigned PhysReg) const {
+ return !getReservedRegs(MF).test(PhysReg);
+}
+
const TargetRegisterClass *
ARMBaseRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
const MachineFunction &) const {
diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h
index f755f66a0f3a..45d29ebc0bd3 100644
--- a/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h
+++ b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h
@@ -131,6 +131,8 @@ public:
CallingConv::ID) const;
BitVector getReservedRegs(const MachineFunction &MF) const override;
+ bool isAsmClobberable(const MachineFunction &MF,
+ unsigned PhysReg) const override;
const TargetRegisterClass *
getPointerRegClass(const MachineFunction &MF,
@@ -154,7 +156,6 @@ public:
void updateRegAllocHint(unsigned Reg, unsigned NewReg,
MachineFunction &MF) const override;
- bool enableMultipleCopyHints() const override { return true; }
bool hasBasePointer(const MachineFunction &MF) const;
diff --git a/contrib/llvm/lib/Target/ARM/ARMCallLowering.cpp b/contrib/llvm/lib/Target/ARM/ARMCallLowering.cpp
index 47f998b696f5..8e80c32bcf89 100644
--- a/contrib/llvm/lib/Target/ARM/ARMCallLowering.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMCallLowering.cpp
@@ -237,7 +237,7 @@ void ARMCallLowering::splitToValueTypes(
/// Lower the return value for the already existing \p Ret. This assumes that
/// \p MIRBuilder's insertion point is correct.
bool ARMCallLowering::lowerReturnVal(MachineIRBuilder &MIRBuilder,
- const Value *Val, unsigned VReg,
+ const Value *Val, ArrayRef<unsigned> VRegs,
MachineInstrBuilder &Ret) const {
if (!Val)
// Nothing to do here.
@@ -251,16 +251,24 @@ bool ARMCallLowering::lowerReturnVal(MachineIRBuilder &MIRBuilder,
if (!isSupportedType(DL, TLI, Val->getType()))
return false;
- SmallVector<ArgInfo, 4> SplitVTs;
- SmallVector<unsigned, 4> Regs;
- ArgInfo RetInfo(VReg, Val->getType());
- setArgFlags(RetInfo, AttributeList::ReturnIndex, DL, F);
- splitToValueTypes(RetInfo, SplitVTs, MF, [&](unsigned Reg, uint64_t Offset) {
- Regs.push_back(Reg);
- });
+ SmallVector<EVT, 4> SplitEVTs;
+ ComputeValueVTs(TLI, DL, Val->getType(), SplitEVTs);
+ assert(VRegs.size() == SplitEVTs.size() &&
+ "For each split Type there should be exactly one VReg.");
- if (Regs.size() > 1)
- MIRBuilder.buildUnmerge(Regs, VReg);
+ SmallVector<ArgInfo, 4> SplitVTs;
+ LLVMContext &Ctx = Val->getType()->getContext();
+ for (unsigned i = 0; i < SplitEVTs.size(); ++i) {
+ ArgInfo CurArgInfo(VRegs[i], SplitEVTs[i].getTypeForEVT(Ctx));
+ setArgFlags(CurArgInfo, AttributeList::ReturnIndex, DL, F);
+
+ SmallVector<unsigned, 4> Regs;
+ splitToValueTypes(
+ CurArgInfo, SplitVTs, MF,
+ [&](unsigned Reg, uint64_t Offset) { Regs.push_back(Reg); });
+ if (Regs.size() > 1)
+ MIRBuilder.buildUnmerge(Regs, VRegs[i]);
+ }
CCAssignFn *AssignFn =
TLI.CCAssignFnForReturn(F.getCallingConv(), F.isVarArg());
@@ -270,14 +278,15 @@ bool ARMCallLowering::lowerReturnVal(MachineIRBuilder &MIRBuilder,
}
bool ARMCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
- const Value *Val, unsigned VReg) const {
- assert(!Val == !VReg && "Return value without a vreg");
+ const Value *Val,
+ ArrayRef<unsigned> VRegs) const {
+ assert(!Val == VRegs.empty() && "Return value without a vreg");
auto const &ST = MIRBuilder.getMF().getSubtarget<ARMSubtarget>();
unsigned Opcode = ST.getReturnOpcode();
auto Ret = MIRBuilder.buildInstrNoInsert(Opcode).add(predOps(ARMCC::AL));
- if (!lowerReturnVal(MIRBuilder, Val, VReg, Ret))
+ if (!lowerReturnVal(MIRBuilder, Val, VRegs, Ret))
return false;
MIRBuilder.insertInstr(Ret);
@@ -420,7 +429,7 @@ bool ARMCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
auto &TLI = *getTLI<ARMTargetLowering>();
auto Subtarget = TLI.getSubtarget();
- if (Subtarget->isThumb())
+ if (Subtarget->isThumb1Only())
return false;
// Quick exit if there aren't any args
@@ -491,6 +500,22 @@ struct CallReturnHandler : public IncomingValueHandler {
MachineInstrBuilder MIB;
};
+// FIXME: This should move to the ARMSubtarget when it supports all the opcodes.
+unsigned getCallOpcode(const ARMSubtarget &STI, bool isDirect) {
+ if (isDirect)
+ return STI.isThumb() ? ARM::tBL : ARM::BL;
+
+ if (STI.isThumb())
+ return ARM::tBLXr;
+
+ if (STI.hasV5TOps())
+ return ARM::BLX;
+
+ if (STI.hasV4TOps())
+ return ARM::BX_CALL;
+
+ return ARM::BMOVPCRX_CALL;
+}
} // end anonymous namespace
bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
@@ -508,27 +533,34 @@ bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
if (STI.genLongCalls())
return false;
+ if (STI.isThumb1Only())
+ return false;
+
auto CallSeqStart = MIRBuilder.buildInstr(ARM::ADJCALLSTACKDOWN);
// Create the call instruction so we can add the implicit uses of arg
// registers, but don't insert it yet.
bool isDirect = !Callee.isReg();
- auto CallOpcode =
- isDirect ? ARM::BL
- : STI.hasV5TOps()
- ? ARM::BLX
- : STI.hasV4TOps() ? ARM::BX_CALL : ARM::BMOVPCRX_CALL;
- auto MIB = MIRBuilder.buildInstrNoInsert(CallOpcode)
- .add(Callee)
- .addRegMask(TRI->getCallPreservedMask(MF, CallConv));
- if (Callee.isReg()) {
+ auto CallOpcode = getCallOpcode(STI, isDirect);
+ auto MIB = MIRBuilder.buildInstrNoInsert(CallOpcode);
+
+ bool isThumb = STI.isThumb();
+ if (isThumb)
+ MIB.add(predOps(ARMCC::AL));
+
+ MIB.add(Callee);
+ if (!isDirect) {
auto CalleeReg = Callee.getReg();
- if (CalleeReg && !TRI->isPhysicalRegister(CalleeReg))
- MIB->getOperand(0).setReg(constrainOperandRegClass(
+ if (CalleeReg && !TRI->isPhysicalRegister(CalleeReg)) {
+ unsigned CalleeIdx = isThumb ? 2 : 0;
+ MIB->getOperand(CalleeIdx).setReg(constrainOperandRegClass(
MF, *TRI, MRI, *STI.getInstrInfo(), *STI.getRegBankInfo(),
- *MIB.getInstr(), MIB->getDesc(), Callee, 0));
+ *MIB.getInstr(), MIB->getDesc(), Callee, CalleeIdx));
+ }
}
+ MIB.addRegMask(TRI->getCallPreservedMask(MF, CallConv));
+
SmallVector<ArgInfo, 8> ArgInfos;
for (auto Arg : OrigArgs) {
if (!isSupportedType(DL, TLI, Arg.Ty))
diff --git a/contrib/llvm/lib/Target/ARM/ARMCallLowering.h b/contrib/llvm/lib/Target/ARM/ARMCallLowering.h
index 86854c53f179..45a988a2f00e 100644
--- a/contrib/llvm/lib/Target/ARM/ARMCallLowering.h
+++ b/contrib/llvm/lib/Target/ARM/ARMCallLowering.h
@@ -33,8 +33,8 @@ class ARMCallLowering : public CallLowering {
public:
ARMCallLowering(const ARMTargetLowering &TLI);
- bool lowerReturn(MachineIRBuilder &MIRBuiler, const Value *Val,
- unsigned VReg) const override;
+ bool lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val,
+ ArrayRef<unsigned> VRegs) const override;
bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
ArrayRef<unsigned> VRegs) const override;
@@ -45,7 +45,8 @@ public:
private:
bool lowerReturnVal(MachineIRBuilder &MIRBuilder, const Value *Val,
- unsigned VReg, MachineInstrBuilder &Ret) const;
+ ArrayRef<unsigned> VRegs,
+ MachineInstrBuilder &Ret) const;
using SplitArgTy = std::function<void(unsigned Reg, uint64_t Offset)>;
diff --git a/contrib/llvm/lib/Target/ARM/ARMCodeGenPrepare.cpp b/contrib/llvm/lib/Target/ARM/ARMCodeGenPrepare.cpp
index 24071277427a..b631c2bc687b 100644
--- a/contrib/llvm/lib/Target/ARM/ARMCodeGenPrepare.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMCodeGenPrepare.cpp
@@ -54,47 +54,108 @@ EnableDSPWithImms("arm-enable-scalar-dsp-imms", cl::Hidden, cl::init(false),
cl::desc("Use DSP instructions for scalar operations\
with immediate operands"));
-namespace {
+// The goal of this pass is to enable more efficient code generation for
+// operations on narrow types (i.e. types with < 32-bits) and this is a
+// motivating IR code example:
+//
+// define hidden i32 @cmp(i8 zeroext) {
+// %2 = add i8 %0, -49
+// %3 = icmp ult i8 %2, 3
+// ..
+// }
+//
+// The issue here is that i8 is type-legalized to i32 because i8 is not a
+// legal type. Thus, arithmetic is done in integer-precision, but then the
+// byte value is masked out as follows:
+//
+// t19: i32 = add t4, Constant:i32<-49>
+// t24: i32 = and t19, Constant:i32<255>
+//
+// Consequently, we generate code like this:
+//
+// subs r0, #49
+// uxtb r1, r0
+// cmp r1, #3
+//
+// This shows that masking out the byte value results in generation of
+// the UXTB instruction. This is not optimal as r0 already contains the byte
+// value we need, and so instead we can just generate:
+//
+// sub.w r1, r0, #49
+// cmp r1, #3
+//
+// We achieve this by type promoting the IR to i32 like so for this example:
+//
+// define i32 @cmp(i8 zeroext %c) {
+// %0 = zext i8 %c to i32
+// %c.off = add i32 %0, -49
+// %1 = icmp ult i32 %c.off, 3
+// ..
+// }
+//
+// For this to be valid and legal, we need to prove that the i32 add is
+// producing the same value as the i8 addition, and that e.g. no overflow
+// happens.
+//
+// A brief sketch of the algorithm and some terminology.
+// We pattern match interesting IR patterns:
+// - which have "sources": instructions producing narrow values (i8, i16), and
+// - they have "sinks": instructions consuming these narrow values.
+//
+// We collect all instructions connecting sources and sinks in a worklist, so
+// that we can mutate these instructions and perform type promotion when it is
+// legal to do so.
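
As a point of reference (not part of the patch), C source roughly like the following is the kind of input that produces the add/icmp pattern in the IR example above; the function name and constants simply mirror that example:

    // i8 arithmetic like this is what ARMCodeGenPrepare promotes to i32.
    int cmp(unsigned char c) {
      unsigned char off = c - 49;   // corresponds to: add i8 %0, -49
      return off < 3;               // corresponds to: icmp ult i8 %2, 3
    }
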
+namespace {
class IRPromoter {
SmallPtrSet<Value*, 8> NewInsts;
- SmallVector<Instruction*, 4> InstsToRemove;
+ SmallPtrSet<Instruction*, 4> InstsToRemove;
+ DenseMap<Value*, SmallVector<Type*, 4>> TruncTysMap;
+ SmallPtrSet<Value*, 8> Promoted;
Module *M = nullptr;
LLVMContext &Ctx;
+ IntegerType *ExtTy = nullptr;
+ IntegerType *OrigTy = nullptr;
+ SmallPtrSetImpl<Value*> *Visited;
+ SmallPtrSetImpl<Value*> *Sources;
+ SmallPtrSetImpl<Instruction*> *Sinks;
+ SmallPtrSetImpl<Instruction*> *SafeToPromote;
+
+ void ReplaceAllUsersOfWith(Value *From, Value *To);
+ void PrepareConstants(void);
+ void ExtendSources(void);
+ void ConvertTruncs(void);
+ void PromoteTree(void);
+ void TruncateSinks(void);
+ void Cleanup(void);
public:
- IRPromoter(Module *M) : M(M), Ctx(M->getContext()) { }
+ IRPromoter(Module *M) : M(M), Ctx(M->getContext()),
+ ExtTy(Type::getInt32Ty(Ctx)) { }
- void Cleanup() {
- for (auto *I : InstsToRemove) {
- LLVM_DEBUG(dbgs() << "ARM CGP: Removing " << *I << "\n");
- I->dropAllReferences();
- I->eraseFromParent();
- }
- InstsToRemove.clear();
- NewInsts.clear();
- }
void Mutate(Type *OrigTy,
SmallPtrSetImpl<Value*> &Visited,
- SmallPtrSetImpl<Value*> &Leaves,
- SmallPtrSetImpl<Instruction*> &Roots);
+ SmallPtrSetImpl<Value*> &Sources,
+ SmallPtrSetImpl<Instruction*> &Sinks,
+ SmallPtrSetImpl<Instruction*> &SafeToPromote);
};
class ARMCodeGenPrepare : public FunctionPass {
const ARMSubtarget *ST = nullptr;
IRPromoter *Promoter = nullptr;
std::set<Value*> AllVisited;
- Type *OrigTy = nullptr;
- unsigned TypeSize = 0;
+ SmallPtrSet<Instruction*, 8> SafeToPromote;
- bool isNarrowInstSupported(Instruction *I);
+ bool isSafeOverflow(Instruction *I);
bool isSupportedValue(Value *V);
bool isLegalToPromote(Value *V);
bool TryToPromote(Value *V);
public:
static char ID;
+ static unsigned TypeSize;
+ Type *OrigTy = nullptr;
ARMCodeGenPrepare() : FunctionPass(ID) {}
@@ -111,8 +172,7 @@ public:
}
-/// Can the given value generate sign bits.
-static bool isSigned(Value *V) {
+static bool generateSignBits(Value *V) {
if (!isa<Instruction>(V))
return false;
@@ -121,120 +181,226 @@ static bool isSigned(Value *V) {
Opc == Instruction::SRem;
}
+static bool EqualTypeSize(Value *V) {
+ return V->getType()->getScalarSizeInBits() == ARMCodeGenPrepare::TypeSize;
+}
+
+static bool LessOrEqualTypeSize(Value *V) {
+ return V->getType()->getScalarSizeInBits() <= ARMCodeGenPrepare::TypeSize;
+}
+
+static bool GreaterThanTypeSize(Value *V) {
+ return V->getType()->getScalarSizeInBits() > ARMCodeGenPrepare::TypeSize;
+}
+
+static bool LessThanTypeSize(Value *V) {
+ return V->getType()->getScalarSizeInBits() < ARMCodeGenPrepare::TypeSize;
+}
+
/// Some instructions can use 8- and 16-bit operands, and we don't need to
/// promote anything larger. We disallow booleans to make life easier when
/// dealing with icmps but allow any other integer that is <= 16 bits. Void
/// types are accepted so we can handle switches.
static bool isSupportedType(Value *V) {
- if (V->getType()->isVoidTy())
+ Type *Ty = V->getType();
+
+ // Allow voids and pointers, these won't be promoted.
+ if (Ty->isVoidTy() || Ty->isPointerTy())
return true;
- const IntegerType *IntTy = dyn_cast<IntegerType>(V->getType());
- if (!IntTy)
- return false;
+ if (auto *Ld = dyn_cast<LoadInst>(V))
+ Ty = cast<PointerType>(Ld->getPointerOperandType())->getElementType();
- // Don't try to promote boolean values.
- if (IntTy->getBitWidth() == 1)
+ if (!isa<IntegerType>(Ty) ||
+ cast<IntegerType>(V->getType())->getBitWidth() == 1)
return false;
- if (auto *ZExt = dyn_cast<ZExtInst>(V))
- return isSupportedType(ZExt->getOperand(0));
+ return LessOrEqualTypeSize(V);
+}
- return IntTy->getBitWidth() <= 16;
+/// Return true if the given value is a source in the use-def chain, producing
+/// a narrow 'TypeSize' value. These values will be zext to start the promotion
+/// of the tree to i32. We guarantee that these won't populate the upper bits
+/// of the register. ZExt on the loads will be free, and the same for call
+/// return values because we only accept ones that guarantee a zeroext ret val.
+/// Many arguments will have the zeroext attribute too, so those would be free
+/// too.
+static bool isSource(Value *V) {
+ if (!isa<IntegerType>(V->getType()))
+ return false;
+
+ // TODO Allow zext to be sources.
+ if (isa<Argument>(V))
+ return true;
+ else if (isa<LoadInst>(V))
+ return true;
+ else if (isa<BitCastInst>(V))
+ return true;
+ else if (auto *Call = dyn_cast<CallInst>(V))
+ return Call->hasRetAttr(Attribute::AttrKind::ZExt);
+ else if (auto *Trunc = dyn_cast<TruncInst>(V))
+ return EqualTypeSize(Trunc);
+ return false;
}
/// Return true if V will require any promoted values to be truncated for the
-/// use to be valid.
+/// IR to remain valid. We can't mutate the value type of these
+/// instructions.
static bool isSink(Value *V) {
- auto UsesNarrowValue = [](Value *V) {
- return V->getType()->getScalarSizeInBits() <= 32;
- };
-
+ // TODO The truncate also isn't actually necessary because we have already
+ // proved that the data value is kept within the range of the original data
+ // type.
+
+ // Sinks are:
+ // - points where the value in the register is being observed, such as an
+ // icmp, switch or store.
+ // - points where value types have to match, such as calls and returns.
+ // - zext are included to ease the transformation and are generally removed
+ // later on.
if (auto *Store = dyn_cast<StoreInst>(V))
- return UsesNarrowValue(Store->getValueOperand());
+ return LessOrEqualTypeSize(Store->getValueOperand());
if (auto *Return = dyn_cast<ReturnInst>(V))
- return UsesNarrowValue(Return->getReturnValue());
+ return LessOrEqualTypeSize(Return->getReturnValue());
+ if (auto *ZExt = dyn_cast<ZExtInst>(V))
+ return GreaterThanTypeSize(ZExt);
+ if (auto *Switch = dyn_cast<SwitchInst>(V))
+ return LessThanTypeSize(Switch->getCondition());
+ if (auto *ICmp = dyn_cast<ICmpInst>(V))
+ return ICmp->isSigned() || LessThanTypeSize(ICmp->getOperand(0));
return isa<CallInst>(V);
}
-/// Return true if the given value is a leaf that will need to be zext'd.
-static bool isSource(Value *V) {
- if (isa<Argument>(V) && isSupportedType(V))
- return true;
- else if (isa<TruncInst>(V))
- return true;
- else if (auto *ZExt = dyn_cast<ZExtInst>(V))
- // ZExt can be a leaf if its the only user of a load.
- return isa<LoadInst>(ZExt->getOperand(0)) &&
- ZExt->getOperand(0)->hasOneUse();
- else if (auto *Call = dyn_cast<CallInst>(V))
- return Call->hasRetAttr(Attribute::AttrKind::ZExt);
- else if (auto *Load = dyn_cast<LoadInst>(V)) {
- if (!isa<IntegerType>(Load->getType()))
- return false;
- // A load is a leaf, unless its already just being zext'd.
- if (Load->hasOneUse() && isa<ZExtInst>(*Load->use_begin()))
- return false;
-
- return true;
- }
- return false;
-}
-
/// Return whether the instruction can be promoted within any modifications to
-/// it's operands or result.
-static bool isSafeOverflow(Instruction *I) {
+/// its operands or result.
+bool ARMCodeGenPrepare::isSafeOverflow(Instruction *I) {
+ // FIXME Do we need NSW too?
if (isa<OverflowingBinaryOperator>(I) && I->hasNoUnsignedWrap())
return true;
+ // We can support a, potentially, overflowing instruction (I) if:
+ // - It is only used by an unsigned icmp.
+ // - The icmp uses a constant.
+ // - The overflowing value (I) is decreasing, i.e. it would underflow, wrapping
+ // around zero to become a larger number than before.
+ // - The underflowing instruction (I) also uses a constant.
+ //
+ // We can then use the two constants to calculate whether the result would
+ // wrap in respect to itself in the original bitwidth. If it doesn't wrap,
+ // just underflows the range, the icmp would give the same result whether the
+ // result has been truncated or not. We calculate this by:
+ // - Zero extending both constants, if needed, to 32-bits.
+ // - Take the absolute value of I's constant, adding this to the icmp const.
+ // - Check that this value is not out of range for small type. If it is, it
+ // means that it has underflowed enough to wrap around the icmp constant.
+ //
+ // For example:
+ //
+ // %sub = sub i8 %a, 2
+ // %cmp = icmp ule i8 %sub, 254
+ //
+ // If %a = 0, %sub = -2 == FE == 254
+ // But if this is evaluated as an i32
+ // %sub = -2 == FF FF FF FE == 4294967294
+ // So the unsigned compares (i8 and i32) would not yield the same result.
+ //
+ // Another way to look at it is:
+ // %a - 2 <= 254
+ // %a + 2 <= 254 + 2
+ // %a <= 256
+ // And we can't represent 256 in the i8 format, so we don't support it.
+ //
+ // Whereas:
+ //
+ // %sub i8 %a, 1
+ // %cmp = icmp ule i8 %sub, 254
+ //
+ // If %a = 0, %sub = -1 == FF == 255
+ // As i32:
+ // %sub = -1 == FF FF FF FF == 4294967295
+ //
+ // In this case, the unsigned compare results would be the same and this
+ // would also be true for ult, uge and ugt:
+ // - (255 < 254) == (0xFFFFFFFF < 254) == false
+ // - (255 <= 254) == (0xFFFFFFFF <= 254) == false
+ // - (255 > 254) == (0xFFFFFFFF > 254) == true
+ // - (255 >= 254) == (0xFFFFFFFF >= 254) == true
+ //
+ // To demonstrate why we can't handle increasing values:
+ //
+ // %add = add i8 %a, 2
+ // %cmp = icmp ult i8 %add, 127
+ //
+ // If %a = 254, %add = 256 == (i8 1)
+ // As i32:
+ // %add = 256
+ //
+ // (1 < 127) != (256 < 127)
+
unsigned Opc = I->getOpcode();
- if (Opc == Instruction::Add || Opc == Instruction::Sub) {
- // We don't care if the add or sub could wrap if the value is decreasing
- // and is only being used by an unsigned compare.
- if (!I->hasOneUse() ||
- !isa<ICmpInst>(*I->user_begin()) ||
- !isa<ConstantInt>(I->getOperand(1)))
- return false;
+ if (Opc != Instruction::Add && Opc != Instruction::Sub)
+ return false;
- auto *CI = cast<ICmpInst>(*I->user_begin());
- if (CI->isSigned())
- return false;
+ if (!I->hasOneUse() ||
+ !isa<ICmpInst>(*I->user_begin()) ||
+ !isa<ConstantInt>(I->getOperand(1)))
+ return false;
- bool NegImm = cast<ConstantInt>(I->getOperand(1))->isNegative();
- bool IsDecreasing = ((Opc == Instruction::Sub) && !NegImm) ||
- ((Opc == Instruction::Add) && NegImm);
- if (!IsDecreasing)
- return false;
+ ConstantInt *OverflowConst = cast<ConstantInt>(I->getOperand(1));
+ bool NegImm = OverflowConst->isNegative();
+ bool IsDecreasing = ((Opc == Instruction::Sub) && !NegImm) ||
+ ((Opc == Instruction::Add) && NegImm);
+ if (!IsDecreasing)
+ return false;
- LLVM_DEBUG(dbgs() << "ARM CGP: Allowing safe overflow for " << *I << "\n");
- return true;
- }
+ // Don't support an icmp that deals with sign bits.
+ auto *CI = cast<ICmpInst>(*I->user_begin());
+ if (CI->isSigned() || CI->isEquality())
+ return false;
- // Otherwise, if an instruction is using a negative immediate we will need
- // to fix it up during the promotion.
- for (auto &Op : I->operands()) {
- if (auto *Const = dyn_cast<ConstantInt>(Op))
- if (Const->isNegative())
- return false;
- }
- return false;
+ ConstantInt *ICmpConst = nullptr;
+ if (auto *Const = dyn_cast<ConstantInt>(CI->getOperand(0)))
+ ICmpConst = Const;
+ else if (auto *Const = dyn_cast<ConstantInt>(CI->getOperand(1)))
+ ICmpConst = Const;
+ else
+ return false;
+
+ // Now check that the result can't wrap on itself.
+ APInt Total = ICmpConst->getValue().getBitWidth() < 32 ?
+ ICmpConst->getValue().zext(32) : ICmpConst->getValue();
+
+ Total += OverflowConst->getValue().getBitWidth() < 32 ?
+ OverflowConst->getValue().abs().zext(32) : OverflowConst->getValue().abs();
+
+ APInt Max = APInt::getAllOnesValue(ARMCodeGenPrepare::TypeSize);
+
+ if (Total.getBitWidth() > Max.getBitWidth()) {
+ if (Total.ugt(Max.zext(Total.getBitWidth())))
+ return false;
+ } else if (Max.getBitWidth() > Total.getBitWidth()) {
+ if (Total.zext(Max.getBitWidth()).ugt(Max))
+ return false;
+ } else if (Total.ugt(Max))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "ARM CGP: Allowing safe overflow for " << *I << "\n");
+ return true;
}
static bool shouldPromote(Value *V) {
- auto *I = dyn_cast<Instruction>(V);
- if (!I)
+ if (!isa<IntegerType>(V->getType()) || isSink(V))
return false;
- if (!isa<IntegerType>(V->getType()))
- return false;
+ if (isSource(V))
+ return true;
- if (isa<StoreInst>(I) || isa<TerminatorInst>(I) || isa<TruncInst>(I) ||
- isa<ICmpInst>(I))
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I)
return false;
- if (auto *ZExt = dyn_cast<ZExtInst>(I))
- return !ZExt->getDestTy()->isIntegerTy(32);
+ if (isa<ICmpInst>(I))
+ return false;
return true;
}
@@ -245,24 +411,16 @@ static bool isPromotedResultSafe(Value *V) {
if (!isa<Instruction>(V))
return true;
- if (isSigned(V))
+ if (generateSignBits(V))
return false;
- // If I is only being used by something that will require its value to be
- // truncated, then we don't care about the promoted result.
- auto *I = cast<Instruction>(V);
- if (I->hasOneUse() && isSink(*I->use_begin()))
- return true;
-
- if (isa<OverflowingBinaryOperator>(I))
- return isSafeOverflow(I);
- return true;
+ return !isa<OverflowingBinaryOperator>(V);
}
/// Return the intrinsic for the instruction that can perform the same
/// operation but on a narrow type. This is using the parallel dsp intrinsics
/// on scalar values.
-static Intrinsic::ID getNarrowIntrinsic(Instruction *I, unsigned TypeSize) {
+static Intrinsic::ID getNarrowIntrinsic(Instruction *I) {
// Whether we use the signed or unsigned versions of these intrinsics
// doesn't matter because we're not using the GE bits that they set in
// the APSR.
@@ -270,124 +428,163 @@ static Intrinsic::ID getNarrowIntrinsic(Instruction *I, unsigned TypeSize) {
default:
break;
case Instruction::Add:
- return TypeSize == 16 ? Intrinsic::arm_uadd16 :
+ return ARMCodeGenPrepare::TypeSize == 16 ? Intrinsic::arm_uadd16 :
Intrinsic::arm_uadd8;
case Instruction::Sub:
- return TypeSize == 16 ? Intrinsic::arm_usub16 :
+ return ARMCodeGenPrepare::TypeSize == 16 ? Intrinsic::arm_usub16 :
Intrinsic::arm_usub8;
}
llvm_unreachable("unhandled opcode for narrow intrinsic");
}
-void IRPromoter::Mutate(Type *OrigTy,
- SmallPtrSetImpl<Value*> &Visited,
- SmallPtrSetImpl<Value*> &Leaves,
- SmallPtrSetImpl<Instruction*> &Roots) {
+void IRPromoter::ReplaceAllUsersOfWith(Value *From, Value *To) {
+ SmallVector<Instruction*, 4> Users;
+ Instruction *InstTo = dyn_cast<Instruction>(To);
+ bool ReplacedAll = true;
+
+ LLVM_DEBUG(dbgs() << "ARM CGP: Replacing " << *From << " with " << *To
+ << "\n");
+
+ for (Use &U : From->uses()) {
+ auto *User = cast<Instruction>(U.getUser());
+ if (InstTo && User->isIdenticalTo(InstTo)) {
+ ReplacedAll = false;
+ continue;
+ }
+ Users.push_back(User);
+ }
+
+ for (auto *U : Users)
+ U->replaceUsesOfWith(From, To);
+
+ if (ReplacedAll)
+ if (auto *I = dyn_cast<Instruction>(From))
+ InstsToRemove.insert(I);
+}
+
+void IRPromoter::PrepareConstants() {
IRBuilder<> Builder{Ctx};
- Type *ExtTy = Type::getInt32Ty(M->getContext());
- unsigned TypeSize = OrigTy->getPrimitiveSizeInBits();
- SmallPtrSet<Value*, 8> Promoted;
- LLVM_DEBUG(dbgs() << "ARM CGP: Promoting use-def chains to from " << TypeSize
- << " to 32-bits\n");
-
- auto ReplaceAllUsersOfWith = [&](Value *From, Value *To) {
- SmallVector<Instruction*, 4> Users;
- Instruction *InstTo = dyn_cast<Instruction>(To);
- for (Use &U : From->uses()) {
- auto *User = cast<Instruction>(U.getUser());
- if (InstTo && User->isIdenticalTo(InstTo))
+ // First step is to prepare the instructions for mutation. Most constants
+ // just need to be zero extended into their new type, but complications arise
+ // because:
+ // - For nuw binary operators, negative immediates would need sign extending;
+ // however, instead we'll change them to positive and zext them. We can do
+ // this because:
+ // > The operators that can wrap are: add, sub, mul and shl.
+ // > shl interprets its second operand as unsigned and if the first operand
+ // is an immediate, it will need zext to be nuw.
+ // > I'm assuming mul has to interpret immediates as unsigned for nuw.
+ // > Which leaves the nuw add and sub to be handled; as with shl, if an
+ // immediate is used as operand 0, it will need zext to be nuw.
+ // - We also allow add and sub to safely overflow in certain circumstances
+ // and only when the value (operand 0) is being decreased.
+ //
+ // For adds and subs, that are either nuw or safely wrap and use a negative
+ // immediate as operand 1, we create an equivalent instruction using a
+ // positive immediate. That positive immediate can then be zext along with
+ // all the other immediates later.
+ for (auto *V : *Visited) {
+ if (!isa<Instruction>(V))
+ continue;
+
+ auto *I = cast<Instruction>(V);
+ if (SafeToPromote->count(I)) {
+
+ if (!isa<OverflowingBinaryOperator>(I))
continue;
- Users.push_back(User);
- }
- for (auto &U : Users)
- U->replaceUsesOfWith(From, To);
- };
+ if (auto *Const = dyn_cast<ConstantInt>(I->getOperand(1))) {
+ if (!Const->isNegative())
+ break;
- auto FixConst = [&](ConstantInt *Const, Instruction *I) {
- Constant *NewConst = nullptr;
- if (isSafeOverflow(I)) {
- NewConst = (Const->isNegative()) ?
- ConstantExpr::getSExt(Const, ExtTy) :
- ConstantExpr::getZExt(Const, ExtTy);
- } else {
- uint64_t NewVal = *Const->getValue().getRawData();
- if (Const->getType() == Type::getInt16Ty(Ctx))
- NewVal &= 0xFFFF;
- else
- NewVal &= 0xFF;
- NewConst = ConstantInt::get(ExtTy, NewVal);
+ unsigned Opc = I->getOpcode();
+ if (Opc != Instruction::Add && Opc != Instruction::Sub)
+ continue;
+
+ LLVM_DEBUG(dbgs() << "ARM CGP: Adjusting " << *I << "\n");
+ auto *NewConst = ConstantInt::get(Ctx, Const->getValue().abs());
+ Builder.SetInsertPoint(I);
+ Value *NewVal = Opc == Instruction::Sub ?
+ Builder.CreateAdd(I->getOperand(0), NewConst) :
+ Builder.CreateSub(I->getOperand(0), NewConst);
+ LLVM_DEBUG(dbgs() << "ARM CGP: New equivalent: " << *NewVal << "\n");
+
+ if (auto *NewInst = dyn_cast<Instruction>(NewVal)) {
+ NewInst->copyIRFlags(I);
+ NewInsts.insert(NewInst);
+ }
+ InstsToRemove.insert(I);
+ I->replaceAllUsesWith(NewVal);
+ }
}
- I->replaceUsesOfWith(Const, NewConst);
- };
+ }
+ for (auto *I : NewInsts)
+ Visited->insert(I);
+}
- auto InsertDSPIntrinsic = [&](Instruction *I) {
- LLVM_DEBUG(dbgs() << "ARM CGP: Inserting DSP intrinsic for "
- << *I << "\n");
- Function *DSPInst =
- Intrinsic::getDeclaration(M, getNarrowIntrinsic(I, TypeSize));
- Builder.SetInsertPoint(I);
- Builder.SetCurrentDebugLocation(I->getDebugLoc());
- Value *Args[] = { I->getOperand(0), I->getOperand(1) };
- CallInst *Call = Builder.CreateCall(DSPInst, Args);
- ReplaceAllUsersOfWith(I, Call);
- InstsToRemove.push_back(I);
- NewInsts.insert(Call);
- };
+void IRPromoter::ExtendSources() {
+ IRBuilder<> Builder{Ctx};
auto InsertZExt = [&](Value *V, Instruction *InsertPt) {
+ assert(V->getType() != ExtTy && "zext already extends to i32");
LLVM_DEBUG(dbgs() << "ARM CGP: Inserting ZExt for " << *V << "\n");
Builder.SetInsertPoint(InsertPt);
if (auto *I = dyn_cast<Instruction>(V))
Builder.SetCurrentDebugLocation(I->getDebugLoc());
- auto *ZExt = cast<Instruction>(Builder.CreateZExt(V, ExtTy));
- if (isa<Argument>(V))
- ZExt->moveBefore(InsertPt);
- else
- ZExt->moveAfter(InsertPt);
+
+ Value *ZExt = Builder.CreateZExt(V, ExtTy);
+ if (auto *I = dyn_cast<Instruction>(ZExt)) {
+ if (isa<Argument>(V))
+ I->moveBefore(InsertPt);
+ else
+ I->moveAfter(InsertPt);
+ NewInsts.insert(I);
+ }
+
ReplaceAllUsersOfWith(V, ZExt);
- NewInsts.insert(ZExt);
};
- // First, insert extending instructions between the leaves and their users.
- LLVM_DEBUG(dbgs() << "ARM CGP: Promoting leaves:\n");
- for (auto V : Leaves) {
+ // Now, insert extending instructions between the sources and their users.
+ LLVM_DEBUG(dbgs() << "ARM CGP: Promoting sources:\n");
+ for (auto V : *Sources) {
LLVM_DEBUG(dbgs() << " - " << *V << "\n");
- if (auto *ZExt = dyn_cast<ZExtInst>(V))
- ZExt->mutateType(ExtTy);
- else if (auto *I = dyn_cast<Instruction>(V))
+ if (auto *I = dyn_cast<Instruction>(V))
InsertZExt(I, I);
else if (auto *Arg = dyn_cast<Argument>(V)) {
BasicBlock &BB = Arg->getParent()->front();
InsertZExt(Arg, &*BB.getFirstInsertionPt());
} else {
- llvm_unreachable("unhandled leaf that needs extending");
+ llvm_unreachable("unhandled source that needs extending");
}
Promoted.insert(V);
}
+}
+void IRPromoter::PromoteTree() {
LLVM_DEBUG(dbgs() << "ARM CGP: Mutating the tree..\n");
- // Then mutate the types of the instructions within the tree. Here we handle
- // constant operands.
- for (auto *V : Visited) {
- if (Leaves.count(V))
- continue;
- if (!isa<Instruction>(V))
+ IRBuilder<> Builder{Ctx};
+
+ // Mutate the types of the instructions within the tree. Here we handle
+ // constant operands.
+ for (auto *V : *Visited) {
+ if (Sources->count(V))
continue;
auto *I = cast<Instruction>(V);
- if (Roots.count(I))
+ if (Sinks->count(I))
continue;
- for (auto &U : I->operands()) {
- if ((U->getType() == ExtTy) || !isSupportedType(&*U))
+ for (unsigned i = 0, e = I->getNumOperands(); i < e; ++i) {
+ Value *Op = I->getOperand(i);
+ if ((Op->getType() == ExtTy) || !isa<IntegerType>(Op->getType()))
continue;
- if (auto *Const = dyn_cast<ConstantInt>(&*U))
- FixConst(Const, I);
- else if (isa<UndefValue>(&*U))
- U->mutateType(ExtTy);
+ if (auto *Const = dyn_cast<ConstantInt>(Op)) {
+ Constant *NewConst = ConstantExpr::getZExt(Const, ExtTy);
+ I->setOperand(i, NewConst);
+ } else if (isa<UndefValue>(Op))
+ I->setOperand(i, UndefValue::get(ExtTy));
}
if (shouldPromote(I)) {
@@ -396,91 +593,215 @@ void IRPromoter::Mutate(Type *OrigTy,
}
}
- // Now we need to remove any zexts that have become unnecessary, as well
- // as insert any intrinsics.
- for (auto *V : Visited) {
- if (Leaves.count(V))
+ // Finally, any instructions that should be promoted but haven't yet been,
+ // need to be handled using intrinsics.
+ for (auto *V : *Visited) {
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I)
continue;
- if (auto *ZExt = dyn_cast<ZExtInst>(V)) {
- if (ZExt->getDestTy() != ExtTy) {
- ZExt->mutateType(ExtTy);
- Promoted.insert(ZExt);
- }
- else if (ZExt->getSrcTy() == ExtTy) {
- ReplaceAllUsersOfWith(V, ZExt->getOperand(0));
- InstsToRemove.push_back(ZExt);
- }
+
+ if (Sources->count(I) || Sinks->count(I))
continue;
- }
- if (!shouldPromote(V) || isPromotedResultSafe(V))
+ if (!shouldPromote(I) || SafeToPromote->count(I) || NewInsts.count(I))
continue;
+
+ assert(EnableDSP && "DSP intrinsic insertion not enabled!");
// Replace unsafe instructions with appropriate intrinsic calls.
- InsertDSPIntrinsic(cast<Instruction>(V));
+ LLVM_DEBUG(dbgs() << "ARM CGP: Inserting DSP intrinsic for "
+ << *I << "\n");
+ Function *DSPInst =
+ Intrinsic::getDeclaration(M, getNarrowIntrinsic(I));
+ Builder.SetInsertPoint(I);
+ Builder.SetCurrentDebugLocation(I->getDebugLoc());
+ Value *Args[] = { I->getOperand(0), I->getOperand(1) };
+ CallInst *Call = Builder.CreateCall(DSPInst, Args);
+ NewInsts.insert(Call);
+ ReplaceAllUsersOfWith(I, Call);
}
+}
+
+void IRPromoter::TruncateSinks() {
+ LLVM_DEBUG(dbgs() << "ARM CGP: Fixing up the sinks:\n");
+
+ IRBuilder<> Builder{Ctx};
+
+ auto InsertTrunc = [&](Value *V, Type *TruncTy) -> Instruction* {
+ if (!isa<Instruction>(V) || !isa<IntegerType>(V->getType()))
+ return nullptr;
+
+ if ((!Promoted.count(V) && !NewInsts.count(V)) || Sources->count(V))
+ return nullptr;
+
+ LLVM_DEBUG(dbgs() << "ARM CGP: Creating " << *TruncTy << " Trunc for "
+ << *V << "\n");
+ Builder.SetInsertPoint(cast<Instruction>(V));
+ auto *Trunc = dyn_cast<Instruction>(Builder.CreateTrunc(V, TruncTy));
+ if (Trunc)
+ NewInsts.insert(Trunc);
+ return Trunc;
+ };
- LLVM_DEBUG(dbgs() << "ARM CGP: Fixing up the roots:\n");
// Fix up any stores or returns that use the results of the promoted
// chain.
- for (auto I : Roots) {
- LLVM_DEBUG(dbgs() << " - " << *I << "\n");
- Type *TruncTy = OrigTy;
- if (auto *Store = dyn_cast<StoreInst>(I)) {
- auto *PtrTy = cast<PointerType>(Store->getPointerOperandType());
- TruncTy = PtrTy->getElementType();
- } else if (isa<ReturnInst>(I)) {
- Function *F = I->getParent()->getParent();
- TruncTy = F->getFunctionType()->getReturnType();
+ for (auto I : *Sinks) {
+ LLVM_DEBUG(dbgs() << "ARM CGP: For Sink: " << *I << "\n");
+
+ // Handle calls separately as we need to iterate over arg operands.
+ if (auto *Call = dyn_cast<CallInst>(I)) {
+ for (unsigned i = 0; i < Call->getNumArgOperands(); ++i) {
+ Value *Arg = Call->getArgOperand(i);
+ Type *Ty = TruncTysMap[Call][i];
+ if (Instruction *Trunc = InsertTrunc(Arg, Ty)) {
+ Trunc->moveBefore(Call);
+ Call->setArgOperand(i, Trunc);
+ }
+ }
+ continue;
}
+ // Special case switches because we need to truncate the condition.
+ if (auto *Switch = dyn_cast<SwitchInst>(I)) {
+ Type *Ty = TruncTysMap[Switch][0];
+ if (Instruction *Trunc = InsertTrunc(Switch->getCondition(), Ty)) {
+ Trunc->moveBefore(Switch);
+ Switch->setCondition(Trunc);
+ }
+ continue;
+ }
+
+ // Now handle the others.
for (unsigned i = 0; i < I->getNumOperands(); ++i) {
- Value *V = I->getOperand(i);
- if (Promoted.count(V) || NewInsts.count(V)) {
- if (auto *Op = dyn_cast<Instruction>(V)) {
-
- if (auto *Call = dyn_cast<CallInst>(I))
- TruncTy = Call->getFunctionType()->getParamType(i);
-
- if (TruncTy == ExtTy)
- continue;
-
- LLVM_DEBUG(dbgs() << "ARM CGP: Creating " << *TruncTy
- << " Trunc for " << *Op << "\n");
- Builder.SetInsertPoint(Op);
- auto *Trunc = cast<Instruction>(Builder.CreateTrunc(Op, TruncTy));
- Trunc->moveBefore(I);
- I->setOperand(i, Trunc);
- NewInsts.insert(Trunc);
- }
+ Type *Ty = TruncTysMap[I][i];
+ if (Instruction *Trunc = InsertTrunc(I->getOperand(i), Ty)) {
+ Trunc->moveBefore(I);
+ I->setOperand(i, Trunc);
}
}
}
- LLVM_DEBUG(dbgs() << "ARM CGP: Mutation complete.\n");
}
-bool ARMCodeGenPrepare::isNarrowInstSupported(Instruction *I) {
- if (!ST->hasDSP() || !EnableDSP || !isSupportedType(I))
- return false;
+void IRPromoter::Cleanup() {
+ // Some zexts will now have become redundant, along with their trunc
+ // operands, so remove them.
+ for (auto V : *Visited) {
+ if (!isa<CastInst>(V))
+ continue;
- if (ST->isThumb() && !ST->hasThumb2())
- return false;
+ auto ZExt = cast<CastInst>(V);
+ if (ZExt->getDestTy() != ExtTy)
+ continue;
- if (I->getOpcode() != Instruction::Add && I->getOpcode() != Instruction::Sub)
- return false;
+ Value *Src = ZExt->getOperand(0);
+ if (ZExt->getSrcTy() == ZExt->getDestTy()) {
+ LLVM_DEBUG(dbgs() << "ARM CGP: Removing unnecessary cast: " << *ZExt
+ << "\n");
+ ReplaceAllUsersOfWith(ZExt, Src);
+ continue;
+ }
- // TODO
- // Would it be profitable? For Thumb code, these parallel DSP instructions
- // are only Thumb-2, so we wouldn't be able to dual issue on Cortex-M33. For
- // Cortex-A, specifically Cortex-A72, the latency is double and throughput is
- // halved. They also do not take immediates as operands.
- for (auto &Op : I->operands()) {
- if (isa<Constant>(Op)) {
- if (!EnableDSPWithImms)
- return false;
+ // For any truncs that we insert to handle zexts, we can replace the
+ // result of the zext with the input to the trunc.
+ if (NewInsts.count(Src) && isa<ZExtInst>(V) && isa<TruncInst>(Src)) {
+ auto *Trunc = cast<TruncInst>(Src);
+ assert(Trunc->getOperand(0)->getType() == ExtTy &&
+ "expected inserted trunc to be operating on i32");
+ ReplaceAllUsersOfWith(ZExt, Trunc->getOperand(0));
}
}
- return true;
+
+ for (auto *I : InstsToRemove) {
+ LLVM_DEBUG(dbgs() << "ARM CGP: Removing " << *I << "\n");
+ I->dropAllReferences();
+ I->eraseFromParent();
+ }
+
+ InstsToRemove.clear();
+ NewInsts.clear();
+ TruncTysMap.clear();
+ Promoted.clear();
+}
+
+void IRPromoter::ConvertTruncs() {
+ IRBuilder<> Builder{Ctx};
+
+ for (auto *V : *Visited) {
+ if (!isa<TruncInst>(V) || Sources->count(V))
+ continue;
+
+ auto *Trunc = cast<TruncInst>(V);
+ assert(LessThanTypeSize(Trunc) && "expected narrow trunc");
+
+ Builder.SetInsertPoint(Trunc);
+ unsigned NumBits =
+ cast<IntegerType>(Trunc->getType())->getScalarSizeInBits();
+ ConstantInt *Mask = ConstantInt::get(Ctx, APInt::getMaxValue(NumBits));
+ Value *Masked = Builder.CreateAnd(Trunc->getOperand(0), Mask);
+
+ if (auto *I = dyn_cast<Instruction>(Masked))
+ NewInsts.insert(I);
+
+ ReplaceAllUsersOfWith(Trunc, Masked);
+ }
+}
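
A minimal standalone C++ sketch (illustrative only, not part of the patch) of the equivalence ConvertTruncs relies on: truncating an i32 to i8/i16 and treating the result as unsigned is the same as AND-ing the i32 with the all-ones mask that APInt::getMaxValue(NumBits) produces (0xFF and 0xFFFF below).

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t X : {0u, 0x1234u, 0xDEADBEEFu}) {
    // trunc i32 -> i8, zero-extended back, equals "and i32 X, 255"
    assert(uint32_t(uint8_t(X)) == (X & 0xFFu));
    // trunc i32 -> i16, zero-extended back, equals "and i32 X, 65535"
    assert(uint32_t(uint16_t(X)) == (X & 0xFFFFu));
  }
  return 0;
}
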
+
+void IRPromoter::Mutate(Type *OrigTy,
+ SmallPtrSetImpl<Value*> &Visited,
+ SmallPtrSetImpl<Value*> &Sources,
+ SmallPtrSetImpl<Instruction*> &Sinks,
+ SmallPtrSetImpl<Instruction*> &SafeToPromote) {
+ LLVM_DEBUG(dbgs() << "ARM CGP: Promoting use-def chains from "
+ << ARMCodeGenPrepare::TypeSize << " to 32-bits\n");
+
+ assert(isa<IntegerType>(OrigTy) && "expected integer type");
+ this->OrigTy = cast<IntegerType>(OrigTy);
+ assert(OrigTy->getPrimitiveSizeInBits() < ExtTy->getPrimitiveSizeInBits() &&
+ "original type not smaller than extended type");
+
+ this->Visited = &Visited;
+ this->Sources = &Sources;
+ this->Sinks = &Sinks;
+ this->SafeToPromote = &SafeToPromote;
+
+ // Cache original types of the values that will likely need truncating
+ for (auto *I : Sinks) {
+ if (auto *Call = dyn_cast<CallInst>(I)) {
+ for (unsigned i = 0; i < Call->getNumArgOperands(); ++i) {
+ Value *Arg = Call->getArgOperand(i);
+ TruncTysMap[Call].push_back(Arg->getType());
+ }
+ } else if (auto *Switch = dyn_cast<SwitchInst>(I))
+ TruncTysMap[I].push_back(Switch->getCondition()->getType());
+ else {
+ for (unsigned i = 0; i < I->getNumOperands(); ++i)
+ TruncTysMap[I].push_back(I->getOperand(i)->getType());
+ }
+ }
+
+ // Convert adds and subs using negative immediates to equivalent instructions
+ // that use positive constants.
+ PrepareConstants();
+
+ // Insert zext instructions between sources and their users.
+ ExtendSources();
+
+ // Convert any truncs that aren't sources into AND masks.
+ ConvertTruncs();
+
+ // Promote visited instructions, mutating their types in place. Also insert
+ // DSP intrinsics, if enabled, for adds and subs which would be unsafe to
+ // promote.
+ PromoteTree();
+
+ // Insert trunc instructions for use by calls, stores etc...
+ TruncateSinks();
+
+ // Finally, remove unnecessary zexts and truncs, delete old instructions and
+ // clear the data structures.
+ Cleanup();
+
+ LLVM_DEBUG(dbgs() << "ARM CGP: Mutation complete\n");
}
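
A standalone C++ sketch (illustrative only, not part of the patch) of why the pipeline above is sound for unsigned narrow arithmetic: zero-extend the sources, do the work in 32 bits, and only narrow again at the sinks; the result matches 8-bit wrapping arithmetic exactly.

#include <cassert>
#include <cstdint>

// Model of the promoted form of an i8 add.
static uint8_t add_promoted(uint8_t A, uint8_t B) {
  uint32_t ZA = A, ZB = B;       // ExtendSources: zext i8 -> i32
  uint32_t Wide = ZA + ZB;       // PromoteTree: the add now runs on i32
  return uint8_t(Wide & 0xFF);   // TruncateSinks: narrow back at the sink
}

int main() {
  for (unsigned A = 0; A < 256; ++A)
    for (unsigned B = 0; B < 256; ++B)
      assert(add_promoted(uint8_t(A), uint8_t(B)) ==
             uint8_t(uint8_t(A) + uint8_t(B)));
  return 0;
}
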
/// We accept most instructions, as well as Arguments and ConstantInsts. We
@@ -488,102 +809,133 @@ bool ARMCodeGenPrepare::isNarrowInstSupported(Instruction *I) {
/// return value is zeroext. We don't allow opcodes that can introduce sign
/// bits.
bool ARMCodeGenPrepare::isSupportedValue(Value *V) {
- LLVM_DEBUG(dbgs() << "ARM CGP: Is " << *V << " supported?\n");
-
- // Non-instruction values that we can handle.
- if (isa<ConstantInt>(V) || isa<Argument>(V))
- return true;
+ if (auto *I = dyn_cast<ICmpInst>(V)) {
+ // Now that we allow smaller types than TypeSize, only allow icmps of
+ // TypeSize because smaller icmps would require a trunc to be legalised.
+ // TODO: Allow icmp of smaller types, and calculate at the end
+ // whether the transform would be beneficial.
+ if (isa<PointerType>(I->getOperand(0)->getType()))
+ return true;
+ return EqualTypeSize(I->getOperand(0));
+ }
// Memory instructions
- if (isa<StoreInst>(V) || isa<LoadInst>(V) || isa<GetElementPtrInst>(V))
+ if (isa<StoreInst>(V) || isa<GetElementPtrInst>(V))
return true;
// Branches and targets.
- if (auto *ICmp = dyn_cast<ICmpInst>(V))
- return ICmp->isEquality() || !ICmp->isSigned();
-
 if (isa<BranchInst>(V) || isa<SwitchInst>(V) || isa<BasicBlock>(V))
return true;
- if (isa<PHINode>(V) || isa<SelectInst>(V) || isa<ReturnInst>(V))
- return true;
+ // Non-instruction values that we can handle.
+ if ((isa<Constant>(V) && !isa<ConstantExpr>(V)) || isa<Argument>(V))
+ return isSupportedType(V);
+
+ if (isa<PHINode>(V) || isa<SelectInst>(V) || isa<ReturnInst>(V) ||
+ isa<LoadInst>(V))
+ return isSupportedType(V);
+
+ if (isa<SExtInst>(V))
+ return false;
+
+ if (auto *Cast = dyn_cast<CastInst>(V))
+ return isSupportedType(Cast) || isSupportedType(Cast->getOperand(0));
// Special cases for calls as we need to check for zeroext
// TODO We should accept calls even if they don't have zeroext, as they can
- // still be roots.
+ // still be sinks.
if (auto *Call = dyn_cast<CallInst>(V))
- return Call->hasRetAttr(Attribute::AttrKind::ZExt);
- else if (auto *Cast = dyn_cast<CastInst>(V)) {
- if (isa<ZExtInst>(Cast))
- return Cast->getDestTy()->getScalarSizeInBits() <= 32;
- else if (auto *Trunc = dyn_cast<TruncInst>(V))
- return Trunc->getDestTy()->getScalarSizeInBits() <= TypeSize;
- else {
- LLVM_DEBUG(dbgs() << "ARM CGP: No, unsupported cast.\n");
- return false;
- }
- } else if (!isa<BinaryOperator>(V)) {
- LLVM_DEBUG(dbgs() << "ARM CGP: No, not a binary operator.\n");
+ return isSupportedType(Call) &&
+ Call->hasRetAttr(Attribute::AttrKind::ZExt);
+
+ if (!isa<BinaryOperator>(V))
+ return false;
+
+ if (!isSupportedType(V))
return false;
- }
- bool res = !isSigned(V);
- if (!res)
- LLVM_DEBUG(dbgs() << "ARM CGP: No, it's a signed instruction.\n");
- return res;
+ if (generateSignBits(V)) {
+ LLVM_DEBUG(dbgs() << "ARM CGP: No, instruction can generate sign bits.\n");
+ return false;
+ }
+ return true;
}
/// Check that the type of V would be promoted and that the original type is
/// smaller than the targeted promoted type. Check that we're not trying to
/// promote something larger than our base 'TypeSize' type.
bool ARMCodeGenPrepare::isLegalToPromote(Value *V) {
- if (!isSupportedType(V))
- return false;
- unsigned VSize = 0;
- if (auto *Ld = dyn_cast<LoadInst>(V)) {
- auto *PtrTy = cast<PointerType>(Ld->getPointerOperandType());
- VSize = PtrTy->getElementType()->getPrimitiveSizeInBits();
- } else if (auto *ZExt = dyn_cast<ZExtInst>(V)) {
- VSize = ZExt->getOperand(0)->getType()->getPrimitiveSizeInBits();
- } else {
- VSize = V->getType()->getPrimitiveSizeInBits();
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I)
+ return true;
+
+ if (SafeToPromote.count(I))
+ return true;
+
+ if (isPromotedResultSafe(V) || isSafeOverflow(I)) {
+ SafeToPromote.insert(I);
+ return true;
}
- if (VSize > TypeSize)
+ if (I->getOpcode() != Instruction::Add && I->getOpcode() != Instruction::Sub)
return false;
- if (isPromotedResultSafe(V))
- return true;
+ // If promotion is not safe, can we use a DSP instruction to natively
+ // handle the narrow type?
+ if (!ST->hasDSP() || !EnableDSP || !isSupportedType(I))
+ return false;
- if (auto *I = dyn_cast<Instruction>(V))
- return isNarrowInstSupported(I);
+ if (ST->isThumb() && !ST->hasThumb2())
+ return false;
- return false;
+ // TODO
+ // Would it be profitable? For Thumb code, these parallel DSP instructions
+ // are only Thumb-2, so we wouldn't be able to dual issue on Cortex-M33. For
+ // Cortex-A, specifically Cortex-A72, the latency is double and throughput is
+ // halved. They also do not take immediates as operands.
+ for (auto &Op : I->operands()) {
+ if (isa<Constant>(Op)) {
+ if (!EnableDSPWithImms)
+ return false;
+ }
+ }
+ LLVM_DEBUG(dbgs() << "ARM CGP: Will use an intrinsic for: " << *I << "\n");
+ return true;
}
bool ARMCodeGenPrepare::TryToPromote(Value *V) {
OrigTy = V->getType();
TypeSize = OrigTy->getPrimitiveSizeInBits();
+ if (TypeSize > 16 || TypeSize < 8)
+ return false;
+
+ SafeToPromote.clear();
if (!isSupportedValue(V) || !shouldPromote(V) || !isLegalToPromote(V))
return false;
- LLVM_DEBUG(dbgs() << "ARM CGP: TryToPromote: " << *V << "\n");
+ LLVM_DEBUG(dbgs() << "ARM CGP: TryToPromote: " << *V << ", TypeSize = "
+ << TypeSize << "\n");
SetVector<Value*> WorkList;
- SmallPtrSet<Value*, 8> Leaves;
- SmallPtrSet<Instruction*, 4> Roots;
- WorkList.insert(V);
+ SmallPtrSet<Value*, 8> Sources;
+ SmallPtrSet<Instruction*, 4> Sinks;
SmallPtrSet<Value*, 16> CurrentVisited;
- CurrentVisited.clear();
+ WorkList.insert(V);
- // Return true if the given value can, or has been, visited. Add V to the
- // worklist if needed.
+ // Return true if V was added to the worklist as a supported instruction,
+ // if it was already visited, or if we don't need to explore it (e.g.
+ // pointer values and GEPs), and false otherwise.
auto AddLegalInst = [&](Value *V) {
if (CurrentVisited.count(V))
return true;
+ // Ignore GEPs because they don't need promoting and the constant indices
+ // will prevent the transformation.
+ if (isa<GetElementPtrInst>(V))
+ return true;
+
if (!isSupportedValue(V) || (shouldPromote(V) && !isLegalToPromote(V))) {
LLVM_DEBUG(dbgs() << "ARM CGP: Can't handle: " << *V << "\n");
return false;
@@ -600,6 +952,7 @@ bool ARMCodeGenPrepare::TryToPromote(Value *V) {
if (CurrentVisited.count(V))
continue;
+ // Ignore non-instructions, other than arguments.
if (!isa<Instruction>(V) && !isSource(V))
continue;
@@ -607,24 +960,26 @@ bool ARMCodeGenPrepare::TryToPromote(Value *V) {
// the tree has already been explored.
// TODO: This could limit the transform, ie if we try to promote something
// from an i8 and fail first, before trying an i16.
- if (AllVisited.count(V)) {
- LLVM_DEBUG(dbgs() << "ARM CGP: Already visited this: " << *V << "\n");
+ if (AllVisited.count(V))
return false;
- }
CurrentVisited.insert(V);
AllVisited.insert(V);
// Calls can be both sources and sinks.
if (isSink(V))
- Roots.insert(cast<Instruction>(V));
+ Sinks.insert(cast<Instruction>(V));
+
if (isSource(V))
- Leaves.insert(V);
- else if (auto *I = dyn_cast<Instruction>(V)) {
- // Visit operands of any instruction visited.
- for (auto &U : I->operands()) {
- if (!AddLegalInst(U))
- return false;
+ Sources.insert(V);
+
+ if (!isSink(V) && !isSource(V)) {
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ // Visit operands of any instruction visited.
+ for (auto &U : I->operands()) {
+ if (!AddLegalInst(U))
+ return false;
+ }
}
}
@@ -638,43 +993,23 @@ bool ARMCodeGenPrepare::TryToPromote(Value *V) {
}
}
- unsigned NumToPromote = 0;
- unsigned Cost = 0;
+ LLVM_DEBUG(dbgs() << "ARM CGP: Visited nodes:\n";
+ for (auto *I : CurrentVisited)
+ I->dump();
+ );
+ unsigned ToPromote = 0;
for (auto *V : CurrentVisited) {
- // Truncs will cause a uxt and no zeroext arguments will often require
- // a uxt somewhere.
- if (isa<TruncInst>(V))
- ++Cost;
- else if (auto *Arg = dyn_cast<Argument>(V)) {
- if (!Arg->hasZExtAttr())
- ++Cost;
- }
-
- // Mem ops can automatically be extended/truncated and non-instructions
- // don't need anything done.
- if (Leaves.count(V) || isa<StoreInst>(V) || !isa<Instruction>(V))
+ if (Sources.count(V))
continue;
-
- // Will need to truncate calls args and returns.
- if (Roots.count(cast<Instruction>(V))) {
- ++Cost;
+ if (Sinks.count(cast<Instruction>(V)))
continue;
- }
-
- if (shouldPromote(V))
- ++NumToPromote;
+ ++ToPromote;
}
- LLVM_DEBUG(dbgs() << "ARM CGP: Visited nodes:\n";
- for (auto *I : CurrentVisited)
- I->dump();
- );
- LLVM_DEBUG(dbgs() << "ARM CGP: Cost of promoting " << NumToPromote
- << " instructions = " << Cost << "\n");
- if (Cost > NumToPromote || (NumToPromote == 0))
+ if (ToPromote < 2)
return false;
- Promoter->Mutate(OrigTy, CurrentVisited, Leaves, Roots);
+ Promoter->Mutate(OrigTy, CurrentVisited, Sources, Sinks, SafeToPromote);
return true;
}
@@ -711,19 +1046,15 @@ bool ARMCodeGenPrepare::runOnFunction(Function &F) {
continue;
LLVM_DEBUG(dbgs() << "ARM CGP: Searching from: " << CI << "\n");
+
for (auto &Op : CI.operands()) {
- if (auto *I = dyn_cast<Instruction>(Op)) {
- if (isa<ZExtInst>(I))
- MadeChange |= TryToPromote(I->getOperand(0));
- else
- MadeChange |= TryToPromote(I);
- }
+ if (auto *I = dyn_cast<Instruction>(Op))
+ MadeChange |= TryToPromote(I);
}
}
}
- Promoter->Cleanup();
LLVM_DEBUG(if (verifyFunction(F, &dbgs())) {
- dbgs();
+ dbgs() << F;
report_fatal_error("Broken function after type promotion");
});
}
@@ -744,6 +1075,7 @@ INITIALIZE_PASS_END(ARMCodeGenPrepare, DEBUG_TYPE, "ARM IR optimizations",
false, false)
char ARMCodeGenPrepare::ID = 0;
+unsigned ARMCodeGenPrepare::TypeSize = 0;
FunctionPass *llvm::createARMCodeGenPreparePass() {
return new ARMCodeGenPrepare();
diff --git a/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp b/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp
index 2c4738d3cb74..5e97c4cb35e3 100644
--- a/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp
@@ -1420,6 +1420,22 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex,
MI = LastIT;
}
+ // Avoid splitting a MOVW+MOVT pair with a relocation on Windows.
+ // On Windows, this instruction pair is covered by a single
+ // IMAGE_REL_ARM_MOV32T relocation, which spans both instructions. If a
+ // constant island is injected in between them, the relocation will clobber
+ // the instruction and fail to update the MOVT instruction.
+ // (These instructions are bundled up until right before the ConstantIslands
+ // pass.)
+ if (STI->isTargetWindows() && isThumb && MI->getOpcode() == ARM::t2MOVTi16 &&
+ (MI->getOperand(2).getTargetFlags() & ARMII::MO_OPTION_MASK) ==
+ ARMII::MO_HI16) {
+ --MI;
+ assert(MI->getOpcode() == ARM::t2MOVi16 &&
+ (MI->getOperand(1).getTargetFlags() & ARMII::MO_OPTION_MASK) ==
+ ARMII::MO_LO16);
+ }
+
// We really must not split an IT block.
LLVM_DEBUG(unsigned PredReg; assert(
!isThumb || getITInstrPredicate(*MI, PredReg) == ARMCC::AL));
diff --git a/contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index 5dac6ec0b799..eecd0a10dc7d 100644
--- a/contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -570,7 +570,7 @@ void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI) {
TransferImpOps(MI, MIB, MIB);
// Transfer memoperands.
- MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+ MIB.cloneMemRefs(MI);
MI.eraseFromParent();
}
@@ -645,7 +645,7 @@ void ARMExpandPseudo::ExpandVST(MachineBasicBlock::iterator &MBBI) {
TransferImpOps(MI, MIB, MIB);
// Transfer memoperands.
- MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+ MIB.cloneMemRefs(MI);
MI.eraseFromParent();
}
@@ -735,7 +735,7 @@ void ARMExpandPseudo::ExpandLaneOp(MachineBasicBlock::iterator &MBBI) {
MIB.addReg(DstReg, RegState::ImplicitDefine | getDeadRegState(DstIsDead));
TransferImpOps(MI, MIB, MIB);
// Transfer memoperands.
- MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+ MIB.cloneMemRefs(MI);
MI.eraseFromParent();
}
@@ -848,8 +848,8 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB,
unsigned SOImmValV2 = ARM_AM::getSOImmTwoPartSecond(ImmVal);
LO16 = LO16.addImm(SOImmValV1);
HI16 = HI16.addImm(SOImmValV2);
- LO16->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
- HI16->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+ LO16.cloneMemRefs(MI);
+ HI16.cloneMemRefs(MI);
LO16.addImm(Pred).addReg(PredReg).add(condCodeOp());
HI16.addImm(Pred).addReg(PredReg).add(condCodeOp());
if (isCC)
@@ -899,8 +899,8 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB,
}
}
- LO16->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
- HI16->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+ LO16.cloneMemRefs(MI);
+ HI16.cloneMemRefs(MI);
LO16.addImm(Pred).addReg(PredReg);
HI16.addImm(Pred).addReg(PredReg);
@@ -1030,10 +1030,10 @@ static void addExclusiveRegPair(MachineInstrBuilder &MIB, MachineOperand &Reg,
if (IsThumb) {
unsigned RegLo = TRI->getSubReg(Reg.getReg(), ARM::gsub_0);
unsigned RegHi = TRI->getSubReg(Reg.getReg(), ARM::gsub_1);
- MIB.addReg(RegLo, Flags | getKillRegState(Reg.isDead()));
- MIB.addReg(RegHi, Flags | getKillRegState(Reg.isDead()));
+ MIB.addReg(RegLo, Flags);
+ MIB.addReg(RegHi, Flags);
} else
- MIB.addReg(Reg.getReg(), Flags | getKillRegState(Reg.isDead()));
+ MIB.addReg(Reg.getReg(), Flags);
}
/// Expand a 64-bit CMP_SWAP to an ldrexd/strexd loop.
@@ -1103,7 +1103,8 @@ bool ARMExpandPseudo::ExpandCMP_SWAP_64(MachineBasicBlock &MBB,
// bne .Lloadcmp
unsigned STREXD = IsThumb ? ARM::t2STREXD : ARM::STREXD;
MIB = BuildMI(StoreBB, DL, TII->get(STREXD), TempReg);
- addExclusiveRegPair(MIB, New, 0, IsThumb, TRI);
+ unsigned Flags = getKillRegState(New.isDead());
+ addExclusiveRegPair(MIB, New, Flags, IsThumb, TRI);
MIB.addReg(AddrReg).add(predOps(ARMCC::AL));
unsigned CMPri = IsThumb ? ARM::t2CMPri : ARM::CMPri;
@@ -1425,7 +1426,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
MIB.addExternalSymbol("__aeabi_read_tp", 0);
}
- MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+ MIB.cloneMemRefs(MI);
TransferImpOps(MI, MIB, MIB);
MI.eraseFromParent();
return true;
@@ -1440,7 +1441,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewLdOpc), DstReg)
.add(MI.getOperand(1))
.add(predOps(ARMCC::AL));
- MIB1->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+ MIB1.cloneMemRefs(MI);
MachineInstrBuilder MIB2 =
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::tPICADD))
.addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
@@ -1544,7 +1545,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
if (isARM) {
MIB3.add(predOps(ARMCC::AL));
if (Opcode == ARM::MOV_ga_pcrel_ldr)
- MIB3->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+ MIB3.cloneMemRefs(MI);
}
TransferImpOps(MI, MIB1, MIB3);
MI.eraseFromParent();
@@ -1596,7 +1597,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
// Add an implicit def for the super-register.
MIB.addReg(DstReg, RegState::ImplicitDefine | getDeadRegState(DstIsDead));
TransferImpOps(MI, MIB, MIB);
- MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+ MIB.cloneMemRefs(MI);
MI.eraseFromParent();
return true;
}
@@ -1629,7 +1630,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
MIB->addRegisterKilled(SrcReg, TRI, true);
TransferImpOps(MI, MIB, MIB);
- MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+ MIB.cloneMemRefs(MI);
MI.eraseFromParent();
return true;
}
diff --git a/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp b/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp
index a66cd7053c0a..a50abfdbee44 100644
--- a/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp
@@ -2951,7 +2951,8 @@ bool ARMFastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
unsigned ResultReg = MI->getOperand(0).getReg();
if (!ARMEmitLoad(VT, ResultReg, Addr, LI->getAlignment(), isZExt, false))
return false;
- MI->eraseFromParent();
+ MachineBasicBlock::iterator I(MI);
+ removeDeadCode(I, std::next(I));
return true;
}
@@ -2970,12 +2971,16 @@ unsigned ARMFastISel::ARMLowerPICELF(const GlobalValue *GV,
unsigned ConstAlign =
MF->getDataLayout().getPrefTypeAlignment(Type::getInt32PtrTy(*Context));
unsigned Idx = MF->getConstantPool()->getConstantPoolIndex(CPV, ConstAlign);
+ MachineMemOperand *CPMMO =
+ MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF),
+ MachineMemOperand::MOLoad, 4, 4);
unsigned TempReg = MF->getRegInfo().createVirtualRegister(&ARM::rGPRRegClass);
unsigned Opc = isThumb2 ? ARM::t2LDRpci : ARM::LDRcp;
MachineInstrBuilder MIB =
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), TempReg)
- .addConstantPoolIndex(Idx);
+ .addConstantPoolIndex(Idx)
+ .addMemOperand(CPMMO);
if (Opc == ARM::LDRcp)
MIB.addImm(0);
MIB.add(predOps(ARMCC::AL));
@@ -2988,6 +2993,7 @@ unsigned ARMFastISel::ARMLowerPICELF(const GlobalValue *GV,
MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg)
.addReg(TempReg)
.addImm(ARMPCLabelIndex);
+
if (!Subtarget->isThumb())
MIB.add(predOps(ARMCC::AL));
diff --git a/contrib/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/contrib/llvm/lib/Target/ARM/ARMFrameLowering.cpp
index 56ad7a0f0446..a9d87ced31f3 100644
--- a/contrib/llvm/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMFrameLowering.cpp
@@ -79,12 +79,11 @@ ARMFrameLowering::ARMFrameLowering(const ARMSubtarget &sti)
: TargetFrameLowering(StackGrowsDown, sti.getStackAlignment(), 0, 4),
STI(sti) {}
-bool ARMFrameLowering::noFramePointerElim(const MachineFunction &MF) const {
+bool ARMFrameLowering::keepFramePointer(const MachineFunction &MF) const {
// iOS always has a FP for backtracking, force other targets to keep their FP
// when doing FastISel. The emitted code is currently superior, and in cases
// like test-suite's lencod FastISel isn't quite correct when FP is eliminated.
- return TargetFrameLowering::noFramePointerElim(MF) ||
- MF.getSubtarget<ARMSubtarget>().useFastISel();
+ return MF.getSubtarget<ARMSubtarget>().useFastISel();
}
/// Returns true if the target can safely skip saving callee-saved registers
@@ -526,6 +525,8 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
.setMIFlags(MachineInstr::FrameSetup);
switch (TM.getCodeModel()) {
+ case CodeModel::Tiny:
+ llvm_unreachable("Tiny code model not available on ARM.");
case CodeModel::Small:
case CodeModel::Medium:
case CodeModel::Kernel:
@@ -909,6 +910,7 @@ ARMFrameLowering::ResolveFrameIndexReference(const MachineFunction &MF,
assert(RegInfo->hasBasePointer(MF) &&
"VLAs and dynamic stack alignment, but missing base pointer!");
FrameReg = RegInfo->getBaseRegister();
+ Offset -= SPAdj;
}
return Offset;
}
@@ -1006,8 +1008,7 @@ void ARMFrameLowering::emitPushInst(MachineBasicBlock &MBB,
if (Regs.empty())
continue;
- llvm::sort(Regs.begin(), Regs.end(), [&](const RegAndKill &LHS,
- const RegAndKill &RHS) {
+ llvm::sort(Regs, [&](const RegAndKill &LHS, const RegAndKill &RHS) {
return TRI.getEncodingValue(LHS.first) < TRI.getEncodingValue(RHS.first);
});
@@ -1103,7 +1104,7 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB,
if (Regs.empty())
continue;
- llvm::sort(Regs.begin(), Regs.end(), [&](unsigned LHS, unsigned RHS) {
+ llvm::sort(Regs, [&](unsigned LHS, unsigned RHS) {
return TRI.getEncodingValue(LHS) < TRI.getEncodingValue(RHS);
});
@@ -1921,9 +1922,13 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
<< "\n");
}
+ // Avoid spilling LR in Thumb1 if there's a tail call: it's expensive to
+ // restore LR in that case.
+ bool ExpensiveLRRestore = AFI->isThumb1OnlyFunction() && MFI.hasTailCall();
+
// If LR is not spilled, but at least one of R4, R5, R6, and R7 is spilled.
// Spill LR as well so we can fold BX_RET to the registers restore (LDM).
- if (!LRSpilled && CS1Spilled) {
+ if (!LRSpilled && CS1Spilled && !ExpensiveLRRestore) {
SavedRegs.set(ARM::LR);
NumGPRSpills++;
SmallVectorImpl<unsigned>::iterator LRPos;
@@ -1949,7 +1954,8 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
// Windows on ARM, accept R11 (frame pointer)
if (!AFI->isThumbFunction() ||
(STI.isTargetWindows() && Reg == ARM::R11) ||
- isARMLowRegister(Reg) || Reg == ARM::LR) {
+ isARMLowRegister(Reg) ||
+ (Reg == ARM::LR && !ExpensiveLRRestore)) {
SavedRegs.set(Reg);
LLVM_DEBUG(dbgs() << "Spilling " << printReg(Reg, TRI)
<< " to make up alignment\n");
@@ -2151,9 +2157,15 @@ void ARMFrameLowering::adjustForSegmentedStacks(
// Do not generate a prologue for leaf functions with a stack of size zero.
// For non-leaf functions we have to allow for the possibility that the
- // call is to a non-split function, as in PR37807.
- if (StackSize == 0 && !MFI.hasTailCall())
+ // call is to a non-split function, as in PR37807. This function could also
+ // take the address of a non-split function. When the linker tries to adjust
+ // its non-existent prologue, it would fail with an error. Mark the object
+ // file so that such failures are not errors. See this Go language bug-report
+ // https://go-review.googlesource.com/c/go/+/148819/
+ if (StackSize == 0 && !MFI.hasTailCall()) {
+ MF.getMMI().setHasNosplitStack(true);
return;
+ }
// Use R4 and R5 as scratch registers.
// We save R4 and R5 before use and restore them before leaving the function.
diff --git a/contrib/llvm/lib/Target/ARM/ARMFrameLowering.h b/contrib/llvm/lib/Target/ARM/ARMFrameLowering.h
index e994cab28fe7..2f7e23840e75 100644
--- a/contrib/llvm/lib/Target/ARM/ARMFrameLowering.h
+++ b/contrib/llvm/lib/Target/ARM/ARMFrameLowering.h
@@ -42,7 +42,7 @@ public:
std::vector<CalleeSavedInfo> &CSI,
const TargetRegisterInfo *TRI) const override;
- bool noFramePointerElim(const MachineFunction &MF) const override;
+ bool keepFramePointer(const MachineFunction &MF) const override;
bool enableCalleeSaveSkip(const MachineFunction &MF) const override;
diff --git a/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 9592dd53c347..8e0e82388251 100644
--- a/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -1345,9 +1345,8 @@ static inline SDValue getAL(SelectionDAG *CurDAG, const SDLoc &dl) {
}
void ARMDAGToDAGISel::transferMemOperands(SDNode *N, SDNode *Result) {
- MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
- MemOp[0] = cast<MemSDNode>(N)->getMemOperand();
- cast<MachineSDNode>(Result)->setMemRefs(MemOp, MemOp + 1);
+ MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
+ CurDAG->setNodeMemRefs(cast<MachineSDNode>(Result), {MemOp});
}
bool ARMDAGToDAGISel::tryARMIndexedLoad(SDNode *N) {
@@ -1764,12 +1763,14 @@ void ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
default: llvm_unreachable("unhandled vld type");
// Double-register operations:
case MVT::v8i8: OpcodeIndex = 0; break;
+ case MVT::v4f16:
case MVT::v4i16: OpcodeIndex = 1; break;
case MVT::v2f32:
case MVT::v2i32: OpcodeIndex = 2; break;
case MVT::v1i64: OpcodeIndex = 3; break;
// Quad-register operations:
case MVT::v16i8: OpcodeIndex = 0; break;
+ case MVT::v8f16:
case MVT::v8i16: OpcodeIndex = 1; break;
case MVT::v4f32:
case MVT::v4i32: OpcodeIndex = 2; break;
@@ -1854,9 +1855,8 @@ void ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
}
// Transfer memoperands.
- MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
- MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
- cast<MachineSDNode>(VLd)->setMemRefs(MemOp, MemOp + 1);
+ MachineMemOperand *MemOp = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+ CurDAG->setNodeMemRefs(cast<MachineSDNode>(VLd), {MemOp});
if (NumVecs == 1) {
ReplaceNode(N, VLd);
@@ -1893,8 +1893,7 @@ void ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align))
return;
- MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
- MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+ MachineMemOperand *MemOp = cast<MemIntrinsicSDNode>(N)->getMemOperand();
SDValue Chain = N->getOperand(0);
EVT VT = N->getOperand(Vec0Idx).getValueType();
@@ -1983,7 +1982,7 @@ void ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
SDNode *VSt = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
// Transfer memoperands.
- cast<MachineSDNode>(VSt)->setMemRefs(MemOp, MemOp + 1);
+ CurDAG->setNodeMemRefs(cast<MachineSDNode>(VSt), {MemOp});
ReplaceNode(N, VSt);
return;
@@ -2007,7 +2006,7 @@ void ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
SDNode *VStA = CurDAG->getMachineNode(QOpcodes0[OpcodeIndex], dl,
MemAddr.getValueType(),
MVT::Other, OpsA);
- cast<MachineSDNode>(VStA)->setMemRefs(MemOp, MemOp + 1);
+ CurDAG->setNodeMemRefs(cast<MachineSDNode>(VStA), {MemOp});
Chain = SDValue(VStA, 1);
// Store the odd D registers.
@@ -2026,7 +2025,7 @@ void ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
Ops.push_back(Chain);
SDNode *VStB = CurDAG->getMachineNode(QOpcodes1[OpcodeIndex], dl, ResTys,
Ops);
- cast<MachineSDNode>(VStB)->setMemRefs(MemOp, MemOp + 1);
+ CurDAG->setNodeMemRefs(cast<MachineSDNode>(VStB), {MemOp});
ReplaceNode(N, VStB);
}
@@ -2045,8 +2044,7 @@ void ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, bool isUpdating,
if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align))
return;
- MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
- MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+ MachineMemOperand *MemOp = cast<MemIntrinsicSDNode>(N)->getMemOperand();
SDValue Chain = N->getOperand(0);
unsigned Lane =
@@ -2135,7 +2133,7 @@ void ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, bool isUpdating,
unsigned Opc = (is64BitVector ? DOpcodes[OpcodeIndex] :
QOpcodes[OpcodeIndex]);
SDNode *VLdLn = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
- cast<MachineSDNode>(VLdLn)->setMemRefs(MemOp, MemOp + 1);
+ CurDAG->setNodeMemRefs(cast<MachineSDNode>(VLdLn), {MemOp});
if (!IsLoad) {
ReplaceNode(N, VLdLn);
return;
@@ -2264,9 +2262,8 @@ void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool IsIntrinsic,
}
// Transfer memoperands.
- MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
- MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
- cast<MachineSDNode>(VLdDup)->setMemRefs(MemOp, MemOp + 1);
+ MachineMemOperand *MemOp = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+ CurDAG->setNodeMemRefs(cast<MachineSDNode>(VLdDup), {MemOp});
// Extract the subregisters.
if (NumVecs == 1) {
@@ -2309,6 +2306,11 @@ bool ARMDAGToDAGISel::tryV6T2BitfieldExtractOp(SDNode *N, bool isSigned) {
Srl_imm)) {
assert(Srl_imm > 0 && Srl_imm < 32 && "bad amount in shift node!");
+ // Mask off the unnecessary bits of the AND immediate; normally
+ // DAGCombine will do this, but that might not happen if
+ // targetShrinkDemandedConstant chooses a different immediate.
+ And_imm &= -1U >> Srl_imm;
+
// Note: The width operand is encoded as width-1.
unsigned Width = countTrailingOnes(And_imm) - 1;
unsigned LSB = Srl_imm;
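
A small arithmetic sketch (illustrative only, not part of the patch, and assuming the matched pattern here is an AND of a logical shift right, as the variable names suggest): -1U >> Srl_imm keeps only the low (32 - Srl_imm) bits, so clearing the high bits of And_imm cannot change the AND result, but it does make countTrailingOnes report the real field width.

#include <cassert>
#include <cstdint>

int main() {
  const unsigned SrlImm = 20;
  const uint32_t AndImm = 0xFFFFFFFFu;        // immediate with "extra" high bits set
  assert((~0u >> SrlImm) == 0x00000FFFu);     // mask of the low 32 - 20 = 12 bits
  const uint32_t Masked = AndImm & (~0u >> SrlImm);
  for (uint32_t X : {0u, 0x12345678u, 0xFFFFFFFFu})
    // (X >> 20) never has bits above bit 11, so both immediates behave the same.
    assert(((X >> SrlImm) & AndImm) == ((X >> SrlImm) & Masked));
  return 0;
}
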
@@ -2476,9 +2478,8 @@ void ARMDAGToDAGISel::SelectCMP_SWAP(SDNode *N) {
Opcode, SDLoc(N),
CurDAG->getVTList(MVT::i32, MVT::i32, MVT::Other), Ops);
- MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
- MemOp[0] = cast<MemSDNode>(N)->getMemOperand();
- cast<MachineSDNode>(CmpSwap)->setMemRefs(MemOp, MemOp + 1);
+ MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
+ CurDAG->setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
ReplaceUses(SDValue(N, 0), SDValue(CmpSwap, 0));
ReplaceUses(SDValue(N, 1), SDValue(CmpSwap, 2));
@@ -2627,12 +2628,11 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
// queries work properly. This e.g. gives the register allocation the
// required information for rematerialization.
MachineFunction& MF = CurDAG->getMachineFunction();
- MachineSDNode::mmo_iterator MemOp = MF.allocateMemRefsArray(1);
- MemOp[0] = MF.getMachineMemOperand(
- MachinePointerInfo::getConstantPool(MF),
- MachineMemOperand::MOLoad, 4, 4);
+ MachineMemOperand *MemOp =
+ MF.getMachineMemOperand(MachinePointerInfo::getConstantPool(MF),
+ MachineMemOperand::MOLoad, 4, 4);
- cast<MachineSDNode>(ResNode)->setMemRefs(MemOp, MemOp+1);
+ CurDAG->setNodeMemRefs(cast<MachineSDNode>(ResNode), {MemOp});
ReplaceNode(N, ResNode);
return;
@@ -3030,11 +3030,13 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
switch (VT.getSimpleVT().SimpleTy) {
default: return;
case MVT::v8i8: Opc = ARM::VZIPd8; break;
+ case MVT::v4f16:
case MVT::v4i16: Opc = ARM::VZIPd16; break;
case MVT::v2f32:
// vzip.32 Dd, Dm is a pseudo-instruction expanded to vtrn.32 Dd, Dm.
case MVT::v2i32: Opc = ARM::VTRNd32; break;
case MVT::v16i8: Opc = ARM::VZIPq8; break;
+ case MVT::v8f16:
case MVT::v8i16: Opc = ARM::VZIPq16; break;
case MVT::v4f32:
case MVT::v4i32: Opc = ARM::VZIPq32; break;
@@ -3051,11 +3053,13 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
switch (VT.getSimpleVT().SimpleTy) {
default: return;
case MVT::v8i8: Opc = ARM::VUZPd8; break;
+ case MVT::v4f16:
case MVT::v4i16: Opc = ARM::VUZPd16; break;
case MVT::v2f32:
// vuzp.32 Dd, Dm is a pseudo-instruction expanded to vtrn.32 Dd, Dm.
case MVT::v2i32: Opc = ARM::VTRNd32; break;
case MVT::v16i8: Opc = ARM::VUZPq8; break;
+ case MVT::v8f16:
case MVT::v8i16: Opc = ARM::VUZPq16; break;
case MVT::v4f32:
case MVT::v4i32: Opc = ARM::VUZPq32; break;
@@ -3072,10 +3076,12 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
switch (VT.getSimpleVT().SimpleTy) {
default: return;
case MVT::v8i8: Opc = ARM::VTRNd8; break;
+ case MVT::v4f16:
case MVT::v4i16: Opc = ARM::VTRNd16; break;
case MVT::v2f32:
case MVT::v2i32: Opc = ARM::VTRNd32; break;
case MVT::v16i8: Opc = ARM::VTRNq8; break;
+ case MVT::v8f16:
case MVT::v8i16: Opc = ARM::VTRNq16; break;
case MVT::v4f32:
case MVT::v4i32: Opc = ARM::VTRNq32; break;
@@ -3410,9 +3416,8 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
CurDAG->getRegister(0, MVT::i32), Chain};
SDNode *Ld = CurDAG->getMachineNode(NewOpc, dl, ResTys, Ops);
// Transfer memoperands.
- MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
- MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
- cast<MachineSDNode>(Ld)->setMemRefs(MemOp, MemOp + 1);
+ MachineMemOperand *MemOp = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+ CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ld), {MemOp});
// Remap uses.
SDValue OutChain = isThumb ? SDValue(Ld, 2) : SDValue(Ld, 1);
@@ -3478,9 +3483,8 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
SDNode *St = CurDAG->getMachineNode(NewOpc, dl, ResTys, Ops);
// Transfer memoperands.
- MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
- MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
- cast<MachineSDNode>(St)->setMemRefs(MemOp, MemOp + 1);
+ MachineMemOperand *MemOp = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+ CurDAG->setNodeMemRefs(cast<MachineSDNode>(St), {MemOp});
ReplaceNode(N, St);
return;
diff --git a/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp b/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp
index ede276dd91bb..21de0f6a7630 100644
--- a/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -651,9 +651,13 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
// it have a FP_TO_[SU]INT instruction with a narrower destination than
// source.
setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);
setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand);
@@ -665,8 +669,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::CTPOP, MVT::v4i32, Custom);
setOperationAction(ISD::CTPOP, MVT::v4i16, Custom);
setOperationAction(ISD::CTPOP, MVT::v8i16, Custom);
- setOperationAction(ISD::CTPOP, MVT::v1i64, Expand);
- setOperationAction(ISD::CTPOP, MVT::v2i64, Expand);
+ setOperationAction(ISD::CTPOP, MVT::v1i64, Custom);
+ setOperationAction(ISD::CTPOP, MVT::v2i64, Custom);
setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
@@ -846,8 +850,10 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
}
setOperationAction(ISD::CTTZ, MVT::i32, Custom);
setOperationAction(ISD::CTPOP, MVT::i32, Expand);
- if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only())
+ if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only()) {
setOperationAction(ISD::CTLZ, MVT::i32, Expand);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, LibCall);
+ }
// @llvm.readcyclecounter requires the Performance Monitors extension.
// Default to the 0 expansion on unsupported platforms.
@@ -950,6 +956,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
setOperationAction(ISD::TRAP, MVT::Other, Legal);
+ setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
// Use the default implementation.
setOperationAction(ISD::VASTART, MVT::Other, Custom);
@@ -977,7 +984,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
// On v8, we have particularly efficient implementations of atomic fences
// if they can be combined with nearby atomic loads and stores.
- if (!Subtarget->hasV8Ops() || getTargetMachine().getOptLevel() == 0) {
+ if (!Subtarget->hasAcquireRelease() ||
+ getTargetMachine().getOptLevel() == 0) {
// Automatically insert fences (dmb ish) around ATOMIC_SWAP etc.
InsertFencesForAtomic = true;
}
@@ -1136,14 +1144,26 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
if (Subtarget->hasNEON()) {
// vmin and vmax aren't available in a scalar form, so we use
// a NEON instruction with an undef lane instead.
- setOperationAction(ISD::FMINNAN, MVT::f16, Legal);
- setOperationAction(ISD::FMAXNAN, MVT::f16, Legal);
- setOperationAction(ISD::FMINNAN, MVT::f32, Legal);
- setOperationAction(ISD::FMAXNAN, MVT::f32, Legal);
- setOperationAction(ISD::FMINNAN, MVT::v2f32, Legal);
- setOperationAction(ISD::FMAXNAN, MVT::v2f32, Legal);
- setOperationAction(ISD::FMINNAN, MVT::v4f32, Legal);
- setOperationAction(ISD::FMAXNAN, MVT::v4f32, Legal);
+ setOperationAction(ISD::FMINIMUM, MVT::f16, Legal);
+ setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal);
+ setOperationAction(ISD::FMINIMUM, MVT::f32, Legal);
+ setOperationAction(ISD::FMAXIMUM, MVT::f32, Legal);
+ setOperationAction(ISD::FMINIMUM, MVT::v2f32, Legal);
+ setOperationAction(ISD::FMAXIMUM, MVT::v2f32, Legal);
+ setOperationAction(ISD::FMINIMUM, MVT::v4f32, Legal);
+ setOperationAction(ISD::FMAXIMUM, MVT::v4f32, Legal);
+
+ if (Subtarget->hasFullFP16()) {
+ setOperationAction(ISD::FMINNUM, MVT::v4f16, Legal);
+ setOperationAction(ISD::FMAXNUM, MVT::v4f16, Legal);
+ setOperationAction(ISD::FMINNUM, MVT::v8f16, Legal);
+ setOperationAction(ISD::FMAXNUM, MVT::v8f16, Legal);
+
+ setOperationAction(ISD::FMINIMUM, MVT::v4f16, Legal);
+ setOperationAction(ISD::FMAXIMUM, MVT::v4f16, Legal);
+ setOperationAction(ISD::FMINIMUM, MVT::v8f16, Legal);
+ setOperationAction(ISD::FMAXIMUM, MVT::v8f16, Legal);
+ }
}
// We have target-specific dag combine patterns for the following nodes:
@@ -1181,6 +1201,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
// Prefer likely predicted branches to selects on out-of-order cores.
PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder();
+ setPrefLoopAlignment(Subtarget->getPrefLoopAlignment());
+
setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2);
}
@@ -1261,6 +1283,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::FMSTAT: return "ARMISD::FMSTAT";
case ARMISD::CMOV: return "ARMISD::CMOV";
+ case ARMISD::SUBS: return "ARMISD::SUBS";
case ARMISD::SSAT: return "ARMISD::SSAT";
case ARMISD::USAT: return "ARMISD::USAT";
@@ -3052,41 +3075,8 @@ static bool allUsersAreInFunction(const Value *V, const Function *F) {
return true;
}
-/// Return true if all users of V are within some (any) function, looking through
-/// ConstantExprs. In other words, are there any global constant users?
-static bool allUsersAreInFunctions(const Value *V) {
- SmallVector<const User*,4> Worklist;
- for (auto *U : V->users())
- Worklist.push_back(U);
- while (!Worklist.empty()) {
- auto *U = Worklist.pop_back_val();
- if (isa<ConstantExpr>(U)) {
- for (auto *UU : U->users())
- Worklist.push_back(UU);
- continue;
- }
-
- if (!isa<Instruction>(U))
- return false;
- }
- return true;
-}
-
-// Return true if T is an integer, float or an array/vector of either.
-static bool isSimpleType(Type *T) {
- if (T->isIntegerTy() || T->isFloatingPointTy())
- return true;
- Type *SubT = nullptr;
- if (T->isArrayTy())
- SubT = T->getArrayElementType();
- else if (T->isVectorTy())
- SubT = T->getVectorElementType();
- else
- return false;
- return SubT->isIntegerTy() || SubT->isFloatingPointTy();
-}
-
-static SDValue promoteToConstantPool(const GlobalValue *GV, SelectionDAG &DAG,
+static SDValue promoteToConstantPool(const ARMTargetLowering *TLI,
+ const GlobalValue *GV, SelectionDAG &DAG,
EVT PtrVT, const SDLoc &dl) {
// If we're creating a pool entry for a constant global with unnamed address,
// and the global is small enough, we can emit it inline into the constant pool
@@ -3113,11 +3103,11 @@ static SDValue promoteToConstantPool(const GlobalValue *GV, SelectionDAG &DAG,
!GVar->hasLocalLinkage())
return SDValue();
- // Ensure that we don't try and inline any type that contains pointers. If
- // we inline a value that contains relocations, we move the relocations from
- // .data to .text which is not ideal.
+ // If we inline a value that contains relocations, we move the relocations
+ // from .data to .text. This is not allowed in position-independent code.
auto *Init = GVar->getInitializer();
- if (!isSimpleType(Init->getType()))
+ if ((TLI->isPositionIndependent() || TLI->getSubtarget()->isROPI()) &&
+ Init->needsRelocation())
return SDValue();
// The constant islands pass can only really deal with alignment requests
@@ -3128,7 +3118,7 @@ static SDValue promoteToConstantPool(const GlobalValue *GV, SelectionDAG &DAG,
// that are strings for simplicity.
auto *CDAInit = dyn_cast<ConstantDataArray>(Init);
unsigned Size = DAG.getDataLayout().getTypeAllocSize(Init->getType());
- unsigned Align = GVar->getAlignment();
+ unsigned Align = DAG.getDataLayout().getPreferredAlignment(GVar);
unsigned RequiredPadding = 4 - (Size % 4);
bool PaddingPossible =
RequiredPadding == 4 || (CDAInit && CDAInit->isString());
@@ -3149,12 +3139,14 @@ static SDValue promoteToConstantPool(const GlobalValue *GV, SelectionDAG &DAG,
ConstpoolPromotionMaxTotal)
return SDValue();
- // This is only valid if all users are in a single function OR it has users
- // in multiple functions but it no larger than a pointer. We also check if
- // GVar has constant (non-ConstantExpr) users. If so, it essentially has its
- // address taken.
- if (!allUsersAreInFunction(GVar, &F) &&
- !(Size <= 4 && allUsersAreInFunctions(GVar)))
+ // This is only valid if all users are in a single function; we can't clone
+ // the constant in general. The LLVM IR unnamed_addr allows merging
+ // constants, but not cloning them.
+ //
+ // We could potentially allow cloning if we could prove all uses of the
+ // constant in the current function don't care about the address, like
+ // printf format strings. But that isn't implemented for now.
+ if (!allUsersAreInFunction(GVar, &F))
return SDValue();
// We're going to inline this global. Pad it out if needed.
@@ -3182,9 +3174,11 @@ static SDValue promoteToConstantPool(const GlobalValue *GV, SelectionDAG &DAG,
bool ARMTargetLowering::isReadOnly(const GlobalValue *GV) const {
if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
- GV = GA->getBaseObject();
- return (isa<GlobalVariable>(GV) && cast<GlobalVariable>(GV)->isConstant()) ||
- isa<Function>(GV);
+ if (!(GV = GA->getBaseObject()))
+ return false;
+ if (const auto *V = dyn_cast<GlobalVariable>(GV))
+ return V->isConstant();
+ return isa<Function>(GV);
}
SDValue ARMTargetLowering::LowerGlobalAddress(SDValue Op,
@@ -3210,7 +3204,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
// promoteToConstantPool only if not generating XO text section
if (TM.shouldAssumeDSOLocal(*GV->getParent(), GV) && !Subtarget->genExecuteOnly())
- if (SDValue V = promoteToConstantPool(GV, DAG, PtrVT, dl))
+ if (SDValue V = promoteToConstantPool(this, GV, DAG, PtrVT, dl))
return V;
if (isPositionIndependent()) {
@@ -3299,9 +3293,13 @@ SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op,
assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
"ROPI/RWPI not currently supported for Windows");
+ const TargetMachine &TM = getTargetMachine();
const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
- const ARMII::TOF TargetFlags =
- (GV->hasDLLImportStorageClass() ? ARMII::MO_DLLIMPORT : ARMII::MO_NO_FLAG);
+ ARMII::TOF TargetFlags = ARMII::MO_NO_FLAG;
+ if (GV->hasDLLImportStorageClass())
+ TargetFlags = ARMII::MO_DLLIMPORT;
+ else if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
+ TargetFlags = ARMII::MO_COFFSTUB;
EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Result;
SDLoc DL(Op);
@@ -3313,7 +3311,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op,
Result = DAG.getNode(ARMISD::Wrapper, DL, PtrVT,
DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*Offset=*/0,
TargetFlags));
- if (GV->hasDLLImportStorageClass())
+ if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))
Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
MachinePointerInfo::getGOT(DAG.getMachineFunction()));
return Result;
@@ -3412,7 +3410,7 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
Op.getOperand(1), Op.getOperand(2));
}
unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
- ? ISD::FMINNAN : ISD::FMAXNAN;
+ ? ISD::FMINIMUM : ISD::FMAXIMUM;
return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
}
@@ -4832,12 +4830,24 @@ static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
return DAG.UnrollVectorOp(Op.getNode());
}
- assert(Op.getOperand(0).getValueType() == MVT::v4f32 &&
- "Invalid type for custom lowering!");
- if (VT != MVT::v4i16)
+ const bool HasFullFP16 =
+ static_cast<const ARMSubtarget&>(DAG.getSubtarget()).hasFullFP16();
+
+ EVT NewTy;
+ const EVT OpTy = Op.getOperand(0).getValueType();
+ if (OpTy == MVT::v4f32)
+ NewTy = MVT::v4i32;
+ else if (OpTy == MVT::v4f16 && HasFullFP16)
+ NewTy = MVT::v4i16;
+ else if (OpTy == MVT::v8f16 && HasFullFP16)
+ NewTy = MVT::v8i16;
+ else
+ llvm_unreachable("Invalid type for custom lowering!");
+
+ if (VT != MVT::v4i16 && VT != MVT::v8i16)
return DAG.UnrollVectorOp(Op.getNode());
- Op = DAG.getNode(Op.getOpcode(), dl, MVT::v4i32, Op.getOperand(0));
+ Op = DAG.getNode(Op.getOpcode(), dl, NewTy, Op.getOperand(0));
return DAG.getNode(ISD::TRUNCATE, dl, VT, Op);
}
@@ -4870,9 +4880,21 @@ static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
return DAG.UnrollVectorOp(Op.getNode());
}
- assert(Op.getOperand(0).getValueType() == MVT::v4i16 &&
+ assert((Op.getOperand(0).getValueType() == MVT::v4i16 ||
+ Op.getOperand(0).getValueType() == MVT::v8i16) &&
"Invalid type for custom lowering!");
- if (VT != MVT::v4f32)
+
+ const bool HasFullFP16 =
+ static_cast<const ARMSubtarget&>(DAG.getSubtarget()).hasFullFP16();
+
+ EVT DestVecType;
+ if (VT == MVT::v4f32)
+ DestVecType = MVT::v4i32;
+ else if (VT == MVT::v4f16 && HasFullFP16)
+ DestVecType = MVT::v4i16;
+ else if (VT == MVT::v8f16 && HasFullFP16)
+ DestVecType = MVT::v8i16;
+ else
return DAG.UnrollVectorOp(Op.getNode());
unsigned CastOpc;
@@ -4889,7 +4911,7 @@ static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
break;
}
- Op = DAG.getNode(CastOpc, dl, MVT::v4i32, Op.getOperand(0));
+ Op = DAG.getNode(CastOpc, dl, DestVecType, Op.getOperand(0));
return DAG.getNode(Opc, dl, VT, Op);
}
@@ -5392,10 +5414,6 @@ static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
// Compute with: cttz(x) = ctpop(lsb - 1)
- // Since we can only compute the number of bits in a byte with vcnt.8, we
- // have to gather the result with pairwise addition (vpaddl) for i16, i32,
- // and i64.
-
// Compute LSB - 1.
SDValue Bits;
if (ElemTy == MVT::i64) {
@@ -5408,32 +5426,7 @@ static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
DAG.getTargetConstant(1, dl, ElemTy));
Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
}
-
- // Count #bits with vcnt.8.
- EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
- SDValue BitsVT8 = DAG.getNode(ISD::BITCAST, dl, VT8Bit, Bits);
- SDValue Cnt8 = DAG.getNode(ISD::CTPOP, dl, VT8Bit, BitsVT8);
-
- // Gather the #bits with vpaddl (pairwise add.)
- EVT VT16Bit = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16;
- SDValue Cnt16 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT16Bit,
- DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32),
- Cnt8);
- if (ElemTy == MVT::i16)
- return Cnt16;
-
- EVT VT32Bit = VT.is64BitVector() ? MVT::v2i32 : MVT::v4i32;
- SDValue Cnt32 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT32Bit,
- DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32),
- Cnt16);
- if (ElemTy == MVT::i32)
- return Cnt32;
-
- assert(ElemTy == MVT::i64);
- SDValue Cnt64 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32),
- Cnt32);
- return Cnt64;
+ return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
}
if (!ST->hasV6T2Ops())
@@ -5443,112 +5436,37 @@ static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(ISD::CTLZ, dl, VT, rbit);
}
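
A standalone C++20 sketch (illustrative only, not part of the patch) of the identity the vector path above is built on: for non-zero x, cttz(x) equals ctpop((x & -x) - 1), since x & -x isolates the lowest set bit and subtracting one turns it into a mask of exactly cttz(x) ones.

#include <cassert>
#include <bit>
#include <cstdint>

int main() {
  for (uint32_t X : {1u, 2u, 0x50u, 0x80000000u, 0xFFFFFFFFu}) {
    uint32_t LSB = X & -X;   // lowest set bit of X
    assert(std::popcount(LSB - 1u) == std::countr_zero(X));
  }
  return 0;
}
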
-/// getCTPOP16BitCounts - Returns a v8i8/v16i8 vector containing the bit-count
-/// for each 16-bit element from operand, repeated. The basic idea is to
-/// leverage vcnt to get the 8-bit counts, gather and add the results.
-///
-/// Trace for v4i16:
-/// input = [v0 v1 v2 v3 ] (vi 16-bit element)
-/// cast: N0 = [w0 w1 w2 w3 w4 w5 w6 w7] (v0 = [w0 w1], wi 8-bit element)
-/// vcnt: N1 = [b0 b1 b2 b3 b4 b5 b6 b7] (bi = bit-count of 8-bit element wi)
-/// vrev: N2 = [b1 b0 b3 b2 b5 b4 b7 b6]
-/// [b0 b1 b2 b3 b4 b5 b6 b7]
-/// +[b1 b0 b3 b2 b5 b4 b7 b6]
-/// N3=N1+N2 = [k0 k0 k1 k1 k2 k2 k3 k3] (k0 = b0+b1 = bit-count of 16-bit v0,
-/// vuzp: = [k0 k1 k2 k3 k0 k1 k2 k3] each ki is 8-bits)
-static SDValue getCTPOP16BitCounts(SDNode *N, SelectionDAG &DAG) {
- EVT VT = N->getValueType(0);
- SDLoc DL(N);
-
- EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
- SDValue N0 = DAG.getNode(ISD::BITCAST, DL, VT8Bit, N->getOperand(0));
- SDValue N1 = DAG.getNode(ISD::CTPOP, DL, VT8Bit, N0);
- SDValue N2 = DAG.getNode(ARMISD::VREV16, DL, VT8Bit, N1);
- SDValue N3 = DAG.getNode(ISD::ADD, DL, VT8Bit, N1, N2);
- return DAG.getNode(ARMISD::VUZP, DL, VT8Bit, N3, N3);
-}
-
-/// lowerCTPOP16BitElements - Returns a v4i16/v8i16 vector containing the
-/// bit-count for each 16-bit element from the operand. We need slightly
-/// different sequencing for v4i16 and v8i16 to stay within NEON's available
-/// 64/128-bit registers.
-///
-/// Trace for v4i16:
-/// input = [v0 v1 v2 v3 ] (vi 16-bit element)
-/// v8i8: BitCounts = [k0 k1 k2 k3 k0 k1 k2 k3 ] (ki is the bit-count of vi)
-/// v8i16:Extended = [k0 k1 k2 k3 k0 k1 k2 k3 ]
-/// v4i16:Extracted = [k0 k1 k2 k3 ]
-static SDValue lowerCTPOP16BitElements(SDNode *N, SelectionDAG &DAG) {
+static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,
+ const ARMSubtarget *ST) {
EVT VT = N->getValueType(0);
SDLoc DL(N);
- SDValue BitCounts = getCTPOP16BitCounts(N, DAG);
- if (VT.is64BitVector()) {
- SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, BitCounts);
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Extended,
- DAG.getIntPtrConstant(0, DL));
- } else {
- SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8,
- BitCounts, DAG.getIntPtrConstant(0, DL));
- return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, Extracted);
- }
-}
-
-/// lowerCTPOP32BitElements - Returns a v2i32/v4i32 vector containing the
-/// bit-count for each 32-bit element from the operand. The idea here is
-/// to split the vector into 16-bit elements, leverage the 16-bit count
-/// routine, and then combine the results.
-///
-/// Trace for v2i32 (v4i32 similar with Extracted/Extended exchanged):
-/// input = [v0 v1 ] (vi: 32-bit elements)
-/// Bitcast = [w0 w1 w2 w3 ] (wi: 16-bit elements, v0 = [w0 w1])
-/// Counts16 = [k0 k1 k2 k3 ] (ki: 16-bit elements, bit-count of wi)
-/// vrev: N0 = [k1 k0 k3 k2 ]
-/// [k0 k1 k2 k3 ]
-/// N1 =+[k1 k0 k3 k2 ]
-/// [k0 k2 k1 k3 ]
-/// N2 =+[k1 k3 k0 k2 ]
-/// [k0 k2 k1 k3 ]
-/// Extended =+[k1 k3 k0 k2 ]
-/// [k0 k2 ]
-/// Extracted=+[k1 k3 ]
-///
-static SDValue lowerCTPOP32BitElements(SDNode *N, SelectionDAG &DAG) {
- EVT VT = N->getValueType(0);
- SDLoc DL(N);
+ assert(ST->hasNEON() && "Custom ctpop lowering requires NEON.");
+ assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
+ VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
+ "Unexpected type for custom ctpop lowering");
- EVT VT16Bit = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16;
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
+ SDValue Res = DAG.getBitcast(VT8Bit, N->getOperand(0));
+ Res = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Res);
- SDValue Bitcast = DAG.getNode(ISD::BITCAST, DL, VT16Bit, N->getOperand(0));
- SDValue Counts16 = lowerCTPOP16BitElements(Bitcast.getNode(), DAG);
- SDValue N0 = DAG.getNode(ARMISD::VREV32, DL, VT16Bit, Counts16);
- SDValue N1 = DAG.getNode(ISD::ADD, DL, VT16Bit, Counts16, N0);
- SDValue N2 = DAG.getNode(ARMISD::VUZP, DL, VT16Bit, N1, N1);
+  // Widen the v8i8/v16i8 CTPOP result to VT with a chain of widening pairwise adds.
+ unsigned EltSize = 8;
+ unsigned NumElts = VT.is64BitVector() ? 8 : 16;
+ while (EltSize != VT.getScalarSizeInBits()) {
+ SmallVector<SDValue, 8> Ops;
+ Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddlu, DL,
+ TLI.getPointerTy(DAG.getDataLayout())));
+ Ops.push_back(Res);
- if (VT.is64BitVector()) {
- SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, N2);
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Extended,
- DAG.getIntPtrConstant(0, DL));
- } else {
- SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, N2,
- DAG.getIntPtrConstant(0, DL));
- return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, Extracted);
+ EltSize *= 2;
+ NumElts /= 2;
+ MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
+ Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WidenVT, Ops);
}
-}
-static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,
- const ARMSubtarget *ST) {
- EVT VT = N->getValueType(0);
-
- assert(ST->hasNEON() && "Custom ctpop lowering requires NEON.");
- assert((VT == MVT::v2i32 || VT == MVT::v4i32 ||
- VT == MVT::v4i16 || VT == MVT::v8i16) &&
- "Unexpected type for custom ctpop lowering");
-
- if (VT.getVectorElementType() == MVT::i32)
- return lowerCTPOP32BitElements(N, DAG);
- else
- return lowerCTPOP16BitElements(N, DAG);
+ return Res;
}
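
For reference, the rewritten LowerCTPOP replaces the old per-type vcnt/vrev/vpaddl sequences with a single loop: take per-byte population counts, then apply unsigned widening pairwise adds (vpaddl.u8, vpaddl.u16, ...) until the lane width matches the element type. A scalar C++ sketch of the same idea, on one 32-bit "lane register" (illustrative only; the helper name is made up):

#include <cstdint>

// Per-byte counts first (the vcnt.8 / v16i8 CTPOP step), then two widening
// pairwise adds (the vpaddl.u8 and vpaddl.u16 steps) to reach a 32-bit count.
uint32_t popcount32_by_pairwise_widening(uint32_t X) {
  uint32_t Cnt8 = 0;
  for (int Byte = 0; Byte < 4; ++Byte) {
    uint8_t B = (X >> (8 * Byte)) & 0xFF;
    uint32_t C = 0;
    for (int Bit = 0; Bit < 8; ++Bit)
      C += (B >> Bit) & 1;
    Cnt8 |= C << (8 * Byte);          // four i8 counts packed side by side
  }
  uint32_t Cnt16 = (Cnt8 & 0x00FF00FFu) + ((Cnt8 >> 8) & 0x00FF00FFu);
  uint32_t Cnt32 = (Cnt16 & 0x0000FFFFu) + (Cnt16 >> 16);
  return Cnt32;
}
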
static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
@@ -7878,6 +7796,50 @@ SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG,
return LowerCallTo(CLI).first;
}
+// This is a code-size optimisation: return the original SDIV node to
+// DAGCombiner when we don't want to expand SDIV into a sequence of
+// instructions, and an empty SDValue otherwise, which causes DAGCombine
+// to expand the SDIV itself.
+SDValue
+ARMTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
+ SelectionDAG &DAG,
+ SmallVectorImpl<SDNode *> &Created) const {
+ // TODO: Support SREM
+ if (N->getOpcode() != ISD::SDIV)
+ return SDValue();
+
+ const auto &ST = static_cast<const ARMSubtarget&>(DAG.getSubtarget());
+ const auto &MF = DAG.getMachineFunction();
+ const bool MinSize = MF.getFunction().optForMinSize();
+ const bool HasDivide = ST.isThumb() ? ST.hasDivideInThumbMode()
+ : ST.hasDivideInARMMode();
+
+ // Don't touch vector types; rewriting this may lead to scalarizing
+ // the int divs.
+ if (N->getOperand(0).getValueType().isVector())
+ return SDValue();
+
+ // Bail if MinSize is not set, and also for both ARM and Thumb mode we need
+ // hwdiv support for this to be really profitable.
+ if (!(MinSize && HasDivide))
+ return SDValue();
+
+ // ARM mode is a bit simpler than Thumb: we can handle large power
+ // of 2 immediates with 1 mov instruction; no further checks required,
+ // just return the sdiv node.
+ if (!ST.isThumb())
+ return SDValue(N, 0);
+
+  // In Thumb mode, immediates larger than 128 need a wide 4-byte MOV,
+  // and thus lose the code-size benefit of a 2-byte MOVS.
+  // TargetTransformInfo's 'getIntImmCodeSizeCost' could be queried here,
+  // but since it performs exactly this check, it's not worth the trouble
+  // of obtaining TTI.
+ if (Divisor.sgt(128))
+ return SDValue();
+
+ return SDValue(N, 0);
+}
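
For context on the cost being weighed here, a scalar sketch of roughly what the generic DAGCombine expansion of a signed divide by 2^K amounts to (assumes arithmetic right shift of signed ints, as on ARM targets, and K < 31; the helper name is made up). When MinSize is set and the core has hardware divide, a MOV of the divisor plus a single SDIV can be smaller than this bias-and-shift sequence, which is what the early returns above are weighing:

#include <cstdint>

// x / 2^K rounded toward zero: add 2^K-1 to negative inputs, then shift.
int32_t sdiv_by_pow2(int32_t X, unsigned K) {
  int32_t Bias = (X >> 31) & (int32_t)((1u << K) - 1); // 2^K-1 only if X < 0
  return (X + Bias) >> K;
}
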
+
SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG,
bool Signed) const {
assert(Op.getValueType() == MVT::i32 &&
@@ -7990,10 +7952,8 @@ static void ReplaceCMP_SWAP_64Results(SDNode *N,
ARM::CMP_SWAP_64, SDLoc(N),
DAG.getVTList(MVT::Untyped, MVT::i32, MVT::Other), Ops);
- MachineFunction &MF = DAG.getMachineFunction();
- MachineSDNode::mmo_iterator MemOp = MF.allocateMemRefsArray(1);
- MemOp[0] = cast<MemSDNode>(N)->getMemOperand();
- cast<MachineSDNode>(CmpSwap)->setMemRefs(MemOp, MemOp + 1);
+ MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
+ DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
bool isBigEndian = DAG.getDataLayout().isBigEndian();
@@ -9169,6 +9129,8 @@ ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI,
// IP.
switch (TM.getCodeModel()) {
+ case CodeModel::Tiny:
+ llvm_unreachable("Tiny code model not available on ARM.");
case CodeModel::Small:
case CodeModel::Medium:
case CodeModel::Kernel:
@@ -9244,6 +9206,42 @@ ARMTargetLowering::EmitLowered__dbzchk(MachineInstr &MI,
return ContBB;
}
+// The CPSR operand of SelectItr might be missing a kill marker
+// because there were multiple uses of CPSR, and ISel didn't know
+// which to mark. Figure out whether SelectItr should have had a
+// kill marker, and set it if it should. Returns the correct kill
+// marker value.
+static bool checkAndUpdateCPSRKill(MachineBasicBlock::iterator SelectItr,
+ MachineBasicBlock* BB,
+ const TargetRegisterInfo* TRI) {
+ // Scan forward through BB for a use/def of CPSR.
+ MachineBasicBlock::iterator miI(std::next(SelectItr));
+ for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
+ const MachineInstr& mi = *miI;
+ if (mi.readsRegister(ARM::CPSR))
+ return false;
+ if (mi.definesRegister(ARM::CPSR))
+ break; // Should have kill-flag - update below.
+ }
+
+ // If we hit the end of the block, check whether CPSR is live into a
+ // successor.
+ if (miI == BB->end()) {
+ for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
+ sEnd = BB->succ_end();
+ sItr != sEnd; ++sItr) {
+ MachineBasicBlock* succ = *sItr;
+ if (succ->isLiveIn(ARM::CPSR))
+ return false;
+ }
+ }
+
+ // We found a def, or hit the end of the basic block and CPSR wasn't live
+ // out. SelectMI should have a kill flag on CPSR.
+ SelectItr->addRegisterKilled(ARM::CPSR, TRI);
+ return true;
+}
+
MachineBasicBlock *
ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const {
@@ -9343,6 +9341,14 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
F->insert(It, copy0MBB);
F->insert(It, sinkMBB);
+ // Check whether CPSR is live past the tMOVCCr_pseudo.
+ const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
+ if (!MI.killsRegister(ARM::CPSR) &&
+ !checkAndUpdateCPSRKill(MI, thisMBB, TRI)) {
+ copy0MBB->addLiveIn(ARM::CPSR);
+ sinkMBB->addLiveIn(ARM::CPSR);
+ }
+
// Transfer the remainder of BB and its successor edges to sinkMBB.
sinkMBB->splice(sinkMBB->begin(), BB,
std::next(MachineBasicBlock::iterator(MI)), BB->end());
@@ -10407,6 +10413,37 @@ static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
return SDValue();
}
+bool
+ARMTargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
+ CombineLevel Level) const {
+ if (Level == BeforeLegalizeTypes)
+ return true;
+
+ if (Subtarget->isThumb() && Subtarget->isThumb1Only())
+ return true;
+
+ if (N->getOpcode() != ISD::SHL)
+ return true;
+
+ // Turn off commute-with-shift transform after legalization, so it doesn't
+ // conflict with PerformSHLSimplify. (We could try to detect when
+ // PerformSHLSimplify would trigger more precisely, but it isn't
+ // really necessary.)
+ return false;
+}
+
+bool
+ARMTargetLowering::shouldFoldShiftPairToMask(const SDNode *N,
+ CombineLevel Level) const {
+ if (!Subtarget->isThumb1Only())
+ return true;
+
+ if (Level == BeforeLegalizeTypes)
+ return true;
+
+ return false;
+}
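
In scalar terms, the DAGCombiner fold that isDesirableToCommuteWithShift gates is an identity of the shape below (wrapping unsigned arithmetic); declining it after legalization keeps the "(shl (add x, c1), c2)" form around for PerformSHLSimplify to match. A minimal illustration (function name made up):

#include <cstdint>

// (X + C1) << C2  ==  (X << C2) + (C1 << C2)   (mod 2^32)
uint32_t commuted_shift(uint32_t X, uint32_t C1, unsigned C2) {
  return (X << C2) + (C1 << C2);
}
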
+
static SDValue PerformSHLSimplify(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
const ARMSubtarget *ST) {
@@ -10506,9 +10543,7 @@ static SDValue PerformSHLSimplify(SDNode *N,
LLVM_DEBUG(dbgs() << "Simplify shl use:\n"; SHL.getOperand(0).dump();
SHL.dump(); N->dump());
LLVM_DEBUG(dbgs() << "Into:\n"; X.dump(); BinOp.dump(); Res.dump());
-
- DAG.ReplaceAllUsesWith(SDValue(N, 0), Res);
- return SDValue(N, 0);
+ return Res;
}
@@ -10712,6 +10747,12 @@ static SDValue CombineANDShift(SDNode *N,
if (!C2 || C2 >= 32)
return SDValue();
+ // Clear irrelevant bits in the mask.
+ if (LeftShift)
+ C1 &= (-1U << C2);
+ else
+ C1 &= (-1U >> C2);
+
SelectionDAG &DAG = DCI.DAG;
SDLoc DL(N);
@@ -10719,9 +10760,7 @@ static SDValue CombineANDShift(SDNode *N,
// "(and (srl x, c2) c1)", where c1 is a shifted mask. Try to
// transform to a pair of shifts, to save materializing c1.
- // First pattern: right shift, and c1+1 is a power of two.
- // FIXME: Also check reversed pattern (left shift, and ~c1+1 is a power
- // of two).
+ // First pattern: right shift, then mask off leading bits.
// FIXME: Use demanded bits?
if (!LeftShift && isMask_32(C1)) {
uint32_t C3 = countLeadingZeros(C1);
@@ -10733,13 +10772,23 @@ static SDValue CombineANDShift(SDNode *N,
}
}
- // Second pattern: left shift, and (c1>>c2)+1 is a power of two.
- // FIXME: Also check reversed pattern (right shift, and ~(c1<<c2)+1
- // is a power of two).
+ // First pattern, reversed: left shift, then mask off trailing bits.
+ if (LeftShift && isMask_32(~C1)) {
+ uint32_t C3 = countTrailingZeros(C1);
+ if (C2 < C3) {
+ SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0),
+ DAG.getConstant(C3 - C2, DL, MVT::i32));
+ return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL,
+ DAG.getConstant(C3, DL, MVT::i32));
+ }
+ }
+
+ // Second pattern: left shift, then mask off leading bits.
// FIXME: Use demanded bits?
if (LeftShift && isShiftedMask_32(C1)) {
+ uint32_t Trailing = countTrailingZeros(C1);
uint32_t C3 = countLeadingZeros(C1);
- if (C2 + C3 < 32 && C1 == ((-1U << (C2 + C3)) >> C3)) {
+ if (Trailing == C2 && C2 + C3 < 32) {
SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
DAG.getConstant(C2 + C3, DL, MVT::i32));
return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
@@ -10747,6 +10796,19 @@ static SDValue CombineANDShift(SDNode *N,
}
}
+ // Second pattern, reversed: right shift, then mask off trailing bits.
+ // FIXME: Handle other patterns of known/demanded bits.
+ if (!LeftShift && isShiftedMask_32(C1)) {
+ uint32_t Leading = countLeadingZeros(C1);
+ uint32_t C3 = countTrailingZeros(C1);
+ if (Leading == C2 && C2 + C3 < 32) {
+ SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0),
+ DAG.getConstant(C2 + C3, DL, MVT::i32));
+ return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL,
+ DAG.getConstant(C3, DL, MVT::i32));
+ }
+ }
+
// FIXME: Transform "(and (shl x, c2) c1)" ->
// "(shl (and x, c1>>c2), c2)" if "c1 >> c2" is a cheaper immediate than
// c1.
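
A worked instance of the first pattern (whose body is elided in this hunk) may help when checking the new reversed variants: with c2 = 3 and c1 = 0xFF, C3 = countLeadingZeros(0xFF) = 24, so "(x >> 3) & 0xFF" becomes "(x << 21) >> 24", isolating bits [10:3] without materialising the mask. Illustrative C++ (helper name made up):

#include <cassert>
#include <cstdint>

uint32_t and_of_srl_via_shift_pair(uint32_t X) {
  uint32_t Masked = (X >> 3) & 0xFF;    // original form
  uint32_t Shifted = (X << 21) >> 24;   // lsls #21 then lsrs #24
  assert(Masked == Shifted);
  return Shifted;
}
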
@@ -11541,8 +11603,15 @@ static SDValue CombineBaseUpdate(SDNode *N,
continue;
// Check that the add is independent of the load/store. Otherwise, folding
- // it would create a cycle.
- if (User->isPredecessorOf(N) || N->isPredecessorOf(User))
+ // it would create a cycle. We can avoid searching through Addr as it's a
+ // predecessor to both.
+ SmallPtrSet<const SDNode *, 32> Visited;
+ SmallVector<const SDNode *, 16> Worklist;
+ Visited.insert(Addr.getNode());
+ Worklist.push_back(N);
+ Worklist.push_back(User);
+ if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
+ SDNode::hasPredecessorHelper(User, Visited, Worklist))
continue;
// Find the new opcode for the updating load/store.
@@ -12507,8 +12576,7 @@ SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &D
// Lastly, can we determine that the bits defined by OrCI
// are zero in Y?
- KnownBits Known;
- DAG.computeKnownBits(Y, Known);
+ KnownBits Known = DAG.computeKnownBits(Y);
if ((OrCI & Known.Zero) != OrCI)
return SDValue();
@@ -12679,30 +12747,38 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
DAG.getConstant(1, dl, MVT::i32), Neg.getValue(1));
Res = DAG.getNode(ISD::ADDCARRY, dl, VTs, Sub, Neg, Carry);
}
- } else if (CC == ARMCC::NE && LHS != RHS &&
+ } else if (CC == ARMCC::NE && !isNullConstant(RHS) &&
(!Subtarget->isThumb1Only() || isPowerOf2Constant(TrueVal))) {
// This seems pointless but will allow us to combine it further below.
- // CMOV 0, z, !=, (CMPZ x, y) -> CMOV (SUB x, y), z, !=, (CMPZ x, y)
- SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
+ // CMOV 0, z, !=, (CMPZ x, y) -> CMOV (SUBS x, y), z, !=, (SUBS x, y):1
+ SDValue Sub =
+ DAG.getNode(ARMISD::SUBS, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
+ SDValue CPSRGlue = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR,
+ Sub.getValue(1), SDValue());
Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, TrueVal, ARMcc,
- N->getOperand(3), Cmp);
+ N->getOperand(3), CPSRGlue.getValue(1));
+ FalseVal = Sub;
}
} else if (isNullConstant(TrueVal)) {
- if (CC == ARMCC::EQ && LHS != RHS &&
+ if (CC == ARMCC::EQ && !isNullConstant(RHS) &&
(!Subtarget->isThumb1Only() || isPowerOf2Constant(FalseVal))) {
// This seems pointless but will allow us to combine it further below
// Note that we change == for != as this is the dual for the case above.
- // CMOV z, 0, ==, (CMPZ x, y) -> CMOV (SUB x, y), z, !=, (CMPZ x, y)
- SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
+ // CMOV z, 0, ==, (CMPZ x, y) -> CMOV (SUBS x, y), z, !=, (SUBS x, y):1
+ SDValue Sub =
+ DAG.getNode(ARMISD::SUBS, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
+ SDValue CPSRGlue = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR,
+ Sub.getValue(1), SDValue());
Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, FalseVal,
DAG.getConstant(ARMCC::NE, dl, MVT::i32),
- N->getOperand(3), Cmp);
+ N->getOperand(3), CPSRGlue.getValue(1));
+ FalseVal = Sub;
}
}
// On Thumb1, the DAG above may be further combined if z is a power of 2
// (z == 2 ^ K).
- // CMOV (SUB x, y), z, !=, (CMPZ x, y) ->
+ // CMOV (SUBS x, y), z, !=, (SUBS x, y):1 ->
// merge t3, t4
// where t1 = (SUBCARRY (SUB x, y), z, 0)
// t2 = (SUBCARRY (SUB x, y), t1:0, t1:1)
@@ -12710,8 +12786,8 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
// t4 = (SUB 1, t2:1) [ we want a carry, not a borrow ]
const APInt *TrueConst;
if (Subtarget->isThumb1Only() && CC == ARMCC::NE &&
- (FalseVal.getOpcode() == ISD::SUB) && (FalseVal.getOperand(0) == LHS) &&
- (FalseVal.getOperand(1) == RHS) &&
+ (FalseVal.getOpcode() == ARMISD::SUBS) &&
+ (FalseVal.getOperand(0) == LHS) && (FalseVal.getOperand(1) == RHS) &&
(TrueConst = isPowerOf2Constant(TrueVal))) {
SDVTList VTs = DAG.getVTList(VT, MVT::i32);
unsigned ShiftAmount = TrueConst->logBase2();
@@ -12730,8 +12806,7 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
}
if (Res.getNode()) {
- KnownBits Known;
- DAG.computeKnownBits(SDValue(N,0), Known);
+ KnownBits Known = DAG.computeKnownBits(SDValue(N,0));
// Capture demanded bits information that would be otherwise lost.
if (Known.Zero == 0xfffffffe)
Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
@@ -13522,12 +13597,11 @@ void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
break;
case ARMISD::CMOV: {
// Bits are known zero/one if known on the LHS and RHS.
- DAG.computeKnownBits(Op.getOperand(0), Known, Depth+1);
+ Known = DAG.computeKnownBits(Op.getOperand(0), Depth+1);
if (Known.isUnknown())
return;
- KnownBits KnownRHS;
- DAG.computeKnownBits(Op.getOperand(1), KnownRHS, Depth+1);
+ KnownBits KnownRHS = DAG.computeKnownBits(Op.getOperand(1), Depth+1);
Known.Zero &= KnownRHS.Zero;
Known.One &= KnownRHS.One;
return;
@@ -13549,7 +13623,7 @@ void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
case ARMISD::BFI: {
// Conservatively, we can recurse down the first operand
// and just mask out all affected bits.
- DAG.computeKnownBits(Op.getOperand(0), Known, Depth + 1);
+ Known = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
// The operand to BFI is already a mask suitable for removing the bits it
// sets.
@@ -13559,9 +13633,120 @@ void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
Known.One &= Mask;
return;
}
+ case ARMISD::VGETLANEs:
+ case ARMISD::VGETLANEu: {
+ const SDValue &SrcSV = Op.getOperand(0);
+ EVT VecVT = SrcSV.getValueType();
+ assert(VecVT.isVector() && "VGETLANE expected a vector type");
+ const unsigned NumSrcElts = VecVT.getVectorNumElements();
+ ConstantSDNode *Pos = cast<ConstantSDNode>(Op.getOperand(1).getNode());
+ assert(Pos->getAPIntValue().ult(NumSrcElts) &&
+ "VGETLANE index out of bounds");
+ unsigned Idx = Pos->getZExtValue();
+ APInt DemandedElt = APInt::getOneBitSet(NumSrcElts, Idx);
+ Known = DAG.computeKnownBits(SrcSV, DemandedElt, Depth + 1);
+
+ EVT VT = Op.getValueType();
+ const unsigned DstSz = VT.getScalarSizeInBits();
+ const unsigned SrcSz = VecVT.getVectorElementType().getSizeInBits();
+ assert(SrcSz == Known.getBitWidth());
+ assert(DstSz > SrcSz);
+ if (Op.getOpcode() == ARMISD::VGETLANEs)
+ Known = Known.sext(DstSz);
+ else {
+ Known = Known.zext(DstSz);
+ Known.Zero.setBitsFrom(SrcSz);
+ }
+ assert(DstSz == Known.getBitWidth());
+ break;
+ }
}
}
+bool
+ARMTargetLowering::targetShrinkDemandedConstant(SDValue Op,
+ const APInt &DemandedAPInt,
+ TargetLoweringOpt &TLO) const {
+  // Delay this optimization until operations are legal, so we don't have to
+  // deal with illegal types or block other optimizations.
+ if (!TLO.LegalOps)
+ return false;
+
+ // Only optimize AND for now.
+ if (Op.getOpcode() != ISD::AND)
+ return false;
+
+ EVT VT = Op.getValueType();
+
+ // Ignore vectors.
+ if (VT.isVector())
+ return false;
+
+ assert(VT == MVT::i32 && "Unexpected integer type");
+
+ // Make sure the RHS really is a constant.
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+ if (!C)
+ return false;
+
+ unsigned Mask = C->getZExtValue();
+
+ unsigned Demanded = DemandedAPInt.getZExtValue();
+ unsigned ShrunkMask = Mask & Demanded;
+ unsigned ExpandedMask = Mask | ~Demanded;
+
+ // If the mask is all zeros, let the target-independent code replace the
+ // result with zero.
+ if (ShrunkMask == 0)
+ return false;
+
+ // If the mask is all ones, erase the AND. (Currently, the target-independent
+ // code won't do this, so we have to do it explicitly to avoid an infinite
+ // loop in obscure cases.)
+ if (ExpandedMask == ~0U)
+ return TLO.CombineTo(Op, Op.getOperand(0));
+
+ auto IsLegalMask = [ShrunkMask, ExpandedMask](unsigned Mask) -> bool {
+ return (ShrunkMask & Mask) == ShrunkMask && (~ExpandedMask & Mask) == 0;
+ };
+ auto UseMask = [Mask, Op, VT, &TLO](unsigned NewMask) -> bool {
+ if (NewMask == Mask)
+ return true;
+ SDLoc DL(Op);
+ SDValue NewC = TLO.DAG.getConstant(NewMask, DL, VT);
+ SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
+ return TLO.CombineTo(Op, NewOp);
+ };
+
+ // Prefer uxtb mask.
+ if (IsLegalMask(0xFF))
+ return UseMask(0xFF);
+
+ // Prefer uxth mask.
+ if (IsLegalMask(0xFFFF))
+ return UseMask(0xFFFF);
+
+ // [1, 255] is Thumb1 movs+ands, legal immediate for ARM/Thumb2.
+ // FIXME: Prefer a contiguous sequence of bits for other optimizations.
+ if (ShrunkMask < 256)
+ return UseMask(ShrunkMask);
+
+ // [-256, -2] is Thumb1 movs+bics, legal immediate for ARM/Thumb2.
+ // FIXME: Prefer a contiguous sequence of bits for other optimizations.
+ if ((int)ExpandedMask <= -2 && (int)ExpandedMask >= -256)
+ return UseMask(ExpandedMask);
+
+ // Potential improvements:
+ //
+ // We could try to recognize lsls+lsrs or lsrs+lsls pairs here.
+ // We could try to prefer Thumb1 immediates which can be lowered to a
+ // two-instruction sequence.
+ // We could try to recognize more legal ARM/Thumb2 immediates here.
+
+ return false;
+}
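
A compressed scalar restatement of the selection logic above may help when reading the lambdas (hypothetical helper; the ShrunkMask == 0 bail-out and the CombineTo plumbing are omitted):

#include <cstdint>

// Returns the AND constant the hook would prefer for a given mask and
// demanded-bits set, ~0u meaning "drop the AND entirely", or the original
// mask if no rewrite applies.
uint32_t preferredAndMask(uint32_t Mask, uint32_t Demanded) {
  uint32_t Shrunk = Mask & Demanded;
  uint32_t Expanded = Mask | ~Demanded;
  auto IsLegal = [&](uint32_t M) {
    return (Shrunk & M) == Shrunk && (~Expanded & M) == 0;
  };
  if (Expanded == ~0u)
    return ~0u;                      // the AND can be erased entirely
  if (IsLegal(0xFF))
    return 0xFF;                     // uxtb
  if (IsLegal(0xFFFF))
    return 0xFFFF;                   // uxth
  if (Shrunk < 256)
    return Shrunk;                   // Thumb1 movs+ands, ARM/Thumb2 modified imm
  if ((int32_t)Expanded <= -2 && (int32_t)Expanded >= -256)
    return Expanded;                 // Thumb1 movs+bics
  return Mask;
}

For example, "(x & 0xF0)" with only bits [11:4] demanded yields 0xFF, i.e. a plain uxtb, because the extra low bits the wider mask keeps are never observed.
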
+
+
//===----------------------------------------------------------------------===//
// ARM Inline Assembly Support
//===----------------------------------------------------------------------===//
@@ -14412,16 +14597,18 @@ ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
: AtomicExpansionKind::None;
}
-bool ARMTargetLowering::shouldExpandAtomicCmpXchgInIR(
- AtomicCmpXchgInst *AI) const {
+TargetLowering::AtomicExpansionKind
+ARMTargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const {
// At -O0, fast-regalloc cannot cope with the live vregs necessary to
// implement cmpxchg without spilling. If the address being exchanged is also
// on the stack and close enough to the spill slot, this can lead to a
// situation where the monitor always gets cleared and the atomic operation
// can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
- bool hasAtomicCmpXchg =
+ bool HasAtomicCmpXchg =
!Subtarget->isThumb() || Subtarget->hasV8MBaselineOps();
- return getTargetMachine().getOptLevel() != 0 && hasAtomicCmpXchg;
+ if (getTargetMachine().getOptLevel() != 0 && HasAtomicCmpXchg)
+ return AtomicExpansionKind::LLSC;
+ return AtomicExpansionKind::None;
}
bool ARMTargetLowering::shouldInsertFencesForAtomic(
@@ -14548,6 +14735,11 @@ Value *ARMTargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val,
Addr});
}
+
+bool ARMTargetLowering::alignLoopsWithOptSize() const {
+ return Subtarget->isMClass();
+}
+
/// A helper function for determining the number of interleaved accesses we
/// will generate when lowering accesses of the given type.
unsigned
diff --git a/contrib/llvm/lib/Target/ARM/ARMISelLowering.h b/contrib/llvm/lib/Target/ARM/ARMISelLowering.h
index 50b4c2977fb5..7a9fc739fc13 100644
--- a/contrib/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/contrib/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -85,6 +85,7 @@ class VectorType;
FMSTAT, // ARM fmstat instruction.
CMOV, // ARM conditional move instructions.
+ SUBS, // Flag-setting subtraction.
SSAT, // Signed saturation
USAT, // Unsigned saturation
@@ -389,6 +390,9 @@ class VectorType;
const SelectionDAG &DAG,
unsigned Depth) const override;
+ bool targetShrinkDemandedConstant(SDValue Op, const APInt &Demanded,
+ TargetLoweringOpt &TLO) const override;
+
bool ExpandInlineAsm(CallInst *CI) const override;
@@ -535,7 +539,8 @@ class VectorType;
bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
TargetLoweringBase::AtomicExpansionKind
shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
- bool shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override;
+ TargetLoweringBase::AtomicExpansionKind
+ shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override;
bool useLoadStackGuardNode() const override;
@@ -572,6 +577,8 @@ class VectorType;
bool isLegalInterleavedAccessType(VectorType *VecTy,
const DataLayout &DL) const;
+ bool alignLoopsWithOptSize() const override;
+
/// Returns the number of interleaved accesses that will be generated when
/// lowering accesses of the given type.
unsigned getNumInterleavedAccesses(VectorType *VecTy,
@@ -583,6 +590,11 @@ class VectorType;
unsigned getABIAlignmentForCallingConv(Type *ArgTy,
DataLayout DL) const override;
+ bool isDesirableToCommuteWithShift(const SDNode *N,
+ CombineLevel Level) const override;
+
+ bool shouldFoldShiftPairToMask(const SDNode *N,
+ CombineLevel Level) const override;
protected:
std::pair<const TargetRegisterClass *, uint8_t>
findRepresentativeClass(const TargetRegisterInfo *TRI,
@@ -685,6 +697,9 @@ class VectorType;
unsigned getRegisterByName(const char* RegName, EVT VT,
SelectionDAG &DAG) const override;
+ SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
+ SmallVectorImpl<SDNode *> &Created) const override;
+
/// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
/// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
/// expanded to FMAs when this method returns true, otherwise fmuladd is
@@ -764,6 +779,8 @@ class VectorType;
bool mayBeEmittedAsTailCall(const CallInst *CI) const override;
+ bool shouldConsiderGEPOffsetSplit() const override { return true; }
+
SDValue getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal, SDValue TrueVal,
SDValue ARMcc, SDValue CCR, SDValue Cmp,
SelectionDAG &DAG) const;
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrFormats.td b/contrib/llvm/lib/Target/ARM/ARMInstrFormats.td
index 1d3b1414f090..0df48ba61299 100644
--- a/contrib/llvm/lib/Target/ARM/ARMInstrFormats.td
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrFormats.td
@@ -2580,6 +2580,37 @@ class N3VLaneCP8<bit op23, bits<2> op21_20, bit op6, bit op4,
let Inst{3-0} = Vm{3-0};
}
+// In Armv8.2-A, some NEON instructions are added that encode Vn and Vm
+// differently:
+// if Q == ‘1’ then UInt(N:Vn) else UInt(Vn:N);
+// if Q == ‘1’ then UInt(M:Vm) else UInt(Vm:M);
+// Class N3VCP8 above describes the Q=1 case, and this class the Q=0 case.
+class N3VCP8Q0<bits<2> op24_23, bits<2> op21_20, bit op6, bit op4,
+ dag oops, dag iops, InstrItinClass itin,
+ string opc, string dt, string asm, string cstr, list<dag> pattern>
+ : NeonInp<oops, iops, AddrModeNone, IndexModeNone, N3RegCplxFrm, itin, opc, dt, asm, cstr, pattern> {
+ bits<5> Vd;
+ bits<5> Vn;
+ bits<5> Vm;
+
+ let DecoderNamespace = "VFPV8";
+ // These have the same encodings in ARM and Thumb2
+ let PostEncoderMethod = "";
+
+ let Inst{31-25} = 0b1111110;
+ let Inst{24-23} = op24_23;
+ let Inst{22} = Vd{4};
+ let Inst{21-20} = op21_20;
+ let Inst{19-16} = Vn{4-1};
+ let Inst{15-12} = Vd{3-0};
+ let Inst{11-8} = 0b1000;
+ let Inst{7} = Vn{0};
+ let Inst{6} = op6;
+ let Inst{5} = Vm{0};
+ let Inst{4} = op4;
+ let Inst{3-0} = Vm{4-1};
+}
+
// Operand types for complex instructions
class ComplexRotationOperand<int Angle, int Remainder, string Type, string Diag>
: AsmOperandClass {
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp
index 397c9dadb4ac..bcc31f5fa4cc 100644
--- a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp
@@ -132,34 +132,6 @@ void ARMInstrInfo::expandLoadStackGuard(MachineBasicBlock::iterator MI) const {
BuildMI(MBB, MI, DL, get(ARM::LDRi12), Reg)
.addReg(Reg, RegState::Kill)
.addImm(0)
- .setMemRefs(MI->memoperands_begin(), MI->memoperands_end())
+ .cloneMemRefs(*MI)
.add(predOps(ARMCC::AL));
}
-
-std::pair<unsigned, unsigned>
-ARMInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
- const unsigned Mask = ARMII::MO_OPTION_MASK;
- return std::make_pair(TF & Mask, TF & ~Mask);
-}
-
-ArrayRef<std::pair<unsigned, const char *>>
-ARMInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
- using namespace ARMII;
-
- static const std::pair<unsigned, const char *> TargetFlags[] = {
- {MO_LO16, "arm-lo16"}, {MO_HI16, "arm-hi16"}};
- return makeArrayRef(TargetFlags);
-}
-
-ArrayRef<std::pair<unsigned, const char *>>
-ARMInstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
- using namespace ARMII;
-
- static const std::pair<unsigned, const char *> TargetFlags[] = {
- {MO_GOT, "arm-got"},
- {MO_SBREL, "arm-sbrel"},
- {MO_DLLIMPORT, "arm-dllimport"},
- {MO_SECREL, "arm-secrel"},
- {MO_NONLAZY, "arm-nonlazy"}};
- return makeArrayRef(TargetFlags);
-}
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.h b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.h
index c54c987134df..c87fb97448c9 100644
--- a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.h
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.h
@@ -38,13 +38,6 @@ public:
///
const ARMRegisterInfo &getRegisterInfo() const override { return RI; }
- std::pair<unsigned, unsigned>
- decomposeMachineOperandsTargetFlags(unsigned TF) const override;
- ArrayRef<std::pair<unsigned, const char *>>
- getSerializableDirectMachineOperandTargetFlags() const override;
- ArrayRef<std::pair<unsigned, const char *>>
- getSerializableBitmaskMachineOperandTargetFlags() const override;
-
private:
void expandLoadStackGuard(MachineBasicBlock::iterator MI) const override;
};
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td
index d4c342cee5c0..13abdc9687ec 100644
--- a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td
@@ -144,6 +144,7 @@ def ARMintretflag : SDNode<"ARMISD::INTRET_FLAG", SDT_ARMcall,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
def ARMcmov : SDNode<"ARMISD::CMOV", SDT_ARMCMov,
[SDNPInGlue]>;
+def ARMsubs : SDNode<"ARMISD::SUBS", SDTIntBinOp, [SDNPOutGlue]>;
def ARMssatnoshift : SDNode<"ARMISD::SSAT", SDTIntSatNoShOp, []>;
@@ -221,6 +222,7 @@ def HasV4T : Predicate<"Subtarget->hasV4TOps()">,
def NoV4T : Predicate<"!Subtarget->hasV4TOps()">;
def HasV5T : Predicate<"Subtarget->hasV5TOps()">,
AssemblerPredicate<"HasV5TOps", "armv5t">;
+def NoV5T : Predicate<"!Subtarget->hasV5TOps()">;
def HasV5TE : Predicate<"Subtarget->hasV5TEOps()">,
AssemblerPredicate<"HasV5TEOps", "armv5te">;
def HasV6 : Predicate<"Subtarget->hasV6Ops()">,
@@ -255,6 +257,8 @@ def HasV8_3a : Predicate<"Subtarget->hasV8_3aOps()">,
AssemblerPredicate<"HasV8_3aOps", "armv8.3a">;
def HasV8_4a : Predicate<"Subtarget->hasV8_4aOps()">,
AssemblerPredicate<"HasV8_4aOps", "armv8.4a">;
+def HasV8_5a : Predicate<"Subtarget->hasV8_5aOps()">,
+ AssemblerPredicate<"HasV8_5aOps", "armv8.5a">;
def NoVFP : Predicate<"!Subtarget->hasVFP2()">;
def HasVFP2 : Predicate<"Subtarget->hasVFP2()">,
AssemblerPredicate<"FeatureVFP2", "VFP2">;
@@ -285,6 +289,8 @@ def HasFP16 : Predicate<"Subtarget->hasFP16()">,
AssemblerPredicate<"FeatureFP16","half-float conversions">;
def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">,
AssemblerPredicate<"FeatureFullFP16","full half-float">;
+def HasFP16FML : Predicate<"Subtarget->hasFP16FML()">,
+ AssemblerPredicate<"FeatureFP16FML","full half-float fml">;
def HasDivideInThumb : Predicate<"Subtarget->hasDivideInThumbMode()">,
AssemblerPredicate<"FeatureHWDivThumb", "divide in THUMB">;
def HasDivideInARM : Predicate<"Subtarget->hasDivideInARMMode()">,
@@ -351,23 +357,24 @@ def UseNegativeImmediates :
let RecomputePerFunction = 1 in {
def UseMovt : Predicate<"Subtarget->useMovt(*MF)">;
def DontUseMovt : Predicate<"!Subtarget->useMovt(*MF)">;
- def UseMovtInPic : Predicate<"Subtarget->useMovt(*MF) && Subtarget->allowPositionIndependentMovt()">;
- def DontUseMovtInPic : Predicate<"!Subtarget->useMovt(*MF) || !Subtarget->allowPositionIndependentMovt()">;
+ def UseMovtInPic : Predicate<"Subtarget->useMovt(*MF) && Subtarget->allowPositionIndependentMovt()">;
+ def DontUseMovtInPic : Predicate<"!Subtarget->useMovt(*MF) || !Subtarget->allowPositionIndependentMovt()">;
+
+ def UseFPVMLx: Predicate<"((Subtarget->useFPVMLx() &&"
+ " TM.Options.AllowFPOpFusion != FPOpFusion::Fast) ||"
+ "MF->getFunction().optForMinSize())">;
}
-def UseFPVMLx : Predicate<"Subtarget->useFPVMLx()">;
def UseMulOps : Predicate<"Subtarget->useMulOps()">;
// Prefer fused MAC for fp mul + add over fp VMLA / VMLS if they are available.
-// But only select them if more precision in FP computation is allowed.
+// But only select them if more precision in FP computation is allowed, and when
+// they are not slower than a mul + add sequence.
// Do not use them for Darwin platforms.
def UseFusedMAC : Predicate<"(TM.Options.AllowFPOpFusion =="
" FPOpFusion::Fast && "
" Subtarget->hasVFP4()) && "
- "!Subtarget->isTargetDarwin()">;
-def DontUseFusedMAC : Predicate<"!(TM.Options.AllowFPOpFusion =="
- " FPOpFusion::Fast &&"
- " Subtarget->hasVFP4()) || "
- "Subtarget->isTargetDarwin()">;
+ "!Subtarget->isTargetDarwin() &&"
+ "Subtarget->useFPVMLx()">;
def HasFastVGETLNi32 : Predicate<"!Subtarget->hasSlowVGETLNi32()">;
def HasSlowVGETLNi32 : Predicate<"Subtarget->hasSlowVGETLNi32()">;
@@ -387,6 +394,10 @@ let RecomputePerFunction = 1 in {
def GenExecuteOnly : Predicate<"Subtarget->genExecuteOnly()">;
+// Armv8.5-A extensions
+def HasSB : Predicate<"Subtarget->hasSB()">,
+ AssemblerPredicate<"FeatureSB", "sb">;
+
//===----------------------------------------------------------------------===//
// ARM Flag Definitions.
@@ -415,24 +426,22 @@ def imm16_31 : ImmLeaf<i32, [{
// sext_16_node predicate - True if the SDNode is sign-extended 16 or more bits.
def sext_16_node : PatLeaf<(i32 GPR:$a), [{
- if (CurDAG->ComputeNumSignBits(SDValue(N,0)) >= 17)
- return true;
-
- if (N->getOpcode() != ISD::SRA)
- return false;
- if (N->getOperand(0).getOpcode() != ISD::SHL)
- return false;
-
- auto *ShiftVal = dyn_cast<ConstantSDNode>(N->getOperand(1));
- if (!ShiftVal || ShiftVal->getZExtValue() != 16)
- return false;
+ return CurDAG->ComputeNumSignBits(SDValue(N,0)) >= 17;
+}]>;
- ShiftVal = dyn_cast<ConstantSDNode>(N->getOperand(0)->getOperand(1));
- if (!ShiftVal || ShiftVal->getZExtValue() != 16)
- return false;
+def sext_bottom_16 : PatFrag<(ops node:$a),
+ (sext_inreg node:$a, i16)>;
+def sext_top_16 : PatFrag<(ops node:$a),
+ (i32 (sra node:$a, (i32 16)))>;
- return true;
-}]>;
+def bb_mul : PatFrag<(ops node:$a, node:$b),
+ (mul (sext_bottom_16 node:$a), (sext_bottom_16 node:$b))>;
+def bt_mul : PatFrag<(ops node:$a, node:$b),
+ (mul (sext_bottom_16 node:$a), (sra node:$b, (i32 16)))>;
+def tb_mul : PatFrag<(ops node:$a, node:$b),
+ (mul (sra node:$a, (i32 16)), (sext_bottom_16 node:$b))>;
+def tt_mul : PatFrag<(ops node:$a, node:$b),
+ (mul (sra node:$a, (i32 16)), (sra node:$b, (i32 16)))>;
/// Split a 32-bit immediate into two 16 bit parts.
def hi16 : SDNodeXForm<imm, [{
@@ -713,7 +722,20 @@ def arm_i32imm : PatLeaf<(imm), [{
if (Subtarget->useMovt(*MF))
return true;
return ARM_AM::isSOImmTwoPartVal((unsigned)N->getZExtValue());
-}]>;
+}]> {
+ // Ideally this would be an IntImmLeaf, but then we wouldn't have access to
+ // the MachineFunction.
+ let GISelPredicateCode = [{
+ const auto &MF = *MI.getParent()->getParent();
+ if (STI.useMovt(MF))
+ return true;
+
+ const auto &MO = MI.getOperand(1);
+ if (!MO.isCImm())
+ return false;
+ return ARM_AM::isSOImmTwoPartVal(MO.getCImm()->getZExtValue());
+ }];
+}
/// imm0_1 predicate - Immediate in the range [0,1].
def Imm0_1AsmOperand: ImmAsmOperand<0,1> { let Name = "Imm0_1"; }
@@ -2191,6 +2213,9 @@ def TRAP : AXI<(outs), (ins), MiscFrm, NoItinerary,
let Inst = 0xe7ffdefe;
}
+def : Pat<(debugtrap), (BKPT 0)>, Requires<[IsARM, HasV5T]>;
+def : Pat<(debugtrap), (UDF 254)>, Requires<[IsARM, NoV5T]>;
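
These two patterns give the generic 'debugtrap' node a selection in ARM mode; in source terms, Clang's __builtin_debugtrap() is what lowers to that node, so a call such as the sketch below should now select to "bkpt #0" on Armv5T and later and to "udf #254" on earlier cores:

// Minimal caller; nothing ARM-specific in the source.
extern "C" void stop_here() { __builtin_debugtrap(); }
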
+
// Address computation and loads and stores in PIC mode.
let isNotDuplicable = 1 in {
def PICADD : ARMPseudoInst<(outs GPR:$dst), (ins GPR:$a, pclabel:$cp, pred:$p),
@@ -3321,7 +3346,7 @@ multiclass arm_ldst_mult<string asm, string sfx, bit L_bit, bit P_bit, Format f,
let hasSideEffects = 0 in {
-let mayLoad = 1, hasExtraDefRegAllocReq = 1 in
+let mayLoad = 1, hasExtraDefRegAllocReq = 1, variadicOpsAreDefs = 1 in
defm LDM : arm_ldst_mult<"ldm", "", 1, 0, LdStMulFrm, IIC_iLoad_m,
IIC_iLoad_mu>, ComplexDeprecationPredicate<"ARMLoad">;
@@ -3519,10 +3544,14 @@ def : ARMV6Pat<(add rGPR:$Rn, (sext_inreg (srl rGPR:$Rm, imm8_or_16:$rot),
def SXTB16 : AI_ext_rrot_np<0b01101000, "sxtb16">;
def : ARMV6Pat<(int_arm_sxtb16 GPR:$Src),
(SXTB16 GPR:$Src, 0)>;
+def : ARMV6Pat<(int_arm_sxtb16 (rotr GPR:$Src, rot_imm:$rot)),
+ (SXTB16 GPR:$Src, rot_imm:$rot)>;
def SXTAB16 : AI_exta_rrot_np<0b01101000, "sxtab16">;
def : ARMV6Pat<(int_arm_sxtab16 GPR:$LHS, GPR:$RHS),
(SXTAB16 GPR:$LHS, GPR:$RHS, 0)>;
+def : ARMV6Pat<(int_arm_sxtab16 GPR:$LHS, (rotr GPR:$RHS, rot_imm:$rot)),
+ (SXTAB16 GPR:$LHS, GPR:$RHS, rot_imm:$rot)>;
// Zero extenders
@@ -3544,6 +3573,8 @@ def : ARMV6Pat<(and (srl GPR:$Src, (i32 8)), 0xFF00FF),
(UXTB16 GPR:$Src, 1)>;
def : ARMV6Pat<(int_arm_uxtb16 GPR:$Src),
(UXTB16 GPR:$Src, 0)>;
+def : ARMV6Pat<(int_arm_uxtb16 (rotr GPR:$Src, rot_imm:$rot)),
+ (UXTB16 GPR:$Src, rot_imm:$rot)>;
def UXTAB : AI_exta_rrot<0b01101110, "uxtab",
BinOpFrag<(add node:$LHS, (and node:$RHS, 0x00FF))>>;
@@ -3560,6 +3591,8 @@ def : ARMV6Pat<(add rGPR:$Rn, (and (srl rGPR:$Rm, imm8_or_16:$rot), 0xFFFF)),
def UXTAB16 : AI_exta_rrot_np<0b01101100, "uxtab16">;
def : ARMV6Pat<(int_arm_uxtab16 GPR:$LHS, GPR:$RHS),
(UXTAB16 GPR:$LHS, GPR:$RHS, 0)>;
+def : ARMV6Pat<(int_arm_uxtab16 GPR:$LHS, (rotr GPR:$RHS, rot_imm:$rot)),
+ (UXTAB16 GPR:$LHS, GPR:$RHS, rot_imm:$rot)>;
def SBFX : I<(outs GPRnopc:$Rd),
@@ -3620,6 +3653,14 @@ let isAdd = 1 in
defm ADDS : AsI1_bin_s_irs<IIC_iALUi, IIC_iALUr, IIC_iALUsr, ARMaddc, 1>;
defm SUBS : AsI1_bin_s_irs<IIC_iALUi, IIC_iALUr, IIC_iALUsr, ARMsubc>;
+def : ARMPat<(ARMsubs GPR:$Rn, mod_imm:$imm), (SUBSri $Rn, mod_imm:$imm)>;
+def : ARMPat<(ARMsubs GPR:$Rn, GPR:$Rm), (SUBSrr $Rn, $Rm)>;
+def : ARMPat<(ARMsubs GPR:$Rn, so_reg_imm:$shift),
+ (SUBSrsi $Rn, so_reg_imm:$shift)>;
+def : ARMPat<(ARMsubs GPR:$Rn, so_reg_reg:$shift),
+ (SUBSrsr $Rn, so_reg_reg:$shift)>;
+
+
let isAdd = 1 in
defm ADC : AI1_adde_sube_irs<0b0101, "adc", ARMadde, 1>;
defm SBC : AI1_adde_sube_irs<0b0110, "sbc", ARMsube>;
@@ -4211,29 +4252,25 @@ def SMMLSR : AMul2Ia <0b0111010, 0b1111, (outs GPR:$Rd),
multiclass AI_smul<string opc> {
def BB : AMulxyI<0b0001011, 0b00, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
IIC_iMUL16, !strconcat(opc, "bb"), "\t$Rd, $Rn, $Rm",
- [(set GPR:$Rd, (mul (sext_inreg GPR:$Rn, i16),
- (sext_inreg GPR:$Rm, i16)))]>,
+ [(set GPR:$Rd, (bb_mul GPR:$Rn, GPR:$Rm))]>,
Requires<[IsARM, HasV5TE]>,
Sched<[WriteMUL16, ReadMUL, ReadMUL]>;
def BT : AMulxyI<0b0001011, 0b10, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
IIC_iMUL16, !strconcat(opc, "bt"), "\t$Rd, $Rn, $Rm",
- [(set GPR:$Rd, (mul (sext_inreg GPR:$Rn, i16),
- (sra GPR:$Rm, (i32 16))))]>,
+ [(set GPR:$Rd, (bt_mul GPR:$Rn, GPR:$Rm))]>,
Requires<[IsARM, HasV5TE]>,
Sched<[WriteMUL16, ReadMUL, ReadMUL]>;
def TB : AMulxyI<0b0001011, 0b01, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
IIC_iMUL16, !strconcat(opc, "tb"), "\t$Rd, $Rn, $Rm",
- [(set GPR:$Rd, (mul (sra GPR:$Rn, (i32 16)),
- (sext_inreg GPR:$Rm, i16)))]>,
+ [(set GPR:$Rd, (tb_mul GPR:$Rn, GPR:$Rm))]>,
Requires<[IsARM, HasV5TE]>,
Sched<[WriteMUL16, ReadMUL, ReadMUL]>;
def TT : AMulxyI<0b0001011, 0b11, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
IIC_iMUL16, !strconcat(opc, "tt"), "\t$Rd, $Rn, $Rm",
- [(set GPR:$Rd, (mul (sra GPR:$Rn, (i32 16)),
- (sra GPR:$Rm, (i32 16))))]>,
+ [(set GPR:$Rd, (tt_mul GPR:$Rn, GPR:$Rm))]>,
Requires<[IsARM, HasV5TE]>,
Sched<[WriteMUL16, ReadMUL, ReadMUL]>;
@@ -4257,35 +4294,31 @@ multiclass AI_smla<string opc> {
(ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
IIC_iMAC16, !strconcat(opc, "bb"), "\t$Rd, $Rn, $Rm, $Ra",
[(set GPRnopc:$Rd, (add GPR:$Ra,
- (mul (sext_inreg GPRnopc:$Rn, i16),
- (sext_inreg GPRnopc:$Rm, i16))))]>,
+ (bb_mul GPRnopc:$Rn, GPRnopc:$Rm)))]>,
Requires<[IsARM, HasV5TE, UseMulOps]>,
Sched<[WriteMAC16, ReadMUL, ReadMUL, ReadMAC]>;
def BT : AMulxyIa<0b0001000, 0b10, (outs GPRnopc:$Rd),
(ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
IIC_iMAC16, !strconcat(opc, "bt"), "\t$Rd, $Rn, $Rm, $Ra",
- [(set GPRnopc:$Rd,
- (add GPR:$Ra, (mul (sext_inreg GPRnopc:$Rn, i16),
- (sra GPRnopc:$Rm, (i32 16)))))]>,
+ [(set GPRnopc:$Rd, (add GPR:$Ra,
+ (bt_mul GPRnopc:$Rn, GPRnopc:$Rm)))]>,
Requires<[IsARM, HasV5TE, UseMulOps]>,
Sched<[WriteMAC16, ReadMUL, ReadMUL, ReadMAC]>;
def TB : AMulxyIa<0b0001000, 0b01, (outs GPRnopc:$Rd),
(ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
IIC_iMAC16, !strconcat(opc, "tb"), "\t$Rd, $Rn, $Rm, $Ra",
- [(set GPRnopc:$Rd,
- (add GPR:$Ra, (mul (sra GPRnopc:$Rn, (i32 16)),
- (sext_inreg GPRnopc:$Rm, i16))))]>,
+ [(set GPRnopc:$Rd, (add GPR:$Ra,
+ (tb_mul GPRnopc:$Rn, GPRnopc:$Rm)))]>,
Requires<[IsARM, HasV5TE, UseMulOps]>,
Sched<[WriteMAC16, ReadMUL, ReadMUL, ReadMAC]>;
def TT : AMulxyIa<0b0001000, 0b11, (outs GPRnopc:$Rd),
(ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
IIC_iMAC16, !strconcat(opc, "tt"), "\t$Rd, $Rn, $Rm, $Ra",
- [(set GPRnopc:$Rd,
- (add GPR:$Ra, (mul (sra GPRnopc:$Rn, (i32 16)),
- (sra GPRnopc:$Rm, (i32 16)))))]>,
+ [(set GPRnopc:$Rd, (add GPR:$Ra,
+ (tt_mul GPRnopc:$Rn, GPRnopc:$Rm)))]>,
Requires<[IsARM, HasV5TE, UseMulOps]>,
Sched<[WriteMAC16, ReadMUL, ReadMUL, ReadMAC]>;
@@ -4863,6 +4896,14 @@ def TSB : AInoP<(outs), (ins tsb_opt:$opt), MiscFrm, NoItinerary,
}
+// Armv8.5-A speculation barrier
+def SB : AInoP<(outs), (ins), MiscFrm, NoItinerary, "sb", "", []>,
+ Requires<[IsARM, HasSB]>, Sched<[]> {
+ let Inst{31-0} = 0xf57ff070;
+ let Unpredictable = 0x000fff0f;
+ let hasSideEffects = 1;
+}
+
let usesCustomInserter = 1, Defs = [CPSR] in {
// Pseudo instruction that combines movs + predicated rsbmi
@@ -4870,7 +4911,7 @@ let usesCustomInserter = 1, Defs = [CPSR] in {
def ABS : ARMPseudoInst<(outs GPR:$dst), (ins GPR:$src), 8, NoItinerary, []>;
}
-let usesCustomInserter = 1 in {
+let usesCustomInserter = 1, Defs = [CPSR] in {
def COPY_STRUCT_BYVAL_I32 : PseudoInst<
(outs), (ins GPR:$dst, GPR:$src, i32imm:$size, i32imm:$alignment),
NoItinerary,
@@ -5778,26 +5819,21 @@ def : ARMPat<(extloadi16 addrmodepc:$addr), (PICLDRH addrmodepc:$addr)>;
// smul* and smla*
def : ARMV5TEPat<(mul sext_16_node:$a, sext_16_node:$b),
- (SMULBB GPR:$a, GPR:$b)>,
- Sched<[WriteMUL32, ReadMUL, ReadMUL]>;
-def : ARMV5TEPat<(mul sext_16_node:$a, (sra GPR:$b, (i32 16))),
- (SMULBT GPR:$a, GPR:$b)>,
- Sched<[WriteMUL32, ReadMUL, ReadMUL]>;
-def : ARMV5TEPat<(mul (sra GPR:$a, (i32 16)), sext_16_node:$b),
- (SMULTB GPR:$a, GPR:$b)>,
- Sched<[WriteMUL32, ReadMUL, ReadMUL]>;
-def : ARMV5MOPat<(add GPR:$acc,
- (mul sext_16_node:$a, sext_16_node:$b)),
- (SMLABB GPR:$a, GPR:$b, GPR:$acc)>,
- Sched<[WriteMUL32, ReadMUL, ReadMUL]>;
-def : ARMV5MOPat<(add GPR:$acc,
- (mul sext_16_node:$a, (sra GPR:$b, (i32 16)))),
- (SMLABT GPR:$a, GPR:$b, GPR:$acc)>,
- Sched<[WriteMUL32, ReadMUL, ReadMUL]>;
-def : ARMV5MOPat<(add GPR:$acc,
- (mul (sra GPR:$a, (i32 16)), sext_16_node:$b)),
- (SMLATB GPR:$a, GPR:$b, GPR:$acc)>,
- Sched<[WriteMUL32, ReadMUL, ReadMUL]>;
+ (SMULBB GPR:$a, GPR:$b)>;
+def : ARMV5TEPat<(mul sext_16_node:$a, (sext_bottom_16 GPR:$b)),
+ (SMULBB GPR:$a, GPR:$b)>;
+def : ARMV5TEPat<(mul sext_16_node:$a, (sext_top_16 GPR:$b)),
+ (SMULBT GPR:$a, GPR:$b)>;
+def : ARMV5TEPat<(mul (sext_top_16 GPR:$a), sext_16_node:$b),
+ (SMULTB GPR:$a, GPR:$b)>;
+def : ARMV5MOPat<(add GPR:$acc, (mul sext_16_node:$a, sext_16_node:$b)),
+ (SMLABB GPR:$a, GPR:$b, GPR:$acc)>;
+def : ARMV5MOPat<(add GPR:$acc, (mul sext_16_node:$a, (sext_bottom_16 GPR:$b))),
+ (SMLABB GPR:$a, GPR:$b, GPR:$acc)>;
+def : ARMV5MOPat<(add GPR:$acc, (mul sext_16_node:$a, (sext_top_16 GPR:$b))),
+ (SMLABT GPR:$a, GPR:$b, GPR:$acc)>;
+def : ARMV5MOPat<(add GPR:$acc, (mul (sext_top_16 GPR:$a), sext_16_node:$b)),
+ (SMLATB GPR:$a, GPR:$b, GPR:$acc)>;
def : ARMV5TEPat<(int_arm_smulbb GPR:$a, GPR:$b),
(SMULBB GPR:$a, GPR:$b)>;
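
The sext_bottom_16/sext_top_16 fragments used in the rewritten patterns above select the sign-extended bottom or top halfword of an i32 operand; in scalar C++ the four halfword-multiply variants compute the following (sketch, assuming the usual arithmetic right shift for signed ints, as on ARM):

#include <cstdint>

int32_t smulbb(int32_t A, int32_t B) { return (int16_t)A * (int16_t)B; } // bottom x bottom
int32_t smulbt(int32_t A, int32_t B) { return (int16_t)A * (B >> 16); }  // bottom x top
int32_t smultb(int32_t A, int32_t B) { return (A >> 16) * (int16_t)B; }  // top x bottom
int32_t smultt(int32_t A, int32_t B) { return (A >> 16) * (B >> 16); }   // top x top
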
@@ -5902,6 +5938,8 @@ include "ARMInstrNEON.td"
// Memory barriers
def : InstAlias<"dmb", (DMB 0xf), 0>, Requires<[IsARM, HasDB]>;
def : InstAlias<"dsb", (DSB 0xf), 0>, Requires<[IsARM, HasDB]>;
+def : InstAlias<"ssbb", (DSB 0x0), 1>, Requires<[IsARM, HasDB]>;
+def : InstAlias<"pssbb", (DSB 0x4), 1>, Requires<[IsARM, HasDB]>;
def : InstAlias<"isb", (ISB 0xf), 0>, Requires<[IsARM, HasDB]>;
// Armv8-R 'Data Full Barrier'
def : InstAlias<"dfb", (DSB 0xc), 1>, Requires<[IsARM, HasDFB]>;
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td b/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td
index 4525eec8da03..96986e74415b 100644
--- a/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td
@@ -4305,17 +4305,29 @@ def : Pat<(v4f32 (fmul (v4f32 QPR:$src1),
(v2f32 (EXTRACT_SUBREG QPR:$src2,
(DSubReg_i32_reg imm:$lane))),
(SubReg_i32_lane imm:$lane)))>;
-
+def : Pat<(v8f16 (fmul (v8f16 QPR:$src1),
+ (v8f16 (NEONvduplane (v8f16 QPR:$src2), imm:$lane)))),
+ (v8f16 (VMULslhq(v8f16 QPR:$src1),
+ (v4f16 (EXTRACT_SUBREG QPR:$src2,
+ (DSubReg_i16_reg imm:$lane))),
+ (SubReg_i16_lane imm:$lane)))>;
def : Pat<(v2f32 (fmul DPR:$Rn, (NEONvdup (f32 SPR:$Rm)))),
(VMULslfd DPR:$Rn,
(INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$Rm, ssub_0),
(i32 0))>;
+def : Pat<(v4f16 (fmul DPR:$Rn, (NEONvdup (f16 HPR:$Rm)))),
+ (VMULslhd DPR:$Rn,
+ (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), HPR:$Rm, ssub_0),
+ (i32 0))>;
def : Pat<(v4f32 (fmul QPR:$Rn, (NEONvdup (f32 SPR:$Rm)))),
(VMULslfq QPR:$Rn,
(INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$Rm, ssub_0),
(i32 0))>;
-
+def : Pat<(v8f16 (fmul QPR:$Rn, (NEONvdup (f16 HPR:$Rm)))),
+ (VMULslhq QPR:$Rn,
+ (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), HPR:$Rm, ssub_0),
+ (i32 0))>;
// VQDMULH : Vector Saturating Doubling Multiply Returning High Half
defm VQDMULH : N3VInt_HS<0, 0, 0b1011, 0, N3RegFrm, IIC_VMULi16D, IIC_VMULi32D,
@@ -4390,16 +4402,16 @@ defm VMLA : N3VMulOp_QHS<0, 0, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D,
IIC_VMACi16Q, IIC_VMACi32Q, "vmla", "i", add>;
def VMLAfd : N3VDMulOp<0, 0, 0b00, 0b1101, 1, IIC_VMACD, "vmla", "f32",
v2f32, fmul_su, fadd_mlx>,
- Requires<[HasNEON, UseFPVMLx, DontUseFusedMAC]>;
+ Requires<[HasNEON, UseFPVMLx]>;
def VMLAfq : N3VQMulOp<0, 0, 0b00, 0b1101, 1, IIC_VMACQ, "vmla", "f32",
v4f32, fmul_su, fadd_mlx>,
- Requires<[HasNEON, UseFPVMLx, DontUseFusedMAC]>;
+ Requires<[HasNEON, UseFPVMLx]>;
def VMLAhd : N3VDMulOp<0, 0, 0b01, 0b1101, 1, IIC_VMACD, "vmla", "f16",
v4f16, fmul_su, fadd_mlx>,
- Requires<[HasNEON, HasFullFP16, UseFPVMLx, DontUseFusedMAC]>;
+ Requires<[HasNEON, HasFullFP16, UseFPVMLx]>;
def VMLAhq : N3VQMulOp<0, 0, 0b01, 0b1101, 1, IIC_VMACQ, "vmla", "f16",
v8f16, fmul_su, fadd_mlx>,
- Requires<[HasNEON, HasFullFP16, UseFPVMLx, DontUseFusedMAC]>;
+ Requires<[HasNEON, HasFullFP16, UseFPVMLx]>;
defm VMLAsl : N3VMulOpSL_HS<0b0000, IIC_VMACi16D, IIC_VMACi32D,
IIC_VMACi16Q, IIC_VMACi32Q, "vmla", "i", add>;
def VMLAslfd : N3VDMulOpSL<0b10, 0b0001, IIC_VMACD, "vmla", "f32",
@@ -4620,16 +4632,16 @@ defm VMLS : N3VMulOp_QHS<1, 0, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D,
IIC_VMACi16Q, IIC_VMACi32Q, "vmls", "i", sub>;
def VMLSfd : N3VDMulOp<0, 0, 0b10, 0b1101, 1, IIC_VMACD, "vmls", "f32",
v2f32, fmul_su, fsub_mlx>,
- Requires<[HasNEON, UseFPVMLx, DontUseFusedMAC]>;
+ Requires<[HasNEON, UseFPVMLx]>;
def VMLSfq : N3VQMulOp<0, 0, 0b10, 0b1101, 1, IIC_VMACQ, "vmls", "f32",
v4f32, fmul_su, fsub_mlx>,
- Requires<[HasNEON, UseFPVMLx, DontUseFusedMAC]>;
+ Requires<[HasNEON, UseFPVMLx]>;
def VMLShd : N3VDMulOp<0, 0, 0b11, 0b1101, 1, IIC_VMACD, "vmls", "f16",
v4f16, fmul, fsub>,
- Requires<[HasNEON, HasFullFP16, UseFPVMLx, DontUseFusedMAC]>;
+ Requires<[HasNEON, HasFullFP16, UseFPVMLx]>;
def VMLShq : N3VQMulOp<0, 0, 0b11, 0b1101, 1, IIC_VMACQ, "vmls", "f16",
v8f16, fmul, fsub>,
- Requires<[HasNEON, HasFullFP16, UseFPVMLx, DontUseFusedMAC]>;
+ Requires<[HasNEON, HasFullFP16, UseFPVMLx]>;
defm VMLSsl : N3VMulOpSL_HS<0b0100, IIC_VMACi16D, IIC_VMACi32D,
IIC_VMACi16Q, IIC_VMACi32Q, "vmls", "i", sub>;
def VMLSslfd : N3VDMulOpSL<0b10, 0b0101, IIC_VMACD, "vmls", "f32",
@@ -4734,6 +4746,12 @@ def VFMShq : N3VQMulOp<0, 0, 0b11, 0b1100, 1, IIC_VFMACQ, "vfms", "f16",
Requires<[HasNEON,HasFullFP16,UseFusedMAC]>;
// Match @llvm.fma.* intrinsics
+def : Pat<(v4f16 (fma DPR:$Vn, DPR:$Vm, DPR:$src1)),
+ (VFMAhd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
+ Requires<[HasNEON,HasFullFP16]>;
+def : Pat<(v8f16 (fma QPR:$Vn, QPR:$Vm, QPR:$src1)),
+ (VFMAhq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
+ Requires<[HasNEON,HasFullFP16]>;
def : Pat<(v2f32 (fma DPR:$Vn, DPR:$Vm, DPR:$src1)),
(VFMAfd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
Requires<[HasVFP4]>;
@@ -5066,7 +5084,7 @@ def VACGThd : N3VDInt<1, 0, 0b11, 0b1110, 1, N3RegFrm, IIC_VBIND, "vacgt",
"f16", v4i16, v4f16, int_arm_neon_vacgt, 0>,
Requires<[HasNEON, HasFullFP16]>;
def VACGThq : N3VQInt<1, 0, 0b11, 0b1110, 1, N3RegFrm, IIC_VBINQ, "vacgt",
- "f16", v8f16, v8f16, int_arm_neon_vacgt, 0>,
+ "f16", v8i16, v8f16, int_arm_neon_vacgt, 0>,
Requires<[HasNEON, HasFullFP16]>;
// VTST : Vector Test Bits
defm VTST : N3V_QHS<0, 0, 0b1000, 1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
@@ -5091,6 +5109,54 @@ def: NEONInstAlias<"vacle${p}.f16 $Vd, $Vn, $Vm",
(VACGEhq QPR:$Vd, QPR:$Vm, QPR:$Vn, pred:$p)>;
}
+// +fp16fml Floating Point Multiplication Variants
+let Predicates = [HasNEON, HasFP16FML], DecoderNamespace= "VFPV8" in {
+
+class N3VCP8F16Q1<string asm, RegisterClass Td, RegisterClass Tn,
+ RegisterClass Tm, bits<2> op1, bits<2> op2, bit op3>
+ : N3VCP8<op1, op2, 1, op3, (outs Td:$Vd), (ins Tn:$Vn, Tm:$Vm), NoItinerary,
+ asm, "f16", "$Vd, $Vn, $Vm", "", []>;
+
+class N3VCP8F16Q0<string asm, RegisterClass Td, RegisterClass Tn,
+ RegisterClass Tm, bits<2> op1, bits<2> op2, bit op3>
+ : N3VCP8Q0<op1, op2, 0, op3, (outs Td:$Vd), (ins Tn:$Vn, Tm:$Vm), NoItinerary,
+ asm, "f16", "$Vd, $Vn, $Vm", "", []>;
+
+class VFMQ0<string opc, bits<2> S>
+ : N3VLaneCP8<0, S, 0, 1, (outs DPR:$Vd),
+ (ins SPR:$Vn, SPR:$Vm, VectorIndex32:$idx),
+ IIC_VMACD, opc, "f16", "$Vd, $Vn, $Vm$idx", "", []> {
+ bit idx;
+ let Inst{3} = idx;
+ let Inst{19-16} = Vn{4-1};
+ let Inst{7} = Vn{0};
+ let Inst{5} = Vm{0};
+ let Inst{2-0} = Vm{3-1};
+}
+
+class VFMQ1<string opc, bits<2> S>
+ : N3VLaneCP8<0, S, 1, 1, (outs QPR:$Vd),
+ (ins DPR:$Vn, DPR:$Vm, VectorIndex16:$idx),
+ IIC_VMACD, opc, "f16", "$Vd, $Vn, $Vm$idx", "", []> {
+ bits<2> idx;
+ let Inst{5} = idx{1};
+ let Inst{3} = idx{0};
+}
+
+let hasNoSchedulingInfo = 1 in {
+// op1 op2 op3
+def VFMALD : N3VCP8F16Q0<"vfmal", DPR, SPR, SPR, 0b00, 0b10, 1>;
+def VFMSLD : N3VCP8F16Q0<"vfmsl", DPR, SPR, SPR, 0b01, 0b10, 1>;
+def VFMALQ : N3VCP8F16Q1<"vfmal", QPR, DPR, DPR, 0b00, 0b10, 1>;
+def VFMSLQ : N3VCP8F16Q1<"vfmsl", QPR, DPR, DPR, 0b01, 0b10, 1>;
+def VFMALDI : VFMQ0<"vfmal", 0b00>;
+def VFMSLDI : VFMQ0<"vfmsl", 0b01>;
+def VFMALQI : VFMQ1<"vfmal", 0b00>;
+def VFMSLQI : VFMQ1<"vfmsl", 0b01>;
+}
+} // HasNEON, HasFP16FML
+
+
def: NEONInstAlias<"vaclt${p}.f32 $Vd, $Vm",
(VACGTfd DPR:$Vd, DPR:$Vm, DPR:$Vd, pred:$p)>;
def: NEONInstAlias<"vaclt${p}.f32 $Vd, $Vm",
@@ -5455,17 +5521,17 @@ defm VMAXu : N3VInt_QHS<1, 0, 0b0110, 0, N3RegFrm,
"vmax", "u", umax, 1>;
def VMAXfd : N3VDInt<0, 0, 0b00, 0b1111, 0, N3RegFrm, IIC_VBIND,
"vmax", "f32",
- v2f32, v2f32, fmaxnan, 1>;
+ v2f32, v2f32, fmaximum, 1>;
def VMAXfq : N3VQInt<0, 0, 0b00, 0b1111, 0, N3RegFrm, IIC_VBINQ,
"vmax", "f32",
- v4f32, v4f32, fmaxnan, 1>;
+ v4f32, v4f32, fmaximum, 1>;
def VMAXhd : N3VDInt<0, 0, 0b01, 0b1111, 0, N3RegFrm, IIC_VBIND,
"vmax", "f16",
- v4f16, v4f16, fmaxnan, 1>,
+ v4f16, v4f16, fmaximum, 1>,
Requires<[HasNEON, HasFullFP16]>;
def VMAXhq : N3VQInt<0, 0, 0b01, 0b1111, 0, N3RegFrm, IIC_VBINQ,
"vmax", "f16",
- v8f16, v8f16, fmaxnan, 1>,
+ v8f16, v8f16, fmaximum, 1>,
Requires<[HasNEON, HasFullFP16]>;
// VMAXNM
@@ -5497,17 +5563,17 @@ defm VMINu : N3VInt_QHS<1, 0, 0b0110, 1, N3RegFrm,
"vmin", "u", umin, 1>;
def VMINfd : N3VDInt<0, 0, 0b10, 0b1111, 0, N3RegFrm, IIC_VBIND,
"vmin", "f32",
- v2f32, v2f32, fminnan, 1>;
+ v2f32, v2f32, fminimum, 1>;
def VMINfq : N3VQInt<0, 0, 0b10, 0b1111, 0, N3RegFrm, IIC_VBINQ,
"vmin", "f32",
- v4f32, v4f32, fminnan, 1>;
+ v4f32, v4f32, fminimum, 1>;
def VMINhd : N3VDInt<0, 0, 0b11, 0b1111, 0, N3RegFrm, IIC_VBIND,
"vmin", "f16",
- v4f16, v4f16, fminnan, 1>,
+ v4f16, v4f16, fminimum, 1>,
Requires<[HasNEON, HasFullFP16]>;
def VMINhq : N3VQInt<0, 0, 0b11, 0b1111, 0, N3RegFrm, IIC_VBINQ,
"vmin", "f16",
- v8f16, v8f16, fminnan, 1>,
+ v8f16, v8f16, fminimum, 1>,
Requires<[HasNEON, HasFullFP16]>;
// VMINNM
@@ -6318,6 +6384,9 @@ def VDUPLN32q : VDUPLNQ<{?,1,0,0}, "vdup", "32", v4i32, v2i32, VectorIndex32> {
let Inst{19} = lane{0};
}
+def : Pat<(v4f16 (NEONvduplane (v4f16 DPR:$Vm), imm:$lane)),
+ (VDUPLN32d DPR:$Vm, imm:$lane)>;
+
def : Pat<(v2f32 (NEONvduplane (v2f32 DPR:$Vm), imm:$lane)),
(VDUPLN32d DPR:$Vm, imm:$lane)>;
@@ -6332,6 +6401,10 @@ def : Pat<(v8i16 (NEONvduplane (v8i16 QPR:$src), imm:$lane)),
(v8i16 (VDUPLN16q (v4i16 (EXTRACT_SUBREG QPR:$src,
(DSubReg_i16_reg imm:$lane))),
(SubReg_i16_lane imm:$lane)))>;
+def : Pat<(v8f16 (NEONvduplane (v8f16 QPR:$src), imm:$lane)),
+ (v8f16 (VDUPLN16q (v4f16 (EXTRACT_SUBREG QPR:$src,
+ (DSubReg_i16_reg imm:$lane))),
+ (SubReg_i16_lane imm:$lane)))>;
def : Pat<(v4i32 (NEONvduplane (v4i32 QPR:$src), imm:$lane)),
(v4i32 (VDUPLN32q (v2i32 (EXTRACT_SUBREG QPR:$src,
(DSubReg_i32_reg imm:$lane))),
@@ -6341,12 +6414,18 @@ def : Pat<(v4f32 (NEONvduplane (v4f32 QPR:$src), imm:$lane)),
(DSubReg_i32_reg imm:$lane))),
(SubReg_i32_lane imm:$lane)))>;
+def : Pat<(v4f16 (NEONvdup HPR:$src)),
+ (v4f16 (VDUPLN16d (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)),
+ HPR:$src, ssub_0), (i32 0)))>;
def : Pat<(v2f32 (NEONvdup (f32 SPR:$src))),
(v2f32 (VDUPLN32d (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)),
SPR:$src, ssub_0), (i32 0)))>;
def : Pat<(v4f32 (NEONvdup (f32 SPR:$src))),
(v4f32 (VDUPLN32q (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)),
SPR:$src, ssub_0), (i32 0)))>;
+def : Pat<(v8f16 (NEONvdup HPR:$src)),
+ (v8f16 (VDUPLN16q (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)),
+ HPR:$src, ssub_0), (i32 0)))>;
// VMOVN : Vector Narrowing Move
defm VMOVN : N2VN_HSD<0b11,0b11,0b10,0b00100,0,0, IIC_VMOVN,
@@ -6558,6 +6637,8 @@ def VREV64q8 : VREV64Q<0b00, "vrev64", "8", v16i8>;
def VREV64q16 : VREV64Q<0b01, "vrev64", "16", v8i16>;
def VREV64q32 : VREV64Q<0b10, "vrev64", "32", v4i32>;
def : Pat<(v4f32 (NEONvrev64 (v4f32 QPR:$Vm))), (VREV64q32 QPR:$Vm)>;
+def : Pat<(v8f16 (NEONvrev64 (v8f16 QPR:$Vm))), (VREV64q16 QPR:$Vm)>;
+def : Pat<(v4f16 (NEONvrev64 (v4f16 DPR:$Vm))), (VREV64d16 DPR:$Vm)>;
// VREV32 : Vector Reverse elements within 32-bit words
@@ -6647,13 +6728,14 @@ def VEXTd16 : VEXTd<"vext", "16", v4i16, imm0_3> {
let Inst{10-9} = index{1-0};
let Inst{8} = 0b0;
}
+def : Pat<(v4f16 (NEONvext (v4f16 DPR:$Vn), (v4f16 DPR:$Vm), (i32 imm:$index))),
+ (VEXTd16 DPR:$Vn, DPR:$Vm, imm:$index)>;
+
def VEXTd32 : VEXTd<"vext", "32", v2i32, imm0_1> {
let Inst{10} = index{0};
let Inst{9-8} = 0b00;
}
-def : Pat<(v2f32 (NEONvext (v2f32 DPR:$Vn),
- (v2f32 DPR:$Vm),
- (i32 imm:$index))),
+def : Pat<(v2f32 (NEONvext (v2f32 DPR:$Vn), (v2f32 DPR:$Vm), (i32 imm:$index))),
(VEXTd32 DPR:$Vn, DPR:$Vm, imm:$index)>;
def VEXTq8 : VEXTq<"vext", "8", v16i8, imm0_15> {
@@ -6663,6 +6745,9 @@ def VEXTq16 : VEXTq<"vext", "16", v8i16, imm0_7> {
let Inst{11-9} = index{2-0};
let Inst{8} = 0b0;
}
+def : Pat<(v8f16 (NEONvext (v8f16 QPR:$Vn), (v8f16 QPR:$Vm), (i32 imm:$index))),
+ (VEXTq16 QPR:$Vn, QPR:$Vm, imm:$index)>;
+
def VEXTq32 : VEXTq<"vext", "32", v4i32, imm0_3> {
let Inst{11-10} = index{1-0};
let Inst{9-8} = 0b00;
@@ -6671,9 +6756,7 @@ def VEXTq64 : VEXTq<"vext", "64", v2i64, imm0_1> {
let Inst{11} = index{0};
let Inst{10-8} = 0b000;
}
-def : Pat<(v4f32 (NEONvext (v4f32 QPR:$Vn),
- (v4f32 QPR:$Vm),
- (i32 imm:$index))),
+def : Pat<(v4f32 (NEONvext (v4f32 QPR:$Vn), (v4f32 QPR:$Vm), (i32 imm:$index))),
(VEXTq32 QPR:$Vn, QPR:$Vm, imm:$index)>;
// VTRN : Vector Transpose
@@ -7001,19 +7084,19 @@ def : N3VSPat<fadd, VADDfd>;
def : N3VSPat<fsub, VSUBfd>;
def : N3VSPat<fmul, VMULfd>;
def : N3VSMulOpPat<fmul, fadd, VMLAfd>,
- Requires<[HasNEON, UseNEONForFP, UseFPVMLx, DontUseFusedMAC]>;
+ Requires<[HasNEON, UseNEONForFP, UseFPVMLx]>;
def : N3VSMulOpPat<fmul, fsub, VMLSfd>,
- Requires<[HasNEON, UseNEONForFP, UseFPVMLx, DontUseFusedMAC]>;
+ Requires<[HasNEON, UseNEONForFP, UseFPVMLx]>;
def : N3VSMulOpPat<fmul, fadd, VFMAfd>,
Requires<[HasVFP4, UseNEONForFP, UseFusedMAC]>;
def : N3VSMulOpPat<fmul, fsub, VFMSfd>,
Requires<[HasVFP4, UseNEONForFP, UseFusedMAC]>;
def : N2VSPat<fabs, VABSfd>;
def : N2VSPat<fneg, VNEGfd>;
-def : N3VSPatFP16<fmaxnan, VMAXhd>, Requires<[HasFullFP16]>;
-def : N3VSPatFP16<fminnan, VMINhd>, Requires<[HasFullFP16]>;
-def : N3VSPat<fmaxnan, VMAXfd>, Requires<[HasNEON]>;
-def : N3VSPat<fminnan, VMINfd>, Requires<[HasNEON]>;
+def : N3VSPatFP16<fmaximum, VMAXhd>, Requires<[HasFullFP16]>;
+def : N3VSPatFP16<fminimum, VMINhd>, Requires<[HasFullFP16]>;
+def : N3VSPat<fmaximum, VMAXfd>, Requires<[HasNEON]>;
+def : N3VSPat<fminimum, VMINfd>, Requires<[HasNEON]>;
def : NVCVTFIPat<fp_to_sint, VCVTf2sd>;
def : NVCVTFIPat<fp_to_uint, VCVTf2ud>;
def : NVCVTIFPat<sint_to_fp, VCVTs2fd>;
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td b/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td
index 88aab47a79bf..b20b34eaa6a9 100644
--- a/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td
@@ -781,7 +781,7 @@ defm tSTRH : thumb_st_rr_ri_enc<0b001, 0b1000, t_addrmode_rr,
// These require the base address to be written back or to be one of the loaded regs.
let hasSideEffects = 0 in {
-let mayLoad = 1, hasExtraDefRegAllocReq = 1 in
+let mayLoad = 1, hasExtraDefRegAllocReq = 1, variadicOpsAreDefs = 1 in
def tLDMIA : T1I<(outs), (ins tGPR:$Rn, pred:$p, reglist:$regs, variable_ops),
IIC_iLoad_m, "ldm${p}\t$Rn, $regs", []>, T1Encoding<{1,1,0,0,1,?}> {
bits<3> Rn;
@@ -826,7 +826,8 @@ def : InstAlias<"ldm${p} $Rn!, $regs",
(tLDMIA tGPR:$Rn, pred:$p, reglist:$regs), 0>,
Requires<[IsThumb, IsThumb1Only]>;
-let mayLoad = 1, Uses = [SP], Defs = [SP], hasExtraDefRegAllocReq = 1 in
+let mayLoad = 1, Uses = [SP], Defs = [SP], hasExtraDefRegAllocReq = 1,
+ variadicOpsAreDefs = 1 in
def tPOP : T1I<(outs), (ins pred:$p, reglist:$regs, variable_ops),
IIC_iPop,
"pop${p}\t$regs", []>,
@@ -1343,8 +1344,20 @@ let hasPostISelHook = 1, Defs = [CPSR] in {
tGPR:$Rm))]>,
Requires<[IsThumb1Only]>,
Sched<[WriteALU]>;
+
+ def tRSBS : tPseudoInst<(outs tGPR:$Rd), (ins tGPR:$Rn),
+ 2, IIC_iALUr,
+ [(set tGPR:$Rd, CPSR, (ARMsubc 0, tGPR:$Rn))]>,
+ Requires<[IsThumb1Only]>,
+ Sched<[WriteALU]>;
}
+
+def : T1Pat<(ARMsubs tGPR:$Rn, tGPR:$Rm), (tSUBSrr $Rn, $Rm)>;
+def : T1Pat<(ARMsubs tGPR:$Rn, imm0_7:$imm3), (tSUBSi3 $Rn, imm0_7:$imm3)>;
+def : T1Pat<(ARMsubs tGPR:$Rn, imm0_255:$imm8), (tSUBSi8 $Rn, imm0_255:$imm8)>;
+
+
// Sign-extend byte
def tSXTB : // A8.6.222
T1pIMiscEncode<{0,0,1,0,0,1,?}, (outs tGPR:$Rd), (ins tGPR:$Rm),
@@ -1380,6 +1393,9 @@ def tUDF : TI<(outs), (ins imm0_255:$imm8), IIC_Br, "udf\t$imm8",
let Inst{7-0} = imm8;
}
+def : Pat<(debugtrap), (tBKPT 0)>, Requires<[IsThumb, HasV5T]>;
+def : Pat<(debugtrap), (tUDF 254)>, Requires<[IsThumb, NoV5T]>;
+
def t__brkdiv0 : TI<(outs), (ins), IIC_Br, "__brkdiv0",
[(int_arm_undefined 249)]>, Encoding16,
Requires<[IsThumb, IsWindows]> {
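For context on the (debugtrap) patterns added above: llvm.debugtrap is what Clang's __builtin_debugtrap() produces, so a minimal way to exercise the new selection, assuming a Clang Thumb build, is:

// Selects to bkpt #0 on v5T and later, udf #254 on pre-v5T Thumb cores.
void stop_in_debugger() {
  __builtin_debugtrap();
}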
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td b/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td
index f67075fbf9fd..7a6673b49d57 100644
--- a/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td
@@ -1775,7 +1775,7 @@ multiclass thumb2_ld_mult<string asm, InstrItinClass itin,
let hasSideEffects = 0 in {
-let mayLoad = 1, hasExtraDefRegAllocReq = 1 in
+let mayLoad = 1, hasExtraDefRegAllocReq = 1, variadicOpsAreDefs = 1 in
defm t2LDM : thumb2_ld_mult<"ldm", IIC_iLoad_m, IIC_iLoad_mu, 1>;
multiclass thumb2_st_mult<string asm, InstrItinClass itin,
@@ -1997,6 +1997,10 @@ def : Thumb2DSPPat<(int_arm_sxtb16 rGPR:$Rn),
(t2SXTB16 rGPR:$Rn, 0)>;
def : Thumb2DSPPat<(int_arm_sxtab16 rGPR:$Rn, rGPR:$Rm),
(t2SXTAB16 rGPR:$Rn, rGPR:$Rm, 0)>;
+def : Thumb2DSPPat<(int_arm_sxtb16 (rotr rGPR:$Rn, rot_imm:$rot)),
+ (t2SXTB16 rGPR:$Rn, rot_imm:$rot)>;
+def : Thumb2DSPPat<(int_arm_sxtab16 rGPR:$Rn, (rotr rGPR:$Rm, rot_imm:$rot)),
+ (t2SXTAB16 rGPR:$Rn, rGPR:$Rm, rot_imm:$rot)>;
// A simple right-shift can also be used in most cases (the exception is the
@@ -2032,6 +2036,8 @@ def : Thumb2DSPPat<(and (rotr rGPR:$Rm, rot_imm:$rot), 0x00FF00FF),
def : Thumb2DSPPat<(int_arm_uxtb16 rGPR:$Rm),
(t2UXTB16 rGPR:$Rm, 0)>;
+def : Thumb2DSPPat<(int_arm_uxtb16 (rotr rGPR:$Rn, rot_imm:$rot)),
+ (t2UXTB16 rGPR:$Rn, rot_imm:$rot)>;
// FIXME: This pattern incorrectly assumes the shl operator is a rotate.
// The transformation should probably be done as a combiner action
@@ -2062,6 +2068,8 @@ def : Thumb2DSPPat<(add rGPR:$Rn, (and (srl rGPR:$Rm, imm8_or_16:$rot),
(t2UXTAH rGPR:$Rn, rGPR:$Rm, rot_imm:$rot)>;
def : Thumb2DSPPat<(int_arm_uxtab16 rGPR:$Rn, rGPR:$Rm),
(t2UXTAB16 rGPR:$Rn, rGPR:$Rm, 0)>;
+def : Thumb2DSPPat<(int_arm_uxtab16 rGPR:$Rn, (rotr rGPR:$Rm, rot_imm:$rot)),
+ (t2UXTAB16 rGPR:$Rn, rGPR:$Rm, rot_imm:$rot)>;
}
@@ -2086,6 +2094,12 @@ defm t2SUB : T2I_bin_ii12rs<0b101, "sub", sub>;
defm t2ADDS : T2I_bin_s_irs <IIC_iALUi, IIC_iALUr, IIC_iALUsi, ARMaddc, 1>;
defm t2SUBS : T2I_bin_s_irs <IIC_iALUi, IIC_iALUr, IIC_iALUsi, ARMsubc>;
+def : T2Pat<(ARMsubs GPRnopc:$Rn, t2_so_imm:$imm),
+ (t2SUBSri $Rn, t2_so_imm:$imm)>;
+def : T2Pat<(ARMsubs GPRnopc:$Rn, rGPR:$Rm), (t2SUBSrr $Rn, $Rm)>;
+def : T2Pat<(ARMsubs GPRnopc:$Rn, t2_so_reg:$ShiftedRm),
+ (t2SUBSrs $Rn, t2_so_reg:$ShiftedRm)>;
+
let hasPostISelHook = 1 in {
defm t2ADC : T2I_adde_sube_irs<0b1010, "adc", ARMadde, 1>;
defm t2SBC : T2I_adde_sube_irs<0b1011, "sbc", ARMsube>;
@@ -2718,28 +2732,25 @@ class T2ThreeRegSMUL<bits<3> op22_20, bits<2> op5_4, string opc,
}
def t2SMULBB : T2ThreeRegSMUL<0b001, 0b00, "smulbb",
- [(set rGPR:$Rd, (mul (sext_inreg rGPR:$Rn, i16),
- (sext_inreg rGPR:$Rm, i16)))]>;
+ [(set rGPR:$Rd, (bb_mul rGPR:$Rn, rGPR:$Rm))]>;
def t2SMULBT : T2ThreeRegSMUL<0b001, 0b01, "smulbt",
- [(set rGPR:$Rd, (mul (sext_inreg rGPR:$Rn, i16),
- (sra rGPR:$Rm, (i32 16))))]>;
+ [(set rGPR:$Rd, (bt_mul rGPR:$Rn, rGPR:$Rm))]>;
def t2SMULTB : T2ThreeRegSMUL<0b001, 0b10, "smultb",
- [(set rGPR:$Rd, (mul (sra rGPR:$Rn, (i32 16)),
- (sext_inreg rGPR:$Rm, i16)))]>;
+ [(set rGPR:$Rd, (tb_mul rGPR:$Rn, rGPR:$Rm))]>;
def t2SMULTT : T2ThreeRegSMUL<0b001, 0b11, "smultt",
- [(set rGPR:$Rd, (mul (sra rGPR:$Rn, (i32 16)),
- (sra rGPR:$Rm, (i32 16))))]>;
+ [(set rGPR:$Rd, (tt_mul rGPR:$Rn, rGPR:$Rm))]>;
def t2SMULWB : T2ThreeRegSMUL<0b011, 0b00, "smulwb",
[(set rGPR:$Rd, (ARMsmulwb rGPR:$Rn, rGPR:$Rm))]>;
def t2SMULWT : T2ThreeRegSMUL<0b011, 0b01, "smulwt",
[(set rGPR:$Rd, (ARMsmulwt rGPR:$Rn, rGPR:$Rm))]>;
-def : Thumb2DSPPat<(mul sext_16_node:$Rm, sext_16_node:$Rn),
- (t2SMULBB rGPR:$Rm, rGPR:$Rn)>;
-def : Thumb2DSPPat<(mul sext_16_node:$Rn, (sra rGPR:$Rm, (i32 16))),
+def : Thumb2DSPPat<(mul sext_16_node:$Rn, (sext_bottom_16 rGPR:$Rm)),
+ (t2SMULBB rGPR:$Rn, rGPR:$Rm)>;
+def : Thumb2DSPPat<(mul sext_16_node:$Rn, (sext_top_16 rGPR:$Rm)),
(t2SMULBT rGPR:$Rn, rGPR:$Rm)>;
-def : Thumb2DSPPat<(mul (sra rGPR:$Rn, (i32 16)), sext_16_node:$Rm),
+def : Thumb2DSPPat<(mul (sext_top_16 rGPR:$Rn), sext_16_node:$Rm),
(t2SMULTB rGPR:$Rn, rGPR:$Rm)>;
+
def : Thumb2DSPPat<(int_arm_smulbb rGPR:$Rn, rGPR:$Rm),
(t2SMULBB rGPR:$Rn, rGPR:$Rm)>;
def : Thumb2DSPPat<(int_arm_smulbt rGPR:$Rn, rGPR:$Rm),
@@ -2767,18 +2778,13 @@ class T2FourRegSMLA<bits<3> op22_20, bits<2> op5_4, string opc,
}
def t2SMLABB : T2FourRegSMLA<0b001, 0b00, "smlabb",
- [(set rGPR:$Rd, (add rGPR:$Ra,
- (mul (sext_inreg rGPR:$Rn, i16),
- (sext_inreg rGPR:$Rm, i16))))]>;
+ [(set rGPR:$Rd, (add rGPR:$Ra, (bb_mul rGPR:$Rn, rGPR:$Rm)))]>;
def t2SMLABT : T2FourRegSMLA<0b001, 0b01, "smlabt",
- [(set rGPR:$Rd, (add rGPR:$Ra, (mul (sext_inreg rGPR:$Rn, i16),
- (sra rGPR:$Rm, (i32 16)))))]>;
+ [(set rGPR:$Rd, (add rGPR:$Ra, (bt_mul rGPR:$Rn, rGPR:$Rm)))]>;
def t2SMLATB : T2FourRegSMLA<0b001, 0b10, "smlatb",
- [(set rGPR:$Rd, (add rGPR:$Ra, (mul (sra rGPR:$Rn, (i32 16)),
- (sext_inreg rGPR:$Rm, i16))))]>;
+ [(set rGPR:$Rd, (add rGPR:$Ra, (tb_mul rGPR:$Rn, rGPR:$Rm)))]>;
def t2SMLATT : T2FourRegSMLA<0b001, 0b11, "smlatt",
- [(set rGPR:$Rd, (add rGPR:$Ra, (mul (sra rGPR:$Rn, (i32 16)),
- (sra rGPR:$Rm, (i32 16)))))]>;
+ [(set rGPR:$Rd, (add rGPR:$Ra, (tt_mul rGPR:$Rn, rGPR:$Rm)))]>;
def t2SMLAWB : T2FourRegSMLA<0b011, 0b00, "smlawb",
[(set rGPR:$Rd, (add rGPR:$Ra, (ARMsmulwb rGPR:$Rn, rGPR:$Rm)))]>;
def t2SMLAWT : T2FourRegSMLA<0b011, 0b01, "smlawt",
@@ -2786,11 +2792,14 @@ def t2SMLAWT : T2FourRegSMLA<0b011, 0b01, "smlawt",
def : Thumb2DSPMulPat<(add rGPR:$Ra, (mul sext_16_node:$Rn, sext_16_node:$Rm)),
(t2SMLABB rGPR:$Rn, rGPR:$Rm, rGPR:$Ra)>;
-def : Thumb2DSPMulPat<(add rGPR:$Ra,
- (mul sext_16_node:$Rn, (sra rGPR:$Rm, (i32 16)))),
+def : Thumb2DSPMulPat<(add rGPR:$Ra, (mul sext_16_node:$Rn,
+ (sext_bottom_16 rGPR:$Rm))),
+ (t2SMLABB rGPR:$Rn, rGPR:$Rm, rGPR:$Ra)>;
+def : Thumb2DSPMulPat<(add rGPR:$Ra, (mul sext_16_node:$Rn,
+ (sext_top_16 rGPR:$Rm))),
(t2SMLABT rGPR:$Rn, rGPR:$Rm, rGPR:$Ra)>;
-def : Thumb2DSPMulPat<(add rGPR:$Ra,
- (mul (sra rGPR:$Rn, (i32 16)), sext_16_node:$Rm)),
+def : Thumb2DSPMulPat<(add rGPR:$Ra, (mul (sext_top_16 rGPR:$Rn),
+ sext_16_node:$Rm)),
(t2SMLATB rGPR:$Rn, rGPR:$Rm, rGPR:$Ra)>;
def : Thumb2DSPPat<(int_arm_smlabb GPR:$a, GPR:$b, GPR:$acc),
@@ -3223,6 +3232,14 @@ def t2TSB : T2I<(outs), (ins tsb_opt:$opt), NoItinerary,
}
}
+// Armv8.5-A speculation barrier
+def t2SB : Thumb2XI<(outs), (ins), AddrModeNone, 4, NoItinerary, "sb", "", []>,
+ Requires<[IsThumb2, HasSB]>, Sched<[]> {
+ let Inst{31-0} = 0xf3bf8f70;
+ let Unpredictable = 0x000f2f0f;
+ let hasSideEffects = 1;
+}
+
class T2I_ldrex<bits<4> opcod, dag oops, dag iops, AddrMode am, int sz,
InstrItinClass itin, string opc, string asm, string cstr,
list<dag> pattern, bits<4> rt2 = 0b1111>
@@ -4429,13 +4446,13 @@ def : T2Pat<(atomic_store_32 t2addrmode_negimm8:$addr, GPR:$val),
def : T2Pat<(atomic_store_32 t2addrmode_so_reg:$addr, GPR:$val),
(t2STRs GPR:$val, t2addrmode_so_reg:$addr)>;
-let AddedComplexity = 8 in {
- def : T2Pat<(atomic_load_acquire_8 addr_offset_none:$addr), (t2LDAB addr_offset_none:$addr)>;
- def : T2Pat<(atomic_load_acquire_16 addr_offset_none:$addr), (t2LDAH addr_offset_none:$addr)>;
- def : T2Pat<(atomic_load_acquire_32 addr_offset_none:$addr), (t2LDA addr_offset_none:$addr)>;
- def : T2Pat<(atomic_store_release_8 addr_offset_none:$addr, GPR:$val), (t2STLB GPR:$val, addr_offset_none:$addr)>;
- def : T2Pat<(atomic_store_release_16 addr_offset_none:$addr, GPR:$val), (t2STLH GPR:$val, addr_offset_none:$addr)>;
- def : T2Pat<(atomic_store_release_32 addr_offset_none:$addr, GPR:$val), (t2STL GPR:$val, addr_offset_none:$addr)>;
+let AddedComplexity = 8, Predicates = [IsThumb, HasAcquireRelease, HasV7Clrex] in {
+ def : Pat<(atomic_load_acquire_8 addr_offset_none:$addr), (t2LDAB addr_offset_none:$addr)>;
+ def : Pat<(atomic_load_acquire_16 addr_offset_none:$addr), (t2LDAH addr_offset_none:$addr)>;
+ def : Pat<(atomic_load_acquire_32 addr_offset_none:$addr), (t2LDA addr_offset_none:$addr)>;
+ def : Pat<(atomic_store_release_8 addr_offset_none:$addr, GPR:$val), (t2STLB GPR:$val, addr_offset_none:$addr)>;
+ def : Pat<(atomic_store_release_16 addr_offset_none:$addr, GPR:$val), (t2STLH GPR:$val, addr_offset_none:$addr)>;
+ def : Pat<(atomic_store_release_32 addr_offset_none:$addr, GPR:$val), (t2STL GPR:$val, addr_offset_none:$addr)>;
}
@@ -4538,6 +4555,12 @@ def : t2InstAlias<"tst${p} $Rn, $Rm",
def : InstAlias<"dmb${p}", (t2DMB 0xf, pred:$p), 0>, Requires<[HasDB]>;
def : InstAlias<"dsb${p}", (t2DSB 0xf, pred:$p), 0>, Requires<[HasDB]>;
def : InstAlias<"isb${p}", (t2ISB 0xf, pred:$p), 0>, Requires<[HasDB]>;
+
+// Non-predicable aliases of a predicable DSB: the predicate is (14, 0) where
+// 14 = AL (always execute) and 0 = "instruction doesn't read the CPSR".
+def : InstAlias<"ssbb", (t2DSB 0x0, 14, 0), 1>, Requires<[HasDB, IsThumb2]>;
+def : InstAlias<"pssbb", (t2DSB 0x4, 14, 0), 1>, Requires<[HasDB, IsThumb2]>;
+
// Armv8-R 'Data Full Barrier'
def : InstAlias<"dfb${p}", (t2DSB 0xc, pred:$p), 1>, Requires<[HasDFB]>;
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td b/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td
index 2f14b78c91fd..b58730c452f7 100644
--- a/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td
@@ -725,9 +725,11 @@ def VCVTBHD : ADuI<0b11101, 0b11, 0b0010, 0b01, 0,
}
def : FullFP16Pat<(f64 (fpextend HPR:$Sm)),
- (VCVTBHD (COPY_TO_REGCLASS HPR:$Sm, SPR))>;
+ (VCVTBHD (COPY_TO_REGCLASS HPR:$Sm, SPR))>,
+ Requires<[HasFPARMv8, HasDPVFP]>;
def : FP16Pat<(f64 (f16_to_fp GPR:$a)),
- (VCVTBHD (COPY_TO_REGCLASS GPR:$a, SPR))>;
+ (VCVTBHD (COPY_TO_REGCLASS GPR:$a, SPR))>,
+ Requires<[HasFPARMv8, HasDPVFP]>;
def VCVTBDH : ADuI<0b11101, 0b11, 0b0011, 0b01, 0,
(outs SPR:$Sd), (ins DPR:$Dm),
@@ -746,9 +748,11 @@ def VCVTBDH : ADuI<0b11101, 0b11, 0b0011, 0b01, 0,
}
def : FullFP16Pat<(f16 (fpround DPR:$Dm)),
- (COPY_TO_REGCLASS (VCVTBDH DPR:$Dm), HPR)>;
+ (COPY_TO_REGCLASS (VCVTBDH DPR:$Dm), HPR)>,
+ Requires<[HasFPARMv8, HasDPVFP]>;
def : FP16Pat<(fp_to_f16 (f64 DPR:$a)),
- (i32 (COPY_TO_REGCLASS (VCVTBDH DPR:$a), GPR))>;
+ (i32 (COPY_TO_REGCLASS (VCVTBDH DPR:$a), GPR))>,
+ Requires<[HasFPARMv8, HasDPVFP]>;
def VCVTTHD : ADuI<0b11101, 0b11, 0b0010, 0b11, 0,
(outs DPR:$Dd), (ins SPR:$Sm),
@@ -1810,7 +1814,7 @@ def VMLAD : ADbI<0b11100, 0b00, 0, 0,
[(set DPR:$Dd, (fadd_mlx (fmul_su DPR:$Dn, DPR:$Dm),
(f64 DPR:$Ddin)))]>,
RegConstraint<"$Ddin = $Dd">,
- Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>,
+ Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>,
Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
def VMLAS : ASbIn<0b11100, 0b00, 0, 0,
@@ -1819,7 +1823,7 @@ def VMLAS : ASbIn<0b11100, 0b00, 0, 0,
[(set SPR:$Sd, (fadd_mlx (fmul_su SPR:$Sn, SPR:$Sm),
SPR:$Sdin))]>,
RegConstraint<"$Sdin = $Sd">,
- Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>,
+ Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>,
Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]> {
// Some single precision VFP instructions may be executed on both NEON and
// VFP pipelines on A8.
@@ -1832,17 +1836,17 @@ def VMLAH : AHbI<0b11100, 0b00, 0, 0,
[(set HPR:$Sd, (fadd_mlx (fmul_su HPR:$Sn, HPR:$Sm),
HPR:$Sdin))]>,
RegConstraint<"$Sdin = $Sd">,
- Requires<[HasFullFP16,UseFPVMLx,DontUseFusedMAC]>;
+ Requires<[HasFullFP16,UseFPVMLx]>;
def : Pat<(fadd_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
(VMLAD DPR:$dstin, DPR:$a, DPR:$b)>,
- Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
+ Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>;
def : Pat<(fadd_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
(VMLAS SPR:$dstin, SPR:$a, SPR:$b)>,
- Requires<[HasVFP2,DontUseNEONForFP, UseFPVMLx,DontUseFusedMAC]>;
+ Requires<[HasVFP2,DontUseNEONForFP, UseFPVMLx]>;
def : Pat<(fadd_mlx HPR:$dstin, (fmul_su HPR:$a, HPR:$b)),
(VMLAH HPR:$dstin, HPR:$a, HPR:$b)>,
- Requires<[HasFullFP16,DontUseNEONForFP, UseFPVMLx,DontUseFusedMAC]>;
+ Requires<[HasFullFP16,DontUseNEONForFP, UseFPVMLx]>;
def VMLSD : ADbI<0b11100, 0b00, 1, 0,
@@ -1851,7 +1855,7 @@ def VMLSD : ADbI<0b11100, 0b00, 1, 0,
[(set DPR:$Dd, (fadd_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)),
(f64 DPR:$Ddin)))]>,
RegConstraint<"$Ddin = $Dd">,
- Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>,
+ Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>,
Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
def VMLSS : ASbIn<0b11100, 0b00, 1, 0,
@@ -1860,7 +1864,7 @@ def VMLSS : ASbIn<0b11100, 0b00, 1, 0,
[(set SPR:$Sd, (fadd_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)),
SPR:$Sdin))]>,
RegConstraint<"$Sdin = $Sd">,
- Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>,
+ Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>,
Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]> {
// Some single precision VFP instructions may be executed on both NEON and
// VFP pipelines on A8.
@@ -1873,17 +1877,17 @@ def VMLSH : AHbI<0b11100, 0b00, 1, 0,
[(set HPR:$Sd, (fadd_mlx (fneg (fmul_su HPR:$Sn, HPR:$Sm)),
HPR:$Sdin))]>,
RegConstraint<"$Sdin = $Sd">,
- Requires<[HasFullFP16,UseFPVMLx,DontUseFusedMAC]>;
+ Requires<[HasFullFP16,UseFPVMLx]>;
def : Pat<(fsub_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
(VMLSD DPR:$dstin, DPR:$a, DPR:$b)>,
- Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
+ Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>;
def : Pat<(fsub_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
(VMLSS SPR:$dstin, SPR:$a, SPR:$b)>,
- Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
+ Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>;
def : Pat<(fsub_mlx HPR:$dstin, (fmul_su HPR:$a, HPR:$b)),
(VMLSH HPR:$dstin, HPR:$a, HPR:$b)>,
- Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
+ Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx]>;
def VNMLAD : ADbI<0b11100, 0b01, 1, 0,
(outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
@@ -1891,7 +1895,7 @@ def VNMLAD : ADbI<0b11100, 0b01, 1, 0,
[(set DPR:$Dd,(fsub_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)),
(f64 DPR:$Ddin)))]>,
RegConstraint<"$Ddin = $Dd">,
- Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>,
+ Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>,
Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
def VNMLAS : ASbI<0b11100, 0b01, 1, 0,
@@ -1900,7 +1904,7 @@ def VNMLAS : ASbI<0b11100, 0b01, 1, 0,
[(set SPR:$Sd, (fsub_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)),
SPR:$Sdin))]>,
RegConstraint<"$Sdin = $Sd">,
- Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>,
+ Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>,
Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]> {
// Some single precision VFP instructions may be executed on both NEON and
// VFP pipelines on A8.
@@ -1913,29 +1917,29 @@ def VNMLAH : AHbI<0b11100, 0b01, 1, 0,
[(set HPR:$Sd, (fsub_mlx (fneg (fmul_su HPR:$Sn, HPR:$Sm)),
HPR:$Sdin))]>,
RegConstraint<"$Sdin = $Sd">,
- Requires<[HasFullFP16,UseFPVMLx,DontUseFusedMAC]>;
+ Requires<[HasFullFP16,UseFPVMLx]>;
// (-(a * b) - dst) -> -(dst + (a * b))
def : Pat<(fsub_mlx (fneg (fmul_su DPR:$a, (f64 DPR:$b))), DPR:$dstin),
(VNMLAD DPR:$dstin, DPR:$a, DPR:$b)>,
- Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
+ Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>;
def : Pat<(fsub_mlx (fneg (fmul_su SPR:$a, SPR:$b)), SPR:$dstin),
(VNMLAS SPR:$dstin, SPR:$a, SPR:$b)>,
- Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
+ Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>;
def : Pat<(fsub_mlx (fneg (fmul_su HPR:$a, HPR:$b)), HPR:$dstin),
(VNMLAH HPR:$dstin, HPR:$a, HPR:$b)>,
- Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
+ Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx]>;
// (-dst - (a * b)) -> -(dst + (a * b))
def : Pat<(fsub_mlx (fneg DPR:$dstin), (fmul_su DPR:$a, (f64 DPR:$b))),
(VNMLAD DPR:$dstin, DPR:$a, DPR:$b)>,
- Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
+ Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>;
def : Pat<(fsub_mlx (fneg SPR:$dstin), (fmul_su SPR:$a, SPR:$b)),
(VNMLAS SPR:$dstin, SPR:$a, SPR:$b)>,
- Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
+ Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>;
def : Pat<(fsub_mlx (fneg HPR:$dstin), (fmul_su HPR:$a, HPR:$b)),
(VNMLAH HPR:$dstin, HPR:$a, HPR:$b)>,
- Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
+ Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx]>;
def VNMLSD : ADbI<0b11100, 0b01, 0, 0,
(outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
@@ -1943,7 +1947,7 @@ def VNMLSD : ADbI<0b11100, 0b01, 0, 0,
[(set DPR:$Dd, (fsub_mlx (fmul_su DPR:$Dn, DPR:$Dm),
(f64 DPR:$Ddin)))]>,
RegConstraint<"$Ddin = $Dd">,
- Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>,
+ Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>,
Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
def VNMLSS : ASbI<0b11100, 0b01, 0, 0,
@@ -1951,7 +1955,7 @@ def VNMLSS : ASbI<0b11100, 0b01, 0, 0,
IIC_fpMAC32, "vnmls", ".f32\t$Sd, $Sn, $Sm",
[(set SPR:$Sd, (fsub_mlx (fmul_su SPR:$Sn, SPR:$Sm), SPR:$Sdin))]>,
RegConstraint<"$Sdin = $Sd">,
- Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>,
+ Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>,
Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]> {
// Some single precision VFP instructions may be executed on both NEON and
// VFP pipelines on A8.
@@ -1963,17 +1967,17 @@ def VNMLSH : AHbI<0b11100, 0b01, 0, 0,
IIC_fpMAC16, "vnmls", ".f16\t$Sd, $Sn, $Sm",
[(set HPR:$Sd, (fsub_mlx (fmul_su HPR:$Sn, HPR:$Sm), HPR:$Sdin))]>,
RegConstraint<"$Sdin = $Sd">,
- Requires<[HasFullFP16,UseFPVMLx,DontUseFusedMAC]>;
+ Requires<[HasFullFP16,UseFPVMLx]>;
def : Pat<(fsub_mlx (fmul_su DPR:$a, (f64 DPR:$b)), DPR:$dstin),
(VNMLSD DPR:$dstin, DPR:$a, DPR:$b)>,
- Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
+ Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>;
def : Pat<(fsub_mlx (fmul_su SPR:$a, SPR:$b), SPR:$dstin),
(VNMLSS SPR:$dstin, SPR:$a, SPR:$b)>,
- Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
+ Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>;
def : Pat<(fsub_mlx (fmul_su HPR:$a, HPR:$b), HPR:$dstin),
(VNMLSH HPR:$dstin, HPR:$a, HPR:$b)>,
- Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
+ Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx]>;
//===----------------------------------------------------------------------===//
// Fused FP Multiply-Accumulate Operations.
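The predicate updates above drop DontUseFusedMAC from the VMLA/VMLS/VNMLA/VNMLS selectors, so the choice between the plain and fused families rests on UseFPVMLx and UseFusedMAC. The difference those families encode is rounding; a sketch with invented names:

#include <cmath>

// VMLA-style: the multiply rounds, then the add rounds again.
static double mla_ref(double acc, double a, double b) { return acc + a * b; }

// VFMA-style: one rounding of the exact a * b + acc (fused multiply-add).
static double fmac_ref(double acc, double a, double b) { return std::fma(a, b, acc); }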
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstructionSelector.cpp b/contrib/llvm/lib/Target/ARM/ARMInstructionSelector.cpp
index 6692a4d41420..293e734c97cd 100644
--- a/contrib/llvm/lib/Target/ARM/ARMInstructionSelector.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMInstructionSelector.cpp
@@ -76,6 +76,42 @@ private:
const ARMRegisterBankInfo &RBI;
const ARMSubtarget &STI;
+ // Store the opcodes that we might need, so we don't have to check what kind
+ // of subtarget (ARM vs Thumb) we have all the time.
+ struct OpcodeCache {
+ unsigned ZEXT16;
+ unsigned SEXT16;
+
+ unsigned ZEXT8;
+ unsigned SEXT8;
+
+ // Used for implementing ZEXT/SEXT from i1
+ unsigned AND;
+ unsigned RSB;
+
+ unsigned STORE32;
+ unsigned LOAD32;
+
+ unsigned STORE16;
+ unsigned LOAD16;
+
+ unsigned STORE8;
+ unsigned LOAD8;
+
+ OpcodeCache(const ARMSubtarget &STI);
+ } const Opcodes;
+
+ // Select the opcode for simple extensions (that translate to a single SXT/UXT
+ // instruction). Extension operations more complicated than that should not
+ // invoke this. Returns the original opcode if it doesn't know how to select a
+ // better one.
+ unsigned selectSimpleExtOpc(unsigned Opc, unsigned Size) const;
+
+ // Select the opcode for simple loads and stores. Returns the original opcode
+ // if it doesn't know how to select a better one.
+ unsigned selectLoadStoreOpCode(unsigned Opc, unsigned RegBank,
+ unsigned Size) const;
+
#define GET_GLOBALISEL_PREDICATES_DECL
#include "ARMGenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_DECL
@@ -107,7 +143,7 @@ ARMInstructionSelector::ARMInstructionSelector(const ARMBaseTargetMachine &TM,
const ARMSubtarget &STI,
const ARMRegisterBankInfo &RBI)
: InstructionSelector(), TII(*STI.getInstrInfo()),
- TRI(*STI.getRegisterInfo()), TM(TM), RBI(RBI), STI(STI),
+ TRI(*STI.getRegisterInfo()), TM(TM), RBI(RBI), STI(STI), Opcodes(STI),
#define GET_GLOBALISEL_PREDICATES_INIT
#include "ARMGenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_INIT
@@ -225,41 +261,63 @@ static bool selectUnmergeValues(MachineInstrBuilder &MIB,
return true;
}
-/// Select the opcode for simple extensions (that translate to a single SXT/UXT
-/// instruction). Extension operations more complicated than that should not
-/// invoke this. Returns the original opcode if it doesn't know how to select a
-/// better one.
-static unsigned selectSimpleExtOpc(unsigned Opc, unsigned Size) {
+ARMInstructionSelector::OpcodeCache::OpcodeCache(const ARMSubtarget &STI) {
+ bool isThumb = STI.isThumb();
+
+ using namespace TargetOpcode;
+
+#define STORE_OPCODE(VAR, OPC) VAR = isThumb ? ARM::t2##OPC : ARM::OPC
+ STORE_OPCODE(SEXT16, SXTH);
+ STORE_OPCODE(ZEXT16, UXTH);
+
+ STORE_OPCODE(SEXT8, SXTB);
+ STORE_OPCODE(ZEXT8, UXTB);
+
+ STORE_OPCODE(AND, ANDri);
+ STORE_OPCODE(RSB, RSBri);
+
+ STORE_OPCODE(STORE32, STRi12);
+ STORE_OPCODE(LOAD32, LDRi12);
+
+ // LDRH/STRH are special...
+ STORE16 = isThumb ? ARM::t2STRHi12 : ARM::STRH;
+ LOAD16 = isThumb ? ARM::t2LDRHi12 : ARM::LDRH;
+
+ STORE_OPCODE(STORE8, STRBi12);
+ STORE_OPCODE(LOAD8, LDRBi12);
+#undef STORE_OPCODE
+}
+
+unsigned ARMInstructionSelector::selectSimpleExtOpc(unsigned Opc,
+ unsigned Size) const {
using namespace TargetOpcode;
if (Size != 8 && Size != 16)
return Opc;
if (Opc == G_SEXT)
- return Size == 8 ? ARM::SXTB : ARM::SXTH;
+ return Size == 8 ? Opcodes.SEXT8 : Opcodes.SEXT16;
if (Opc == G_ZEXT)
- return Size == 8 ? ARM::UXTB : ARM::UXTH;
+ return Size == 8 ? Opcodes.ZEXT8 : Opcodes.ZEXT16;
return Opc;
}
-/// Select the opcode for simple loads and stores. For types smaller than 32
-/// bits, the value will be zero extended. Returns the original opcode if it
-/// doesn't know how to select a better one.
-static unsigned selectLoadStoreOpCode(unsigned Opc, unsigned RegBank,
- unsigned Size) {
+unsigned ARMInstructionSelector::selectLoadStoreOpCode(unsigned Opc,
+ unsigned RegBank,
+ unsigned Size) const {
bool isStore = Opc == TargetOpcode::G_STORE;
if (RegBank == ARM::GPRRegBankID) {
switch (Size) {
case 1:
case 8:
- return isStore ? ARM::STRBi12 : ARM::LDRBi12;
+ return isStore ? Opcodes.STORE8 : Opcodes.LOAD8;
case 16:
- return isStore ? ARM::STRH : ARM::LDRH;
+ return isStore ? Opcodes.STORE16 : Opcodes.LOAD16;
case 32:
- return isStore ? ARM::STRi12 : ARM::LDRi12;
+ return isStore ? Opcodes.STORE32 : Opcodes.LOAD32;
default:
return Opc;
}
@@ -702,7 +760,7 @@ bool ARMInstructionSelector::select(MachineInstr &I,
switch (SrcSize) {
case 1: {
// ZExt boils down to & 0x1; for SExt we also subtract that from 0
- I.setDesc(TII.get(ARM::ANDri));
+ I.setDesc(TII.get(Opcodes.AND));
MIB.addImm(1).add(predOps(ARMCC::AL)).add(condCodeOp());
if (isSExt) {
@@ -714,7 +772,7 @@ bool ARMInstructionSelector::select(MachineInstr &I,
auto InsertBefore = std::next(I.getIterator());
auto SubI =
- BuildMI(MBB, InsertBefore, I.getDebugLoc(), TII.get(ARM::RSBri))
+ BuildMI(MBB, InsertBefore, I.getDebugLoc(), TII.get(Opcodes.RSB))
.addDef(SExtResult)
.addUse(AndResult)
.addImm(0)
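The i1 extension handling above ("ZExt boils down to & 0x1; for SExt we also subtract that from 0") is plain bit arithmetic; a compact sketch with invented names:

#include <cstdint>

static uint32_t zext_i1(uint32_t v) { return v & 1u; }        // the ANDri
static uint32_t sext_i1(uint32_t v) { return 0u - (v & 1u); } // ANDri then RSBri 0: yields 0 or 0xFFFFFFFF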
diff --git a/contrib/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp
index 891418306903..4a0c24d58474 100644
--- a/contrib/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp
@@ -75,13 +75,48 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) {
const LLT s32 = LLT::scalar(32);
const LLT s64 = LLT::scalar(64);
- getActionDefinitionsBuilder(G_GLOBAL_VALUE).legalFor({p0});
- getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({p0});
+ if (ST.isThumb1Only()) {
+ // Thumb1 is not supported yet.
+ computeTables();
+ verify(*ST.getInstrInfo());
+ return;
+ }
+
+ getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
+ .legalForCartesianProduct({s32}, {s1, s8, s16});
getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR})
.legalFor({s32})
.minScalar(0, s32);
+ getActionDefinitionsBuilder(G_INTTOPTR).legalFor({{p0, s32}});
+ getActionDefinitionsBuilder(G_PTRTOINT).legalFor({{s32, p0}});
+
+ getActionDefinitionsBuilder(G_CONSTANT)
+ .legalFor({s32, p0})
+ .clampScalar(0, s32, s32);
+
+ // We're keeping these builders around because we'll want to add support for
+ // floating point to them.
+ auto &LoadStoreBuilder =
+ getActionDefinitionsBuilder({G_LOAD, G_STORE})
+ .legalForTypesWithMemSize({
+ {s1, p0, 8},
+ {s8, p0, 8},
+ {s16, p0, 16},
+ {s32, p0, 32},
+ {p0, p0, 32}});
+
+ if (ST.isThumb()) {
+ // FIXME: merge with the code for non-Thumb.
+ computeTables();
+ verify(*ST.getInstrInfo());
+ return;
+ }
+
+ getActionDefinitionsBuilder(G_GLOBAL_VALUE).legalFor({p0});
+ getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({p0});
+
if (ST.hasDivideInARMMode())
getActionDefinitionsBuilder({G_SDIV, G_UDIV})
.legalFor({s32})
@@ -101,14 +136,24 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) {
setAction({Op, s32}, Libcall);
}
- getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
- .legalForCartesianProduct({s32}, {s1, s8, s16});
-
- getActionDefinitionsBuilder(G_INTTOPTR).legalFor({{p0, s32}});
- getActionDefinitionsBuilder(G_PTRTOINT).legalFor({{s32, p0}});
-
getActionDefinitionsBuilder({G_ASHR, G_LSHR, G_SHL}).legalFor({s32});
+ if (ST.hasV5TOps()) {
+ getActionDefinitionsBuilder(G_CTLZ)
+ .legalFor({s32})
+ .clampScalar(0, s32, s32);
+ getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF)
+ .lowerFor({s32})
+ .clampScalar(0, s32, s32);
+ } else {
+ getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF)
+ .libcallFor({s32})
+ .clampScalar(0, s32, s32);
+ getActionDefinitionsBuilder(G_CTLZ)
+ .lowerFor({s32})
+ .clampScalar(0, s32, s32);
+ }
+
getActionDefinitionsBuilder(G_GEP).legalFor({{p0, s32}});
getActionDefinitionsBuilder(G_SELECT).legalForCartesianProduct({s32, p0},
@@ -116,20 +161,12 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) {
getActionDefinitionsBuilder(G_BRCOND).legalFor({s1});
- getActionDefinitionsBuilder(G_CONSTANT)
- .legalFor({s32, p0})
- .clampScalar(0, s32, s32);
-
getActionDefinitionsBuilder(G_ICMP)
.legalForCartesianProduct({s1}, {s32, p0})
.minScalar(1, s32);
// We're keeping these builders around because we'll want to add support for
// floating point to them.
- auto &LoadStoreBuilder =
- getActionDefinitionsBuilder({G_LOAD, G_STORE})
- .legalForCartesianProduct({s1, s8, s16, s32, p0}, {p0});
-
auto &PhiBuilder =
getActionDefinitionsBuilder(G_PHI).legalFor({s32, p0}).minScalar(0, s32);
@@ -302,7 +339,8 @@ ARMLegalizerInfo::getFCmpLibcalls(CmpInst::Predicate Predicate,
bool ARMLegalizerInfo::legalizeCustom(MachineInstr &MI,
MachineRegisterInfo &MRI,
- MachineIRBuilder &MIRBuilder) const {
+ MachineIRBuilder &MIRBuilder,
+ GISelChangeObserver &Observer) const {
using namespace TargetOpcode;
MIRBuilder.setInstr(MI);
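The new G_CTLZ rules split on hasV5TOps: with CLZ available, G_CTLZ is legal and the zero-undef flavour lowers onto it; without CLZ, the zero-undef flavour becomes a libcall and G_CTLZ lowers to it behind an explicit zero check. A scalar sketch of that relationship, names invented:

#include <cstdint>

// Stand-in for the zero-undef primitive (hardware CLZ or a libcall).
static unsigned ctlz32_zero_undef(uint32_t x) {
  unsigned n = 0;
  for (uint32_t bit = 0x80000000u; bit != 0 && (x & bit) == 0; bit >>= 1)
    ++n;
  return n;
}

// Full ctlz is defined for a zero input as well, so it carries the guard.
static unsigned ctlz32(uint32_t x) { return x == 0 ? 32 : ctlz32_zero_undef(x); }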
diff --git a/contrib/llvm/lib/Target/ARM/ARMLegalizerInfo.h b/contrib/llvm/lib/Target/ARM/ARMLegalizerInfo.h
index 78ab9412c04b..527bf87f1093 100644
--- a/contrib/llvm/lib/Target/ARM/ARMLegalizerInfo.h
+++ b/contrib/llvm/lib/Target/ARM/ARMLegalizerInfo.h
@@ -15,6 +15,7 @@
#define LLVM_LIB_TARGET_ARM_ARMMACHINELEGALIZER_H
#include "llvm/ADT/IndexedMap.h"
+#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/IR/Instructions.h"
@@ -29,7 +30,8 @@ public:
ARMLegalizerInfo(const ARMSubtarget &ST);
bool legalizeCustom(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &MIRBuilder) const override;
+ MachineIRBuilder &MIRBuilder,
+ GISelChangeObserver &Observer) const override;
private:
void setFCmpLibcallsGNU();
diff --git a/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
index db5f28480e90..6da7430a8e51 100644
--- a/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -1027,6 +1027,18 @@ void ARMLoadStoreOpt::FormCandidates(const MemOpQueue &MemOps) {
if (AssumeMisalignedLoadStores && !mayCombineMisaligned(*STI, *MI))
CanMergeToLSMulti = CanMergeToLSDouble = false;
+ // vldm / vstm limits are 32 for S variants, 16 for D variants.
+ unsigned Limit;
+ switch (Opcode) {
+ default:
+ Limit = UINT_MAX;
+ break;
+ case ARM::VLDRD:
+ case ARM::VSTRD:
+ Limit = 16;
+ break;
+ }
+
// Merge following instructions where possible.
for (unsigned I = SIndex+1; I < EIndex; ++I, ++Count) {
int NewOffset = MemOps[I].Offset;
@@ -1036,6 +1048,8 @@ void ARMLoadStoreOpt::FormCandidates(const MemOpQueue &MemOps) {
unsigned Reg = MO.getReg();
if (Reg == ARM::SP || Reg == ARM::PC)
break;
+ if (Count == Limit)
+ break;
// See if the current load/store may be part of a multi load/store.
unsigned RegNum = MO.isUndef() ? std::numeric_limits<unsigned>::max()
@@ -1303,7 +1317,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineInstr *MI) {
MIB.add(MI->getOperand(OpNum));
// Transfer memoperands.
- MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+ MIB.setMemRefs(MI->memoperands());
MBB.erase(MBBI);
return true;
@@ -1527,7 +1541,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSDouble(MachineInstr &MI) const {
// Transfer implicit operands.
for (const MachineOperand &MO : MI.implicit_operands())
MIB.add(MO);
- MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+ MIB.setMemRefs(MI.memoperands());
MBB.erase(MBBI);
return true;
@@ -1834,7 +1848,7 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
auto LessThan = [](const MergeCandidate* M0, const MergeCandidate *M1) {
return M0->InsertPos < M1->InsertPos;
};
- llvm::sort(Candidates.begin(), Candidates.end(), LessThan);
+ llvm::sort(Candidates, LessThan);
// Go through list of candidates and merge.
bool Changed = false;
@@ -2172,13 +2186,12 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
bool RetVal = false;
// Sort by offset (in reverse order).
- llvm::sort(Ops.begin(), Ops.end(),
- [](const MachineInstr *LHS, const MachineInstr *RHS) {
- int LOffset = getMemoryOpOffset(*LHS);
- int ROffset = getMemoryOpOffset(*RHS);
- assert(LHS == RHS || LOffset != ROffset);
- return LOffset > ROffset;
- });
+ llvm::sort(Ops, [](const MachineInstr *LHS, const MachineInstr *RHS) {
+ int LOffset = getMemoryOpOffset(*LHS);
+ int ROffset = getMemoryOpOffset(*RHS);
+ assert(LHS == RHS || LOffset != ROffset);
+ return LOffset > ROffset;
+ });
// The loads / stores of the same base are in order. Scan them from first to
// last and check for the following:
@@ -2290,7 +2303,7 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
if (!isT2)
MIB.addReg(0);
MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
- MIB.setMemRefs(Op0->mergeMemRefsWith(*Op1));
+ MIB.cloneMergedMemRefs({Op0, Op1});
LLVM_DEBUG(dbgs() << "Formed " << *MIB << "\n");
++NumLDRDFormed;
} else {
@@ -2304,7 +2317,7 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
if (!isT2)
MIB.addReg(0);
MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
- MIB.setMemRefs(Op0->mergeMemRefsWith(*Op1));
+ MIB.cloneMergedMemRefs({Op0, Op1});
LLVM_DEBUG(dbgs() << "Formed " << *MIB << "\n");
++NumSTRDFormed;
}
diff --git a/contrib/llvm/lib/Target/ARM/ARMMacroFusion.cpp b/contrib/llvm/lib/Target/ARM/ARMMacroFusion.cpp
index d11fe9d5c502..df1da9d8e474 100644
--- a/contrib/llvm/lib/Target/ARM/ARMMacroFusion.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMMacroFusion.cpp
@@ -23,20 +23,13 @@ namespace llvm {
static bool isAESPair(const MachineInstr *FirstMI,
const MachineInstr &SecondMI) {
// Assume the 1st instr to be a wildcard if it is unspecified.
- unsigned FirstOpcode =
- FirstMI ? FirstMI->getOpcode()
- : static_cast<unsigned>(ARM::INSTRUCTION_LIST_END);
- unsigned SecondOpcode = SecondMI.getOpcode();
-
- switch(SecondOpcode) {
+ switch(SecondMI.getOpcode()) {
// AES encode.
case ARM::AESMC :
- return FirstOpcode == ARM::AESE ||
- FirstOpcode == ARM::INSTRUCTION_LIST_END;
+ return FirstMI == nullptr || FirstMI->getOpcode() == ARM::AESE;
// AES decode.
case ARM::AESIMC:
- return FirstOpcode == ARM::AESD ||
- FirstOpcode == ARM::INSTRUCTION_LIST_END;
+ return FirstMI == nullptr || FirstMI->getOpcode() == ARM::AESD;
}
return false;
@@ -46,15 +39,8 @@ static bool isAESPair(const MachineInstr *FirstMI,
static bool isLiteralsPair(const MachineInstr *FirstMI,
const MachineInstr &SecondMI) {
// Assume the 1st instr to be a wildcard if it is unspecified.
- unsigned FirstOpcode =
- FirstMI ? FirstMI->getOpcode()
- : static_cast<unsigned>(ARM::INSTRUCTION_LIST_END);
- unsigned SecondOpcode = SecondMI.getOpcode();
-
- // 32 bit immediate.
- if ((FirstOpcode == ARM::INSTRUCTION_LIST_END ||
- FirstOpcode == ARM::MOVi16) &&
- SecondOpcode == ARM::MOVTi16)
+ if ((FirstMI == nullptr || FirstMI->getOpcode() == ARM::MOVi16) &&
+ SecondMI.getOpcode() == ARM::MOVTi16)
return true;
return false;
diff --git a/contrib/llvm/lib/Target/ARM/ARMMacroFusion.h b/contrib/llvm/lib/Target/ARM/ARMMacroFusion.h
index 1e4fc6687eae..b3abd7b593a1 100644
--- a/contrib/llvm/lib/Target/ARM/ARMMacroFusion.h
+++ b/contrib/llvm/lib/Target/ARM/ARMMacroFusion.h
@@ -12,6 +12,9 @@
//
//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIB_TARGET_ARM_ARMMACROFUSION_H
+#define LLVM_LIB_TARGET_ARM_ARMMACROFUSION_H
+
#include "llvm/CodeGen/MachineScheduler.h"
namespace llvm {
@@ -22,3 +25,5 @@ namespace llvm {
std::unique_ptr<ScheduleDAGMutation> createARMMacroFusionDAGMutation();
} // llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMParallelDSP.cpp b/contrib/llvm/lib/Target/ARM/ARMParallelDSP.cpp
index 9d5478b76c18..fc3258914f92 100644
--- a/contrib/llvm/lib/Target/ARM/ARMParallelDSP.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMParallelDSP.cpp
@@ -42,6 +42,10 @@ using namespace PatternMatch;
STATISTIC(NumSMLAD , "Number of smlad instructions generated");
+static cl::opt<bool>
+DisableParallelDSP("disable-arm-parallel-dsp", cl::Hidden, cl::init(false),
+ cl::desc("Disable the ARM Parallel DSP pass"));
+
namespace {
struct OpChain;
struct BinOpChain;
@@ -67,7 +71,7 @@ namespace {
virtual ~OpChain() = default;
void SetMemoryLocations() {
- const auto Size = MemoryLocation::UnknownSize;
+ const auto Size = LocationSize::unknown();
for (auto *V : AllValues) {
if (auto *I = dyn_cast<Instruction>(V)) {
if (I->mayWriteToMemory())
@@ -88,12 +92,15 @@ namespace {
struct BinOpChain : public OpChain {
ValueList LHS; // List of all (narrow) left hand operands.
ValueList RHS; // List of all (narrow) right hand operands.
+ bool Exchange = false;
BinOpChain(Instruction *I, ValueList &lhs, ValueList &rhs) :
OpChain(I, lhs), LHS(lhs), RHS(rhs) {
for (auto *V : RHS)
AllValues.push_back(V);
}
+
+ bool AreSymmetrical(BinOpChain *Other);
};
struct Reduction {
@@ -101,9 +108,9 @@ namespace {
// pattern matching.
Instruction *AccIntAdd; // The accumulating integer add statement,
// i.e, the reduction statement.
-
OpChainList MACCandidates; // The MAC candidates associated with
// this reduction statement.
+ PMACPairList PMACPairs;
Reduction (PHINode *P, Instruction *Acc) : Phi(P), AccIntAdd(Acc) { };
};
@@ -116,12 +123,16 @@ namespace {
Loop *L;
const DataLayout *DL;
Module *M;
+ std::map<LoadInst*, LoadInst*> LoadPairs;
+ std::map<LoadInst*, SmallVector<LoadInst*, 4>> SequentialLoads;
- bool InsertParallelMACs(Reduction &Reduction, PMACPairList &PMACPairs);
+ bool RecordSequentialLoads(BasicBlock *Header);
+ bool InsertParallelMACs(Reduction &Reduction);
bool AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1, MemInstList &VecMem);
- PMACPairList CreateParallelMACPairs(OpChainList &Candidates);
+ void CreateParallelMACPairs(Reduction &R);
Instruction *CreateSMLADCall(LoadInst *VecLd0, LoadInst *VecLd1,
- Instruction *Acc, Instruction *InsertAfter);
+ Instruction *Acc, bool Exchange,
+ Instruction *InsertAfter);
/// Try to match and generate: SMLAD, SMLADX - Signed Multiply Accumulate
/// Dual performs two signed 16x16-bit multiplications. It adds the
@@ -149,6 +160,8 @@ namespace {
}
bool runOnLoop(Loop *TheLoop, LPPassManager &) override {
+ if (DisableParallelDSP)
+ return false;
L = TheLoop;
SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
@@ -192,7 +205,14 @@ namespace {
LoopAccessInfo LAI(L, SE, TLI, AA, DT, LI);
bool Changes = false;
- LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n\n");
+ LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n");
+ LLVM_DEBUG(dbgs() << " - " << F.getName() << "\n\n");
+
+ if (!RecordSequentialLoads(Header)) {
+ LLVM_DEBUG(dbgs() << " - No sequential loads found.\n");
+ return false;
+ }
+
Changes = MatchSMLAD(F);
return Changes;
}
@@ -245,57 +265,14 @@ static bool IsNarrowSequence(Value *V, ValueList &VL) {
return false;
}
-// Element-by-element comparison of Value lists returning true if they are
-// instructions with the same opcode or constants with the same value.
-static bool AreSymmetrical(const ValueList &VL0,
- const ValueList &VL1) {
- if (VL0.size() != VL1.size()) {
- LLVM_DEBUG(dbgs() << "Muls are mismatching operand list lengths: "
- << VL0.size() << " != " << VL1.size() << "\n");
- return false;
- }
-
- const unsigned Pairs = VL0.size();
- LLVM_DEBUG(dbgs() << "Number of operand pairs: " << Pairs << "\n");
-
- for (unsigned i = 0; i < Pairs; ++i) {
- const Value *V0 = VL0[i];
- const Value *V1 = VL1[i];
- const auto *Inst0 = dyn_cast<Instruction>(V0);
- const auto *Inst1 = dyn_cast<Instruction>(V1);
-
- LLVM_DEBUG(dbgs() << "Pair " << i << ":\n";
- dbgs() << "mul1: "; V0->dump();
- dbgs() << "mul2: "; V1->dump());
-
- if (!Inst0 || !Inst1)
- return false;
-
- if (Inst0->isSameOperationAs(Inst1)) {
- LLVM_DEBUG(dbgs() << "OK: same operation found!\n");
- continue;
- }
-
- const APInt *C0, *C1;
- if (!(match(V0, m_APInt(C0)) && match(V1, m_APInt(C1)) && C0 == C1))
- return false;
- }
-
- LLVM_DEBUG(dbgs() << "OK: found symmetrical operand lists.\n");
- return true;
-}
-
template<typename MemInst>
static bool AreSequentialAccesses(MemInst *MemOp0, MemInst *MemOp1,
- MemInstList &VecMem, const DataLayout &DL,
- ScalarEvolution &SE) {
+ const DataLayout &DL, ScalarEvolution &SE) {
if (!MemOp0->isSimple() || !MemOp1->isSimple()) {
LLVM_DEBUG(dbgs() << "No, not touching volatile access\n");
return false;
}
if (isConsecutiveAccess(MemOp0, MemOp1, DL, SE)) {
- VecMem.push_back(MemOp0);
- VecMem.push_back(MemOp1);
LLVM_DEBUG(dbgs() << "OK: accesses are consecutive.\n");
return true;
}
@@ -318,82 +295,156 @@ bool ARMParallelDSP::AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1,
return false;
}
- return AreSequentialAccesses<LoadInst>(Ld0, Ld1, VecMem, *DL, *SE);
+ if (!LoadPairs.count(Ld0) || LoadPairs[Ld0] != Ld1)
+ return false;
+
+ VecMem.clear();
+ VecMem.push_back(Ld0);
+ VecMem.push_back(Ld1);
+ return true;
}
-PMACPairList
-ARMParallelDSP::CreateParallelMACPairs(OpChainList &Candidates) {
+/// Iterate through the block and record base, offset pairs of loads as well as
+/// maximal sequences of sequential loads.
+bool ARMParallelDSP::RecordSequentialLoads(BasicBlock *Header) {
+ SmallVector<LoadInst*, 8> Loads;
+ for (auto &I : *Header) {
+ auto *Ld = dyn_cast<LoadInst>(&I);
+ if (!Ld)
+ continue;
+ Loads.push_back(Ld);
+ }
+
+ std::map<LoadInst*, LoadInst*> BaseLoads;
+
+ for (auto *Ld0 : Loads) {
+ for (auto *Ld1 : Loads) {
+ if (Ld0 == Ld1)
+ continue;
+
+ if (AreSequentialAccesses<LoadInst>(Ld0, Ld1, *DL, *SE)) {
+ LoadPairs[Ld0] = Ld1;
+ if (BaseLoads.count(Ld0)) {
+ LoadInst *Base = BaseLoads[Ld0];
+ BaseLoads[Ld1] = Base;
+ SequentialLoads[Base].push_back(Ld1);
+ } else {
+ BaseLoads[Ld1] = Ld0;
+ SequentialLoads[Ld0].push_back(Ld1);
+ }
+ }
+ }
+ }
+ return LoadPairs.size() > 1;
+}
+
+void ARMParallelDSP::CreateParallelMACPairs(Reduction &R) {
+ OpChainList &Candidates = R.MACCandidates;
+ PMACPairList &PMACPairs = R.PMACPairs;
const unsigned Elems = Candidates.size();
- PMACPairList PMACPairs;
if (Elems < 2)
- return PMACPairs;
+ return;
- // TODO: for now we simply try to match consecutive pairs i and i+1.
- // We can compare all elements, but then we need to compare and evaluate
- // different solutions.
- for(unsigned i=0; i<Elems-1; i+=2) {
- BinOpChain *PMul0 = static_cast<BinOpChain*>(Candidates[i].get());
- BinOpChain *PMul1 = static_cast<BinOpChain*>(Candidates[i+1].get());
- const Instruction *Mul0 = PMul0->Root;
- const Instruction *Mul1 = PMul1->Root;
+ auto CanPair = [&](BinOpChain *PMul0, BinOpChain *PMul1) {
+ if (!PMul0->AreSymmetrical(PMul1))
+ return false;
+
+ // The first elements of each vector should be loads with sexts. If we
+ // find two pairs of consecutive loads, then these can be
+ // transformed into two wider loads and the users can be replaced with
+ // DSP intrinsics.
+ for (unsigned x = 0; x < PMul0->LHS.size(); x += 2) {
+ auto *Ld0 = dyn_cast<LoadInst>(PMul0->LHS[x]);
+ auto *Ld1 = dyn_cast<LoadInst>(PMul1->LHS[x]);
+ auto *Ld2 = dyn_cast<LoadInst>(PMul0->RHS[x]);
+ auto *Ld3 = dyn_cast<LoadInst>(PMul1->RHS[x]);
+
+ if (!Ld0 || !Ld1 || !Ld2 || !Ld3)
+ return false;
- if (Mul0 == Mul1)
+ LLVM_DEBUG(dbgs() << "Looking at operands " << x << ":\n"
+ << "\t Ld0: " << *Ld0 << "\n"
+ << "\t Ld1: " << *Ld1 << "\n"
+ << "and operands " << x + 2 << ":\n"
+ << "\t Ld2: " << *Ld2 << "\n"
+ << "\t Ld3: " << *Ld3 << "\n");
+
+ if (AreSequentialLoads(Ld0, Ld1, PMul0->VecLd)) {
+ if (AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
+ LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
+ PMACPairs.push_back(std::make_pair(PMul0, PMul1));
+ return true;
+ } else if (AreSequentialLoads(Ld3, Ld2, PMul1->VecLd)) {
+ LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
+ LLVM_DEBUG(dbgs() << " exchanging Ld2 and Ld3\n");
+ PMul1->Exchange = true;
+ PMACPairs.push_back(std::make_pair(PMul0, PMul1));
+ return true;
+ }
+ } else if (AreSequentialLoads(Ld1, Ld0, PMul0->VecLd) &&
+ AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
+ LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
+ LLVM_DEBUG(dbgs() << " exchanging Ld0 and Ld1\n");
+ LLVM_DEBUG(dbgs() << " and swapping muls\n");
+ PMul0->Exchange = true;
+ // Only the second operand can be exchanged, so swap the muls.
+ PMACPairs.push_back(std::make_pair(PMul1, PMul0));
+ return true;
+ }
+ }
+ return false;
+ };
+
+ SmallPtrSet<const Instruction*, 4> Paired;
+ for (unsigned i = 0; i < Elems; ++i) {
+ BinOpChain *PMul0 = static_cast<BinOpChain*>(Candidates[i].get());
+ if (Paired.count(PMul0->Root))
continue;
- LLVM_DEBUG(dbgs() << "\nCheck parallel muls:\n";
- dbgs() << "- "; Mul0->dump();
- dbgs() << "- "; Mul1->dump());
+ for (unsigned j = 0; j < Elems; ++j) {
+ if (i == j)
+ continue;
- const ValueList &Mul0_LHS = PMul0->LHS;
- const ValueList &Mul0_RHS = PMul0->RHS;
- const ValueList &Mul1_LHS = PMul1->LHS;
- const ValueList &Mul1_RHS = PMul1->RHS;
+ BinOpChain *PMul1 = static_cast<BinOpChain*>(Candidates[j].get());
+ if (Paired.count(PMul1->Root))
+ continue;
- if (!AreSymmetrical(Mul0_LHS, Mul1_LHS) ||
- !AreSymmetrical(Mul0_RHS, Mul1_RHS))
- continue;
+ const Instruction *Mul0 = PMul0->Root;
+ const Instruction *Mul1 = PMul1->Root;
+ if (Mul0 == Mul1)
+ continue;
- LLVM_DEBUG(dbgs() << "OK: mul operands list match:\n");
- // The first elements of each vector should be loads with sexts. If we find
- // that its two pairs of consecutive loads, then these can be transformed
- // into two wider loads and the users can be replaced with DSP
- // intrinsics.
- for (unsigned x = 0; x < Mul0_LHS.size(); x += 2) {
- auto *Ld0 = dyn_cast<LoadInst>(Mul0_LHS[x]);
- auto *Ld1 = dyn_cast<LoadInst>(Mul1_LHS[x]);
- auto *Ld2 = dyn_cast<LoadInst>(Mul0_RHS[x]);
- auto *Ld3 = dyn_cast<LoadInst>(Mul1_RHS[x]);
-
- LLVM_DEBUG(dbgs() << "Looking at operands " << x << ":\n";
- dbgs() << "\t mul1: "; Mul0_LHS[x]->dump();
- dbgs() << "\t mul2: "; Mul1_LHS[x]->dump();
- dbgs() << "and operands " << x + 2 << ":\n";
- dbgs() << "\t mul1: "; Mul0_RHS[x]->dump();
- dbgs() << "\t mul2: "; Mul1_RHS[x]->dump());
-
- if (AreSequentialLoads(Ld0, Ld1, PMul0->VecLd) &&
- AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
- LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
- PMACPairs.push_back(std::make_pair(PMul0, PMul1));
+ assert(PMul0 != PMul1 && "expected different chains");
+
+ LLVM_DEBUG(dbgs() << "\nCheck parallel muls:\n";
+ dbgs() << "- "; Mul0->dump();
+ dbgs() << "- "; Mul1->dump());
+
+ LLVM_DEBUG(dbgs() << "OK: mul operands list match:\n");
+ if (CanPair(PMul0, PMul1)) {
+ Paired.insert(Mul0);
+ Paired.insert(Mul1);
+ break;
}
}
}
- return PMACPairs;
}
-bool ARMParallelDSP::InsertParallelMACs(Reduction &Reduction,
- PMACPairList &PMACPairs) {
+bool ARMParallelDSP::InsertParallelMACs(Reduction &Reduction) {
Instruction *Acc = Reduction.Phi;
Instruction *InsertAfter = Reduction.AccIntAdd;
- for (auto &Pair : PMACPairs) {
+ for (auto &Pair : Reduction.PMACPairs) {
+ BinOpChain *PMul0 = Pair.first;
+ BinOpChain *PMul1 = Pair.second;
LLVM_DEBUG(dbgs() << "Found parallel MACs!!\n";
- dbgs() << "- "; Pair.first->Root->dump();
- dbgs() << "- "; Pair.second->Root->dump());
- auto *VecLd0 = cast<LoadInst>(Pair.first->VecLd[0]);
- auto *VecLd1 = cast<LoadInst>(Pair.second->VecLd[0]);
- Acc = CreateSMLADCall(VecLd0, VecLd1, Acc, InsertAfter);
+ dbgs() << "- "; PMul0->Root->dump();
+ dbgs() << "- "; PMul1->Root->dump());
+
+ auto *VecLd0 = cast<LoadInst>(PMul0->VecLd[0]);
+ auto *VecLd1 = cast<LoadInst>(PMul1->VecLd[0]);
+ Acc = CreateSMLADCall(VecLd0, VecLd1, Acc, PMul1->Exchange, InsertAfter);
InsertAfter = Acc;
}
@@ -420,7 +471,7 @@ static void MatchReductions(Function &F, Loop *TheLoop, BasicBlock *Header,
for (PHINode &Phi : Header->phis()) {
const auto *Ty = Phi.getType();
- if (!Ty->isIntegerTy(32))
+ if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64))
continue;
const bool IsReduction =
@@ -447,10 +498,11 @@ static void MatchReductions(Function &F, Loop *TheLoop, BasicBlock *Header,
}
static void AddMACCandidate(OpChainList &Candidates,
- const Instruction *Acc,
- Value *MulOp0, Value *MulOp1, int MulOpNum) {
- Instruction *Mul = dyn_cast<Instruction>(Acc->getOperand(MulOpNum));
+ Instruction *Mul,
+ Value *MulOp0, Value *MulOp1) {
LLVM_DEBUG(dbgs() << "OK, found acc mul:\t"; Mul->dump());
+ assert(Mul->getOpcode() == Instruction::Mul &&
+ "expected mul instruction");
ValueList LHS;
ValueList RHS;
if (IsNarrowSequence<16>(MulOp0, LHS) &&
@@ -462,31 +514,38 @@ static void AddMACCandidate(OpChainList &Candidates,
static void MatchParallelMACSequences(Reduction &R,
OpChainList &Candidates) {
- const Instruction *Acc = R.AccIntAdd;
- Value *A, *MulOp0, *MulOp1;
- LLVM_DEBUG(dbgs() << "\n- Analysing:\t"; Acc->dump());
-
- // Pattern 1: the accumulator is the RHS of the mul.
- while(match(Acc, m_Add(m_Mul(m_Value(MulOp0), m_Value(MulOp1)),
- m_Value(A)))){
- AddMACCandidate(Candidates, Acc, MulOp0, MulOp1, 0);
- Acc = dyn_cast<Instruction>(A);
- }
- // Pattern 2: the accumulator is the LHS of the mul.
- while(match(Acc, m_Add(m_Value(A),
- m_Mul(m_Value(MulOp0), m_Value(MulOp1))))) {
- AddMACCandidate(Candidates, Acc, MulOp0, MulOp1, 1);
- Acc = dyn_cast<Instruction>(A);
- }
+ Instruction *Acc = R.AccIntAdd;
+ LLVM_DEBUG(dbgs() << "\n- Analysing:\t" << *Acc);
- // The last mul in the chain has a slightly different pattern:
- // the mul is the first operand
- if (match(Acc, m_Add(m_Mul(m_Value(MulOp0), m_Value(MulOp1)), m_Value(A))))
- AddMACCandidate(Candidates, Acc, MulOp0, MulOp1, 0);
+ // Returns false to signal the search should be stopped.
+ std::function<bool(Value*)> Match =
+ [&Candidates, &Match](Value *V) -> bool {
- // Because we start at the bottom of the chain, and we work our way up,
- // the muls are added in reverse program order to the list.
- std::reverse(Candidates.begin(), Candidates.end());
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I)
+ return false;
+
+ switch (I->getOpcode()) {
+ case Instruction::Add:
+ if (Match(I->getOperand(0)) || (Match(I->getOperand(1))))
+ return true;
+ break;
+ case Instruction::Mul: {
+ Value *MulOp0 = I->getOperand(0);
+ Value *MulOp1 = I->getOperand(1);
+ if (isa<SExtInst>(MulOp0) && isa<SExtInst>(MulOp1))
+ AddMACCandidate(Candidates, I, MulOp0, MulOp1);
+ return false;
+ }
+ case Instruction::SExt:
+ return Match(I->getOperand(0));
+ }
+ return false;
+ };
+
+ while (Match (Acc));
+ LLVM_DEBUG(dbgs() << "Finished matching MAC sequences, found "
+ << Candidates.size() << " candidates.\n");
}
// Collects all instructions that are not part of the MAC chains, which is the
@@ -621,45 +680,100 @@ bool ARMParallelDSP::MatchSMLAD(Function &F) {
for (auto &R : Reductions) {
if (AreAliased(AA, Reads, Writes, R.MACCandidates))
return false;
- PMACPairList PMACPairs = CreateParallelMACPairs(R.MACCandidates);
- Changed |= InsertParallelMACs(R, PMACPairs);
+ CreateParallelMACPairs(R);
+ Changed |= InsertParallelMACs(R);
}
LLVM_DEBUG(if (Changed) dbgs() << "Header block:\n"; Header->dump(););
return Changed;
}
-static void CreateLoadIns(IRBuilder<NoFolder> &IRB, Instruction *Acc,
- LoadInst **VecLd) {
- const Type *AccTy = Acc->getType();
- const unsigned AddrSpace = (*VecLd)->getPointerAddressSpace();
+static LoadInst *CreateLoadIns(IRBuilder<NoFolder> &IRB, LoadInst &BaseLoad,
+ const Type *LoadTy) {
+ const unsigned AddrSpace = BaseLoad.getPointerAddressSpace();
- Value *VecPtr = IRB.CreateBitCast((*VecLd)->getPointerOperand(),
- AccTy->getPointerTo(AddrSpace));
- *VecLd = IRB.CreateAlignedLoad(VecPtr, (*VecLd)->getAlignment());
+ Value *VecPtr = IRB.CreateBitCast(BaseLoad.getPointerOperand(),
+ LoadTy->getPointerTo(AddrSpace));
+ return IRB.CreateAlignedLoad(VecPtr, BaseLoad.getAlignment());
}
Instruction *ARMParallelDSP::CreateSMLADCall(LoadInst *VecLd0, LoadInst *VecLd1,
- Instruction *Acc,
+ Instruction *Acc, bool Exchange,
Instruction *InsertAfter) {
- LLVM_DEBUG(dbgs() << "Create SMLAD intrinsic using:\n";
- dbgs() << "- "; VecLd0->dump();
- dbgs() << "- "; VecLd1->dump();
- dbgs() << "- "; Acc->dump());
+ LLVM_DEBUG(dbgs() << "Create SMLAD intrinsic using:\n"
+ << "- " << *VecLd0 << "\n"
+ << "- " << *VecLd1 << "\n"
+ << "- " << *Acc << "\n"
+ << "Exchange: " << Exchange << "\n");
IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
++BasicBlock::iterator(InsertAfter));
// Replace the reduction chain with an intrinsic call
- CreateLoadIns(Builder, Acc, &VecLd0);
- CreateLoadIns(Builder, Acc, &VecLd1);
- Value* Args[] = { VecLd0, VecLd1, Acc };
- Function *SMLAD = Intrinsic::getDeclaration(M, Intrinsic::arm_smlad);
+ const Type *Ty = IntegerType::get(M->getContext(), 32);
+ LoadInst *NewLd0 = CreateLoadIns(Builder, VecLd0[0], Ty);
+ LoadInst *NewLd1 = CreateLoadIns(Builder, VecLd1[0], Ty);
+ Value* Args[] = { NewLd0, NewLd1, Acc };
+ Function *SMLAD = nullptr;
+ if (Exchange)
+ SMLAD = Acc->getType()->isIntegerTy(32) ?
+ Intrinsic::getDeclaration(M, Intrinsic::arm_smladx) :
+ Intrinsic::getDeclaration(M, Intrinsic::arm_smlaldx);
+ else
+ SMLAD = Acc->getType()->isIntegerTy(32) ?
+ Intrinsic::getDeclaration(M, Intrinsic::arm_smlad) :
+ Intrinsic::getDeclaration(M, Intrinsic::arm_smlald);
CallInst *Call = Builder.CreateCall(SMLAD, Args);
NumSMLAD++;
return Call;
}
+// Compare the value lists in Other to this chain.
+bool BinOpChain::AreSymmetrical(BinOpChain *Other) {
+ // Element-by-element comparison of Value lists returning true if they are
+ // instructions with the same opcode or constants with the same value.
+ auto CompareValueList = [](const ValueList &VL0,
+ const ValueList &VL1) {
+ if (VL0.size() != VL1.size()) {
+ LLVM_DEBUG(dbgs() << "Muls are mismatching operand list lengths: "
+ << VL0.size() << " != " << VL1.size() << "\n");
+ return false;
+ }
+
+ const unsigned Pairs = VL0.size();
+ LLVM_DEBUG(dbgs() << "Number of operand pairs: " << Pairs << "\n");
+
+ for (unsigned i = 0; i < Pairs; ++i) {
+ const Value *V0 = VL0[i];
+ const Value *V1 = VL1[i];
+ const auto *Inst0 = dyn_cast<Instruction>(V0);
+ const auto *Inst1 = dyn_cast<Instruction>(V1);
+
+ LLVM_DEBUG(dbgs() << "Pair " << i << ":\n";
+ dbgs() << "mul1: "; V0->dump();
+ dbgs() << "mul2: "; V1->dump());
+
+ if (!Inst0 || !Inst1)
+ return false;
+
+ if (Inst0->isSameOperationAs(Inst1)) {
+ LLVM_DEBUG(dbgs() << "OK: same operation found!\n");
+ continue;
+ }
+
+ const APInt *C0, *C1;
+ if (!(match(V0, m_APInt(C0)) && match(V1, m_APInt(C1)) && C0 == C1))
+ return false;
+ }
+
+ LLVM_DEBUG(dbgs() << "OK: found symmetrical operand lists.\n");
+ return true;
+ };
+
+ return CompareValueList(LHS, Other->LHS) &&
+ CompareValueList(RHS, Other->RHS);
+}
+
Pass *llvm::createARMParallelDSPPass() {
return new ARMParallelDSP();
}
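CreateSMLADCall now chooses between arm_smlad/arm_smladx (and the 64-bit smlald/smlaldx forms) based on the accumulator width and the new Exchange flag. A scalar reference for the 32-bit case, names invented, ignoring the Q-flag behaviour of the real instruction:

#include <cstdint>

static int32_t smlad_ref(uint32_t Rn, uint32_t Rm, int32_t Acc, bool Exchange) {
  if (Exchange)
    Rm = (Rm >> 16) | (Rm << 16); // smladx swaps the halfwords of the second operand
  int32_t Lo = int32_t(int16_t(Rn)) * int32_t(int16_t(Rm));
  int32_t Hi = int32_t(int16_t(Rn >> 16)) * int32_t(int16_t(Rm >> 16));
  return Acc + Lo + Hi; // the real instruction wraps and sets Q on overflow
}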
diff --git a/contrib/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp
index 0e16d6bcfe2b..4f28f2dafc70 100644
--- a/contrib/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp
@@ -234,6 +234,7 @@ ARMRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case G_GEP:
case G_INTTOPTR:
case G_PTRTOINT:
+ case G_CTLZ:
// FIXME: We're abusing the fact that everything lives in a GPR for now; in
// the real world we would use different mappings.
OperandsMapping = &ARM::ValueMappings[ARM::GPR3OpsIdx];
diff --git a/contrib/llvm/lib/Target/ARM/ARMSubtarget.cpp b/contrib/llvm/lib/Target/ARM/ARMSubtarget.cpp
index f42cbbda1b71..b1d0761e3231 100644
--- a/contrib/llvm/lib/Target/ARM/ARMSubtarget.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMSubtarget.cpp
@@ -188,8 +188,10 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
assert(hasV6T2Ops() || !hasThumb2());
// Execute only support requires movt support
- if (genExecuteOnly())
- assert(hasV8MBaselineOps() && !NoMovt && "Cannot generate execute-only code for this target");
+ if (genExecuteOnly()) {
+ NoMovt = false;
+ assert(hasV8MBaselineOps() && "Cannot generate execute-only code for this target");
+ }
// Keep a pointer to static instruction cost data for the specified CPU.
SchedModel = getSchedModelForCPU(CPUString);
@@ -287,7 +289,13 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
case CortexR7:
case CortexM3:
case CortexR52:
- case ExynosM1:
+ break;
+ case Exynos:
+ LdStMultipleTiming = SingleIssuePlusExtras;
+ MaxInterleaveFactor = 4;
+ if (!isThumb())
+ PrefLoopAlignment = 3;
+ break;
case Kryo:
break;
case Krait:
@@ -370,7 +378,8 @@ bool ARMSubtarget::useStride4VFPs(const MachineFunction &MF) const {
// For general targets, the prologue can grow when VFPs are allocated with
// stride 4 (more vpush instructions). But WatchOS uses a compact unwind
// format which it's more important to get right.
- return isTargetWatchABI() || (isSwift() && !MF.getFunction().optForMinSize());
+ return isTargetWatchABI() ||
+ (useWideStrideVFP() && !MF.getFunction().optForMinSize());
}
bool ARMSubtarget::useMovt(const MachineFunction &MF) const {
diff --git a/contrib/llvm/lib/Target/ARM/ARMSubtarget.h b/contrib/llvm/lib/Target/ARM/ARMSubtarget.h
index 74aee9a8ed38..11841b4467a2 100644
--- a/contrib/llvm/lib/Target/ARM/ARMSubtarget.h
+++ b/contrib/llvm/lib/Target/ARM/ARMSubtarget.h
@@ -68,7 +68,7 @@ protected:
CortexR5,
CortexR52,
CortexR7,
- ExynosM1,
+ Exynos,
Krait,
Kryo,
Swift
@@ -106,6 +106,7 @@ protected:
ARMv82a,
ARMv83a,
ARMv84a,
+ ARMv85a,
ARMv8a,
ARMv8mBaseline,
ARMv8mMainline,
@@ -153,6 +154,7 @@ protected:
bool HasV8_2aOps = false;
bool HasV8_3aOps = false;
bool HasV8_4aOps = false;
+ bool HasV8_5aOps = false;
bool HasV8MBaselineOps = false;
bool HasV8MMainlineOps = false;
@@ -227,6 +229,9 @@ protected:
/// HasFullFP16 - True if subtarget supports half-precision FP operations
bool HasFullFP16 = false;
+ /// HasFP16FML - True if subtarget supports half-precision FP fml operations
+ bool HasFP16FML = false;
+
/// HasD16 - True if subtarget is limited to 16 double precision
/// FP registers for VFPv3.
bool HasD16 = false;
@@ -353,6 +358,9 @@ protected:
/// If true, loading into a D subregister will be penalized.
bool SlowLoadDSubregister = false;
+ /// If true, use a wider stride when allocating VFP registers.
+ bool UseWideStrideVFP = false;
+
/// If true, the AGU and NEON/FPU units are multiplexed.
bool HasMuxedUnits = false;
@@ -408,6 +416,9 @@ protected:
/// UseSjLjEH - If true, the target uses SjLj exception handling (e.g. iOS).
bool UseSjLjEH = false;
+ /// Has speculation barrier
+ bool HasSB = false;
+
/// Implicitly convert an instruction to a different one if its immediates
/// cannot be encoded. For example, ADD r0, r1, #FFFFFFFF -> SUB r0, r1, #1.
bool NegativeImmediates = true;
@@ -432,6 +443,9 @@ protected:
/// operand cycle returned by the itinerary data for pre-ISel operands.
int PreISelOperandLatencyAdjustment = 2;
+ /// What alignment is preferred for loop bodies, in log2(bytes).
+ unsigned PrefLoopAlignment = 0;
+
/// IsLittle - The target is Little Endian
bool IsLittle;
@@ -529,6 +543,7 @@ public:
bool hasV8_2aOps() const { return HasV8_2aOps; }
bool hasV8_3aOps() const { return HasV8_3aOps; }
bool hasV8_4aOps() const { return HasV8_4aOps; }
+ bool hasV8_5aOps() const { return HasV8_5aOps; }
bool hasV8MBaselineOps() const { return HasV8MBaselineOps; }
bool hasV8MMainlineOps() const { return HasV8MMainlineOps; }
@@ -596,6 +611,7 @@ public:
bool hasVMLxHazards() const { return HasVMLxHazards; }
bool hasSlowOddRegister() const { return SlowOddRegister; }
bool hasSlowLoadDSubregister() const { return SlowLoadDSubregister; }
+ bool useWideStrideVFP() const { return UseWideStrideVFP; }
bool hasMuxedUnits() const { return HasMuxedUnits; }
bool dontWidenVMOVS() const { return DontWidenVMOVS; }
bool useSplatVFPToNeon() const { return SplatVFPToNeon; }
@@ -612,12 +628,14 @@ public:
bool hasDSP() const { return HasDSP; }
bool useNaClTrap() const { return UseNaClTrap; }
bool useSjLjEH() const { return UseSjLjEH; }
+ bool hasSB() const { return HasSB; }
bool genLongCalls() const { return GenLongCalls; }
bool genExecuteOnly() const { return GenExecuteOnly; }
bool hasFP16() const { return HasFP16; }
bool hasD16() const { return HasD16; }
bool hasFullFP16() const { return HasFullFP16; }
+ bool hasFP16FML() const { return HasFP16FML; }
bool hasFuseAES() const { return HasFuseAES; }
bool hasFuseLiterals() const { return HasFuseLiterals; }
@@ -796,6 +814,10 @@ public:
bool allowPositionIndependentMovt() const {
return isROPI() || !isTargetELF();
}
+
+ unsigned getPrefLoopAlignment() const {
+ return PrefLoopAlignment;
+ }
};
} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp
index 519f789fc215..ec02c840d5e1 100644
--- a/contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp
@@ -194,12 +194,6 @@ static Reloc::Model getEffectiveRelocModel(const Triple &TT,
return *RM;
}
-static CodeModel::Model getEffectiveCodeModel(Optional<CodeModel::Model> CM) {
- if (CM)
- return *CM;
- return CodeModel::Small;
-}
-
/// Create an ARM architecture model.
///
ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, const Triple &TT,
@@ -210,7 +204,7 @@ ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, const Triple &TT,
CodeGenOpt::Level OL, bool isLittle)
: LLVMTargetMachine(T, computeDataLayout(TT, CPU, Options, isLittle), TT,
CPU, FS, Options, getEffectiveRelocModel(TT, RM),
- getEffectiveCodeModel(CM), OL),
+ getEffectiveCodeModel(CM, CodeModel::Small), OL),
TargetABI(computeTargetABI(TT, CPU, Options)),
TLOF(createTLOF(getTargetTriple())), isLittle(isLittle) {
diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp b/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp
index d0620761ea9c..9c13359cba71 100644
--- a/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp
@@ -32,7 +32,8 @@ void ARMElfTargetObjectFile::Initialize(MCContext &Ctx,
const TargetMachine &TM) {
const ARMBaseTargetMachine &ARM_TM = static_cast<const ARMBaseTargetMachine &>(TM);
bool isAAPCS_ABI = ARM_TM.TargetABI == ARMBaseTargetMachine::ARMABI::ARM_ABI_AAPCS;
- // genExecuteOnly = ARM_TM.getSubtargetImpl()->genExecuteOnly();
+ bool genExecuteOnly =
+ ARM_TM.getMCSubtargetInfo()->hasFeature(ARM::FeatureExecuteOnly);
TargetLoweringObjectFileELF::Initialize(Ctx, TM);
InitializeELF(isAAPCS_ABI);
@@ -40,6 +41,17 @@ void ARMElfTargetObjectFile::Initialize(MCContext &Ctx,
if (isAAPCS_ABI) {
LSDASection = nullptr;
}
+
+ // Make code section unreadable when in execute-only mode
+ if (genExecuteOnly) {
+ unsigned Type = ELF::SHT_PROGBITS;
+ unsigned Flags =
+ ELF::SHF_EXECINSTR | ELF::SHF_ALLOC | ELF::SHF_ARM_PURECODE;
+ // Since we cannot modify flags for an existing section, we create a new
+ // section with the right flags, and use 0 as the unique ID for
+ // execute-only text
+ TextSection = Ctx.getELFSection(".text", Type, Flags, 0, "", 0U);
+ }
}
const MCExpr *ARMElfTargetObjectFile::getTTypeGlobalReference(
diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 94f9cefe429c..f72bb8632eb7 100644
--- a/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -77,8 +77,8 @@ int ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
return 1;
return ST->hasV6T2Ops() ? 2 : 3;
}
- // Thumb1.
- if (SImmVal >= 0 && SImmVal < 256)
+ // Thumb1, any i8 imm cost 1.
+ if (Bits == 8 || (SImmVal >= 0 && SImmVal < 256))
return 1;
if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal))
return 2;
@@ -400,10 +400,29 @@ int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
Type *SubTp) {
- // We only handle costs of reverse and select shuffles for now.
- if (Kind != TTI::SK_Reverse && Kind != TTI::SK_Select)
- return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
+ if (Kind == TTI::SK_Broadcast) {
+ static const CostTblEntry NEONDupTbl[] = {
+ // VDUP handles these cases.
+ {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
+
+ {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}};
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
+
+ if (const auto *Entry = CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE,
+ LT.second))
+ return LT.first * Entry->Cost;
+
+ return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
+ }
if (Kind == TTI::SK_Reverse) {
static const CostTblEntry NEONShuffleTbl[] = {
// Reverse shuffle cost one instruction if we are shuffling within a
@@ -412,6 +431,8 @@ int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
{ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
{ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
{ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
{ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
{ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
@@ -542,14 +563,17 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
unsigned Factor,
ArrayRef<unsigned> Indices,
unsigned Alignment,
- unsigned AddressSpace) {
+ unsigned AddressSpace,
+ bool UseMaskForCond,
+ bool UseMaskForGaps) {
assert(Factor >= 2 && "Invalid interleave factor");
assert(isa<VectorType>(VecTy) && "Expect a vector type");
// vldN/vstN doesn't support vector types of i64/f64 element.
bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64;
- if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits) {
+ if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
+ !UseMaskForCond && !UseMaskForGaps) {
unsigned NumElts = VecTy->getVectorNumElements();
auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
@@ -562,7 +586,8 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
}
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace);
+ Alignment, AddressSpace,
+ UseMaskForCond, UseMaskForGaps);
}
void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
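
The broadcast case added above follows the usual TTI pattern: legalize the IR vector type to an MVT, look the (ISD opcode, MVT) pair up in a static cost table, and scale by the legalization split factor (LT.first). A sketch of just the table-lookup step with illustrative table contents, assuming llvm/CodeGen/CostTable.h:

    #include "llvm/CodeGen/CostTable.h"
    #include "llvm/CodeGen/ISDOpcodes.h"
    #include "llvm/Support/MachineValueType.h"
    using namespace llvm;

    // Returns the modelled per-legalized-vector cost of a broadcast shuffle,
    // or -1 if the type is not covered by this (illustrative) table.
    static int broadcastShuffleCost(MVT VT) {
      static const CostTblEntry DupTbl[] = {
          {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1},
      };
      if (const auto *Entry = CostTableLookup(DupTbl, ISD::VECTOR_SHUFFLE, VT))
        return Entry->Cost;
      return -1;
    }
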
diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index e0cd2d8e26a6..2dd143d48a15 100644
--- a/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -57,7 +57,7 @@ class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> {
const FeatureBitset InlineFeatureWhitelist = {
ARM::FeatureVFP2, ARM::FeatureVFP3, ARM::FeatureNEON, ARM::FeatureThumb2,
ARM::FeatureFP16, ARM::FeatureVFP4, ARM::FeatureFPARMv8,
- ARM::FeatureFullFP16, ARM::FeatureHWDivThumb,
+ ARM::FeatureFullFP16, ARM::FeatureFP16FML, ARM::FeatureHWDivThumb,
ARM::FeatureHWDivARM, ARM::FeatureDB, ARM::FeatureV7Clrex,
ARM::FeatureAcquireRelease, ARM::FeatureSlowFPBrcc,
ARM::FeaturePerfMon, ARM::FeatureTrustZone, ARM::Feature8MSecExt,
@@ -169,7 +169,9 @@ public:
int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
ArrayRef<unsigned> Indices, unsigned Alignment,
- unsigned AddressSpace);
+ unsigned AddressSpace,
+ bool UseMaskForCond = false,
+ bool UseMaskForGaps = false);
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP);
diff --git a/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index a5fbbbf26be9..3832b0112b87 100644
--- a/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -8,6 +8,7 @@
//===----------------------------------------------------------------------===//
#include "ARMFeatures.h"
+#include "InstPrinter/ARMInstPrinter.h"
#include "Utils/ARMBaseInfo.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "MCTargetDesc/ARMBaseInfo.h"
@@ -631,6 +632,8 @@ public:
void ReportNearMisses(SmallVectorImpl<NearMissInfo> &NearMisses, SMLoc IDLoc,
OperandVector &Operands);
+ void doBeforeLabelEmit(MCSymbol *Symbol) override;
+
void onLabelParsed(MCSymbol *Symbol) override;
};
@@ -3203,17 +3206,26 @@ public:
} // end anonymous namespace.
void ARMOperand::print(raw_ostream &OS) const {
+ auto RegName = [](unsigned Reg) {
+ if (Reg)
+ return ARMInstPrinter::getRegisterName(Reg);
+ else
+ return "noreg";
+ };
+
switch (Kind) {
case k_CondCode:
OS << "<ARMCC::" << ARMCondCodeToString(getCondCode()) << ">";
break;
case k_CCOut:
- OS << "<ccout " << getReg() << ">";
+ OS << "<ccout " << RegName(getReg()) << ">";
break;
case k_ITCondMask: {
static const char *const MaskStr[] = {
- "()", "(t)", "(e)", "(tt)", "(et)", "(te)", "(ee)", "(ttt)", "(ett)",
- "(tet)", "(eet)", "(tte)", "(ete)", "(tee)", "(eee)"
+ "(invalid)", "(teee)", "(tee)", "(teet)",
+ "(te)", "(tete)", "(tet)", "(tett)",
+ "(t)", "(ttee)", "(tte)", "(ttet)",
+ "(tt)", "(ttte)", "(ttt)", "(tttt)"
};
assert((ITMask.Mask & 0xf) == ITMask.Mask);
OS << "<it-mask " << MaskStr[ITMask.Mask] << ">";
@@ -3247,13 +3259,25 @@ void ARMOperand::print(raw_ostream &OS) const {
OS << "<ARM_TSB::" << TraceSyncBOptToString(getTraceSyncBarrierOpt()) << ">";
break;
case k_Memory:
- OS << "<memory "
- << " base:" << Memory.BaseRegNum;
+ OS << "<memory";
+ if (Memory.BaseRegNum)
+ OS << " base:" << RegName(Memory.BaseRegNum);
+ if (Memory.OffsetImm)
+ OS << " offset-imm:" << *Memory.OffsetImm;
+ if (Memory.OffsetRegNum)
+ OS << " offset-reg:" << (Memory.isNegative ? "-" : "")
+ << RegName(Memory.OffsetRegNum);
+ if (Memory.ShiftType != ARM_AM::no_shift) {
+ OS << " shift-type:" << ARM_AM::getShiftOpcStr(Memory.ShiftType);
+ OS << " shift-imm:" << Memory.ShiftImm;
+ }
+ if (Memory.Alignment)
+ OS << " alignment:" << Memory.Alignment;
OS << ">";
break;
case k_PostIndexRegister:
OS << "post-idx register " << (PostIdxReg.isAdd ? "" : "-")
- << PostIdxReg.RegNum;
+ << RegName(PostIdxReg.RegNum);
if (PostIdxReg.ShiftTy != ARM_AM::no_shift)
OS << ARM_AM::getShiftOpcStr(PostIdxReg.ShiftTy) << " "
<< PostIdxReg.ShiftImm;
@@ -3269,23 +3293,21 @@ void ARMOperand::print(raw_ostream &OS) const {
break;
}
case k_Register:
- OS << "<register " << getReg() << ">";
+ OS << "<register " << RegName(getReg()) << ">";
break;
case k_ShifterImmediate:
OS << "<shift " << (ShifterImm.isASR ? "asr" : "lsl")
<< " #" << ShifterImm.Imm << ">";
break;
case k_ShiftedRegister:
- OS << "<so_reg_reg "
- << RegShiftedReg.SrcReg << " "
- << ARM_AM::getShiftOpcStr(RegShiftedReg.ShiftTy)
- << " " << RegShiftedReg.ShiftReg << ">";
+ OS << "<so_reg_reg " << RegName(RegShiftedReg.SrcReg) << " "
+ << ARM_AM::getShiftOpcStr(RegShiftedReg.ShiftTy) << " "
+ << RegName(RegShiftedReg.ShiftReg) << ">";
break;
case k_ShiftedImmediate:
- OS << "<so_reg_imm "
- << RegShiftedImm.SrcReg << " "
- << ARM_AM::getShiftOpcStr(RegShiftedImm.ShiftTy)
- << " #" << RegShiftedImm.ShiftImm << ">";
+ OS << "<so_reg_imm " << RegName(RegShiftedImm.SrcReg) << " "
+ << ARM_AM::getShiftOpcStr(RegShiftedImm.ShiftTy) << " #"
+ << RegShiftedImm.ShiftImm << ">";
break;
case k_RotateImmediate:
OS << "<ror " << " #" << (RotImm.Imm * 8) << ">";
@@ -3309,7 +3331,7 @@ void ARMOperand::print(raw_ostream &OS) const {
const SmallVectorImpl<unsigned> &RegList = getRegList();
for (SmallVectorImpl<unsigned>::const_iterator
I = RegList.begin(), E = RegList.end(); I != E; ) {
- OS << *I;
+ OS << RegName(*I);
if (++I < E) OS << ", ";
}
@@ -3318,15 +3340,15 @@ void ARMOperand::print(raw_ostream &OS) const {
}
case k_VectorList:
OS << "<vector_list " << VectorList.Count << " * "
- << VectorList.RegNum << ">";
+ << RegName(VectorList.RegNum) << ">";
break;
case k_VectorListAllLanes:
OS << "<vector_list(all lanes) " << VectorList.Count << " * "
- << VectorList.RegNum << ">";
+ << RegName(VectorList.RegNum) << ">";
break;
case k_VectorListIndexed:
OS << "<vector_list(lane " << VectorList.LaneIndex << ") "
- << VectorList.Count << " * " << VectorList.RegNum << ">";
+ << VectorList.Count << " * " << RegName(VectorList.RegNum) << ">";
break;
case k_Token:
OS << "'" << getToken() << "'";
@@ -5626,7 +5648,8 @@ StringRef ARMAsmParser::splitMnemonic(StringRef Mnemonic,
Mnemonic.startswith("vsel") || Mnemonic == "vins" || Mnemonic == "vmovx" ||
Mnemonic == "bxns" || Mnemonic == "blxns" ||
Mnemonic == "vudot" || Mnemonic == "vsdot" ||
- Mnemonic == "vcmla" || Mnemonic == "vcadd")
+ Mnemonic == "vcmla" || Mnemonic == "vcadd" ||
+ Mnemonic == "vfmal" || Mnemonic == "vfmsl")
return Mnemonic;
// First, split out any predication code. Ignore mnemonics we know aren't
@@ -5716,7 +5739,10 @@ void ARMAsmParser::getMnemonicAcceptInfo(StringRef Mnemonic, StringRef FullInst,
(FullInst.startswith("vmull") && FullInst.endswith(".p64")) ||
Mnemonic == "vmovx" || Mnemonic == "vins" ||
Mnemonic == "vudot" || Mnemonic == "vsdot" ||
- Mnemonic == "vcmla" || Mnemonic == "vcadd") {
+ Mnemonic == "vcmla" || Mnemonic == "vcadd" ||
+ Mnemonic == "vfmal" || Mnemonic == "vfmsl" ||
+ Mnemonic == "sb" || Mnemonic == "ssbb" ||
+ Mnemonic == "pssbb") {
// These mnemonics are never predicable
CanAcceptPredicationCode = false;
} else if (!isThumb()) {
@@ -6819,6 +6845,26 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
"code specified");
break;
}
+ case ARM::DSB:
+ case ARM::t2DSB: {
+
+ if (Inst.getNumOperands() < 2)
+ break;
+
+ unsigned Option = Inst.getOperand(0).getImm();
+ unsigned Pred = Inst.getOperand(1).getImm();
+
+ // SSBB and PSSBB (DSB #0|#4) are not predicable (pred must be AL).
+ if (Option == 0 && Pred != ARMCC::AL)
+ return Error(Operands[1]->getStartLoc(),
+ "instruction 'ssbb' is not predicable, but condition code "
+ "specified");
+ if (Option == 4 && Pred != ARMCC::AL)
+ return Error(Operands[1]->getStartLoc(),
+ "instruction 'pssbb' is not predicable, but condition code "
+ "specified");
+ break;
+ }
case ARM::VMOVRRS: {
// Source registers must be sequential.
const unsigned Sm = MRI->getEncodingValue(Inst.getOperand(2).getReg());
@@ -6837,6 +6883,15 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
"destination operands must be sequential");
break;
}
+ case ARM::VLDMDIA:
+ case ARM::VSTMDIA: {
+ ARMOperand &Op = static_cast<ARMOperand&>(*Operands[3]);
+ auto &RegList = Op.getRegList();
+ if (RegList.size() < 1 || RegList.size() > 16)
+ return Error(Operands[3]->getStartLoc(),
+ "list of registers must be at least 1 and at most 16");
+ break;
+ }
}
return false;
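
The new DSB check only rejects predication for the two barrier options that have dedicated speculation-barrier aliases. A small sketch (not parser code) of that option-to-mnemonic mapping, consistent with the printer change further below:

    // Maps a DSB barrier option to its alias; ssbb and pssbb are the
    // non-predicable forms checked above, everything else prints as dsb and
    // follows the normal predication rules.
    static const char *dsbAlias(unsigned Option) {
      switch (Option) {
      case 0:  return "ssbb";   // dsb #0
      case 4:  return "pssbb";  // dsb #4
      default: return "dsb";
      }
    }
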
@@ -9122,33 +9177,9 @@ bool ARMAsmParser::isITBlockTerminator(MCInst &Inst) const {
// Any arithmetic instruction which writes to the PC also terminates the IT
// block.
- for (unsigned OpIdx = 0; OpIdx < MCID.getNumDefs(); ++OpIdx) {
- MCOperand &Op = Inst.getOperand(OpIdx);
- if (Op.isReg() && Op.getReg() == ARM::PC)
- return true;
- }
-
- if (MCID.hasImplicitDefOfPhysReg(ARM::PC, MRI))
+ if (MCID.hasDefOfPhysReg(Inst, ARM::PC, *MRI))
return true;
- // Instructions with variable operand lists, which write to the variable
- // operands. We only care about Thumb instructions here, as ARM instructions
- // obviously can't be in an IT block.
- switch (Inst.getOpcode()) {
- case ARM::tLDMIA:
- case ARM::t2LDMIA:
- case ARM::t2LDMIA_UPD:
- case ARM::t2LDMDB:
- case ARM::t2LDMDB_UPD:
- if (listContainsReg(Inst, 3, ARM::PC))
- return true;
- break;
- case ARM::tPOP:
- if (listContainsReg(Inst, 2, ARM::PC))
- return true;
- break;
- }
-
return false;
}
@@ -9255,6 +9286,10 @@ bool ARMAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
switch (MatchResult) {
case Match_Success:
+ LLVM_DEBUG(dbgs() << "Parsed as: ";
+ Inst.dump_pretty(dbgs(), MII.getName(Inst.getOpcode()));
+ dbgs() << "\n");
+
// Context sensitive operand constraints aren't handled by the matcher,
// so check them here.
if (validateInstruction(Inst, Operands)) {
@@ -9272,7 +9307,9 @@ bool ARMAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
// individual transformations can chain off each other. E.g.,
// tPOP(r8)->t2LDMIA_UPD(sp,r8)->t2STR_POST(sp,r8)
while (processInstruction(Inst, Operands, Out))
- ;
+ LLVM_DEBUG(dbgs() << "Changed to: ";
+ Inst.dump_pretty(dbgs(), MII.getName(Inst.getOpcode()));
+ dbgs() << "\n");
// Only after the instruction is fully processed, we can validate it
if (wasInITBlock && hasV8Ops() && isThumb() &&
@@ -9441,10 +9478,13 @@ bool ARMAsmParser::parseDirectiveARM(SMLoc L) {
return false;
}
-void ARMAsmParser::onLabelParsed(MCSymbol *Symbol) {
+void ARMAsmParser::doBeforeLabelEmit(MCSymbol *Symbol) {
// We need to flush the current implicit IT block on a label, because it is
// not legal to branch into an IT block.
flushPendingInstructions(getStreamer());
+}
+
+void ARMAsmParser::onLabelParsed(MCSymbol *Symbol) {
if (NextSymbolIsThumb) {
getParser().getStreamer().EmitThumbFunc(Symbol);
NextSymbolIsThumb = false;
diff --git a/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp b/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
index bfc32073ba18..2f84719c4c4f 100644
--- a/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
+++ b/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
@@ -273,6 +273,21 @@ void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
case ARM::t2TSB:
O << "\ttsb\tcsync";
return;
+ case ARM::t2DSB:
+ switch (MI->getOperand(0).getImm()) {
+ default:
+ if (!printAliasInstr(MI, STI, O))
+ printInstruction(MI, STI, O);
+ break;
+ case 0:
+ O << "\tssbb";
+ break;
+ case 4:
+ O << "\tpssbb";
+ break;
+ }
+ printAnnotation(O, Annot);
+ return;
}
if (!printAliasInstr(MI, STI, O))
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h
index f472b2154314..e1ea5964cf67 100644
--- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h
@@ -16,6 +16,7 @@
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/bit.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
@@ -627,27 +628,22 @@ namespace ARM_AM {
//
inline float getFPImmFloat(unsigned Imm) {
// We expect an 8-bit binary encoding of a floating-point number here.
- union {
- uint32_t I;
- float F;
- } FPUnion;
uint8_t Sign = (Imm >> 7) & 0x1;
uint8_t Exp = (Imm >> 4) & 0x7;
uint8_t Mantissa = Imm & 0xf;
- // 8-bit FP iEEEE Float Encoding
+ // 8-bit FP IEEE Float Encoding
// abcd efgh aBbbbbbc defgh000 00000000 00000000
//
// where B = NOT(b);
-
- FPUnion.I = 0;
- FPUnion.I |= Sign << 31;
- FPUnion.I |= ((Exp & 0x4) != 0 ? 0 : 1) << 30;
- FPUnion.I |= ((Exp & 0x4) != 0 ? 0x1f : 0) << 25;
- FPUnion.I |= (Exp & 0x3) << 23;
- FPUnion.I |= Mantissa << 19;
- return FPUnion.F;
+ uint32_t I = 0;
+ I |= Sign << 31;
+ I |= ((Exp & 0x4) != 0 ? 0 : 1) << 30;
+ I |= ((Exp & 0x4) != 0 ? 0x1f : 0) << 25;
+ I |= (Exp & 0x3) << 23;
+ I |= Mantissa << 19;
+ return bit_cast<float>(I);
}
/// getFP16Imm - Return an 8-bit floating-point version of the 16-bit
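
The rewrite above drops union-based type punning in favour of llvm::bit_cast from the newly included llvm/ADT/bit.h. A minimal sketch of that idiom on its own; for instance, the bit pattern 0x3F800000 produced by getFPImmFloat(0x70) reinterprets as 1.0f:

    #include "llvm/ADT/bit.h"
    #include <cstdint>

    // Reinterpret an IEEE-754 single-precision bit pattern as a float without
    // the undefined behaviour of writing one union member and reading another.
    static float bitsToFloat(uint32_t Bits) {
      return llvm::bit_cast<float>(Bits);
    }
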
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
index f524a0081301..c2a07d4ddcef 100644
--- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
@@ -373,6 +373,7 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm,
// interfere with checking valid expressions.
if (const MCSymbolRefExpr *A = Target.getSymA()) {
if (A->hasSubsectionsViaSymbols() && Asm.isThumbFunc(&A->getSymbol()) &&
+ A->getSymbol().isExternal() &&
(Kind == FK_Data_4 || Kind == ARM::fixup_arm_movw_lo16 ||
Kind == ARM::fixup_arm_movt_hi16 || Kind == ARM::fixup_t2_movw_lo16 ||
Kind == ARM::fixup_t2_movt_hi16))
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
index beeb5dec4baf..33c32d5464af 100644
--- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
@@ -248,6 +248,11 @@ namespace ARMII {
/// just that part of the flag set.
MO_OPTION_MASK = 0x3,
+ /// MO_COFFSTUB - On a symbol operand "FOO", this indicates that the
+ /// reference is actually to the ".refptrp.FOO" symbol. This is used for
+ /// stub symbols on windows.
+ MO_COFFSTUB = 0x4,
+
/// MO_GOT - On a symbol operand, this represents a GOT relative relocation.
MO_GOT = 0x8,
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
index 7d04c73fb3f2..b8ba7584911b 100644
--- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
@@ -14,6 +14,7 @@
#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCObjectFileInfo.h"
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCValue.h"
#include "llvm/Support/ErrorHandling.h"
@@ -40,6 +41,8 @@ namespace {
bool needsRelocateWithSymbol(const MCSymbol &Sym,
unsigned Type) const override;
+
+ void addTargetSectionFlags(MCContext &Ctx, MCSectionELF &Sec) override;
};
} // end anonymous namespace
@@ -236,6 +239,21 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
}
}
+void ARMELFObjectWriter::addTargetSectionFlags(MCContext &Ctx,
+ MCSectionELF &Sec) {
+ // The mix of execute-only and non-execute-only at link time is
+ // non-execute-only. To avoid the empty implicitly created .text
+ // section from making the whole .text section non-execute-only, we
+ // mark it execute-only if it is empty and there is at least one
+ // execute-only section in the object.
+ MCSectionELF *TextSection =
+ static_cast<MCSectionELF *>(Ctx.getObjectFileInfo()->getTextSection());
+ if (Sec.getKind().isExecuteOnly() && !TextSection->hasInstructions() &&
+ !TextSection->hasData()) {
+ TextSection->setFlags(TextSection->getFlags() | ELF::SHF_ARM_PURECODE);
+ }
+}
+
std::unique_ptr<MCObjectTargetWriter>
llvm::createARMELFObjectWriter(uint8_t OSABI) {
return llvm::make_unique<ARMELFObjectWriter>(OSABI);
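
The hook above marks an empty, implicitly created .text section execute-only so that linking purely execute-only inputs preserves the property. A small helper sketch (not part of the patch) for querying that flag on an MC section, assuming MCSectionELF::getFlags and the ELF flag constants:

    #include "llvm/BinaryFormat/ELF.h"
    #include "llvm/MC/MCSectionELF.h"
    using namespace llvm;

    // True if the section carries the ARM execute-only ("purecode") marking,
    // i.e. it may be fetched for execution but not read as data.
    static bool isExecuteOnly(const MCSectionELF &Sec) {
      return (Sec.getFlags() & ELF::SHF_ARM_PURECODE) != 0;
    }
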
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
index 3373d691db50..d3744fffac32 100644
--- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
@@ -465,6 +465,11 @@ public:
void emitPad(int64_t Offset);
void emitRegSave(const SmallVectorImpl<unsigned> &RegList, bool isVector);
void emitUnwindRaw(int64_t Offset, const SmallVectorImpl<uint8_t> &Opcodes);
+ void emitFill(const MCExpr &NumBytes, uint64_t FillValue,
+ SMLoc Loc) override {
+ EmitDataMappingSymbol();
+ MCObjectStreamer::emitFill(NumBytes, FillValue, Loc);
+ }
void ChangeSection(MCSection *Section, const MCExpr *Subsection) override {
LastMappingSymbols[getCurrentSection().first] = std::move(LastEMSInfo);
@@ -861,6 +866,7 @@ void ARMTargetELFStreamer::emitArchDefaultAttributes() {
case ARM::ArchKind::ARMV8_2A:
case ARM::ArchKind::ARMV8_3A:
case ARM::ArchKind::ARMV8_4A:
+ case ARM::ArchKind::ARMV8_5A:
setAttributeItem(CPU_arch_profile, ApplicationProfile, false);
setAttributeItem(ARM_ISA_use, Allowed, false);
setAttributeItem(THUMB_ISA_use, AllowThumb32, false);
@@ -1071,7 +1077,7 @@ void ARMTargetELFStreamer::finishAttributeSection() {
if (Contents.empty())
return;
- llvm::sort(Contents.begin(), Contents.end(), AttributeItem::LessTag);
+ llvm::sort(Contents, AttributeItem::LessTag);
ARMELFStreamer &Streamer = getStreamer();
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
index 0cef683778e5..3ee63ac374b3 100644
--- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
@@ -31,6 +31,9 @@ ARMMCAsmInfoDarwin::ARMMCAsmInfoDarwin(const Triple &TheTriple) {
SupportsDebugInformation = true;
+ // Conditional Thumb 4-byte instructions can have an implicit IT.
+ MaxInstLength = 6;
+
// Exceptions handling
ExceptionsType = (TheTriple.isOSDarwin() && !TheTriple.isWatchABI())
? ExceptionHandling::SjLj
@@ -56,6 +59,9 @@ ARMELFMCAsmInfo::ARMELFMCAsmInfo(const Triple &TheTriple) {
SupportsDebugInformation = true;
+ // Conditional Thumb 4-byte instructions can have an implicit IT.
+ MaxInstLength = 6;
+
// Exceptions handling
switch (TheTriple.getOS()) {
case Triple::NetBSD:
@@ -90,6 +96,9 @@ ARMCOFFMCAsmInfoMicrosoft::ARMCOFFMCAsmInfoMicrosoft() {
PrivateGlobalPrefix = "$M";
PrivateLabelPrefix = "$M";
CommentString = ";";
+
+ // Conditional Thumb 4-byte instructions can have an implicit IT.
+ MaxInstLength = 6;
}
void ARMCOFFMCAsmInfoGNU::anchor() { }
@@ -110,5 +119,7 @@ ARMCOFFMCAsmInfoGNU::ARMCOFFMCAsmInfoGNU() {
UseIntegratedAssembler = true;
DwarfRegNumForCFI = false;
-}
+ // Conditional Thumb 4-byte instructions can have an implicit IT.
+ MaxInstLength = 6;
+}
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
index 4b4956e914f2..0ced8195790d 100644
--- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
@@ -22,6 +22,8 @@
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCValue.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/ScopedPrinter.h"
+
using namespace llvm;
namespace {
@@ -144,6 +146,15 @@ RecordARMScatteredHalfRelocation(MachObjectWriter *Writer,
MCValue Target,
uint64_t &FixedValue) {
uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
+
+ if (FixupOffset & 0xff000000) {
+ Asm.getContext().reportError(Fixup.getLoc(),
+ "can not encode offset '0x" +
+ to_hexString(FixupOffset) +
+ "' in resulting scattered relocation.");
+ return;
+ }
+
unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
unsigned Type = MachO::ARM_RELOC_HALF;
@@ -250,6 +261,15 @@ void ARMMachObjectWriter::RecordARMScatteredRelocation(MachObjectWriter *Writer,
unsigned Log2Size,
uint64_t &FixedValue) {
uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
+
+ if (FixupOffset & 0xff000000) {
+ Asm.getContext().reportError(Fixup.getLoc(),
+ "can not encode offset '0x" +
+ to_hexString(FixupOffset) +
+ "' in resulting scattered relocation.");
+ return;
+ }
+
unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
// See <reloc.h>.
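
Both new guards reject fixups whose offset cannot be encoded in the 24-bit r_address field of a Mach-O scattered relocation (the first 16 MiB of a section). The mask test is equivalent to an isUInt<24> check; a one-line sketch assuming llvm/Support/MathExtras.h:

    #include "llvm/Support/MathExtras.h"
    #include <cstdint>

    // Same condition as !(FixupOffset & 0xff000000): the offset must fit in
    // the 24-bit r_address field of a scattered relocation.
    static bool fitsScatteredOffset(uint32_t FixupOffset) {
      return llvm::isUInt<24>(FixupOffset);
    }
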
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp
index 63aa9735e8a4..91836cff95c8 100644
--- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp
@@ -11,7 +11,7 @@
//
//===----------------------------------------------------------------------===//
-#include "ARMTargetMachine.h"
+#include "MCTargetDesc/ARMMCTargetDesc.h"
#include "llvm/MC/ConstantPools.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp
index 8ae713b7b489..30cbde1ca71f 100644
--- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp
@@ -75,8 +75,8 @@ unsigned ARMWinCOFFObjectWriter::getRelocType(MCContext &Ctx,
case ARM::fixup_t2_condbranch:
return COFF::IMAGE_REL_ARM_BRANCH20T;
case ARM::fixup_t2_uncondbranch:
- return COFF::IMAGE_REL_ARM_BRANCH24T;
case ARM::fixup_arm_thumb_bl:
+ return COFF::IMAGE_REL_ARM_BRANCH24T;
case ARM::fixup_arm_thumb_blx:
return COFF::IMAGE_REL_ARM_BLX23T;
case ARM::fixup_t2_movw_lo16:
diff --git a/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp b/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
index 1a91a7030657..d567d3339049 100644
--- a/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
+++ b/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
@@ -146,9 +146,7 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOStore,
MFI.getObjectSize(FI), MFI.getObjectAlignment(FI));
- if (RC == &ARM::GPRRegClass || RC == &ARM::tGPRRegClass ||
- RC == &ARM::tcGPRRegClass || RC == &ARM::rGPRRegClass ||
- RC == &ARM::GPRnopcRegClass) {
+ if (ARM::GPRRegClass.hasSubClassEq(RC)) {
BuildMI(MBB, I, DL, get(ARM::t2STRi12))
.addReg(SrcReg, getKillRegState(isKill))
.addFrameIndex(FI)
@@ -190,9 +188,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
DebugLoc DL;
if (I != MBB.end()) DL = I->getDebugLoc();
- if (RC == &ARM::GPRRegClass || RC == &ARM::tGPRRegClass ||
- RC == &ARM::tcGPRRegClass || RC == &ARM::rGPRRegClass ||
- RC == &ARM::GPRnopcRegClass) {
+ if (ARM::GPRRegClass.hasSubClassEq(RC)) {
BuildMI(MBB, I, DL, get(ARM::t2LDRi12), DestReg)
.addFrameIndex(FI)
.addImm(0)
diff --git a/contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp b/contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp
index abf54ba7e87c..65889fc4e28b 100644
--- a/contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp
+++ b/contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp
@@ -122,6 +122,7 @@ namespace {
{ ARM::t2SUBSrr,ARM::tSUBrr, 0, 0, 0, 1, 0, 2,0, 0,0,0 },
{ ARM::t2SXTB, ARM::tSXTB, 0, 0, 0, 1, 0, 1,0, 0,1,0 },
{ ARM::t2SXTH, ARM::tSXTH, 0, 0, 0, 1, 0, 1,0, 0,1,0 },
+ { ARM::t2TEQrr, ARM::tEOR, 0, 0, 0, 1, 0, 2,0, 0,1,0 },
{ ARM::t2TSTrr, ARM::tTST, 0, 0, 0, 1, 0, 2,0, 0,0,0 },
{ ARM::t2UXTB, ARM::tUXTB, 0, 0, 0, 1, 0, 1,0, 0,1,0 },
{ ARM::t2UXTH, ARM::tUXTH, 0, 0, 0, 1, 0, 1,0, 0,1,0 },
@@ -485,7 +486,7 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI,
.addReg(Rt, IsStore ? 0 : RegState::Define);
// Transfer memoperands.
- MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+ MIB.setMemRefs(MI->memoperands());
// Transfer MI flags.
MIB.setMIFlags(MI->getFlags());
@@ -605,7 +606,7 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI,
MIB.add(MI->getOperand(OpNum));
// Transfer memoperands.
- MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+ MIB.setMemRefs(MI->memoperands());
// Transfer MI flags.
MIB.setMIFlags(MI->getFlags());
@@ -717,6 +718,16 @@ Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI,
return true;
return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop);
}
+ case ARM::t2TEQrr: {
+ unsigned PredReg = 0;
+ // Can only convert to eors if we're not in an IT block.
+ if (getInstrPredicate(*MI, PredReg) != ARMCC::AL)
+ break;
+ // TODO if Operand 0 is not killed but Operand 1 is, then we could write
+ // to Op1 instead.
+ if (MI->getOperand(0).isKill())
+ return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop);
+ }
}
return false;
}
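
The conversion above is only attempted outside IT blocks and when the first operand is killed, because the narrow encoding has an extra side effect. A behavioural sketch in plain C++ (not compiler code; only the N and Z flags are modelled and the register-form carry behaviour is ignored):

    #include <cstdint>

    struct Flags { bool N, Z; };

    // teq rn, rm: sets N/Z from rn ^ rm and discards the result.
    static Flags teq(uint32_t Rn, uint32_t Rm) {
      uint32_t R = Rn ^ Rm;
      return {(R >> 31) != 0, R == 0};
    }

    // eors rn, rn, rm: same N/Z, but also overwrites rn -- which is why the
    // t2TEQrr -> tEOR reduction requires operand 0 to have no later uses.
    static Flags eors(uint32_t &Rn, uint32_t Rm) {
      Rn ^= Rm;
      return {(Rn >> 31) != 0, Rn == 0};
    }
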
@@ -903,9 +914,24 @@ Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI,
// Add the 16-bit instruction.
DebugLoc dl = MI->getDebugLoc();
MachineInstrBuilder MIB = BuildMI(MBB, MI, dl, NewMCID);
- MIB.add(MI->getOperand(0));
- if (NewMCID.hasOptionalDef())
- MIB.add(HasCC ? t1CondCodeOp(CCDead) : condCodeOp());
+
+ // TEQ is special in that it doesn't define a register but we're converting
+ // it into an EOR which does. So add the first operand as a def and then
+ // again as a use.
+ if (MCID.getOpcode() == ARM::t2TEQrr) {
+ MIB.add(MI->getOperand(0));
+ MIB->getOperand(0).setIsKill(false);
+ MIB->getOperand(0).setIsDef(true);
+ MIB->getOperand(0).setIsDead(true);
+
+ if (NewMCID.hasOptionalDef())
+ MIB.add(HasCC ? t1CondCodeOp(CCDead) : condCodeOp());
+ MIB.add(MI->getOperand(0));
+ } else {
+ MIB.add(MI->getOperand(0));
+ if (NewMCID.hasOptionalDef())
+ MIB.add(HasCC ? t1CondCodeOp(CCDead) : condCodeOp());
+ }
// Transfer the rest of operands.
unsigned NumOps = MCID.getNumOperands();