author     Dimitry Andric <dim@FreeBSD.org>        2019-01-20 11:41:25 +0000
committer  Dimitry Andric <dim@FreeBSD.org>        2019-01-20 11:41:25 +0000
commit     d9484dd61cc151c4f34c31e07f693fefa66316b5 (patch)
tree       ab0560b3da293f1fafd3269c59692e929418f5c2 /contrib/llvm/lib/Target/ARM
parent     79e0962d4c3cf1f0acf359a9d69cb3ac68c414c4 (diff)
parent     d8e91e46262bc44006913e6796843909f1ac7bcd (diff)
Diffstat (limited to 'contrib/llvm/lib/Target/ARM')
52 files changed, 2587 insertions, 1264 deletions
diff --git a/contrib/llvm/lib/Target/ARM/ARM.td b/contrib/llvm/lib/Target/ARM/ARM.td index 2e62a0790418..3db60f1c16d6 100644 --- a/contrib/llvm/lib/Target/ARM/ARM.td +++ b/contrib/llvm/lib/Target/ARM/ARM.td @@ -61,6 +61,11 @@ def FeatureFullFP16 : SubtargetFeature<"fullfp16", "HasFullFP16", "true", "floating point", [FeatureFPARMv8]>; +def FeatureFP16FML : SubtargetFeature<"fp16fml", "HasFP16FML", "true", + "Enable full half-precision " + "floating point fml instructions", + [FeatureFullFP16]>; + def FeatureVFPOnlySP : SubtargetFeature<"fp-only-sp", "FPOnlySP", "true", "Floating point unit supports " "single precision only">; @@ -194,6 +199,10 @@ def FeatureSlowLoadDSubreg : SubtargetFeature<"slow-load-D-subreg", "SlowLoadDSubregister", "true", "Loading into D subregs is slow">; +def FeatureUseWideStrideVFP : SubtargetFeature<"wide-stride-vfp", + "UseWideStrideVFP", "true", + "Use a wide stride when allocating VFP registers">; + // Some targets (e.g. Cortex-A15) never want VMOVS to be widened to VMOVD. def FeatureDontWidenVMOVS : SubtargetFeature<"dont-widen-vmovs", "DontWidenVMOVS", "true", @@ -256,6 +265,9 @@ def FeatureVMLxForwarding : SubtargetFeature<"vmlx-forwarding", def FeaturePref32BitThumb : SubtargetFeature<"32bit", "Pref32BitThumb", "true", "Prefer 32-bit Thumb instrs">; +def FeaturePrefLoopAlign32 : SubtargetFeature<"loop-align", "PrefLoopAlignment","2", + "Prefer 32-bit alignment for loops">; + /// Some instructions update CPSR partially, which can add false dependency for /// out-of-order implementation, e.g. Cortex-A9, unless each individual bit is /// mapped to a separate physical register. Avoid partial CPSR update for these @@ -351,6 +363,11 @@ def FeatureNoPostRASched : SubtargetFeature<"disable-postra-scheduler", def FeatureUseAA : SubtargetFeature<"use-aa", "UseAA", "true", "Use alias analysis during codegen">; +// Armv8.5-A extensions + +def FeatureSB : SubtargetFeature<"sb", "HasSB", "true", + "Enable v8.5a Speculation Barrier" >; + //===----------------------------------------------------------------------===// // ARM architecture class // @@ -440,6 +457,10 @@ def HasV8_4aOps : SubtargetFeature<"v8.4a", "HasV8_4aOps", "true", "Support ARM v8.4a instructions", [HasV8_3aOps, FeatureDotProd]>; +def HasV8_5aOps : SubtargetFeature<"v8.5a", "HasV8_5aOps", "true", + "Support ARM v8.5a instructions", + [HasV8_4aOps, FeatureSB]>; + //===----------------------------------------------------------------------===// // ARM Processor subtarget features. 
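For context on how the new feature records in this ARM.td hunk are consumed: by the usual TableGen convention, the second template argument of each SubtargetFeature names a boolean subtarget member that backend code queries through a small predicate. A minimal sketch, assuming that convention; the class and accessor names below are illustrative, not copied from ARMSubtarget.h:

    // Sketch only: "fp16fml" maps onto HasFP16FML, "sb" onto HasSB, exactly as
    // named in the SubtargetFeature definitions above.
    class SubtargetSketch {
      bool HasFP16FML = false; // set by "+fp16fml"; ARM.td makes it imply fullfp16
      bool HasSB = false;      // set by "+sb" or implied by the v8.5a feature
    public:
      bool hasFP16FML() const { return HasFP16FML; }
      bool hasSB() const { return HasSB; }
    };

Instruction patterns and lowering code are then guarded on these predicates rather than on architecture versions directly.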
// @@ -482,8 +503,25 @@ def ProcKryo : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo", def ProcSwift : SubtargetFeature<"swift", "ARMProcFamily", "Swift", "Swift ARM processors", []>; -def ProcExynosM1 : SubtargetFeature<"exynosm1", "ARMProcFamily", "ExynosM1", - "Samsung Exynos-Mx processors", []>; +def ProcExynos : SubtargetFeature<"exynos", "ARMProcFamily", "Exynos", + "Samsung Exynos processors", + [FeatureZCZeroing, + FeatureUseWideStrideVFP, + FeatureUseAA, + FeatureSplatVFPToNeon, + FeatureSlowVGETLNi32, + FeatureSlowVDUP32, + FeatureSlowFPBrcc, + FeatureProfUnpredicate, + FeatureHWDivThumb, + FeatureHWDivARM, + FeatureHasSlowFPVMLx, + FeatureHasRetAddrStack, + FeatureFuseLiterals, + FeatureFuseAES, + FeatureExpandMLx, + FeatureCrypto, + FeatureCRC]>; def ProcR4 : SubtargetFeature<"r4", "ARMProcFamily", "CortexR4", "Cortex-R4 ARM processors", []>; @@ -659,6 +697,20 @@ def ARMv84a : Architecture<"armv8.4-a", "ARMv84a", [HasV8_4aOps, FeatureRAS, FeatureDotProd]>; +def ARMv85a : Architecture<"armv8.5-a", "ARMv85a", [HasV8_5aOps, + FeatureAClass, + FeatureDB, + FeatureFPARMv8, + FeatureNEON, + FeatureDSP, + FeatureTrustZone, + FeatureMP, + FeatureVirtualization, + FeatureCrypto, + FeatureCRC, + FeatureRAS, + FeatureDotProd]>; + def ARMv8r : Architecture<"armv8-r", "ARMv8r", [HasV8Ops, FeatureRClass, FeatureDB, @@ -865,6 +917,7 @@ def : ProcessorModel<"swift", SwiftModel, [ARMv7a, ProcSwift, FeatureHasRetAddrStack, FeatureNEONForFP, FeatureVFP4, + FeatureUseWideStrideVFP, FeatureMP, FeatureHWDivThumb, FeatureHWDivARM, @@ -926,6 +979,7 @@ def : ProcessorModel<"cortex-r8", CortexA8Model, [ARMv7r, def : ProcessorModel<"cortex-m3", CortexM3Model, [ARMv7m, ProcM3, + FeaturePrefLoopAlign32, FeatureHasNoBranchPredictor]>; def : ProcessorModel<"sc300", CortexM3Model, [ARMv7m, @@ -936,6 +990,8 @@ def : ProcessorModel<"cortex-m4", CortexM3Model, [ARMv7em, FeatureVFP4, FeatureVFPOnlySP, FeatureD16, + FeaturePrefLoopAlign32, + FeatureHasSlowFPVMLx, FeatureHasNoBranchPredictor]>; def : ProcNoItin<"cortex-m7", [ARMv7em, @@ -950,6 +1006,8 @@ def : ProcessorModel<"cortex-m33", CortexM3Model, [ARMv8mMainline, FeatureFPARMv8, FeatureD16, FeatureVFPOnlySP, + FeaturePrefLoopAlign32, + FeatureHasSlowFPVMLx, FeatureHasNoBranchPredictor]>; def : ProcNoItin<"cortex-a32", [ARMv8a, @@ -985,7 +1043,7 @@ def : ProcessorModel<"cortex-a57", CortexA57Model, [ARMv8a, ProcA57, FeatureAvoidPartialCPSR, FeatureCheapPredicableCPSR]>; -def : ProcNoItin<"cortex-a72", [ARMv8a, ProcA72, +def : ProcessorModel<"cortex-a72", CortexA57Model, [ARMv8a, ProcA72, FeatureHWDivThumb, FeatureHWDivARM, FeatureCrypto, @@ -1017,29 +1075,12 @@ def : ProcessorModel<"cyclone", SwiftModel, [ARMv8a, ProcSwift, FeatureZCZeroing, FeatureNoPostRASched]>; -def : ProcNoItin<"exynos-m1", [ARMv8a, ProcExynosM1, - FeatureHWDivThumb, - FeatureHWDivARM, - FeatureCrypto, - FeatureCRC]>; - -def : ProcNoItin<"exynos-m2", [ARMv8a, ProcExynosM1, - FeatureHWDivThumb, - FeatureHWDivARM, - FeatureCrypto, - FeatureCRC]>; - -def : ProcNoItin<"exynos-m3", [ARMv8a, ProcExynosM1, - FeatureHWDivThumb, - FeatureHWDivARM, - FeatureCrypto, - FeatureCRC]>; - -def : ProcNoItin<"exynos-m4", [ARMv8a, ProcExynosM1, - FeatureHWDivThumb, - FeatureHWDivARM, - FeatureCrypto, - FeatureCRC]>; +def : ProcNoItin<"exynos-m1", [ARMv8a, ProcExynos]>; +def : ProcNoItin<"exynos-m2", [ARMv8a, ProcExynos]>; +def : ProcNoItin<"exynos-m3", [ARMv8a, ProcExynos]>; +def : ProcNoItin<"exynos-m4", [ARMv82a, ProcExynos, + FeatureFullFP16, + FeatureDotProd]>; def : ProcNoItin<"kryo", [ARMv8a, 
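A quick way to exercise the consolidated ProcExynos feature list and the new exynos-m4 model from this hunk is llc's -mcpu flag; the triple and input file below are placeholders, not taken from the commit:

    llc -mtriple=armv8.2a-none-linux-gnueabihf -mcpu=exynos-m4 input.ll -o -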
ProcKryo, FeatureHWDivThumb, diff --git a/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp index b227eaed8d61..b7cd3a0c2dae 100644 --- a/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp @@ -367,6 +367,18 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, unsigned NumVals = InlineAsm::getNumOperandRegisters(Flags); unsigned RC; + bool FirstHalf; + const ARMBaseTargetMachine &ATM = + static_cast<const ARMBaseTargetMachine &>(TM); + + // 'Q' should correspond to the low order register and 'R' to the high + // order register. Whether this corresponds to the upper or lower half + // depends on the endianess mode. + if (ExtraCode[0] == 'Q') + FirstHalf = ATM.isLittleEndian(); + else + // ExtraCode[0] == 'R'. + FirstHalf = !ATM.isLittleEndian(); const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); if (InlineAsm::hasRegClassConstraint(Flags, RC) && ARM::GPRPairRegClass.hasSubClassEq(TRI->getRegClass(RC))) { @@ -376,14 +388,14 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, if (!MO.isReg()) return true; const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); - unsigned Reg = TRI->getSubReg(MO.getReg(), ExtraCode[0] == 'Q' ? + unsigned Reg = TRI->getSubReg(MO.getReg(), FirstHalf ? ARM::gsub_0 : ARM::gsub_1); O << ARMInstPrinter::getRegisterName(Reg); return false; } if (NumVals != 2) return true; - unsigned RegOp = ExtraCode[0] == 'Q' ? OpNum : OpNum + 1; + unsigned RegOp = FirstHalf ? OpNum : OpNum + 1; if (RegOp >= MI->getNumOperands()) return true; const MachineOperand &MO = MI->getOperand(RegOp); @@ -815,15 +827,31 @@ MCSymbol *ARMAsmPrinter::GetARMGVSymbol(const GlobalValue *GV, assert(Subtarget->isTargetWindows() && "Windows is the only supported COFF target"); - bool IsIndirect = (TargetFlags & ARMII::MO_DLLIMPORT); + bool IsIndirect = + (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB)); if (!IsIndirect) return getSymbol(GV); SmallString<128> Name; - Name = "__imp_"; + if (TargetFlags & ARMII::MO_DLLIMPORT) + Name = "__imp_"; + else if (TargetFlags & ARMII::MO_COFFSTUB) + Name = ".refptr."; getNameWithPrefix(Name, GV); - return OutContext.getOrCreateSymbol(Name); + MCSymbol *MCSym = OutContext.getOrCreateSymbol(Name); + + if (TargetFlags & ARMII::MO_COFFSTUB) { + MachineModuleInfoCOFF &MMICOFF = + MMI->getObjFileInfo<MachineModuleInfoCOFF>(); + MachineModuleInfoImpl::StubValueTy &StubSym = + MMICOFF.getGVStubEntry(MCSym); + + if (!StubSym.getPointer()) + StubSym = MachineModuleInfoImpl::StubValueTy(getSymbol(GV), true); + } + + return MCSym; } else if (Subtarget->isTargetELF()) { return getSymbol(GV); } @@ -1043,10 +1071,12 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) { MCTargetStreamer &TS = *OutStreamer->getTargetStreamer(); ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS); const MachineFunction &MF = *MI->getParent()->getParent(); - const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); + const TargetRegisterInfo *TargetRegInfo = + MF.getSubtarget().getRegisterInfo(); + const MachineRegisterInfo &MachineRegInfo = MF.getRegInfo(); const ARMFunctionInfo &AFI = *MF.getInfo<ARMFunctionInfo>(); - unsigned FramePtr = RegInfo->getFrameRegister(MF); + unsigned FramePtr = TargetRegInfo->getFrameRegister(MF); unsigned Opc = MI->getOpcode(); unsigned SrcReg, DstReg; @@ -1103,7 +1133,9 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr 
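The PrintAsmOperand change above affects GNU-style inline assembly that uses the ARM 'Q' and 'R' operand modifiers on 64-bit operands. A hedged example of such user code (32-bit ARM only, not taken from the patch):

    #include <cstdint>

    // A 64-bit "r" operand occupies a GPR pair; %Q selects the register holding
    // the low-order 32 bits and %R the high-order 32 bits, which is what the
    // endianness-aware FirstHalf computation now honours for big-endian targets.
    uint32_t low_half(uint64_t v) {
      uint32_t out;
      asm("mov %0, %Q1" : "=r"(out) : "r"(v));
      return out;
    }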
*MI) { if (MO.isUndef()) { assert(RegList.empty() && "Pad registers must come before restored ones"); - Pad += 4; + unsigned Width = + TargetRegInfo->getRegSizeInBits(MO.getReg(), MachineRegInfo) / 8; + Pad += Width; continue; } RegList.push_back(MO.getReg()); diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp index b1c2031c7d7b..bbebed59c851 100644 --- a/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -708,8 +708,12 @@ unsigned ARMBaseInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { return MCID.getSize(); // If this machine instr is an inline asm, measure it. - if (MI.getOpcode() == ARM::INLINEASM) - return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI); + if (MI.getOpcode() == ARM::INLINEASM) { + unsigned Size = getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI); + if (!MF->getInfo<ARMFunctionInfo>()->isThumbFunction()) + Size = alignTo(Size, 4); + return Size; + } unsigned Opc = MI.getOpcode(); switch (Opc) { default: @@ -935,9 +939,9 @@ void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB, Mov->addRegisterKilled(SrcReg, TRI); } -bool ARMBaseInstrInfo::isCopyInstr(const MachineInstr &MI, - const MachineOperand *&Src, - const MachineOperand *&Dest) const { +bool ARMBaseInstrInfo::isCopyInstrImpl(const MachineInstr &MI, + const MachineOperand *&Src, + const MachineOperand *&Dest) const { // VMOVRRD is also a copy instruction but it requires // special way of handling. It is more complex copy version // and since that we are not considering it. For recognition @@ -971,8 +975,6 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, unsigned SrcReg, bool isKill, int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { - DebugLoc DL; - if (I != MBB.end()) DL = I->getDebugLoc(); MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = MF.getFrameInfo(); unsigned Align = MFI.getObjectAlignment(FI); @@ -984,7 +986,7 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, switch (TRI->getSpillSize(*RC)) { case 2: if (ARM::HPRRegClass.hasSubClassEq(RC)) { - BuildMI(MBB, I, DL, get(ARM::VSTRH)) + BuildMI(MBB, I, DebugLoc(), get(ARM::VSTRH)) .addReg(SrcReg, getKillRegState(isKill)) .addFrameIndex(FI) .addImm(0) @@ -995,14 +997,14 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, break; case 4: if (ARM::GPRRegClass.hasSubClassEq(RC)) { - BuildMI(MBB, I, DL, get(ARM::STRi12)) + BuildMI(MBB, I, DebugLoc(), get(ARM::STRi12)) .addReg(SrcReg, getKillRegState(isKill)) .addFrameIndex(FI) .addImm(0) .addMemOperand(MMO) .add(predOps(ARMCC::AL)); } else if (ARM::SPRRegClass.hasSubClassEq(RC)) { - BuildMI(MBB, I, DL, get(ARM::VSTRS)) + BuildMI(MBB, I, DebugLoc(), get(ARM::VSTRS)) .addReg(SrcReg, getKillRegState(isKill)) .addFrameIndex(FI) .addImm(0) @@ -1013,7 +1015,7 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, break; case 8: if (ARM::DPRRegClass.hasSubClassEq(RC)) { - BuildMI(MBB, I, DL, get(ARM::VSTRD)) + BuildMI(MBB, I, DebugLoc(), get(ARM::VSTRD)) .addReg(SrcReg, getKillRegState(isKill)) .addFrameIndex(FI) .addImm(0) @@ -1021,7 +1023,7 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, .add(predOps(ARMCC::AL)); } else if (ARM::GPRPairRegClass.hasSubClassEq(RC)) { if (Subtarget.hasV5TEOps()) { - MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::STRD)); + MachineInstrBuilder MIB = 
BuildMI(MBB, I, DebugLoc(), get(ARM::STRD)); AddDReg(MIB, SrcReg, ARM::gsub_0, getKillRegState(isKill), TRI); AddDReg(MIB, SrcReg, ARM::gsub_1, 0, TRI); MIB.addFrameIndex(FI).addReg(0).addImm(0).addMemOperand(MMO) @@ -1029,7 +1031,7 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, } else { // Fallback to STM instruction, which has existed since the dawn of // time. - MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::STMIA)) + MachineInstrBuilder MIB = BuildMI(MBB, I, DebugLoc(), get(ARM::STMIA)) .addFrameIndex(FI) .addMemOperand(MMO) .add(predOps(ARMCC::AL)); @@ -1043,14 +1045,14 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, if (ARM::DPairRegClass.hasSubClassEq(RC)) { // Use aligned spills if the stack can be realigned. if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) { - BuildMI(MBB, I, DL, get(ARM::VST1q64)) + BuildMI(MBB, I, DebugLoc(), get(ARM::VST1q64)) .addFrameIndex(FI) .addImm(16) .addReg(SrcReg, getKillRegState(isKill)) .addMemOperand(MMO) .add(predOps(ARMCC::AL)); } else { - BuildMI(MBB, I, DL, get(ARM::VSTMQIA)) + BuildMI(MBB, I, DebugLoc(), get(ARM::VSTMQIA)) .addReg(SrcReg, getKillRegState(isKill)) .addFrameIndex(FI) .addMemOperand(MMO) @@ -1063,14 +1065,15 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, if (ARM::DTripleRegClass.hasSubClassEq(RC)) { // Use aligned spills if the stack can be realigned. if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) { - BuildMI(MBB, I, DL, get(ARM::VST1d64TPseudo)) + BuildMI(MBB, I, DebugLoc(), get(ARM::VST1d64TPseudo)) .addFrameIndex(FI) .addImm(16) .addReg(SrcReg, getKillRegState(isKill)) .addMemOperand(MMO) .add(predOps(ARMCC::AL)); } else { - MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::VSTMDIA)) + MachineInstrBuilder MIB = BuildMI(MBB, I, DebugLoc(), + get(ARM::VSTMDIA)) .addFrameIndex(FI) .add(predOps(ARMCC::AL)) .addMemOperand(MMO); @@ -1086,14 +1089,15 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) { // FIXME: It's possible to only store part of the QQ register if the // spilled def has a sub-register index. 
- BuildMI(MBB, I, DL, get(ARM::VST1d64QPseudo)) + BuildMI(MBB, I, DebugLoc(), get(ARM::VST1d64QPseudo)) .addFrameIndex(FI) .addImm(16) .addReg(SrcReg, getKillRegState(isKill)) .addMemOperand(MMO) .add(predOps(ARMCC::AL)); } else { - MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::VSTMDIA)) + MachineInstrBuilder MIB = BuildMI(MBB, I, DebugLoc(), + get(ARM::VSTMDIA)) .addFrameIndex(FI) .add(predOps(ARMCC::AL)) .addMemOperand(MMO); @@ -1107,7 +1111,7 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, break; case 64: if (ARM::QQQQPRRegClass.hasSubClassEq(RC)) { - MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::VSTMDIA)) + MachineInstrBuilder MIB = BuildMI(MBB, I, DebugLoc(), get(ARM::VSTMDIA)) .addFrameIndex(FI) .add(predOps(ARMCC::AL)) .addMemOperand(MMO); @@ -1172,8 +1176,14 @@ unsigned ARMBaseInstrInfo::isStoreToStackSlot(const MachineInstr &MI, unsigned ARMBaseInstrInfo::isStoreToStackSlotPostFE(const MachineInstr &MI, int &FrameIndex) const { - const MachineMemOperand *Dummy; - return MI.mayStore() && hasStoreToStackSlot(MI, Dummy, FrameIndex); + SmallVector<const MachineMemOperand *, 1> Accesses; + if (MI.mayStore() && hasStoreToStackSlot(MI, Accesses)) { + FrameIndex = + cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue()) + ->getFrameIndex(); + return true; + } + return false; } void ARMBaseInstrInfo:: @@ -1386,8 +1396,14 @@ unsigned ARMBaseInstrInfo::isLoadFromStackSlot(const MachineInstr &MI, unsigned ARMBaseInstrInfo::isLoadFromStackSlotPostFE(const MachineInstr &MI, int &FrameIndex) const { - const MachineMemOperand *Dummy; - return MI.mayLoad() && hasLoadFromStackSlot(MI, Dummy, FrameIndex); + SmallVector<const MachineMemOperand *, 1> Accesses; + if (MI.mayLoad() && hasLoadFromStackSlot(MI, Accesses)) { + FrameIndex = + cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue()) + ->getFrameIndex(); + return true; + } + return false; } /// Expands MEMCPY to either LDMIA/STMIA or LDMIA_UPD/STMID_UPD @@ -1432,9 +1448,8 @@ void ARMBaseInstrInfo::expandMEMCPY(MachineBasicBlock::iterator MI) const { SmallVector<unsigned, 6> ScratchRegs; for(unsigned I = 5; I < MI->getNumOperands(); ++I) ScratchRegs.push_back(MI->getOperand(I).getReg()); - llvm::sort(ScratchRegs.begin(), ScratchRegs.end(), - [&TRI](const unsigned &Reg1, - const unsigned &Reg2) -> bool { + llvm::sort(ScratchRegs, + [&TRI](const unsigned &Reg1, const unsigned &Reg2) -> bool { return TRI.getEncodingValue(Reg1) < TRI.getEncodingValue(Reg2); }); @@ -1590,11 +1605,10 @@ void ARMBaseInstrInfo::reMaterialize(MachineBasicBlock &MBB, MachineFunction &MF = *MBB.getParent(); unsigned CPI = Orig.getOperand(1).getIndex(); unsigned PCLabelId = duplicateCPV(MF, CPI); - MachineInstrBuilder MIB = - BuildMI(MBB, I, Orig.getDebugLoc(), get(Opcode), DestReg) - .addConstantPoolIndex(CPI) - .addImm(PCLabelId); - MIB->setMemRefs(Orig.memoperands_begin(), Orig.memoperands_end()); + BuildMI(MBB, I, Orig.getDebugLoc(), get(Opcode), DestReg) + .addConstantPoolIndex(CPI) + .addImm(PCLabelId) + .cloneMemRefs(Orig); break; } } @@ -2185,6 +2199,7 @@ static const AddSubFlagsOpcodePair AddSubFlagsOpcodeMap[] = { {ARM::tSUBSi8, ARM::tSUBi8}, {ARM::tSUBSrr, ARM::tSUBrr}, {ARM::tSBCS, ARM::tSBC}, + {ARM::tRSBS, ARM::tRSB}, {ARM::t2ADDSri, ARM::t2ADDri}, {ARM::t2ADDSrr, ARM::t2ADDrr}, @@ -2949,6 +2964,8 @@ bool ARMBaseInstrInfo::optimizeCompareInstr( for (unsigned i = 0, e = OperandsToUpdate.size(); i < e; i++) OperandsToUpdate[i].first->setImm(OperandsToUpdate[i].second); + 
MI->clearRegisterDeads(ARM::CPSR); + return true; } @@ -4534,9 +4551,9 @@ void ARMBaseInstrInfo::expandLoadStackGuardBase(MachineBasicBlock::iterator MI, MIB = BuildMI(MBB, MI, DL, get(LoadOpc), Reg); MIB.addReg(Reg, RegState::Kill) - .addImm(0) - .setMemRefs(MI->memoperands_begin(), MI->memoperands_end()) - .add(predOps(ARMCC::AL)); + .addImm(0) + .cloneMemRefs(*MI) + .add(predOps(ARMCC::AL)); } bool @@ -5061,3 +5078,32 @@ bool ARMBaseInstrInfo::getInsertSubregLikeInputs( } llvm_unreachable("Target dependent opcode missing"); } + +std::pair<unsigned, unsigned> +ARMBaseInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const { + const unsigned Mask = ARMII::MO_OPTION_MASK; + return std::make_pair(TF & Mask, TF & ~Mask); +} + +ArrayRef<std::pair<unsigned, const char *>> +ARMBaseInstrInfo::getSerializableDirectMachineOperandTargetFlags() const { + using namespace ARMII; + + static const std::pair<unsigned, const char *> TargetFlags[] = { + {MO_LO16, "arm-lo16"}, {MO_HI16, "arm-hi16"}}; + return makeArrayRef(TargetFlags); +} + +ArrayRef<std::pair<unsigned, const char *>> +ARMBaseInstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const { + using namespace ARMII; + + static const std::pair<unsigned, const char *> TargetFlags[] = { + {MO_COFFSTUB, "arm-coffstub"}, + {MO_GOT, "arm-got"}, + {MO_SBREL, "arm-sbrel"}, + {MO_DLLIMPORT, "arm-dllimport"}, + {MO_SECREL, "arm-secrel"}, + {MO_NONLAZY, "arm-nonlazy"}}; + return makeArrayRef(TargetFlags); +} diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h index b54be15097b1..de1f307083ba 100644 --- a/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h @@ -101,6 +101,12 @@ protected: unsigned OpIdx1, unsigned OpIdx2) const override; + /// If the specific machine instruction is a instruction that moves/copies + /// value from one register to another register return true along with + /// @Source machine operand and @Destination machine operand. + bool isCopyInstrImpl(const MachineInstr &MI, const MachineOperand *&Source, + const MachineOperand *&Destination) const override; + public: // Return whether the target has an explicit NOP encoding. bool hasNOP() const; @@ -201,9 +207,6 @@ public: const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, bool KillSrc) const override; - bool isCopyInstr(const MachineInstr &MI, const MachineOperand *&Src, - const MachineOperand *&Dest) const override; - void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned SrcReg, bool isKill, int FrameIndex, @@ -331,6 +334,13 @@ public: /// Get the number of addresses by LDM or VLDM or zero for unknown. 
unsigned getNumLDMAddresses(const MachineInstr &MI) const; + std::pair<unsigned, unsigned> + decomposeMachineOperandsTargetFlags(unsigned TF) const override; + ArrayRef<std::pair<unsigned, const char *>> + getSerializableDirectMachineOperandTargetFlags() const override; + ArrayRef<std::pair<unsigned, const char *>> + getSerializableBitmaskMachineOperandTargetFlags() const override; + private: unsigned getInstBundleLength(const MachineInstr &MI) const; diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp index 5342e6e2cd13..02b3daf3c6fd 100644 --- a/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -209,6 +209,11 @@ getReservedRegs(const MachineFunction &MF) const { return Reserved; } +bool ARMBaseRegisterInfo:: +isAsmClobberable(const MachineFunction &MF, unsigned PhysReg) const { + return !getReservedRegs(MF).test(PhysReg); +} + const TargetRegisterClass * ARMBaseRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC, const MachineFunction &) const { diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h index f755f66a0f3a..45d29ebc0bd3 100644 --- a/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h +++ b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h @@ -131,6 +131,8 @@ public: CallingConv::ID) const; BitVector getReservedRegs(const MachineFunction &MF) const override; + bool isAsmClobberable(const MachineFunction &MF, + unsigned PhysReg) const override; const TargetRegisterClass * getPointerRegClass(const MachineFunction &MF, @@ -154,7 +156,6 @@ public: void updateRegAllocHint(unsigned Reg, unsigned NewReg, MachineFunction &MF) const override; - bool enableMultipleCopyHints() const override { return true; } bool hasBasePointer(const MachineFunction &MF) const; diff --git a/contrib/llvm/lib/Target/ARM/ARMCallLowering.cpp b/contrib/llvm/lib/Target/ARM/ARMCallLowering.cpp index 47f998b696f5..8e80c32bcf89 100644 --- a/contrib/llvm/lib/Target/ARM/ARMCallLowering.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMCallLowering.cpp @@ -237,7 +237,7 @@ void ARMCallLowering::splitToValueTypes( /// Lower the return value for the already existing \p Ret. This assumes that /// \p MIRBuilder's insertion point is correct. bool ARMCallLowering::lowerReturnVal(MachineIRBuilder &MIRBuilder, - const Value *Val, unsigned VReg, + const Value *Val, ArrayRef<unsigned> VRegs, MachineInstrBuilder &Ret) const { if (!Val) // Nothing to do here. 
@@ -251,16 +251,24 @@ bool ARMCallLowering::lowerReturnVal(MachineIRBuilder &MIRBuilder, if (!isSupportedType(DL, TLI, Val->getType())) return false; - SmallVector<ArgInfo, 4> SplitVTs; - SmallVector<unsigned, 4> Regs; - ArgInfo RetInfo(VReg, Val->getType()); - setArgFlags(RetInfo, AttributeList::ReturnIndex, DL, F); - splitToValueTypes(RetInfo, SplitVTs, MF, [&](unsigned Reg, uint64_t Offset) { - Regs.push_back(Reg); - }); + SmallVector<EVT, 4> SplitEVTs; + ComputeValueVTs(TLI, DL, Val->getType(), SplitEVTs); + assert(VRegs.size() == SplitEVTs.size() && + "For each split Type there should be exactly one VReg."); - if (Regs.size() > 1) - MIRBuilder.buildUnmerge(Regs, VReg); + SmallVector<ArgInfo, 4> SplitVTs; + LLVMContext &Ctx = Val->getType()->getContext(); + for (unsigned i = 0; i < SplitEVTs.size(); ++i) { + ArgInfo CurArgInfo(VRegs[i], SplitEVTs[i].getTypeForEVT(Ctx)); + setArgFlags(CurArgInfo, AttributeList::ReturnIndex, DL, F); + + SmallVector<unsigned, 4> Regs; + splitToValueTypes( + CurArgInfo, SplitVTs, MF, + [&](unsigned Reg, uint64_t Offset) { Regs.push_back(Reg); }); + if (Regs.size() > 1) + MIRBuilder.buildUnmerge(Regs, VRegs[i]); + } CCAssignFn *AssignFn = TLI.CCAssignFnForReturn(F.getCallingConv(), F.isVarArg()); @@ -270,14 +278,15 @@ bool ARMCallLowering::lowerReturnVal(MachineIRBuilder &MIRBuilder, } bool ARMCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, - const Value *Val, unsigned VReg) const { - assert(!Val == !VReg && "Return value without a vreg"); + const Value *Val, + ArrayRef<unsigned> VRegs) const { + assert(!Val == VRegs.empty() && "Return value without a vreg"); auto const &ST = MIRBuilder.getMF().getSubtarget<ARMSubtarget>(); unsigned Opcode = ST.getReturnOpcode(); auto Ret = MIRBuilder.buildInstrNoInsert(Opcode).add(predOps(ARMCC::AL)); - if (!lowerReturnVal(MIRBuilder, Val, VReg, Ret)) + if (!lowerReturnVal(MIRBuilder, Val, VRegs, Ret)) return false; MIRBuilder.insertInstr(Ret); @@ -420,7 +429,7 @@ bool ARMCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, auto &TLI = *getTLI<ARMTargetLowering>(); auto Subtarget = TLI.getSubtarget(); - if (Subtarget->isThumb()) + if (Subtarget->isThumb1Only()) return false; // Quick exit if there aren't any args @@ -491,6 +500,22 @@ struct CallReturnHandler : public IncomingValueHandler { MachineInstrBuilder MIB; }; +// FIXME: This should move to the ARMSubtarget when it supports all the opcodes. +unsigned getCallOpcode(const ARMSubtarget &STI, bool isDirect) { + if (isDirect) + return STI.isThumb() ? ARM::tBL : ARM::BL; + + if (STI.isThumb()) + return ARM::tBLXr; + + if (STI.hasV5TOps()) + return ARM::BLX; + + if (STI.hasV4TOps()) + return ARM::BX_CALL; + + return ARM::BMOVPCRX_CALL; +} } // end anonymous namespace bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, @@ -508,27 +533,34 @@ bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, if (STI.genLongCalls()) return false; + if (STI.isThumb1Only()) + return false; + auto CallSeqStart = MIRBuilder.buildInstr(ARM::ADJCALLSTACKDOWN); // Create the call instruction so we can add the implicit uses of arg // registers, but don't insert it yet. bool isDirect = !Callee.isReg(); - auto CallOpcode = - isDirect ? ARM::BL - : STI.hasV5TOps() - ? ARM::BLX - : STI.hasV4TOps() ? 
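The reason lowerReturn and lowerReturnVal now take ArrayRef<unsigned> is that a single IR return value can be split into several virtual registers by ComputeValueVTs. A hedged C++ illustration of source whose return splits this way (names are illustrative, not from the patch):

    struct Pair {
      int First;
      int Second;
    };

    // Under GlobalISel on ARM, returning this aggregate yields one virtual
    // register per member, so the return lowering receives two vregs here.
    Pair makePair(int A, int B) {
      return Pair{A, B};
    }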
ARM::BX_CALL : ARM::BMOVPCRX_CALL; - auto MIB = MIRBuilder.buildInstrNoInsert(CallOpcode) - .add(Callee) - .addRegMask(TRI->getCallPreservedMask(MF, CallConv)); - if (Callee.isReg()) { + auto CallOpcode = getCallOpcode(STI, isDirect); + auto MIB = MIRBuilder.buildInstrNoInsert(CallOpcode); + + bool isThumb = STI.isThumb(); + if (isThumb) + MIB.add(predOps(ARMCC::AL)); + + MIB.add(Callee); + if (!isDirect) { auto CalleeReg = Callee.getReg(); - if (CalleeReg && !TRI->isPhysicalRegister(CalleeReg)) - MIB->getOperand(0).setReg(constrainOperandRegClass( + if (CalleeReg && !TRI->isPhysicalRegister(CalleeReg)) { + unsigned CalleeIdx = isThumb ? 2 : 0; + MIB->getOperand(CalleeIdx).setReg(constrainOperandRegClass( MF, *TRI, MRI, *STI.getInstrInfo(), *STI.getRegBankInfo(), - *MIB.getInstr(), MIB->getDesc(), Callee, 0)); + *MIB.getInstr(), MIB->getDesc(), Callee, CalleeIdx)); + } } + MIB.addRegMask(TRI->getCallPreservedMask(MF, CallConv)); + SmallVector<ArgInfo, 8> ArgInfos; for (auto Arg : OrigArgs) { if (!isSupportedType(DL, TLI, Arg.Ty)) diff --git a/contrib/llvm/lib/Target/ARM/ARMCallLowering.h b/contrib/llvm/lib/Target/ARM/ARMCallLowering.h index 86854c53f179..45a988a2f00e 100644 --- a/contrib/llvm/lib/Target/ARM/ARMCallLowering.h +++ b/contrib/llvm/lib/Target/ARM/ARMCallLowering.h @@ -33,8 +33,8 @@ class ARMCallLowering : public CallLowering { public: ARMCallLowering(const ARMTargetLowering &TLI); - bool lowerReturn(MachineIRBuilder &MIRBuiler, const Value *Val, - unsigned VReg) const override; + bool lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val, + ArrayRef<unsigned> VRegs) const override; bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F, ArrayRef<unsigned> VRegs) const override; @@ -45,7 +45,8 @@ public: private: bool lowerReturnVal(MachineIRBuilder &MIRBuilder, const Value *Val, - unsigned VReg, MachineInstrBuilder &Ret) const; + ArrayRef<unsigned> VRegs, + MachineInstrBuilder &Ret) const; using SplitArgTy = std::function<void(unsigned Reg, uint64_t Offset)>; diff --git a/contrib/llvm/lib/Target/ARM/ARMCodeGenPrepare.cpp b/contrib/llvm/lib/Target/ARM/ARMCodeGenPrepare.cpp index 24071277427a..b631c2bc687b 100644 --- a/contrib/llvm/lib/Target/ARM/ARMCodeGenPrepare.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMCodeGenPrepare.cpp @@ -54,47 +54,108 @@ EnableDSPWithImms("arm-enable-scalar-dsp-imms", cl::Hidden, cl::init(false), cl::desc("Use DSP instructions for scalar operations\ with immediate operands")); -namespace { +// The goal of this pass is to enable more efficient code generation for +// operations on narrow types (i.e. types with < 32-bits) and this is a +// motivating IR code example: +// +// define hidden i32 @cmp(i8 zeroext) { +// %2 = add i8 %0, -49 +// %3 = icmp ult i8 %2, 3 +// .. +// } +// +// The issue here is that i8 is type-legalized to i32 because i8 is not a +// legal type. Thus, arithmetic is done in integer-precision, but then the +// byte value is masked out as follows: +// +// t19: i32 = add t4, Constant:i32<-49> +// t24: i32 = and t19, Constant:i32<255> +// +// Consequently, we generate code like this: +// +// subs r0, #49 +// uxtb r1, r0 +// cmp r1, #3 +// +// This shows that masking out the byte value results in generation of +// the UXTB instruction. 
This is not optimal as r0 already contains the byte +// value we need, and so instead we can just generate: +// +// sub.w r1, r0, #49 +// cmp r1, #3 +// +// We achieve this by type promoting the IR to i32 like so for this example: +// +// define i32 @cmp(i8 zeroext %c) { +// %0 = zext i8 %c to i32 +// %c.off = add i32 %0, -49 +// %1 = icmp ult i32 %c.off, 3 +// .. +// } +// +// For this to be valid and legal, we need to prove that the i32 add is +// producing the same value as the i8 addition, and that e.g. no overflow +// happens. +// +// A brief sketch of the algorithm and some terminology. +// We pattern match interesting IR patterns: +// - which have "sources": instructions producing narrow values (i8, i16), and +// - they have "sinks": instructions consuming these narrow values. +// +// We collect all instruction connecting sources and sinks in a worklist, so +// that we can mutate these instruction and perform type promotion when it is +// legal to do so. +namespace { class IRPromoter { SmallPtrSet<Value*, 8> NewInsts; - SmallVector<Instruction*, 4> InstsToRemove; + SmallPtrSet<Instruction*, 4> InstsToRemove; + DenseMap<Value*, SmallVector<Type*, 4>> TruncTysMap; + SmallPtrSet<Value*, 8> Promoted; Module *M = nullptr; LLVMContext &Ctx; + IntegerType *ExtTy = nullptr; + IntegerType *OrigTy = nullptr; + SmallPtrSetImpl<Value*> *Visited; + SmallPtrSetImpl<Value*> *Sources; + SmallPtrSetImpl<Instruction*> *Sinks; + SmallPtrSetImpl<Instruction*> *SafeToPromote; + + void ReplaceAllUsersOfWith(Value *From, Value *To); + void PrepareConstants(void); + void ExtendSources(void); + void ConvertTruncs(void); + void PromoteTree(void); + void TruncateSinks(void); + void Cleanup(void); public: - IRPromoter(Module *M) : M(M), Ctx(M->getContext()) { } + IRPromoter(Module *M) : M(M), Ctx(M->getContext()), + ExtTy(Type::getInt32Ty(Ctx)) { } - void Cleanup() { - for (auto *I : InstsToRemove) { - LLVM_DEBUG(dbgs() << "ARM CGP: Removing " << *I << "\n"); - I->dropAllReferences(); - I->eraseFromParent(); - } - InstsToRemove.clear(); - NewInsts.clear(); - } void Mutate(Type *OrigTy, SmallPtrSetImpl<Value*> &Visited, - SmallPtrSetImpl<Value*> &Leaves, - SmallPtrSetImpl<Instruction*> &Roots); + SmallPtrSetImpl<Value*> &Sources, + SmallPtrSetImpl<Instruction*> &Sinks, + SmallPtrSetImpl<Instruction*> &SafeToPromote); }; class ARMCodeGenPrepare : public FunctionPass { const ARMSubtarget *ST = nullptr; IRPromoter *Promoter = nullptr; std::set<Value*> AllVisited; - Type *OrigTy = nullptr; - unsigned TypeSize = 0; + SmallPtrSet<Instruction*, 8> SafeToPromote; - bool isNarrowInstSupported(Instruction *I); + bool isSafeOverflow(Instruction *I); bool isSupportedValue(Value *V); bool isLegalToPromote(Value *V); bool TryToPromote(Value *V); public: static char ID; + static unsigned TypeSize; + Type *OrigTy = nullptr; ARMCodeGenPrepare() : FunctionPass(ID) {} @@ -111,8 +172,7 @@ public: } -/// Can the given value generate sign bits. 
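A hedged guess at the C source behind the motivating IR in the comment above: 49 is ASCII '1', so the function tests whether a byte is one of the characters '1', '2' or '3'.

    int cmp(unsigned char c) {
      unsigned char off = c - 49; // becomes the i8 add with constant -49
      return off < 3;             // the unsigned i8 compare (icmp ult)
    }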
-static bool isSigned(Value *V) { +static bool generateSignBits(Value *V) { if (!isa<Instruction>(V)) return false; @@ -121,120 +181,226 @@ static bool isSigned(Value *V) { Opc == Instruction::SRem; } +static bool EqualTypeSize(Value *V) { + return V->getType()->getScalarSizeInBits() == ARMCodeGenPrepare::TypeSize; +} + +static bool LessOrEqualTypeSize(Value *V) { + return V->getType()->getScalarSizeInBits() <= ARMCodeGenPrepare::TypeSize; +} + +static bool GreaterThanTypeSize(Value *V) { + return V->getType()->getScalarSizeInBits() > ARMCodeGenPrepare::TypeSize; +} + +static bool LessThanTypeSize(Value *V) { + return V->getType()->getScalarSizeInBits() < ARMCodeGenPrepare::TypeSize; +} + /// Some instructions can use 8- and 16-bit operands, and we don't need to /// promote anything larger. We disallow booleans to make life easier when /// dealing with icmps but allow any other integer that is <= 16 bits. Void /// types are accepted so we can handle switches. static bool isSupportedType(Value *V) { - if (V->getType()->isVoidTy()) + Type *Ty = V->getType(); + + // Allow voids and pointers, these won't be promoted. + if (Ty->isVoidTy() || Ty->isPointerTy()) return true; - const IntegerType *IntTy = dyn_cast<IntegerType>(V->getType()); - if (!IntTy) - return false; + if (auto *Ld = dyn_cast<LoadInst>(V)) + Ty = cast<PointerType>(Ld->getPointerOperandType())->getElementType(); - // Don't try to promote boolean values. - if (IntTy->getBitWidth() == 1) + if (!isa<IntegerType>(Ty) || + cast<IntegerType>(V->getType())->getBitWidth() == 1) return false; - if (auto *ZExt = dyn_cast<ZExtInst>(V)) - return isSupportedType(ZExt->getOperand(0)); + return LessOrEqualTypeSize(V); +} - return IntTy->getBitWidth() <= 16; +/// Return true if the given value is a source in the use-def chain, producing +/// a narrow 'TypeSize' value. These values will be zext to start the promotion +/// of the tree to i32. We guarantee that these won't populate the upper bits +/// of the register. ZExt on the loads will be free, and the same for call +/// return values because we only accept ones that guarantee a zeroext ret val. +/// Many arguments will have the zeroext attribute too, so those would be free +/// too. +static bool isSource(Value *V) { + if (!isa<IntegerType>(V->getType())) + return false; + + // TODO Allow zext to be sources. + if (isa<Argument>(V)) + return true; + else if (isa<LoadInst>(V)) + return true; + else if (isa<BitCastInst>(V)) + return true; + else if (auto *Call = dyn_cast<CallInst>(V)) + return Call->hasRetAttr(Attribute::AttrKind::ZExt); + else if (auto *Trunc = dyn_cast<TruncInst>(V)) + return EqualTypeSize(Trunc); + return false; } /// Return true if V will require any promoted values to be truncated for the -/// use to be valid. +/// the IR to remain valid. We can't mutate the value type of these +/// instructions. static bool isSink(Value *V) { - auto UsesNarrowValue = [](Value *V) { - return V->getType()->getScalarSizeInBits() <= 32; - }; - + // TODO The truncate also isn't actually necessary because we would already + // proved that the data value is kept within the range of the original data + // type. + + // Sinks are: + // - points where the value in the register is being observed, such as an + // icmp, switch or store. + // - points where value types have to match, such as calls and returns. + // - zext are included to ease the transformation and are generally removed + // later on. 
if (auto *Store = dyn_cast<StoreInst>(V)) - return UsesNarrowValue(Store->getValueOperand()); + return LessOrEqualTypeSize(Store->getValueOperand()); if (auto *Return = dyn_cast<ReturnInst>(V)) - return UsesNarrowValue(Return->getReturnValue()); + return LessOrEqualTypeSize(Return->getReturnValue()); + if (auto *ZExt = dyn_cast<ZExtInst>(V)) + return GreaterThanTypeSize(ZExt); + if (auto *Switch = dyn_cast<SwitchInst>(V)) + return LessThanTypeSize(Switch->getCondition()); + if (auto *ICmp = dyn_cast<ICmpInst>(V)) + return ICmp->isSigned() || LessThanTypeSize(ICmp->getOperand(0)); return isa<CallInst>(V); } -/// Return true if the given value is a leaf that will need to be zext'd. -static bool isSource(Value *V) { - if (isa<Argument>(V) && isSupportedType(V)) - return true; - else if (isa<TruncInst>(V)) - return true; - else if (auto *ZExt = dyn_cast<ZExtInst>(V)) - // ZExt can be a leaf if its the only user of a load. - return isa<LoadInst>(ZExt->getOperand(0)) && - ZExt->getOperand(0)->hasOneUse(); - else if (auto *Call = dyn_cast<CallInst>(V)) - return Call->hasRetAttr(Attribute::AttrKind::ZExt); - else if (auto *Load = dyn_cast<LoadInst>(V)) { - if (!isa<IntegerType>(Load->getType())) - return false; - // A load is a leaf, unless its already just being zext'd. - if (Load->hasOneUse() && isa<ZExtInst>(*Load->use_begin())) - return false; - - return true; - } - return false; -} - /// Return whether the instruction can be promoted within any modifications to -/// it's operands or result. -static bool isSafeOverflow(Instruction *I) { +/// its operands or result. +bool ARMCodeGenPrepare::isSafeOverflow(Instruction *I) { + // FIXME Do we need NSW too? if (isa<OverflowingBinaryOperator>(I) && I->hasNoUnsignedWrap()) return true; + // We can support a, potentially, overflowing instruction (I) if: + // - It is only used by an unsigned icmp. + // - The icmp uses a constant. + // - The overflowing value (I) is decreasing, i.e would underflow - wrapping + // around zero to become a larger number than before. + // - The underflowing instruction (I) also uses a constant. + // + // We can then use the two constants to calculate whether the result would + // wrap in respect to itself in the original bitwidth. If it doesn't wrap, + // just underflows the range, the icmp would give the same result whether the + // result has been truncated or not. We calculate this by: + // - Zero extending both constants, if needed, to 32-bits. + // - Take the absolute value of I's constant, adding this to the icmp const. + // - Check that this value is not out of range for small type. If it is, it + // means that it has underflowed enough to wrap around the icmp constant. + // + // For example: + // + // %sub = sub i8 %a, 2 + // %cmp = icmp ule i8 %sub, 254 + // + // If %a = 0, %sub = -2 == FE == 254 + // But if this is evalulated as a i32 + // %sub = -2 == FF FF FF FE == 4294967294 + // So the unsigned compares (i8 and i32) would not yield the same result. + // + // Another way to look at it is: + // %a - 2 <= 254 + // %a + 2 <= 254 + 2 + // %a <= 256 + // And we can't represent 256 in the i8 format, so we don't support it. 
+ // + // Whereas: + // + // %sub i8 %a, 1 + // %cmp = icmp ule i8 %sub, 254 + // + // If %a = 0, %sub = -1 == FF == 255 + // As i32: + // %sub = -1 == FF FF FF FF == 4294967295 + // + // In this case, the unsigned compare results would be the same and this + // would also be true for ult, uge and ugt: + // - (255 < 254) == (0xFFFFFFFF < 254) == false + // - (255 <= 254) == (0xFFFFFFFF <= 254) == false + // - (255 > 254) == (0xFFFFFFFF > 254) == true + // - (255 >= 254) == (0xFFFFFFFF >= 254) == true + // + // To demonstrate why we can't handle increasing values: + // + // %add = add i8 %a, 2 + // %cmp = icmp ult i8 %add, 127 + // + // If %a = 254, %add = 256 == (i8 1) + // As i32: + // %add = 256 + // + // (1 < 127) != (256 < 127) + unsigned Opc = I->getOpcode(); - if (Opc == Instruction::Add || Opc == Instruction::Sub) { - // We don't care if the add or sub could wrap if the value is decreasing - // and is only being used by an unsigned compare. - if (!I->hasOneUse() || - !isa<ICmpInst>(*I->user_begin()) || - !isa<ConstantInt>(I->getOperand(1))) - return false; + if (Opc != Instruction::Add && Opc != Instruction::Sub) + return false; - auto *CI = cast<ICmpInst>(*I->user_begin()); - if (CI->isSigned()) - return false; + if (!I->hasOneUse() || + !isa<ICmpInst>(*I->user_begin()) || + !isa<ConstantInt>(I->getOperand(1))) + return false; - bool NegImm = cast<ConstantInt>(I->getOperand(1))->isNegative(); - bool IsDecreasing = ((Opc == Instruction::Sub) && !NegImm) || - ((Opc == Instruction::Add) && NegImm); - if (!IsDecreasing) - return false; + ConstantInt *OverflowConst = cast<ConstantInt>(I->getOperand(1)); + bool NegImm = OverflowConst->isNegative(); + bool IsDecreasing = ((Opc == Instruction::Sub) && !NegImm) || + ((Opc == Instruction::Add) && NegImm); + if (!IsDecreasing) + return false; - LLVM_DEBUG(dbgs() << "ARM CGP: Allowing safe overflow for " << *I << "\n"); - return true; - } + // Don't support an icmp that deals with sign bits. + auto *CI = cast<ICmpInst>(*I->user_begin()); + if (CI->isSigned() || CI->isEquality()) + return false; - // Otherwise, if an instruction is using a negative immediate we will need - // to fix it up during the promotion. - for (auto &Op : I->operands()) { - if (auto *Const = dyn_cast<ConstantInt>(Op)) - if (Const->isNegative()) - return false; - } - return false; + ConstantInt *ICmpConst = nullptr; + if (auto *Const = dyn_cast<ConstantInt>(CI->getOperand(0))) + ICmpConst = Const; + else if (auto *Const = dyn_cast<ConstantInt>(CI->getOperand(1))) + ICmpConst = Const; + else + return false; + + // Now check that the result can't wrap on itself. + APInt Total = ICmpConst->getValue().getBitWidth() < 32 ? + ICmpConst->getValue().zext(32) : ICmpConst->getValue(); + + Total += OverflowConst->getValue().getBitWidth() < 32 ? 
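The underflow argument in the comment above can be checked directly; this standalone demo (assumed for illustration, not part of the patch) reproduces the 254 vs. 4294967294 case and the safe a-1 case:

    #include <cassert>
    #include <cstdint>

    int main() {
      uint8_t a = 0;
      uint8_t narrow = static_cast<uint8_t>(a - 2);   // 254
      uint32_t wide = static_cast<uint32_t>(a) - 2;   // 4294967294
      assert((narrow <= 254) && !(wide <= 254));      // compares disagree: unsafe

      uint8_t narrow1 = static_cast<uint8_t>(a - 1);  // 255
      uint32_t wide1 = static_cast<uint32_t>(a) - 1;  // 4294967295
      assert(!(narrow1 <= 254) && !(wide1 <= 254));   // compares agree: safe
      return 0;
    }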
+ OverflowConst->getValue().abs().zext(32) : OverflowConst->getValue().abs(); + + APInt Max = APInt::getAllOnesValue(ARMCodeGenPrepare::TypeSize); + + if (Total.getBitWidth() > Max.getBitWidth()) { + if (Total.ugt(Max.zext(Total.getBitWidth()))) + return false; + } else if (Max.getBitWidth() > Total.getBitWidth()) { + if (Total.zext(Max.getBitWidth()).ugt(Max)) + return false; + } else if (Total.ugt(Max)) + return false; + + LLVM_DEBUG(dbgs() << "ARM CGP: Allowing safe overflow for " << *I << "\n"); + return true; } static bool shouldPromote(Value *V) { - auto *I = dyn_cast<Instruction>(V); - if (!I) + if (!isa<IntegerType>(V->getType()) || isSink(V)) return false; - if (!isa<IntegerType>(V->getType())) - return false; + if (isSource(V)) + return true; - if (isa<StoreInst>(I) || isa<TerminatorInst>(I) || isa<TruncInst>(I) || - isa<ICmpInst>(I)) + auto *I = dyn_cast<Instruction>(V); + if (!I) return false; - if (auto *ZExt = dyn_cast<ZExtInst>(I)) - return !ZExt->getDestTy()->isIntegerTy(32); + if (isa<ICmpInst>(I)) + return false; return true; } @@ -245,24 +411,16 @@ static bool isPromotedResultSafe(Value *V) { if (!isa<Instruction>(V)) return true; - if (isSigned(V)) + if (generateSignBits(V)) return false; - // If I is only being used by something that will require its value to be - // truncated, then we don't care about the promoted result. - auto *I = cast<Instruction>(V); - if (I->hasOneUse() && isSink(*I->use_begin())) - return true; - - if (isa<OverflowingBinaryOperator>(I)) - return isSafeOverflow(I); - return true; + return !isa<OverflowingBinaryOperator>(V); } /// Return the intrinsic for the instruction that can perform the same /// operation but on a narrow type. This is using the parallel dsp intrinsics /// on scalar values. -static Intrinsic::ID getNarrowIntrinsic(Instruction *I, unsigned TypeSize) { +static Intrinsic::ID getNarrowIntrinsic(Instruction *I) { // Whether we use the signed or unsigned versions of these intrinsics // doesn't matter because we're not using the GE bits that they set in // the APSR. @@ -270,124 +428,163 @@ static Intrinsic::ID getNarrowIntrinsic(Instruction *I, unsigned TypeSize) { default: break; case Instruction::Add: - return TypeSize == 16 ? Intrinsic::arm_uadd16 : + return ARMCodeGenPrepare::TypeSize == 16 ? Intrinsic::arm_uadd16 : Intrinsic::arm_uadd8; case Instruction::Sub: - return TypeSize == 16 ? Intrinsic::arm_usub16 : + return ARMCodeGenPrepare::TypeSize == 16 ? 
Intrinsic::arm_usub16 : Intrinsic::arm_usub8; } llvm_unreachable("unhandled opcode for narrow intrinsic"); } -void IRPromoter::Mutate(Type *OrigTy, - SmallPtrSetImpl<Value*> &Visited, - SmallPtrSetImpl<Value*> &Leaves, - SmallPtrSetImpl<Instruction*> &Roots) { +void IRPromoter::ReplaceAllUsersOfWith(Value *From, Value *To) { + SmallVector<Instruction*, 4> Users; + Instruction *InstTo = dyn_cast<Instruction>(To); + bool ReplacedAll = true; + + LLVM_DEBUG(dbgs() << "ARM CGP: Replacing " << *From << " with " << *To + << "\n"); + + for (Use &U : From->uses()) { + auto *User = cast<Instruction>(U.getUser()); + if (InstTo && User->isIdenticalTo(InstTo)) { + ReplacedAll = false; + continue; + } + Users.push_back(User); + } + + for (auto *U : Users) + U->replaceUsesOfWith(From, To); + + if (ReplacedAll) + if (auto *I = dyn_cast<Instruction>(From)) + InstsToRemove.insert(I); +} + +void IRPromoter::PrepareConstants() { IRBuilder<> Builder{Ctx}; - Type *ExtTy = Type::getInt32Ty(M->getContext()); - unsigned TypeSize = OrigTy->getPrimitiveSizeInBits(); - SmallPtrSet<Value*, 8> Promoted; - LLVM_DEBUG(dbgs() << "ARM CGP: Promoting use-def chains to from " << TypeSize - << " to 32-bits\n"); - - auto ReplaceAllUsersOfWith = [&](Value *From, Value *To) { - SmallVector<Instruction*, 4> Users; - Instruction *InstTo = dyn_cast<Instruction>(To); - for (Use &U : From->uses()) { - auto *User = cast<Instruction>(U.getUser()); - if (InstTo && User->isIdenticalTo(InstTo)) + // First step is to prepare the instructions for mutation. Most constants + // just need to be zero extended into their new type, but complications arise + // because: + // - For nuw binary operators, negative immediates would need sign extending; + // however, instead we'll change them to positive and zext them. We can do + // this because: + // > The operators that can wrap are: add, sub, mul and shl. + // > shl interprets its second operand as unsigned and if the first operand + // is an immediate, it will need zext to be nuw. + // > I'm assuming mul has to interpret immediates as unsigned for nuw. + // > Which leaves the nuw add and sub to be handled; as with shl, if an + // immediate is used as operand 0, it will need zext to be nuw. + // - We also allow add and sub to safely overflow in certain circumstances + // and only when the value (operand 0) is being decreased. + // + // For adds and subs, that are either nuw or safely wrap and use a negative + // immediate as operand 1, we create an equivalent instruction using a + // positive immediate. That positive immediate can then be zext along with + // all the other immediates later. + for (auto *V : *Visited) { + if (!isa<Instruction>(V)) + continue; + + auto *I = cast<Instruction>(V); + if (SafeToPromote->count(I)) { + + if (!isa<OverflowingBinaryOperator>(I)) continue; - Users.push_back(User); - } - for (auto &U : Users) - U->replaceUsesOfWith(From, To); - }; + if (auto *Const = dyn_cast<ConstantInt>(I->getOperand(1))) { + if (!Const->isNegative()) + break; - auto FixConst = [&](ConstantInt *Const, Instruction *I) { - Constant *NewConst = nullptr; - if (isSafeOverflow(I)) { - NewConst = (Const->isNegative()) ? 
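Since the comment notes that the GE bits set by these intrinsics are unused here, the lane-wise behaviour getNarrowIntrinsic relies on can be modelled in plain C++; the helper name below is made up for illustration only:

    #include <cstdint>

    // Model of uadd16: independent modular addition of the two 16-bit lanes
    // packed in a 32-bit word, with no carry between lanes.
    uint32_t uadd16_model(uint32_t a, uint32_t b) {
      uint32_t lo = (a + b) & 0xFFFFu;                  // low 16-bit lane
      uint32_t hi = ((a >> 16) + (b >> 16)) & 0xFFFFu;  // high 16-bit lane
      return (hi << 16) | lo;
    }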
- ConstantExpr::getSExt(Const, ExtTy) : - ConstantExpr::getZExt(Const, ExtTy); - } else { - uint64_t NewVal = *Const->getValue().getRawData(); - if (Const->getType() == Type::getInt16Ty(Ctx)) - NewVal &= 0xFFFF; - else - NewVal &= 0xFF; - NewConst = ConstantInt::get(ExtTy, NewVal); + unsigned Opc = I->getOpcode(); + if (Opc != Instruction::Add && Opc != Instruction::Sub) + continue; + + LLVM_DEBUG(dbgs() << "ARM CGP: Adjusting " << *I << "\n"); + auto *NewConst = ConstantInt::get(Ctx, Const->getValue().abs()); + Builder.SetInsertPoint(I); + Value *NewVal = Opc == Instruction::Sub ? + Builder.CreateAdd(I->getOperand(0), NewConst) : + Builder.CreateSub(I->getOperand(0), NewConst); + LLVM_DEBUG(dbgs() << "ARM CGP: New equivalent: " << *NewVal << "\n"); + + if (auto *NewInst = dyn_cast<Instruction>(NewVal)) { + NewInst->copyIRFlags(I); + NewInsts.insert(NewInst); + } + InstsToRemove.insert(I); + I->replaceAllUsesWith(NewVal); + } } - I->replaceUsesOfWith(Const, NewConst); - }; + } + for (auto *I : NewInsts) + Visited->insert(I); +} - auto InsertDSPIntrinsic = [&](Instruction *I) { - LLVM_DEBUG(dbgs() << "ARM CGP: Inserting DSP intrinsic for " - << *I << "\n"); - Function *DSPInst = - Intrinsic::getDeclaration(M, getNarrowIntrinsic(I, TypeSize)); - Builder.SetInsertPoint(I); - Builder.SetCurrentDebugLocation(I->getDebugLoc()); - Value *Args[] = { I->getOperand(0), I->getOperand(1) }; - CallInst *Call = Builder.CreateCall(DSPInst, Args); - ReplaceAllUsersOfWith(I, Call); - InstsToRemove.push_back(I); - NewInsts.insert(Call); - }; +void IRPromoter::ExtendSources() { + IRBuilder<> Builder{Ctx}; auto InsertZExt = [&](Value *V, Instruction *InsertPt) { + assert(V->getType() != ExtTy && "zext already extends to i32"); LLVM_DEBUG(dbgs() << "ARM CGP: Inserting ZExt for " << *V << "\n"); Builder.SetInsertPoint(InsertPt); if (auto *I = dyn_cast<Instruction>(V)) Builder.SetCurrentDebugLocation(I->getDebugLoc()); - auto *ZExt = cast<Instruction>(Builder.CreateZExt(V, ExtTy)); - if (isa<Argument>(V)) - ZExt->moveBefore(InsertPt); - else - ZExt->moveAfter(InsertPt); + + Value *ZExt = Builder.CreateZExt(V, ExtTy); + if (auto *I = dyn_cast<Instruction>(ZExt)) { + if (isa<Argument>(V)) + I->moveBefore(InsertPt); + else + I->moveAfter(InsertPt); + NewInsts.insert(I); + } + ReplaceAllUsersOfWith(V, ZExt); - NewInsts.insert(ZExt); }; - // First, insert extending instructions between the leaves and their users. - LLVM_DEBUG(dbgs() << "ARM CGP: Promoting leaves:\n"); - for (auto V : Leaves) { + // Now, insert extending instructions between the sources and their users. + LLVM_DEBUG(dbgs() << "ARM CGP: Promoting sources:\n"); + for (auto V : *Sources) { LLVM_DEBUG(dbgs() << " - " << *V << "\n"); - if (auto *ZExt = dyn_cast<ZExtInst>(V)) - ZExt->mutateType(ExtTy); - else if (auto *I = dyn_cast<Instruction>(V)) + if (auto *I = dyn_cast<Instruction>(V)) InsertZExt(I, I); else if (auto *Arg = dyn_cast<Argument>(V)) { BasicBlock &BB = Arg->getParent()->front(); InsertZExt(Arg, &*BB.getFirstInsertionPt()); } else { - llvm_unreachable("unhandled leaf that needs extending"); + llvm_unreachable("unhandled source that needs extending"); } Promoted.insert(V); } +} +void IRPromoter::PromoteTree() { LLVM_DEBUG(dbgs() << "ARM CGP: Mutating the tree..\n"); - // Then mutate the types of the instructions within the tree. Here we handle - // constant operands. 
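The PrepareConstants rewrite in this hunk, replacing a sub with a negative immediate by an add with its absolute value (and vice versa), is an exact two's-complement identity. A small self-check, assumed for illustration only:

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint32_t vals[] = {0u, 1u, 200u, 255u, 0xFFFFFFFFu};
      for (uint32_t x : vals) {
        assert(x - static_cast<uint32_t>(-3) == x + 3u); // sub %x, -3 == add %x, 3
        assert(x + static_cast<uint32_t>(-3) == x - 3u); // add %x, -3 == sub %x, 3
      }
      return 0;
    }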
- for (auto *V : Visited) { - if (Leaves.count(V)) - continue; - if (!isa<Instruction>(V)) + IRBuilder<> Builder{Ctx}; + + // Mutate the types of the instructions within the tree. Here we handle + // constant operands. + for (auto *V : *Visited) { + if (Sources->count(V)) continue; auto *I = cast<Instruction>(V); - if (Roots.count(I)) + if (Sinks->count(I)) continue; - for (auto &U : I->operands()) { - if ((U->getType() == ExtTy) || !isSupportedType(&*U)) + for (unsigned i = 0, e = I->getNumOperands(); i < e; ++i) { + Value *Op = I->getOperand(i); + if ((Op->getType() == ExtTy) || !isa<IntegerType>(Op->getType())) continue; - if (auto *Const = dyn_cast<ConstantInt>(&*U)) - FixConst(Const, I); - else if (isa<UndefValue>(&*U)) - U->mutateType(ExtTy); + if (auto *Const = dyn_cast<ConstantInt>(Op)) { + Constant *NewConst = ConstantExpr::getZExt(Const, ExtTy); + I->setOperand(i, NewConst); + } else if (isa<UndefValue>(Op)) + I->setOperand(i, UndefValue::get(ExtTy)); } if (shouldPromote(I)) { @@ -396,91 +593,215 @@ void IRPromoter::Mutate(Type *OrigTy, } } - // Now we need to remove any zexts that have become unnecessary, as well - // as insert any intrinsics. - for (auto *V : Visited) { - if (Leaves.count(V)) + // Finally, any instructions that should be promoted but haven't yet been, + // need to be handled using intrinsics. + for (auto *V : *Visited) { + auto *I = dyn_cast<Instruction>(V); + if (!I) continue; - if (auto *ZExt = dyn_cast<ZExtInst>(V)) { - if (ZExt->getDestTy() != ExtTy) { - ZExt->mutateType(ExtTy); - Promoted.insert(ZExt); - } - else if (ZExt->getSrcTy() == ExtTy) { - ReplaceAllUsersOfWith(V, ZExt->getOperand(0)); - InstsToRemove.push_back(ZExt); - } + + if (Sources->count(I) || Sinks->count(I)) continue; - } - if (!shouldPromote(V) || isPromotedResultSafe(V)) + if (!shouldPromote(I) || SafeToPromote->count(I) || NewInsts.count(I)) continue; + + assert(EnableDSP && "DSP intrinisc insertion not enabled!"); // Replace unsafe instructions with appropriate intrinsic calls. - InsertDSPIntrinsic(cast<Instruction>(V)); + LLVM_DEBUG(dbgs() << "ARM CGP: Inserting DSP intrinsic for " + << *I << "\n"); + Function *DSPInst = + Intrinsic::getDeclaration(M, getNarrowIntrinsic(I)); + Builder.SetInsertPoint(I); + Builder.SetCurrentDebugLocation(I->getDebugLoc()); + Value *Args[] = { I->getOperand(0), I->getOperand(1) }; + CallInst *Call = Builder.CreateCall(DSPInst, Args); + NewInsts.insert(Call); + ReplaceAllUsersOfWith(I, Call); } +} + +void IRPromoter::TruncateSinks() { + LLVM_DEBUG(dbgs() << "ARM CGP: Fixing up the sinks:\n"); + + IRBuilder<> Builder{Ctx}; + + auto InsertTrunc = [&](Value *V, Type *TruncTy) -> Instruction* { + if (!isa<Instruction>(V) || !isa<IntegerType>(V->getType())) + return nullptr; + + if ((!Promoted.count(V) && !NewInsts.count(V)) || Sources->count(V)) + return nullptr; + + LLVM_DEBUG(dbgs() << "ARM CGP: Creating " << *TruncTy << " Trunc for " + << *V << "\n"); + Builder.SetInsertPoint(cast<Instruction>(V)); + auto *Trunc = dyn_cast<Instruction>(Builder.CreateTrunc(V, TruncTy)); + if (Trunc) + NewInsts.insert(Trunc); + return Trunc; + }; - LLVM_DEBUG(dbgs() << "ARM CGP: Fixing up the roots:\n"); // Fix up any stores or returns that use the results of the promoted // chain. 
- for (auto I : Roots) { - LLVM_DEBUG(dbgs() << " - " << *I << "\n"); - Type *TruncTy = OrigTy; - if (auto *Store = dyn_cast<StoreInst>(I)) { - auto *PtrTy = cast<PointerType>(Store->getPointerOperandType()); - TruncTy = PtrTy->getElementType(); - } else if (isa<ReturnInst>(I)) { - Function *F = I->getParent()->getParent(); - TruncTy = F->getFunctionType()->getReturnType(); + for (auto I : *Sinks) { + LLVM_DEBUG(dbgs() << "ARM CGP: For Sink: " << *I << "\n"); + + // Handle calls separately as we need to iterate over arg operands. + if (auto *Call = dyn_cast<CallInst>(I)) { + for (unsigned i = 0; i < Call->getNumArgOperands(); ++i) { + Value *Arg = Call->getArgOperand(i); + Type *Ty = TruncTysMap[Call][i]; + if (Instruction *Trunc = InsertTrunc(Arg, Ty)) { + Trunc->moveBefore(Call); + Call->setArgOperand(i, Trunc); + } + } + continue; } + // Special case switches because we need to truncate the condition. + if (auto *Switch = dyn_cast<SwitchInst>(I)) { + Type *Ty = TruncTysMap[Switch][0]; + if (Instruction *Trunc = InsertTrunc(Switch->getCondition(), Ty)) { + Trunc->moveBefore(Switch); + Switch->setCondition(Trunc); + } + continue; + } + + // Now handle the others. for (unsigned i = 0; i < I->getNumOperands(); ++i) { - Value *V = I->getOperand(i); - if (Promoted.count(V) || NewInsts.count(V)) { - if (auto *Op = dyn_cast<Instruction>(V)) { - - if (auto *Call = dyn_cast<CallInst>(I)) - TruncTy = Call->getFunctionType()->getParamType(i); - - if (TruncTy == ExtTy) - continue; - - LLVM_DEBUG(dbgs() << "ARM CGP: Creating " << *TruncTy - << " Trunc for " << *Op << "\n"); - Builder.SetInsertPoint(Op); - auto *Trunc = cast<Instruction>(Builder.CreateTrunc(Op, TruncTy)); - Trunc->moveBefore(I); - I->setOperand(i, Trunc); - NewInsts.insert(Trunc); - } + Type *Ty = TruncTysMap[I][i]; + if (Instruction *Trunc = InsertTrunc(I->getOperand(i), Ty)) { + Trunc->moveBefore(I); + I->setOperand(i, Trunc); } } } - LLVM_DEBUG(dbgs() << "ARM CGP: Mutation complete.\n"); } -bool ARMCodeGenPrepare::isNarrowInstSupported(Instruction *I) { - if (!ST->hasDSP() || !EnableDSP || !isSupportedType(I)) - return false; +void IRPromoter::Cleanup() { + // Some zexts will now have become redundant, along with their trunc + // operands, so remove them + for (auto V : *Visited) { + if (!isa<CastInst>(V)) + continue; - if (ST->isThumb() && !ST->hasThumb2()) - return false; + auto ZExt = cast<CastInst>(V); + if (ZExt->getDestTy() != ExtTy) + continue; - if (I->getOpcode() != Instruction::Add && I->getOpcode() != Instruction::Sub) - return false; + Value *Src = ZExt->getOperand(0); + if (ZExt->getSrcTy() == ZExt->getDestTy()) { + LLVM_DEBUG(dbgs() << "ARM CGP: Removing unnecessary cast: " << *ZExt + << "\n"); + ReplaceAllUsersOfWith(ZExt, Src); + continue; + } - // TODO - // Would it be profitable? For Thumb code, these parallel DSP instructions - // are only Thumb-2, so we wouldn't be able to dual issue on Cortex-M33. For - // Cortex-A, specifically Cortex-A72, the latency is double and throughput is - // halved. They also do not take immediates as operands. - for (auto &Op : I->operands()) { - if (isa<Constant>(Op)) { - if (!EnableDSPWithImms) - return false; + // For any truncs that we insert to handle zexts, we can replace the + // result of the zext with the input to the trunc. 
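A hedged example of the call-argument sinks TruncateSinks handles: once the arithmetic has been promoted to 32 bits, the call boundary still expects the original narrow type, so a trunc is inserted in front of the call. 'consume' is a stand-in callee invented for this sketch, not something from the patch:

    #include <cstdint>

    static void consume(uint8_t /*byte*/) {} // any callee taking a narrow argument

    void feed(uint8_t c) {
      uint8_t off = c - 49; // arithmetic the pass would promote to 32 bits
      consume(off);         // sink: a trunc back to i8 is inserted before the call
    }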
+ if (NewInsts.count(Src) && isa<ZExtInst>(V) && isa<TruncInst>(Src)) { + auto *Trunc = cast<TruncInst>(Src); + assert(Trunc->getOperand(0)->getType() == ExtTy && + "expected inserted trunc to be operating on i32"); + ReplaceAllUsersOfWith(ZExt, Trunc->getOperand(0)); } } - return true; + + for (auto *I : InstsToRemove) { + LLVM_DEBUG(dbgs() << "ARM CGP: Removing " << *I << "\n"); + I->dropAllReferences(); + I->eraseFromParent(); + } + + InstsToRemove.clear(); + NewInsts.clear(); + TruncTysMap.clear(); + Promoted.clear(); +} + +void IRPromoter::ConvertTruncs() { + IRBuilder<> Builder{Ctx}; + + for (auto *V : *Visited) { + if (!isa<TruncInst>(V) || Sources->count(V)) + continue; + + auto *Trunc = cast<TruncInst>(V); + assert(LessThanTypeSize(Trunc) && "expected narrow trunc"); + + Builder.SetInsertPoint(Trunc); + unsigned NumBits = + cast<IntegerType>(Trunc->getType())->getScalarSizeInBits(); + ConstantInt *Mask = ConstantInt::get(Ctx, APInt::getMaxValue(NumBits)); + Value *Masked = Builder.CreateAnd(Trunc->getOperand(0), Mask); + + if (auto *I = dyn_cast<Instruction>(Masked)) + NewInsts.insert(I); + + ReplaceAllUsersOfWith(Trunc, Masked); + } +} + +void IRPromoter::Mutate(Type *OrigTy, + SmallPtrSetImpl<Value*> &Visited, + SmallPtrSetImpl<Value*> &Sources, + SmallPtrSetImpl<Instruction*> &Sinks, + SmallPtrSetImpl<Instruction*> &SafeToPromote) { + LLVM_DEBUG(dbgs() << "ARM CGP: Promoting use-def chains to from " + << ARMCodeGenPrepare::TypeSize << " to 32-bits\n"); + + assert(isa<IntegerType>(OrigTy) && "expected integer type"); + this->OrigTy = cast<IntegerType>(OrigTy); + assert(OrigTy->getPrimitiveSizeInBits() < ExtTy->getPrimitiveSizeInBits() && + "original type not smaller than extended type"); + + this->Visited = &Visited; + this->Sources = &Sources; + this->Sinks = &Sinks; + this->SafeToPromote = &SafeToPromote; + + // Cache original types of the values that will likely need truncating + for (auto *I : Sinks) { + if (auto *Call = dyn_cast<CallInst>(I)) { + for (unsigned i = 0; i < Call->getNumArgOperands(); ++i) { + Value *Arg = Call->getArgOperand(i); + TruncTysMap[Call].push_back(Arg->getType()); + } + } else if (auto *Switch = dyn_cast<SwitchInst>(I)) + TruncTysMap[I].push_back(Switch->getCondition()->getType()); + else { + for (unsigned i = 0; i < I->getNumOperands(); ++i) + TruncTysMap[I].push_back(I->getOperand(i)->getType()); + } + } + + // Convert adds and subs using negative immediates to equivalent instructions + // that use positive constants. + PrepareConstants(); + + // Insert zext instructions between sources and their users. + ExtendSources(); + + // Convert any truncs, that aren't sources, into AND masks. + ConvertTruncs(); + + // Promote visited instructions, mutating their types in place. Also insert + // DSP intrinsics, if enabled, for adds and subs which would be unsafe to + // promote. + PromoteTree(); + + // Insert trunc instructions for use by calls, stores etc... + TruncateSinks(); + + // Finally, remove unecessary zexts and truncs, delete old instructions and + // clear the data structures. + Cleanup(); + + LLVM_DEBUG(dbgs() << "ARM CGP: Mutation complete\n"); } /// We accept most instructions, as well as Arguments and ConstantInsts. We @@ -488,102 +809,133 @@ bool ARMCodeGenPrepare::isNarrowInstSupported(Instruction *I) { /// return value is zeroext. We don't allow opcodes that can introduce sign /// bits. 
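A standalone C++ analogue of the ConvertTruncs step above (illustrative only): a trunc that is not a source can be rewritten as an AND with APInt::getMaxValue(NumBits), because truncation and masking agree while the value lives in a 32-bit register.

#include <cassert>
#include <cstdint>

int main() {
  uint32_t x = 0xABCD1234;

  // Truncating to 16 bits and zero-extending again is the same as masking
  // with 2^16 - 1, so the trunc can become an AND and stay in 32-bit form.
  uint16_t truncated = static_cast<uint16_t>(x);
  uint32_t masked = x & ((1u << 16) - 1u);     // the APInt::getMaxValue(16) mask
  assert(masked == truncated);
  assert(masked == 0x1234u);
  return 0;
}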
bool ARMCodeGenPrepare::isSupportedValue(Value *V) { - LLVM_DEBUG(dbgs() << "ARM CGP: Is " << *V << " supported?\n"); - - // Non-instruction values that we can handle. - if (isa<ConstantInt>(V) || isa<Argument>(V)) - return true; + if (auto *I = dyn_cast<ICmpInst>(V)) { + // Now that we allow small types than TypeSize, only allow icmp of + // TypeSize because they will require a trunc to be legalised. + // TODO: Allow icmp of smaller types, and calculate at the end + // whether the transform would be beneficial. + if (isa<PointerType>(I->getOperand(0)->getType())) + return true; + return EqualTypeSize(I->getOperand(0)); + } // Memory instructions - if (isa<StoreInst>(V) || isa<LoadInst>(V) || isa<GetElementPtrInst>(V)) + if (isa<StoreInst>(V) || isa<GetElementPtrInst>(V)) return true; // Branches and targets. - if (auto *ICmp = dyn_cast<ICmpInst>(V)) - return ICmp->isEquality() || !ICmp->isSigned(); - if( isa<BranchInst>(V) || isa<SwitchInst>(V) || isa<BasicBlock>(V)) return true; - if (isa<PHINode>(V) || isa<SelectInst>(V) || isa<ReturnInst>(V)) - return true; + // Non-instruction values that we can handle. + if ((isa<Constant>(V) && !isa<ConstantExpr>(V)) || isa<Argument>(V)) + return isSupportedType(V); + + if (isa<PHINode>(V) || isa<SelectInst>(V) || isa<ReturnInst>(V) || + isa<LoadInst>(V)) + return isSupportedType(V); + + if (isa<SExtInst>(V)) + return false; + + if (auto *Cast = dyn_cast<CastInst>(V)) + return isSupportedType(Cast) || isSupportedType(Cast->getOperand(0)); // Special cases for calls as we need to check for zeroext // TODO We should accept calls even if they don't have zeroext, as they can - // still be roots. + // still be sinks. if (auto *Call = dyn_cast<CallInst>(V)) - return Call->hasRetAttr(Attribute::AttrKind::ZExt); - else if (auto *Cast = dyn_cast<CastInst>(V)) { - if (isa<ZExtInst>(Cast)) - return Cast->getDestTy()->getScalarSizeInBits() <= 32; - else if (auto *Trunc = dyn_cast<TruncInst>(V)) - return Trunc->getDestTy()->getScalarSizeInBits() <= TypeSize; - else { - LLVM_DEBUG(dbgs() << "ARM CGP: No, unsupported cast.\n"); - return false; - } - } else if (!isa<BinaryOperator>(V)) { - LLVM_DEBUG(dbgs() << "ARM CGP: No, not a binary operator.\n"); + return isSupportedType(Call) && + Call->hasRetAttr(Attribute::AttrKind::ZExt); + + if (!isa<BinaryOperator>(V)) + return false; + + if (!isSupportedType(V)) return false; - } - bool res = !isSigned(V); - if (!res) - LLVM_DEBUG(dbgs() << "ARM CGP: No, it's a signed instruction.\n"); - return res; + if (generateSignBits(V)) { + LLVM_DEBUG(dbgs() << "ARM CGP: No, instruction can generate sign bits.\n"); + return false; + } + return true; } /// Check that the type of V would be promoted and that the original type is /// smaller than the targeted promoted type. Check that we're not trying to /// promote something larger than our base 'TypeSize' type. 
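The generateSignBits() rejection above can be illustrated with plain C++ (a sketch, independent of the pass): once a narrow value has been zero-extended, its old sign bit is just an ordinary data bit, so a sign-sensitive operation performed at the wider width diverges from the narrow result.

#include <cassert>
#include <cstdint>

int main() {
  int8_t narrow = -128;                                   // bit pattern 0x80
  // 8-bit arithmetic shift keeps the sign: result is -64 (0xC0).
  // (Right-shifting a negative value is arithmetic on the usual targets and
  // guaranteed so since C++20.)
  int8_t narrowAshr = static_cast<int8_t>(narrow >> 1);

  uint32_t widened = static_cast<uint8_t>(narrow);        // zext -> 0x00000080
  uint32_t wideShift = widened >> 1;                      // 0x40: zeros shift in

  assert(static_cast<uint8_t>(wideShift) !=
         static_cast<uint8_t>(narrowAshr));               // 0x40 vs 0xC0
  return 0;
}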
bool ARMCodeGenPrepare::isLegalToPromote(Value *V) { - if (!isSupportedType(V)) - return false; - unsigned VSize = 0; - if (auto *Ld = dyn_cast<LoadInst>(V)) { - auto *PtrTy = cast<PointerType>(Ld->getPointerOperandType()); - VSize = PtrTy->getElementType()->getPrimitiveSizeInBits(); - } else if (auto *ZExt = dyn_cast<ZExtInst>(V)) { - VSize = ZExt->getOperand(0)->getType()->getPrimitiveSizeInBits(); - } else { - VSize = V->getType()->getPrimitiveSizeInBits(); + auto *I = dyn_cast<Instruction>(V); + if (!I) + return true; + + if (SafeToPromote.count(I)) + return true; + + if (isPromotedResultSafe(V) || isSafeOverflow(I)) { + SafeToPromote.insert(I); + return true; } - if (VSize > TypeSize) + if (I->getOpcode() != Instruction::Add && I->getOpcode() != Instruction::Sub) return false; - if (isPromotedResultSafe(V)) - return true; + // If promotion is not safe, can we use a DSP instruction to natively + // handle the narrow type? + if (!ST->hasDSP() || !EnableDSP || !isSupportedType(I)) + return false; - if (auto *I = dyn_cast<Instruction>(V)) - return isNarrowInstSupported(I); + if (ST->isThumb() && !ST->hasThumb2()) + return false; - return false; + // TODO + // Would it be profitable? For Thumb code, these parallel DSP instructions + // are only Thumb-2, so we wouldn't be able to dual issue on Cortex-M33. For + // Cortex-A, specifically Cortex-A72, the latency is double and throughput is + // halved. They also do not take immediates as operands. + for (auto &Op : I->operands()) { + if (isa<Constant>(Op)) { + if (!EnableDSPWithImms) + return false; + } + } + LLVM_DEBUG(dbgs() << "ARM CGP: Will use an intrinsic for: " << *I << "\n"); + return true; } bool ARMCodeGenPrepare::TryToPromote(Value *V) { OrigTy = V->getType(); TypeSize = OrigTy->getPrimitiveSizeInBits(); + if (TypeSize > 16 || TypeSize < 8) + return false; + + SafeToPromote.clear(); if (!isSupportedValue(V) || !shouldPromote(V) || !isLegalToPromote(V)) return false; - LLVM_DEBUG(dbgs() << "ARM CGP: TryToPromote: " << *V << "\n"); + LLVM_DEBUG(dbgs() << "ARM CGP: TryToPromote: " << *V << ", TypeSize = " + << TypeSize << "\n"); SetVector<Value*> WorkList; - SmallPtrSet<Value*, 8> Leaves; - SmallPtrSet<Instruction*, 4> Roots; - WorkList.insert(V); + SmallPtrSet<Value*, 8> Sources; + SmallPtrSet<Instruction*, 4> Sinks; SmallPtrSet<Value*, 16> CurrentVisited; - CurrentVisited.clear(); + WorkList.insert(V); - // Return true if the given value can, or has been, visited. Add V to the - // worklist if needed. + // Return true if V was added to the worklist as a supported instruction, + // if it was already visited, or if we don't need to explore it (e.g. + // pointer values and GEPs), and false otherwise. auto AddLegalInst = [&](Value *V) { if (CurrentVisited.count(V)) return true; + // Ignore GEPs because they don't need promoting and the constant indices + // will prevent the transformation. + if (isa<GetElementPtrInst>(V)) + return true; + if (!isSupportedValue(V) || (shouldPromote(V) && !isLegalToPromote(V))) { LLVM_DEBUG(dbgs() << "ARM CGP: Can't handle: " << *V << "\n"); return false; @@ -600,6 +952,7 @@ bool ARMCodeGenPrepare::TryToPromote(Value *V) { if (CurrentVisited.count(V)) continue; + // Ignore non-instructions, other than arguments. if (!isa<Instruction>(V) && !isSource(V)) continue; @@ -607,24 +960,26 @@ bool ARMCodeGenPrepare::TryToPromote(Value *V) { // the tree has already been explored. // TODO: This could limit the transform, ie if we try to promote something // from an i8 and fail first, before trying an i16. 
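As a side note on why zero-extension is the extension used throughout this pass: equality and unsigned comparisons are unaffected by widening with zeros, whereas signed comparisons are not. A small standalone check (plain C++, illustrative only):

#include <cassert>
#include <cstdint>

int main() {
  uint8_t a = 0x80, b = 0x01;

  // Unsigned and equality compares give the same answer before and after a
  // zero-extension to 32 bits...
  assert((a < b) == (static_cast<uint32_t>(a) < static_cast<uint32_t>(b)));
  assert((a == b) == (static_cast<uint32_t>(a) == static_cast<uint32_t>(b)));

  // ...but a signed 8-bit compare does not survive it: -128 < 1 as i8, while
  // the zero-extended values compare as 128 < 1, which is false.
  bool narrowSigned = static_cast<int8_t>(a) < static_cast<int8_t>(b);      // true
  bool wideUnsigned = static_cast<uint32_t>(a) < static_cast<uint32_t>(b);  // false
  assert(narrowSigned != wideUnsigned);
  return 0;
}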
- if (AllVisited.count(V)) { - LLVM_DEBUG(dbgs() << "ARM CGP: Already visited this: " << *V << "\n"); + if (AllVisited.count(V)) return false; - } CurrentVisited.insert(V); AllVisited.insert(V); // Calls can be both sources and sinks. if (isSink(V)) - Roots.insert(cast<Instruction>(V)); + Sinks.insert(cast<Instruction>(V)); + if (isSource(V)) - Leaves.insert(V); - else if (auto *I = dyn_cast<Instruction>(V)) { - // Visit operands of any instruction visited. - for (auto &U : I->operands()) { - if (!AddLegalInst(U)) - return false; + Sources.insert(V); + + if (!isSink(V) && !isSource(V)) { + if (auto *I = dyn_cast<Instruction>(V)) { + // Visit operands of any instruction visited. + for (auto &U : I->operands()) { + if (!AddLegalInst(U)) + return false; + } } } @@ -638,43 +993,23 @@ bool ARMCodeGenPrepare::TryToPromote(Value *V) { } } - unsigned NumToPromote = 0; - unsigned Cost = 0; + LLVM_DEBUG(dbgs() << "ARM CGP: Visited nodes:\n"; + for (auto *I : CurrentVisited) + I->dump(); + ); + unsigned ToPromote = 0; for (auto *V : CurrentVisited) { - // Truncs will cause a uxt and no zeroext arguments will often require - // a uxt somewhere. - if (isa<TruncInst>(V)) - ++Cost; - else if (auto *Arg = dyn_cast<Argument>(V)) { - if (!Arg->hasZExtAttr()) - ++Cost; - } - - // Mem ops can automatically be extended/truncated and non-instructions - // don't need anything done. - if (Leaves.count(V) || isa<StoreInst>(V) || !isa<Instruction>(V)) + if (Sources.count(V)) continue; - - // Will need to truncate calls args and returns. - if (Roots.count(cast<Instruction>(V))) { - ++Cost; + if (Sinks.count(cast<Instruction>(V))) continue; - } - - if (shouldPromote(V)) - ++NumToPromote; + ++ToPromote; } - LLVM_DEBUG(dbgs() << "ARM CGP: Visited nodes:\n"; - for (auto *I : CurrentVisited) - I->dump(); - ); - LLVM_DEBUG(dbgs() << "ARM CGP: Cost of promoting " << NumToPromote - << " instructions = " << Cost << "\n"); - if (Cost > NumToPromote || (NumToPromote == 0)) + if (ToPromote < 2) return false; - Promoter->Mutate(OrigTy, CurrentVisited, Leaves, Roots); + Promoter->Mutate(OrigTy, CurrentVisited, Sources, Sinks, SafeToPromote); return true; } @@ -711,19 +1046,15 @@ bool ARMCodeGenPrepare::runOnFunction(Function &F) { continue; LLVM_DEBUG(dbgs() << "ARM CGP: Searching from: " << CI << "\n"); + for (auto &Op : CI.operands()) { - if (auto *I = dyn_cast<Instruction>(Op)) { - if (isa<ZExtInst>(I)) - MadeChange |= TryToPromote(I->getOperand(0)); - else - MadeChange |= TryToPromote(I); - } + if (auto *I = dyn_cast<Instruction>(Op)) + MadeChange |= TryToPromote(I); } } } - Promoter->Cleanup(); LLVM_DEBUG(if (verifyFunction(F, &dbgs())) { - dbgs(); + dbgs() << F; report_fatal_error("Broken function after type promotion"); }); } @@ -744,6 +1075,7 @@ INITIALIZE_PASS_END(ARMCodeGenPrepare, DEBUG_TYPE, "ARM IR optimizations", false, false) char ARMCodeGenPrepare::ID = 0; +unsigned ARMCodeGenPrepare::TypeSize = 0; FunctionPass *llvm::createARMCodeGenPreparePass() { return new ARMCodeGenPrepare(); diff --git a/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp b/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp index 2c4738d3cb74..5e97c4cb35e3 100644 --- a/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp @@ -1420,6 +1420,22 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex, MI = LastIT; } + // Avoid splitting a MOVW+MOVT pair with a relocation on Windows. 
+ // On Windows, this instruction pair is covered by one single + // IMAGE_REL_ARM_MOV32T relocation which covers both instructions. If a + // constant island is injected inbetween them, the relocation will clobber + // the instruction and fail to update the MOVT instruction. + // (These instructions are bundled up until right before the ConstantIslands + // pass.) + if (STI->isTargetWindows() && isThumb && MI->getOpcode() == ARM::t2MOVTi16 && + (MI->getOperand(2).getTargetFlags() & ARMII::MO_OPTION_MASK) == + ARMII::MO_HI16) { + --MI; + assert(MI->getOpcode() == ARM::t2MOVi16 && + (MI->getOperand(1).getTargetFlags() & ARMII::MO_OPTION_MASK) == + ARMII::MO_LO16); + } + // We really must not split an IT block. LLVM_DEBUG(unsigned PredReg; assert( !isThumb || getITInstrPredicate(*MI, PredReg) == ARMCC::AL)); diff --git a/contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp index 5dac6ec0b799..eecd0a10dc7d 100644 --- a/contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -570,7 +570,7 @@ void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI) { TransferImpOps(MI, MIB, MIB); // Transfer memoperands. - MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + MIB.cloneMemRefs(MI); MI.eraseFromParent(); } @@ -645,7 +645,7 @@ void ARMExpandPseudo::ExpandVST(MachineBasicBlock::iterator &MBBI) { TransferImpOps(MI, MIB, MIB); // Transfer memoperands. - MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + MIB.cloneMemRefs(MI); MI.eraseFromParent(); } @@ -735,7 +735,7 @@ void ARMExpandPseudo::ExpandLaneOp(MachineBasicBlock::iterator &MBBI) { MIB.addReg(DstReg, RegState::ImplicitDefine | getDeadRegState(DstIsDead)); TransferImpOps(MI, MIB, MIB); // Transfer memoperands. - MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + MIB.cloneMemRefs(MI); MI.eraseFromParent(); } @@ -848,8 +848,8 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB, unsigned SOImmValV2 = ARM_AM::getSOImmTwoPartSecond(ImmVal); LO16 = LO16.addImm(SOImmValV1); HI16 = HI16.addImm(SOImmValV2); - LO16->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); - HI16->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + LO16.cloneMemRefs(MI); + HI16.cloneMemRefs(MI); LO16.addImm(Pred).addReg(PredReg).add(condCodeOp()); HI16.addImm(Pred).addReg(PredReg).add(condCodeOp()); if (isCC) @@ -899,8 +899,8 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB, } } - LO16->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); - HI16->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + LO16.cloneMemRefs(MI); + HI16.cloneMemRefs(MI); LO16.addImm(Pred).addReg(PredReg); HI16.addImm(Pred).addReg(PredReg); @@ -1030,10 +1030,10 @@ static void addExclusiveRegPair(MachineInstrBuilder &MIB, MachineOperand &Reg, if (IsThumb) { unsigned RegLo = TRI->getSubReg(Reg.getReg(), ARM::gsub_0); unsigned RegHi = TRI->getSubReg(Reg.getReg(), ARM::gsub_1); - MIB.addReg(RegLo, Flags | getKillRegState(Reg.isDead())); - MIB.addReg(RegHi, Flags | getKillRegState(Reg.isDead())); + MIB.addReg(RegLo, Flags); + MIB.addReg(RegHi, Flags); } else - MIB.addReg(Reg.getReg(), Flags | getKillRegState(Reg.isDead())); + MIB.addReg(Reg.getReg(), Flags); } /// Expand a 64-bit CMP_SWAP to an ldrexd/strexd loop. @@ -1103,7 +1103,8 @@ bool ARMExpandPseudo::ExpandCMP_SWAP_64(MachineBasicBlock &MBB, // bne .Lloadcmp unsigned STREXD = IsThumb ? 
ARM::t2STREXD : ARM::STREXD; MIB = BuildMI(StoreBB, DL, TII->get(STREXD), TempReg); - addExclusiveRegPair(MIB, New, 0, IsThumb, TRI); + unsigned Flags = getKillRegState(New.isDead()); + addExclusiveRegPair(MIB, New, Flags, IsThumb, TRI); MIB.addReg(AddrReg).add(predOps(ARMCC::AL)); unsigned CMPri = IsThumb ? ARM::t2CMPri : ARM::CMPri; @@ -1425,7 +1426,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, MIB.addExternalSymbol("__aeabi_read_tp", 0); } - MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + MIB.cloneMemRefs(MI); TransferImpOps(MI, MIB, MIB); MI.eraseFromParent(); return true; @@ -1440,7 +1441,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewLdOpc), DstReg) .add(MI.getOperand(1)) .add(predOps(ARMCC::AL)); - MIB1->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + MIB1.cloneMemRefs(MI); MachineInstrBuilder MIB2 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::tPICADD)) .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) @@ -1544,7 +1545,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, if (isARM) { MIB3.add(predOps(ARMCC::AL)); if (Opcode == ARM::MOV_ga_pcrel_ldr) - MIB3->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + MIB3.cloneMemRefs(MI); } TransferImpOps(MI, MIB1, MIB3); MI.eraseFromParent(); @@ -1596,7 +1597,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, // Add an implicit def for the super-register. MIB.addReg(DstReg, RegState::ImplicitDefine | getDeadRegState(DstIsDead)); TransferImpOps(MI, MIB, MIB); - MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + MIB.cloneMemRefs(MI); MI.eraseFromParent(); return true; } @@ -1629,7 +1630,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, MIB->addRegisterKilled(SrcReg, TRI, true); TransferImpOps(MI, MIB, MIB); - MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + MIB.cloneMemRefs(MI); MI.eraseFromParent(); return true; } diff --git a/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp b/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp index a66cd7053c0a..a50abfdbee44 100644 --- a/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp @@ -2951,7 +2951,8 @@ bool ARMFastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, unsigned ResultReg = MI->getOperand(0).getReg(); if (!ARMEmitLoad(VT, ResultReg, Addr, LI->getAlignment(), isZExt, false)) return false; - MI->eraseFromParent(); + MachineBasicBlock::iterator I(MI); + removeDeadCode(I, std::next(I)); return true; } @@ -2970,12 +2971,16 @@ unsigned ARMFastISel::ARMLowerPICELF(const GlobalValue *GV, unsigned ConstAlign = MF->getDataLayout().getPrefTypeAlignment(Type::getInt32PtrTy(*Context)); unsigned Idx = MF->getConstantPool()->getConstantPoolIndex(CPV, ConstAlign); + MachineMemOperand *CPMMO = + MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF), + MachineMemOperand::MOLoad, 4, 4); unsigned TempReg = MF->getRegInfo().createVirtualRegister(&ARM::rGPRRegClass); unsigned Opc = isThumb2 ? 
ARM::t2LDRpci : ARM::LDRcp; MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), TempReg) - .addConstantPoolIndex(Idx); + .addConstantPoolIndex(Idx) + .addMemOperand(CPMMO); if (Opc == ARM::LDRcp) MIB.addImm(0); MIB.add(predOps(ARMCC::AL)); @@ -2988,6 +2993,7 @@ unsigned ARMFastISel::ARMLowerPICELF(const GlobalValue *GV, MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg) .addReg(TempReg) .addImm(ARMPCLabelIndex); + if (!Subtarget->isThumb()) MIB.add(predOps(ARMCC::AL)); diff --git a/contrib/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/contrib/llvm/lib/Target/ARM/ARMFrameLowering.cpp index 56ad7a0f0446..a9d87ced31f3 100644 --- a/contrib/llvm/lib/Target/ARM/ARMFrameLowering.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMFrameLowering.cpp @@ -79,12 +79,11 @@ ARMFrameLowering::ARMFrameLowering(const ARMSubtarget &sti) : TargetFrameLowering(StackGrowsDown, sti.getStackAlignment(), 0, 4), STI(sti) {} -bool ARMFrameLowering::noFramePointerElim(const MachineFunction &MF) const { +bool ARMFrameLowering::keepFramePointer(const MachineFunction &MF) const { // iOS always has a FP for backtracking, force other targets to keep their FP // when doing FastISel. The emitted code is currently superior, and in cases // like test-suite's lencod FastISel isn't quite correct when FP is eliminated. - return TargetFrameLowering::noFramePointerElim(MF) || - MF.getSubtarget<ARMSubtarget>().useFastISel(); + return MF.getSubtarget<ARMSubtarget>().useFastISel(); } /// Returns true if the target can safely skip saving callee-saved registers @@ -526,6 +525,8 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, .setMIFlags(MachineInstr::FrameSetup); switch (TM.getCodeModel()) { + case CodeModel::Tiny: + llvm_unreachable("Tiny code model not available on ARM."); case CodeModel::Small: case CodeModel::Medium: case CodeModel::Kernel: @@ -909,6 +910,7 @@ ARMFrameLowering::ResolveFrameIndexReference(const MachineFunction &MF, assert(RegInfo->hasBasePointer(MF) && "VLAs and dynamic stack alignment, but missing base pointer!"); FrameReg = RegInfo->getBaseRegister(); + Offset -= SPAdj; } return Offset; } @@ -1006,8 +1008,7 @@ void ARMFrameLowering::emitPushInst(MachineBasicBlock &MBB, if (Regs.empty()) continue; - llvm::sort(Regs.begin(), Regs.end(), [&](const RegAndKill &LHS, - const RegAndKill &RHS) { + llvm::sort(Regs, [&](const RegAndKill &LHS, const RegAndKill &RHS) { return TRI.getEncodingValue(LHS.first) < TRI.getEncodingValue(RHS.first); }); @@ -1103,7 +1104,7 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB, if (Regs.empty()) continue; - llvm::sort(Regs.begin(), Regs.end(), [&](unsigned LHS, unsigned RHS) { + llvm::sort(Regs, [&](unsigned LHS, unsigned RHS) { return TRI.getEncodingValue(LHS) < TRI.getEncodingValue(RHS); }); @@ -1921,9 +1922,13 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF, << "\n"); } + // Avoid spilling LR in Thumb1 if there's a tail call: it's expensive to + // restore LR in that case. + bool ExpensiveLRRestore = AFI->isThumb1OnlyFunction() && MFI.hasTailCall(); + // If LR is not spilled, but at least one of R4, R5, R6, and R7 is spilled. // Spill LR as well so we can fold BX_RET to the registers restore (LDM). 
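The llvm::sort calls above order the spilled registers by their hardware encoding before building the push/pop lists; LDM/STM transfer registers in ascending register-number order, so keeping the list sorted the same way keeps the emitted order predictable. A trivial standalone analogue (the struct and values here are made up for illustration):

#include <algorithm>
#include <cassert>
#include <vector>

int main() {
  // Stand-in for the (register, kill-flag) pairs, keyed by an encoding value
  // as TRI.getEncodingValue() would provide.
  struct RegAndKill { unsigned Enc; bool Kill; };
  std::vector<RegAndKill> Regs = {{7, true}, {4, false}, {5, true}};

  std::sort(Regs.begin(), Regs.end(),
            [](const RegAndKill &L, const RegAndKill &R) { return L.Enc < R.Enc; });

  assert(Regs.front().Enc == 4 && Regs.back().Enc == 7);
  return 0;
}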
- if (!LRSpilled && CS1Spilled) { + if (!LRSpilled && CS1Spilled && !ExpensiveLRRestore) { SavedRegs.set(ARM::LR); NumGPRSpills++; SmallVectorImpl<unsigned>::iterator LRPos; @@ -1949,7 +1954,8 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF, // Windows on ARM, accept R11 (frame pointer) if (!AFI->isThumbFunction() || (STI.isTargetWindows() && Reg == ARM::R11) || - isARMLowRegister(Reg) || Reg == ARM::LR) { + isARMLowRegister(Reg) || + (Reg == ARM::LR && !ExpensiveLRRestore)) { SavedRegs.set(Reg); LLVM_DEBUG(dbgs() << "Spilling " << printReg(Reg, TRI) << " to make up alignment\n"); @@ -2151,9 +2157,15 @@ void ARMFrameLowering::adjustForSegmentedStacks( // Do not generate a prologue for leaf functions with a stack of size zero. // For non-leaf functions we have to allow for the possibility that the - // call is to a non-split function, as in PR37807. - if (StackSize == 0 && !MFI.hasTailCall()) + // callis to a non-split function, as in PR37807. This function could also + // take the address of a non-split function. When the linker tries to adjust + // its non-existent prologue, it would fail with an error. Mark the object + // file so that such failures are not errors. See this Go language bug-report + // https://go-review.googlesource.com/c/go/+/148819/ + if (StackSize == 0 && !MFI.hasTailCall()) { + MF.getMMI().setHasNosplitStack(true); return; + } // Use R4 and R5 as scratch registers. // We save R4 and R5 before use and restore them before leaving the function. diff --git a/contrib/llvm/lib/Target/ARM/ARMFrameLowering.h b/contrib/llvm/lib/Target/ARM/ARMFrameLowering.h index e994cab28fe7..2f7e23840e75 100644 --- a/contrib/llvm/lib/Target/ARM/ARMFrameLowering.h +++ b/contrib/llvm/lib/Target/ARM/ARMFrameLowering.h @@ -42,7 +42,7 @@ public: std::vector<CalleeSavedInfo> &CSI, const TargetRegisterInfo *TRI) const override; - bool noFramePointerElim(const MachineFunction &MF) const override; + bool keepFramePointer(const MachineFunction &MF) const override; bool enableCalleeSaveSkip(const MachineFunction &MF) const override; diff --git a/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp index 9592dd53c347..8e0e82388251 100644 --- a/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -1345,9 +1345,8 @@ static inline SDValue getAL(SelectionDAG *CurDAG, const SDLoc &dl) { } void ARMDAGToDAGISel::transferMemOperands(SDNode *N, SDNode *Result) { - MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); - MemOp[0] = cast<MemSDNode>(N)->getMemOperand(); - cast<MachineSDNode>(Result)->setMemRefs(MemOp, MemOp + 1); + MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand(); + CurDAG->setNodeMemRefs(cast<MachineSDNode>(Result), {MemOp}); } bool ARMDAGToDAGISel::tryARMIndexedLoad(SDNode *N) { @@ -1764,12 +1763,14 @@ void ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs, default: llvm_unreachable("unhandled vld type"); // Double-register operations: case MVT::v8i8: OpcodeIndex = 0; break; + case MVT::v4f16: case MVT::v4i16: OpcodeIndex = 1; break; case MVT::v2f32: case MVT::v2i32: OpcodeIndex = 2; break; case MVT::v1i64: OpcodeIndex = 3; break; // Quad-register operations: case MVT::v16i8: OpcodeIndex = 0; break; + case MVT::v8f16: case MVT::v8i16: OpcodeIndex = 1; break; case MVT::v4f32: case MVT::v4i32: OpcodeIndex = 2; break; @@ -1854,9 +1855,8 @@ void ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs, } // Transfer 
memoperands. - MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); - MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand(); - cast<MachineSDNode>(VLd)->setMemRefs(MemOp, MemOp + 1); + MachineMemOperand *MemOp = cast<MemIntrinsicSDNode>(N)->getMemOperand(); + CurDAG->setNodeMemRefs(cast<MachineSDNode>(VLd), {MemOp}); if (NumVecs == 1) { ReplaceNode(N, VLd); @@ -1893,8 +1893,7 @@ void ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs, if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align)) return; - MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); - MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand(); + MachineMemOperand *MemOp = cast<MemIntrinsicSDNode>(N)->getMemOperand(); SDValue Chain = N->getOperand(0); EVT VT = N->getOperand(Vec0Idx).getValueType(); @@ -1983,7 +1982,7 @@ void ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs, SDNode *VSt = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); // Transfer memoperands. - cast<MachineSDNode>(VSt)->setMemRefs(MemOp, MemOp + 1); + CurDAG->setNodeMemRefs(cast<MachineSDNode>(VSt), {MemOp}); ReplaceNode(N, VSt); return; @@ -2007,7 +2006,7 @@ void ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs, SDNode *VStA = CurDAG->getMachineNode(QOpcodes0[OpcodeIndex], dl, MemAddr.getValueType(), MVT::Other, OpsA); - cast<MachineSDNode>(VStA)->setMemRefs(MemOp, MemOp + 1); + CurDAG->setNodeMemRefs(cast<MachineSDNode>(VStA), {MemOp}); Chain = SDValue(VStA, 1); // Store the odd D registers. @@ -2026,7 +2025,7 @@ void ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs, Ops.push_back(Chain); SDNode *VStB = CurDAG->getMachineNode(QOpcodes1[OpcodeIndex], dl, ResTys, Ops); - cast<MachineSDNode>(VStB)->setMemRefs(MemOp, MemOp + 1); + CurDAG->setNodeMemRefs(cast<MachineSDNode>(VStB), {MemOp}); ReplaceNode(N, VStB); } @@ -2045,8 +2044,7 @@ void ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, bool isUpdating, if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align)) return; - MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); - MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand(); + MachineMemOperand *MemOp = cast<MemIntrinsicSDNode>(N)->getMemOperand(); SDValue Chain = N->getOperand(0); unsigned Lane = @@ -2135,7 +2133,7 @@ void ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, bool isUpdating, unsigned Opc = (is64BitVector ? DOpcodes[OpcodeIndex] : QOpcodes[OpcodeIndex]); SDNode *VLdLn = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); - cast<MachineSDNode>(VLdLn)->setMemRefs(MemOp, MemOp + 1); + CurDAG->setNodeMemRefs(cast<MachineSDNode>(VLdLn), {MemOp}); if (!IsLoad) { ReplaceNode(N, VLdLn); return; @@ -2264,9 +2262,8 @@ void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool IsIntrinsic, } // Transfer memoperands. - MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); - MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand(); - cast<MachineSDNode>(VLdDup)->setMemRefs(MemOp, MemOp + 1); + MachineMemOperand *MemOp = cast<MemIntrinsicSDNode>(N)->getMemOperand(); + CurDAG->setNodeMemRefs(cast<MachineSDNode>(VLdDup), {MemOp}); // Extract the subregisters. 
if (NumVecs == 1) { @@ -2309,6 +2306,11 @@ bool ARMDAGToDAGISel::tryV6T2BitfieldExtractOp(SDNode *N, bool isSigned) { Srl_imm)) { assert(Srl_imm > 0 && Srl_imm < 32 && "bad amount in shift node!"); + // Mask off the unnecessary bits of the AND immediate; normally + // DAGCombine will do this, but that might not happen if + // targetShrinkDemandedConstant chooses a different immediate. + And_imm &= -1U >> Srl_imm; + // Note: The width operand is encoded as width-1. unsigned Width = countTrailingOnes(And_imm) - 1; unsigned LSB = Srl_imm; @@ -2476,9 +2478,8 @@ void ARMDAGToDAGISel::SelectCMP_SWAP(SDNode *N) { Opcode, SDLoc(N), CurDAG->getVTList(MVT::i32, MVT::i32, MVT::Other), Ops); - MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); - MemOp[0] = cast<MemSDNode>(N)->getMemOperand(); - cast<MachineSDNode>(CmpSwap)->setMemRefs(MemOp, MemOp + 1); + MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand(); + CurDAG->setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp}); ReplaceUses(SDValue(N, 0), SDValue(CmpSwap, 0)); ReplaceUses(SDValue(N, 1), SDValue(CmpSwap, 2)); @@ -2627,12 +2628,11 @@ void ARMDAGToDAGISel::Select(SDNode *N) { // queries work properly. This e.g. gives the register allocation the // required information for rematerialization. MachineFunction& MF = CurDAG->getMachineFunction(); - MachineSDNode::mmo_iterator MemOp = MF.allocateMemRefsArray(1); - MemOp[0] = MF.getMachineMemOperand( - MachinePointerInfo::getConstantPool(MF), - MachineMemOperand::MOLoad, 4, 4); + MachineMemOperand *MemOp = + MF.getMachineMemOperand(MachinePointerInfo::getConstantPool(MF), + MachineMemOperand::MOLoad, 4, 4); - cast<MachineSDNode>(ResNode)->setMemRefs(MemOp, MemOp+1); + CurDAG->setNodeMemRefs(cast<MachineSDNode>(ResNode), {MemOp}); ReplaceNode(N, ResNode); return; @@ -3030,11 +3030,13 @@ void ARMDAGToDAGISel::Select(SDNode *N) { switch (VT.getSimpleVT().SimpleTy) { default: return; case MVT::v8i8: Opc = ARM::VZIPd8; break; + case MVT::v4f16: case MVT::v4i16: Opc = ARM::VZIPd16; break; case MVT::v2f32: // vzip.32 Dd, Dm is a pseudo-instruction expanded to vtrn.32 Dd, Dm. case MVT::v2i32: Opc = ARM::VTRNd32; break; case MVT::v16i8: Opc = ARM::VZIPq8; break; + case MVT::v8f16: case MVT::v8i16: Opc = ARM::VZIPq16; break; case MVT::v4f32: case MVT::v4i32: Opc = ARM::VZIPq32; break; @@ -3051,11 +3053,13 @@ void ARMDAGToDAGISel::Select(SDNode *N) { switch (VT.getSimpleVT().SimpleTy) { default: return; case MVT::v8i8: Opc = ARM::VUZPd8; break; + case MVT::v4f16: case MVT::v4i16: Opc = ARM::VUZPd16; break; case MVT::v2f32: // vuzp.32 Dd, Dm is a pseudo-instruction expanded to vtrn.32 Dd, Dm. case MVT::v2i32: Opc = ARM::VTRNd32; break; case MVT::v16i8: Opc = ARM::VUZPq8; break; + case MVT::v8f16: case MVT::v8i16: Opc = ARM::VUZPq16; break; case MVT::v4f32: case MVT::v4i32: Opc = ARM::VUZPq32; break; @@ -3072,10 +3076,12 @@ void ARMDAGToDAGISel::Select(SDNode *N) { switch (VT.getSimpleVT().SimpleTy) { default: return; case MVT::v8i8: Opc = ARM::VTRNd8; break; + case MVT::v4f16: case MVT::v4i16: Opc = ARM::VTRNd16; break; case MVT::v2f32: case MVT::v2i32: Opc = ARM::VTRNd32; break; case MVT::v16i8: Opc = ARM::VTRNq8; break; + case MVT::v8f16: case MVT::v8i16: Opc = ARM::VTRNq16; break; case MVT::v4f32: case MVT::v4i32: Opc = ARM::VTRNq32; break; @@ -3410,9 +3416,8 @@ void ARMDAGToDAGISel::Select(SDNode *N) { CurDAG->getRegister(0, MVT::i32), Chain}; SDNode *Ld = CurDAG->getMachineNode(NewOpc, dl, ResTys, Ops); // Transfer memoperands. 
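The And_imm trimming added to tryV6T2BitfieldExtractOp above relies on the fact that a logical right shift by c2 already clears the top c2 bits, so mask bits in that range cannot affect the result and would only confuse the width calculation (the UBFX width operand is encoded as width minus one). A quick standalone check, illustrative only:

#include <cassert>
#include <cstdint>

int main() {
  uint32_t x = 0xDEADBEEF;
  unsigned c2 = 28;
  uint32_t mask = 0xFF;                  // wider than the 4 bits that survive the shift

  uint32_t trimmed = mask & (~0u >> c2); // the "And_imm &= -1U >> Srl_imm" step
  assert(trimmed == 0xFu);
  assert(((x >> c2) & mask) == ((x >> c2) & trimmed));
  return 0;
}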
- MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); - MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand(); - cast<MachineSDNode>(Ld)->setMemRefs(MemOp, MemOp + 1); + MachineMemOperand *MemOp = cast<MemIntrinsicSDNode>(N)->getMemOperand(); + CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ld), {MemOp}); // Remap uses. SDValue OutChain = isThumb ? SDValue(Ld, 2) : SDValue(Ld, 1); @@ -3478,9 +3483,8 @@ void ARMDAGToDAGISel::Select(SDNode *N) { SDNode *St = CurDAG->getMachineNode(NewOpc, dl, ResTys, Ops); // Transfer memoperands. - MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); - MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand(); - cast<MachineSDNode>(St)->setMemRefs(MemOp, MemOp + 1); + MachineMemOperand *MemOp = cast<MemIntrinsicSDNode>(N)->getMemOperand(); + CurDAG->setNodeMemRefs(cast<MachineSDNode>(St), {MemOp}); ReplaceNode(N, St); return; diff --git a/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp b/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp index ede276dd91bb..21de0f6a7630 100644 --- a/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -651,9 +651,13 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, // it have a FP_TO_[SU]INT instruction with a narrower destination than // source. setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom); setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand); setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand); @@ -665,8 +669,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::CTPOP, MVT::v4i32, Custom); setOperationAction(ISD::CTPOP, MVT::v4i16, Custom); setOperationAction(ISD::CTPOP, MVT::v8i16, Custom); - setOperationAction(ISD::CTPOP, MVT::v1i64, Expand); - setOperationAction(ISD::CTPOP, MVT::v2i64, Expand); + setOperationAction(ISD::CTPOP, MVT::v1i64, Custom); + setOperationAction(ISD::CTPOP, MVT::v2i64, Custom); setOperationAction(ISD::CTLZ, MVT::v1i64, Expand); setOperationAction(ISD::CTLZ, MVT::v2i64, Expand); @@ -846,8 +850,10 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, } setOperationAction(ISD::CTTZ, MVT::i32, Custom); setOperationAction(ISD::CTPOP, MVT::i32, Expand); - if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only()) + if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only()) { setOperationAction(ISD::CTLZ, MVT::i32, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, LibCall); + } // @llvm.readcyclecounter requires the Performance Monitors extension. // Default to the 0 expansion on unsupported platforms. @@ -950,6 +956,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::BlockAddress, MVT::i32, Custom); setOperationAction(ISD::TRAP, MVT::Other, Legal); + setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal); // Use the default implementation. setOperationAction(ISD::VASTART, MVT::Other, Custom); @@ -977,7 +984,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, // On v8, we have particularly efficient implementations of atomic fences // if they can be combined with nearby atomic loads and stores. 
- if (!Subtarget->hasV8Ops() || getTargetMachine().getOptLevel() == 0) { + if (!Subtarget->hasAcquireRelease() || + getTargetMachine().getOptLevel() == 0) { // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc. InsertFencesForAtomic = true; } @@ -1136,14 +1144,26 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, if (Subtarget->hasNEON()) { // vmin and vmax aren't available in a scalar form, so we use // a NEON instruction with an undef lane instead. - setOperationAction(ISD::FMINNAN, MVT::f16, Legal); - setOperationAction(ISD::FMAXNAN, MVT::f16, Legal); - setOperationAction(ISD::FMINNAN, MVT::f32, Legal); - setOperationAction(ISD::FMAXNAN, MVT::f32, Legal); - setOperationAction(ISD::FMINNAN, MVT::v2f32, Legal); - setOperationAction(ISD::FMAXNAN, MVT::v2f32, Legal); - setOperationAction(ISD::FMINNAN, MVT::v4f32, Legal); - setOperationAction(ISD::FMAXNAN, MVT::v4f32, Legal); + setOperationAction(ISD::FMINIMUM, MVT::f16, Legal); + setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal); + setOperationAction(ISD::FMINIMUM, MVT::f32, Legal); + setOperationAction(ISD::FMAXIMUM, MVT::f32, Legal); + setOperationAction(ISD::FMINIMUM, MVT::v2f32, Legal); + setOperationAction(ISD::FMAXIMUM, MVT::v2f32, Legal); + setOperationAction(ISD::FMINIMUM, MVT::v4f32, Legal); + setOperationAction(ISD::FMAXIMUM, MVT::v4f32, Legal); + + if (Subtarget->hasFullFP16()) { + setOperationAction(ISD::FMINNUM, MVT::v4f16, Legal); + setOperationAction(ISD::FMAXNUM, MVT::v4f16, Legal); + setOperationAction(ISD::FMINNUM, MVT::v8f16, Legal); + setOperationAction(ISD::FMAXNUM, MVT::v8f16, Legal); + + setOperationAction(ISD::FMINIMUM, MVT::v4f16, Legal); + setOperationAction(ISD::FMAXIMUM, MVT::v4f16, Legal); + setOperationAction(ISD::FMINIMUM, MVT::v8f16, Legal); + setOperationAction(ISD::FMAXIMUM, MVT::v8f16, Legal); + } } // We have target-specific dag combine patterns for the following nodes: @@ -1181,6 +1201,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, // Prefer likely predicted branches to selects on out-of-order cores. PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder(); + setPrefLoopAlignment(Subtarget->getPrefLoopAlignment()); + setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2); } @@ -1261,6 +1283,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::FMSTAT: return "ARMISD::FMSTAT"; case ARMISD::CMOV: return "ARMISD::CMOV"; + case ARMISD::SUBS: return "ARMISD::SUBS"; case ARMISD::SSAT: return "ARMISD::SSAT"; case ARMISD::USAT: return "ARMISD::USAT"; @@ -3052,41 +3075,8 @@ static bool allUsersAreInFunction(const Value *V, const Function *F) { return true; } -/// Return true if all users of V are within some (any) function, looking through -/// ConstantExprs. In other words, are there any global constant users? -static bool allUsersAreInFunctions(const Value *V) { - SmallVector<const User*,4> Worklist; - for (auto *U : V->users()) - Worklist.push_back(U); - while (!Worklist.empty()) { - auto *U = Worklist.pop_back_val(); - if (isa<ConstantExpr>(U)) { - for (auto *UU : U->users()) - Worklist.push_back(UU); - continue; - } - - if (!isa<Instruction>(U)) - return false; - } - return true; -} - -// Return true if T is an integer, float or an array/vector of either. 
-static bool isSimpleType(Type *T) { - if (T->isIntegerTy() || T->isFloatingPointTy()) - return true; - Type *SubT = nullptr; - if (T->isArrayTy()) - SubT = T->getArrayElementType(); - else if (T->isVectorTy()) - SubT = T->getVectorElementType(); - else - return false; - return SubT->isIntegerTy() || SubT->isFloatingPointTy(); -} - -static SDValue promoteToConstantPool(const GlobalValue *GV, SelectionDAG &DAG, +static SDValue promoteToConstantPool(const ARMTargetLowering *TLI, + const GlobalValue *GV, SelectionDAG &DAG, EVT PtrVT, const SDLoc &dl) { // If we're creating a pool entry for a constant global with unnamed address, // and the global is small enough, we can emit it inline into the constant pool @@ -3113,11 +3103,11 @@ static SDValue promoteToConstantPool(const GlobalValue *GV, SelectionDAG &DAG, !GVar->hasLocalLinkage()) return SDValue(); - // Ensure that we don't try and inline any type that contains pointers. If - // we inline a value that contains relocations, we move the relocations from - // .data to .text which is not ideal. + // If we inline a value that contains relocations, we move the relocations + // from .data to .text. This is not allowed in position-independent code. auto *Init = GVar->getInitializer(); - if (!isSimpleType(Init->getType())) + if ((TLI->isPositionIndependent() || TLI->getSubtarget()->isROPI()) && + Init->needsRelocation()) return SDValue(); // The constant islands pass can only really deal with alignment requests @@ -3128,7 +3118,7 @@ static SDValue promoteToConstantPool(const GlobalValue *GV, SelectionDAG &DAG, // that are strings for simplicity. auto *CDAInit = dyn_cast<ConstantDataArray>(Init); unsigned Size = DAG.getDataLayout().getTypeAllocSize(Init->getType()); - unsigned Align = GVar->getAlignment(); + unsigned Align = DAG.getDataLayout().getPreferredAlignment(GVar); unsigned RequiredPadding = 4 - (Size % 4); bool PaddingPossible = RequiredPadding == 4 || (CDAInit && CDAInit->isString()); @@ -3149,12 +3139,14 @@ static SDValue promoteToConstantPool(const GlobalValue *GV, SelectionDAG &DAG, ConstpoolPromotionMaxTotal) return SDValue(); - // This is only valid if all users are in a single function OR it has users - // in multiple functions but it no larger than a pointer. We also check if - // GVar has constant (non-ConstantExpr) users. If so, it essentially has its - // address taken. - if (!allUsersAreInFunction(GVar, &F) && - !(Size <= 4 && allUsersAreInFunctions(GVar))) + // This is only valid if all users are in a single function; we can't clone + // the constant in general. The LLVM IR unnamed_addr allows merging + // constants, but not cloning them. + // + // We could potentially allow cloning if we could prove all uses of the + // constant in the current function don't care about the address, like + // printf format strings. But that isn't implemented for now. + if (!allUsersAreInFunction(GVar, &F)) return SDValue(); // We're going to inline this global. Pad it out if needed. 
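The padding arithmetic used when inlining a global into the constant pool is easy to sanity-check in isolation (standalone C++, illustrative only); note that RequiredPadding == 4 simply means the size is already a multiple of four, so nothing extra is appended.

#include <cassert>

int main() {
  // Promoted globals are padded up to a 4-byte boundary so the constant
  // island machinery can lay them out.
  auto requiredPadding = [](unsigned Size) { return 4u - (Size % 4u); };

  assert(requiredPadding(5) == 3);   // e.g. "abcd" plus its NUL, padded to 8
  assert(requiredPadding(7) == 1);
  assert(requiredPadding(8) == 4);   // already aligned: treated as "no padding"
  return 0;
}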
@@ -3182,9 +3174,11 @@ static SDValue promoteToConstantPool(const GlobalValue *GV, SelectionDAG &DAG, bool ARMTargetLowering::isReadOnly(const GlobalValue *GV) const { if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) - GV = GA->getBaseObject(); - return (isa<GlobalVariable>(GV) && cast<GlobalVariable>(GV)->isConstant()) || - isa<Function>(GV); + if (!(GV = GA->getBaseObject())) + return false; + if (const auto *V = dyn_cast<GlobalVariable>(GV)) + return V->isConstant(); + return isa<Function>(GV); } SDValue ARMTargetLowering::LowerGlobalAddress(SDValue Op, @@ -3210,7 +3204,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, // promoteToConstantPool only if not generating XO text section if (TM.shouldAssumeDSOLocal(*GV->getParent(), GV) && !Subtarget->genExecuteOnly()) - if (SDValue V = promoteToConstantPool(GV, DAG, PtrVT, dl)) + if (SDValue V = promoteToConstantPool(this, GV, DAG, PtrVT, dl)) return V; if (isPositionIndependent()) { @@ -3299,9 +3293,13 @@ SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op, assert(!Subtarget->isROPI() && !Subtarget->isRWPI() && "ROPI/RWPI not currently supported for Windows"); + const TargetMachine &TM = getTargetMachine(); const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); - const ARMII::TOF TargetFlags = - (GV->hasDLLImportStorageClass() ? ARMII::MO_DLLIMPORT : ARMII::MO_NO_FLAG); + ARMII::TOF TargetFlags = ARMII::MO_NO_FLAG; + if (GV->hasDLLImportStorageClass()) + TargetFlags = ARMII::MO_DLLIMPORT; + else if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) + TargetFlags = ARMII::MO_COFFSTUB; EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Result; SDLoc DL(Op); @@ -3313,7 +3311,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op, Result = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*Offset=*/0, TargetFlags)); - if (GV->hasDLLImportStorageClass()) + if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB)) Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result, MachinePointerInfo::getGOT(DAG.getMachineFunction())); return Result; @@ -3412,7 +3410,7 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, Op.getOperand(1), Op.getOperand(2)); } unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins) - ? ISD::FMINNAN : ISD::FMAXNAN; + ? 
ISD::FMINIMUM : ISD::FMAXIMUM; return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); } @@ -4832,12 +4830,24 @@ static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) { return DAG.UnrollVectorOp(Op.getNode()); } - assert(Op.getOperand(0).getValueType() == MVT::v4f32 && - "Invalid type for custom lowering!"); - if (VT != MVT::v4i16) + const bool HasFullFP16 = + static_cast<const ARMSubtarget&>(DAG.getSubtarget()).hasFullFP16(); + + EVT NewTy; + const EVT OpTy = Op.getOperand(0).getValueType(); + if (OpTy == MVT::v4f32) + NewTy = MVT::v4i32; + else if (OpTy == MVT::v4f16 && HasFullFP16) + NewTy = MVT::v4i16; + else if (OpTy == MVT::v8f16 && HasFullFP16) + NewTy = MVT::v8i16; + else + llvm_unreachable("Invalid type for custom lowering!"); + + if (VT != MVT::v4i16 && VT != MVT::v8i16) return DAG.UnrollVectorOp(Op.getNode()); - Op = DAG.getNode(Op.getOpcode(), dl, MVT::v4i32, Op.getOperand(0)); + Op = DAG.getNode(Op.getOpcode(), dl, NewTy, Op.getOperand(0)); return DAG.getNode(ISD::TRUNCATE, dl, VT, Op); } @@ -4870,9 +4880,21 @@ static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { return DAG.UnrollVectorOp(Op.getNode()); } - assert(Op.getOperand(0).getValueType() == MVT::v4i16 && + assert((Op.getOperand(0).getValueType() == MVT::v4i16 || + Op.getOperand(0).getValueType() == MVT::v8i16) && "Invalid type for custom lowering!"); - if (VT != MVT::v4f32) + + const bool HasFullFP16 = + static_cast<const ARMSubtarget&>(DAG.getSubtarget()).hasFullFP16(); + + EVT DestVecType; + if (VT == MVT::v4f32) + DestVecType = MVT::v4i32; + else if (VT == MVT::v4f16 && HasFullFP16) + DestVecType = MVT::v4i16; + else if (VT == MVT::v8f16 && HasFullFP16) + DestVecType = MVT::v8i16; + else return DAG.UnrollVectorOp(Op.getNode()); unsigned CastOpc; @@ -4889,7 +4911,7 @@ static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { break; } - Op = DAG.getNode(CastOpc, dl, MVT::v4i32, Op.getOperand(0)); + Op = DAG.getNode(CastOpc, dl, DestVecType, Op.getOperand(0)); return DAG.getNode(Opc, dl, VT, Op); } @@ -5392,10 +5414,6 @@ static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, // Compute with: cttz(x) = ctpop(lsb - 1) - // Since we can only compute the number of bits in a byte with vcnt.8, we - // have to gather the result with pairwise addition (vpaddl) for i16, i32, - // and i64. - // Compute LSB - 1. SDValue Bits; if (ElemTy == MVT::i64) { @@ -5408,32 +5426,7 @@ static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, DAG.getTargetConstant(1, dl, ElemTy)); Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One); } - - // Count #bits with vcnt.8. - EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8; - SDValue BitsVT8 = DAG.getNode(ISD::BITCAST, dl, VT8Bit, Bits); - SDValue Cnt8 = DAG.getNode(ISD::CTPOP, dl, VT8Bit, BitsVT8); - - // Gather the #bits with vpaddl (pairwise add.) - EVT VT16Bit = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16; - SDValue Cnt16 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT16Bit, - DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32), - Cnt8); - if (ElemTy == MVT::i16) - return Cnt16; - - EVT VT32Bit = VT.is64BitVector() ? 
MVT::v2i32 : MVT::v4i32; - SDValue Cnt32 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT32Bit, - DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32), - Cnt16); - if (ElemTy == MVT::i32) - return Cnt32; - - assert(ElemTy == MVT::i64); - SDValue Cnt64 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32), - Cnt32); - return Cnt64; + return DAG.getNode(ISD::CTPOP, dl, VT, Bits); } if (!ST->hasV6T2Ops()) @@ -5443,112 +5436,37 @@ static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, return DAG.getNode(ISD::CTLZ, dl, VT, rbit); } -/// getCTPOP16BitCounts - Returns a v8i8/v16i8 vector containing the bit-count -/// for each 16-bit element from operand, repeated. The basic idea is to -/// leverage vcnt to get the 8-bit counts, gather and add the results. -/// -/// Trace for v4i16: -/// input = [v0 v1 v2 v3 ] (vi 16-bit element) -/// cast: N0 = [w0 w1 w2 w3 w4 w5 w6 w7] (v0 = [w0 w1], wi 8-bit element) -/// vcnt: N1 = [b0 b1 b2 b3 b4 b5 b6 b7] (bi = bit-count of 8-bit element wi) -/// vrev: N2 = [b1 b0 b3 b2 b5 b4 b7 b6] -/// [b0 b1 b2 b3 b4 b5 b6 b7] -/// +[b1 b0 b3 b2 b5 b4 b7 b6] -/// N3=N1+N2 = [k0 k0 k1 k1 k2 k2 k3 k3] (k0 = b0+b1 = bit-count of 16-bit v0, -/// vuzp: = [k0 k1 k2 k3 k0 k1 k2 k3] each ki is 8-bits) -static SDValue getCTPOP16BitCounts(SDNode *N, SelectionDAG &DAG) { - EVT VT = N->getValueType(0); - SDLoc DL(N); - - EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8; - SDValue N0 = DAG.getNode(ISD::BITCAST, DL, VT8Bit, N->getOperand(0)); - SDValue N1 = DAG.getNode(ISD::CTPOP, DL, VT8Bit, N0); - SDValue N2 = DAG.getNode(ARMISD::VREV16, DL, VT8Bit, N1); - SDValue N3 = DAG.getNode(ISD::ADD, DL, VT8Bit, N1, N2); - return DAG.getNode(ARMISD::VUZP, DL, VT8Bit, N3, N3); -} - -/// lowerCTPOP16BitElements - Returns a v4i16/v8i16 vector containing the -/// bit-count for each 16-bit element from the operand. We need slightly -/// different sequencing for v4i16 and v8i16 to stay within NEON's available -/// 64/128-bit registers. -/// -/// Trace for v4i16: -/// input = [v0 v1 v2 v3 ] (vi 16-bit element) -/// v8i8: BitCounts = [k0 k1 k2 k3 k0 k1 k2 k3 ] (ki is the bit-count of vi) -/// v8i16:Extended = [k0 k1 k2 k3 k0 k1 k2 k3 ] -/// v4i16:Extracted = [k0 k1 k2 k3 ] -static SDValue lowerCTPOP16BitElements(SDNode *N, SelectionDAG &DAG) { +static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG, + const ARMSubtarget *ST) { EVT VT = N->getValueType(0); SDLoc DL(N); - SDValue BitCounts = getCTPOP16BitCounts(N, DAG); - if (VT.is64BitVector()) { - SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, BitCounts); - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Extended, - DAG.getIntPtrConstant(0, DL)); - } else { - SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, - BitCounts, DAG.getIntPtrConstant(0, DL)); - return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, Extracted); - } -} - -/// lowerCTPOP32BitElements - Returns a v2i32/v4i32 vector containing the -/// bit-count for each 32-bit element from the operand. The idea here is -/// to split the vector into 16-bit elements, leverage the 16-bit count -/// routine, and then combine the results. 
-/// -/// Trace for v2i32 (v4i32 similar with Extracted/Extended exchanged): -/// input = [v0 v1 ] (vi: 32-bit elements) -/// Bitcast = [w0 w1 w2 w3 ] (wi: 16-bit elements, v0 = [w0 w1]) -/// Counts16 = [k0 k1 k2 k3 ] (ki: 16-bit elements, bit-count of wi) -/// vrev: N0 = [k1 k0 k3 k2 ] -/// [k0 k1 k2 k3 ] -/// N1 =+[k1 k0 k3 k2 ] -/// [k0 k2 k1 k3 ] -/// N2 =+[k1 k3 k0 k2 ] -/// [k0 k2 k1 k3 ] -/// Extended =+[k1 k3 k0 k2 ] -/// [k0 k2 ] -/// Extracted=+[k1 k3 ] -/// -static SDValue lowerCTPOP32BitElements(SDNode *N, SelectionDAG &DAG) { - EVT VT = N->getValueType(0); - SDLoc DL(N); + assert(ST->hasNEON() && "Custom ctpop lowering requires NEON."); + assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 || + VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) && + "Unexpected type for custom ctpop lowering"); - EVT VT16Bit = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16; + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8; + SDValue Res = DAG.getBitcast(VT8Bit, N->getOperand(0)); + Res = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Res); - SDValue Bitcast = DAG.getNode(ISD::BITCAST, DL, VT16Bit, N->getOperand(0)); - SDValue Counts16 = lowerCTPOP16BitElements(Bitcast.getNode(), DAG); - SDValue N0 = DAG.getNode(ARMISD::VREV32, DL, VT16Bit, Counts16); - SDValue N1 = DAG.getNode(ISD::ADD, DL, VT16Bit, Counts16, N0); - SDValue N2 = DAG.getNode(ARMISD::VUZP, DL, VT16Bit, N1, N1); + // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds. + unsigned EltSize = 8; + unsigned NumElts = VT.is64BitVector() ? 8 : 16; + while (EltSize != VT.getScalarSizeInBits()) { + SmallVector<SDValue, 8> Ops; + Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddlu, DL, + TLI.getPointerTy(DAG.getDataLayout()))); + Ops.push_back(Res); - if (VT.is64BitVector()) { - SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, N2); - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Extended, - DAG.getIntPtrConstant(0, DL)); - } else { - SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, N2, - DAG.getIntPtrConstant(0, DL)); - return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, Extracted); + EltSize *= 2; + NumElts /= 2; + MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts); + Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WidenVT, Ops); } -} -static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG, - const ARMSubtarget *ST) { - EVT VT = N->getValueType(0); - - assert(ST->hasNEON() && "Custom ctpop lowering requires NEON."); - assert((VT == MVT::v2i32 || VT == MVT::v4i32 || - VT == MVT::v4i16 || VT == MVT::v8i16) && - "Unexpected type for custom ctpop lowering"); - - if (VT.getVectorElementType() == MVT::i32) - return lowerCTPOP32BitElements(N, DAG); - else - return lowerCTPOP16BitElements(N, DAG); + return Res; } static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, @@ -7878,6 +7796,50 @@ SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG, return LowerCallTo(CLI).first; } +// This is a code size optimisation: return the original SDIV node to +// DAGCombiner when we don't want to expand SDIV into a sequence of +// instructions, and an empty node otherwise which will cause the +// SDIV to be expanded in DAGCombine. 
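For reference, a standalone sketch of the shift-based sequence that the BuildSDIVPow2 hook below avoids emitting (by returning the original SDIV node) when optimising for minimum size on a core with hardware divide. The helper name sdivPow2 is invented for this illustration; the identity is the usual round-toward-zero division by a power of two.

#include <cassert>
#include <cstdint>

// Signed division by 2^k, rounded toward zero, without a divide instruction:
// add (2^k - 1) to negative dividends, then arithmetic-shift right by k.
int32_t sdivPow2(int32_t x, unsigned k) {
  int32_t bias = (x < 0) ? static_cast<int32_t>((1u << k) - 1u) : 0;
  return (x + bias) >> k;  // arithmetic shift on the usual targets (and per C++20)
}

int main() {
  for (int32_t x : {-1000, -129, -8, -1, 0, 1, 7, 128, 1000})
    for (unsigned k : {1u, 3u, 7u})
      assert(sdivPow2(x, k) == x / (1 << k));
  return 0;
}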
+SDValue +ARMTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, + SelectionDAG &DAG, + SmallVectorImpl<SDNode *> &Created) const { + // TODO: Support SREM + if (N->getOpcode() != ISD::SDIV) + return SDValue(); + + const auto &ST = static_cast<const ARMSubtarget&>(DAG.getSubtarget()); + const auto &MF = DAG.getMachineFunction(); + const bool MinSize = MF.getFunction().optForMinSize(); + const bool HasDivide = ST.isThumb() ? ST.hasDivideInThumbMode() + : ST.hasDivideInARMMode(); + + // Don't touch vector types; rewriting this may lead to scalarizing + // the int divs. + if (N->getOperand(0).getValueType().isVector()) + return SDValue(); + + // Bail if MinSize is not set, and also for both ARM and Thumb mode we need + // hwdiv support for this to be really profitable. + if (!(MinSize && HasDivide)) + return SDValue(); + + // ARM mode is a bit simpler than Thumb: we can handle large power + // of 2 immediates with 1 mov instruction; no further checks required, + // just return the sdiv node. + if (!ST.isThumb()) + return SDValue(N, 0); + + // In Thumb mode, immediates larger than 128 need a wide 4-byte MOV, + // and thus lose the code size benefits of a MOVS that requires only 2. + // TargetTransformInfo and 'getIntImmCodeSizeCost' could be helpful here, + // but as it's doing exactly this, it's not worth the trouble to get TTI. + if (Divisor.sgt(128)) + return SDValue(); + + return SDValue(N, 0); +} + SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG, bool Signed) const { assert(Op.getValueType() == MVT::i32 && @@ -7990,10 +7952,8 @@ static void ReplaceCMP_SWAP_64Results(SDNode *N, ARM::CMP_SWAP_64, SDLoc(N), DAG.getVTList(MVT::Untyped, MVT::i32, MVT::Other), Ops); - MachineFunction &MF = DAG.getMachineFunction(); - MachineSDNode::mmo_iterator MemOp = MF.allocateMemRefsArray(1); - MemOp[0] = cast<MemSDNode>(N)->getMemOperand(); - cast<MachineSDNode>(CmpSwap)->setMemRefs(MemOp, MemOp + 1); + MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand(); + DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp}); bool isBigEndian = DAG.getDataLayout().isBigEndian(); @@ -9169,6 +9129,8 @@ ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI, // IP. switch (TM.getCodeModel()) { + case CodeModel::Tiny: + llvm_unreachable("Tiny code model not available on ARM."); case CodeModel::Small: case CodeModel::Medium: case CodeModel::Kernel: @@ -9244,6 +9206,42 @@ ARMTargetLowering::EmitLowered__dbzchk(MachineInstr &MI, return ContBB; } +// The CPSR operand of SelectItr might be missing a kill marker +// because there were multiple uses of CPSR, and ISel didn't know +// which to mark. Figure out whether SelectItr should have had a +// kill marker, and set it if it should. Returns the correct kill +// marker value. +static bool checkAndUpdateCPSRKill(MachineBasicBlock::iterator SelectItr, + MachineBasicBlock* BB, + const TargetRegisterInfo* TRI) { + // Scan forward through BB for a use/def of CPSR. + MachineBasicBlock::iterator miI(std::next(SelectItr)); + for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) { + const MachineInstr& mi = *miI; + if (mi.readsRegister(ARM::CPSR)) + return false; + if (mi.definesRegister(ARM::CPSR)) + break; // Should have kill-flag - update below. + } + + // If we hit the end of the block, check whether CPSR is live into a + // successor. 
+ if (miI == BB->end()) { + for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(), + sEnd = BB->succ_end(); + sItr != sEnd; ++sItr) { + MachineBasicBlock* succ = *sItr; + if (succ->isLiveIn(ARM::CPSR)) + return false; + } + } + + // We found a def, or hit the end of the basic block and CPSR wasn't live + // out. SelectMI should have a kill flag on CPSR. + SelectItr->addRegisterKilled(ARM::CPSR, TRI); + return true; +} + MachineBasicBlock * ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const { @@ -9343,6 +9341,14 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, F->insert(It, copy0MBB); F->insert(It, sinkMBB); + // Check whether CPSR is live past the tMOVCCr_pseudo. + const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo(); + if (!MI.killsRegister(ARM::CPSR) && + !checkAndUpdateCPSRKill(MI, thisMBB, TRI)) { + copy0MBB->addLiveIn(ARM::CPSR); + sinkMBB->addLiveIn(ARM::CPSR); + } + // Transfer the remainder of BB and its successor edges to sinkMBB. sinkMBB->splice(sinkMBB->begin(), BB, std::next(MachineBasicBlock::iterator(MI)), BB->end()); @@ -10407,6 +10413,37 @@ static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, return SDValue(); } +bool +ARMTargetLowering::isDesirableToCommuteWithShift(const SDNode *N, + CombineLevel Level) const { + if (Level == BeforeLegalizeTypes) + return true; + + if (Subtarget->isThumb() && Subtarget->isThumb1Only()) + return true; + + if (N->getOpcode() != ISD::SHL) + return true; + + // Turn off commute-with-shift transform after legalization, so it doesn't + // conflict with PerformSHLSimplify. (We could try to detect when + // PerformSHLSimplify would trigger more precisely, but it isn't + // really necessary.) + return false; +} + +bool +ARMTargetLowering::shouldFoldShiftPairToMask(const SDNode *N, + CombineLevel Level) const { + if (!Subtarget->isThumb1Only()) + return true; + + if (Level == BeforeLegalizeTypes) + return true; + + return false; +} + static SDValue PerformSHLSimplify(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST) { @@ -10506,9 +10543,7 @@ static SDValue PerformSHLSimplify(SDNode *N, LLVM_DEBUG(dbgs() << "Simplify shl use:\n"; SHL.getOperand(0).dump(); SHL.dump(); N->dump()); LLVM_DEBUG(dbgs() << "Into:\n"; X.dump(); BinOp.dump(); Res.dump()); - - DAG.ReplaceAllUsesWith(SDValue(N, 0), Res); - return SDValue(N, 0); + return Res; } @@ -10712,6 +10747,12 @@ static SDValue CombineANDShift(SDNode *N, if (!C2 || C2 >= 32) return SDValue(); + // Clear irrelevant bits in the mask. + if (LeftShift) + C1 &= (-1U << C2); + else + C1 &= (-1U >> C2); + SelectionDAG &DAG = DCI.DAG; SDLoc DL(N); @@ -10719,9 +10760,7 @@ static SDValue CombineANDShift(SDNode *N, // "(and (srl x, c2) c1)", where c1 is a shifted mask. Try to // transform to a pair of shifts, to save materializing c1. - // First pattern: right shift, and c1+1 is a power of two. - // FIXME: Also check reversed pattern (left shift, and ~c1+1 is a power - // of two). + // First pattern: right shift, then mask off leading bits. // FIXME: Use demanded bits? if (!LeftShift && isMask_32(C1)) { uint32_t C3 = countLeadingZeros(C1); @@ -10733,13 +10772,23 @@ static SDValue CombineANDShift(SDNode *N, } } - // Second pattern: left shift, and (c1>>c2)+1 is a power of two. - // FIXME: Also check reversed pattern (right shift, and ~(c1<<c2)+1 - // is a power of two). + // First pattern, reversed: left shift, then mask off trailing bits. 
+ if (LeftShift && isMask_32(~C1)) { + uint32_t C3 = countTrailingZeros(C1); + if (C2 < C3) { + SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0), + DAG.getConstant(C3 - C2, DL, MVT::i32)); + return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL, + DAG.getConstant(C3, DL, MVT::i32)); + } + } + + // Second pattern: left shift, then mask off leading bits. // FIXME: Use demanded bits? if (LeftShift && isShiftedMask_32(C1)) { + uint32_t Trailing = countTrailingZeros(C1); uint32_t C3 = countLeadingZeros(C1); - if (C2 + C3 < 32 && C1 == ((-1U << (C2 + C3)) >> C3)) { + if (Trailing == C2 && C2 + C3 < 32) { SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0), DAG.getConstant(C2 + C3, DL, MVT::i32)); return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL, @@ -10747,6 +10796,19 @@ static SDValue CombineANDShift(SDNode *N, } } + // Second pattern, reversed: right shift, then mask off trailing bits. + // FIXME: Handle other patterns of known/demanded bits. + if (!LeftShift && isShiftedMask_32(C1)) { + uint32_t Leading = countLeadingZeros(C1); + uint32_t C3 = countTrailingZeros(C1); + if (Leading == C2 && C2 + C3 < 32) { + SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0), + DAG.getConstant(C2 + C3, DL, MVT::i32)); + return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL, + DAG.getConstant(C3, DL, MVT::i32)); + } + } + // FIXME: Transform "(and (shl x, c2) c1)" -> // "(shl (and x, c1>>c2), c2)" if "c1 >> c2" is a cheaper immediate than // c1. @@ -11541,8 +11603,15 @@ static SDValue CombineBaseUpdate(SDNode *N, continue; // Check that the add is independent of the load/store. Otherwise, folding - // it would create a cycle. - if (User->isPredecessorOf(N) || N->isPredecessorOf(User)) + // it would create a cycle. We can avoid searching through Addr as it's a + // predecessor to both. + SmallPtrSet<const SDNode *, 32> Visited; + SmallVector<const SDNode *, 16> Worklist; + Visited.insert(Addr.getNode()); + Worklist.push_back(N); + Worklist.push_back(User); + if (SDNode::hasPredecessorHelper(N, Visited, Worklist) || + SDNode::hasPredecessorHelper(User, Visited, Worklist)) continue; // Find the new opcode for the updating load/store. @@ -12507,8 +12576,7 @@ SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &D // Lastly, can we determine that the bits defined by OrCI // are zero in Y? - KnownBits Known; - DAG.computeKnownBits(Y, Known); + KnownBits Known = DAG.computeKnownBits(Y); if ((OrCI & Known.Zero) != OrCI) return SDValue(); @@ -12679,30 +12747,38 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const { DAG.getConstant(1, dl, MVT::i32), Neg.getValue(1)); Res = DAG.getNode(ISD::ADDCARRY, dl, VTs, Sub, Neg, Carry); } - } else if (CC == ARMCC::NE && LHS != RHS && + } else if (CC == ARMCC::NE && !isNullConstant(RHS) && (!Subtarget->isThumb1Only() || isPowerOf2Constant(TrueVal))) { // This seems pointless but will allow us to combine it further below. 
- // CMOV 0, z, !=, (CMPZ x, y) -> CMOV (SUB x, y), z, !=, (CMPZ x, y) - SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS); + // CMOV 0, z, !=, (CMPZ x, y) -> CMOV (SUBS x, y), z, !=, (SUBS x, y):1 + SDValue Sub = + DAG.getNode(ARMISD::SUBS, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS); + SDValue CPSRGlue = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR, + Sub.getValue(1), SDValue()); Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, TrueVal, ARMcc, - N->getOperand(3), Cmp); + N->getOperand(3), CPSRGlue.getValue(1)); + FalseVal = Sub; } } else if (isNullConstant(TrueVal)) { - if (CC == ARMCC::EQ && LHS != RHS && + if (CC == ARMCC::EQ && !isNullConstant(RHS) && (!Subtarget->isThumb1Only() || isPowerOf2Constant(FalseVal))) { // This seems pointless but will allow us to combine it further below // Note that we change == for != as this is the dual for the case above. - // CMOV z, 0, ==, (CMPZ x, y) -> CMOV (SUB x, y), z, !=, (CMPZ x, y) - SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS); + // CMOV z, 0, ==, (CMPZ x, y) -> CMOV (SUBS x, y), z, !=, (SUBS x, y):1 + SDValue Sub = + DAG.getNode(ARMISD::SUBS, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS); + SDValue CPSRGlue = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR, + Sub.getValue(1), SDValue()); Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, FalseVal, DAG.getConstant(ARMCC::NE, dl, MVT::i32), - N->getOperand(3), Cmp); + N->getOperand(3), CPSRGlue.getValue(1)); + FalseVal = Sub; } } // On Thumb1, the DAG above may be further combined if z is a power of 2 // (z == 2 ^ K). - // CMOV (SUB x, y), z, !=, (CMPZ x, y) -> + // CMOV (SUBS x, y), z, !=, (SUBS x, y):1 -> // merge t3, t4 // where t1 = (SUBCARRY (SUB x, y), z, 0) // t2 = (SUBCARRY (SUB x, y), t1:0, t1:1) @@ -12710,8 +12786,8 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const { // t4 = (SUB 1, t2:1) [ we want a carry, not a borrow ] const APInt *TrueConst; if (Subtarget->isThumb1Only() && CC == ARMCC::NE && - (FalseVal.getOpcode() == ISD::SUB) && (FalseVal.getOperand(0) == LHS) && - (FalseVal.getOperand(1) == RHS) && + (FalseVal.getOpcode() == ARMISD::SUBS) && + (FalseVal.getOperand(0) == LHS) && (FalseVal.getOperand(1) == RHS) && (TrueConst = isPowerOf2Constant(TrueVal))) { SDVTList VTs = DAG.getVTList(VT, MVT::i32); unsigned ShiftAmount = TrueConst->logBase2(); @@ -12730,8 +12806,7 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const { } if (Res.getNode()) { - KnownBits Known; - DAG.computeKnownBits(SDValue(N,0), Known); + KnownBits Known = DAG.computeKnownBits(SDValue(N,0)); // Capture demanded bits information that would be otherwise lost. if (Known.Zero == 0xfffffffe) Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res, @@ -13522,12 +13597,11 @@ void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, break; case ARMISD::CMOV: { // Bits are known zero/one if known on the LHS and RHS. - DAG.computeKnownBits(Op.getOperand(0), Known, Depth+1); + Known = DAG.computeKnownBits(Op.getOperand(0), Depth+1); if (Known.isUnknown()) return; - KnownBits KnownRHS; - DAG.computeKnownBits(Op.getOperand(1), KnownRHS, Depth+1); + KnownBits KnownRHS = DAG.computeKnownBits(Op.getOperand(1), Depth+1); Known.Zero &= KnownRHS.Zero; Known.One &= KnownRHS.One; return; @@ -13549,7 +13623,7 @@ void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, case ARMISD::BFI: { // Conservatively, we can recurse down the first operand // and just mask out all affected bits. 
- DAG.computeKnownBits(Op.getOperand(0), Known, Depth + 1); + Known = DAG.computeKnownBits(Op.getOperand(0), Depth + 1); // The operand to BFI is already a mask suitable for removing the bits it // sets. @@ -13559,9 +13633,120 @@ void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, Known.One &= Mask; return; } + case ARMISD::VGETLANEs: + case ARMISD::VGETLANEu: { + const SDValue &SrcSV = Op.getOperand(0); + EVT VecVT = SrcSV.getValueType(); + assert(VecVT.isVector() && "VGETLANE expected a vector type"); + const unsigned NumSrcElts = VecVT.getVectorNumElements(); + ConstantSDNode *Pos = cast<ConstantSDNode>(Op.getOperand(1).getNode()); + assert(Pos->getAPIntValue().ult(NumSrcElts) && + "VGETLANE index out of bounds"); + unsigned Idx = Pos->getZExtValue(); + APInt DemandedElt = APInt::getOneBitSet(NumSrcElts, Idx); + Known = DAG.computeKnownBits(SrcSV, DemandedElt, Depth + 1); + + EVT VT = Op.getValueType(); + const unsigned DstSz = VT.getScalarSizeInBits(); + const unsigned SrcSz = VecVT.getVectorElementType().getSizeInBits(); + assert(SrcSz == Known.getBitWidth()); + assert(DstSz > SrcSz); + if (Op.getOpcode() == ARMISD::VGETLANEs) + Known = Known.sext(DstSz); + else { + Known = Known.zext(DstSz); + Known.Zero.setBitsFrom(SrcSz); + } + assert(DstSz == Known.getBitWidth()); + break; + } } } +bool +ARMTargetLowering::targetShrinkDemandedConstant(SDValue Op, + const APInt &DemandedAPInt, + TargetLoweringOpt &TLO) const { + // Delay optimization, so we don't have to deal with illegal types, or block + // optimizations. + if (!TLO.LegalOps) + return false; + + // Only optimize AND for now. + if (Op.getOpcode() != ISD::AND) + return false; + + EVT VT = Op.getValueType(); + + // Ignore vectors. + if (VT.isVector()) + return false; + + assert(VT == MVT::i32 && "Unexpected integer type"); + + // Make sure the RHS really is a constant. + ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); + if (!C) + return false; + + unsigned Mask = C->getZExtValue(); + + unsigned Demanded = DemandedAPInt.getZExtValue(); + unsigned ShrunkMask = Mask & Demanded; + unsigned ExpandedMask = Mask | ~Demanded; + + // If the mask is all zeros, let the target-independent code replace the + // result with zero. + if (ShrunkMask == 0) + return false; + + // If the mask is all ones, erase the AND. (Currently, the target-independent + // code won't do this, so we have to do it explicitly to avoid an infinite + // loop in obscure cases.) + if (ExpandedMask == ~0U) + return TLO.CombineTo(Op, Op.getOperand(0)); + + auto IsLegalMask = [ShrunkMask, ExpandedMask](unsigned Mask) -> bool { + return (ShrunkMask & Mask) == ShrunkMask && (~ExpandedMask & Mask) == 0; + }; + auto UseMask = [Mask, Op, VT, &TLO](unsigned NewMask) -> bool { + if (NewMask == Mask) + return true; + SDLoc DL(Op); + SDValue NewC = TLO.DAG.getConstant(NewMask, DL, VT); + SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC); + return TLO.CombineTo(Op, NewOp); + }; + + // Prefer uxtb mask. + if (IsLegalMask(0xFF)) + return UseMask(0xFF); + + // Prefer uxth mask. + if (IsLegalMask(0xFFFF)) + return UseMask(0xFFFF); + + // [1, 255] is Thumb1 movs+ands, legal immediate for ARM/Thumb2. + // FIXME: Prefer a contiguous sequence of bits for other optimizations. + if (ShrunkMask < 256) + return UseMask(ShrunkMask); + + // [-256, -2] is Thumb1 movs+bics, legal immediate for ARM/Thumb2. + // FIXME: Prefer a contiguous sequence of bits for other optimizations. 
+ if ((int)ExpandedMask <= -2 && (int)ExpandedMask >= -256) + return UseMask(ExpandedMask); + + // Potential improvements: + // + // We could try to recognize lsls+lsrs or lsrs+lsls pairs here. + // We could try to prefer Thumb1 immediates which can be lowered to a + // two-instruction sequence. + // We could try to recognize more legal ARM/Thumb2 immediates here. + + return false; +} + + //===----------------------------------------------------------------------===// // ARM Inline Assembly Support //===----------------------------------------------------------------------===// @@ -14412,16 +14597,18 @@ ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { : AtomicExpansionKind::None; } -bool ARMTargetLowering::shouldExpandAtomicCmpXchgInIR( - AtomicCmpXchgInst *AI) const { +TargetLowering::AtomicExpansionKind +ARMTargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const { // At -O0, fast-regalloc cannot cope with the live vregs necessary to // implement cmpxchg without spilling. If the address being exchanged is also // on the stack and close enough to the spill slot, this can lead to a // situation where the monitor always gets cleared and the atomic operation // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead. - bool hasAtomicCmpXchg = + bool HasAtomicCmpXchg = !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps(); - return getTargetMachine().getOptLevel() != 0 && hasAtomicCmpXchg; + if (getTargetMachine().getOptLevel() != 0 && HasAtomicCmpXchg) + return AtomicExpansionKind::LLSC; + return AtomicExpansionKind::None; } bool ARMTargetLowering::shouldInsertFencesForAtomic( @@ -14548,6 +14735,11 @@ Value *ARMTargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val, Addr}); } + +bool ARMTargetLowering::alignLoopsWithOptSize() const { + return Subtarget->isMClass(); +} + /// A helper function for determining the number of interleaved accesses we /// will generate when lowering accesses of the given type. unsigned diff --git a/contrib/llvm/lib/Target/ARM/ARMISelLowering.h b/contrib/llvm/lib/Target/ARM/ARMISelLowering.h index 50b4c2977fb5..7a9fc739fc13 100644 --- a/contrib/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/contrib/llvm/lib/Target/ARM/ARMISelLowering.h @@ -85,6 +85,7 @@ class VectorType; FMSTAT, // ARM fmstat instruction. CMOV, // ARM conditional move instructions. + SUBS, // Flag-setting subtraction. SSAT, // Signed saturation USAT, // Unsigned saturation @@ -389,6 +390,9 @@ class VectorType; const SelectionDAG &DAG, unsigned Depth) const override; + bool targetShrinkDemandedConstant(SDValue Op, const APInt &Demanded, + TargetLoweringOpt &TLO) const override; + bool ExpandInlineAsm(CallInst *CI) const override; @@ -535,7 +539,8 @@ class VectorType; bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override; TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override; - bool shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override; + TargetLoweringBase::AtomicExpansionKind + shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override; bool useLoadStackGuardNode() const override; @@ -572,6 +577,8 @@ class VectorType; bool isLegalInterleavedAccessType(VectorType *VecTy, const DataLayout &DL) const; + bool alignLoopsWithOptSize() const override; + /// Returns the number of interleaved accesses that will be generated when /// lowering accesses of the given type. 
unsigned getNumInterleavedAccesses(VectorType *VecTy, @@ -583,6 +590,11 @@ class VectorType; unsigned getABIAlignmentForCallingConv(Type *ArgTy, DataLayout DL) const override; + bool isDesirableToCommuteWithShift(const SDNode *N, + CombineLevel Level) const override; + + bool shouldFoldShiftPairToMask(const SDNode *N, + CombineLevel Level) const override; protected: std::pair<const TargetRegisterClass *, uint8_t> findRepresentativeClass(const TargetRegisterInfo *TRI, @@ -685,6 +697,9 @@ class VectorType; unsigned getRegisterByName(const char* RegName, EVT VT, SelectionDAG &DAG) const override; + SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, + SmallVectorImpl<SDNode *> &Created) const override; + /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be /// expanded to FMAs when this method returns true, otherwise fmuladd is @@ -764,6 +779,8 @@ class VectorType; bool mayBeEmittedAsTailCall(const CallInst *CI) const override; + bool shouldConsiderGEPOffsetSplit() const override { return true; } + SDValue getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal, SDValue TrueVal, SDValue ARMcc, SDValue CCR, SDValue Cmp, SelectionDAG &DAG) const; diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrFormats.td b/contrib/llvm/lib/Target/ARM/ARMInstrFormats.td index 1d3b1414f090..0df48ba61299 100644 --- a/contrib/llvm/lib/Target/ARM/ARMInstrFormats.td +++ b/contrib/llvm/lib/Target/ARM/ARMInstrFormats.td @@ -2580,6 +2580,37 @@ class N3VLaneCP8<bit op23, bits<2> op21_20, bit op6, bit op4, let Inst{3-0} = Vm{3-0}; } +// In Armv8.2-A, some NEON instructions are added that encode Vn and Vm +// differently: +// if Q == ‘1’ then UInt(N:Vn) else UInt(Vn:N); +// if Q == ‘1’ then UInt(M:Vm) else UInt(Vm:M); +// Class N3VCP8 above describes the Q=1 case, and this class the Q=0 case. 
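A quick worked example of the encoding difference described above, using D13 = 0b01101 (illustrative only; this just restates the quoted pseudocode, it is not something added by the patch):

// Q = 1 (class N3VCP8): the 4-bit field holds Vn{3-0} = 0b1101 and the N bit
//   holds Vn{4} = 0, so the decoder reads UInt(N:Vn) = 0b01101 = 13.
// Q = 0 (class N3VCP8Q0 below): the 4-bit field holds Vn{4-1} = 0b0110 and the
//   N bit holds Vn{0} = 1, so the decoder reads UInt(Vn:N) = 0b01101 = 13.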
+class N3VCP8Q0<bits<2> op24_23, bits<2> op21_20, bit op6, bit op4, + dag oops, dag iops, InstrItinClass itin, + string opc, string dt, string asm, string cstr, list<dag> pattern> + : NeonInp<oops, iops, AddrModeNone, IndexModeNone, N3RegCplxFrm, itin, opc, dt, asm, cstr, pattern> { + bits<5> Vd; + bits<5> Vn; + bits<5> Vm; + + let DecoderNamespace = "VFPV8"; + // These have the same encodings in ARM and Thumb2 + let PostEncoderMethod = ""; + + let Inst{31-25} = 0b1111110; + let Inst{24-23} = op24_23; + let Inst{22} = Vd{4}; + let Inst{21-20} = op21_20; + let Inst{19-16} = Vn{4-1}; + let Inst{15-12} = Vd{3-0}; + let Inst{11-8} = 0b1000; + let Inst{7} = Vn{0}; + let Inst{6} = op6; + let Inst{5} = Vm{0}; + let Inst{4} = op4; + let Inst{3-0} = Vm{4-1}; +} + // Operand types for complex instructions class ComplexRotationOperand<int Angle, int Remainder, string Type, string Diag> : AsmOperandClass { diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp index 397c9dadb4ac..bcc31f5fa4cc 100644 --- a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp @@ -132,34 +132,6 @@ void ARMInstrInfo::expandLoadStackGuard(MachineBasicBlock::iterator MI) const { BuildMI(MBB, MI, DL, get(ARM::LDRi12), Reg) .addReg(Reg, RegState::Kill) .addImm(0) - .setMemRefs(MI->memoperands_begin(), MI->memoperands_end()) + .cloneMemRefs(*MI) .add(predOps(ARMCC::AL)); } - -std::pair<unsigned, unsigned> -ARMInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const { - const unsigned Mask = ARMII::MO_OPTION_MASK; - return std::make_pair(TF & Mask, TF & ~Mask); -} - -ArrayRef<std::pair<unsigned, const char *>> -ARMInstrInfo::getSerializableDirectMachineOperandTargetFlags() const { - using namespace ARMII; - - static const std::pair<unsigned, const char *> TargetFlags[] = { - {MO_LO16, "arm-lo16"}, {MO_HI16, "arm-hi16"}}; - return makeArrayRef(TargetFlags); -} - -ArrayRef<std::pair<unsigned, const char *>> -ARMInstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const { - using namespace ARMII; - - static const std::pair<unsigned, const char *> TargetFlags[] = { - {MO_GOT, "arm-got"}, - {MO_SBREL, "arm-sbrel"}, - {MO_DLLIMPORT, "arm-dllimport"}, - {MO_SECREL, "arm-secrel"}, - {MO_NONLAZY, "arm-nonlazy"}}; - return makeArrayRef(TargetFlags); -} diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.h b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.h index c54c987134df..c87fb97448c9 100644 --- a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.h +++ b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.h @@ -38,13 +38,6 @@ public: /// const ARMRegisterInfo &getRegisterInfo() const override { return RI; } - std::pair<unsigned, unsigned> - decomposeMachineOperandsTargetFlags(unsigned TF) const override; - ArrayRef<std::pair<unsigned, const char *>> - getSerializableDirectMachineOperandTargetFlags() const override; - ArrayRef<std::pair<unsigned, const char *>> - getSerializableBitmaskMachineOperandTargetFlags() const override; - private: void expandLoadStackGuard(MachineBasicBlock::iterator MI) const override; }; diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td index d4c342cee5c0..13abdc9687ec 100644 --- a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td +++ b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td @@ -144,6 +144,7 @@ def ARMintretflag : SDNode<"ARMISD::INTRET_FLAG", SDT_ARMcall, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; def ARMcmov : SDNode<"ARMISD::CMOV", SDT_ARMCMov, [SDNPInGlue]>; +def 
ARMsubs : SDNode<"ARMISD::SUBS", SDTIntBinOp, [SDNPOutGlue]>; def ARMssatnoshift : SDNode<"ARMISD::SSAT", SDTIntSatNoShOp, []>; @@ -221,6 +222,7 @@ def HasV4T : Predicate<"Subtarget->hasV4TOps()">, def NoV4T : Predicate<"!Subtarget->hasV4TOps()">; def HasV5T : Predicate<"Subtarget->hasV5TOps()">, AssemblerPredicate<"HasV5TOps", "armv5t">; +def NoV5T : Predicate<"!Subtarget->hasV5TOps()">; def HasV5TE : Predicate<"Subtarget->hasV5TEOps()">, AssemblerPredicate<"HasV5TEOps", "armv5te">; def HasV6 : Predicate<"Subtarget->hasV6Ops()">, @@ -255,6 +257,8 @@ def HasV8_3a : Predicate<"Subtarget->hasV8_3aOps()">, AssemblerPredicate<"HasV8_3aOps", "armv8.3a">; def HasV8_4a : Predicate<"Subtarget->hasV8_4aOps()">, AssemblerPredicate<"HasV8_4aOps", "armv8.4a">; +def HasV8_5a : Predicate<"Subtarget->hasV8_5aOps()">, + AssemblerPredicate<"HasV8_5aOps", "armv8.5a">; def NoVFP : Predicate<"!Subtarget->hasVFP2()">; def HasVFP2 : Predicate<"Subtarget->hasVFP2()">, AssemblerPredicate<"FeatureVFP2", "VFP2">; @@ -285,6 +289,8 @@ def HasFP16 : Predicate<"Subtarget->hasFP16()">, AssemblerPredicate<"FeatureFP16","half-float conversions">; def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">, AssemblerPredicate<"FeatureFullFP16","full half-float">; +def HasFP16FML : Predicate<"Subtarget->hasFP16FML()">, + AssemblerPredicate<"FeatureFP16FML","full half-float fml">; def HasDivideInThumb : Predicate<"Subtarget->hasDivideInThumbMode()">, AssemblerPredicate<"FeatureHWDivThumb", "divide in THUMB">; def HasDivideInARM : Predicate<"Subtarget->hasDivideInARMMode()">, @@ -351,23 +357,24 @@ def UseNegativeImmediates : let RecomputePerFunction = 1 in { def UseMovt : Predicate<"Subtarget->useMovt(*MF)">; def DontUseMovt : Predicate<"!Subtarget->useMovt(*MF)">; - def UseMovtInPic : Predicate<"Subtarget->useMovt(*MF) && Subtarget->allowPositionIndependentMovt()">; - def DontUseMovtInPic : Predicate<"!Subtarget->useMovt(*MF) || !Subtarget->allowPositionIndependentMovt()">; + def UseMovtInPic : Predicate<"Subtarget->useMovt(*MF) && Subtarget->allowPositionIndependentMovt()">; + def DontUseMovtInPic : Predicate<"!Subtarget->useMovt(*MF) || !Subtarget->allowPositionIndependentMovt()">; + + def UseFPVMLx: Predicate<"((Subtarget->useFPVMLx() &&" + " TM.Options.AllowFPOpFusion != FPOpFusion::Fast) ||" + "MF->getFunction().optForMinSize())">; } -def UseFPVMLx : Predicate<"Subtarget->useFPVMLx()">; def UseMulOps : Predicate<"Subtarget->useMulOps()">; // Prefer fused MAC for fp mul + add over fp VMLA / VMLS if they are available. -// But only select them if more precision in FP computation is allowed. +// But only select them if more precision in FP computation is allowed, and when +// they are not slower than a mul + add sequence. // Do not use them for Darwin platforms. 
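To see how the reworked UseFPVMLx above and the UseFusedMAC definition below now split the patterns between plain VMLA/VMLS and fused VFMA/VFMS, here is a rough boolean restatement (a sketch for exposition; the parameter names are flattened stand-ins for the Subtarget and TargetMachine queries in the real predicates):

// New UseFPVMLx: select the non-fused VMLA/VMLS patterns when the subtarget
// wants them and fast FP fusion is off, or whenever optimising for min size.
static bool selectsNonFusedVMLx(bool SubtargetUseFPVMLx, bool FPOpFusionFast,
                                bool MinSize) {
  return (SubtargetUseFPVMLx && !FPOpFusionFast) || MinSize;
}

// UseFusedMAC now also requires useFPVMLx(), which is what allows the old
// separate DontUseFusedMAC predicate to be dropped from the VMLA/VMLS
// patterns later in this diff.
static bool selectsFusedMAC(bool FPOpFusionFast, bool HasVFP4, bool IsDarwin,
                            bool SubtargetUseFPVMLx) {
  return FPOpFusionFast && HasVFP4 && !IsDarwin && SubtargetUseFPVMLx;
}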
def UseFusedMAC : Predicate<"(TM.Options.AllowFPOpFusion ==" " FPOpFusion::Fast && " " Subtarget->hasVFP4()) && " - "!Subtarget->isTargetDarwin()">; -def DontUseFusedMAC : Predicate<"!(TM.Options.AllowFPOpFusion ==" - " FPOpFusion::Fast &&" - " Subtarget->hasVFP4()) || " - "Subtarget->isTargetDarwin()">; + "!Subtarget->isTargetDarwin() &&" + "Subtarget->useFPVMLx()">; def HasFastVGETLNi32 : Predicate<"!Subtarget->hasSlowVGETLNi32()">; def HasSlowVGETLNi32 : Predicate<"Subtarget->hasSlowVGETLNi32()">; @@ -387,6 +394,10 @@ let RecomputePerFunction = 1 in { def GenExecuteOnly : Predicate<"Subtarget->genExecuteOnly()">; +// Armv8.5-A extensions +def HasSB : Predicate<"Subtarget->hasSB()">, + AssemblerPredicate<"FeatureSB", "sb">; + //===----------------------------------------------------------------------===// // ARM Flag Definitions. @@ -415,24 +426,22 @@ def imm16_31 : ImmLeaf<i32, [{ // sext_16_node predicate - True if the SDNode is sign-extended 16 or more bits. def sext_16_node : PatLeaf<(i32 GPR:$a), [{ - if (CurDAG->ComputeNumSignBits(SDValue(N,0)) >= 17) - return true; - - if (N->getOpcode() != ISD::SRA) - return false; - if (N->getOperand(0).getOpcode() != ISD::SHL) - return false; - - auto *ShiftVal = dyn_cast<ConstantSDNode>(N->getOperand(1)); - if (!ShiftVal || ShiftVal->getZExtValue() != 16) - return false; + return CurDAG->ComputeNumSignBits(SDValue(N,0)) >= 17; +}]>; - ShiftVal = dyn_cast<ConstantSDNode>(N->getOperand(0)->getOperand(1)); - if (!ShiftVal || ShiftVal->getZExtValue() != 16) - return false; +def sext_bottom_16 : PatFrag<(ops node:$a), + (sext_inreg node:$a, i16)>; +def sext_top_16 : PatFrag<(ops node:$a), + (i32 (sra node:$a, (i32 16)))>; - return true; -}]>; +def bb_mul : PatFrag<(ops node:$a, node:$b), + (mul (sext_bottom_16 node:$a), (sext_bottom_16 node:$b))>; +def bt_mul : PatFrag<(ops node:$a, node:$b), + (mul (sext_bottom_16 node:$a), (sra node:$b, (i32 16)))>; +def tb_mul : PatFrag<(ops node:$a, node:$b), + (mul (sra node:$a, (i32 16)), (sext_bottom_16 node:$b))>; +def tt_mul : PatFrag<(ops node:$a, node:$b), + (mul (sra node:$a, (i32 16)), (sra node:$b, (i32 16)))>; /// Split a 32-bit immediate into two 16 bit parts. def hi16 : SDNodeXForm<imm, [{ @@ -713,7 +722,20 @@ def arm_i32imm : PatLeaf<(imm), [{ if (Subtarget->useMovt(*MF)) return true; return ARM_AM::isSOImmTwoPartVal((unsigned)N->getZExtValue()); -}]>; +}]> { + // Ideally this would be an IntImmLeaf, but then we wouldn't have access to + // the MachineFunction. + let GISelPredicateCode = [{ + const auto &MF = *MI.getParent()->getParent(); + if (STI.useMovt(MF)) + return true; + + const auto &MO = MI.getOperand(1); + if (!MO.isCImm()) + return false; + return ARM_AM::isSOImmTwoPartVal(MO.getCImm()->getZExtValue()); + }]; +} /// imm0_1 predicate - Immediate in the range [0,1]. def Imm0_1AsmOperand: ImmAsmOperand<0,1> { let Name = "Imm0_1"; } @@ -2191,6 +2213,9 @@ def TRAP : AXI<(outs), (ins), MiscFrm, NoItinerary, let Inst = 0xe7ffdefe; } +def : Pat<(debugtrap), (BKPT 0)>, Requires<[IsARM, HasV5T]>; +def : Pat<(debugtrap), (UDF 254)>, Requires<[IsARM, NoV5T]>; + // Address computation and loads and stores in PIC mode. 
let isNotDuplicable = 1 in { def PICADD : ARMPseudoInst<(outs GPR:$dst), (ins GPR:$a, pclabel:$cp, pred:$p), @@ -3321,7 +3346,7 @@ multiclass arm_ldst_mult<string asm, string sfx, bit L_bit, bit P_bit, Format f, let hasSideEffects = 0 in { -let mayLoad = 1, hasExtraDefRegAllocReq = 1 in +let mayLoad = 1, hasExtraDefRegAllocReq = 1, variadicOpsAreDefs = 1 in defm LDM : arm_ldst_mult<"ldm", "", 1, 0, LdStMulFrm, IIC_iLoad_m, IIC_iLoad_mu>, ComplexDeprecationPredicate<"ARMLoad">; @@ -3519,10 +3544,14 @@ def : ARMV6Pat<(add rGPR:$Rn, (sext_inreg (srl rGPR:$Rm, imm8_or_16:$rot), def SXTB16 : AI_ext_rrot_np<0b01101000, "sxtb16">; def : ARMV6Pat<(int_arm_sxtb16 GPR:$Src), (SXTB16 GPR:$Src, 0)>; +def : ARMV6Pat<(int_arm_sxtb16 (rotr GPR:$Src, rot_imm:$rot)), + (SXTB16 GPR:$Src, rot_imm:$rot)>; def SXTAB16 : AI_exta_rrot_np<0b01101000, "sxtab16">; def : ARMV6Pat<(int_arm_sxtab16 GPR:$LHS, GPR:$RHS), (SXTAB16 GPR:$LHS, GPR:$RHS, 0)>; +def : ARMV6Pat<(int_arm_sxtab16 GPR:$LHS, (rotr GPR:$RHS, rot_imm:$rot)), + (SXTAB16 GPR:$LHS, GPR:$RHS, rot_imm:$rot)>; // Zero extenders @@ -3544,6 +3573,8 @@ def : ARMV6Pat<(and (srl GPR:$Src, (i32 8)), 0xFF00FF), (UXTB16 GPR:$Src, 1)>; def : ARMV6Pat<(int_arm_uxtb16 GPR:$Src), (UXTB16 GPR:$Src, 0)>; +def : ARMV6Pat<(int_arm_uxtb16 (rotr GPR:$Src, rot_imm:$rot)), + (UXTB16 GPR:$Src, rot_imm:$rot)>; def UXTAB : AI_exta_rrot<0b01101110, "uxtab", BinOpFrag<(add node:$LHS, (and node:$RHS, 0x00FF))>>; @@ -3560,6 +3591,8 @@ def : ARMV6Pat<(add rGPR:$Rn, (and (srl rGPR:$Rm, imm8_or_16:$rot), 0xFFFF)), def UXTAB16 : AI_exta_rrot_np<0b01101100, "uxtab16">; def : ARMV6Pat<(int_arm_uxtab16 GPR:$LHS, GPR:$RHS), (UXTAB16 GPR:$LHS, GPR:$RHS, 0)>; +def : ARMV6Pat<(int_arm_uxtab16 GPR:$LHS, (rotr GPR:$RHS, rot_imm:$rot)), + (UXTAB16 GPR:$LHS, GPR:$RHS, rot_imm:$rot)>; def SBFX : I<(outs GPRnopc:$Rd), @@ -3620,6 +3653,14 @@ let isAdd = 1 in defm ADDS : AsI1_bin_s_irs<IIC_iALUi, IIC_iALUr, IIC_iALUsr, ARMaddc, 1>; defm SUBS : AsI1_bin_s_irs<IIC_iALUi, IIC_iALUr, IIC_iALUsr, ARMsubc>; +def : ARMPat<(ARMsubs GPR:$Rn, mod_imm:$imm), (SUBSri $Rn, mod_imm:$imm)>; +def : ARMPat<(ARMsubs GPR:$Rn, GPR:$Rm), (SUBSrr $Rn, $Rm)>; +def : ARMPat<(ARMsubs GPR:$Rn, so_reg_imm:$shift), + (SUBSrsi $Rn, so_reg_imm:$shift)>; +def : ARMPat<(ARMsubs GPR:$Rn, so_reg_reg:$shift), + (SUBSrsr $Rn, so_reg_reg:$shift)>; + + let isAdd = 1 in defm ADC : AI1_adde_sube_irs<0b0101, "adc", ARMadde, 1>; defm SBC : AI1_adde_sube_irs<0b0110, "sbc", ARMsube>; @@ -4211,29 +4252,25 @@ def SMMLSR : AMul2Ia <0b0111010, 0b1111, (outs GPR:$Rd), multiclass AI_smul<string opc> { def BB : AMulxyI<0b0001011, 0b00, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), IIC_iMUL16, !strconcat(opc, "bb"), "\t$Rd, $Rn, $Rm", - [(set GPR:$Rd, (mul (sext_inreg GPR:$Rn, i16), - (sext_inreg GPR:$Rm, i16)))]>, + [(set GPR:$Rd, (bb_mul GPR:$Rn, GPR:$Rm))]>, Requires<[IsARM, HasV5TE]>, Sched<[WriteMUL16, ReadMUL, ReadMUL]>; def BT : AMulxyI<0b0001011, 0b10, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), IIC_iMUL16, !strconcat(opc, "bt"), "\t$Rd, $Rn, $Rm", - [(set GPR:$Rd, (mul (sext_inreg GPR:$Rn, i16), - (sra GPR:$Rm, (i32 16))))]>, + [(set GPR:$Rd, (bt_mul GPR:$Rn, GPR:$Rm))]>, Requires<[IsARM, HasV5TE]>, Sched<[WriteMUL16, ReadMUL, ReadMUL]>; def TB : AMulxyI<0b0001011, 0b01, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), IIC_iMUL16, !strconcat(opc, "tb"), "\t$Rd, $Rn, $Rm", - [(set GPR:$Rd, (mul (sra GPR:$Rn, (i32 16)), - (sext_inreg GPR:$Rm, i16)))]>, + [(set GPR:$Rd, (tb_mul GPR:$Rn, GPR:$Rm))]>, Requires<[IsARM, HasV5TE]>, Sched<[WriteMUL16, ReadMUL, ReadMUL]>; 
def TT : AMulxyI<0b0001011, 0b11, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), IIC_iMUL16, !strconcat(opc, "tt"), "\t$Rd, $Rn, $Rm", - [(set GPR:$Rd, (mul (sra GPR:$Rn, (i32 16)), - (sra GPR:$Rm, (i32 16))))]>, + [(set GPR:$Rd, (tt_mul GPR:$Rn, GPR:$Rm))]>, Requires<[IsARM, HasV5TE]>, Sched<[WriteMUL16, ReadMUL, ReadMUL]>; @@ -4257,35 +4294,31 @@ multiclass AI_smla<string opc> { (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra), IIC_iMAC16, !strconcat(opc, "bb"), "\t$Rd, $Rn, $Rm, $Ra", [(set GPRnopc:$Rd, (add GPR:$Ra, - (mul (sext_inreg GPRnopc:$Rn, i16), - (sext_inreg GPRnopc:$Rm, i16))))]>, + (bb_mul GPRnopc:$Rn, GPRnopc:$Rm)))]>, Requires<[IsARM, HasV5TE, UseMulOps]>, Sched<[WriteMAC16, ReadMUL, ReadMUL, ReadMAC]>; def BT : AMulxyIa<0b0001000, 0b10, (outs GPRnopc:$Rd), (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra), IIC_iMAC16, !strconcat(opc, "bt"), "\t$Rd, $Rn, $Rm, $Ra", - [(set GPRnopc:$Rd, - (add GPR:$Ra, (mul (sext_inreg GPRnopc:$Rn, i16), - (sra GPRnopc:$Rm, (i32 16)))))]>, + [(set GPRnopc:$Rd, (add GPR:$Ra, + (bt_mul GPRnopc:$Rn, GPRnopc:$Rm)))]>, Requires<[IsARM, HasV5TE, UseMulOps]>, Sched<[WriteMAC16, ReadMUL, ReadMUL, ReadMAC]>; def TB : AMulxyIa<0b0001000, 0b01, (outs GPRnopc:$Rd), (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra), IIC_iMAC16, !strconcat(opc, "tb"), "\t$Rd, $Rn, $Rm, $Ra", - [(set GPRnopc:$Rd, - (add GPR:$Ra, (mul (sra GPRnopc:$Rn, (i32 16)), - (sext_inreg GPRnopc:$Rm, i16))))]>, + [(set GPRnopc:$Rd, (add GPR:$Ra, + (tb_mul GPRnopc:$Rn, GPRnopc:$Rm)))]>, Requires<[IsARM, HasV5TE, UseMulOps]>, Sched<[WriteMAC16, ReadMUL, ReadMUL, ReadMAC]>; def TT : AMulxyIa<0b0001000, 0b11, (outs GPRnopc:$Rd), (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra), IIC_iMAC16, !strconcat(opc, "tt"), "\t$Rd, $Rn, $Rm, $Ra", - [(set GPRnopc:$Rd, - (add GPR:$Ra, (mul (sra GPRnopc:$Rn, (i32 16)), - (sra GPRnopc:$Rm, (i32 16)))))]>, + [(set GPRnopc:$Rd, (add GPR:$Ra, + (tt_mul GPRnopc:$Rn, GPRnopc:$Rm)))]>, Requires<[IsARM, HasV5TE, UseMulOps]>, Sched<[WriteMAC16, ReadMUL, ReadMUL, ReadMAC]>; @@ -4863,6 +4896,14 @@ def TSB : AInoP<(outs), (ins tsb_opt:$opt), MiscFrm, NoItinerary, } +// Armv8.5-A speculation barrier +def SB : AInoP<(outs), (ins), MiscFrm, NoItinerary, "sb", "", []>, + Requires<[IsARM, HasSB]>, Sched<[]> { + let Inst{31-0} = 0xf57ff070; + let Unpredictable = 0x000fff0f; + let hasSideEffects = 1; +} + let usesCustomInserter = 1, Defs = [CPSR] in { // Pseudo instruction that combines movs + predicated rsbmi @@ -4870,7 +4911,7 @@ let usesCustomInserter = 1, Defs = [CPSR] in { def ABS : ARMPseudoInst<(outs GPR:$dst), (ins GPR:$src), 8, NoItinerary, []>; } -let usesCustomInserter = 1 in { +let usesCustomInserter = 1, Defs = [CPSR] in { def COPY_STRUCT_BYVAL_I32 : PseudoInst< (outs), (ins GPR:$dst, GPR:$src, i32imm:$size, i32imm:$alignment), NoItinerary, @@ -5778,26 +5819,21 @@ def : ARMPat<(extloadi16 addrmodepc:$addr), (PICLDRH addrmodepc:$addr)>; // smul* and smla* def : ARMV5TEPat<(mul sext_16_node:$a, sext_16_node:$b), - (SMULBB GPR:$a, GPR:$b)>, - Sched<[WriteMUL32, ReadMUL, ReadMUL]>; -def : ARMV5TEPat<(mul sext_16_node:$a, (sra GPR:$b, (i32 16))), - (SMULBT GPR:$a, GPR:$b)>, - Sched<[WriteMUL32, ReadMUL, ReadMUL]>; -def : ARMV5TEPat<(mul (sra GPR:$a, (i32 16)), sext_16_node:$b), - (SMULTB GPR:$a, GPR:$b)>, - Sched<[WriteMUL32, ReadMUL, ReadMUL]>; -def : ARMV5MOPat<(add GPR:$acc, - (mul sext_16_node:$a, sext_16_node:$b)), - (SMLABB GPR:$a, GPR:$b, GPR:$acc)>, - Sched<[WriteMUL32, ReadMUL, ReadMUL]>; -def : ARMV5MOPat<(add GPR:$acc, - (mul sext_16_node:$a, (sra GPR:$b, (i32 16)))), - (SMLABT GPR:$a, GPR:$b, 
GPR:$acc)>, - Sched<[WriteMUL32, ReadMUL, ReadMUL]>; -def : ARMV5MOPat<(add GPR:$acc, - (mul (sra GPR:$a, (i32 16)), sext_16_node:$b)), - (SMLATB GPR:$a, GPR:$b, GPR:$acc)>, - Sched<[WriteMUL32, ReadMUL, ReadMUL]>; + (SMULBB GPR:$a, GPR:$b)>; +def : ARMV5TEPat<(mul sext_16_node:$a, (sext_bottom_16 GPR:$b)), + (SMULBB GPR:$a, GPR:$b)>; +def : ARMV5TEPat<(mul sext_16_node:$a, (sext_top_16 GPR:$b)), + (SMULBT GPR:$a, GPR:$b)>; +def : ARMV5TEPat<(mul (sext_top_16 GPR:$a), sext_16_node:$b), + (SMULTB GPR:$a, GPR:$b)>; +def : ARMV5MOPat<(add GPR:$acc, (mul sext_16_node:$a, sext_16_node:$b)), + (SMLABB GPR:$a, GPR:$b, GPR:$acc)>; +def : ARMV5MOPat<(add GPR:$acc, (mul sext_16_node:$a, (sext_bottom_16 GPR:$b))), + (SMLABB GPR:$a, GPR:$b, GPR:$acc)>; +def : ARMV5MOPat<(add GPR:$acc, (mul sext_16_node:$a, (sext_top_16 GPR:$b))), + (SMLABT GPR:$a, GPR:$b, GPR:$acc)>; +def : ARMV5MOPat<(add GPR:$acc, (mul (sext_top_16 GPR:$a), sext_16_node:$b)), + (SMLATB GPR:$a, GPR:$b, GPR:$acc)>; def : ARMV5TEPat<(int_arm_smulbb GPR:$a, GPR:$b), (SMULBB GPR:$a, GPR:$b)>; @@ -5902,6 +5938,8 @@ include "ARMInstrNEON.td" // Memory barriers def : InstAlias<"dmb", (DMB 0xf), 0>, Requires<[IsARM, HasDB]>; def : InstAlias<"dsb", (DSB 0xf), 0>, Requires<[IsARM, HasDB]>; +def : InstAlias<"ssbb", (DSB 0x0), 1>, Requires<[IsARM, HasDB]>; +def : InstAlias<"pssbb", (DSB 0x4), 1>, Requires<[IsARM, HasDB]>; def : InstAlias<"isb", (ISB 0xf), 0>, Requires<[IsARM, HasDB]>; // Armv8-R 'Data Full Barrier' def : InstAlias<"dfb", (DSB 0xc), 1>, Requires<[IsARM, HasDFB]>; diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td b/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td index 4525eec8da03..96986e74415b 100644 --- a/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td +++ b/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td @@ -4305,17 +4305,29 @@ def : Pat<(v4f32 (fmul (v4f32 QPR:$src1), (v2f32 (EXTRACT_SUBREG QPR:$src2, (DSubReg_i32_reg imm:$lane))), (SubReg_i32_lane imm:$lane)))>; - +def : Pat<(v8f16 (fmul (v8f16 QPR:$src1), + (v8f16 (NEONvduplane (v8f16 QPR:$src2), imm:$lane)))), + (v8f16 (VMULslhq(v8f16 QPR:$src1), + (v4f16 (EXTRACT_SUBREG QPR:$src2, + (DSubReg_i16_reg imm:$lane))), + (SubReg_i16_lane imm:$lane)))>; def : Pat<(v2f32 (fmul DPR:$Rn, (NEONvdup (f32 SPR:$Rm)))), (VMULslfd DPR:$Rn, (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$Rm, ssub_0), (i32 0))>; +def : Pat<(v4f16 (fmul DPR:$Rn, (NEONvdup (f16 HPR:$Rm)))), + (VMULslhd DPR:$Rn, + (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), HPR:$Rm, ssub_0), + (i32 0))>; def : Pat<(v4f32 (fmul QPR:$Rn, (NEONvdup (f32 SPR:$Rm)))), (VMULslfq QPR:$Rn, (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$Rm, ssub_0), (i32 0))>; - +def : Pat<(v8f16 (fmul QPR:$Rn, (NEONvdup (f16 HPR:$Rm)))), + (VMULslhq QPR:$Rn, + (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), HPR:$Rm, ssub_0), + (i32 0))>; // VQDMULH : Vector Saturating Doubling Multiply Returning High Half defm VQDMULH : N3VInt_HS<0, 0, 0b1011, 0, N3RegFrm, IIC_VMULi16D, IIC_VMULi32D, @@ -4390,16 +4402,16 @@ defm VMLA : N3VMulOp_QHS<0, 0, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D, IIC_VMACi16Q, IIC_VMACi32Q, "vmla", "i", add>; def VMLAfd : N3VDMulOp<0, 0, 0b00, 0b1101, 1, IIC_VMACD, "vmla", "f32", v2f32, fmul_su, fadd_mlx>, - Requires<[HasNEON, UseFPVMLx, DontUseFusedMAC]>; + Requires<[HasNEON, UseFPVMLx]>; def VMLAfq : N3VQMulOp<0, 0, 0b00, 0b1101, 1, IIC_VMACQ, "vmla", "f32", v4f32, fmul_su, fadd_mlx>, - Requires<[HasNEON, UseFPVMLx, DontUseFusedMAC]>; + Requires<[HasNEON, UseFPVMLx]>; def VMLAhd : N3VDMulOp<0, 0, 0b01, 0b1101, 1, IIC_VMACD, "vmla", "f16", v4f16, fmul_su, 
fadd_mlx>, - Requires<[HasNEON, HasFullFP16, UseFPVMLx, DontUseFusedMAC]>; + Requires<[HasNEON, HasFullFP16, UseFPVMLx]>; def VMLAhq : N3VQMulOp<0, 0, 0b01, 0b1101, 1, IIC_VMACQ, "vmla", "f16", v8f16, fmul_su, fadd_mlx>, - Requires<[HasNEON, HasFullFP16, UseFPVMLx, DontUseFusedMAC]>; + Requires<[HasNEON, HasFullFP16, UseFPVMLx]>; defm VMLAsl : N3VMulOpSL_HS<0b0000, IIC_VMACi16D, IIC_VMACi32D, IIC_VMACi16Q, IIC_VMACi32Q, "vmla", "i", add>; def VMLAslfd : N3VDMulOpSL<0b10, 0b0001, IIC_VMACD, "vmla", "f32", @@ -4620,16 +4632,16 @@ defm VMLS : N3VMulOp_QHS<1, 0, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D, IIC_VMACi16Q, IIC_VMACi32Q, "vmls", "i", sub>; def VMLSfd : N3VDMulOp<0, 0, 0b10, 0b1101, 1, IIC_VMACD, "vmls", "f32", v2f32, fmul_su, fsub_mlx>, - Requires<[HasNEON, UseFPVMLx, DontUseFusedMAC]>; + Requires<[HasNEON, UseFPVMLx]>; def VMLSfq : N3VQMulOp<0, 0, 0b10, 0b1101, 1, IIC_VMACQ, "vmls", "f32", v4f32, fmul_su, fsub_mlx>, - Requires<[HasNEON, UseFPVMLx, DontUseFusedMAC]>; + Requires<[HasNEON, UseFPVMLx]>; def VMLShd : N3VDMulOp<0, 0, 0b11, 0b1101, 1, IIC_VMACD, "vmls", "f16", v4f16, fmul, fsub>, - Requires<[HasNEON, HasFullFP16, UseFPVMLx, DontUseFusedMAC]>; + Requires<[HasNEON, HasFullFP16, UseFPVMLx]>; def VMLShq : N3VQMulOp<0, 0, 0b11, 0b1101, 1, IIC_VMACQ, "vmls", "f16", v8f16, fmul, fsub>, - Requires<[HasNEON, HasFullFP16, UseFPVMLx, DontUseFusedMAC]>; + Requires<[HasNEON, HasFullFP16, UseFPVMLx]>; defm VMLSsl : N3VMulOpSL_HS<0b0100, IIC_VMACi16D, IIC_VMACi32D, IIC_VMACi16Q, IIC_VMACi32Q, "vmls", "i", sub>; def VMLSslfd : N3VDMulOpSL<0b10, 0b0101, IIC_VMACD, "vmls", "f32", @@ -4734,6 +4746,12 @@ def VFMShq : N3VQMulOp<0, 0, 0b11, 0b1100, 1, IIC_VFMACQ, "vfms", "f16", Requires<[HasNEON,HasFullFP16,UseFusedMAC]>; // Match @llvm.fma.* intrinsics +def : Pat<(v4f16 (fma DPR:$Vn, DPR:$Vm, DPR:$src1)), + (VFMAhd DPR:$src1, DPR:$Vn, DPR:$Vm)>, + Requires<[HasNEON,HasFullFP16]>; +def : Pat<(v8f16 (fma QPR:$Vn, QPR:$Vm, QPR:$src1)), + (VFMAhq QPR:$src1, QPR:$Vn, QPR:$Vm)>, + Requires<[HasNEON,HasFullFP16]>; def : Pat<(v2f32 (fma DPR:$Vn, DPR:$Vm, DPR:$src1)), (VFMAfd DPR:$src1, DPR:$Vn, DPR:$Vm)>, Requires<[HasVFP4]>; @@ -5066,7 +5084,7 @@ def VACGThd : N3VDInt<1, 0, 0b11, 0b1110, 1, N3RegFrm, IIC_VBIND, "vacgt", "f16", v4i16, v4f16, int_arm_neon_vacgt, 0>, Requires<[HasNEON, HasFullFP16]>; def VACGThq : N3VQInt<1, 0, 0b11, 0b1110, 1, N3RegFrm, IIC_VBINQ, "vacgt", - "f16", v8f16, v8f16, int_arm_neon_vacgt, 0>, + "f16", v8i16, v8f16, int_arm_neon_vacgt, 0>, Requires<[HasNEON, HasFullFP16]>; // VTST : Vector Test Bits defm VTST : N3V_QHS<0, 0, 0b1000, 1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, @@ -5091,6 +5109,54 @@ def: NEONInstAlias<"vacle${p}.f16 $Vd, $Vn, $Vm", (VACGEhq QPR:$Vd, QPR:$Vm, QPR:$Vn, pred:$p)>; } +// +fp16fml Floating Point Multiplication Variants +let Predicates = [HasNEON, HasFP16FML], DecoderNamespace= "VFPV8" in { + +class N3VCP8F16Q1<string asm, RegisterClass Td, RegisterClass Tn, + RegisterClass Tm, bits<2> op1, bits<2> op2, bit op3> + : N3VCP8<op1, op2, 1, op3, (outs Td:$Vd), (ins Tn:$Vn, Tm:$Vm), NoItinerary, + asm, "f16", "$Vd, $Vn, $Vm", "", []>; + +class N3VCP8F16Q0<string asm, RegisterClass Td, RegisterClass Tn, + RegisterClass Tm, bits<2> op1, bits<2> op2, bit op3> + : N3VCP8Q0<op1, op2, 0, op3, (outs Td:$Vd), (ins Tn:$Vn, Tm:$Vm), NoItinerary, + asm, "f16", "$Vd, $Vn, $Vm", "", []>; + +class VFMQ0<string opc, bits<2> S> + : N3VLaneCP8<0, S, 0, 1, (outs DPR:$Vd), + (ins SPR:$Vn, SPR:$Vm, VectorIndex32:$idx), + IIC_VMACD, opc, "f16", "$Vd, $Vn, $Vm$idx", "", []> { + bit 
idx; + let Inst{3} = idx; + let Inst{19-16} = Vn{4-1}; + let Inst{7} = Vn{0}; + let Inst{5} = Vm{0}; + let Inst{2-0} = Vm{3-1}; +} + +class VFMQ1<string opc, bits<2> S> + : N3VLaneCP8<0, S, 1, 1, (outs QPR:$Vd), + (ins DPR:$Vn, DPR:$Vm, VectorIndex16:$idx), + IIC_VMACD, opc, "f16", "$Vd, $Vn, $Vm$idx", "", []> { + bits<2> idx; + let Inst{5} = idx{1}; + let Inst{3} = idx{0}; +} + +let hasNoSchedulingInfo = 1 in { +// op1 op2 op3 +def VFMALD : N3VCP8F16Q0<"vfmal", DPR, SPR, SPR, 0b00, 0b10, 1>; +def VFMSLD : N3VCP8F16Q0<"vfmsl", DPR, SPR, SPR, 0b01, 0b10, 1>; +def VFMALQ : N3VCP8F16Q1<"vfmal", QPR, DPR, DPR, 0b00, 0b10, 1>; +def VFMSLQ : N3VCP8F16Q1<"vfmsl", QPR, DPR, DPR, 0b01, 0b10, 1>; +def VFMALDI : VFMQ0<"vfmal", 0b00>; +def VFMSLDI : VFMQ0<"vfmsl", 0b01>; +def VFMALQI : VFMQ1<"vfmal", 0b00>; +def VFMSLQI : VFMQ1<"vfmsl", 0b01>; +} +} // HasNEON, HasFP16FML + + def: NEONInstAlias<"vaclt${p}.f32 $Vd, $Vm", (VACGTfd DPR:$Vd, DPR:$Vm, DPR:$Vd, pred:$p)>; def: NEONInstAlias<"vaclt${p}.f32 $Vd, $Vm", @@ -5455,17 +5521,17 @@ defm VMAXu : N3VInt_QHS<1, 0, 0b0110, 0, N3RegFrm, "vmax", "u", umax, 1>; def VMAXfd : N3VDInt<0, 0, 0b00, 0b1111, 0, N3RegFrm, IIC_VBIND, "vmax", "f32", - v2f32, v2f32, fmaxnan, 1>; + v2f32, v2f32, fmaximum, 1>; def VMAXfq : N3VQInt<0, 0, 0b00, 0b1111, 0, N3RegFrm, IIC_VBINQ, "vmax", "f32", - v4f32, v4f32, fmaxnan, 1>; + v4f32, v4f32, fmaximum, 1>; def VMAXhd : N3VDInt<0, 0, 0b01, 0b1111, 0, N3RegFrm, IIC_VBIND, "vmax", "f16", - v4f16, v4f16, fmaxnan, 1>, + v4f16, v4f16, fmaximum, 1>, Requires<[HasNEON, HasFullFP16]>; def VMAXhq : N3VQInt<0, 0, 0b01, 0b1111, 0, N3RegFrm, IIC_VBINQ, "vmax", "f16", - v8f16, v8f16, fmaxnan, 1>, + v8f16, v8f16, fmaximum, 1>, Requires<[HasNEON, HasFullFP16]>; // VMAXNM @@ -5497,17 +5563,17 @@ defm VMINu : N3VInt_QHS<1, 0, 0b0110, 1, N3RegFrm, "vmin", "u", umin, 1>; def VMINfd : N3VDInt<0, 0, 0b10, 0b1111, 0, N3RegFrm, IIC_VBIND, "vmin", "f32", - v2f32, v2f32, fminnan, 1>; + v2f32, v2f32, fminimum, 1>; def VMINfq : N3VQInt<0, 0, 0b10, 0b1111, 0, N3RegFrm, IIC_VBINQ, "vmin", "f32", - v4f32, v4f32, fminnan, 1>; + v4f32, v4f32, fminimum, 1>; def VMINhd : N3VDInt<0, 0, 0b11, 0b1111, 0, N3RegFrm, IIC_VBIND, "vmin", "f16", - v4f16, v4f16, fminnan, 1>, + v4f16, v4f16, fminimum, 1>, Requires<[HasNEON, HasFullFP16]>; def VMINhq : N3VQInt<0, 0, 0b11, 0b1111, 0, N3RegFrm, IIC_VBINQ, "vmin", "f16", - v8f16, v8f16, fminnan, 1>, + v8f16, v8f16, fminimum, 1>, Requires<[HasNEON, HasFullFP16]>; // VMINNM @@ -6318,6 +6384,9 @@ def VDUPLN32q : VDUPLNQ<{?,1,0,0}, "vdup", "32", v4i32, v2i32, VectorIndex32> { let Inst{19} = lane{0}; } +def : Pat<(v4f16 (NEONvduplane (v4f16 DPR:$Vm), imm:$lane)), + (VDUPLN32d DPR:$Vm, imm:$lane)>; + def : Pat<(v2f32 (NEONvduplane (v2f32 DPR:$Vm), imm:$lane)), (VDUPLN32d DPR:$Vm, imm:$lane)>; @@ -6332,6 +6401,10 @@ def : Pat<(v8i16 (NEONvduplane (v8i16 QPR:$src), imm:$lane)), (v8i16 (VDUPLN16q (v4i16 (EXTRACT_SUBREG QPR:$src, (DSubReg_i16_reg imm:$lane))), (SubReg_i16_lane imm:$lane)))>; +def : Pat<(v8f16 (NEONvduplane (v8f16 QPR:$src), imm:$lane)), + (v8f16 (VDUPLN16q (v4f16 (EXTRACT_SUBREG QPR:$src, + (DSubReg_i16_reg imm:$lane))), + (SubReg_i16_lane imm:$lane)))>; def : Pat<(v4i32 (NEONvduplane (v4i32 QPR:$src), imm:$lane)), (v4i32 (VDUPLN32q (v2i32 (EXTRACT_SUBREG QPR:$src, (DSubReg_i32_reg imm:$lane))), @@ -6341,12 +6414,18 @@ def : Pat<(v4f32 (NEONvduplane (v4f32 QPR:$src), imm:$lane)), (DSubReg_i32_reg imm:$lane))), (SubReg_i32_lane imm:$lane)))>; +def : Pat<(v4f16 (NEONvdup HPR:$src)), + (v4f16 (VDUPLN16d (INSERT_SUBREG 
(v4f16 (IMPLICIT_DEF)), + HPR:$src, ssub_0), (i32 0)))>; def : Pat<(v2f32 (NEONvdup (f32 SPR:$src))), (v2f32 (VDUPLN32d (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$src, ssub_0), (i32 0)))>; def : Pat<(v4f32 (NEONvdup (f32 SPR:$src))), (v4f32 (VDUPLN32q (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$src, ssub_0), (i32 0)))>; +def : Pat<(v8f16 (NEONvdup HPR:$src)), + (v8f16 (VDUPLN16q (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), + HPR:$src, ssub_0), (i32 0)))>; // VMOVN : Vector Narrowing Move defm VMOVN : N2VN_HSD<0b11,0b11,0b10,0b00100,0,0, IIC_VMOVN, @@ -6558,6 +6637,8 @@ def VREV64q8 : VREV64Q<0b00, "vrev64", "8", v16i8>; def VREV64q16 : VREV64Q<0b01, "vrev64", "16", v8i16>; def VREV64q32 : VREV64Q<0b10, "vrev64", "32", v4i32>; def : Pat<(v4f32 (NEONvrev64 (v4f32 QPR:$Vm))), (VREV64q32 QPR:$Vm)>; +def : Pat<(v8f16 (NEONvrev64 (v8f16 QPR:$Vm))), (VREV64q16 QPR:$Vm)>; +def : Pat<(v4f16 (NEONvrev64 (v4f16 DPR:$Vm))), (VREV64d16 DPR:$Vm)>; // VREV32 : Vector Reverse elements within 32-bit words @@ -6647,13 +6728,14 @@ def VEXTd16 : VEXTd<"vext", "16", v4i16, imm0_3> { let Inst{10-9} = index{1-0}; let Inst{8} = 0b0; } +def : Pat<(v4f16 (NEONvext (v4f16 DPR:$Vn), (v4f16 DPR:$Vm), (i32 imm:$index))), + (VEXTd16 DPR:$Vn, DPR:$Vm, imm:$index)>; + def VEXTd32 : VEXTd<"vext", "32", v2i32, imm0_1> { let Inst{10} = index{0}; let Inst{9-8} = 0b00; } -def : Pat<(v2f32 (NEONvext (v2f32 DPR:$Vn), - (v2f32 DPR:$Vm), - (i32 imm:$index))), +def : Pat<(v2f32 (NEONvext (v2f32 DPR:$Vn), (v2f32 DPR:$Vm), (i32 imm:$index))), (VEXTd32 DPR:$Vn, DPR:$Vm, imm:$index)>; def VEXTq8 : VEXTq<"vext", "8", v16i8, imm0_15> { @@ -6663,6 +6745,9 @@ def VEXTq16 : VEXTq<"vext", "16", v8i16, imm0_7> { let Inst{11-9} = index{2-0}; let Inst{8} = 0b0; } +def : Pat<(v8f16 (NEONvext (v8f16 QPR:$Vn), (v8f16 QPR:$Vm), (i32 imm:$index))), + (VEXTq16 QPR:$Vn, QPR:$Vm, imm:$index)>; + def VEXTq32 : VEXTq<"vext", "32", v4i32, imm0_3> { let Inst{11-10} = index{1-0}; let Inst{9-8} = 0b00; @@ -6671,9 +6756,7 @@ def VEXTq64 : VEXTq<"vext", "64", v2i64, imm0_1> { let Inst{11} = index{0}; let Inst{10-8} = 0b000; } -def : Pat<(v4f32 (NEONvext (v4f32 QPR:$Vn), - (v4f32 QPR:$Vm), - (i32 imm:$index))), +def : Pat<(v4f32 (NEONvext (v4f32 QPR:$Vn), (v4f32 QPR:$Vm), (i32 imm:$index))), (VEXTq32 QPR:$Vn, QPR:$Vm, imm:$index)>; // VTRN : Vector Transpose @@ -7001,19 +7084,19 @@ def : N3VSPat<fadd, VADDfd>; def : N3VSPat<fsub, VSUBfd>; def : N3VSPat<fmul, VMULfd>; def : N3VSMulOpPat<fmul, fadd, VMLAfd>, - Requires<[HasNEON, UseNEONForFP, UseFPVMLx, DontUseFusedMAC]>; + Requires<[HasNEON, UseNEONForFP, UseFPVMLx]>; def : N3VSMulOpPat<fmul, fsub, VMLSfd>, - Requires<[HasNEON, UseNEONForFP, UseFPVMLx, DontUseFusedMAC]>; + Requires<[HasNEON, UseNEONForFP, UseFPVMLx]>; def : N3VSMulOpPat<fmul, fadd, VFMAfd>, Requires<[HasVFP4, UseNEONForFP, UseFusedMAC]>; def : N3VSMulOpPat<fmul, fsub, VFMSfd>, Requires<[HasVFP4, UseNEONForFP, UseFusedMAC]>; def : N2VSPat<fabs, VABSfd>; def : N2VSPat<fneg, VNEGfd>; -def : N3VSPatFP16<fmaxnan, VMAXhd>, Requires<[HasFullFP16]>; -def : N3VSPatFP16<fminnan, VMINhd>, Requires<[HasFullFP16]>; -def : N3VSPat<fmaxnan, VMAXfd>, Requires<[HasNEON]>; -def : N3VSPat<fminnan, VMINfd>, Requires<[HasNEON]>; +def : N3VSPatFP16<fmaximum, VMAXhd>, Requires<[HasFullFP16]>; +def : N3VSPatFP16<fminimum, VMINhd>, Requires<[HasFullFP16]>; +def : N3VSPat<fmaximum, VMAXfd>, Requires<[HasNEON]>; +def : N3VSPat<fminimum, VMINfd>, Requires<[HasNEON]>; def : NVCVTFIPat<fp_to_sint, VCVTf2sd>; def : NVCVTFIPat<fp_to_uint, VCVTf2ud>; def : NVCVTIFPat<sint_to_fp, 
VCVTs2fd>; diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td b/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td index 88aab47a79bf..b20b34eaa6a9 100644 --- a/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td +++ b/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td @@ -781,7 +781,7 @@ defm tSTRH : thumb_st_rr_ri_enc<0b001, 0b1000, t_addrmode_rr, // These require base address to be written back or one of the loaded regs. let hasSideEffects = 0 in { -let mayLoad = 1, hasExtraDefRegAllocReq = 1 in +let mayLoad = 1, hasExtraDefRegAllocReq = 1, variadicOpsAreDefs = 1 in def tLDMIA : T1I<(outs), (ins tGPR:$Rn, pred:$p, reglist:$regs, variable_ops), IIC_iLoad_m, "ldm${p}\t$Rn, $regs", []>, T1Encoding<{1,1,0,0,1,?}> { bits<3> Rn; @@ -826,7 +826,8 @@ def : InstAlias<"ldm${p} $Rn!, $regs", (tLDMIA tGPR:$Rn, pred:$p, reglist:$regs), 0>, Requires<[IsThumb, IsThumb1Only]>; -let mayLoad = 1, Uses = [SP], Defs = [SP], hasExtraDefRegAllocReq = 1 in +let mayLoad = 1, Uses = [SP], Defs = [SP], hasExtraDefRegAllocReq = 1, + variadicOpsAreDefs = 1 in def tPOP : T1I<(outs), (ins pred:$p, reglist:$regs, variable_ops), IIC_iPop, "pop${p}\t$regs", []>, @@ -1343,8 +1344,20 @@ let hasPostISelHook = 1, Defs = [CPSR] in { tGPR:$Rm))]>, Requires<[IsThumb1Only]>, Sched<[WriteALU]>; + + def tRSBS : tPseudoInst<(outs tGPR:$Rd), (ins tGPR:$Rn), + 2, IIC_iALUr, + [(set tGPR:$Rd, CPSR, (ARMsubc 0, tGPR:$Rn))]>, + Requires<[IsThumb1Only]>, + Sched<[WriteALU]>; } + +def : T1Pat<(ARMsubs tGPR:$Rn, tGPR:$Rm), (tSUBSrr $Rn, $Rm)>; +def : T1Pat<(ARMsubs tGPR:$Rn, imm0_7:$imm3), (tSUBSi3 $Rn, imm0_7:$imm3)>; +def : T1Pat<(ARMsubs tGPR:$Rn, imm0_255:$imm8), (tSUBSi8 $Rn, imm0_255:$imm8)>; + + // Sign-extend byte def tSXTB : // A8.6.222 T1pIMiscEncode<{0,0,1,0,0,1,?}, (outs tGPR:$Rd), (ins tGPR:$Rm), @@ -1380,6 +1393,9 @@ def tUDF : TI<(outs), (ins imm0_255:$imm8), IIC_Br, "udf\t$imm8", let Inst{7-0} = imm8; } +def : Pat<(debugtrap), (tBKPT 0)>, Requires<[IsThumb, HasV5T]>; +def : Pat<(debugtrap), (tUDF 254)>, Requires<[IsThumb, NoV5T]>; + def t__brkdiv0 : TI<(outs), (ins), IIC_Br, "__brkdiv0", [(int_arm_undefined 249)]>, Encoding16, Requires<[IsThumb, IsWindows]> { diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td b/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td index f67075fbf9fd..7a6673b49d57 100644 --- a/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td +++ b/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td @@ -1775,7 +1775,7 @@ multiclass thumb2_ld_mult<string asm, InstrItinClass itin, let hasSideEffects = 0 in { -let mayLoad = 1, hasExtraDefRegAllocReq = 1 in +let mayLoad = 1, hasExtraDefRegAllocReq = 1, variadicOpsAreDefs = 1 in defm t2LDM : thumb2_ld_mult<"ldm", IIC_iLoad_m, IIC_iLoad_mu, 1>; multiclass thumb2_st_mult<string asm, InstrItinClass itin, @@ -1997,6 +1997,10 @@ def : Thumb2DSPPat<(int_arm_sxtb16 rGPR:$Rn), (t2SXTB16 rGPR:$Rn, 0)>; def : Thumb2DSPPat<(int_arm_sxtab16 rGPR:$Rn, rGPR:$Rm), (t2SXTAB16 rGPR:$Rn, rGPR:$Rm, 0)>; +def : Thumb2DSPPat<(int_arm_sxtb16 (rotr rGPR:$Rn, rot_imm:$rot)), + (t2SXTB16 rGPR:$Rn, rot_imm:$rot)>; +def : Thumb2DSPPat<(int_arm_sxtab16 rGPR:$Rn, (rotr rGPR:$Rm, rot_imm:$rot)), + (t2SXTAB16 rGPR:$Rn, rGPR:$Rm, rot_imm:$rot)>; // A simple right-shift can also be used in most cases (the exception is the @@ -2032,6 +2036,8 @@ def : Thumb2DSPPat<(and (rotr rGPR:$Rm, rot_imm:$rot), 0x00FF00FF), def : Thumb2DSPPat<(int_arm_uxtb16 rGPR:$Rm), (t2UXTB16 rGPR:$Rm, 0)>; +def : Thumb2DSPPat<(int_arm_uxtb16 (rotr rGPR:$Rn, rot_imm:$rot)), + (t2UXTB16 rGPR:$Rn, rot_imm:$rot)>; // FIXME: This pattern 
incorrectly assumes the shl operator is a rotate. // The transformation should probably be done as a combiner action @@ -2062,6 +2068,8 @@ def : Thumb2DSPPat<(add rGPR:$Rn, (and (srl rGPR:$Rm, imm8_or_16:$rot), (t2UXTAH rGPR:$Rn, rGPR:$Rm, rot_imm:$rot)>; def : Thumb2DSPPat<(int_arm_uxtab16 rGPR:$Rn, rGPR:$Rm), (t2UXTAB16 rGPR:$Rn, rGPR:$Rm, 0)>; +def : Thumb2DSPPat<(int_arm_uxtab16 rGPR:$Rn, (rotr rGPR:$Rm, rot_imm:$rot)), + (t2UXTAB16 rGPR:$Rn, rGPR:$Rm, rot_imm:$rot)>; } @@ -2086,6 +2094,12 @@ defm t2SUB : T2I_bin_ii12rs<0b101, "sub", sub>; defm t2ADDS : T2I_bin_s_irs <IIC_iALUi, IIC_iALUr, IIC_iALUsi, ARMaddc, 1>; defm t2SUBS : T2I_bin_s_irs <IIC_iALUi, IIC_iALUr, IIC_iALUsi, ARMsubc>; +def : T2Pat<(ARMsubs GPRnopc:$Rn, t2_so_imm:$imm), + (t2SUBSri $Rn, t2_so_imm:$imm)>; +def : T2Pat<(ARMsubs GPRnopc:$Rn, rGPR:$Rm), (t2SUBSrr $Rn, $Rm)>; +def : T2Pat<(ARMsubs GPRnopc:$Rn, t2_so_reg:$ShiftedRm), + (t2SUBSrs $Rn, t2_so_reg:$ShiftedRm)>; + let hasPostISelHook = 1 in { defm t2ADC : T2I_adde_sube_irs<0b1010, "adc", ARMadde, 1>; defm t2SBC : T2I_adde_sube_irs<0b1011, "sbc", ARMsube>; @@ -2718,28 +2732,25 @@ class T2ThreeRegSMUL<bits<3> op22_20, bits<2> op5_4, string opc, } def t2SMULBB : T2ThreeRegSMUL<0b001, 0b00, "smulbb", - [(set rGPR:$Rd, (mul (sext_inreg rGPR:$Rn, i16), - (sext_inreg rGPR:$Rm, i16)))]>; + [(set rGPR:$Rd, (bb_mul rGPR:$Rn, rGPR:$Rm))]>; def t2SMULBT : T2ThreeRegSMUL<0b001, 0b01, "smulbt", - [(set rGPR:$Rd, (mul (sext_inreg rGPR:$Rn, i16), - (sra rGPR:$Rm, (i32 16))))]>; + [(set rGPR:$Rd, (bt_mul rGPR:$Rn, rGPR:$Rm))]>; def t2SMULTB : T2ThreeRegSMUL<0b001, 0b10, "smultb", - [(set rGPR:$Rd, (mul (sra rGPR:$Rn, (i32 16)), - (sext_inreg rGPR:$Rm, i16)))]>; + [(set rGPR:$Rd, (tb_mul rGPR:$Rn, rGPR:$Rm))]>; def t2SMULTT : T2ThreeRegSMUL<0b001, 0b11, "smultt", - [(set rGPR:$Rd, (mul (sra rGPR:$Rn, (i32 16)), - (sra rGPR:$Rm, (i32 16))))]>; + [(set rGPR:$Rd, (tt_mul rGPR:$Rn, rGPR:$Rm))]>; def t2SMULWB : T2ThreeRegSMUL<0b011, 0b00, "smulwb", [(set rGPR:$Rd, (ARMsmulwb rGPR:$Rn, rGPR:$Rm))]>; def t2SMULWT : T2ThreeRegSMUL<0b011, 0b01, "smulwt", [(set rGPR:$Rd, (ARMsmulwt rGPR:$Rn, rGPR:$Rm))]>; -def : Thumb2DSPPat<(mul sext_16_node:$Rm, sext_16_node:$Rn), - (t2SMULBB rGPR:$Rm, rGPR:$Rn)>; -def : Thumb2DSPPat<(mul sext_16_node:$Rn, (sra rGPR:$Rm, (i32 16))), +def : Thumb2DSPPat<(mul sext_16_node:$Rn, (sext_bottom_16 rGPR:$Rm)), + (t2SMULBB rGPR:$Rn, rGPR:$Rm)>; +def : Thumb2DSPPat<(mul sext_16_node:$Rn, (sext_top_16 rGPR:$Rm)), (t2SMULBT rGPR:$Rn, rGPR:$Rm)>; -def : Thumb2DSPPat<(mul (sra rGPR:$Rn, (i32 16)), sext_16_node:$Rm), +def : Thumb2DSPPat<(mul (sext_top_16 rGPR:$Rn), sext_16_node:$Rm), (t2SMULTB rGPR:$Rn, rGPR:$Rm)>; + def : Thumb2DSPPat<(int_arm_smulbb rGPR:$Rn, rGPR:$Rm), (t2SMULBB rGPR:$Rn, rGPR:$Rm)>; def : Thumb2DSPPat<(int_arm_smulbt rGPR:$Rn, rGPR:$Rm), @@ -2767,18 +2778,13 @@ class T2FourRegSMLA<bits<3> op22_20, bits<2> op5_4, string opc, } def t2SMLABB : T2FourRegSMLA<0b001, 0b00, "smlabb", - [(set rGPR:$Rd, (add rGPR:$Ra, - (mul (sext_inreg rGPR:$Rn, i16), - (sext_inreg rGPR:$Rm, i16))))]>; + [(set rGPR:$Rd, (add rGPR:$Ra, (bb_mul rGPR:$Rn, rGPR:$Rm)))]>; def t2SMLABT : T2FourRegSMLA<0b001, 0b01, "smlabt", - [(set rGPR:$Rd, (add rGPR:$Ra, (mul (sext_inreg rGPR:$Rn, i16), - (sra rGPR:$Rm, (i32 16)))))]>; + [(set rGPR:$Rd, (add rGPR:$Ra, (bt_mul rGPR:$Rn, rGPR:$Rm)))]>; def t2SMLATB : T2FourRegSMLA<0b001, 0b10, "smlatb", - [(set rGPR:$Rd, (add rGPR:$Ra, (mul (sra rGPR:$Rn, (i32 16)), - (sext_inreg rGPR:$Rm, i16))))]>; + [(set rGPR:$Rd, (add rGPR:$Ra, (tb_mul 
rGPR:$Rn, rGPR:$Rm)))]>; def t2SMLATT : T2FourRegSMLA<0b001, 0b11, "smlatt", - [(set rGPR:$Rd, (add rGPR:$Ra, (mul (sra rGPR:$Rn, (i32 16)), - (sra rGPR:$Rm, (i32 16)))))]>; + [(set rGPR:$Rd, (add rGPR:$Ra, (tt_mul rGPR:$Rn, rGPR:$Rm)))]>; def t2SMLAWB : T2FourRegSMLA<0b011, 0b00, "smlawb", [(set rGPR:$Rd, (add rGPR:$Ra, (ARMsmulwb rGPR:$Rn, rGPR:$Rm)))]>; def t2SMLAWT : T2FourRegSMLA<0b011, 0b01, "smlawt", @@ -2786,11 +2792,14 @@ def t2SMLAWT : T2FourRegSMLA<0b011, 0b01, "smlawt", def : Thumb2DSPMulPat<(add rGPR:$Ra, (mul sext_16_node:$Rn, sext_16_node:$Rm)), (t2SMLABB rGPR:$Rn, rGPR:$Rm, rGPR:$Ra)>; -def : Thumb2DSPMulPat<(add rGPR:$Ra, - (mul sext_16_node:$Rn, (sra rGPR:$Rm, (i32 16)))), +def : Thumb2DSPMulPat<(add rGPR:$Ra, (mul sext_16_node:$Rn, + (sext_bottom_16 rGPR:$Rm))), + (t2SMLABB rGPR:$Rn, rGPR:$Rm, rGPR:$Ra)>; +def : Thumb2DSPMulPat<(add rGPR:$Ra, (mul sext_16_node:$Rn, + (sext_top_16 rGPR:$Rm))), (t2SMLABT rGPR:$Rn, rGPR:$Rm, rGPR:$Ra)>; -def : Thumb2DSPMulPat<(add rGPR:$Ra, - (mul (sra rGPR:$Rn, (i32 16)), sext_16_node:$Rm)), +def : Thumb2DSPMulPat<(add rGPR:$Ra, (mul (sext_top_16 rGPR:$Rn), + sext_16_node:$Rm)), (t2SMLATB rGPR:$Rn, rGPR:$Rm, rGPR:$Ra)>; def : Thumb2DSPPat<(int_arm_smlabb GPR:$a, GPR:$b, GPR:$acc), @@ -3223,6 +3232,14 @@ def t2TSB : T2I<(outs), (ins tsb_opt:$opt), NoItinerary, } } +// Armv8.5-A speculation barrier +def t2SB : Thumb2XI<(outs), (ins), AddrModeNone, 4, NoItinerary, "sb", "", []>, + Requires<[IsThumb2, HasSB]>, Sched<[]> { + let Inst{31-0} = 0xf3bf8f70; + let Unpredictable = 0x000f2f0f; + let hasSideEffects = 1; +} + class T2I_ldrex<bits<4> opcod, dag oops, dag iops, AddrMode am, int sz, InstrItinClass itin, string opc, string asm, string cstr, list<dag> pattern, bits<4> rt2 = 0b1111> @@ -4429,13 +4446,13 @@ def : T2Pat<(atomic_store_32 t2addrmode_negimm8:$addr, GPR:$val), def : T2Pat<(atomic_store_32 t2addrmode_so_reg:$addr, GPR:$val), (t2STRs GPR:$val, t2addrmode_so_reg:$addr)>; -let AddedComplexity = 8 in { - def : T2Pat<(atomic_load_acquire_8 addr_offset_none:$addr), (t2LDAB addr_offset_none:$addr)>; - def : T2Pat<(atomic_load_acquire_16 addr_offset_none:$addr), (t2LDAH addr_offset_none:$addr)>; - def : T2Pat<(atomic_load_acquire_32 addr_offset_none:$addr), (t2LDA addr_offset_none:$addr)>; - def : T2Pat<(atomic_store_release_8 addr_offset_none:$addr, GPR:$val), (t2STLB GPR:$val, addr_offset_none:$addr)>; - def : T2Pat<(atomic_store_release_16 addr_offset_none:$addr, GPR:$val), (t2STLH GPR:$val, addr_offset_none:$addr)>; - def : T2Pat<(atomic_store_release_32 addr_offset_none:$addr, GPR:$val), (t2STL GPR:$val, addr_offset_none:$addr)>; +let AddedComplexity = 8, Predicates = [IsThumb, HasAcquireRelease, HasV7Clrex] in { + def : Pat<(atomic_load_acquire_8 addr_offset_none:$addr), (t2LDAB addr_offset_none:$addr)>; + def : Pat<(atomic_load_acquire_16 addr_offset_none:$addr), (t2LDAH addr_offset_none:$addr)>; + def : Pat<(atomic_load_acquire_32 addr_offset_none:$addr), (t2LDA addr_offset_none:$addr)>; + def : Pat<(atomic_store_release_8 addr_offset_none:$addr, GPR:$val), (t2STLB GPR:$val, addr_offset_none:$addr)>; + def : Pat<(atomic_store_release_16 addr_offset_none:$addr, GPR:$val), (t2STLH GPR:$val, addr_offset_none:$addr)>; + def : Pat<(atomic_store_release_32 addr_offset_none:$addr, GPR:$val), (t2STL GPR:$val, addr_offset_none:$addr)>; } @@ -4538,6 +4555,12 @@ def : t2InstAlias<"tst${p} $Rn, $Rm", def : InstAlias<"dmb${p}", (t2DMB 0xf, pred:$p), 0>, Requires<[HasDB]>; def : InstAlias<"dsb${p}", (t2DSB 0xf, pred:$p), 0>, Requires<[HasDB]>; 
def : InstAlias<"isb${p}", (t2ISB 0xf, pred:$p), 0>, Requires<[HasDB]>; + +// Non-predicable aliases of a predicable DSB: the predicate is (14, 0) where +// 14 = AL (always execute) and 0 = "instruction doesn't read the CPSR". +def : InstAlias<"ssbb", (t2DSB 0x0, 14, 0), 1>, Requires<[HasDB, IsThumb2]>; +def : InstAlias<"pssbb", (t2DSB 0x4, 14, 0), 1>, Requires<[HasDB, IsThumb2]>; + // Armv8-R 'Data Full Barrier' def : InstAlias<"dfb${p}", (t2DSB 0xc, pred:$p), 1>, Requires<[HasDFB]>; diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td b/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td index 2f14b78c91fd..b58730c452f7 100644 --- a/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td +++ b/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td @@ -725,9 +725,11 @@ def VCVTBHD : ADuI<0b11101, 0b11, 0b0010, 0b01, 0, } def : FullFP16Pat<(f64 (fpextend HPR:$Sm)), - (VCVTBHD (COPY_TO_REGCLASS HPR:$Sm, SPR))>; + (VCVTBHD (COPY_TO_REGCLASS HPR:$Sm, SPR))>, + Requires<[HasFPARMv8, HasDPVFP]>; def : FP16Pat<(f64 (f16_to_fp GPR:$a)), - (VCVTBHD (COPY_TO_REGCLASS GPR:$a, SPR))>; + (VCVTBHD (COPY_TO_REGCLASS GPR:$a, SPR))>, + Requires<[HasFPARMv8, HasDPVFP]>; def VCVTBDH : ADuI<0b11101, 0b11, 0b0011, 0b01, 0, (outs SPR:$Sd), (ins DPR:$Dm), @@ -746,9 +748,11 @@ def VCVTBDH : ADuI<0b11101, 0b11, 0b0011, 0b01, 0, } def : FullFP16Pat<(f16 (fpround DPR:$Dm)), - (COPY_TO_REGCLASS (VCVTBDH DPR:$Dm), HPR)>; + (COPY_TO_REGCLASS (VCVTBDH DPR:$Dm), HPR)>, + Requires<[HasFPARMv8, HasDPVFP]>; def : FP16Pat<(fp_to_f16 (f64 DPR:$a)), - (i32 (COPY_TO_REGCLASS (VCVTBDH DPR:$a), GPR))>; + (i32 (COPY_TO_REGCLASS (VCVTBDH DPR:$a), GPR))>, + Requires<[HasFPARMv8, HasDPVFP]>; def VCVTTHD : ADuI<0b11101, 0b11, 0b0010, 0b11, 0, (outs DPR:$Dd), (ins SPR:$Sm), @@ -1810,7 +1814,7 @@ def VMLAD : ADbI<0b11100, 0b00, 0, 0, [(set DPR:$Dd, (fadd_mlx (fmul_su DPR:$Dn, DPR:$Dm), (f64 DPR:$Ddin)))]>, RegConstraint<"$Ddin = $Dd">, - Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>, + Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>, Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>; def VMLAS : ASbIn<0b11100, 0b00, 0, 0, @@ -1819,7 +1823,7 @@ def VMLAS : ASbIn<0b11100, 0b00, 0, 0, [(set SPR:$Sd, (fadd_mlx (fmul_su SPR:$Sn, SPR:$Sm), SPR:$Sdin))]>, RegConstraint<"$Sdin = $Sd">, - Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>, + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>, Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]> { // Some single precision VFP instructions may be executed on both NEON and // VFP pipelines on A8. 
@@ -1832,17 +1836,17 @@ def VMLAH : AHbI<0b11100, 0b00, 0, 0, [(set HPR:$Sd, (fadd_mlx (fmul_su HPR:$Sn, HPR:$Sm), HPR:$Sdin))]>, RegConstraint<"$Sdin = $Sd">, - Requires<[HasFullFP16,UseFPVMLx,DontUseFusedMAC]>; + Requires<[HasFullFP16,UseFPVMLx]>; def : Pat<(fadd_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))), (VMLAD DPR:$dstin, DPR:$a, DPR:$b)>, - Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>; + Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>; def : Pat<(fadd_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)), (VMLAS SPR:$dstin, SPR:$a, SPR:$b)>, - Requires<[HasVFP2,DontUseNEONForFP, UseFPVMLx,DontUseFusedMAC]>; + Requires<[HasVFP2,DontUseNEONForFP, UseFPVMLx]>; def : Pat<(fadd_mlx HPR:$dstin, (fmul_su HPR:$a, HPR:$b)), (VMLAH HPR:$dstin, HPR:$a, HPR:$b)>, - Requires<[HasFullFP16,DontUseNEONForFP, UseFPVMLx,DontUseFusedMAC]>; + Requires<[HasFullFP16,DontUseNEONForFP, UseFPVMLx]>; def VMLSD : ADbI<0b11100, 0b00, 1, 0, @@ -1851,7 +1855,7 @@ def VMLSD : ADbI<0b11100, 0b00, 1, 0, [(set DPR:$Dd, (fadd_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)), (f64 DPR:$Ddin)))]>, RegConstraint<"$Ddin = $Dd">, - Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>, + Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>, Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>; def VMLSS : ASbIn<0b11100, 0b00, 1, 0, @@ -1860,7 +1864,7 @@ def VMLSS : ASbIn<0b11100, 0b00, 1, 0, [(set SPR:$Sd, (fadd_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)), SPR:$Sdin))]>, RegConstraint<"$Sdin = $Sd">, - Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>, + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>, Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]> { // Some single precision VFP instructions may be executed on both NEON and // VFP pipelines on A8. @@ -1873,17 +1877,17 @@ def VMLSH : AHbI<0b11100, 0b00, 1, 0, [(set HPR:$Sd, (fadd_mlx (fneg (fmul_su HPR:$Sn, HPR:$Sm)), HPR:$Sdin))]>, RegConstraint<"$Sdin = $Sd">, - Requires<[HasFullFP16,UseFPVMLx,DontUseFusedMAC]>; + Requires<[HasFullFP16,UseFPVMLx]>; def : Pat<(fsub_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))), (VMLSD DPR:$dstin, DPR:$a, DPR:$b)>, - Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>; + Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>; def : Pat<(fsub_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)), (VMLSS SPR:$dstin, SPR:$a, SPR:$b)>, - Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>; + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>; def : Pat<(fsub_mlx HPR:$dstin, (fmul_su HPR:$a, HPR:$b)), (VMLSH HPR:$dstin, HPR:$a, HPR:$b)>, - Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>; + Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx]>; def VNMLAD : ADbI<0b11100, 0b01, 1, 0, (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm), @@ -1891,7 +1895,7 @@ def VNMLAD : ADbI<0b11100, 0b01, 1, 0, [(set DPR:$Dd,(fsub_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)), (f64 DPR:$Ddin)))]>, RegConstraint<"$Ddin = $Dd">, - Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>, + Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>, Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>; def VNMLAS : ASbI<0b11100, 0b01, 1, 0, @@ -1900,7 +1904,7 @@ def VNMLAS : ASbI<0b11100, 0b01, 1, 0, [(set SPR:$Sd, (fsub_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)), SPR:$Sdin))]>, RegConstraint<"$Sdin = $Sd">, - Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>, + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>, Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]> { // Some single precision VFP instructions may be executed on both NEON and // VFP pipelines on A8. 
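Only the Requires lists change in these hunks (DontUseFusedMAC is dropped); the arithmetic described by the fadd_mlx/fsub_mlx patterns is unchanged. As a rough reference, not part of the patch and with invented helper names, the forms seen so far compute the following (Sd is the result, Sdin the tied accumulator):

// Scalar reference semantics matching the DAG patterns above (illustrative only).
static float vmla_ref (float acc, float n, float m) { return acc + n * m; }     // vmla:  acc + n*m
static float vmls_ref (float acc, float n, float m) { return acc - n * m; }     // vmls:  acc - n*m
static float vnmla_ref(float acc, float n, float m) { return -(n * m) - acc; }  // vnmla: -(n*m) - acc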
@@ -1913,29 +1917,29 @@ def VNMLAH : AHbI<0b11100, 0b01, 1, 0, [(set HPR:$Sd, (fsub_mlx (fneg (fmul_su HPR:$Sn, HPR:$Sm)), HPR:$Sdin))]>, RegConstraint<"$Sdin = $Sd">, - Requires<[HasFullFP16,UseFPVMLx,DontUseFusedMAC]>; + Requires<[HasFullFP16,UseFPVMLx]>; // (-(a * b) - dst) -> -(dst + (a * b)) def : Pat<(fsub_mlx (fneg (fmul_su DPR:$a, (f64 DPR:$b))), DPR:$dstin), (VNMLAD DPR:$dstin, DPR:$a, DPR:$b)>, - Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>; + Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>; def : Pat<(fsub_mlx (fneg (fmul_su SPR:$a, SPR:$b)), SPR:$dstin), (VNMLAS SPR:$dstin, SPR:$a, SPR:$b)>, - Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>; + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>; def : Pat<(fsub_mlx (fneg (fmul_su HPR:$a, HPR:$b)), HPR:$dstin), (VNMLAH HPR:$dstin, HPR:$a, HPR:$b)>, - Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>; + Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx]>; // (-dst - (a * b)) -> -(dst + (a * b)) def : Pat<(fsub_mlx (fneg DPR:$dstin), (fmul_su DPR:$a, (f64 DPR:$b))), (VNMLAD DPR:$dstin, DPR:$a, DPR:$b)>, - Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>; + Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>; def : Pat<(fsub_mlx (fneg SPR:$dstin), (fmul_su SPR:$a, SPR:$b)), (VNMLAS SPR:$dstin, SPR:$a, SPR:$b)>, - Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>; + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>; def : Pat<(fsub_mlx (fneg HPR:$dstin), (fmul_su HPR:$a, HPR:$b)), (VNMLAH HPR:$dstin, HPR:$a, HPR:$b)>, - Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>; + Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx]>; def VNMLSD : ADbI<0b11100, 0b01, 0, 0, (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm), @@ -1943,7 +1947,7 @@ def VNMLSD : ADbI<0b11100, 0b01, 0, 0, [(set DPR:$Dd, (fsub_mlx (fmul_su DPR:$Dn, DPR:$Dm), (f64 DPR:$Ddin)))]>, RegConstraint<"$Ddin = $Dd">, - Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>, + Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>, Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>; def VNMLSS : ASbI<0b11100, 0b01, 0, 0, @@ -1951,7 +1955,7 @@ def VNMLSS : ASbI<0b11100, 0b01, 0, 0, IIC_fpMAC32, "vnmls", ".f32\t$Sd, $Sn, $Sm", [(set SPR:$Sd, (fsub_mlx (fmul_su SPR:$Sn, SPR:$Sm), SPR:$Sdin))]>, RegConstraint<"$Sdin = $Sd">, - Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>, + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>, Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]> { // Some single precision VFP instructions may be executed on both NEON and // VFP pipelines on A8. 
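The two comments above rely on plain negation identities: -(a*b) - d and -d - (a*b) both equal -(d + a*b), which is why both shapes can select VNMLA. A small standalone check of those identities follows; it is an illustration only (not part of the patch), with values chosen so a*b is exactly representable and the comparison holds bit-for-bit under default rounding:

#include <cassert>

int main() {
  // Each triple is {a, b, d}; products are exact, so the identities hold
  // regardless of whether the compiler contracts the multiply-add.
  const double samples[][3] = {{1.5, -2.25, 3.0}, {0.5, 8.0, -2.75}, {3.0, 7.0, 11.0}};
  for (const auto &s : samples) {
    const double a = s[0], b = s[1], d = s[2];
    assert(-(a * b) - d == -(d + a * b));  // (-(a * b) - dst) -> -(dst + (a * b))
    assert(-d - (a * b) == -(d + a * b));  // (-dst - (a * b)) -> -(dst + (a * b))
  }
  return 0;
}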
@@ -1963,17 +1967,17 @@ def VNMLSH : AHbI<0b11100, 0b01, 0, 0, IIC_fpMAC16, "vnmls", ".f16\t$Sd, $Sn, $Sm", [(set HPR:$Sd, (fsub_mlx (fmul_su HPR:$Sn, HPR:$Sm), HPR:$Sdin))]>, RegConstraint<"$Sdin = $Sd">, - Requires<[HasFullFP16,UseFPVMLx,DontUseFusedMAC]>; + Requires<[HasFullFP16,UseFPVMLx]>; def : Pat<(fsub_mlx (fmul_su DPR:$a, (f64 DPR:$b)), DPR:$dstin), (VNMLSD DPR:$dstin, DPR:$a, DPR:$b)>, - Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>; + Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>; def : Pat<(fsub_mlx (fmul_su SPR:$a, SPR:$b), SPR:$dstin), (VNMLSS SPR:$dstin, SPR:$a, SPR:$b)>, - Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>; + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>; def : Pat<(fsub_mlx (fmul_su HPR:$a, HPR:$b), HPR:$dstin), (VNMLSH HPR:$dstin, HPR:$a, HPR:$b)>, - Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>; + Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx]>; //===----------------------------------------------------------------------===// // Fused FP Multiply-Accumulate Operations. diff --git a/contrib/llvm/lib/Target/ARM/ARMInstructionSelector.cpp b/contrib/llvm/lib/Target/ARM/ARMInstructionSelector.cpp index 6692a4d41420..293e734c97cd 100644 --- a/contrib/llvm/lib/Target/ARM/ARMInstructionSelector.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMInstructionSelector.cpp @@ -76,6 +76,42 @@ private: const ARMRegisterBankInfo &RBI; const ARMSubtarget &STI; + // Store the opcodes that we might need, so we don't have to check what kind + // of subtarget (ARM vs Thumb) we have all the time. + struct OpcodeCache { + unsigned ZEXT16; + unsigned SEXT16; + + unsigned ZEXT8; + unsigned SEXT8; + + // Used for implementing ZEXT/SEXT from i1 + unsigned AND; + unsigned RSB; + + unsigned STORE32; + unsigned LOAD32; + + unsigned STORE16; + unsigned LOAD16; + + unsigned STORE8; + unsigned LOAD8; + + OpcodeCache(const ARMSubtarget &STI); + } const Opcodes; + + // Select the opcode for simple extensions (that translate to a single SXT/UXT + // instruction). Extension operations more complicated than that should not + // invoke this. Returns the original opcode if it doesn't know how to select a + // better one. + unsigned selectSimpleExtOpc(unsigned Opc, unsigned Size) const; + + // Select the opcode for simple loads and stores. Returns the original opcode + // if it doesn't know how to select a better one. + unsigned selectLoadStoreOpCode(unsigned Opc, unsigned RegBank, + unsigned Size) const; + #define GET_GLOBALISEL_PREDICATES_DECL #include "ARMGenGlobalISel.inc" #undef GET_GLOBALISEL_PREDICATES_DECL @@ -107,7 +143,7 @@ ARMInstructionSelector::ARMInstructionSelector(const ARMBaseTargetMachine &TM, const ARMSubtarget &STI, const ARMRegisterBankInfo &RBI) : InstructionSelector(), TII(*STI.getInstrInfo()), - TRI(*STI.getRegisterInfo()), TM(TM), RBI(RBI), STI(STI), + TRI(*STI.getRegisterInfo()), TM(TM), RBI(RBI), STI(STI), Opcodes(STI), #define GET_GLOBALISEL_PREDICATES_INIT #include "ARMGenGlobalISel.inc" #undef GET_GLOBALISEL_PREDICATES_INIT @@ -225,41 +261,63 @@ static bool selectUnmergeValues(MachineInstrBuilder &MIB, return true; } -/// Select the opcode for simple extensions (that translate to a single SXT/UXT -/// instruction). Extension operations more complicated than that should not -/// invoke this. Returns the original opcode if it doesn't know how to select a -/// better one. 
-static unsigned selectSimpleExtOpc(unsigned Opc, unsigned Size) { +ARMInstructionSelector::OpcodeCache::OpcodeCache(const ARMSubtarget &STI) { + bool isThumb = STI.isThumb(); + + using namespace TargetOpcode; + +#define STORE_OPCODE(VAR, OPC) VAR = isThumb ? ARM::t2##OPC : ARM::OPC + STORE_OPCODE(SEXT16, SXTH); + STORE_OPCODE(ZEXT16, UXTH); + + STORE_OPCODE(SEXT8, SXTB); + STORE_OPCODE(ZEXT8, UXTB); + + STORE_OPCODE(AND, ANDri); + STORE_OPCODE(RSB, RSBri); + + STORE_OPCODE(STORE32, STRi12); + STORE_OPCODE(LOAD32, LDRi12); + + // LDRH/STRH are special... + STORE16 = isThumb ? ARM::t2STRHi12 : ARM::STRH; + LOAD16 = isThumb ? ARM::t2LDRHi12 : ARM::LDRH; + + STORE_OPCODE(STORE8, STRBi12); + STORE_OPCODE(LOAD8, LDRBi12); +#undef MAP_OPCODE +} + +unsigned ARMInstructionSelector::selectSimpleExtOpc(unsigned Opc, + unsigned Size) const { using namespace TargetOpcode; if (Size != 8 && Size != 16) return Opc; if (Opc == G_SEXT) - return Size == 8 ? ARM::SXTB : ARM::SXTH; + return Size == 8 ? Opcodes.SEXT8 : Opcodes.SEXT16; if (Opc == G_ZEXT) - return Size == 8 ? ARM::UXTB : ARM::UXTH; + return Size == 8 ? Opcodes.ZEXT8 : Opcodes.ZEXT16; return Opc; } -/// Select the opcode for simple loads and stores. For types smaller than 32 -/// bits, the value will be zero extended. Returns the original opcode if it -/// doesn't know how to select a better one. -static unsigned selectLoadStoreOpCode(unsigned Opc, unsigned RegBank, - unsigned Size) { +unsigned ARMInstructionSelector::selectLoadStoreOpCode(unsigned Opc, + unsigned RegBank, + unsigned Size) const { bool isStore = Opc == TargetOpcode::G_STORE; if (RegBank == ARM::GPRRegBankID) { switch (Size) { case 1: case 8: - return isStore ? ARM::STRBi12 : ARM::LDRBi12; + return isStore ? Opcodes.STORE8 : Opcodes.LOAD8; case 16: - return isStore ? ARM::STRH : ARM::LDRH; + return isStore ? Opcodes.STORE16 : Opcodes.LOAD16; case 32: - return isStore ? ARM::STRi12 : ARM::LDRi12; + return isStore ? Opcodes.STORE32 : Opcodes.LOAD32; default: return Opc; } @@ -702,7 +760,7 @@ bool ARMInstructionSelector::select(MachineInstr &I, switch (SrcSize) { case 1: { // ZExt boils down to & 0x1; for SExt we also subtract that from 0 - I.setDesc(TII.get(ARM::ANDri)); + I.setDesc(TII.get(Opcodes.AND)); MIB.addImm(1).add(predOps(ARMCC::AL)).add(condCodeOp()); if (isSExt) { @@ -714,7 +772,7 @@ bool ARMInstructionSelector::select(MachineInstr &I, auto InsertBefore = std::next(I.getIterator()); auto SubI = - BuildMI(MBB, InsertBefore, I.getDebugLoc(), TII.get(ARM::RSBri)) + BuildMI(MBB, InsertBefore, I.getDebugLoc(), TII.get(Opcodes.RSB)) .addDef(SExtResult) .addUse(AndResult) .addImm(0) diff --git a/contrib/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp index 891418306903..4a0c24d58474 100644 --- a/contrib/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp @@ -75,13 +75,48 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) { const LLT s32 = LLT::scalar(32); const LLT s64 = LLT::scalar(64); - getActionDefinitionsBuilder(G_GLOBAL_VALUE).legalFor({p0}); - getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({p0}); + if (ST.isThumb1Only()) { + // Thumb1 is not supported yet. 
+ computeTables(); + verify(*ST.getInstrInfo()); + return; + } + + getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) + .legalForCartesianProduct({s32}, {s1, s8, s16}); getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR}) .legalFor({s32}) .minScalar(0, s32); + getActionDefinitionsBuilder(G_INTTOPTR).legalFor({{p0, s32}}); + getActionDefinitionsBuilder(G_PTRTOINT).legalFor({{s32, p0}}); + + getActionDefinitionsBuilder(G_CONSTANT) + .legalFor({s32, p0}) + .clampScalar(0, s32, s32); + + // We're keeping these builders around because we'll want to add support for + // floating point to them. + auto &LoadStoreBuilder = + getActionDefinitionsBuilder({G_LOAD, G_STORE}) + .legalForTypesWithMemSize({ + {s1, p0, 8}, + {s8, p0, 8}, + {s16, p0, 16}, + {s32, p0, 32}, + {p0, p0, 32}}); + + if (ST.isThumb()) { + // FIXME: merge with the code for non-Thumb. + computeTables(); + verify(*ST.getInstrInfo()); + return; + } + + getActionDefinitionsBuilder(G_GLOBAL_VALUE).legalFor({p0}); + getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({p0}); + if (ST.hasDivideInARMMode()) getActionDefinitionsBuilder({G_SDIV, G_UDIV}) .legalFor({s32}) @@ -101,14 +136,24 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) { setAction({Op, s32}, Libcall); } - getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) - .legalForCartesianProduct({s32}, {s1, s8, s16}); - - getActionDefinitionsBuilder(G_INTTOPTR).legalFor({{p0, s32}}); - getActionDefinitionsBuilder(G_PTRTOINT).legalFor({{s32, p0}}); - getActionDefinitionsBuilder({G_ASHR, G_LSHR, G_SHL}).legalFor({s32}); + if (ST.hasV5TOps()) { + getActionDefinitionsBuilder(G_CTLZ) + .legalFor({s32}) + .clampScalar(0, s32, s32); + getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF) + .lowerFor({s32}) + .clampScalar(0, s32, s32); + } else { + getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF) + .libcallFor({s32}) + .clampScalar(0, s32, s32); + getActionDefinitionsBuilder(G_CTLZ) + .lowerFor({s32}) + .clampScalar(0, s32, s32); + } + getActionDefinitionsBuilder(G_GEP).legalFor({{p0, s32}}); getActionDefinitionsBuilder(G_SELECT).legalForCartesianProduct({s32, p0}, @@ -116,20 +161,12 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) { getActionDefinitionsBuilder(G_BRCOND).legalFor({s1}); - getActionDefinitionsBuilder(G_CONSTANT) - .legalFor({s32, p0}) - .clampScalar(0, s32, s32); - getActionDefinitionsBuilder(G_ICMP) .legalForCartesianProduct({s1}, {s32, p0}) .minScalar(1, s32); // We're keeping these builders around because we'll want to add support for // floating point to them. 
- auto &LoadStoreBuilder = - getActionDefinitionsBuilder({G_LOAD, G_STORE}) - .legalForCartesianProduct({s1, s8, s16, s32, p0}, {p0}); - auto &PhiBuilder = getActionDefinitionsBuilder(G_PHI).legalFor({s32, p0}).minScalar(0, s32); @@ -302,7 +339,8 @@ ARMLegalizerInfo::getFCmpLibcalls(CmpInst::Predicate Predicate, bool ARMLegalizerInfo::legalizeCustom(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder) const { + MachineIRBuilder &MIRBuilder, + GISelChangeObserver &Observer) const { using namespace TargetOpcode; MIRBuilder.setInstr(MI); diff --git a/contrib/llvm/lib/Target/ARM/ARMLegalizerInfo.h b/contrib/llvm/lib/Target/ARM/ARMLegalizerInfo.h index 78ab9412c04b..527bf87f1093 100644 --- a/contrib/llvm/lib/Target/ARM/ARMLegalizerInfo.h +++ b/contrib/llvm/lib/Target/ARM/ARMLegalizerInfo.h @@ -15,6 +15,7 @@ #define LLVM_LIB_TARGET_ARM_ARMMACHINELEGALIZER_H #include "llvm/ADT/IndexedMap.h" +#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h" #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" #include "llvm/CodeGen/RuntimeLibcalls.h" #include "llvm/IR/Instructions.h" @@ -29,7 +30,8 @@ public: ARMLegalizerInfo(const ARMSubtarget &ST); bool legalizeCustom(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder) const override; + MachineIRBuilder &MIRBuilder, + GISelChangeObserver &Observer) const override; private: void setFCmpLibcallsGNU(); diff --git a/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index db5f28480e90..6da7430a8e51 100644 --- a/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -1027,6 +1027,18 @@ void ARMLoadStoreOpt::FormCandidates(const MemOpQueue &MemOps) { if (AssumeMisalignedLoadStores && !mayCombineMisaligned(*STI, *MI)) CanMergeToLSMulti = CanMergeToLSDouble = false; + // vldm / vstm limit are 32 for S variants, 16 for D variants. + unsigned Limit; + switch (Opcode) { + default: + Limit = UINT_MAX; + break; + case ARM::VLDRD: + case ARM::VSTRD: + Limit = 16; + break; + } + // Merge following instructions where possible. for (unsigned I = SIndex+1; I < EIndex; ++I, ++Count) { int NewOffset = MemOps[I].Offset; @@ -1036,6 +1048,8 @@ void ARMLoadStoreOpt::FormCandidates(const MemOpQueue &MemOps) { unsigned Reg = MO.getReg(); if (Reg == ARM::SP || Reg == ARM::PC) break; + if (Count == Limit) + break; // See if the current load/store may be part of a multi load/store. unsigned RegNum = MO.isUndef() ? std::numeric_limits<unsigned>::max() @@ -1303,7 +1317,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineInstr *MI) { MIB.add(MI->getOperand(OpNum)); // Transfer memoperands. - MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + MIB.setMemRefs(MI->memoperands()); MBB.erase(MBBI); return true; @@ -1527,7 +1541,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSDouble(MachineInstr &MI) const { // Transfer implicit operands. for (const MachineOperand &MO : MI.implicit_operands()) MIB.add(MO); - MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + MIB.setMemRefs(MI.memoperands()); MBB.erase(MBBI); return true; @@ -1834,7 +1848,7 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { auto LessThan = [](const MergeCandidate* M0, const MergeCandidate *M1) { return M0->InsertPos < M1->InsertPos; }; - llvm::sort(Candidates.begin(), Candidates.end(), LessThan); + llvm::sort(Candidates, LessThan); // Go through list of candidates and merge. 
bool Changed = false; @@ -2172,13 +2186,12 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB, bool RetVal = false; // Sort by offset (in reverse order). - llvm::sort(Ops.begin(), Ops.end(), - [](const MachineInstr *LHS, const MachineInstr *RHS) { - int LOffset = getMemoryOpOffset(*LHS); - int ROffset = getMemoryOpOffset(*RHS); - assert(LHS == RHS || LOffset != ROffset); - return LOffset > ROffset; - }); + llvm::sort(Ops, [](const MachineInstr *LHS, const MachineInstr *RHS) { + int LOffset = getMemoryOpOffset(*LHS); + int ROffset = getMemoryOpOffset(*RHS); + assert(LHS == RHS || LOffset != ROffset); + return LOffset > ROffset; + }); // The loads / stores of the same base are in order. Scan them from first to // last and check for the following: @@ -2290,7 +2303,7 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB, if (!isT2) MIB.addReg(0); MIB.addImm(Offset).addImm(Pred).addReg(PredReg); - MIB.setMemRefs(Op0->mergeMemRefsWith(*Op1)); + MIB.cloneMergedMemRefs({Op0, Op1}); LLVM_DEBUG(dbgs() << "Formed " << *MIB << "\n"); ++NumLDRDFormed; } else { @@ -2304,7 +2317,7 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB, if (!isT2) MIB.addReg(0); MIB.addImm(Offset).addImm(Pred).addReg(PredReg); - MIB.setMemRefs(Op0->mergeMemRefsWith(*Op1)); + MIB.cloneMergedMemRefs({Op0, Op1}); LLVM_DEBUG(dbgs() << "Formed " << *MIB << "\n"); ++NumSTRDFormed; } diff --git a/contrib/llvm/lib/Target/ARM/ARMMacroFusion.cpp b/contrib/llvm/lib/Target/ARM/ARMMacroFusion.cpp index d11fe9d5c502..df1da9d8e474 100644 --- a/contrib/llvm/lib/Target/ARM/ARMMacroFusion.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMMacroFusion.cpp @@ -23,20 +23,13 @@ namespace llvm { static bool isAESPair(const MachineInstr *FirstMI, const MachineInstr &SecondMI) { // Assume the 1st instr to be a wildcard if it is unspecified. - unsigned FirstOpcode = - FirstMI ? FirstMI->getOpcode() - : static_cast<unsigned>(ARM::INSTRUCTION_LIST_END); - unsigned SecondOpcode = SecondMI.getOpcode(); - - switch(SecondOpcode) { + switch(SecondMI.getOpcode()) { // AES encode. case ARM::AESMC : - return FirstOpcode == ARM::AESE || - FirstOpcode == ARM::INSTRUCTION_LIST_END; + return FirstMI == nullptr || FirstMI->getOpcode() == ARM::AESE; // AES decode. case ARM::AESIMC: - return FirstOpcode == ARM::AESD || - FirstOpcode == ARM::INSTRUCTION_LIST_END; + return FirstMI == nullptr || FirstMI->getOpcode() == ARM::AESD; } return false; @@ -46,15 +39,8 @@ static bool isAESPair(const MachineInstr *FirstMI, static bool isLiteralsPair(const MachineInstr *FirstMI, const MachineInstr &SecondMI) { // Assume the 1st instr to be a wildcard if it is unspecified. - unsigned FirstOpcode = - FirstMI ? FirstMI->getOpcode() - : static_cast<unsigned>(ARM::INSTRUCTION_LIST_END); - unsigned SecondOpcode = SecondMI.getOpcode(); - - // 32 bit immediate. 
- if ((FirstOpcode == ARM::INSTRUCTION_LIST_END || - FirstOpcode == ARM::MOVi16) && - SecondOpcode == ARM::MOVTi16) + if ((FirstMI == nullptr || FirstMI->getOpcode() == ARM::MOVi16) && + SecondMI.getOpcode() == ARM::MOVTi16) return true; return false; diff --git a/contrib/llvm/lib/Target/ARM/ARMMacroFusion.h b/contrib/llvm/lib/Target/ARM/ARMMacroFusion.h index 1e4fc6687eae..b3abd7b593a1 100644 --- a/contrib/llvm/lib/Target/ARM/ARMMacroFusion.h +++ b/contrib/llvm/lib/Target/ARM/ARMMacroFusion.h @@ -12,6 +12,9 @@ // //===----------------------------------------------------------------------===// +#ifndef LLVM_LIB_TARGET_ARM_ARMMACROFUSION_H +#define LLVM_LIB_TARGET_ARM_ARMMACROFUSION_H + #include "llvm/CodeGen/MachineScheduler.h" namespace llvm { @@ -22,3 +25,5 @@ namespace llvm { std::unique_ptr<ScheduleDAGMutation> createARMMacroFusionDAGMutation(); } // llvm + +#endif diff --git a/contrib/llvm/lib/Target/ARM/ARMParallelDSP.cpp b/contrib/llvm/lib/Target/ARM/ARMParallelDSP.cpp index 9d5478b76c18..fc3258914f92 100644 --- a/contrib/llvm/lib/Target/ARM/ARMParallelDSP.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMParallelDSP.cpp @@ -42,6 +42,10 @@ using namespace PatternMatch; STATISTIC(NumSMLAD , "Number of smlad instructions generated"); +static cl::opt<bool> +DisableParallelDSP("disable-arm-parallel-dsp", cl::Hidden, cl::init(false), + cl::desc("Disable the ARM Parallel DSP pass")); + namespace { struct OpChain; struct BinOpChain; @@ -67,7 +71,7 @@ namespace { virtual ~OpChain() = default; void SetMemoryLocations() { - const auto Size = MemoryLocation::UnknownSize; + const auto Size = LocationSize::unknown(); for (auto *V : AllValues) { if (auto *I = dyn_cast<Instruction>(V)) { if (I->mayWriteToMemory()) @@ -88,12 +92,15 @@ namespace { struct BinOpChain : public OpChain { ValueList LHS; // List of all (narrow) left hand operands. ValueList RHS; // List of all (narrow) right hand operands. + bool Exchange = false; BinOpChain(Instruction *I, ValueList &lhs, ValueList &rhs) : OpChain(I, lhs), LHS(lhs), RHS(rhs) { for (auto *V : RHS) AllValues.push_back(V); } + + bool AreSymmetrical(BinOpChain *Other); }; struct Reduction { @@ -101,9 +108,9 @@ namespace { // pattern matching. Instruction *AccIntAdd; // The accumulating integer add statement, // i.e, the reduction statement. - OpChainList MACCandidates; // The MAC candidates associated with // this reduction statement. + PMACPairList PMACPairs; Reduction (PHINode *P, Instruction *Acc) : Phi(P), AccIntAdd(Acc) { }; }; @@ -116,12 +123,16 @@ namespace { Loop *L; const DataLayout *DL; Module *M; + std::map<LoadInst*, LoadInst*> LoadPairs; + std::map<LoadInst*, SmallVector<LoadInst*, 4>> SequentialLoads; - bool InsertParallelMACs(Reduction &Reduction, PMACPairList &PMACPairs); + bool RecordSequentialLoads(BasicBlock *Header); + bool InsertParallelMACs(Reduction &Reduction); bool AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1, MemInstList &VecMem); - PMACPairList CreateParallelMACPairs(OpChainList &Candidates); + void CreateParallelMACPairs(Reduction &R); Instruction *CreateSMLADCall(LoadInst *VecLd0, LoadInst *VecLd1, - Instruction *Acc, Instruction *InsertAfter); + Instruction *Acc, bool Exchange, + Instruction *InsertAfter); /// Try to match and generate: SMLAD, SMLADX - Signed Multiply Accumulate /// Dual performs two signed 16x16-bit multiplications. 
It adds the @@ -149,6 +160,8 @@ namespace { } bool runOnLoop(Loop *TheLoop, LPPassManager &) override { + if (DisableParallelDSP) + return false; L = TheLoop; SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); @@ -192,7 +205,14 @@ namespace { LoopAccessInfo LAI(L, SE, TLI, AA, DT, LI); bool Changes = false; - LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n\n"); + LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n"); + LLVM_DEBUG(dbgs() << " - " << F.getName() << "\n\n"); + + if (!RecordSequentialLoads(Header)) { + LLVM_DEBUG(dbgs() << " - No sequential loads found.\n"); + return false; + } + Changes = MatchSMLAD(F); return Changes; } @@ -245,57 +265,14 @@ static bool IsNarrowSequence(Value *V, ValueList &VL) { return false; } -// Element-by-element comparison of Value lists returning true if they are -// instructions with the same opcode or constants with the same value. -static bool AreSymmetrical(const ValueList &VL0, - const ValueList &VL1) { - if (VL0.size() != VL1.size()) { - LLVM_DEBUG(dbgs() << "Muls are mismatching operand list lengths: " - << VL0.size() << " != " << VL1.size() << "\n"); - return false; - } - - const unsigned Pairs = VL0.size(); - LLVM_DEBUG(dbgs() << "Number of operand pairs: " << Pairs << "\n"); - - for (unsigned i = 0; i < Pairs; ++i) { - const Value *V0 = VL0[i]; - const Value *V1 = VL1[i]; - const auto *Inst0 = dyn_cast<Instruction>(V0); - const auto *Inst1 = dyn_cast<Instruction>(V1); - - LLVM_DEBUG(dbgs() << "Pair " << i << ":\n"; - dbgs() << "mul1: "; V0->dump(); - dbgs() << "mul2: "; V1->dump()); - - if (!Inst0 || !Inst1) - return false; - - if (Inst0->isSameOperationAs(Inst1)) { - LLVM_DEBUG(dbgs() << "OK: same operation found!\n"); - continue; - } - - const APInt *C0, *C1; - if (!(match(V0, m_APInt(C0)) && match(V1, m_APInt(C1)) && C0 == C1)) - return false; - } - - LLVM_DEBUG(dbgs() << "OK: found symmetrical operand lists.\n"); - return true; -} - template<typename MemInst> static bool AreSequentialAccesses(MemInst *MemOp0, MemInst *MemOp1, - MemInstList &VecMem, const DataLayout &DL, - ScalarEvolution &SE) { + const DataLayout &DL, ScalarEvolution &SE) { if (!MemOp0->isSimple() || !MemOp1->isSimple()) { LLVM_DEBUG(dbgs() << "No, not touching volatile access\n"); return false; } if (isConsecutiveAccess(MemOp0, MemOp1, DL, SE)) { - VecMem.push_back(MemOp0); - VecMem.push_back(MemOp1); LLVM_DEBUG(dbgs() << "OK: accesses are consecutive.\n"); return true; } @@ -318,82 +295,156 @@ bool ARMParallelDSP::AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1, return false; } - return AreSequentialAccesses<LoadInst>(Ld0, Ld1, VecMem, *DL, *SE); + if (!LoadPairs.count(Ld0) || LoadPairs[Ld0] != Ld1) + return false; + + VecMem.clear(); + VecMem.push_back(Ld0); + VecMem.push_back(Ld1); + return true; } -PMACPairList -ARMParallelDSP::CreateParallelMACPairs(OpChainList &Candidates) { +/// Iterate through the block and record base, offset pairs of loads as well as +/// maximal sequences of sequential loads. 
+bool ARMParallelDSP::RecordSequentialLoads(BasicBlock *Header) { + SmallVector<LoadInst*, 8> Loads; + for (auto &I : *Header) { + auto *Ld = dyn_cast<LoadInst>(&I); + if (!Ld) + continue; + Loads.push_back(Ld); + } + + std::map<LoadInst*, LoadInst*> BaseLoads; + + for (auto *Ld0 : Loads) { + for (auto *Ld1 : Loads) { + if (Ld0 == Ld1) + continue; + + if (AreSequentialAccesses<LoadInst>(Ld0, Ld1, *DL, *SE)) { + LoadPairs[Ld0] = Ld1; + if (BaseLoads.count(Ld0)) { + LoadInst *Base = BaseLoads[Ld0]; + BaseLoads[Ld1] = Base; + SequentialLoads[Base].push_back(Ld1); + } else { + BaseLoads[Ld1] = Ld0; + SequentialLoads[Ld0].push_back(Ld1); + } + } + } + } + return LoadPairs.size() > 1; +} + +void ARMParallelDSP::CreateParallelMACPairs(Reduction &R) { + OpChainList &Candidates = R.MACCandidates; + PMACPairList &PMACPairs = R.PMACPairs; const unsigned Elems = Candidates.size(); - PMACPairList PMACPairs; if (Elems < 2) - return PMACPairs; + return; - // TODO: for now we simply try to match consecutive pairs i and i+1. - // We can compare all elements, but then we need to compare and evaluate - // different solutions. - for(unsigned i=0; i<Elems-1; i+=2) { - BinOpChain *PMul0 = static_cast<BinOpChain*>(Candidates[i].get()); - BinOpChain *PMul1 = static_cast<BinOpChain*>(Candidates[i+1].get()); - const Instruction *Mul0 = PMul0->Root; - const Instruction *Mul1 = PMul1->Root; + auto CanPair = [&](BinOpChain *PMul0, BinOpChain *PMul1) { + if (!PMul0->AreSymmetrical(PMul1)) + return false; + + // The first elements of each vector should be loads with sexts. If we + // find that its two pairs of consecutive loads, then these can be + // transformed into two wider loads and the users can be replaced with + // DSP intrinsics. + for (unsigned x = 0; x < PMul0->LHS.size(); x += 2) { + auto *Ld0 = dyn_cast<LoadInst>(PMul0->LHS[x]); + auto *Ld1 = dyn_cast<LoadInst>(PMul1->LHS[x]); + auto *Ld2 = dyn_cast<LoadInst>(PMul0->RHS[x]); + auto *Ld3 = dyn_cast<LoadInst>(PMul1->RHS[x]); + + if (!Ld0 || !Ld1 || !Ld2 || !Ld3) + return false; - if (Mul0 == Mul1) + LLVM_DEBUG(dbgs() << "Looking at operands " << x << ":\n" + << "\t Ld0: " << *Ld0 << "\n" + << "\t Ld1: " << *Ld1 << "\n" + << "and operands " << x + 2 << ":\n" + << "\t Ld2: " << *Ld2 << "\n" + << "\t Ld3: " << *Ld3 << "\n"); + + if (AreSequentialLoads(Ld0, Ld1, PMul0->VecLd)) { + if (AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) { + LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n"); + PMACPairs.push_back(std::make_pair(PMul0, PMul1)); + return true; + } else if (AreSequentialLoads(Ld3, Ld2, PMul1->VecLd)) { + LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n"); + LLVM_DEBUG(dbgs() << " exchanging Ld2 and Ld3\n"); + PMul1->Exchange = true; + PMACPairs.push_back(std::make_pair(PMul0, PMul1)); + return true; + } + } else if (AreSequentialLoads(Ld1, Ld0, PMul0->VecLd) && + AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) { + LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n"); + LLVM_DEBUG(dbgs() << " exchanging Ld0 and Ld1\n"); + LLVM_DEBUG(dbgs() << " and swapping muls\n"); + PMul0->Exchange = true; + // Only the second operand can be exchanged, so swap the muls. 
+ PMACPairs.push_back(std::make_pair(PMul1, PMul0)); + return true; + } + } + return false; + }; + + SmallPtrSet<const Instruction*, 4> Paired; + for (unsigned i = 0; i < Elems; ++i) { + BinOpChain *PMul0 = static_cast<BinOpChain*>(Candidates[i].get()); + if (Paired.count(PMul0->Root)) continue; - LLVM_DEBUG(dbgs() << "\nCheck parallel muls:\n"; - dbgs() << "- "; Mul0->dump(); - dbgs() << "- "; Mul1->dump()); + for (unsigned j = 0; j < Elems; ++j) { + if (i == j) + continue; - const ValueList &Mul0_LHS = PMul0->LHS; - const ValueList &Mul0_RHS = PMul0->RHS; - const ValueList &Mul1_LHS = PMul1->LHS; - const ValueList &Mul1_RHS = PMul1->RHS; + BinOpChain *PMul1 = static_cast<BinOpChain*>(Candidates[j].get()); + if (Paired.count(PMul1->Root)) + continue; - if (!AreSymmetrical(Mul0_LHS, Mul1_LHS) || - !AreSymmetrical(Mul0_RHS, Mul1_RHS)) - continue; + const Instruction *Mul0 = PMul0->Root; + const Instruction *Mul1 = PMul1->Root; + if (Mul0 == Mul1) + continue; - LLVM_DEBUG(dbgs() << "OK: mul operands list match:\n"); - // The first elements of each vector should be loads with sexts. If we find - // that its two pairs of consecutive loads, then these can be transformed - // into two wider loads and the users can be replaced with DSP - // intrinsics. - for (unsigned x = 0; x < Mul0_LHS.size(); x += 2) { - auto *Ld0 = dyn_cast<LoadInst>(Mul0_LHS[x]); - auto *Ld1 = dyn_cast<LoadInst>(Mul1_LHS[x]); - auto *Ld2 = dyn_cast<LoadInst>(Mul0_RHS[x]); - auto *Ld3 = dyn_cast<LoadInst>(Mul1_RHS[x]); - - LLVM_DEBUG(dbgs() << "Looking at operands " << x << ":\n"; - dbgs() << "\t mul1: "; Mul0_LHS[x]->dump(); - dbgs() << "\t mul2: "; Mul1_LHS[x]->dump(); - dbgs() << "and operands " << x + 2 << ":\n"; - dbgs() << "\t mul1: "; Mul0_RHS[x]->dump(); - dbgs() << "\t mul2: "; Mul1_RHS[x]->dump()); - - if (AreSequentialLoads(Ld0, Ld1, PMul0->VecLd) && - AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) { - LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n"); - PMACPairs.push_back(std::make_pair(PMul0, PMul1)); + assert(PMul0 != PMul1 && "expected different chains"); + + LLVM_DEBUG(dbgs() << "\nCheck parallel muls:\n"; + dbgs() << "- "; Mul0->dump(); + dbgs() << "- "; Mul1->dump()); + + LLVM_DEBUG(dbgs() << "OK: mul operands list match:\n"); + if (CanPair(PMul0, PMul1)) { + Paired.insert(Mul0); + Paired.insert(Mul1); + break; } } } - return PMACPairs; } -bool ARMParallelDSP::InsertParallelMACs(Reduction &Reduction, - PMACPairList &PMACPairs) { +bool ARMParallelDSP::InsertParallelMACs(Reduction &Reduction) { Instruction *Acc = Reduction.Phi; Instruction *InsertAfter = Reduction.AccIntAdd; - for (auto &Pair : PMACPairs) { + for (auto &Pair : Reduction.PMACPairs) { + BinOpChain *PMul0 = Pair.first; + BinOpChain *PMul1 = Pair.second; LLVM_DEBUG(dbgs() << "Found parallel MACs!!\n"; - dbgs() << "- "; Pair.first->Root->dump(); - dbgs() << "- "; Pair.second->Root->dump()); - auto *VecLd0 = cast<LoadInst>(Pair.first->VecLd[0]); - auto *VecLd1 = cast<LoadInst>(Pair.second->VecLd[0]); - Acc = CreateSMLADCall(VecLd0, VecLd1, Acc, InsertAfter); + dbgs() << "- "; PMul0->Root->dump(); + dbgs() << "- "; PMul1->Root->dump()); + + auto *VecLd0 = cast<LoadInst>(PMul0->VecLd[0]); + auto *VecLd1 = cast<LoadInst>(PMul1->VecLd[0]); + Acc = CreateSMLADCall(VecLd0, VecLd1, Acc, PMul1->Exchange, InsertAfter); InsertAfter = Acc; } @@ -420,7 +471,7 @@ static void MatchReductions(Function &F, Loop *TheLoop, BasicBlock *Header, for (PHINode &Phi : Header->phis()) { const auto *Ty = Phi.getType(); - if (!Ty->isIntegerTy(32)) + if 
(!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64)) continue; const bool IsReduction = @@ -447,10 +498,11 @@ static void MatchReductions(Function &F, Loop *TheLoop, BasicBlock *Header, } static void AddMACCandidate(OpChainList &Candidates, - const Instruction *Acc, - Value *MulOp0, Value *MulOp1, int MulOpNum) { - Instruction *Mul = dyn_cast<Instruction>(Acc->getOperand(MulOpNum)); + Instruction *Mul, + Value *MulOp0, Value *MulOp1) { LLVM_DEBUG(dbgs() << "OK, found acc mul:\t"; Mul->dump()); + assert(Mul->getOpcode() == Instruction::Mul && + "expected mul instruction"); ValueList LHS; ValueList RHS; if (IsNarrowSequence<16>(MulOp0, LHS) && @@ -462,31 +514,38 @@ static void AddMACCandidate(OpChainList &Candidates, static void MatchParallelMACSequences(Reduction &R, OpChainList &Candidates) { - const Instruction *Acc = R.AccIntAdd; - Value *A, *MulOp0, *MulOp1; - LLVM_DEBUG(dbgs() << "\n- Analysing:\t"; Acc->dump()); - - // Pattern 1: the accumulator is the RHS of the mul. - while(match(Acc, m_Add(m_Mul(m_Value(MulOp0), m_Value(MulOp1)), - m_Value(A)))){ - AddMACCandidate(Candidates, Acc, MulOp0, MulOp1, 0); - Acc = dyn_cast<Instruction>(A); - } - // Pattern 2: the accumulator is the LHS of the mul. - while(match(Acc, m_Add(m_Value(A), - m_Mul(m_Value(MulOp0), m_Value(MulOp1))))) { - AddMACCandidate(Candidates, Acc, MulOp0, MulOp1, 1); - Acc = dyn_cast<Instruction>(A); - } + Instruction *Acc = R.AccIntAdd; + LLVM_DEBUG(dbgs() << "\n- Analysing:\t" << *Acc); - // The last mul in the chain has a slightly different pattern: - // the mul is the first operand - if (match(Acc, m_Add(m_Mul(m_Value(MulOp0), m_Value(MulOp1)), m_Value(A)))) - AddMACCandidate(Candidates, Acc, MulOp0, MulOp1, 0); + // Returns false to signal the search should be stopped. + std::function<bool(Value*)> Match = + [&Candidates, &Match](Value *V) -> bool { - // Because we start at the bottom of the chain, and we work our way up, - // the muls are added in reverse program order to the list. 
- std::reverse(Candidates.begin(), Candidates.end()); + auto *I = dyn_cast<Instruction>(V); + if (!I) + return false; + + switch (I->getOpcode()) { + case Instruction::Add: + if (Match(I->getOperand(0)) || (Match(I->getOperand(1)))) + return true; + break; + case Instruction::Mul: { + Value *MulOp0 = I->getOperand(0); + Value *MulOp1 = I->getOperand(1); + if (isa<SExtInst>(MulOp0) && isa<SExtInst>(MulOp1)) + AddMACCandidate(Candidates, I, MulOp0, MulOp1); + return false; + } + case Instruction::SExt: + return Match(I->getOperand(0)); + } + return false; + }; + + while (Match (Acc)); + LLVM_DEBUG(dbgs() << "Finished matching MAC sequences, found " + << Candidates.size() << " candidates.\n"); } // Collects all instructions that are not part of the MAC chains, which is the @@ -621,45 +680,100 @@ bool ARMParallelDSP::MatchSMLAD(Function &F) { for (auto &R : Reductions) { if (AreAliased(AA, Reads, Writes, R.MACCandidates)) return false; - PMACPairList PMACPairs = CreateParallelMACPairs(R.MACCandidates); - Changed |= InsertParallelMACs(R, PMACPairs); + CreateParallelMACPairs(R); + Changed |= InsertParallelMACs(R); } LLVM_DEBUG(if (Changed) dbgs() << "Header block:\n"; Header->dump();); return Changed; } -static void CreateLoadIns(IRBuilder<NoFolder> &IRB, Instruction *Acc, - LoadInst **VecLd) { - const Type *AccTy = Acc->getType(); - const unsigned AddrSpace = (*VecLd)->getPointerAddressSpace(); +static LoadInst *CreateLoadIns(IRBuilder<NoFolder> &IRB, LoadInst &BaseLoad, + const Type *LoadTy) { + const unsigned AddrSpace = BaseLoad.getPointerAddressSpace(); - Value *VecPtr = IRB.CreateBitCast((*VecLd)->getPointerOperand(), - AccTy->getPointerTo(AddrSpace)); - *VecLd = IRB.CreateAlignedLoad(VecPtr, (*VecLd)->getAlignment()); + Value *VecPtr = IRB.CreateBitCast(BaseLoad.getPointerOperand(), + LoadTy->getPointerTo(AddrSpace)); + return IRB.CreateAlignedLoad(VecPtr, BaseLoad.getAlignment()); } Instruction *ARMParallelDSP::CreateSMLADCall(LoadInst *VecLd0, LoadInst *VecLd1, - Instruction *Acc, + Instruction *Acc, bool Exchange, Instruction *InsertAfter) { - LLVM_DEBUG(dbgs() << "Create SMLAD intrinsic using:\n"; - dbgs() << "- "; VecLd0->dump(); - dbgs() << "- "; VecLd1->dump(); - dbgs() << "- "; Acc->dump()); + LLVM_DEBUG(dbgs() << "Create SMLAD intrinsic using:\n" + << "- " << *VecLd0 << "\n" + << "- " << *VecLd1 << "\n" + << "- " << *Acc << "\n" + << "Exchange: " << Exchange << "\n"); IRBuilder<NoFolder> Builder(InsertAfter->getParent(), ++BasicBlock::iterator(InsertAfter)); // Replace the reduction chain with an intrinsic call - CreateLoadIns(Builder, Acc, &VecLd0); - CreateLoadIns(Builder, Acc, &VecLd1); - Value* Args[] = { VecLd0, VecLd1, Acc }; - Function *SMLAD = Intrinsic::getDeclaration(M, Intrinsic::arm_smlad); + const Type *Ty = IntegerType::get(M->getContext(), 32); + LoadInst *NewLd0 = CreateLoadIns(Builder, VecLd0[0], Ty); + LoadInst *NewLd1 = CreateLoadIns(Builder, VecLd1[0], Ty); + Value* Args[] = { NewLd0, NewLd1, Acc }; + Function *SMLAD = nullptr; + if (Exchange) + SMLAD = Acc->getType()->isIntegerTy(32) ? + Intrinsic::getDeclaration(M, Intrinsic::arm_smladx) : + Intrinsic::getDeclaration(M, Intrinsic::arm_smlaldx); + else + SMLAD = Acc->getType()->isIntegerTy(32) ? + Intrinsic::getDeclaration(M, Intrinsic::arm_smlad) : + Intrinsic::getDeclaration(M, Intrinsic::arm_smlald); CallInst *Call = Builder.CreateCall(SMLAD, Args); NumSMLAD++; return Call; } +// Compare the value lists in Other to this chain. 
+bool BinOpChain::AreSymmetrical(BinOpChain *Other) { + // Element-by-element comparison of Value lists returning true if they are + // instructions with the same opcode or constants with the same value. + auto CompareValueList = [](const ValueList &VL0, + const ValueList &VL1) { + if (VL0.size() != VL1.size()) { + LLVM_DEBUG(dbgs() << "Muls are mismatching operand list lengths: " + << VL0.size() << " != " << VL1.size() << "\n"); + return false; + } + + const unsigned Pairs = VL0.size(); + LLVM_DEBUG(dbgs() << "Number of operand pairs: " << Pairs << "\n"); + + for (unsigned i = 0; i < Pairs; ++i) { + const Value *V0 = VL0[i]; + const Value *V1 = VL1[i]; + const auto *Inst0 = dyn_cast<Instruction>(V0); + const auto *Inst1 = dyn_cast<Instruction>(V1); + + LLVM_DEBUG(dbgs() << "Pair " << i << ":\n"; + dbgs() << "mul1: "; V0->dump(); + dbgs() << "mul2: "; V1->dump()); + + if (!Inst0 || !Inst1) + return false; + + if (Inst0->isSameOperationAs(Inst1)) { + LLVM_DEBUG(dbgs() << "OK: same operation found!\n"); + continue; + } + + const APInt *C0, *C1; + if (!(match(V0, m_APInt(C0)) && match(V1, m_APInt(C1)) && C0 == C1)) + return false; + } + + LLVM_DEBUG(dbgs() << "OK: found symmetrical operand lists.\n"); + return true; + }; + + return CompareValueList(LHS, Other->LHS) && + CompareValueList(RHS, Other->RHS); +} + Pass *llvm::createARMParallelDSPPass() { return new ARMParallelDSP(); } diff --git a/contrib/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp index 0e16d6bcfe2b..4f28f2dafc70 100644 --- a/contrib/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp @@ -234,6 +234,7 @@ ARMRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case G_GEP: case G_INTTOPTR: case G_PTRTOINT: + case G_CTLZ: // FIXME: We're abusing the fact that everything lives in a GPR for now; in // the real world we would use different mappings. OperandsMapping = &ARM::ValueMappings[ARM::GPR3OpsIdx]; diff --git a/contrib/llvm/lib/Target/ARM/ARMSubtarget.cpp b/contrib/llvm/lib/Target/ARM/ARMSubtarget.cpp index f42cbbda1b71..b1d0761e3231 100644 --- a/contrib/llvm/lib/Target/ARM/ARMSubtarget.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMSubtarget.cpp @@ -188,8 +188,10 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { assert(hasV6T2Ops() || !hasThumb2()); // Execute only support requires movt support - if (genExecuteOnly()) - assert(hasV8MBaselineOps() && !NoMovt && "Cannot generate execute-only code for this target"); + if (genExecuteOnly()) { + NoMovt = false; + assert(hasV8MBaselineOps() && "Cannot generate execute-only code for this target"); + } // Keep a pointer to static instruction cost data for the specified CPU. SchedModel = getSchedModelForCPU(CPUString); @@ -287,7 +289,13 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { case CortexR7: case CortexM3: case CortexR52: - case ExynosM1: + break; + case Exynos: + LdStMultipleTiming = SingleIssuePlusExtras; + MaxInterleaveFactor = 4; + if (!isThumb()) + PrefLoopAlignment = 3; + break; case Kryo: break; case Krait: @@ -370,7 +378,8 @@ bool ARMSubtarget::useStride4VFPs(const MachineFunction &MF) const { // For general targets, the prologue can grow when VFPs are allocated with // stride 4 (more vpush instructions). But WatchOS uses a compact unwind // format which it's more important to get right. 
- return isTargetWatchABI() || (isSwift() && !MF.getFunction().optForMinSize()); + return isTargetWatchABI() || + (useWideStrideVFP() && !MF.getFunction().optForMinSize()); } bool ARMSubtarget::useMovt(const MachineFunction &MF) const { diff --git a/contrib/llvm/lib/Target/ARM/ARMSubtarget.h b/contrib/llvm/lib/Target/ARM/ARMSubtarget.h index 74aee9a8ed38..11841b4467a2 100644 --- a/contrib/llvm/lib/Target/ARM/ARMSubtarget.h +++ b/contrib/llvm/lib/Target/ARM/ARMSubtarget.h @@ -68,7 +68,7 @@ protected: CortexR5, CortexR52, CortexR7, - ExynosM1, + Exynos, Krait, Kryo, Swift @@ -106,6 +106,7 @@ protected: ARMv82a, ARMv83a, ARMv84a, + ARMv85a, ARMv8a, ARMv8mBaseline, ARMv8mMainline, @@ -153,6 +154,7 @@ protected: bool HasV8_2aOps = false; bool HasV8_3aOps = false; bool HasV8_4aOps = false; + bool HasV8_5aOps = false; bool HasV8MBaselineOps = false; bool HasV8MMainlineOps = false; @@ -227,6 +229,9 @@ protected: /// HasFullFP16 - True if subtarget supports half-precision FP operations bool HasFullFP16 = false; + /// HasFP16FML - True if subtarget supports half-precision FP fml operations + bool HasFP16FML = false; + /// HasD16 - True if subtarget is limited to 16 double precision /// FP registers for VFPv3. bool HasD16 = false; @@ -353,6 +358,9 @@ protected: /// If true, loading into a D subregister will be penalized. bool SlowLoadDSubregister = false; + /// If true, use a wider stride when allocating VFP registers. + bool UseWideStrideVFP = false; + /// If true, the AGU and NEON/FPU units are multiplexed. bool HasMuxedUnits = false; @@ -408,6 +416,9 @@ protected: /// UseSjLjEH - If true, the target uses SjLj exception handling (e.g. iOS). bool UseSjLjEH = false; + /// Has speculation barrier + bool HasSB = false; + /// Implicitly convert an instruction to a different one if its immediates /// cannot be encoded. For example, ADD r0, r1, #FFFFFFFF -> SUB r0, r1, #1. bool NegativeImmediates = true; @@ -432,6 +443,9 @@ protected: /// operand cycle returned by the itinerary data for pre-ISel operands. int PreISelOperandLatencyAdjustment = 2; + /// What alignment is preferred for loop bodies, in log2(bytes). 
+ unsigned PrefLoopAlignment = 0; + /// IsLittle - The target is Little Endian bool IsLittle; @@ -529,6 +543,7 @@ public: bool hasV8_2aOps() const { return HasV8_2aOps; } bool hasV8_3aOps() const { return HasV8_3aOps; } bool hasV8_4aOps() const { return HasV8_4aOps; } + bool hasV8_5aOps() const { return HasV8_5aOps; } bool hasV8MBaselineOps() const { return HasV8MBaselineOps; } bool hasV8MMainlineOps() const { return HasV8MMainlineOps; } @@ -596,6 +611,7 @@ public: bool hasVMLxHazards() const { return HasVMLxHazards; } bool hasSlowOddRegister() const { return SlowOddRegister; } bool hasSlowLoadDSubregister() const { return SlowLoadDSubregister; } + bool useWideStrideVFP() const { return UseWideStrideVFP; } bool hasMuxedUnits() const { return HasMuxedUnits; } bool dontWidenVMOVS() const { return DontWidenVMOVS; } bool useSplatVFPToNeon() const { return SplatVFPToNeon; } @@ -612,12 +628,14 @@ public: bool hasDSP() const { return HasDSP; } bool useNaClTrap() const { return UseNaClTrap; } bool useSjLjEH() const { return UseSjLjEH; } + bool hasSB() const { return HasSB; } bool genLongCalls() const { return GenLongCalls; } bool genExecuteOnly() const { return GenExecuteOnly; } bool hasFP16() const { return HasFP16; } bool hasD16() const { return HasD16; } bool hasFullFP16() const { return HasFullFP16; } + bool hasFP16FML() const { return HasFP16FML; } bool hasFuseAES() const { return HasFuseAES; } bool hasFuseLiterals() const { return HasFuseLiterals; } @@ -796,6 +814,10 @@ public: bool allowPositionIndependentMovt() const { return isROPI() || !isTargetELF(); } + + unsigned getPrefLoopAlignment() const { + return PrefLoopAlignment; + } }; } // end namespace llvm diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp index 519f789fc215..ec02c840d5e1 100644 --- a/contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp @@ -194,12 +194,6 @@ static Reloc::Model getEffectiveRelocModel(const Triple &TT, return *RM; } -static CodeModel::Model getEffectiveCodeModel(Optional<CodeModel::Model> CM) { - if (CM) - return *CM; - return CodeModel::Small; -} - /// Create an ARM architecture model. 
/// ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, const Triple &TT, @@ -210,7 +204,7 @@ ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, const Triple &TT, CodeGenOpt::Level OL, bool isLittle) : LLVMTargetMachine(T, computeDataLayout(TT, CPU, Options, isLittle), TT, CPU, FS, Options, getEffectiveRelocModel(TT, RM), - getEffectiveCodeModel(CM), OL), + getEffectiveCodeModel(CM, CodeModel::Small), OL), TargetABI(computeTargetABI(TT, CPU, Options)), TLOF(createTLOF(getTargetTriple())), isLittle(isLittle) { diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp b/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp index d0620761ea9c..9c13359cba71 100644 --- a/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp @@ -32,7 +32,8 @@ void ARMElfTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM) { const ARMBaseTargetMachine &ARM_TM = static_cast<const ARMBaseTargetMachine &>(TM); bool isAAPCS_ABI = ARM_TM.TargetABI == ARMBaseTargetMachine::ARMABI::ARM_ABI_AAPCS; - // genExecuteOnly = ARM_TM.getSubtargetImpl()->genExecuteOnly(); + bool genExecuteOnly = + ARM_TM.getMCSubtargetInfo()->hasFeature(ARM::FeatureExecuteOnly); TargetLoweringObjectFileELF::Initialize(Ctx, TM); InitializeELF(isAAPCS_ABI); @@ -40,6 +41,17 @@ void ARMElfTargetObjectFile::Initialize(MCContext &Ctx, if (isAAPCS_ABI) { LSDASection = nullptr; } + + // Make code section unreadable when in execute-only mode + if (genExecuteOnly) { + unsigned Type = ELF::SHT_PROGBITS; + unsigned Flags = + ELF::SHF_EXECINSTR | ELF::SHF_ALLOC | ELF::SHF_ARM_PURECODE; + // Since we cannot modify flags for an existing section, we create a new + // section with the right flags, and use 0 as the unique ID for + // execute-only text + TextSection = Ctx.getELFSection(".text", Type, Flags, 0, "", 0U); + } } const MCExpr *ARMElfTargetObjectFile::getTTypeGlobalReference( diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index 94f9cefe429c..f72bb8632eb7 100644 --- a/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -77,8 +77,8 @@ int ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { return 1; return ST->hasV6T2Ops() ? 2 : 3; } - // Thumb1. - if (SImmVal >= 0 && SImmVal < 256) + // Thumb1, any i8 imm cost 1. + if (Bits == 8 || (SImmVal >= 0 && SImmVal < 256)) return 1; if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal)) return 2; @@ -400,10 +400,29 @@ int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE, int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp) { - // We only handle costs of reverse and select shuffles for now. - if (Kind != TTI::SK_Reverse && Kind != TTI::SK_Select) - return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); + if (Kind == TTI::SK_Broadcast) { + static const CostTblEntry NEONDupTbl[] = { + // VDUP handles these cases. 
+ {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1}, + + {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}}; + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); + + if (const auto *Entry = CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, + LT.second)) + return LT.first * Entry->Cost; + + return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); + } if (Kind == TTI::SK_Reverse) { static const CostTblEntry NEONShuffleTbl[] = { // Reverse shuffle cost one instruction if we are shuffling within a @@ -412,6 +431,8 @@ int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1}, {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1}, {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2}, {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2}, @@ -542,14 +563,17 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, unsigned Alignment, - unsigned AddressSpace) { + unsigned AddressSpace, + bool UseMaskForCond, + bool UseMaskForGaps) { assert(Factor >= 2 && "Invalid interleave factor"); assert(isa<VectorType>(VecTy) && "Expect a vector type"); // vldN/vstN doesn't support vector types of i64/f64 element. bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64; - if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits) { + if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits && + !UseMaskForCond && !UseMaskForGaps) { unsigned NumElts = VecTy->getVectorNumElements(); auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor); @@ -562,7 +586,8 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, } return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace); + Alignment, AddressSpace, + UseMaskForCond, UseMaskForGaps); } void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.h index e0cd2d8e26a6..2dd143d48a15 100644 --- a/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -57,7 +57,7 @@ class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> { const FeatureBitset InlineFeatureWhitelist = { ARM::FeatureVFP2, ARM::FeatureVFP3, ARM::FeatureNEON, ARM::FeatureThumb2, ARM::FeatureFP16, ARM::FeatureVFP4, ARM::FeatureFPARMv8, - ARM::FeatureFullFP16, ARM::FeatureHWDivThumb, + ARM::FeatureFullFP16, ARM::FeatureFP16FML, ARM::FeatureHWDivThumb, ARM::FeatureHWDivARM, ARM::FeatureDB, ARM::FeatureV7Clrex, ARM::FeatureAcquireRelease, ARM::FeatureSlowFPBrcc, ARM::FeaturePerfMon, ARM::FeatureTrustZone, ARM::Feature8MSecExt, @@ -169,7 +169,9 @@ public: int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, unsigned Alignment, - unsigned AddressSpace); + unsigned AddressSpace, + bool UseMaskForCond = false, + bool UseMaskForGaps = false); void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP); diff --git 
a/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index a5fbbbf26be9..3832b0112b87 100644 --- a/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -8,6 +8,7 @@ //===----------------------------------------------------------------------===// #include "ARMFeatures.h" +#include "InstPrinter/ARMInstPrinter.h" #include "Utils/ARMBaseInfo.h" #include "MCTargetDesc/ARMAddressingModes.h" #include "MCTargetDesc/ARMBaseInfo.h" @@ -631,6 +632,8 @@ public: void ReportNearMisses(SmallVectorImpl<NearMissInfo> &NearMisses, SMLoc IDLoc, OperandVector &Operands); + void doBeforeLabelEmit(MCSymbol *Symbol) override; + void onLabelParsed(MCSymbol *Symbol) override; }; @@ -3203,17 +3206,26 @@ public: } // end anonymous namespace. void ARMOperand::print(raw_ostream &OS) const { + auto RegName = [](unsigned Reg) { + if (Reg) + return ARMInstPrinter::getRegisterName(Reg); + else + return "noreg"; + }; + switch (Kind) { case k_CondCode: OS << "<ARMCC::" << ARMCondCodeToString(getCondCode()) << ">"; break; case k_CCOut: - OS << "<ccout " << getReg() << ">"; + OS << "<ccout " << RegName(getReg()) << ">"; break; case k_ITCondMask: { static const char *const MaskStr[] = { - "()", "(t)", "(e)", "(tt)", "(et)", "(te)", "(ee)", "(ttt)", "(ett)", - "(tet)", "(eet)", "(tte)", "(ete)", "(tee)", "(eee)" + "(invalid)", "(teee)", "(tee)", "(teet)", + "(te)", "(tete)", "(tet)", "(tett)", + "(t)", "(ttee)", "(tte)", "(ttet)", + "(tt)", "(ttte)", "(ttt)", "(tttt)" }; assert((ITMask.Mask & 0xf) == ITMask.Mask); OS << "<it-mask " << MaskStr[ITMask.Mask] << ">"; @@ -3247,13 +3259,25 @@ void ARMOperand::print(raw_ostream &OS) const { OS << "<ARM_TSB::" << TraceSyncBOptToString(getTraceSyncBarrierOpt()) << ">"; break; case k_Memory: - OS << "<memory " - << " base:" << Memory.BaseRegNum; + OS << "<memory"; + if (Memory.BaseRegNum) + OS << " base:" << RegName(Memory.BaseRegNum); + if (Memory.OffsetImm) + OS << " offset-imm:" << *Memory.OffsetImm; + if (Memory.OffsetRegNum) + OS << " offset-reg:" << (Memory.isNegative ? "-" : "") + << RegName(Memory.OffsetRegNum); + if (Memory.ShiftType != ARM_AM::no_shift) { + OS << " shift-type:" << ARM_AM::getShiftOpcStr(Memory.ShiftType); + OS << " shift-imm:" << Memory.ShiftImm; + } + if (Memory.Alignment) + OS << " alignment:" << Memory.Alignment; OS << ">"; break; case k_PostIndexRegister: OS << "post-idx register " << (PostIdxReg.isAdd ? "" : "-") - << PostIdxReg.RegNum; + << RegName(PostIdxReg.RegNum); if (PostIdxReg.ShiftTy != ARM_AM::no_shift) OS << ARM_AM::getShiftOpcStr(PostIdxReg.ShiftTy) << " " << PostIdxReg.ShiftImm; @@ -3269,23 +3293,21 @@ void ARMOperand::print(raw_ostream &OS) const { break; } case k_Register: - OS << "<register " << getReg() << ">"; + OS << "<register " << RegName(getReg()) << ">"; break; case k_ShifterImmediate: OS << "<shift " << (ShifterImm.isASR ? 
"asr" : "lsl") << " #" << ShifterImm.Imm << ">"; break; case k_ShiftedRegister: - OS << "<so_reg_reg " - << RegShiftedReg.SrcReg << " " - << ARM_AM::getShiftOpcStr(RegShiftedReg.ShiftTy) - << " " << RegShiftedReg.ShiftReg << ">"; + OS << "<so_reg_reg " << RegName(RegShiftedReg.SrcReg) << " " + << ARM_AM::getShiftOpcStr(RegShiftedReg.ShiftTy) << " " + << RegName(RegShiftedReg.ShiftReg) << ">"; break; case k_ShiftedImmediate: - OS << "<so_reg_imm " - << RegShiftedImm.SrcReg << " " - << ARM_AM::getShiftOpcStr(RegShiftedImm.ShiftTy) - << " #" << RegShiftedImm.ShiftImm << ">"; + OS << "<so_reg_imm " << RegName(RegShiftedImm.SrcReg) << " " + << ARM_AM::getShiftOpcStr(RegShiftedImm.ShiftTy) << " #" + << RegShiftedImm.ShiftImm << ">"; break; case k_RotateImmediate: OS << "<ror " << " #" << (RotImm.Imm * 8) << ">"; @@ -3309,7 +3331,7 @@ void ARMOperand::print(raw_ostream &OS) const { const SmallVectorImpl<unsigned> &RegList = getRegList(); for (SmallVectorImpl<unsigned>::const_iterator I = RegList.begin(), E = RegList.end(); I != E; ) { - OS << *I; + OS << RegName(*I); if (++I < E) OS << ", "; } @@ -3318,15 +3340,15 @@ void ARMOperand::print(raw_ostream &OS) const { } case k_VectorList: OS << "<vector_list " << VectorList.Count << " * " - << VectorList.RegNum << ">"; + << RegName(VectorList.RegNum) << ">"; break; case k_VectorListAllLanes: OS << "<vector_list(all lanes) " << VectorList.Count << " * " - << VectorList.RegNum << ">"; + << RegName(VectorList.RegNum) << ">"; break; case k_VectorListIndexed: OS << "<vector_list(lane " << VectorList.LaneIndex << ") " - << VectorList.Count << " * " << VectorList.RegNum << ">"; + << VectorList.Count << " * " << RegName(VectorList.RegNum) << ">"; break; case k_Token: OS << "'" << getToken() << "'"; @@ -5626,7 +5648,8 @@ StringRef ARMAsmParser::splitMnemonic(StringRef Mnemonic, Mnemonic.startswith("vsel") || Mnemonic == "vins" || Mnemonic == "vmovx" || Mnemonic == "bxns" || Mnemonic == "blxns" || Mnemonic == "vudot" || Mnemonic == "vsdot" || - Mnemonic == "vcmla" || Mnemonic == "vcadd") + Mnemonic == "vcmla" || Mnemonic == "vcadd" || + Mnemonic == "vfmal" || Mnemonic == "vfmsl") return Mnemonic; // First, split out any predication code. Ignore mnemonics we know aren't @@ -5716,7 +5739,10 @@ void ARMAsmParser::getMnemonicAcceptInfo(StringRef Mnemonic, StringRef FullInst, (FullInst.startswith("vmull") && FullInst.endswith(".p64")) || Mnemonic == "vmovx" || Mnemonic == "vins" || Mnemonic == "vudot" || Mnemonic == "vsdot" || - Mnemonic == "vcmla" || Mnemonic == "vcadd") { + Mnemonic == "vcmla" || Mnemonic == "vcadd" || + Mnemonic == "vfmal" || Mnemonic == "vfmsl" || + Mnemonic == "sb" || Mnemonic == "ssbb" || + Mnemonic == "pssbb") { // These mnemonics are never predicable CanAcceptPredicationCode = false; } else if (!isThumb()) { @@ -6819,6 +6845,26 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst, "code specified"); break; } + case ARM::DSB: + case ARM::t2DSB: { + + if (Inst.getNumOperands() < 2) + break; + + unsigned Option = Inst.getOperand(0).getImm(); + unsigned Pred = Inst.getOperand(1).getImm(); + + // SSBB and PSSBB (DSB #0|#4) are not predicable (pred must be AL). 
+ if (Option == 0 && Pred != ARMCC::AL) + return Error(Operands[1]->getStartLoc(), + "instruction 'ssbb' is not predicable, but condition code " + "specified"); + if (Option == 4 && Pred != ARMCC::AL) + return Error(Operands[1]->getStartLoc(), + "instruction 'pssbb' is not predicable, but condition code " + "specified"); + break; + } case ARM::VMOVRRS: { // Source registers must be sequential. const unsigned Sm = MRI->getEncodingValue(Inst.getOperand(2).getReg()); @@ -6837,6 +6883,15 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst, "destination operands must be sequential"); break; } + case ARM::VLDMDIA: + case ARM::VSTMDIA: { + ARMOperand &Op = static_cast<ARMOperand&>(*Operands[3]); + auto &RegList = Op.getRegList(); + if (RegList.size() < 1 || RegList.size() > 16) + return Error(Operands[3]->getStartLoc(), + "list of registers must be at least 1 and at most 16"); + break; + } } return false; @@ -9122,33 +9177,9 @@ bool ARMAsmParser::isITBlockTerminator(MCInst &Inst) const { // Any arithmetic instruction which writes to the PC also terminates the IT // block. - for (unsigned OpIdx = 0; OpIdx < MCID.getNumDefs(); ++OpIdx) { - MCOperand &Op = Inst.getOperand(OpIdx); - if (Op.isReg() && Op.getReg() == ARM::PC) - return true; - } - - if (MCID.hasImplicitDefOfPhysReg(ARM::PC, MRI)) + if (MCID.hasDefOfPhysReg(Inst, ARM::PC, *MRI)) return true; - // Instructions with variable operand lists, which write to the variable - // operands. We only care about Thumb instructions here, as ARM instructions - // obviously can't be in an IT block. - switch (Inst.getOpcode()) { - case ARM::tLDMIA: - case ARM::t2LDMIA: - case ARM::t2LDMIA_UPD: - case ARM::t2LDMDB: - case ARM::t2LDMDB_UPD: - if (listContainsReg(Inst, 3, ARM::PC)) - return true; - break; - case ARM::tPOP: - if (listContainsReg(Inst, 2, ARM::PC)) - return true; - break; - } - return false; } @@ -9255,6 +9286,10 @@ bool ARMAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, switch (MatchResult) { case Match_Success: + LLVM_DEBUG(dbgs() << "Parsed as: "; + Inst.dump_pretty(dbgs(), MII.getName(Inst.getOpcode())); + dbgs() << "\n"); + // Context sensitive operand constraints aren't handled by the matcher, // so check them here. if (validateInstruction(Inst, Operands)) { @@ -9272,7 +9307,9 @@ bool ARMAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, // individual transformations can chain off each other. E.g., // tPOP(r8)->t2LDMIA_UPD(sp,r8)->t2STR_POST(sp,r8) while (processInstruction(Inst, Operands, Out)) - ; + LLVM_DEBUG(dbgs() << "Changed to: "; + Inst.dump_pretty(dbgs(), MII.getName(Inst.getOpcode())); + dbgs() << "\n"); // Only after the instruction is fully processed, we can validate it if (wasInITBlock && hasV8Ops() && isThumb() && @@ -9441,10 +9478,13 @@ bool ARMAsmParser::parseDirectiveARM(SMLoc L) { return false; } -void ARMAsmParser::onLabelParsed(MCSymbol *Symbol) { +void ARMAsmParser::doBeforeLabelEmit(MCSymbol *Symbol) { // We need to flush the current implicit IT block on a label, because it is // not legal to branch into an IT block. 
flushPendingInstructions(getStreamer()); +} + +void ARMAsmParser::onLabelParsed(MCSymbol *Symbol) { if (NextSymbolIsThumb) { getParser().getStreamer().EmitThumbFunc(Symbol); NextSymbolIsThumb = false; diff --git a/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp b/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp index bfc32073ba18..2f84719c4c4f 100644 --- a/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp +++ b/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp @@ -273,6 +273,21 @@ void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O, case ARM::t2TSB: O << "\ttsb\tcsync"; return; + case ARM::t2DSB: + switch (MI->getOperand(0).getImm()) { + default: + if (!printAliasInstr(MI, STI, O)) + printInstruction(MI, STI, O); + break; + case 0: + O << "\tssbb"; + break; + case 4: + O << "\tpssbb"; + break; + } + printAnnotation(O, Annot); + return; } if (!printAliasInstr(MI, STI, O)) diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h index f472b2154314..e1ea5964cf67 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h @@ -16,6 +16,7 @@ #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" +#include "llvm/ADT/bit.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include <cassert> @@ -627,27 +628,22 @@ namespace ARM_AM { // inline float getFPImmFloat(unsigned Imm) { // We expect an 8-bit binary encoding of a floating-point number here. - union { - uint32_t I; - float F; - } FPUnion; uint8_t Sign = (Imm >> 7) & 0x1; uint8_t Exp = (Imm >> 4) & 0x7; uint8_t Mantissa = Imm & 0xf; - // 8-bit FP iEEEE Float Encoding + // 8-bit FP IEEE Float Encoding // abcd efgh aBbbbbbc defgh000 00000000 00000000 // // where B = NOT(b); - - FPUnion.I = 0; - FPUnion.I |= Sign << 31; - FPUnion.I |= ((Exp & 0x4) != 0 ? 0 : 1) << 30; - FPUnion.I |= ((Exp & 0x4) != 0 ? 0x1f : 0) << 25; - FPUnion.I |= (Exp & 0x3) << 23; - FPUnion.I |= Mantissa << 19; - return FPUnion.F; + uint32_t I = 0; + I |= Sign << 31; + I |= ((Exp & 0x4) != 0 ? 0 : 1) << 30; + I |= ((Exp & 0x4) != 0 ? 0x1f : 0) << 25; + I |= (Exp & 0x3) << 23; + I |= Mantissa << 19; + return bit_cast<float>(I); } /// getFP16Imm - Return an 8-bit floating-point version of the 16-bit diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp index f524a0081301..c2a07d4ddcef 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp @@ -373,6 +373,7 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm, // interfere with checking valid expressions. if (const MCSymbolRefExpr *A = Target.getSymA()) { if (A->hasSubsectionsViaSymbols() && Asm.isThumbFunc(&A->getSymbol()) && + A->getSymbol().isExternal() && (Kind == FK_Data_4 || Kind == ARM::fixup_arm_movw_lo16 || Kind == ARM::fixup_arm_movt_hi16 || Kind == ARM::fixup_t2_movw_lo16 || Kind == ARM::fixup_t2_movt_hi16)) diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h index beeb5dec4baf..33c32d5464af 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h @@ -248,6 +248,11 @@ namespace ARMII { /// just that part of the flag set. 
MO_OPTION_MASK = 0x3, + /// MO_COFFSTUB - On a symbol operand "FOO", this indicates that the + /// reference is actually to the ".refptrp.FOO" symbol. This is used for + /// stub symbols on windows. + MO_COFFSTUB = 0x4, + /// MO_GOT - On a symbol operand, this represents a GOT relative relocation. MO_GOT = 0x8, diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp index 7d04c73fb3f2..b8ba7584911b 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp @@ -14,6 +14,7 @@ #include "llvm/MC/MCELFObjectWriter.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCFixup.h" +#include "llvm/MC/MCObjectFileInfo.h" #include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCValue.h" #include "llvm/Support/ErrorHandling.h" @@ -40,6 +41,8 @@ namespace { bool needsRelocateWithSymbol(const MCSymbol &Sym, unsigned Type) const override; + + void addTargetSectionFlags(MCContext &Ctx, MCSectionELF &Sec) override; }; } // end anonymous namespace @@ -236,6 +239,21 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target, } } +void ARMELFObjectWriter::addTargetSectionFlags(MCContext &Ctx, + MCSectionELF &Sec) { + // The mix of execute-only and non-execute-only at link time is + // non-execute-only. To avoid the empty implicitly created .text + // section from making the whole .text section non-execute-only, we + // mark it execute-only if it is empty and there is at least one + // execute-only section in the object. + MCSectionELF *TextSection = + static_cast<MCSectionELF *>(Ctx.getObjectFileInfo()->getTextSection()); + if (Sec.getKind().isExecuteOnly() && !TextSection->hasInstructions() && + !TextSection->hasData()) { + TextSection->setFlags(TextSection->getFlags() | ELF::SHF_ARM_PURECODE); + } +} + std::unique_ptr<MCObjectTargetWriter> llvm::createARMELFObjectWriter(uint8_t OSABI) { return llvm::make_unique<ARMELFObjectWriter>(OSABI); diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp index 3373d691db50..d3744fffac32 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp @@ -465,6 +465,11 @@ public: void emitPad(int64_t Offset); void emitRegSave(const SmallVectorImpl<unsigned> &RegList, bool isVector); void emitUnwindRaw(int64_t Offset, const SmallVectorImpl<uint8_t> &Opcodes); + void emitFill(const MCExpr &NumBytes, uint64_t FillValue, + SMLoc Loc) override { + EmitDataMappingSymbol(); + MCObjectStreamer::emitFill(NumBytes, FillValue, Loc); + } void ChangeSection(MCSection *Section, const MCExpr *Subsection) override { LastMappingSymbols[getCurrentSection().first] = std::move(LastEMSInfo); @@ -861,6 +866,7 @@ void ARMTargetELFStreamer::emitArchDefaultAttributes() { case ARM::ArchKind::ARMV8_2A: case ARM::ArchKind::ARMV8_3A: case ARM::ArchKind::ARMV8_4A: + case ARM::ArchKind::ARMV8_5A: setAttributeItem(CPU_arch_profile, ApplicationProfile, false); setAttributeItem(ARM_ISA_use, Allowed, false); setAttributeItem(THUMB_ISA_use, AllowThumb32, false); @@ -1071,7 +1077,7 @@ void ARMTargetELFStreamer::finishAttributeSection() { if (Contents.empty()) return; - llvm::sort(Contents.begin(), Contents.end(), AttributeItem::LessTag); + llvm::sort(Contents, AttributeItem::LessTag); ARMELFStreamer &Streamer = getStreamer(); diff --git 
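The getFPImmFloat rewrite earlier in this diff drops the type-punning union in favour of bit_cast when expanding the 8-bit VFP/NEON immediate into a 32-bit float. A standalone sketch of the same expansion, using std::memcpy so it builds without LLVM headers; the function name is made up for illustration:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Expand an 8-bit VFP/NEON floating-point immediate (abcdefgh) into the
    // IEEE single-precision pattern aBbbbbbc defgh000 00000000 00000000.
    float decodeVFPImm8(unsigned Imm) {
      uint32_t Sign = (Imm >> 7) & 0x1;
      uint32_t Exp = (Imm >> 4) & 0x7;
      uint32_t Mantissa = Imm & 0xf;

      uint32_t I = 0;
      I |= Sign << 31;
      I |= ((Exp & 0x4) != 0 ? 0u : 1u) << 30;
      I |= ((Exp & 0x4) != 0 ? 0x1fu : 0u) << 25;
      I |= (Exp & 0x3) << 23;
      I |= Mantissa << 19;

      float F;
      std::memcpy(&F, &I, sizeof(F)); // well-defined, unlike reading the old union
      return F;
    }

    int main() {
      std::printf("0x70 -> %g\n", decodeVFPImm8(0x70)); // 1.0
      std::printf("0x00 -> %g\n", decodeVFPImm8(0x00)); // 2.0
      std::printf("0xf8 -> %g\n", decodeVFPImm8(0xf8)); // -1.5
      return 0;
    }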
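Similarly, the new ARMELFObjectWriter::addTargetSectionFlags hook above only marks the implicitly created .text section as SHF_ARM_PURECODE when that section is still empty and some other section in the object is execute-only. A minimal standalone model of that rule; Section and propagatePureCode are hypothetical stand-ins, not LLVM's MC classes:

    #include <cstdint>
    #include <string>
    #include <vector>

    // SHF_ARM_PURECODE as defined by the ELF for the Arm Architecture supplement.
    constexpr uint64_t SHF_ARM_PURECODE = 0x20000000;

    struct Section {            // hypothetical stand-in for MCSectionELF
      std::string Name;
      uint64_t Flags;
      bool HasContents;         // instructions or data already emitted
    };

    // If .text is still empty and at least one other section is execute-only,
    // make .text execute-only too, so the linker's flag merging does not
    // downgrade the whole output to readable text.
    void propagatePureCode(std::vector<Section> &Sections) {
      bool AnyPureCode = false;
      Section *Text = nullptr;
      for (Section &S : Sections) {
        if (S.Name == ".text")
          Text = &S;
        else if (S.Flags & SHF_ARM_PURECODE)
          AnyPureCode = true;
      }
      if (Text && !Text->HasContents && AnyPureCode)
        Text->Flags |= SHF_ARM_PURECODE;
    }

    int main() {
      std::vector<Section> S = {{".text", 0, false},
                                {".text.f", SHF_ARM_PURECODE, true}};
      propagatePureCode(S);
      return (S[0].Flags & SHF_ARM_PURECODE) ? 0 : 1; // 0: flag was propagated
    }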
a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp index 0cef683778e5..3ee63ac374b3 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp @@ -31,6 +31,9 @@ ARMMCAsmInfoDarwin::ARMMCAsmInfoDarwin(const Triple &TheTriple) { SupportsDebugInformation = true; + // Conditional Thumb 4-byte instructions can have an implicit IT. + MaxInstLength = 6; + // Exceptions handling ExceptionsType = (TheTriple.isOSDarwin() && !TheTriple.isWatchABI()) ? ExceptionHandling::SjLj @@ -56,6 +59,9 @@ ARMELFMCAsmInfo::ARMELFMCAsmInfo(const Triple &TheTriple) { SupportsDebugInformation = true; + // Conditional Thumb 4-byte instructions can have an implicit IT. + MaxInstLength = 6; + // Exceptions handling switch (TheTriple.getOS()) { case Triple::NetBSD: @@ -90,6 +96,9 @@ ARMCOFFMCAsmInfoMicrosoft::ARMCOFFMCAsmInfoMicrosoft() { PrivateGlobalPrefix = "$M"; PrivateLabelPrefix = "$M"; CommentString = ";"; + + // Conditional Thumb 4-byte instructions can have an implicit IT. + MaxInstLength = 6; } void ARMCOFFMCAsmInfoGNU::anchor() { } @@ -110,5 +119,7 @@ ARMCOFFMCAsmInfoGNU::ARMCOFFMCAsmInfoGNU() { UseIntegratedAssembler = true; DwarfRegNumForCFI = false; -} + // Conditional Thumb 4-byte instructions can have an implicit IT. + MaxInstLength = 6; +} diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp index 4b4956e914f2..0ced8195790d 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp @@ -22,6 +22,8 @@ #include "llvm/MC/MCSection.h" #include "llvm/MC/MCValue.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/ScopedPrinter.h" + using namespace llvm; namespace { @@ -144,6 +146,15 @@ RecordARMScatteredHalfRelocation(MachObjectWriter *Writer, MCValue Target, uint64_t &FixedValue) { uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset(); + + if (FixupOffset & 0xff000000) { + Asm.getContext().reportError(Fixup.getLoc(), + "can not encode offset '0x" + + to_hexString(FixupOffset) + + "' in resulting scattered relocation."); + return; + } + unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind()); unsigned Type = MachO::ARM_RELOC_HALF; @@ -250,6 +261,15 @@ void ARMMachObjectWriter::RecordARMScatteredRelocation(MachObjectWriter *Writer, unsigned Log2Size, uint64_t &FixedValue) { uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset(); + + if (FixupOffset & 0xff000000) { + Asm.getContext().reportError(Fixup.getLoc(), + "can not encode offset '0x" + + to_hexString(FixupOffset) + + "' in resulting scattered relocation."); + return; + } + unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind()); // See <reloc.h>. 
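The guards added above reject fixup offsets that cannot be represented in a Mach-O scattered relocation, whose r_address field is only 24 bits wide, and report an error instead of silently truncating. A standalone sketch of the same range check; the helper name is illustrative:

    #include <cstdint>
    #include <cstdio>

    // A scattered Mach-O relocation entry has a 24-bit address field, so any
    // offset with one of the top 8 bits set cannot be encoded.
    bool fitsInScatteredRelocation(uint32_t FixupOffset) {
      return (FixupOffset & 0xff000000) == 0;
    }

    int main() {
      std::printf("%d\n", fitsInScatteredRelocation(0x00ffffffu)); // 1: largest encodable offset
      std::printf("%d\n", fitsInScatteredRelocation(0x01000000u)); // 0: needs a 25th bit
      return 0;
    }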
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp index 63aa9735e8a4..91836cff95c8 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp @@ -11,7 +11,7 @@ // //===----------------------------------------------------------------------===// -#include "ARMTargetMachine.h" +#include "MCTargetDesc/ARMMCTargetDesc.h" #include "llvm/MC/ConstantPools.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp index 8ae713b7b489..30cbde1ca71f 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp @@ -75,8 +75,8 @@ unsigned ARMWinCOFFObjectWriter::getRelocType(MCContext &Ctx, case ARM::fixup_t2_condbranch: return COFF::IMAGE_REL_ARM_BRANCH20T; case ARM::fixup_t2_uncondbranch: - return COFF::IMAGE_REL_ARM_BRANCH24T; case ARM::fixup_arm_thumb_bl: + return COFF::IMAGE_REL_ARM_BRANCH24T; case ARM::fixup_arm_thumb_blx: return COFF::IMAGE_REL_ARM_BLX23T; case ARM::fixup_t2_movw_lo16: diff --git a/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp b/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp index 1a91a7030657..d567d3339049 100644 --- a/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp +++ b/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp @@ -146,9 +146,7 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOStore, MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); - if (RC == &ARM::GPRRegClass || RC == &ARM::tGPRRegClass || - RC == &ARM::tcGPRRegClass || RC == &ARM::rGPRRegClass || - RC == &ARM::GPRnopcRegClass) { + if (ARM::GPRRegClass.hasSubClassEq(RC)) { BuildMI(MBB, I, DL, get(ARM::t2STRi12)) .addReg(SrcReg, getKillRegState(isKill)) .addFrameIndex(FI) @@ -190,9 +188,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL; if (I != MBB.end()) DL = I->getDebugLoc(); - if (RC == &ARM::GPRRegClass || RC == &ARM::tGPRRegClass || - RC == &ARM::tcGPRRegClass || RC == &ARM::rGPRRegClass || - RC == &ARM::GPRnopcRegClass) { + if (ARM::GPRRegClass.hasSubClassEq(RC)) { BuildMI(MBB, I, DL, get(ARM::t2LDRi12), DestReg) .addFrameIndex(FI) .addImm(0) diff --git a/contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp b/contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp index abf54ba7e87c..65889fc4e28b 100644 --- a/contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp +++ b/contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp @@ -122,6 +122,7 @@ namespace { { ARM::t2SUBSrr,ARM::tSUBrr, 0, 0, 0, 1, 0, 2,0, 0,0,0 }, { ARM::t2SXTB, ARM::tSXTB, 0, 0, 0, 1, 0, 1,0, 0,1,0 }, { ARM::t2SXTH, ARM::tSXTH, 0, 0, 0, 1, 0, 1,0, 0,1,0 }, + { ARM::t2TEQrr, ARM::tEOR, 0, 0, 0, 1, 0, 2,0, 0,1,0 }, { ARM::t2TSTrr, ARM::tTST, 0, 0, 0, 1, 0, 2,0, 0,0,0 }, { ARM::t2UXTB, ARM::tUXTB, 0, 0, 0, 1, 0, 1,0, 0,1,0 }, { ARM::t2UXTH, ARM::tUXTH, 0, 0, 0, 1, 0, 1,0, 0,1,0 }, @@ -485,7 +486,7 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI, .addReg(Rt, IsStore ? 0 : RegState::Define); // Transfer memoperands. - MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + MIB.setMemRefs(MI->memoperands()); // Transfer MI flags. 
  MIB.setMIFlags(MI->getFlags());
@@ -605,7 +606,7 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI,
    MIB.add(MI->getOperand(OpNum));

  // Transfer memoperands.
-  MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+  MIB.setMemRefs(MI->memoperands());

  // Transfer MI flags.
  MIB.setMIFlags(MI->getFlags());
@@ -717,6 +718,16 @@ Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI,
      return true;
    return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop);
  }
+  case ARM::t2TEQrr: {
+    unsigned PredReg = 0;
+    // Can only convert to eors if we're not in an IT block.
+    if (getInstrPredicate(*MI, PredReg) != ARMCC::AL)
+      break;
+    // TODO if Operand 0 is not killed but Operand 1 is, then we could write
+    // to Op1 instead.
+    if (MI->getOperand(0).isKill())
+      return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop);
+  }
  }
  return false;
}
@@ -903,9 +914,24 @@ Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI,
  // Add the 16-bit instruction.
  DebugLoc dl = MI->getDebugLoc();
  MachineInstrBuilder MIB = BuildMI(MBB, MI, dl, NewMCID);
-  MIB.add(MI->getOperand(0));
-  if (NewMCID.hasOptionalDef())
-    MIB.add(HasCC ? t1CondCodeOp(CCDead) : condCodeOp());
+
+  // TEQ is special in that it doesn't define a register but we're converting
+  // it into an EOR which does. So add the first operand as a def and then
+  // again as a use.
+  if (MCID.getOpcode() == ARM::t2TEQrr) {
+    MIB.add(MI->getOperand(0));
+    MIB->getOperand(0).setIsKill(false);
+    MIB->getOperand(0).setIsDef(true);
+    MIB->getOperand(0).setIsDead(true);
+
+    if (NewMCID.hasOptionalDef())
+      MIB.add(HasCC ? t1CondCodeOp(CCDead) : condCodeOp());
+    MIB.add(MI->getOperand(0));
+  } else {
+    MIB.add(MI->getOperand(0));
+    if (NewMCID.hasOptionalDef())
+      MIB.add(HasCC ? t1CondCodeOp(CCDead) : condCodeOp());
+  }

  // Transfer the rest of operands.
  unsigned NumOps = MCID.getNumOperands();
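The t2TEQrr handling above only narrows to a 16-bit EORS when the first source register is killed: TEQ just sets flags from the exclusive-or and discards the result, while the Thumb-1 EORS encoding also writes that result back into its first register. A standalone illustration of the flag equivalence (modelling N and Z only; the helpers are made-up, not LLVM code):

    #include <cassert>
    #include <cstdint>

    struct Flags { bool N, Z; };

    // teq: compute flags from rn ^ rm, leave the register file untouched.
    Flags teq(uint32_t rn, uint32_t rm) {
      uint32_t r = rn ^ rm;
      return {static_cast<bool>(r >> 31), r == 0};
    }

    // eors (16-bit Thumb encoding): same flags, but the result is written back
    // into the first register, destroying its previous value.
    Flags eors(uint32_t &rdn, uint32_t rm) {
      rdn ^= rm;
      return {static_cast<bool>(rdn >> 31), rdn == 0};
    }

    int main() {
      uint32_t a = 0x80000001u, b = 0x80000001u;
      Flags f1 = teq(a, b);   // a still holds 0x80000001 afterwards
      uint32_t a2 = a;
      Flags f2 = eors(a2, b); // a2 is now 0; its old value is gone
      assert(f1.N == f2.N && f1.Z == f2.Z);
      return a2 == 0 ? 0 : 1;
    }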