author | Dimitry Andric <dim@FreeBSD.org> | 2019-10-23 17:51:42 +0000 |
---|---|---|
committer | Dimitry Andric <dim@FreeBSD.org> | 2019-10-23 17:51:42 +0000 |
commit | 1d5ae1026e831016fc29fd927877c86af904481f (patch) | |
tree | 2cdfd12620fcfa5d9e4a0389f85368e8e36f63f9 /lib/Target/ARM | |
parent | e6d1592492a3a379186bfb02bd0f4eda0669c0d5 (diff) | |
Diffstat (limited to 'lib/Target/ARM')
69 files changed, 6194 insertions, 2411 deletions
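
Most of the mechanical churn in the diff below comes from two LLVM API migrations: raw `unsigned` register numbers are replaced by the `llvm::Register` wrapper (with `Register::isVirtualRegister`/`Register::isPhysicalRegister` taking over from the static `TargetRegisterInfo` helpers), and log2-encoded alignments are replaced by the byte-based `llvm::Align` type (so `EmitAlignment(2)`, meaning 1 << 2 bytes, becomes `EmitAlignment(Align(4))`). The sketch below shows the new idioms for orientation only; it is not part of the commit, and the helper functions in it are illustrative.

```cpp
#include "llvm/CodeGen/Register.h"
#include "llvm/Support/Alignment.h"

using namespace llvm;

// Register wraps the same unsigned encoding the old code passed around, but
// the virtual/physical predicates now live on Register itself.
static bool isVirtual(unsigned RawReg) {
  Register Reg(RawReg); // converts freely, which keeps the diff mostly mechanical
  return Register::isVirtualRegister(Reg); // was TargetRegisterInfo::isVirtualRegister(Reg)
}

// Align stores a byte count rather than a log2 exponent, which is why the
// jump-table hunks change EmitAlignment(2) to EmitAlignment(Align(4)).
static Align jumpTableAlignment() { return Align(4); }
```

Both wrappers preserve the underlying values (`Register` still converts to `unsigned`, and `Align` exposes its byte count via `value()`), so the substitution is behaviour-preserving; it mainly makes register kinds and alignment units explicit in the type system.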
diff --git a/lib/Target/ARM/A15SDOptimizer.cpp b/lib/Target/ARM/A15SDOptimizer.cpp index fb238bfc9cbc..30b9c8071ba2 100644 --- a/lib/Target/ARM/A15SDOptimizer.cpp +++ b/lib/Target/ARM/A15SDOptimizer.cpp @@ -133,9 +133,9 @@ bool A15SDOptimizer::usesRegClass(MachineOperand &MO, const TargetRegisterClass *TRC) { if (!MO.isReg()) return false; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); - if (TargetRegisterInfo::isVirtualRegister(Reg)) + if (Register::isVirtualRegister(Reg)) return MRI->getRegClass(Reg)->hasSuperClassEq(TRC); else return TRC->contains(Reg); @@ -151,7 +151,7 @@ unsigned A15SDOptimizer::getDPRLaneFromSPR(unsigned SReg) { // Get the subreg type that is most likely to be coalesced // for an SPR register that will be used in VDUP32d pseudo. unsigned A15SDOptimizer::getPrefSPRLane(unsigned SReg) { - if (!TRI->isVirtualRegister(SReg)) + if (!Register::isVirtualRegister(SReg)) return getDPRLaneFromSPR(SReg); MachineInstr *MI = MRI->getVRegDef(SReg); @@ -166,7 +166,7 @@ unsigned A15SDOptimizer::getPrefSPRLane(unsigned SReg) { SReg = MI->getOperand(1).getReg(); } - if (TargetRegisterInfo::isVirtualRegister(SReg)) { + if (Register::isVirtualRegister(SReg)) { if (MO->getSubReg() == ARM::ssub_1) return ARM::ssub_1; return ARM::ssub_0; } @@ -191,8 +191,8 @@ void A15SDOptimizer::eraseInstrWithNoUses(MachineInstr *MI) { for (MachineOperand &MO : MI->operands()) { if ((!MO.isReg()) || (!MO.isUse())) continue; - unsigned Reg = MO.getReg(); - if (!TRI->isVirtualRegister(Reg)) + Register Reg = MO.getReg(); + if (!Register::isVirtualRegister(Reg)) continue; MachineOperand *Op = MI->findRegisterDefOperand(Reg); @@ -213,8 +213,8 @@ void A15SDOptimizer::eraseInstrWithNoUses(MachineInstr *MI) { for (MachineOperand &MODef : Def->operands()) { if ((!MODef.isReg()) || (!MODef.isDef())) continue; - unsigned DefReg = MODef.getReg(); - if (!TRI->isVirtualRegister(DefReg)) { + Register DefReg = MODef.getReg(); + if (!Register::isVirtualRegister(DefReg)) { IsDead = false; break; } @@ -245,10 +245,10 @@ unsigned A15SDOptimizer::optimizeSDPattern(MachineInstr *MI) { } if (MI->isInsertSubreg()) { - unsigned DPRReg = MI->getOperand(1).getReg(); - unsigned SPRReg = MI->getOperand(2).getReg(); + Register DPRReg = MI->getOperand(1).getReg(); + Register SPRReg = MI->getOperand(2).getReg(); - if (TRI->isVirtualRegister(DPRReg) && TRI->isVirtualRegister(SPRReg)) { + if (Register::isVirtualRegister(DPRReg) && Register::isVirtualRegister(SPRReg)) { MachineInstr *DPRMI = MRI->getVRegDef(MI->getOperand(1).getReg()); MachineInstr *SPRMI = MRI->getVRegDef(MI->getOperand(2).getReg()); @@ -267,7 +267,7 @@ unsigned A15SDOptimizer::optimizeSDPattern(MachineInstr *MI) { // Find the thing we're subreg copying out of - is it of the same // regclass as DPRMI? (i.e. a DPR or QPR). 
- unsigned FullReg = SPRMI->getOperand(1).getReg(); + Register FullReg = SPRMI->getOperand(1).getReg(); const TargetRegisterClass *TRC = MRI->getRegClass(MI->getOperand(1).getReg()); if (TRC->hasSuperClassEq(MRI->getRegClass(FullReg))) { @@ -296,9 +296,9 @@ unsigned A15SDOptimizer::optimizeSDPattern(MachineInstr *MI) { if (!MI->getOperand(I).isReg()) continue; ++NumTotal; - unsigned OpReg = MI->getOperand(I).getReg(); + Register OpReg = MI->getOperand(I).getReg(); - if (!TRI->isVirtualRegister(OpReg)) + if (!Register::isVirtualRegister(OpReg)) break; MachineInstr *Def = MRI->getVRegDef(OpReg); @@ -342,7 +342,7 @@ bool A15SDOptimizer::hasPartialWrite(MachineInstr *MI) { MachineInstr *A15SDOptimizer::elideCopies(MachineInstr *MI) { if (!MI->isFullCopy()) return MI; - if (!TRI->isVirtualRegister(MI->getOperand(1).getReg())) + if (!Register::isVirtualRegister(MI->getOperand(1).getReg())) return nullptr; MachineInstr *Def = MRI->getVRegDef(MI->getOperand(1).getReg()); if (!Def) @@ -369,8 +369,8 @@ void A15SDOptimizer::elideCopiesAndPHIs(MachineInstr *MI, Reached.insert(MI); if (MI->isPHI()) { for (unsigned I = 1, E = MI->getNumOperands(); I != E; I += 2) { - unsigned Reg = MI->getOperand(I).getReg(); - if (!TRI->isVirtualRegister(Reg)) { + Register Reg = MI->getOperand(I).getReg(); + if (!Register::isVirtualRegister(Reg)) { continue; } MachineInstr *NewMI = MRI->getVRegDef(Reg); @@ -379,7 +379,7 @@ void A15SDOptimizer::elideCopiesAndPHIs(MachineInstr *MI, Front.push_back(NewMI); } } else if (MI->isFullCopy()) { - if (!TRI->isVirtualRegister(MI->getOperand(1).getReg())) + if (!Register::isVirtualRegister(MI->getOperand(1).getReg())) continue; MachineInstr *NewMI = MRI->getVRegDef(MI->getOperand(1).getReg()); if (!NewMI) @@ -418,8 +418,8 @@ unsigned A15SDOptimizer::createDupLane(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, const DebugLoc &DL, unsigned Reg, unsigned Lane, bool QPR) { - unsigned Out = MRI->createVirtualRegister(QPR ? &ARM::QPRRegClass : - &ARM::DPRRegClass); + Register Out = + MRI->createVirtualRegister(QPR ? &ARM::QPRRegClass : &ARM::DPRRegClass); BuildMI(MBB, InsertBefore, DL, TII->get(QPR ? 
ARM::VDUPLN32q : ARM::VDUPLN32d), Out) .addReg(Reg) @@ -434,7 +434,7 @@ unsigned A15SDOptimizer::createExtractSubreg( MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, const DebugLoc &DL, unsigned DReg, unsigned Lane, const TargetRegisterClass *TRC) { - unsigned Out = MRI->createVirtualRegister(TRC); + Register Out = MRI->createVirtualRegister(TRC); BuildMI(MBB, InsertBefore, DL, @@ -448,7 +448,7 @@ unsigned A15SDOptimizer::createExtractSubreg( unsigned A15SDOptimizer::createRegSequence( MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, const DebugLoc &DL, unsigned Reg1, unsigned Reg2) { - unsigned Out = MRI->createVirtualRegister(&ARM::QPRRegClass); + Register Out = MRI->createVirtualRegister(&ARM::QPRRegClass); BuildMI(MBB, InsertBefore, DL, @@ -466,7 +466,7 @@ unsigned A15SDOptimizer::createVExt(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, const DebugLoc &DL, unsigned Ssub0, unsigned Ssub1) { - unsigned Out = MRI->createVirtualRegister(&ARM::DPRRegClass); + Register Out = MRI->createVirtualRegister(&ARM::DPRRegClass); BuildMI(MBB, InsertBefore, DL, TII->get(ARM::VEXTd32), Out) .addReg(Ssub0) .addReg(Ssub1) @@ -478,7 +478,7 @@ unsigned A15SDOptimizer::createVExt(MachineBasicBlock &MBB, unsigned A15SDOptimizer::createInsertSubreg( MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, const DebugLoc &DL, unsigned DReg, unsigned Lane, unsigned ToInsert) { - unsigned Out = MRI->createVirtualRegister(&ARM::DPR_VFP2RegClass); + Register Out = MRI->createVirtualRegister(&ARM::DPR_VFP2RegClass); BuildMI(MBB, InsertBefore, DL, @@ -494,7 +494,7 @@ unsigned A15SDOptimizer::createImplicitDef(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, const DebugLoc &DL) { - unsigned Out = MRI->createVirtualRegister(&ARM::DPRRegClass); + Register Out = MRI->createVirtualRegister(&ARM::DPRRegClass); BuildMI(MBB, InsertBefore, DL, @@ -602,7 +602,7 @@ bool A15SDOptimizer::runOnInstruction(MachineInstr *MI) { // we can end up with multiple defs of this DPR. SmallVector<MachineInstr *, 8> DefSrcs; - if (!TRI->isVirtualRegister(*I)) + if (!Register::isVirtualRegister(*I)) continue; MachineInstr *Def = MRI->getVRegDef(*I); if (!Def) @@ -622,7 +622,7 @@ bool A15SDOptimizer::runOnInstruction(MachineInstr *MI) { // Collect all the uses of this MI's DPR def for updating later. 
SmallVector<MachineOperand*, 8> Uses; - unsigned DPRDefReg = MI->getOperand(0).getReg(); + Register DPRDefReg = MI->getOperand(0).getReg(); for (MachineRegisterInfo::use_iterator I = MRI->use_begin(DPRDefReg), E = MRI->use_end(); I != E; ++I) Uses.push_back(&*I); diff --git a/lib/Target/ARM/ARM.h b/lib/Target/ARM/ARM.h index bf8ed6562fe7..2e6f756d522c 100644 --- a/lib/Target/ARM/ARM.h +++ b/lib/Target/ARM/ARM.h @@ -35,6 +35,7 @@ class MachineInstr; class MCInst; class PassRegistry; +Pass *createMVETailPredicationPass(); FunctionPass *createARMLowOverheadLoopsPass(); Pass *createARMParallelDSPPass(); FunctionPass *createARMISelDag(ARMBaseTargetMachine &TM, @@ -67,6 +68,7 @@ void initializeThumb2SizeReducePass(PassRegistry &); void initializeThumb2ITBlockPass(PassRegistry &); void initializeMVEVPTBlockPass(PassRegistry &); void initializeARMLowOverheadLoopsPass(PassRegistry &); +void initializeMVETailPredicationPass(PassRegistry &); } // end namespace llvm diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td index b687db12eaf5..fed4cb2b9316 100644 --- a/lib/Target/ARM/ARM.td +++ b/lib/Target/ARM/ARM.td @@ -57,12 +57,15 @@ def FeatureD32 : SubtargetFeature<"d32", "HasD32", "true", "Extend FP to 32 double registers">; multiclass VFPver<string name, string query, string description, - list<SubtargetFeature> prev = [], - list<SubtargetFeature> otherimplies = []> { + list<SubtargetFeature> prev, + list<SubtargetFeature> otherimplies, + list<SubtargetFeature> vfp2prev = []> { def _D16_SP: SubtargetFeature< name#"d16sp", query#"D16SP", "true", description#" with only 16 d-registers and no double precision", - !foreach(v, prev, !cast<SubtargetFeature>(v # "_D16_SP")) # otherimplies>; + !foreach(v, prev, !cast<SubtargetFeature>(v # "_D16_SP")) # + !foreach(v, vfp2prev, !cast<SubtargetFeature>(v # "_SP")) # + otherimplies>; def _SP: SubtargetFeature< name#"sp", query#"SP", "true", description#" with no double precision", @@ -72,6 +75,7 @@ multiclass VFPver<string name, string query, string description, name#"d16", query#"D16", "true", description#" with only 16 d-registers", !foreach(v, prev, !cast<SubtargetFeature>(v # "_D16")) # + vfp2prev # otherimplies # [FeatureFP64, !cast<SubtargetFeature>(NAME # "_D16_SP")]>; def "": SubtargetFeature< name, query, "true", description, @@ -80,11 +84,17 @@ multiclass VFPver<string name, string query, string description, !cast<SubtargetFeature>(NAME # "_SP")]>; } -defm FeatureVFP2: VFPver<"vfp2", "HasVFPv2", "Enable VFP2 instructions", - [], [FeatureFPRegs]>; +def FeatureVFP2_SP : SubtargetFeature<"vfp2sp", "HasVFPv2SP", "true", + "Enable VFP2 instructions with " + "no double precision", + [FeatureFPRegs]>; + +def FeatureVFP2 : SubtargetFeature<"vfp2", "HasVFPv2", "true", + "Enable VFP2 instructions", + [FeatureFP64, FeatureVFP2_SP]>; defm FeatureVFP3: VFPver<"vfp3", "HasVFPv3", "Enable VFP3 instructions", - [FeatureVFP2]>; + [], [], [FeatureVFP2]>; def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true", "Enable NEON instructions", @@ -98,7 +108,7 @@ defm FeatureVFP4: VFPver<"vfp4", "HasVFPv4", "Enable VFP4 instructions", [FeatureVFP3], [FeatureFP16]>; defm FeatureFPARMv8: VFPver<"fp-armv8", "HasFPARMv8", "Enable ARMv8 FP", - [FeatureVFP4]>; + [FeatureVFP4], []>; def FeatureFullFP16 : SubtargetFeature<"fullfp16", "HasFullFP16", "true", "Enable full half-precision " @@ -302,9 +312,18 @@ def FeatureVMLxForwarding : SubtargetFeature<"vmlx-forwarding", def FeaturePref32BitThumb : SubtargetFeature<"32bit", "Pref32BitThumb", "true", "Prefer 32-bit Thumb 
instrs">; -def FeaturePrefLoopAlign32 : SubtargetFeature<"loop-align", "PrefLoopAlignment","2", +def FeaturePrefLoopAlign32 : SubtargetFeature<"loop-align", "PrefLoopLogAlignment","2", "Prefer 32-bit alignment for loops">; +def FeatureMVEVectorCostFactor1 : SubtargetFeature<"mve1beat", "MVEVectorCostFactor", "1", + "Model MVE instructions as a 1 beat per tick architecture">; + +def FeatureMVEVectorCostFactor2 : SubtargetFeature<"mve2beat", "MVEVectorCostFactor", "2", + "Model MVE instructions as a 2 beats per tick architecture">; + +def FeatureMVEVectorCostFactor4 : SubtargetFeature<"mve4beat", "MVEVectorCostFactor", "4", + "Model MVE instructions as a 4 beats per tick architecture">; + /// Some instructions update CPSR partially, which can add false dependency for /// out-of-order implementation, e.g. Cortex-A9, unless each individual bit is /// mapped to a separate physical register. Avoid partial CPSR update for these @@ -1156,6 +1175,13 @@ def : ProcNoItin<"cortex-a76ae", [ARMv82a, ProcA76, FeatureFullFP16, FeatureDotProd]>; +def : ProcNoItin<"neoverse-n1", [ARMv82a, + FeatureHWDivThumb, + FeatureHWDivARM, + FeatureCrypto, + FeatureCRC, + FeatureDotProd]>; + def : ProcessorModel<"cyclone", SwiftModel, [ARMv8a, ProcSwift, FeatureHasRetAddrStack, FeatureNEONForFP, diff --git a/lib/Target/ARM/ARMAsmPrinter.cpp b/lib/Target/ARM/ARMAsmPrinter.cpp index e29077266fcd..c8c91e53c44e 100644 --- a/lib/Target/ARM/ARMAsmPrinter.cpp +++ b/lib/Target/ARM/ARMAsmPrinter.cpp @@ -168,7 +168,7 @@ bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) { // relatively easy to exceed the thumb branch range within a TU. if (! ThumbIndirectPads.empty()) { OutStreamer->EmitAssemblerFlag(MCAF_Code16); - EmitAlignment(1); + EmitAlignment(Align(2)); for (std::pair<unsigned, MCSymbol *> &TIP : ThumbIndirectPads) { OutStreamer->EmitLabel(TIP.second); EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tBX) @@ -203,8 +203,8 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum, switch (MO.getType()) { default: llvm_unreachable("<unknown operand type>"); case MachineOperand::MO_Register: { - unsigned Reg = MO.getReg(); - assert(TargetRegisterInfo::isPhysicalRegister(Reg)); + Register Reg = MO.getReg(); + assert(Register::isPhysicalRegister(Reg)); assert(!MO.getSubReg() && "Subregs should be eliminated!"); if(ARM::GPRPairRegClass.contains(Reg)) { const MachineFunction &MF = *MI->getParent()->getParent(); @@ -275,7 +275,7 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, return false; case 'y': // Print a VFP single precision register as indexed double. if (MI->getOperand(OpNum).isReg()) { - unsigned Reg = MI->getOperand(OpNum).getReg(); + Register Reg = MI->getOperand(OpNum).getReg(); const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); // Find the 'd' register that has this 's' register as a sub-register, // and determine the lane number. @@ -302,14 +302,14 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, if (!MI->getOperand(OpNum).isReg()) return true; const MachineOperand &MO = MI->getOperand(OpNum); - unsigned RegBegin = MO.getReg(); + Register RegBegin = MO.getReg(); // This takes advantage of the 2 operand-ness of ldm/stm and that we've // already got the operands in registers that are operands to the // inline asm statement. 
O << "{"; if (ARM::GPRPairRegClass.contains(RegBegin)) { const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); - unsigned Reg0 = TRI->getSubReg(RegBegin, ARM::gsub_0); + Register Reg0 = TRI->getSubReg(RegBegin, ARM::gsub_0); O << ARMInstPrinter::getRegisterName(Reg0) << ", "; RegBegin = TRI->getSubReg(RegBegin, ARM::gsub_1); } @@ -378,8 +378,8 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, if (!MO.isReg()) return true; const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); - unsigned Reg = TRI->getSubReg(MO.getReg(), FirstHalf ? - ARM::gsub_0 : ARM::gsub_1); + Register Reg = + TRI->getSubReg(MO.getReg(), FirstHalf ? ARM::gsub_0 : ARM::gsub_1); O << ARMInstPrinter::getRegisterName(Reg); return false; } @@ -391,7 +391,7 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, const MachineOperand &MO = MI->getOperand(RegOp); if (!MO.isReg()) return true; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); O << ARMInstPrinter::getRegisterName(Reg); return false; } @@ -400,12 +400,12 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, case 'f': { // The high doubleword register of a NEON quad register. if (!MI->getOperand(OpNum).isReg()) return true; - unsigned Reg = MI->getOperand(OpNum).getReg(); + Register Reg = MI->getOperand(OpNum).getReg(); if (!ARM::QPRRegClass.contains(Reg)) return true; const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); - unsigned SubReg = TRI->getSubReg(Reg, ExtraCode[0] == 'e' ? - ARM::dsub_0 : ARM::dsub_1); + Register SubReg = + TRI->getSubReg(Reg, ExtraCode[0] == 'e' ? ARM::dsub_0 : ARM::dsub_1); O << ARMInstPrinter::getRegisterName(SubReg); return false; } @@ -419,7 +419,7 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, return true; const MachineFunction &MF = *MI->getParent()->getParent(); const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if(!ARM::GPRPairRegClass.contains(Reg)) return false; Reg = TRI->getSubReg(Reg, ARM::gsub_1); @@ -526,7 +526,7 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) { if (!Stubs.empty()) { // Switch with ".non_lazy_symbol_pointer" directive. OutStreamer->SwitchSection(TLOFMacho.getNonLazySymbolPointerSection()); - EmitAlignment(2); + EmitAlignment(Align(4)); for (auto &Stub : Stubs) emitNonLazySymbolPointer(*OutStreamer, Stub.first, Stub.second); @@ -539,7 +539,7 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) { if (!Stubs.empty()) { // Switch with ".non_lazy_symbol_pointer" directive. OutStreamer->SwitchSection(TLOFMacho.getThreadLocalPointerSection()); - EmitAlignment(2); + EmitAlignment(Align(4)); for (auto &Stub : Stubs) emitNonLazySymbolPointer(*OutStreamer, Stub.first, Stub.second); @@ -940,7 +940,7 @@ void ARMAsmPrinter::EmitJumpTableAddrs(const MachineInstr *MI) { // Make sure the Thumb jump table is 4-byte aligned. This will be a nop for // ARM mode tables. - EmitAlignment(2); + EmitAlignment(Align(4)); // Emit a label for the jump table. MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel(JTI); @@ -986,7 +986,7 @@ void ARMAsmPrinter::EmitJumpTableInsts(const MachineInstr *MI) { // Make sure the Thumb jump table is 4-byte aligned. This will be a nop for // ARM mode tables. - EmitAlignment(2); + EmitAlignment(Align(4)); // Emit a label for the jump table. 
MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel(JTI); @@ -1015,7 +1015,7 @@ void ARMAsmPrinter::EmitJumpTableTBInst(const MachineInstr *MI, unsigned JTI = MO1.getIndex(); if (Subtarget->isThumb1Only()) - EmitAlignment(2); + EmitAlignment(Align(4)); MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel(JTI); OutStreamer->EmitLabel(JTISymbol); @@ -1058,7 +1058,7 @@ void ARMAsmPrinter::EmitJumpTableTBInst(const MachineInstr *MI, OutStreamer->EmitDataRegion(MCDR_DataRegionEnd); // Make sure the next instruction is 2-byte aligned. - EmitAlignment(1); + EmitAlignment(Align(2)); } void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) { @@ -1072,7 +1072,7 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) { MF.getSubtarget().getRegisterInfo(); const MachineRegisterInfo &MachineRegInfo = MF.getRegInfo(); - unsigned FramePtr = TargetRegInfo->getFrameRegister(MF); + Register FramePtr = TargetRegInfo->getFrameRegister(MF); unsigned Opc = MI->getOpcode(); unsigned SrcReg, DstReg; @@ -1136,7 +1136,7 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) { } // Check for registers that are remapped (for a Thumb1 prologue that // saves high registers). - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (unsigned RemappedReg = AFI->EHPrologueRemappedRegs.lookup(Reg)) Reg = RemappedReg; RegList.push_back(Reg); @@ -1326,7 +1326,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { // So here we generate a bl to a small jump pad that does bx rN. // The jump pads are emitted after the function body. - unsigned TReg = MI->getOperand(0).getReg(); + Register TReg = MI->getOperand(0).getReg(); MCSymbol *TRegSym = nullptr; for (std::pair<unsigned, MCSymbol *> &TIP : ThumbIndirectPads) { if (TIP.first == TReg) { @@ -1663,8 +1663,8 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { case ARM::tTBH_JT: { bool Is8Bit = MI->getOpcode() == ARM::tTBB_JT; - unsigned Base = MI->getOperand(0).getReg(); - unsigned Idx = MI->getOperand(1).getReg(); + Register Base = MI->getOperand(0).getReg(); + Register Idx = MI->getOperand(1).getReg(); assert(MI->getOperand(1).isKill() && "We need the index register as scratch!"); // Multiply up idx if necessary. 
@@ -1844,8 +1844,8 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { // b LSJLJEH // movs r0, #1 // LSJLJEH: - unsigned SrcReg = MI->getOperand(0).getReg(); - unsigned ValReg = MI->getOperand(1).getReg(); + Register SrcReg = MI->getOperand(0).getReg(); + Register ValReg = MI->getOperand(1).getReg(); MCSymbol *Label = OutContext.createTempSymbol("SJLJEH", false, true); OutStreamer->AddComment("eh_setjmp begin"); EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tMOVr) @@ -1910,8 +1910,8 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { // mov r0, #0 // add pc, pc, #0 // mov r0, #1 - unsigned SrcReg = MI->getOperand(0).getReg(); - unsigned ValReg = MI->getOperand(1).getReg(); + Register SrcReg = MI->getOperand(0).getReg(); + Register ValReg = MI->getOperand(1).getReg(); OutStreamer->AddComment("eh_setjmp begin"); EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::ADDri) @@ -1967,8 +1967,8 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { // ldr $scratch, [$src, #4] // ldr r7, [$src] // bx $scratch - unsigned SrcReg = MI->getOperand(0).getReg(); - unsigned ScratchReg = MI->getOperand(1).getReg(); + Register SrcReg = MI->getOperand(0).getReg(); + Register ScratchReg = MI->getOperand(1).getReg(); EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::LDRi12) .addReg(ARM::SP) .addReg(SrcReg) @@ -2027,8 +2027,8 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { // ldr $scratch, [$src, #4] // ldr r7, [$src] // bx $scratch - unsigned SrcReg = MI->getOperand(0).getReg(); - unsigned ScratchReg = MI->getOperand(1).getReg(); + Register SrcReg = MI->getOperand(0).getReg(); + Register ScratchReg = MI->getOperand(1).getReg(); EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tLDRi) .addReg(ScratchReg) @@ -2095,7 +2095,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { // ldr.w sp, [$src, #8] // ldr.w pc, [$src, #4] - unsigned SrcReg = MI->getOperand(0).getReg(); + Register SrcReg = MI->getOperand(0).getReg(); EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::t2LDRi12) .addReg(ARM::R11) diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp index 222aa85856a2..684cd1def977 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -172,9 +172,9 @@ MachineInstr *ARMBaseInstrInfo::convertToThreeAddress( const MachineOperand &WB = isLoad ? 
MI.getOperand(1) : MI.getOperand(0); const MachineOperand &Base = MI.getOperand(2); const MachineOperand &Offset = MI.getOperand(NumOps - 3); - unsigned WBReg = WB.getReg(); - unsigned BaseReg = Base.getReg(); - unsigned OffReg = Offset.getReg(); + Register WBReg = WB.getReg(); + Register BaseReg = Base.getReg(); + Register OffReg = Offset.getReg(); unsigned OffImm = MI.getOperand(NumOps - 2).getImm(); ARMCC::CondCodes Pred = (ARMCC::CondCodes)MI.getOperand(NumOps - 1).getImm(); switch (AddrMode) { @@ -276,8 +276,8 @@ MachineInstr *ARMBaseInstrInfo::convertToThreeAddress( if (LV) { for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { MachineOperand &MO = MI.getOperand(i); - if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())) { - unsigned Reg = MO.getReg(); + if (MO.isReg() && Register::isVirtualRegister(MO.getReg())) { + Register Reg = MO.getReg(); LiveVariables::VarInfo &VI = LV->getVarInfo(Reg); if (MO.isDef()) { @@ -966,8 +966,8 @@ void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB, SmallSet<unsigned, 4> DstRegs; #endif for (unsigned i = 0; i != SubRegs; ++i) { - unsigned Dst = TRI->getSubReg(DestReg, BeginIdx + i * Spacing); - unsigned Src = TRI->getSubReg(SrcReg, BeginIdx + i * Spacing); + Register Dst = TRI->getSubReg(DestReg, BeginIdx + i * Spacing); + Register Src = TRI->getSubReg(SrcReg, BeginIdx + i * Spacing); assert(Dst && Src && "Bad sub-register"); #ifndef NDEBUG assert(!DstRegs.count(Src) && "destructive vector copy"); @@ -1019,7 +1019,7 @@ ARMBaseInstrInfo::AddDReg(MachineInstrBuilder &MIB, unsigned Reg, if (!SubIdx) return MIB.addReg(Reg, State); - if (TargetRegisterInfo::isPhysicalRegister(Reg)) + if (Register::isPhysicalRegister(Reg)) return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State); return MIB.addReg(Reg, State, SubIdx); } @@ -1133,7 +1133,8 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, case 24: if (ARM::DTripleRegClass.hasSubClassEq(RC)) { // Use aligned spills if the stack can be realigned. - if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) { + if (Align >= 16 && getRegisterInfo().canRealignStack(MF) && + Subtarget.hasNEON()) { BuildMI(MBB, I, DebugLoc(), get(ARM::VST1d64TPseudo)) .addFrameIndex(FI) .addImm(16) @@ -1155,7 +1156,8 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, break; case 32: if (ARM::QQPRRegClass.hasSubClassEq(RC) || ARM::DQuadRegClass.hasSubClassEq(RC)) { - if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) { + if (Align >= 16 && getRegisterInfo().canRealignStack(MF) && + Subtarget.hasNEON()) { // FIXME: It's possible to only store part of the QQ register if the // spilled def has a sub-register index. 
BuildMI(MBB, I, DebugLoc(), get(ARM::VST1d64QPseudo)) @@ -1337,7 +1339,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MIB = AddDReg(MIB, DestReg, ARM::gsub_1, RegState::DefineNoRead, TRI); } - if (TargetRegisterInfo::isPhysicalRegister(DestReg)) + if (Register::isPhysicalRegister(DestReg)) MIB.addReg(DestReg, RegState::ImplicitDefine); } else llvm_unreachable("Unknown reg class!"); @@ -1368,7 +1370,8 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, break; case 24: if (ARM::DTripleRegClass.hasSubClassEq(RC)) { - if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) { + if (Align >= 16 && getRegisterInfo().canRealignStack(MF) && + Subtarget.hasNEON()) { BuildMI(MBB, I, DL, get(ARM::VLD1d64TPseudo), DestReg) .addFrameIndex(FI) .addImm(16) @@ -1382,7 +1385,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::DefineNoRead, TRI); MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead, TRI); MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead, TRI); - if (TargetRegisterInfo::isPhysicalRegister(DestReg)) + if (Register::isPhysicalRegister(DestReg)) MIB.addReg(DestReg, RegState::ImplicitDefine); } } else @@ -1390,7 +1393,8 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, break; case 32: if (ARM::QQPRRegClass.hasSubClassEq(RC) || ARM::DQuadRegClass.hasSubClassEq(RC)) { - if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) { + if (Align >= 16 && getRegisterInfo().canRealignStack(MF) && + Subtarget.hasNEON()) { BuildMI(MBB, I, DL, get(ARM::VLD1d64QPseudo), DestReg) .addFrameIndex(FI) .addImm(16) @@ -1405,7 +1409,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead, TRI); MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead, TRI); MIB = AddDReg(MIB, DestReg, ARM::dsub_3, RegState::DefineNoRead, TRI); - if (TargetRegisterInfo::isPhysicalRegister(DestReg)) + if (Register::isPhysicalRegister(DestReg)) MIB.addReg(DestReg, RegState::ImplicitDefine); } } else @@ -1425,7 +1429,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MIB = AddDReg(MIB, DestReg, ARM::dsub_5, RegState::DefineNoRead, TRI); MIB = AddDReg(MIB, DestReg, ARM::dsub_6, RegState::DefineNoRead, TRI); MIB = AddDReg(MIB, DestReg, ARM::dsub_7, RegState::DefineNoRead, TRI); - if (TargetRegisterInfo::isPhysicalRegister(DestReg)) + if (Register::isPhysicalRegister(DestReg)) MIB.addReg(DestReg, RegState::ImplicitDefine); } else llvm_unreachable("Unknown reg class!"); @@ -1583,8 +1587,8 @@ bool ARMBaseInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { // Look for a copy between even S-registers. That is where we keep floats // when using NEON v2f32 instructions for f32 arithmetic. 
- unsigned DstRegS = MI.getOperand(0).getReg(); - unsigned SrcRegS = MI.getOperand(1).getReg(); + Register DstRegS = MI.getOperand(0).getReg(); + Register SrcRegS = MI.getOperand(1).getReg(); if (!ARM::SPRRegClass.contains(DstRegS, SrcRegS)) return false; @@ -1794,12 +1798,11 @@ bool ARMBaseInstrInfo::produceSameValue(const MachineInstr &MI0, if (MI0.getNumOperands() != MI1.getNumOperands()) return false; - unsigned Addr0 = MI0.getOperand(1).getReg(); - unsigned Addr1 = MI1.getOperand(1).getReg(); + Register Addr0 = MI0.getOperand(1).getReg(); + Register Addr1 = MI1.getOperand(1).getReg(); if (Addr0 != Addr1) { - if (!MRI || - !TargetRegisterInfo::isVirtualRegister(Addr0) || - !TargetRegisterInfo::isVirtualRegister(Addr1)) + if (!MRI || !Register::isVirtualRegister(Addr0) || + !Register::isVirtualRegister(Addr1)) return false; // This assumes SSA form. @@ -2076,6 +2079,38 @@ isProfitableToIfCvt(MachineBasicBlock &TBB, return PredCost <= UnpredCost; } +unsigned +ARMBaseInstrInfo::extraSizeToPredicateInstructions(const MachineFunction &MF, + unsigned NumInsts) const { + // Thumb2 needs a 2-byte IT instruction to predicate up to 4 instructions. + // ARM has a condition code field in every predicable instruction, using it + // doesn't change code size. + return Subtarget.isThumb2() ? divideCeil(NumInsts, 4) * 2 : 0; +} + +unsigned +ARMBaseInstrInfo::predictBranchSizeForIfCvt(MachineInstr &MI) const { + // If this branch is likely to be folded into the comparison to form a + // CB(N)Z, then removing it won't reduce code size at all, because that will + // just replace the CB(N)Z with a CMP. + if (MI.getOpcode() == ARM::t2Bcc && + findCMPToFoldIntoCBZ(&MI, &getRegisterInfo())) + return 0; + + unsigned Size = getInstSizeInBytes(MI); + + // For Thumb2, all branches are 32-bit instructions during the if conversion + // pass, but may be replaced with 16-bit instructions during size reduction. + // Since the branches considered by if conversion tend to be forward branches + // over small basic blocks, they are very likely to be in range for the + // narrow instructions, so we assume the final code size will be half what it + // currently is. + if (Subtarget.isThumb2()) + Size /= 2; + + return Size; +} + bool ARMBaseInstrInfo::isProfitableToUnpredicate(MachineBasicBlock &TMBB, MachineBasicBlock &FMBB) const { @@ -2141,7 +2176,7 @@ MachineInstr *ARMBaseInstrInfo::commuteInstructionImpl(MachineInstr &MI, MachineInstr * ARMBaseInstrInfo::canFoldIntoMOVCC(unsigned Reg, const MachineRegisterInfo &MRI, const TargetInstrInfo *TII) const { - if (!TargetRegisterInfo::isVirtualRegister(Reg)) + if (!Register::isVirtualRegister(Reg)) return nullptr; if (!MRI.hasOneNonDBGUse(Reg)) return nullptr; @@ -2163,7 +2198,7 @@ ARMBaseInstrInfo::canFoldIntoMOVCC(unsigned Reg, const MachineRegisterInfo &MRI, // MI can't have any tied operands, that would conflict with predication. if (MO.isTied()) return nullptr; - if (TargetRegisterInfo::isPhysicalRegister(MO.getReg())) + if (Register::isPhysicalRegister(MO.getReg())) return nullptr; if (MO.isDef() && !MO.isDead()) return nullptr; @@ -2211,7 +2246,7 @@ ARMBaseInstrInfo::optimizeSelect(MachineInstr &MI, // Find new register class to use. MachineOperand FalseReg = MI.getOperand(Invert ? 
2 : 1); - unsigned DestReg = MI.getOperand(0).getReg(); + Register DestReg = MI.getOperand(0).getReg(); const TargetRegisterClass *PreviousClass = MRI.getRegClass(FalseReg.getReg()); if (!MRI.constrainRegClass(DestReg, PreviousClass)) return nullptr; @@ -2298,6 +2333,7 @@ static const AddSubFlagsOpcodePair AddSubFlagsOpcodeMap[] = { {ARM::tSUBSrr, ARM::tSUBrr}, {ARM::tSBCS, ARM::tSBC}, {ARM::tRSBS, ARM::tRSB}, + {ARM::tLSLSri, ARM::tLSLri}, {ARM::t2ADDSri, ARM::t2ADDri}, {ARM::t2ADDSrr, ARM::t2ADDrr}, @@ -2420,7 +2456,8 @@ bool llvm::tryFoldSPUpdateIntoPushPop(const ARMSubtarget &Subtarget, MachineOperand &MO = MI->getOperand(i); RegList.push_back(MO); - if (MO.isReg() && TRI->getEncodingValue(MO.getReg()) < FirstRegEnc) + if (MO.isReg() && !MO.isImplicit() && + TRI->getEncodingValue(MO.getReg()) < FirstRegEnc) FirstRegEnc = TRI->getEncodingValue(MO.getReg()); } @@ -2430,7 +2467,7 @@ bool llvm::tryFoldSPUpdateIntoPushPop(const ARMSubtarget &Subtarget, for (int CurRegEnc = FirstRegEnc - 1; CurRegEnc >= 0 && RegsNeeded; --CurRegEnc) { unsigned CurReg = RegClass->getRegister(CurRegEnc); - if (IsT1PushPop && CurReg > ARM::R7) + if (IsT1PushPop && CurRegEnc > TRI->getEncodingValue(ARM::R7)) continue; if (!IsPop) { // Pushing any register is completely harmless, mark the register involved @@ -3039,18 +3076,22 @@ bool ARMBaseInstrInfo::optimizeCompareInstr( break; case ARM::VSELEQD: case ARM::VSELEQS: + case ARM::VSELEQH: CC = ARMCC::EQ; break; case ARM::VSELGTD: case ARM::VSELGTS: + case ARM::VSELGTH: CC = ARMCC::GT; break; case ARM::VSELGED: case ARM::VSELGES: + case ARM::VSELGEH: CC = ARMCC::GE; break; - case ARM::VSELVSS: case ARM::VSELVSD: + case ARM::VSELVSS: + case ARM::VSELVSH: CC = ARMCC::VS; break; } @@ -3271,9 +3312,9 @@ bool ARMBaseInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, } unsigned OpIdx = Commute ? 2 : 1; - unsigned Reg1 = UseMI.getOperand(OpIdx).getReg(); + Register Reg1 = UseMI.getOperand(OpIdx).getReg(); bool isKill = UseMI.getOperand(OpIdx).isKill(); - unsigned NewReg = MRI->createVirtualRegister(MRI->getRegClass(Reg)); + Register NewReg = MRI->createVirtualRegister(MRI->getRegClass(Reg)); BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), get(NewUseOpc), NewReg) .addReg(Reg1, getKillRegState(isKill)) @@ -3335,15 +3376,15 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData, case ARM::LDRSB_POST: case ARM::LDRSH_POST: { - unsigned Rt = MI.getOperand(0).getReg(); - unsigned Rm = MI.getOperand(3).getReg(); + Register Rt = MI.getOperand(0).getReg(); + Register Rm = MI.getOperand(3).getReg(); return (Rt == Rm) ? 
4 : 3; } case ARM::LDR_PRE_REG: case ARM::LDRB_PRE_REG: { - unsigned Rt = MI.getOperand(0).getReg(); - unsigned Rm = MI.getOperand(3).getReg(); + Register Rt = MI.getOperand(0).getReg(); + Register Rm = MI.getOperand(3).getReg(); if (Rt == Rm) return 3; unsigned ShOpVal = MI.getOperand(4).getImm(); @@ -3372,8 +3413,8 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData, case ARM::LDRH_PRE: case ARM::STRH_PRE: { - unsigned Rt = MI.getOperand(0).getReg(); - unsigned Rm = MI.getOperand(3).getReg(); + Register Rt = MI.getOperand(0).getReg(); + Register Rm = MI.getOperand(3).getReg(); if (!Rm) return 2; if (Rt == Rm) @@ -3384,8 +3425,8 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData, case ARM::LDR_POST_REG: case ARM::LDRB_POST_REG: case ARM::LDRH_POST: { - unsigned Rt = MI.getOperand(0).getReg(); - unsigned Rm = MI.getOperand(3).getReg(); + Register Rt = MI.getOperand(0).getReg(); + Register Rm = MI.getOperand(3).getReg(); return (Rt == Rm) ? 3 : 2; } @@ -3404,10 +3445,10 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData, case ARM::LDRSB_PRE: case ARM::LDRSH_PRE: { - unsigned Rm = MI.getOperand(3).getReg(); + Register Rm = MI.getOperand(3).getReg(); if (Rm == 0) return 3; - unsigned Rt = MI.getOperand(0).getReg(); + Register Rt = MI.getOperand(0).getReg(); if (Rt == Rm) return 4; unsigned ShOpVal = MI.getOperand(4).getImm(); @@ -3422,9 +3463,9 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData, } case ARM::LDRD: { - unsigned Rt = MI.getOperand(0).getReg(); - unsigned Rn = MI.getOperand(2).getReg(); - unsigned Rm = MI.getOperand(3).getReg(); + Register Rt = MI.getOperand(0).getReg(); + Register Rn = MI.getOperand(2).getReg(); + Register Rm = MI.getOperand(3).getReg(); if (Rm) return (ARM_AM::getAM3Op(MI.getOperand(4).getImm()) == ARM_AM::sub) ? 4 : 3; @@ -3432,7 +3473,7 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData, } case ARM::STRD: { - unsigned Rm = MI.getOperand(3).getReg(); + Register Rm = MI.getOperand(3).getReg(); if (Rm) return (ARM_AM::getAM3Op(MI.getOperand(4).getImm()) == ARM_AM::sub) ? 4 : 3; @@ -3448,9 +3489,9 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData, return 4; case ARM::LDRD_PRE: { - unsigned Rt = MI.getOperand(0).getReg(); - unsigned Rn = MI.getOperand(3).getReg(); - unsigned Rm = MI.getOperand(4).getReg(); + Register Rt = MI.getOperand(0).getReg(); + Register Rn = MI.getOperand(3).getReg(); + Register Rm = MI.getOperand(4).getReg(); if (Rm) return (ARM_AM::getAM3Op(MI.getOperand(5).getImm()) == ARM_AM::sub) ? 5 : 4; @@ -3458,13 +3499,13 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData, } case ARM::t2LDRD_PRE: { - unsigned Rt = MI.getOperand(0).getReg(); - unsigned Rn = MI.getOperand(3).getReg(); + Register Rt = MI.getOperand(0).getReg(); + Register Rn = MI.getOperand(3).getReg(); return (Rt == Rn) ? 4 : 3; } case ARM::STRD_PRE: { - unsigned Rm = MI.getOperand(4).getReg(); + Register Rm = MI.getOperand(4).getReg(); if (Rm) return (ARM_AM::getAM3Op(MI.getOperand(5).getImm()) == ARM_AM::sub) ? 5 : 4; @@ -3495,8 +3536,8 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData, return 2; case ARM::t2LDRDi8: { - unsigned Rt = MI.getOperand(0).getReg(); - unsigned Rn = MI.getOperand(2).getReg(); + Register Rt = MI.getOperand(0).getReg(); + Register Rn = MI.getOperand(2).getReg(); return (Rt == Rn) ? 
3 : 2; } @@ -3745,7 +3786,7 @@ ARMBaseInstrInfo::getVLDMDefCycle(const InstrItineraryData *ItinData, } bool ARMBaseInstrInfo::isLDMBaseRegInList(const MachineInstr &MI) const { - unsigned BaseReg = MI.getOperand(0).getReg(); + Register BaseReg = MI.getOperand(0).getReg(); for (unsigned i = 1, sz = MI.getNumOperands(); i < sz; ++i) { const auto &Op = MI.getOperand(i); if (Op.isReg() && Op.getReg() == BaseReg) @@ -4219,7 +4260,7 @@ int ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, return -1; const MachineOperand &DefMO = DefMI.getOperand(DefIdx); - unsigned Reg = DefMO.getReg(); + Register Reg = DefMO.getReg(); const MachineInstr *ResolvedDefMI = &DefMI; unsigned DefAdj = 0; @@ -4328,10 +4369,10 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, } const MCInstrDesc &UseMCID = get(UseNode->getMachineOpcode()); - const MachineSDNode *DefMN = dyn_cast<MachineSDNode>(DefNode); + auto *DefMN = cast<MachineSDNode>(DefNode); unsigned DefAlign = !DefMN->memoperands_empty() ? (*DefMN->memoperands_begin())->getAlignment() : 0; - const MachineSDNode *UseMN = dyn_cast<MachineSDNode>(UseNode); + auto *UseMN = cast<MachineSDNode>(UseNode); unsigned UseAlign = !UseMN->memoperands_empty() ? (*UseMN->memoperands_begin())->getAlignment() : 0; int Latency = getOperandLatency(ItinData, DefMCID, DefIdx, DefAlign, @@ -4708,7 +4749,7 @@ bool ARMBaseInstrInfo::verifyInstruction(const MachineInstr &MI, if (MI.getOperand(i).isImplicit() || !MI.getOperand(i).isReg()) continue; - unsigned Reg = MI.getOperand(i).getReg(); + Register Reg = MI.getOperand(i).getReg(); if (Reg < ARM::R0 || Reg > ARM::R7) { if (!(MI.getOpcode() == ARM::tPUSH && Reg == ARM::LR) && !(MI.getOpcode() == ARM::tPOP_RET && Reg == ARM::PC)) { @@ -4731,7 +4772,7 @@ void ARMBaseInstrInfo::expandLoadStackGuardBase(MachineBasicBlock::iterator MI, MachineBasicBlock &MBB = *MI->getParent(); DebugLoc DL = MI->getDebugLoc(); - unsigned Reg = MI->getOperand(0).getReg(); + Register Reg = MI->getOperand(0).getReg(); const GlobalValue *GV = cast<GlobalValue>((*MI->memoperands_begin())->getValue()); MachineInstrBuilder MIB; @@ -5104,7 +5145,7 @@ unsigned ARMBaseInstrInfo::getPartialRegUpdateClearance( const MachineOperand &MO = MI.getOperand(OpNum); if (MO.readsReg()) return 0; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); int UseOp = -1; switch (MI.getOpcode()) { @@ -5134,7 +5175,7 @@ unsigned ARMBaseInstrInfo::getPartialRegUpdateClearance( return 0; // We must be able to clobber the whole D-reg. - if (TargetRegisterInfo::isVirtualRegister(Reg)) { + if (Register::isVirtualRegister(Reg)) { // Virtual register must be a def undef foo:ssub_0 operand. if (!MO.getSubReg() || MI.readsVirtualRegister(Reg)) return 0; @@ -5159,8 +5200,8 @@ void ARMBaseInstrInfo::breakPartialRegDependency( assert(TRI && "Need TRI instance"); const MachineOperand &MO = MI.getOperand(OpNum); - unsigned Reg = MO.getReg(); - assert(TargetRegisterInfo::isPhysicalRegister(Reg) && + Register Reg = MO.getReg(); + assert(Register::isPhysicalRegister(Reg) && "Can't break virtual register dependencies."); unsigned DReg = Reg; @@ -5337,7 +5378,7 @@ MachineInstr *llvm::findCMPToFoldIntoCBZ(MachineInstr *Br, // is not redefined between the cmp and the br. 
if (CmpMI->getOpcode() != ARM::tCMPi8 && CmpMI->getOpcode() != ARM::t2CMPri) return nullptr; - unsigned Reg = CmpMI->getOperand(0).getReg(); + Register Reg = CmpMI->getOperand(0).getReg(); unsigned PredReg = 0; ARMCC::CondCodes Pred = getInstrPredicate(*CmpMI, PredReg); if (Pred != ARMCC::AL || CmpMI->getOperand(1).getImm() != 0) @@ -5349,3 +5390,50 @@ MachineInstr *llvm::findCMPToFoldIntoCBZ(MachineInstr *Br, return &*CmpMI; } + +unsigned llvm::ConstantMaterializationCost(unsigned Val, + const ARMSubtarget *Subtarget, + bool ForCodesize) { + if (Subtarget->isThumb()) { + if (Val <= 255) // MOV + return ForCodesize ? 2 : 1; + if (Subtarget->hasV6T2Ops() && (Val <= 0xffff || // MOV + ARM_AM::getT2SOImmVal(Val) != -1 || // MOVW + ARM_AM::getT2SOImmVal(~Val) != -1)) // MVN + return ForCodesize ? 4 : 1; + if (Val <= 510) // MOV + ADDi8 + return ForCodesize ? 4 : 2; + if (~Val <= 255) // MOV + MVN + return ForCodesize ? 4 : 2; + if (ARM_AM::isThumbImmShiftedVal(Val)) // MOV + LSL + return ForCodesize ? 4 : 2; + } else { + if (ARM_AM::getSOImmVal(Val) != -1) // MOV + return ForCodesize ? 4 : 1; + if (ARM_AM::getSOImmVal(~Val) != -1) // MVN + return ForCodesize ? 4 : 1; + if (Subtarget->hasV6T2Ops() && Val <= 0xffff) // MOVW + return ForCodesize ? 4 : 1; + if (ARM_AM::isSOImmTwoPartVal(Val)) // two instrs + return ForCodesize ? 8 : 2; + } + if (Subtarget->useMovt()) // MOVW + MOVT + return ForCodesize ? 8 : 2; + return ForCodesize ? 8 : 3; // Literal pool load +} + +bool llvm::HasLowerConstantMaterializationCost(unsigned Val1, unsigned Val2, + const ARMSubtarget *Subtarget, + bool ForCodesize) { + // Check with ForCodesize + unsigned Cost1 = ConstantMaterializationCost(Val1, Subtarget, ForCodesize); + unsigned Cost2 = ConstantMaterializationCost(Val2, Subtarget, ForCodesize); + if (Cost1 < Cost2) + return true; + if (Cost1 > Cost2) + return false; + + // If they are equal, try with !ForCodesize + return ConstantMaterializationCost(Val1, Subtarget, !ForCodesize) < + ConstantMaterializationCost(Val2, Subtarget, !ForCodesize); +} diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h index c28983fcc15c..c232b6f0b45d 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/lib/Target/ARM/ARMBaseInstrInfo.h @@ -276,6 +276,10 @@ public: return NumCycles == 1; } + unsigned extraSizeToPredicateInstructions(const MachineFunction &MF, + unsigned NumInsts) const override; + unsigned predictBranchSizeForIfCvt(MachineInstr &MI) const override; + bool isProfitableToUnpredicate(MachineBasicBlock &TMBB, MachineBasicBlock &FMBB) const override; @@ -601,7 +605,8 @@ bool rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx, bool rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, unsigned FrameReg, int &Offset, - const ARMBaseInstrInfo &TII); + const ARMBaseInstrInfo &TII, + const TargetRegisterInfo *TRI); /// Return true if Reg is defd between From and To bool registerDefinedBetween(unsigned Reg, MachineBasicBlock::iterator From, @@ -620,6 +625,20 @@ void addPredicatedMveVpredNOp(MachineInstrBuilder &MIB, unsigned Cond); void addPredicatedMveVpredROp(MachineInstrBuilder &MIB, unsigned Cond, unsigned Inactive); +/// Returns the number of instructions required to materialize the given +/// constant in a register, or 3 if a literal pool load is needed. +/// If ForCodesize is specified, an approximate cost in bytes is returned. 
+unsigned ConstantMaterializationCost(unsigned Val, + const ARMSubtarget *Subtarget, + bool ForCodesize = false); + +/// Returns true if Val1 has a lower Constant Materialization Cost than Val2. +/// Uses the cost from ConstantMaterializationCost, first with ForCodesize as +/// specified. If the scores are equal, return the comparison for !ForCodesize. +bool HasLowerConstantMaterializationCost(unsigned Val1, unsigned Val2, + const ARMSubtarget *Subtarget, + bool ForCodesize = false); + } // end namespace llvm #endif // LLVM_LIB_TARGET_ARM_ARMBASEINSTRINFO_H diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp index dc99b37742da..1eaf871867e0 100644 --- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -174,6 +174,12 @@ ARMBaseRegisterInfo::getThisReturnPreservedMask(const MachineFunction &MF, : CSR_AAPCS_ThisReturn_RegMask; } +ArrayRef<MCPhysReg> ARMBaseRegisterInfo::getIntraCallClobberedRegs( + const MachineFunction *MF) const { + static const MCPhysReg IntraCallClobberedRegs[] = {ARM::R12}; + return ArrayRef<MCPhysReg>(IntraCallClobberedRegs); +} + BitVector ARMBaseRegisterInfo:: getReservedRegs(const MachineFunction &MF) const { const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>(); @@ -185,7 +191,7 @@ getReservedRegs(const MachineFunction &MF) const { markSuperRegs(Reserved, ARM::PC); markSuperRegs(Reserved, ARM::FPSCR); markSuperRegs(Reserved, ARM::APSR_NZCV); - if (TFI->hasFP(MF)) + if (TFI->hasFP(MF) || STI.isTargetDarwin()) markSuperRegs(Reserved, getFramePointerReg(STI)); if (hasBasePointer(MF)) markSuperRegs(Reserved, BasePtr); @@ -217,7 +223,7 @@ isAsmClobberable(const MachineFunction &MF, unsigned PhysReg) const { const TargetRegisterClass * ARMBaseRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC, - const MachineFunction &) const { + const MachineFunction &MF) const { const TargetRegisterClass *Super = RC; TargetRegisterClass::sc_iterator I = RC->getSuperClasses(); do { @@ -225,11 +231,13 @@ ARMBaseRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC, case ARM::GPRRegClassID: case ARM::SPRRegClassID: case ARM::DPRRegClassID: + case ARM::GPRPairRegClassID: + return Super; case ARM::QPRRegClassID: case ARM::QQPRRegClassID: case ARM::QQQQPRRegClassID: - case ARM::GPRPairRegClassID: - return Super; + if (MF.getSubtarget<ARMSubtarget>().hasNEON()) + return Super; } Super = *I++; } while (Super); @@ -317,7 +325,7 @@ ARMBaseRegisterInfo::getRegAllocationHints(unsigned VirtReg, return false; unsigned PairedPhys = 0; - if (TargetRegisterInfo::isPhysicalRegister(Paired)) { + if (Register::isPhysicalRegister(Paired)) { PairedPhys = Paired; } else if (VRM && VRM->hasPhys(Paired)) { PairedPhys = getPairedGPR(VRM->getPhys(Paired), Odd, this); @@ -347,7 +355,7 @@ ARMBaseRegisterInfo::updateRegAllocHint(unsigned Reg, unsigned NewReg, std::pair<unsigned, unsigned> Hint = MRI->getRegAllocationHint(Reg); if ((Hint.first == (unsigned)ARMRI::RegPairOdd || Hint.first == (unsigned)ARMRI::RegPairEven) && - TargetRegisterInfo::isVirtualRegister(Hint.second)) { + Register::isVirtualRegister(Hint.second)) { // If 'Reg' is one of the even / odd register pair and it's now changed // (e.g. coalesced) into a different register. The other register of the // pair allocation hint must be updated to reflect the relationship @@ -357,7 +365,7 @@ ARMBaseRegisterInfo::updateRegAllocHint(unsigned Reg, unsigned NewReg, // Make sure the pair has not already divorced. 
if (Hint.second == Reg) { MRI->setRegAllocationHint(OtherReg, Hint.first, NewReg); - if (TargetRegisterInfo::isVirtualRegister(NewReg)) + if (Register::isVirtualRegister(NewReg)) MRI->setRegAllocationHint(NewReg, Hint.first == (unsigned)ARMRI::RegPairOdd ? ARMRI::RegPairEven : ARMRI::RegPairOdd, OtherReg); @@ -663,7 +671,7 @@ void ARMBaseRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, Done = rewriteARMFrameIndex(MI, i, BaseReg, Off, TII); else { assert(AFI->isThumb2Function()); - Done = rewriteT2FrameIndex(MI, i, BaseReg, Off, TII); + Done = rewriteT2FrameIndex(MI, i, BaseReg, Off, TII, this); } assert(Done && "Unable to resolve frame index!"); (void)Done; @@ -775,7 +783,7 @@ ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, Done = rewriteARMFrameIndex(MI, FIOperandNum, FrameReg, Offset, TII); else { assert(AFI->isThumb2Function()); - Done = rewriteT2FrameIndex(MI, FIOperandNum, FrameReg, Offset, TII); + Done = rewriteT2FrameIndex(MI, FIOperandNum, FrameReg, Offset, TII, this); } if (Done) return; @@ -783,21 +791,32 @@ ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // If we get here, the immediate doesn't fit into the instruction. We folded // as much as possible above, handle the rest, providing a register that is // SP+LargeImm. - assert((Offset || - (MI.getDesc().TSFlags & ARMII::AddrModeMask) == ARMII::AddrMode4 || - (MI.getDesc().TSFlags & ARMII::AddrModeMask) == ARMII::AddrMode6) && - "This code isn't needed if offset already handled!"); + assert( + (Offset || + (MI.getDesc().TSFlags & ARMII::AddrModeMask) == ARMII::AddrMode4 || + (MI.getDesc().TSFlags & ARMII::AddrModeMask) == ARMII::AddrMode6 || + (MI.getDesc().TSFlags & ARMII::AddrModeMask) == ARMII::AddrModeT2_i7 || + (MI.getDesc().TSFlags & ARMII::AddrModeMask) == ARMII::AddrModeT2_i7s2 || + (MI.getDesc().TSFlags & ARMII::AddrModeMask) == + ARMII::AddrModeT2_i7s4) && + "This code isn't needed if offset already handled!"); unsigned ScratchReg = 0; int PIdx = MI.findFirstPredOperandIdx(); ARMCC::CondCodes Pred = (PIdx == -1) ? ARMCC::AL : (ARMCC::CondCodes)MI.getOperand(PIdx).getImm(); Register PredReg = (PIdx == -1) ? Register() : MI.getOperand(PIdx+1).getReg(); - if (Offset == 0) + + const MCInstrDesc &MCID = MI.getDesc(); + const TargetRegisterClass *RegClass = + TII.getRegClass(MCID, FIOperandNum, this, *MI.getParent()->getParent()); + + if (Offset == 0 && + (Register::isVirtualRegister(FrameReg) || RegClass->contains(FrameReg))) // Must be addrmode4/6. 
MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false, false, false); else { - ScratchReg = MF.getRegInfo().createVirtualRegister(&ARM::GPRRegClass); + ScratchReg = MF.getRegInfo().createVirtualRegister(RegClass); if (!AFI->isThumbFunction()) emitARMRegPlusImmediate(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, Offset, Pred, PredReg, TII); diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.h b/lib/Target/ARM/ARMBaseRegisterInfo.h index 7e2c72b4d712..477f3ad0a9a7 100644 --- a/lib/Target/ARM/ARMBaseRegisterInfo.h +++ b/lib/Target/ARM/ARMBaseRegisterInfo.h @@ -129,6 +129,9 @@ public: const uint32_t *getThisReturnPreservedMask(const MachineFunction &MF, CallingConv::ID) const; + ArrayRef<MCPhysReg> + getIntraCallClobberedRegs(const MachineFunction *MF) const override; + BitVector getReservedRegs(const MachineFunction &MF) const override; bool isAsmClobberable(const MachineFunction &MF, unsigned PhysReg) const override; @@ -176,8 +179,6 @@ public: Register getFrameRegister(const MachineFunction &MF) const override; unsigned getBaseRegister() const { return BasePtr; } - bool isLowRegister(unsigned Reg) const; - /// emitLoadConstPool - Emits a load from constpool to materialize the /// specified immediate. diff --git a/lib/Target/ARM/ARMBasicBlockInfo.cpp b/lib/Target/ARM/ARMBasicBlockInfo.cpp index 2de90e816b33..00a2231f59e3 100644 --- a/lib/Target/ARM/ARMBasicBlockInfo.cpp +++ b/lib/Target/ARM/ARMBasicBlockInfo.cpp @@ -6,14 +6,16 @@ // //===----------------------------------------------------------------------===// +#include "ARMBasicBlockInfo.h" #include "ARM.h" #include "ARMBaseInstrInfo.h" -#include "ARMBasicBlockInfo.h" #include "ARMMachineFunctionInfo.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/Support/Debug.h" #include <vector> #define DEBUG_TYPE "arm-bb-utils" @@ -47,7 +49,7 @@ void ARMBasicBlockUtils::computeBlockSize(MachineBasicBlock *MBB) { BasicBlockInfo &BBI = BBInfo[MBB->getNumber()]; BBI.Size = 0; BBI.Unalign = 0; - BBI.PostAlign = 0; + BBI.PostAlign = Align::None(); for (MachineInstr &I : *MBB) { BBI.Size += TII->getInstSizeInBytes(I); @@ -62,8 +64,8 @@ void ARMBasicBlockUtils::computeBlockSize(MachineBasicBlock *MBB) { // tBR_JTr contains a .align 2 directive. if (!MBB->empty() && MBB->back().getOpcode() == ARM::tBR_JTr) { - BBI.PostAlign = 2; - MBB->getParent()->ensureAlignment(2); + BBI.PostAlign = Align(4); + MBB->getParent()->ensureAlignment(Align(4)); } } @@ -126,9 +128,9 @@ void ARMBasicBlockUtils::adjustBBOffsetsAfter(MachineBasicBlock *BB) { for(unsigned i = BBNum + 1, e = MF.getNumBlockIDs(); i < e; ++i) { // Get the offset and known bits at the end of the layout predecessor. // Include the alignment of the current block. - unsigned LogAlign = MF.getBlockNumbered(i)->getAlignment(); - unsigned Offset = BBInfo[i - 1].postOffset(LogAlign); - unsigned KnownBits = BBInfo[i - 1].postKnownBits(LogAlign); + const Align Align = MF.getBlockNumbered(i)->getAlignment(); + const unsigned Offset = BBInfo[i - 1].postOffset(Align); + const unsigned KnownBits = BBInfo[i - 1].postKnownBits(Align); // This is where block i begins. Stop if the offset is already correct, // and we have updated 2 blocks. 
This is the maximum number of blocks diff --git a/lib/Target/ARM/ARMBasicBlockInfo.h b/lib/Target/ARM/ARMBasicBlockInfo.h index 400bba351cec..13df399ed995 100644 --- a/lib/Target/ARM/ARMBasicBlockInfo.h +++ b/lib/Target/ARM/ARMBasicBlockInfo.h @@ -21,17 +21,18 @@ namespace llvm { +struct BasicBlockInfo; using BBInfoVector = SmallVectorImpl<BasicBlockInfo>; /// UnknownPadding - Return the worst case padding that could result from /// unknown offset bits. This does not include alignment padding caused by /// known offset bits. /// -/// @param LogAlign log2(alignment) +/// @param Alignment alignment /// @param KnownBits Number of known low offset bits. -inline unsigned UnknownPadding(unsigned LogAlign, unsigned KnownBits) { - if (KnownBits < LogAlign) - return (1u << LogAlign) - (1u << KnownBits); +inline unsigned UnknownPadding(Align Alignment, unsigned KnownBits) { + if (KnownBits < Log2(Alignment)) + return Alignment.value() - (1ull << KnownBits); return 0; } @@ -65,10 +66,9 @@ struct BasicBlockInfo { /// multiple of 1 << Unalign. uint8_t Unalign = 0; - /// PostAlign - When non-zero, the block terminator contains a .align - /// directive, so the end of the block is aligned to 1 << PostAlign - /// bytes. - uint8_t PostAlign = 0; + /// PostAlign - When > 1, the block terminator contains a .align + /// directive, so the end of the block is aligned to PostAlign bytes. + Align PostAlign; BasicBlockInfo() = default; @@ -84,16 +84,16 @@ struct BasicBlockInfo { return Bits; } - /// Compute the offset immediately following this block. If LogAlign is + /// Compute the offset immediately following this block. If Align is /// specified, return the offset the successor block will get if it has /// this alignment. - unsigned postOffset(unsigned LogAlign = 0) const { + unsigned postOffset(Align Alignment = Align::None()) const { unsigned PO = Offset + Size; - unsigned LA = std::max(unsigned(PostAlign), LogAlign); - if (!LA) + const Align PA = std::max(PostAlign, Alignment); + if (PA == Align::None()) return PO; // Add alignment padding from the terminator. - return PO + UnknownPadding(LA, internalKnownBits()); + return PO + UnknownPadding(PA, internalKnownBits()); } /// Compute the number of known low bits of postOffset. If this block @@ -101,9 +101,8 @@ struct BasicBlockInfo { /// instruction alignment. An aligned terminator may increase the number /// of know bits. /// If LogAlign is given, also consider the alignment of the next block. 
- unsigned postKnownBits(unsigned LogAlign = 0) const { - return std::max(std::max(unsigned(PostAlign), LogAlign), - internalKnownBits()); + unsigned postKnownBits(Align Align = Align::None()) const { + return std::max(Log2(std::max(PostAlign, Align)), internalKnownBits()); } }; diff --git a/lib/Target/ARM/ARMCallLowering.cpp b/lib/Target/ARM/ARMCallLowering.cpp index 0cbe6e1871e4..d3b595ce8323 100644 --- a/lib/Target/ARM/ARMCallLowering.cpp +++ b/lib/Target/ARM/ARMCallLowering.cpp @@ -90,6 +90,8 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler { MachineInstrBuilder &MIB, CCAssignFn *AssignFn) : ValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {} + bool isIncomingArgumentHandler() const override { return false; } + Register getStackAddress(uint64_t Size, int64_t Offset, MachinePointerInfo &MPO) override { assert((Size == 1 || Size == 2 || Size == 4 || Size == 8) && @@ -169,8 +171,9 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler { bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, - const CallLowering::ArgInfo &Info, CCState &State) override { - if (AssignFn(ValNo, ValVT, LocVT, LocInfo, Info.Flags, State)) + const CallLowering::ArgInfo &Info, ISD::ArgFlagsTy Flags, + CCState &State) override { + if (AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State)) return true; StackSize = @@ -199,9 +202,8 @@ void ARMCallLowering::splitToValueTypes(const ArgInfo &OrigArg, if (SplitVTs.size() == 1) { // Even if there is no splitting to do, we still want to replace the // original type (e.g. pointer type -> integer). - auto Flags = OrigArg.Flags; - unsigned OriginalAlignment = DL.getABITypeAlignment(OrigArg.Ty); - Flags.setOrigAlign(OriginalAlignment); + auto Flags = OrigArg.Flags[0]; + Flags.setOrigAlign(Align(DL.getABITypeAlignment(OrigArg.Ty))); SplitArgs.emplace_back(OrigArg.Regs[0], SplitVTs[0].getTypeForEVT(Ctx), Flags, OrigArg.IsFixed); return; @@ -211,10 +213,9 @@ void ARMCallLowering::splitToValueTypes(const ArgInfo &OrigArg, for (unsigned i = 0, e = SplitVTs.size(); i != e; ++i) { EVT SplitVT = SplitVTs[i]; Type *SplitTy = SplitVT.getTypeForEVT(Ctx); - auto Flags = OrigArg.Flags; + auto Flags = OrigArg.Flags[0]; - unsigned OriginalAlignment = DL.getABITypeAlignment(SplitTy); - Flags.setOrigAlign(OriginalAlignment); + Flags.setOrigAlign(Align(DL.getABITypeAlignment(SplitTy))); bool NeedsConsecutiveRegisters = TLI.functionArgumentNeedsConsecutiveRegisters( @@ -286,7 +287,7 @@ struct IncomingValueHandler : public CallLowering::ValueHandler { CCAssignFn AssignFn) : ValueHandler(MIRBuilder, MRI, AssignFn) {} - bool isArgumentHandler() const override { return true; } + bool isIncomingArgumentHandler() const override { return true; } Register getStackAddress(uint64_t Size, int64_t Offset, MachinePointerInfo &MPO) override { @@ -298,7 +299,7 @@ struct IncomingValueHandler : public CallLowering::ValueHandler { int FI = MFI.CreateFixedObject(Size, Offset, true); MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI); - unsigned AddrReg = + Register AddrReg = MRI.createGenericVirtualRegister(LLT::pointer(MPO.getAddrSpace(), 32)); MIRBuilder.buildFrameIndex(AddrReg, FI); @@ -405,6 +406,7 @@ struct FormalArgHandler : public IncomingValueHandler { : IncomingValueHandler(MIRBuilder, MRI, AssignFn) {} void markPhysRegUsed(unsigned PhysReg) override { + MIRBuilder.getMRI()->addLiveIn(PhysReg); MIRBuilder.getMBB().addLiveIn(PhysReg); } }; @@ -498,11 +500,7 @@ unsigned getCallOpcode(const ARMSubtarget &STI, bool isDirect) { } } // end 
anonymous namespace -bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, - CallingConv::ID CallConv, - const MachineOperand &Callee, - const ArgInfo &OrigRet, - ArrayRef<ArgInfo> OrigArgs) const { +bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info) const { MachineFunction &MF = MIRBuilder.getMF(); const auto &TLI = *getTLI<ARMTargetLowering>(); const auto &DL = MF.getDataLayout(); @@ -520,7 +518,7 @@ bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, // Create the call instruction so we can add the implicit uses of arg // registers, but don't insert it yet. - bool IsDirect = !Callee.isReg(); + bool IsDirect = !Info.Callee.isReg(); auto CallOpcode = getCallOpcode(STI, IsDirect); auto MIB = MIRBuilder.buildInstrNoInsert(CallOpcode); @@ -528,35 +526,35 @@ bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, if (IsThumb) MIB.add(predOps(ARMCC::AL)); - MIB.add(Callee); + MIB.add(Info.Callee); if (!IsDirect) { - auto CalleeReg = Callee.getReg(); - if (CalleeReg && !TRI->isPhysicalRegister(CalleeReg)) { + auto CalleeReg = Info.Callee.getReg(); + if (CalleeReg && !Register::isPhysicalRegister(CalleeReg)) { unsigned CalleeIdx = IsThumb ? 2 : 0; MIB->getOperand(CalleeIdx).setReg(constrainOperandRegClass( MF, *TRI, MRI, *STI.getInstrInfo(), *STI.getRegBankInfo(), - *MIB.getInstr(), MIB->getDesc(), Callee, CalleeIdx)); + *MIB.getInstr(), MIB->getDesc(), Info.Callee, CalleeIdx)); } } - MIB.addRegMask(TRI->getCallPreservedMask(MF, CallConv)); + MIB.addRegMask(TRI->getCallPreservedMask(MF, Info.CallConv)); bool IsVarArg = false; SmallVector<ArgInfo, 8> ArgInfos; - for (auto Arg : OrigArgs) { + for (auto Arg : Info.OrigArgs) { if (!isSupportedType(DL, TLI, Arg.Ty)) return false; if (!Arg.IsFixed) IsVarArg = true; - if (Arg.Flags.isByVal()) + if (Arg.Flags[0].isByVal()) return false; splitToValueTypes(Arg, ArgInfos, MF); } - auto ArgAssignFn = TLI.CCAssignFnForCall(CallConv, IsVarArg); + auto ArgAssignFn = TLI.CCAssignFnForCall(Info.CallConv, IsVarArg); OutgoingValueHandler ArgHandler(MIRBuilder, MRI, MIB, ArgAssignFn); if (!handleAssignments(MIRBuilder, ArgInfos, ArgHandler)) return false; @@ -564,13 +562,13 @@ bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, // Now we can add the actual call instruction to the correct basic block. 
MIRBuilder.insertInstr(MIB); - if (!OrigRet.Ty->isVoidTy()) { - if (!isSupportedType(DL, TLI, OrigRet.Ty)) + if (!Info.OrigRet.Ty->isVoidTy()) { + if (!isSupportedType(DL, TLI, Info.OrigRet.Ty)) return false; ArgInfos.clear(); - splitToValueTypes(OrigRet, ArgInfos, MF); - auto RetAssignFn = TLI.CCAssignFnForReturn(CallConv, IsVarArg); + splitToValueTypes(Info.OrigRet, ArgInfos, MF); + auto RetAssignFn = TLI.CCAssignFnForReturn(Info.CallConv, IsVarArg); CallReturnHandler RetHandler(MIRBuilder, MRI, MIB, RetAssignFn); if (!handleAssignments(MIRBuilder, ArgInfos, RetHandler)) return false; diff --git a/lib/Target/ARM/ARMCallLowering.h b/lib/Target/ARM/ARMCallLowering.h index 794127b5ebc7..ddbc9feb90e2 100644 --- a/lib/Target/ARM/ARMCallLowering.h +++ b/lib/Target/ARM/ARMCallLowering.h @@ -38,9 +38,8 @@ public: bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F, ArrayRef<ArrayRef<Register>> VRegs) const override; - bool lowerCall(MachineIRBuilder &MIRBuilder, CallingConv::ID CallConv, - const MachineOperand &Callee, const ArgInfo &OrigRet, - ArrayRef<ArgInfo> OrigArgs) const override; + bool lowerCall(MachineIRBuilder &MIRBuilder, + CallLoweringInfo &Info) const override; private: bool lowerReturnVal(MachineIRBuilder &MIRBuilder, const Value *Val, diff --git a/lib/Target/ARM/ARMCallingConv.cpp b/lib/Target/ARM/ARMCallingConv.cpp index 5ede7c67f7c2..92ebc542b423 100644 --- a/lib/Target/ARM/ARMCallingConv.cpp +++ b/lib/Target/ARM/ARMCallingConv.cpp @@ -193,7 +193,7 @@ static bool CC_ARM_AAPCS_Custom_Aggregate(unsigned &ValNo, MVT &ValVT, // Try to allocate a contiguous block of registers, each of the correct // size to hold one member. auto &DL = State.getMachineFunction().getDataLayout(); - unsigned StackAlign = DL.getStackAlignment(); + unsigned StackAlign = DL.getStackAlignment().value(); unsigned Align = std::min(PendingMembers[0].getExtraInfo(), StackAlign); ArrayRef<MCPhysReg> RegList; diff --git a/lib/Target/ARM/ARMCodeGenPrepare.cpp b/lib/Target/ARM/ARMCodeGenPrepare.cpp index 2fc5f4aaab50..1c2c8aef55bb 100644 --- a/lib/Target/ARM/ARMCodeGenPrepare.cpp +++ b/lib/Target/ARM/ARMCodeGenPrepare.cpp @@ -179,16 +179,12 @@ public: } static bool GenerateSignBits(Value *V) { - if (auto *Arg = dyn_cast<Argument>(V)) - return Arg->hasSExtAttr(); - if (!isa<Instruction>(V)) return false; unsigned Opc = cast<Instruction>(V)->getOpcode(); return Opc == Instruction::AShr || Opc == Instruction::SDiv || - Opc == Instruction::SRem || Opc == Instruction::SExt || - Opc == Instruction::SIToFP; + Opc == Instruction::SRem || Opc == Instruction::SExt; } static bool EqualTypeSize(Value *V) { @@ -806,54 +802,48 @@ void IRPromoter::Mutate(Type *OrigTy, /// return value is zeroext. We don't allow opcodes that can introduce sign /// bits. bool ARMCodeGenPrepare::isSupportedValue(Value *V) { - if (auto *I = dyn_cast<ICmpInst>(V)) { - // Now that we allow small types than TypeSize, only allow icmp of - // TypeSize because they will require a trunc to be legalised. - // TODO: Allow icmp of smaller types, and calculate at the end - // whether the transform would be beneficial. 
- if (isa<PointerType>(I->getOperand(0)->getType())) + if (auto *I = dyn_cast<Instruction>(V)) { + switch (I->getOpcode()) { + default: + return isa<BinaryOperator>(I) && isSupportedType(I) && + !GenerateSignBits(I); + case Instruction::GetElementPtr: + case Instruction::Store: + case Instruction::Br: + case Instruction::Switch: return true; - return EqualTypeSize(I->getOperand(0)); - } - - if (GenerateSignBits(V)) { - LLVM_DEBUG(dbgs() << "ARM CGP: No, instruction can generate sign bits.\n"); - return false; - } - - // Memory instructions - if (isa<StoreInst>(V) || isa<GetElementPtrInst>(V)) - return true; - - // Branches and targets. - if( isa<BranchInst>(V) || isa<SwitchInst>(V) || isa<BasicBlock>(V)) - return true; - - // Non-instruction values that we can handle. - if ((isa<Constant>(V) && !isa<ConstantExpr>(V)) || isa<Argument>(V)) + case Instruction::PHI: + case Instruction::Select: + case Instruction::Ret: + case Instruction::Load: + case Instruction::Trunc: + case Instruction::BitCast: + return isSupportedType(I); + case Instruction::ZExt: + return isSupportedType(I->getOperand(0)); + case Instruction::ICmp: + // Now that we allow small types than TypeSize, only allow icmp of + // TypeSize because they will require a trunc to be legalised. + // TODO: Allow icmp of smaller types, and calculate at the end + // whether the transform would be beneficial. + if (isa<PointerType>(I->getOperand(0)->getType())) + return true; + return EqualTypeSize(I->getOperand(0)); + case Instruction::Call: { + // Special cases for calls as we need to check for zeroext + // TODO We should accept calls even if they don't have zeroext, as they + // can still be sinks. + auto *Call = cast<CallInst>(I); + return isSupportedType(Call) && + Call->hasRetAttr(Attribute::AttrKind::ZExt); + } + } + } else if (isa<Constant>(V) && !isa<ConstantExpr>(V)) { return isSupportedType(V); - - if (isa<PHINode>(V) || isa<SelectInst>(V) || isa<ReturnInst>(V) || - isa<LoadInst>(V)) + } else if (isa<Argument>(V)) return isSupportedType(V); - if (auto *Cast = dyn_cast<CastInst>(V)) - return isSupportedType(Cast) || isSupportedType(Cast->getOperand(0)); - - // Special cases for calls as we need to check for zeroext - // TODO We should accept calls even if they don't have zeroext, as they can - // still be sinks. 
- if (auto *Call = dyn_cast<CallInst>(V)) - return isSupportedType(Call) && - Call->hasRetAttr(Attribute::AttrKind::ZExt); - - if (!isa<BinaryOperator>(V)) - return false; - - if (!isSupportedType(V)) - return false; - - return true; + return isa<BasicBlock>(V); } /// Check that the type of V would be promoted and that the original type is diff --git a/lib/Target/ARM/ARMConstantIslandPass.cpp b/lib/Target/ARM/ARMConstantIslandPass.cpp index 60e5d7bf6098..24ca25f73e96 100644 --- a/lib/Target/ARM/ARMConstantIslandPass.cpp +++ b/lib/Target/ARM/ARMConstantIslandPass.cpp @@ -26,8 +26,10 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringRef.h" +#include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" @@ -69,6 +71,7 @@ STATISTIC(NumT2BrShrunk, "Number of Thumb2 immediate branches shrunk"); STATISTIC(NumCBZ, "Number of CBZ / CBNZ formed"); STATISTIC(NumJTMoved, "Number of jump table destination blocks moved"); STATISTIC(NumJTInserted, "Number of jump table intermediate blocks inserted"); +STATISTIC(NumLEInserted, "Number of LE backwards branches inserted"); static cl::opt<bool> AdjustJumpTableBlocks("arm-adjust-jump-tables", cl::Hidden, cl::init(true), @@ -212,6 +215,7 @@ namespace { const ARMBaseInstrInfo *TII; const ARMSubtarget *STI; ARMFunctionInfo *AFI; + MachineDominatorTree *DT = nullptr; bool isThumb; bool isThumb1; bool isThumb2; @@ -224,6 +228,11 @@ namespace { bool runOnMachineFunction(MachineFunction &MF) override; + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<MachineDominatorTree>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + MachineFunctionProperties getRequiredProperties() const override { return MachineFunctionProperties().set( MachineFunctionProperties::Property::NoVRegs); @@ -238,7 +247,7 @@ namespace { void doInitialJumpTablePlacement(std::vector<MachineInstr *> &CPEMIs); bool BBHasFallthrough(MachineBasicBlock *MBB); CPEntry *findConstPoolEntry(unsigned CPI, const MachineInstr *CPEMI); - unsigned getCPELogAlign(const MachineInstr *CPEMI); + Align getCPEAlign(const MachineInstr *CPEMI); void scanFunctionJumpTables(); void initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs); MachineBasicBlock *splitBlockBeforeInstr(MachineInstr *MI); @@ -327,8 +336,7 @@ LLVM_DUMP_METHOD void ARMConstantIslands::dumpBBs() { const BasicBlockInfo &BBI = BBInfo[J]; dbgs() << format("%08x %bb.%u\t", BBI.Offset, J) << " kb=" << unsigned(BBI.KnownBits) - << " ua=" << unsigned(BBI.Unalign) - << " pa=" << unsigned(BBI.PostAlign) + << " ua=" << unsigned(BBI.Unalign) << " pa=" << Log2(BBI.PostAlign) << format(" size=%#x\n", BBInfo[J].Size); } }); @@ -349,6 +357,7 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) { isPositionIndependentOrROPI = STI->getTargetLowering()->isPositionIndependent() || STI->isROPI(); AFI = MF->getInfo<ARMFunctionInfo>(); + DT = &getAnalysis<MachineDominatorTree>(); isThumb = AFI->isThumbFunction(); isThumb1 = AFI->isThumb1OnlyFunction(); @@ -357,9 +366,6 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) { HasFarJump = false; bool GenerateTBB = isThumb2 || (isThumb1 && SynthesizeThumb1TBB); - // This pass invalidates liveness information when it splits basic blocks. 
- MF->getRegInfo().invalidateLiveness(); - // Renumber all of the machine basic blocks in the function, guaranteeing that // the numbers agree with the position of the block in the function. MF->RenumberBlocks(); @@ -398,7 +404,7 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) { // Functions with jump tables need an alignment of 4 because they use the ADR // instruction, which aligns the PC to 4 bytes before adding an offset. if (!T2JumpTables.empty()) - MF->ensureAlignment(2); + MF->ensureAlignment(Align(4)); /// Remove dead constant pool entries. MadeChange |= removeUnusedCPEntries(); @@ -487,8 +493,9 @@ ARMConstantIslands::doInitialConstPlacement(std::vector<MachineInstr*> &CPEMIs) MachineBasicBlock *BB = MF->CreateMachineBasicBlock(); MF->push_back(BB); - // MachineConstantPool measures alignment in bytes. We measure in log2(bytes). - unsigned MaxAlign = Log2_32(MCP->getConstantPoolAlignment()); + // MachineConstantPool measures alignment in bytes. + const Align MaxAlign(MCP->getConstantPoolAlignment()); + const unsigned MaxLogAlign = Log2(MaxAlign); // Mark the basic block as required by the const-pool. BB->setAlignment(MaxAlign); @@ -501,7 +508,8 @@ ARMConstantIslands::doInitialConstPlacement(std::vector<MachineInstr*> &CPEMIs) // alignment of all entries as long as BB is sufficiently aligned. Keep // track of the insertion point for each alignment. We are going to bucket // sort the entries as they are created. - SmallVector<MachineBasicBlock::iterator, 8> InsPoint(MaxAlign + 1, BB->end()); + SmallVector<MachineBasicBlock::iterator, 8> InsPoint(MaxLogAlign + 1, + BB->end()); // Add all of the constants from the constant pool to the end block, use an // identity mapping of CPI's to CPE's. @@ -526,7 +534,7 @@ ARMConstantIslands::doInitialConstPlacement(std::vector<MachineInstr*> &CPEMIs) // Ensure that future entries with higher alignment get inserted before // CPEMI. This is bucket sort with iterators. - for (unsigned a = LogAlign + 1; a <= MaxAlign; ++a) + for (unsigned a = LogAlign + 1; a <= MaxLogAlign; ++a) if (InsPoint[a] == InsAt) InsPoint[a] = CPEMI; @@ -640,29 +648,27 @@ ARMConstantIslands::findConstPoolEntry(unsigned CPI, return nullptr; } -/// getCPELogAlign - Returns the required alignment of the constant pool entry -/// represented by CPEMI. Alignment is measured in log2(bytes) units. -unsigned ARMConstantIslands::getCPELogAlign(const MachineInstr *CPEMI) { +/// getCPEAlign - Returns the required alignment of the constant pool entry +/// represented by CPEMI. +Align ARMConstantIslands::getCPEAlign(const MachineInstr *CPEMI) { switch (CPEMI->getOpcode()) { case ARM::CONSTPOOL_ENTRY: break; case ARM::JUMPTABLE_TBB: - return isThumb1 ? 2 : 0; + return isThumb1 ? Align(4) : Align(1); case ARM::JUMPTABLE_TBH: - return isThumb1 ? 2 : 1; + return isThumb1 ? 
Align(4) : Align(2); case ARM::JUMPTABLE_INSTS: - return 1; + return Align(2); case ARM::JUMPTABLE_ADDRS: - return 2; + return Align(4); default: llvm_unreachable("unknown constpool entry kind"); } unsigned CPI = getCombinedIndex(CPEMI); assert(CPI < MCP->getConstants().size() && "Invalid constant pool index."); - unsigned Align = MCP->getConstants()[CPI].getAlignment(); - assert(isPowerOf2_32(Align) && "Invalid CPE alignment"); - return Log2_32(Align); + return Align(MCP->getConstants()[CPI].getAlignment()); } /// scanFunctionJumpTables - Do a scan of the function, building up @@ -687,7 +693,7 @@ initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) { BBInfoVector &BBInfo = BBUtils->getBBInfo(); // The known bits of the entry block offset are determined by the function // alignment. - BBInfo.front().KnownBits = MF->getAlignment(); + BBInfo.front().KnownBits = Log2(MF->getAlignment()); // Compute block offsets and known bits. BBUtils->adjustBBOffsetsAfter(&MF->front()); @@ -824,11 +830,6 @@ initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) { Scale = 2; // +-(offset_8*2) NegOk = true; break; - - case ARM::tLDRHi: - Bits = 5; - Scale = 2; // +(offset_5*2) - break; } // Remember that this is a user of a CP entry. @@ -885,6 +886,13 @@ void ARMConstantIslands::updateForInsertedWaterBlock(MachineBasicBlock *NewBB) { MachineBasicBlock *ARMConstantIslands::splitBlockBeforeInstr(MachineInstr *MI) { MachineBasicBlock *OrigBB = MI->getParent(); + // Collect liveness information at MI. + LivePhysRegs LRs(*MF->getSubtarget().getRegisterInfo()); + LRs.addLiveOuts(*OrigBB); + auto LivenessEnd = ++MachineBasicBlock::iterator(MI).getReverse(); + for (MachineInstr &LiveMI : make_range(OrigBB->rbegin(), LivenessEnd)) + LRs.stepBackward(LiveMI); + // Create a new MBB for the code after the OrigBB. MachineBasicBlock *NewBB = MF->CreateMachineBasicBlock(OrigBB->getBasicBlock()); @@ -913,6 +921,12 @@ MachineBasicBlock *ARMConstantIslands::splitBlockBeforeInstr(MachineInstr *MI) { // OrigBB branches to NewBB. OrigBB->addSuccessor(NewBB); + // Update live-in information in the new block. + MachineRegisterInfo &MRI = MF->getRegInfo(); + for (MCPhysReg L : LRs) + if (!MRI.isReserved(L)) + NewBB->addLiveIn(L); + // Update internal data structures to account for the newly inserted MBB. // This is almost the same as updateForInsertedWaterBlock, except that // the Water goes after OrigBB, not NewBB. @@ -1007,13 +1021,13 @@ bool ARMConstantIslands::isWaterInRange(unsigned UserOffset, MachineBasicBlock* Water, CPUser &U, unsigned &Growth) { BBInfoVector &BBInfo = BBUtils->getBBInfo(); - unsigned CPELogAlign = getCPELogAlign(U.CPEMI); - unsigned CPEOffset = BBInfo[Water->getNumber()].postOffset(CPELogAlign); - unsigned NextBlockOffset, NextBlockAlignment; + const Align CPEAlign = getCPEAlign(U.CPEMI); + const unsigned CPEOffset = BBInfo[Water->getNumber()].postOffset(CPEAlign); + unsigned NextBlockOffset; + Align NextBlockAlignment; MachineFunction::const_iterator NextBlock = Water->getIterator(); if (++NextBlock == MF->end()) { NextBlockOffset = BBInfo[Water->getNumber()].postOffset(); - NextBlockAlignment = 0; } else { NextBlockOffset = BBInfo[NextBlock->getNumber()].Offset; NextBlockAlignment = NextBlock->getAlignment(); @@ -1028,13 +1042,13 @@ bool ARMConstantIslands::isWaterInRange(unsigned UserOffset, Growth = CPEEnd - NextBlockOffset; // Compute the padding that would go at the end of the CPE to align the next // block. 
- Growth += OffsetToAlignment(CPEEnd, 1ULL << NextBlockAlignment); + Growth += offsetToAlignment(CPEEnd, NextBlockAlignment); // If the CPE is to be inserted before the instruction, that will raise // the offset of the instruction. Also account for unknown alignment padding // in blocks between CPE and the user. if (CPEOffset < UserOffset) - UserOffset += Growth + UnknownPadding(MF->getAlignment(), CPELogAlign); + UserOffset += Growth + UnknownPadding(MF->getAlignment(), Log2(CPEAlign)); } else // CPE fits in existing padding. Growth = 0; @@ -1200,8 +1214,8 @@ bool ARMConstantIslands::findAvailableWater(CPUser &U, unsigned UserOffset, // inserting islands between BB0 and BB1 makes other accesses out of range. MachineBasicBlock *UserBB = U.MI->getParent(); BBInfoVector &BBInfo = BBUtils->getBBInfo(); - unsigned MinNoSplitDisp = - BBInfo[UserBB->getNumber()].postOffset(getCPELogAlign(U.CPEMI)); + const Align CPEAlign = getCPEAlign(U.CPEMI); + unsigned MinNoSplitDisp = BBInfo[UserBB->getNumber()].postOffset(CPEAlign); if (CloserWater && MinNoSplitDisp > U.getMaxDisp() / 2) return false; for (water_iterator IP = std::prev(WaterList.end()), B = WaterList.begin();; @@ -1254,7 +1268,7 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex, CPUser &U = CPUsers[CPUserIndex]; MachineInstr *UserMI = U.MI; MachineInstr *CPEMI = U.CPEMI; - unsigned CPELogAlign = getCPELogAlign(CPEMI); + const Align CPEAlign = getCPEAlign(CPEMI); MachineBasicBlock *UserMBB = UserMI->getParent(); BBInfoVector &BBInfo = BBUtils->getBBInfo(); const BasicBlockInfo &UserBBI = BBInfo[UserMBB->getNumber()]; @@ -1267,7 +1281,7 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex, // Size of branch to insert. unsigned Delta = isThumb1 ? 2 : 4; // Compute the offset where the CPE will begin. - unsigned CPEOffset = UserBBI.postOffset(CPELogAlign) + Delta; + unsigned CPEOffset = UserBBI.postOffset(CPEAlign) + Delta; if (isOffsetInRange(UserOffset, CPEOffset, U)) { LLVM_DEBUG(dbgs() << "Split at end of " << printMBBReference(*UserMBB) @@ -1308,11 +1322,11 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex, // Try to split the block so it's fully aligned. Compute the latest split // point where we can add a 4-byte branch instruction, and then align to - // LogAlign which is the largest possible alignment in the function. - unsigned LogAlign = MF->getAlignment(); - assert(LogAlign >= CPELogAlign && "Over-aligned constant pool entry"); + // Align which is the largest possible alignment in the function. 
+ const Align Align = MF->getAlignment();
+ assert(Align >= CPEAlign && "Over-aligned constant pool entry");
unsigned KnownBits = UserBBI.internalKnownBits();
- unsigned UPad = UnknownPadding(LogAlign, KnownBits);
+ unsigned UPad = UnknownPadding(Align, KnownBits);
unsigned BaseInsertOffset = UserOffset + U.getMaxDisp() - UPad;
LLVM_DEBUG(dbgs() << format("Split in middle of big block before %#x",
BaseInsertOffset));
@@ -1323,7 +1337,7 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex,
BaseInsertOffset -= 4;
LLVM_DEBUG(dbgs() << format(", adjusted to %#x", BaseInsertOffset)
- << " la=" << LogAlign << " kb=" << KnownBits
+ << " la=" << Log2(Align) << " kb=" << KnownBits
<< " up=" << UPad << '\n');
// This could point off the end of the block if we've already got constant
@@ -1337,6 +1351,28 @@
BaseInsertOffset = std::max(UserBBI.postOffset() - UPad - 8,
UserOffset + TII->getInstSizeInBytes(*UserMI) + 1);
+ // If the CP reference (ie, UserOffset) is in the first four instructions
+ // after an IT, this recalculated BaseInsertOffset could be in the middle of
+ // an IT block. If it is, change the BaseInsertOffset to just after the
+ // IT block. This still keeps the CP entry in range because of the
+ // following reasons.
+ //   1. The initial BaseInsertOffset calculated is (UserOffset +
+ //   U.getMaxDisp() - UPad).
+ //   2. An IT block is only at most 4 instructions plus the "it" itself (18
+ //   bytes).
+ //   3. All the relevant instructions support a much larger maximum
+ //   displacement.
+ MachineBasicBlock::iterator I = UserMI;
+ ++I;
+ for (unsigned Offset = UserOffset + TII->getInstSizeInBytes(*UserMI),
+ PredReg = 0;
+ I->getOpcode() != ARM::t2IT &&
+ getITInstrPredicate(*I, PredReg) != ARMCC::AL;
+ Offset += TII->getInstSizeInBytes(*I), I = std::next(I)) {
+ BaseInsertOffset =
+ std::max(BaseInsertOffset, Offset + TII->getInstSizeInBytes(*I) + 1);
+ assert(I != UserMBB->end() && "Fell off end of block");
+ }
LLVM_DEBUG(dbgs() << format("Move inside block: %#x\n", BaseInsertOffset));
}
unsigned EndInsertOffset = BaseInsertOffset + 4 + UPad +
@@ -1354,8 +1390,8 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex,
CPUser &U = CPUsers[CPUIndex];
if (!isOffsetInRange(Offset, EndInsertOffset, U)) {
// Shift insertion point by one unit of alignment so it is within reach.
- BaseInsertOffset -= 1u << LogAlign;
- EndInsertOffset -= 1u << LogAlign;
+ BaseInsertOffset -= Align.value();
+ EndInsertOffset -= Align.value();
}
// This is overly conservative, as we don't account for CPEMIs being
// reused within the block, but it doesn't matter much. Also assume CPEs
@@ -1397,9 +1433,10 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex,
}
// We really must not split an IT block.
- LLVM_DEBUG(unsigned PredReg; assert(
- !isThumb || getITInstrPredicate(*MI, PredReg) == ARMCC::AL));
-
+#ifndef NDEBUG
+ unsigned PredReg;
+ assert(!isThumb || getITInstrPredicate(*MI, PredReg) == ARMCC::AL);
+#endif
NewMBB = splitBlockBeforeInstr(&*MI);
}
@@ -1464,9 +1501,9 @@ bool ARMConstantIslands::handleConstantPoolUser(unsigned CPUserIndex,
// Always align the new block because CP entries can be smaller than 4
// bytes. Be careful not to decrease the existing alignment, e.g. NewMBB may
// be an already aligned constant pool block.
- const unsigned Align = isThumb ? 1 : 2;
- if (NewMBB->getAlignment() < Align)
- NewMBB->setAlignment(Align);
+ const Align Alignment = isThumb ? 
Align(2) : Align(4); + if (NewMBB->getAlignment() < Alignment) + NewMBB->setAlignment(Alignment); // Remove the original WaterList entry; we want subsequent insertions in // this vicinity to go after the one we're about to insert. This @@ -1495,7 +1532,7 @@ bool ARMConstantIslands::handleConstantPoolUser(unsigned CPUserIndex, decrementCPEReferenceCount(CPI, CPEMI); // Mark the basic block as aligned as required by the const-pool entry. - NewIsland->setAlignment(getCPELogAlign(U.CPEMI)); + NewIsland->setAlignment(getCPEAlign(U.CPEMI)); // Increase the size of the island block to account for the new entry. BBUtils->adjustBBSize(NewIsland, Size); @@ -1529,10 +1566,11 @@ void ARMConstantIslands::removeDeadCPEMI(MachineInstr *CPEMI) { BBInfo[CPEBB->getNumber()].Size = 0; // This block no longer needs to be aligned. - CPEBB->setAlignment(0); - } else + CPEBB->setAlignment(Align::None()); + } else { // Entries are sorted by descending alignment, so realign from the front. - CPEBB->setAlignment(getCPELogAlign(&*CPEBB->begin())); + CPEBB->setAlignment(getCPEAlign(&*CPEBB->begin())); + } BBUtils->adjustBBOffsetsAfter(CPEBB); // An island has only one predecessor BB and one successor BB. Check if @@ -1620,7 +1658,7 @@ ARMConstantIslands::fixupConditionalBr(ImmBranch &Br) { // L2: ARMCC::CondCodes CC = (ARMCC::CondCodes)MI->getOperand(1).getImm(); CC = ARMCC::getOppositeCondition(CC); - unsigned CCReg = MI->getOperand(2).getReg(); + Register CCReg = MI->getOperand(2).getReg(); // If the branch is at the end of its MBB and that has a fall-through block, // direct the updated conditional branch to the fall-through block. Otherwise, @@ -1778,16 +1816,10 @@ bool ARMConstantIslands::optimizeThumb2Instructions() { return MadeChange; } + bool ARMConstantIslands::optimizeThumb2Branches() { - bool MadeChange = false; - // The order in which branches appear in ImmBranches is approximately their - // order within the function body. By visiting later branches first, we reduce - // the distance between earlier forward branches and their targets, making it - // more likely that the cbn?z optimization, which can only apply to forward - // branches, will succeed. - for (unsigned i = ImmBranches.size(); i != 0; --i) { - ImmBranch &Br = ImmBranches[i-1]; + auto TryShrinkBranch = [this](ImmBranch &Br) { unsigned Opcode = Br.MI->getOpcode(); unsigned NewOpc = 0; unsigned Scale = 1; @@ -1815,47 +1847,115 @@ bool ARMConstantIslands::optimizeThumb2Branches() { BBUtils->adjustBBSize(MBB, -2); BBUtils->adjustBBOffsetsAfter(MBB); ++NumT2BrShrunk; - MadeChange = true; + return true; } } + return false; + }; - Opcode = Br.MI->getOpcode(); - if (Opcode != ARM::tBcc) - continue; + struct ImmCompare { + MachineInstr* MI = nullptr; + unsigned NewOpc = 0; + }; + + auto FindCmpForCBZ = [this](ImmBranch &Br, ImmCompare &ImmCmp, + MachineBasicBlock *DestBB) { + ImmCmp.MI = nullptr; + ImmCmp.NewOpc = 0; // If the conditional branch doesn't kill CPSR, then CPSR can be liveout // so this transformation is not safe. if (!Br.MI->killsRegister(ARM::CPSR)) - continue; + return false; - NewOpc = 0; unsigned PredReg = 0; + unsigned NewOpc = 0; ARMCC::CondCodes Pred = getInstrPredicate(*Br.MI, PredReg); if (Pred == ARMCC::EQ) NewOpc = ARM::tCBZ; else if (Pred == ARMCC::NE) NewOpc = ARM::tCBNZ; - if (!NewOpc) - continue; - MachineBasicBlock *DestBB = Br.MI->getOperand(0).getMBB(); + else + return false; + // Check if the distance is within 126. Subtract starting offset by 2 // because the cmp will be eliminated. 
unsigned BrOffset = BBUtils->getOffsetOf(Br.MI) + 4 - 2; BBInfoVector &BBInfo = BBUtils->getBBInfo(); unsigned DestOffset = BBInfo[DestBB->getNumber()].Offset; if (BrOffset >= DestOffset || (DestOffset - BrOffset) > 126) - continue; + return false; // Search backwards to find a tCMPi8 auto *TRI = STI->getRegisterInfo(); MachineInstr *CmpMI = findCMPToFoldIntoCBZ(Br.MI, TRI); if (!CmpMI || CmpMI->getOpcode() != ARM::tCMPi8) + return false; + + ImmCmp.MI = CmpMI; + ImmCmp.NewOpc = NewOpc; + return true; + }; + + auto TryConvertToLE = [this](ImmBranch &Br, ImmCompare &Cmp) { + if (Br.MI->getOpcode() != ARM::t2Bcc || !STI->hasLOB() || + STI->hasMinSize()) + return false; + + MachineBasicBlock *MBB = Br.MI->getParent(); + MachineBasicBlock *DestBB = Br.MI->getOperand(0).getMBB(); + if (BBUtils->getOffsetOf(MBB) < BBUtils->getOffsetOf(DestBB) || + !BBUtils->isBBInRange(Br.MI, DestBB, 4094)) + return false; + + if (!DT->dominates(DestBB, MBB)) + return false; + + // We queried for the CBN?Z opcode based upon the 'ExitBB', the opposite + // target of Br. So now we need to reverse the condition. + Cmp.NewOpc = Cmp.NewOpc == ARM::tCBZ ? ARM::tCBNZ : ARM::tCBZ; + + MachineInstrBuilder MIB = BuildMI(*MBB, Br.MI, Br.MI->getDebugLoc(), + TII->get(ARM::t2LE)); + MIB.add(Br.MI->getOperand(0)); + Br.MI->eraseFromParent(); + Br.MI = MIB; + ++NumLEInserted; + return true; + }; + + bool MadeChange = false; + + // The order in which branches appear in ImmBranches is approximately their + // order within the function body. By visiting later branches first, we reduce + // the distance between earlier forward branches and their targets, making it + // more likely that the cbn?z optimization, which can only apply to forward + // branches, will succeed. + for (ImmBranch &Br : reverse(ImmBranches)) { + MachineBasicBlock *DestBB = Br.MI->getOperand(0).getMBB(); + MachineBasicBlock *MBB = Br.MI->getParent(); + MachineBasicBlock *ExitBB = &MBB->back() == Br.MI ? + MBB->getFallThrough() : + MBB->back().getOperand(0).getMBB(); + + ImmCompare Cmp; + if (FindCmpForCBZ(Br, Cmp, ExitBB) && TryConvertToLE(Br, Cmp)) { + DestBB = ExitBB; + MadeChange = true; + } else { + FindCmpForCBZ(Br, Cmp, DestBB); + MadeChange |= TryShrinkBranch(Br); + } + + unsigned Opcode = Br.MI->getOpcode(); + if ((Opcode != ARM::tBcc && Opcode != ARM::t2LE) || !Cmp.NewOpc) continue; - unsigned Reg = CmpMI->getOperand(0).getReg(); + Register Reg = Cmp.MI->getOperand(0).getReg(); // Check for Kill flags on Reg. If they are present remove them and set kill // on the new CBZ. 
+ auto *TRI = STI->getRegisterInfo(); MachineBasicBlock::iterator KillMI = Br.MI; bool RegKilled = false; do { @@ -1865,19 +1965,32 @@ bool ARMConstantIslands::optimizeThumb2Branches() { RegKilled = true; break; } - } while (KillMI != CmpMI); + } while (KillMI != Cmp.MI); // Create the new CBZ/CBNZ - MachineBasicBlock *MBB = Br.MI->getParent(); - LLVM_DEBUG(dbgs() << "Fold: " << *CmpMI << " and: " << *Br.MI); + LLVM_DEBUG(dbgs() << "Fold: " << *Cmp.MI << " and: " << *Br.MI); MachineInstr *NewBR = - BuildMI(*MBB, Br.MI, Br.MI->getDebugLoc(), TII->get(NewOpc)) + BuildMI(*MBB, Br.MI, Br.MI->getDebugLoc(), TII->get(Cmp.NewOpc)) .addReg(Reg, getKillRegState(RegKilled)) .addMBB(DestBB, Br.MI->getOperand(0).getTargetFlags()); - CmpMI->eraseFromParent(); - Br.MI->eraseFromParent(); - Br.MI = NewBR; + + Cmp.MI->eraseFromParent(); + BBInfoVector &BBInfo = BBUtils->getBBInfo(); BBInfo[MBB->getNumber()].Size -= 2; + + if (Br.MI->getOpcode() == ARM::tBcc) { + Br.MI->eraseFromParent(); + Br.MI = NewBR; + } else if (&MBB->back() != Br.MI) { + // We've generated an LE and already erased the original conditional + // branch. The CBN?Z is now used to branch to the other successor, so an + // unconditional branch terminator is now redundant. + MachineInstr *LastMI = &MBB->back(); + if (LastMI != Br.MI) { + BBInfo[MBB->getNumber()].Size -= LastMI->getDesc().getSize(); + LastMI->eraseFromParent(); + } + } BBUtils->adjustBBOffsetsAfter(MBB); ++NumCBZ; MadeChange = true; @@ -1931,8 +2044,8 @@ bool ARMConstantIslands::preserveBaseRegister(MachineInstr *JumpMI, // of BaseReg, but only if the t2ADDrs can be removed. // + Some instruction other than t2ADDrs computing the entry. Not seen in // the wild, but we should be careful. - unsigned EntryReg = JumpMI->getOperand(0).getReg(); - unsigned BaseReg = LEAMI->getOperand(0).getReg(); + Register EntryReg = JumpMI->getOperand(0).getReg(); + Register BaseReg = LEAMI->getOperand(0).getReg(); CanDeleteLEA = true; BaseRegKill = false; @@ -2009,7 +2122,7 @@ static void RemoveDeadAddBetweenLEAAndJT(MachineInstr *LEAMI, // but the JT now uses PC. Finds the last ADD (if any) that def's EntryReg // and is not clobbered / used. MachineInstr *RemovableAdd = nullptr; - unsigned EntryReg = JumpMI->getOperand(0).getReg(); + Register EntryReg = JumpMI->getOperand(0).getReg(); // Find the last ADD to set EntryReg MachineBasicBlock::iterator I(LEAMI); @@ -2106,7 +2219,7 @@ bool ARMConstantIslands::optimizeThumb2JumpTables() { // %idx = tLSLri %idx, 2 // %base = tLEApcrelJT // %t = tLDRr %base, %idx - unsigned BaseReg = User.MI->getOperand(0).getReg(); + Register BaseReg = User.MI->getOperand(0).getReg(); if (User.MI->getIterator() == User.MI->getParent()->begin()) continue; @@ -2116,7 +2229,7 @@ bool ARMConstantIslands::optimizeThumb2JumpTables() { !Shift->getOperand(2).isKill()) continue; IdxReg = Shift->getOperand(2).getReg(); - unsigned ShiftedIdxReg = Shift->getOperand(0).getReg(); + Register ShiftedIdxReg = Shift->getOperand(0).getReg(); // It's important that IdxReg is live until the actual TBB/TBH. Most of // the range is checked later, but the LEA might still clobber it and not @@ -2313,6 +2426,10 @@ adjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB) { MachineFunction::iterator MBBI = ++JTBB->getIterator(); MF->insert(MBBI, NewBB); + // Copy live-in information to new block. + for (const MachineBasicBlock::RegisterMaskPair &RegMaskPair : BB->liveins()) + NewBB->addLiveIn(RegMaskPair); + // Add an unconditional branch from NewBB to BB. 
// There doesn't seem to be meaningful DebugInfo available; this doesn't // correspond directly to anything in the source. diff --git a/lib/Target/ARM/ARMConstantPoolValue.cpp b/lib/Target/ARM/ARMConstantPoolValue.cpp index 3bdb0e1ef62d..72c95f441265 100644 --- a/lib/Target/ARM/ARMConstantPoolValue.cpp +++ b/lib/Target/ARM/ARMConstantPoolValue.cpp @@ -17,6 +17,7 @@ #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" #include "llvm/IR/GlobalValue.h" +#include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Type.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Compiler.h" diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp index b32ba3eeea18..563fdda56104 100644 --- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -481,7 +481,7 @@ void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI) { unsigned OpIdx = 0; bool DstIsDead = MI.getOperand(OpIdx).isDead(); - unsigned DstReg = MI.getOperand(OpIdx++).getReg(); + Register DstReg = MI.getOperand(OpIdx++).getReg(); if(TableEntry->RealOpc == ARM::VLD2DUPd8x2 || TableEntry->RealOpc == ARM::VLD2DUPd16x2 || TableEntry->RealOpc == ARM::VLD2DUPd32x2) { @@ -492,7 +492,7 @@ void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI) { assert(RegSpc == OddDblSpc && "Unexpected spacing!"); SubRegIndex = ARM::dsub_1; } - unsigned SubReg = TRI->getSubReg(DstReg, SubRegIndex); + Register SubReg = TRI->getSubReg(DstReg, SubRegIndex); unsigned DstRegPair = TRI->getMatchingSuperReg(SubReg, ARM::dsub_0, &ARM::DPairSpcRegClass); MIB.addReg(DstRegPair, RegState::Define | getDeadRegState(DstIsDead)); @@ -624,7 +624,7 @@ void ARMExpandPseudo::ExpandVST(MachineBasicBlock::iterator &MBBI) { bool SrcIsKill = MI.getOperand(OpIdx).isKill(); bool SrcIsUndef = MI.getOperand(OpIdx).isUndef(); - unsigned SrcReg = MI.getOperand(OpIdx++).getReg(); + Register SrcReg = MI.getOperand(OpIdx++).getReg(); unsigned D0, D1, D2, D3; GetDSubRegs(SrcReg, RegSpc, TRI, D0, D1, D2, D3); MIB.addReg(D0, getUndefRegState(SrcIsUndef)); @@ -760,7 +760,7 @@ void ARMExpandPseudo::ExpandVTBL(MachineBasicBlock::iterator &MBBI, } bool SrcIsKill = MI.getOperand(OpIdx).isKill(); - unsigned SrcReg = MI.getOperand(OpIdx++).getReg(); + Register SrcReg = MI.getOperand(OpIdx++).getReg(); unsigned D0, D1, D2, D3; GetDSubRegs(SrcReg, SingleSpc, TRI, D0, D1, D2, D3); MIB.addReg(D0); @@ -789,6 +789,7 @@ static bool IsAnAddressOperand(const MachineOperand &MO) { case MachineOperand::MO_Immediate: case MachineOperand::MO_CImmediate: case MachineOperand::MO_FPImmediate: + case MachineOperand::MO_ShuffleMask: return false; case MachineOperand::MO_MachineBasicBlock: return true; @@ -828,7 +829,7 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB, unsigned Opcode = MI.getOpcode(); unsigned PredReg = 0; ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg); - unsigned DstReg = MI.getOperand(0).getReg(); + Register DstReg = MI.getOperand(0).getReg(); bool DstIsDead = MI.getOperand(0).isDead(); bool isCC = Opcode == ARM::MOVCCi32imm || Opcode == ARM::t2MOVCCi32imm; const MachineOperand &MO = MI.getOperand(isCC ? 
2 : 1); @@ -932,13 +933,13 @@ bool ARMExpandPseudo::ExpandCMP_SWAP(MachineBasicBlock &MBB, MachineInstr &MI = *MBBI; DebugLoc DL = MI.getDebugLoc(); const MachineOperand &Dest = MI.getOperand(0); - unsigned TempReg = MI.getOperand(1).getReg(); + Register TempReg = MI.getOperand(1).getReg(); // Duplicating undef operands into 2 instructions does not guarantee the same // value on both; However undef should be replaced by xzr anyway. assert(!MI.getOperand(2).isUndef() && "cannot handle undef"); - unsigned AddrReg = MI.getOperand(2).getReg(); - unsigned DesiredReg = MI.getOperand(3).getReg(); - unsigned NewReg = MI.getOperand(4).getReg(); + Register AddrReg = MI.getOperand(2).getReg(); + Register DesiredReg = MI.getOperand(3).getReg(); + Register NewReg = MI.getOperand(4).getReg(); MachineFunction *MF = MBB.getParent(); auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); @@ -1035,8 +1036,8 @@ static void addExclusiveRegPair(MachineInstrBuilder &MIB, MachineOperand &Reg, unsigned Flags, bool IsThumb, const TargetRegisterInfo *TRI) { if (IsThumb) { - unsigned RegLo = TRI->getSubReg(Reg.getReg(), ARM::gsub_0); - unsigned RegHi = TRI->getSubReg(Reg.getReg(), ARM::gsub_1); + Register RegLo = TRI->getSubReg(Reg.getReg(), ARM::gsub_0); + Register RegHi = TRI->getSubReg(Reg.getReg(), ARM::gsub_1); MIB.addReg(RegLo, Flags); MIB.addReg(RegHi, Flags); } else @@ -1051,19 +1052,19 @@ bool ARMExpandPseudo::ExpandCMP_SWAP_64(MachineBasicBlock &MBB, MachineInstr &MI = *MBBI; DebugLoc DL = MI.getDebugLoc(); MachineOperand &Dest = MI.getOperand(0); - unsigned TempReg = MI.getOperand(1).getReg(); + Register TempReg = MI.getOperand(1).getReg(); // Duplicating undef operands into 2 instructions does not guarantee the same // value on both; However undef should be replaced by xzr anyway. assert(!MI.getOperand(2).isUndef() && "cannot handle undef"); - unsigned AddrReg = MI.getOperand(2).getReg(); - unsigned DesiredReg = MI.getOperand(3).getReg(); + Register AddrReg = MI.getOperand(2).getReg(); + Register DesiredReg = MI.getOperand(3).getReg(); MachineOperand New = MI.getOperand(4); New.setIsKill(false); - unsigned DestLo = TRI->getSubReg(Dest.getReg(), ARM::gsub_0); - unsigned DestHi = TRI->getSubReg(Dest.getReg(), ARM::gsub_1); - unsigned DesiredLo = TRI->getSubReg(DesiredReg, ARM::gsub_0); - unsigned DesiredHi = TRI->getSubReg(DesiredReg, ARM::gsub_1); + Register DestLo = TRI->getSubReg(Dest.getReg(), ARM::gsub_0); + Register DestHi = TRI->getSubReg(Dest.getReg(), ARM::gsub_1); + Register DesiredLo = TRI->getSubReg(DesiredReg, ARM::gsub_0); + Register DesiredHi = TRI->getSubReg(DesiredReg, ARM::gsub_1); MachineFunction *MF = MBB.getParent(); auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); @@ -1204,8 +1205,11 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, for (unsigned i = 1, e = MBBI->getNumOperands(); i != e; ++i) NewMI->addOperand(MBBI->getOperand(i)); - // Delete the pseudo instruction TCRETURN. + + // Update call site info and delete the pseudo instruction TCRETURN. + MBB.getParent()->moveCallSiteInfo(&MI, &*NewMI); MBB.erase(MBBI); + MBBI = NewMI; return true; } @@ -1336,7 +1340,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, // for us. Otherwise, expand to nothing. 
if (RI.hasBasePointer(MF)) { int32_t NumBytes = AFI->getFramePtrSpillOffset(); - unsigned FramePtr = RI.getFrameRegister(MF); + Register FramePtr = RI.getFrameRegister(MF); assert(MF.getSubtarget().getFrameLowering()->hasFP(MF) && "base pointer without frame pointer?"); @@ -1412,7 +1416,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, MachineConstantPoolValue *CPV = ARMConstantPoolSymbol::Create(MF->getFunction().getContext(), "__aeabi_read_tp", PCLabelID, 0); - unsigned Reg = MI.getOperand(0).getReg(); + Register Reg = MI.getOperand(0).getReg(); MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Thumb ? ARM::tLDRpci : ARM::LDRi12), Reg) .addConstantPoolIndex(MCP->getConstantPoolIndex(CPV, 4)); @@ -1435,6 +1439,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, MIB.cloneMemRefs(MI); TransferImpOps(MI, MIB, MIB); + MI.getMF()->moveCallSiteInfo(&MI, &*MIB); MI.eraseFromParent(); return true; } @@ -1442,7 +1447,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, case ARM::t2LDRpci_pic: { unsigned NewLdOpc = (Opcode == ARM::tLDRpci_pic) ? ARM::tLDRpci : ARM::t2LDRpci; - unsigned DstReg = MI.getOperand(0).getReg(); + Register DstReg = MI.getOperand(0).getReg(); bool DstIsDead = MI.getOperand(0).isDead(); MachineInstrBuilder MIB1 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewLdOpc), DstReg) @@ -1464,7 +1469,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, case ARM::LDRLIT_ga_pcrel_ldr: case ARM::tLDRLIT_ga_abs: case ARM::tLDRLIT_ga_pcrel: { - unsigned DstReg = MI.getOperand(0).getReg(); + Register DstReg = MI.getOperand(0).getReg(); bool DstIsDead = MI.getOperand(0).isDead(); const MachineOperand &MO1 = MI.getOperand(1); auto Flags = MO1.getTargetFlags(); @@ -1522,7 +1527,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, case ARM::t2MOV_ga_pcrel: { // Expand into movw + movw. Also "add pc" / ldr [pc] in PIC mode. unsigned LabelId = AFI->createPICLabelUId(); - unsigned DstReg = MI.getOperand(0).getReg(); + Register DstReg = MI.getOperand(0).getReg(); bool DstIsDead = MI.getOperand(0).isDead(); const MachineOperand &MO1 = MI.getOperand(1); const GlobalValue *GV = MO1.getGlobal(); @@ -1586,7 +1591,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, // Grab the Q register destination. bool DstIsDead = MI.getOperand(OpIdx).isDead(); - unsigned DstReg = MI.getOperand(OpIdx++).getReg(); + Register DstReg = MI.getOperand(OpIdx++).getReg(); // Copy the source register. MIB.add(MI.getOperand(OpIdx++)); @@ -1596,8 +1601,8 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, MIB.add(MI.getOperand(OpIdx++)); // Add the destination operands (D subregs). - unsigned D0 = TRI->getSubReg(DstReg, ARM::dsub_0); - unsigned D1 = TRI->getSubReg(DstReg, ARM::dsub_1); + Register D0 = TRI->getSubReg(DstReg, ARM::dsub_0); + Register D1 = TRI->getSubReg(DstReg, ARM::dsub_1); MIB.addReg(D0, RegState::Define | getDeadRegState(DstIsDead)) .addReg(D1, RegState::Define | getDeadRegState(DstIsDead)); @@ -1617,7 +1622,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, // Grab the Q register source. bool SrcIsKill = MI.getOperand(OpIdx).isKill(); - unsigned SrcReg = MI.getOperand(OpIdx++).getReg(); + Register SrcReg = MI.getOperand(OpIdx++).getReg(); // Copy the destination register. MachineOperand Dst(MI.getOperand(OpIdx++)); @@ -1628,8 +1633,8 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, MIB.add(MI.getOperand(OpIdx++)); // Add the source operands (D subregs). 
- unsigned D0 = TRI->getSubReg(SrcReg, ARM::dsub_0); - unsigned D1 = TRI->getSubReg(SrcReg, ARM::dsub_1); + Register D0 = TRI->getSubReg(SrcReg, ARM::dsub_0); + Register D1 = TRI->getSubReg(SrcReg, ARM::dsub_1); MIB.addReg(D0, SrcIsKill ? RegState::Kill : 0) .addReg(D1, SrcIsKill ? RegState::Kill : 0); @@ -1915,6 +1920,37 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, case ARM::CMP_SWAP_64: return ExpandCMP_SWAP_64(MBB, MBBI, NextMBBI); + + case ARM::tBL_PUSHLR: + case ARM::BL_PUSHLR: { + const bool Thumb = Opcode == ARM::tBL_PUSHLR; + Register Reg = MI.getOperand(0).getReg(); + assert(Reg == ARM::LR && "expect LR register!"); + MachineInstrBuilder MIB; + if (Thumb) { + // push {lr} + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::tPUSH)) + .add(predOps(ARMCC::AL)) + .addReg(Reg); + + // bl __gnu_mcount_nc + MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::tBL)); + } else { + // stmdb sp!, {lr} + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::STMDB_UPD)) + .addReg(ARM::SP, RegState::Define) + .addReg(ARM::SP) + .add(predOps(ARMCC::AL)) + .addReg(Reg); + + // bl __gnu_mcount_nc + MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::BL)); + } + MIB.cloneMemRefs(MI); + for (unsigned i = 1; i < MI.getNumOperands(); ++i) MIB.add(MI.getOperand(i)); + MI.eraseFromParent(); + return true; + } } } diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp index 6e274d269bf2..1fc5ff6921c6 100644 --- a/lib/Target/ARM/ARMFastISel.cpp +++ b/lib/Target/ARM/ARMFastISel.cpp @@ -191,8 +191,8 @@ class ARMFastISel final : public FastISel { bool isTypeLegal(Type *Ty, MVT &VT); bool isLoadTypeLegal(Type *Ty, MVT &VT); bool ARMEmitCmp(const Value *Src1Value, const Value *Src2Value, - bool isZExt, bool isEquality); - bool ARMEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr, + bool isZExt); + bool ARMEmitLoad(MVT VT, Register &ResultReg, Address &Addr, unsigned Alignment = 0, bool isZExt = true, bool allocReg = true); bool ARMEmitStore(MVT VT, unsigned SrcReg, Address &Addr, @@ -219,15 +219,15 @@ class ARMFastISel final : public FastISel { bool Return, bool isVarArg); bool ProcessCallArgs(SmallVectorImpl<Value*> &Args, - SmallVectorImpl<unsigned> &ArgRegs, + SmallVectorImpl<Register> &ArgRegs, SmallVectorImpl<MVT> &ArgVTs, SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags, - SmallVectorImpl<unsigned> &RegArgs, + SmallVectorImpl<Register> &RegArgs, CallingConv::ID CC, unsigned &NumBytes, bool isVarArg); unsigned getLibcallReg(const Twine &Name); - bool FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs, + bool FinishCall(MVT RetVT, SmallVectorImpl<Register> &UsedRegs, const Instruction *I, CallingConv::ID CC, unsigned &NumBytes, bool isVarArg); bool ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call); @@ -301,7 +301,7 @@ ARMFastISel::AddOptionalDefs(const MachineInstrBuilder &MIB) { unsigned ARMFastISel::fastEmitInst_r(unsigned MachineInstOpcode, const TargetRegisterClass *RC, unsigned Op0, bool Op0IsKill) { - unsigned ResultReg = createResultReg(RC); + Register ResultReg = createResultReg(RC); const MCInstrDesc &II = TII.get(MachineInstOpcode); // Make sure the input operand is sufficiently constrained to be legal @@ -913,7 +913,7 @@ void ARMFastISel::AddLoadStoreOperands(MVT VT, Address &Addr, AddOptionalDefs(MIB); } -bool ARMFastISel::ARMEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr, +bool ARMFastISel::ARMEmitLoad(MVT VT, Register &ResultReg, Address &Addr, unsigned Alignment, bool isZExt, bool allocReg) { unsigned Opc; bool useAM3 = false; @@ 
-1045,7 +1045,7 @@ bool ARMFastISel::SelectLoad(const Instruction *I) { Address Addr; if (!ARMComputeAddress(I->getOperand(0), Addr)) return false; - unsigned ResultReg; + Register ResultReg; if (!ARMEmitLoad(VT, ResultReg, Addr, cast<LoadInst>(I)->getAlignment())) return false; updateValueMap(I, ResultReg); @@ -1259,8 +1259,7 @@ bool ARMFastISel::SelectBranch(const Instruction *I) { if (ARMPred == ARMCC::AL) return false; // Emit the compare. - if (!ARMEmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned(), - CI->isEquality())) + if (!ARMEmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned())) return false; unsigned BrOpc = isThumb2 ? ARM::t2Bcc : ARM::Bcc; @@ -1349,7 +1348,7 @@ bool ARMFastISel::SelectIndirectBr(const Instruction *I) { } bool ARMFastISel::ARMEmitCmp(const Value *Src1Value, const Value *Src2Value, - bool isZExt, bool isEquality) { + bool isZExt) { Type *Ty = Src1Value->getType(); EVT SrcEVT = TLI.getValueType(DL, Ty, true); if (!SrcEVT.isSimple()) return false; @@ -1397,19 +1396,11 @@ bool ARMFastISel::ARMEmitCmp(const Value *Src1Value, const Value *Src2Value, // TODO: Verify compares. case MVT::f32: isICmp = false; - // Equality comparisons shouldn't raise Invalid on uordered inputs. - if (isEquality) - CmpOpc = UseImm ? ARM::VCMPZS : ARM::VCMPS; - else - CmpOpc = UseImm ? ARM::VCMPEZS : ARM::VCMPES; + CmpOpc = UseImm ? ARM::VCMPZS : ARM::VCMPS; break; case MVT::f64: isICmp = false; - // Equality comparisons shouldn't raise Invalid on uordered inputs. - if (isEquality) - CmpOpc = UseImm ? ARM::VCMPZD : ARM::VCMPD; - else - CmpOpc = UseImm ? ARM::VCMPEZD : ARM::VCMPED; + CmpOpc = UseImm ? ARM::VCMPZD : ARM::VCMPD; break; case MVT::i1: case MVT::i8: @@ -1485,8 +1476,7 @@ bool ARMFastISel::SelectCmp(const Instruction *I) { if (ARMPred == ARMCC::AL) return false; // Emit the compare. - if (!ARMEmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned(), - CI->isEquality())) + if (!ARMEmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned())) return false; // Now set a register based on the comparison. Explicitly set the predicates @@ -1893,10 +1883,10 @@ CCAssignFn *ARMFastISel::CCAssignFnForCall(CallingConv::ID CC, } bool ARMFastISel::ProcessCallArgs(SmallVectorImpl<Value*> &Args, - SmallVectorImpl<unsigned> &ArgRegs, + SmallVectorImpl<Register> &ArgRegs, SmallVectorImpl<MVT> &ArgVTs, SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags, - SmallVectorImpl<unsigned> &RegArgs, + SmallVectorImpl<Register> &RegArgs, CallingConv::ID CC, unsigned &NumBytes, bool isVarArg) { @@ -1960,7 +1950,7 @@ bool ARMFastISel::ProcessCallArgs(SmallVectorImpl<Value*> &Args, for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; const Value *ArgVal = Args[VA.getValNo()]; - unsigned Arg = ArgRegs[VA.getValNo()]; + Register Arg = ArgRegs[VA.getValNo()]; MVT ArgVT = ArgVTs[VA.getValNo()]; assert((!ArgVT.isVector() && ArgVT.getSizeInBits() <= 64) && @@ -2039,7 +2029,7 @@ bool ARMFastISel::ProcessCallArgs(SmallVectorImpl<Value*> &Args, return true; } -bool ARMFastISel::FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs, +bool ARMFastISel::FinishCall(MVT RetVT, SmallVectorImpl<Register> &UsedRegs, const Instruction *I, CallingConv::ID CC, unsigned &NumBytes, bool isVarArg) { // Issue CALLSEQ_END @@ -2060,7 +2050,7 @@ bool ARMFastISel::FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs, // double fp reg we want. 
MVT DestVT = RVLocs[0].getValVT(); const TargetRegisterClass* DstRC = TLI.getRegClassFor(DestVT); - unsigned ResultReg = createResultReg(DstRC); + Register ResultReg = createResultReg(DstRC); AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM::VMOVDRR), ResultReg) .addReg(RVLocs[0].getLocReg()) @@ -2081,7 +2071,7 @@ bool ARMFastISel::FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs, const TargetRegisterClass* DstRC = TLI.getRegClassFor(CopyVT); - unsigned ResultReg = createResultReg(DstRC); + Register ResultReg = createResultReg(DstRC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), ResultReg).addReg(RVLocs[0].getLocReg()); @@ -2162,7 +2152,7 @@ bool ARMFastISel::SelectRet(const Instruction *I) { } // Make the copy. - unsigned DstReg = VA.getLocReg(); + Register DstReg = VA.getLocReg(); const TargetRegisterClass* SrcRC = MRI.getRegClass(SrcReg); // Avoid a cross-class copy. This is very unlikely. if (!SrcRC->contains(DstReg)) @@ -2231,7 +2221,7 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) { // Set up the argument vectors. SmallVector<Value*, 8> Args; - SmallVector<unsigned, 8> ArgRegs; + SmallVector<Register, 8> ArgRegs; SmallVector<MVT, 8> ArgVTs; SmallVector<ISD::ArgFlagsTy, 8> ArgFlags; Args.reserve(I->getNumOperands()); @@ -2247,8 +2237,7 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) { if (!isTypeLegal(ArgTy, ArgVT)) return false; ISD::ArgFlagsTy Flags; - unsigned OriginalAlignment = DL.getABITypeAlignment(ArgTy); - Flags.setOrigAlign(OriginalAlignment); + Flags.setOrigAlign(Align(DL.getABITypeAlignment(ArgTy))); Args.push_back(Op); ArgRegs.push_back(Arg); @@ -2257,13 +2246,13 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) { } // Handle the arguments now that we've gotten them. - SmallVector<unsigned, 4> RegArgs; + SmallVector<Register, 4> RegArgs; unsigned NumBytes; if (!ProcessCallArgs(Args, ArgRegs, ArgVTs, ArgFlags, RegArgs, CC, NumBytes, false)) return false; - unsigned CalleeReg = 0; + Register CalleeReg; if (Subtarget->genLongCalls()) { CalleeReg = getLibcallReg(TLI.getLibcallName(Call)); if (CalleeReg == 0) return false; @@ -2282,7 +2271,7 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) { MIB.addExternalSymbol(TLI.getLibcallName(Call)); // Add implicit physical register uses to the call. - for (unsigned R : RegArgs) + for (Register R : RegArgs) MIB.addReg(R, RegState::Implicit); // Add a register mask with the call-preserved registers. @@ -2290,7 +2279,7 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) { MIB.addRegMask(TRI.getCallPreservedMask(*FuncInfo.MF, CC)); // Finish off the call including any return values. - SmallVector<unsigned, 4> UsedRegs; + SmallVector<Register, 4> UsedRegs; if (!FinishCall(RetVT, UsedRegs, I, CC, NumBytes, false)) return false; // Set all unused physreg defs as dead. @@ -2340,7 +2329,7 @@ bool ARMFastISel::SelectCall(const Instruction *I, // Set up the argument vectors. 
SmallVector<Value*, 8> Args; - SmallVector<unsigned, 8> ArgRegs; + SmallVector<Register, 8> ArgRegs; SmallVector<MVT, 8> ArgVTs; SmallVector<ISD::ArgFlagsTy, 8> ArgFlags; unsigned arg_size = CS.arg_size(); @@ -2377,12 +2366,11 @@ bool ARMFastISel::SelectCall(const Instruction *I, ArgVT != MVT::i1) return false; - unsigned Arg = getRegForValue(*i); - if (Arg == 0) + Register Arg = getRegForValue(*i); + if (!Arg.isValid()) return false; - unsigned OriginalAlignment = DL.getABITypeAlignment(ArgTy); - Flags.setOrigAlign(OriginalAlignment); + Flags.setOrigAlign(Align(DL.getABITypeAlignment(ArgTy))); Args.push_back(*i); ArgRegs.push_back(Arg); @@ -2391,7 +2379,7 @@ bool ARMFastISel::SelectCall(const Instruction *I, } // Handle the arguments now that we've gotten them. - SmallVector<unsigned, 4> RegArgs; + SmallVector<Register, 4> RegArgs; unsigned NumBytes; if (!ProcessCallArgs(Args, ArgRegs, ArgVTs, ArgFlags, RegArgs, CC, NumBytes, isVarArg)) @@ -2401,7 +2389,7 @@ bool ARMFastISel::SelectCall(const Instruction *I, const GlobalValue *GV = dyn_cast<GlobalValue>(Callee); if (!GV || Subtarget->genLongCalls()) UseReg = true; - unsigned CalleeReg = 0; + Register CalleeReg; if (UseReg) { if (IntrMemName) CalleeReg = getLibcallReg(IntrMemName); @@ -2427,7 +2415,7 @@ bool ARMFastISel::SelectCall(const Instruction *I, MIB.addExternalSymbol(IntrMemName, 0); // Add implicit physical register uses to the call. - for (unsigned R : RegArgs) + for (Register R : RegArgs) MIB.addReg(R, RegState::Implicit); // Add a register mask with the call-preserved registers. @@ -2435,7 +2423,7 @@ bool ARMFastISel::SelectCall(const Instruction *I, MIB.addRegMask(TRI.getCallPreservedMask(*FuncInfo.MF, CC)); // Finish off the call including any return values. - SmallVector<unsigned, 4> UsedRegs; + SmallVector<Register, 4> UsedRegs; if (!FinishCall(RetVT, UsedRegs, I, CC, NumBytes, isVarArg)) return false; @@ -2476,7 +2464,7 @@ bool ARMFastISel::ARMTryEmitSmallMemCpy(Address Dest, Address Src, } bool RV; - unsigned ResultReg; + Register ResultReg; RV = ARMEmitLoad(VT, ResultReg, Src); assert(RV && "Should be able to handle this load."); RV = ARMEmitStore(VT, ResultReg, Dest); @@ -2506,7 +2494,7 @@ bool ARMFastISel::SelectIntrinsicCall(const IntrinsicInst &I) { const ARMBaseRegisterInfo *RegInfo = static_cast<const ARMBaseRegisterInfo *>(Subtarget->getRegisterInfo()); - unsigned FramePtr = RegInfo->getFrameRegister(*(FuncInfo.MF)); + Register FramePtr = RegInfo->getFrameRegister(*(FuncInfo.MF)); unsigned SrcReg = FramePtr; // Recursively load frame address @@ -2947,7 +2935,7 @@ bool ARMFastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, Address Addr; if (!ARMComputeAddress(LI->getOperand(0), Addr)) return false; - unsigned ResultReg = MI->getOperand(0).getReg(); + Register ResultReg = MI->getOperand(0).getReg(); if (!ARMEmitLoad(VT, ResultReg, Addr, LI->getAlignment(), isZExt, false)) return false; MachineBasicBlock::iterator I(MI); @@ -2974,7 +2962,7 @@ unsigned ARMFastISel::ARMLowerPICELF(const GlobalValue *GV, MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF), MachineMemOperand::MOLoad, 4, 4); - unsigned TempReg = MF->getRegInfo().createVirtualRegister(&ARM::rGPRRegClass); + Register TempReg = MF->getRegInfo().createVirtualRegister(&ARM::rGPRRegClass); unsigned Opc = isThumb2 ? 
ARM::t2LDRpci : ARM::LDRcp; MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), TempReg) diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp index bedb779bcba0..01ae93086dcb 100644 --- a/lib/Target/ARM/ARMFrameLowering.cpp +++ b/lib/Target/ARM/ARMFrameLowering.cpp @@ -76,7 +76,7 @@ skipAlignedDPRCS2Spills(MachineBasicBlock::iterator MI, unsigned NumAlignedDPRCS2Regs); ARMFrameLowering::ARMFrameLowering(const ARMSubtarget &sti) - : TargetFrameLowering(StackGrowsDown, sti.getStackAlignment(), 0, 4), + : TargetFrameLowering(StackGrowsDown, sti.getStackAlignment(), 0, Align(4)), STI(sti) {} bool ARMFrameLowering::keepFramePointer(const MachineFunction &MF) const { @@ -376,7 +376,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, // to determine the end of the prologue. DebugLoc dl; - unsigned FramePtr = RegInfo->getFrameRegister(MF); + Register FramePtr = RegInfo->getFrameRegister(MF); // Determine the sizes of each callee-save spill areas and record which frame // belongs to which callee-save spill areas. @@ -780,7 +780,7 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF, unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize(); int NumBytes = (int)MFI.getStackSize(); - unsigned FramePtr = RegInfo->getFrameRegister(MF); + Register FramePtr = RegInfo->getFrameRegister(MF); // All calls are tail calls in GHC calling conv, and functions have no // prologue/epilogue. @@ -1503,11 +1503,17 @@ static unsigned EstimateFunctionSizeInBytes(const MachineFunction &MF, /// instructions will require a scratch register during their expansion later. // FIXME: Move to TII? static unsigned estimateRSStackSizeLimit(MachineFunction &MF, - const TargetFrameLowering *TFI) { + const TargetFrameLowering *TFI, + bool &HasNonSPFrameIndex) { const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + const ARMBaseInstrInfo &TII = + *static_cast<const ARMBaseInstrInfo *>(MF.getSubtarget().getInstrInfo()); + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); unsigned Limit = (1 << 12) - 1; for (auto &MBB : MF) { for (auto &MI : MBB) { + if (MI.isDebugInstr()) + continue; for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { if (!MI.getOperand(i).isFI()) continue; @@ -1518,13 +1524,29 @@ static unsigned estimateRSStackSizeLimit(MachineFunction &MF, Limit = std::min(Limit, (1U << 8) - 1); break; } + // t2ADDri will not require an extra register, it can reuse the + // destination. + if (MI.getOpcode() == ARM::t2ADDri || MI.getOpcode() == ARM::t2ADDri12) + break; + + const MCInstrDesc &MCID = MI.getDesc(); + const TargetRegisterClass *RegClass = TII.getRegClass(MCID, i, TRI, MF); + if (RegClass && !RegClass->contains(ARM::SP)) + HasNonSPFrameIndex = true; // Otherwise check the addressing mode. switch (MI.getDesc().TSFlags & ARMII::AddrModeMask) { + case ARMII::AddrMode_i12: + case ARMII::AddrMode2: + // Default 12 bit limit. + break; case ARMII::AddrMode3: case ARMII::AddrModeT2_i8: Limit = std::min(Limit, (1U << 8) - 1); break; + case ARMII::AddrMode5FP16: + Limit = std::min(Limit, ((1U << 8) - 1) * 2); + break; case ARMII::AddrMode5: case ARMII::AddrModeT2_i8s4: case ARMII::AddrModeT2_ldrex: @@ -1541,8 +1563,17 @@ static unsigned estimateRSStackSizeLimit(MachineFunction &MF, // Addressing modes 4 & 6 (load/store) instructions can't encode an // immediate offset for stack references. 
return 0; - default: + case ARMII::AddrModeT2_i7: + Limit = std::min(Limit, ((1U << 7) - 1) * 1); + break; + case ARMII::AddrModeT2_i7s2: + Limit = std::min(Limit, ((1U << 7) - 1) * 2); break; + case ARMII::AddrModeT2_i7s4: + Limit = std::min(Limit, ((1U << 7) - 1) * 4); + break; + default: + llvm_unreachable("Unhandled addressing mode in stack size limit calculation"); } break; // At most one FI per instruction } @@ -1623,7 +1654,7 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF, MachineRegisterInfo &MRI = MF.getRegInfo(); const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); (void)TRI; // Silence unused warning in non-assert builds. - unsigned FramePtr = RegInfo->getFrameRegister(MF); + Register FramePtr = RegInfo->getFrameRegister(MF); // Spill R4 if Thumb2 function requires stack realignment - it will be used as // scratch register. Also spill R4 if Thumb2 function has varsized objects, @@ -1784,6 +1815,7 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF, EstimatedStackSize += 16; // For possible paddings. unsigned EstimatedRSStackSizeLimit, EstimatedRSFixedSizeLimit; + bool HasNonSPFrameIndex = false; if (AFI->isThumb1OnlyFunction()) { // For Thumb1, don't bother to iterate over the function. The only // instruction that requires an emergency spill slot is a store to a @@ -1804,7 +1836,8 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF, EstimatedRSStackSizeLimit = (1U << 8) * 4; EstimatedRSFixedSizeLimit = (1U << 5) * 4; } else { - EstimatedRSStackSizeLimit = estimateRSStackSizeLimit(MF, this); + EstimatedRSStackSizeLimit = + estimateRSStackSizeLimit(MF, this, HasNonSPFrameIndex); EstimatedRSFixedSizeLimit = EstimatedRSStackSizeLimit; } // Final estimate of whether sp or bp-relative accesses might require @@ -1830,12 +1863,11 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF, HasFP && (MaxFixedOffset - MaxFPOffset) > (int)EstimatedRSFixedSizeLimit; bool BigFrameOffsets = HasLargeStack || !HasBPOrFixedSP || - HasLargeArgumentList; + HasLargeArgumentList || HasNonSPFrameIndex; LLVM_DEBUG(dbgs() << "EstimatedLimit: " << EstimatedRSStackSizeLimit - << "; EstimatedStack" << EstimatedStackSize - << "; EstimatedFPStack" << MaxFixedOffset - MaxFPOffset - << "; BigFrameOffsets: " << BigFrameOffsets - << "\n"); + << "; EstimatedStack: " << EstimatedStackSize + << "; EstimatedFPStack: " << MaxFixedOffset - MaxFPOffset + << "; BigFrameOffsets: " << BigFrameOffsets << "\n"); if (BigFrameOffsets || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF)) { AFI->setHasStackFrame(true); @@ -2080,9 +2112,8 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF, ExtraCSSpill = true; } } - if (!ExtraCSSpill) { + if (!ExtraCSSpill && RS) { // Reserve a slot closest to SP or frame pointer. - assert(RS && "Register scavenging not provided"); LLVM_DEBUG(dbgs() << "Reserving emergency spill slot\n"); const TargetRegisterClass &RC = ARM::GPRRegClass; unsigned Size = TRI->getSpillSize(RC); @@ -2097,6 +2128,12 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF, AFI->setLRIsSpilledForFarJump(true); } AFI->setLRIsSpilled(SavedRegs.test(ARM::LR)); + + // If we have the "returned" parameter attribute which guarantees that we + // return the value which was passed in r0 unmodified (e.g. C++ 'structors), + // record that fact for IPRA. 
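The AddrModeT2_i7* limits earlier in this hunk come from MVE's scaled 7-bit offsets: the instruction stores offset divided by the access size in a 7-bit field. A small check of what that implies for a given offset; the helper name is made up for illustration and the exact signed range differs slightly between the base and post-increment forms:

```cpp
#include <cstdio>

// A scaled 7-bit addressing mode (the T2_i7s2 / T2_i7s4 limits above) can
// only encode an offset that is a multiple of the scale whose quotient fits
// in roughly +/-127. Made-up helper, not an LLVM API.
bool encodableScaledImm7(int Offset, int Scale) {
  if (Offset % Scale != 0)
    return false;
  int Q = Offset / Scale;
  return Q >= -127 && Q <= 127;
}

int main() {
  std::printf("%d\n", encodableScaledImm7(508, 4));  // 1: 127 * 4
  std::printf("%d\n", encodableScaledImm7(510, 4));  // 0: not a multiple of 4
  std::printf("%d\n", encodableScaledImm7(-254, 2)); // 1: -127 * 2
}
```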
+ if (AFI->getPreservesR0()) + SavedRegs.set(ARM::R0); } MachineBasicBlock::iterator ARMFrameLowering::eliminateCallFramePseudoInstr( diff --git a/lib/Target/ARM/ARMFrameLowering.h b/lib/Target/ARM/ARMFrameLowering.h index 7544ca3c38d6..6d8aee597945 100644 --- a/lib/Target/ARM/ARMFrameLowering.h +++ b/lib/Target/ARM/ARMFrameLowering.h @@ -63,6 +63,11 @@ public: bool enableShrinkWrapping(const MachineFunction &MF) const override { return true; } + bool isProfitableForNoCSROpt(const Function &F) const override { + // The no-CSR optimisation is bad for code size on ARM, because we can save + // many registers with a single PUSH/POP pair. + return false; + } private: void emitPushInst(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp index b349627b67b1..8f6515c423eb 100644 --- a/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -139,6 +139,8 @@ public: bool SelectThumbAddrModeImm5S4(SDValue N, SDValue &Base, SDValue &OffImm); bool SelectThumbAddrModeSP(SDValue N, SDValue &Base, SDValue &OffImm); + template <unsigned Shift> + bool SelectTAddrModeImm7(SDValue N, SDValue &Base, SDValue &OffImm); // Thumb 2 Addressing Modes: bool SelectT2AddrModeImm12(SDValue N, SDValue &Base, SDValue &OffImm); @@ -146,9 +148,12 @@ public: SDValue &OffImm); bool SelectT2AddrModeImm8Offset(SDNode *Op, SDValue N, SDValue &OffImm); - template<unsigned Shift> - bool SelectT2AddrModeImm7(SDValue N, SDValue &Base, - SDValue &OffImm); + template <unsigned Shift> + bool SelectT2AddrModeImm7Offset(SDNode *Op, SDValue N, SDValue &OffImm); + bool SelectT2AddrModeImm7Offset(SDNode *Op, SDValue N, SDValue &OffImm, + unsigned Shift); + template <unsigned Shift> + bool SelectT2AddrModeImm7(SDValue N, SDValue &Base, SDValue &OffImm); bool SelectT2AddrModeSoReg(SDValue N, SDValue &Base, SDValue &OffReg, SDValue &ShImm); bool SelectT2AddrModeExclusive(SDValue N, SDValue &Base, SDValue &OffImm); @@ -179,6 +184,7 @@ private: bool tryARMIndexedLoad(SDNode *N); bool tryT1IndexedLoad(SDNode *N); bool tryT2IndexedLoad(SDNode *N); + bool tryMVEIndexedLoad(SDNode *N); /// SelectVLD - Select NEON load intrinsics. NumVecs should be /// 1, 2, 3 or 4. The opcode arrays specify the instructions used for @@ -246,10 +252,6 @@ private: SDValue GetVLDSTAlign(SDValue Align, const SDLoc &dl, unsigned NumVecs, bool is64BitVector); - /// Returns the number of instructions required to materialize the given - /// constant in a register, or 3 if a literal pool load is needed. - unsigned ConstantMaterializationCost(unsigned Val) const; - /// Checks if N is a multiplication by a constant where we can extract out a /// power of two from the constant so that it can be used in a shift, but only /// if it simplifies the materialization of the constant. 
Returns true if it @@ -450,27 +452,6 @@ bool ARMDAGToDAGISel::isShifterOpProfitable(const SDValue &Shift, (ShAmt == 2 || (Subtarget->isSwift() && ShAmt == 1)); } -unsigned ARMDAGToDAGISel::ConstantMaterializationCost(unsigned Val) const { - if (Subtarget->isThumb()) { - if (Val <= 255) return 1; // MOV - if (Subtarget->hasV6T2Ops() && - (Val <= 0xffff || // MOV - ARM_AM::getT2SOImmVal(Val) != -1 || // MOVW - ARM_AM::getT2SOImmVal(~Val) != -1)) // MVN - return 1; - if (Val <= 510) return 2; // MOV + ADDi8 - if (~Val <= 255) return 2; // MOV + MVN - if (ARM_AM::isThumbImmShiftedVal(Val)) return 2; // MOV + LSL - } else { - if (ARM_AM::getSOImmVal(Val) != -1) return 1; // MOV - if (ARM_AM::getSOImmVal(~Val) != -1) return 1; // MVN - if (Subtarget->hasV6T2Ops() && Val <= 0xffff) return 1; // MOVW - if (ARM_AM::isSOImmTwoPartVal(Val)) return 2; // two instrs - } - if (Subtarget->useMovt()) return 2; // MOVW + MOVT - return 3; // Literal pool load -} - bool ARMDAGToDAGISel::canExtractShiftFromMul(const SDValue &N, unsigned MaxShift, unsigned &PowerOfTwo, @@ -500,8 +481,8 @@ bool ARMDAGToDAGISel::canExtractShiftFromMul(const SDValue &N, // Only optimise if the new cost is better unsigned NewMulConstVal = MulConstVal / (1 << PowerOfTwo); NewMulConst = CurDAG->getConstant(NewMulConstVal, SDLoc(N), MVT::i32); - unsigned OldCost = ConstantMaterializationCost(MulConstVal); - unsigned NewCost = ConstantMaterializationCost(NewMulConstVal); + unsigned OldCost = ConstantMaterializationCost(MulConstVal, Subtarget); + unsigned NewCost = ConstantMaterializationCost(NewMulConstVal, Subtarget); return NewCost < OldCost; } @@ -1172,6 +1153,28 @@ bool ARMDAGToDAGISel::SelectThumbAddrModeSP(SDValue N, return false; } +template <unsigned Shift> +bool ARMDAGToDAGISel::SelectTAddrModeImm7(SDValue N, SDValue &Base, + SDValue &OffImm) { + if (N.getOpcode() == ISD::SUB || CurDAG->isBaseWithConstantOffset(N)) { + int RHSC; + if (isScaledConstantInRange(N.getOperand(1), 1 << Shift, -0x7f, 0x80, + RHSC)) { + Base = N.getOperand(0); + if (N.getOpcode() == ISD::SUB) + RHSC = -RHSC; + OffImm = + CurDAG->getTargetConstant(RHSC * (1 << Shift), SDLoc(N), MVT::i32); + return true; + } + } + + // Base only. 
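The ConstantMaterializationCost body removed above now lives in a shared helper that takes the subtarget (the later calls pass `Subtarget` explicitly). A simplified standalone sketch of the heuristic for a Thumb2-class core; the real helper also consults the modified-immediate encoders (getT2SOImmVal and friends) and returns 3 when a literal-pool load is needed on cores without MOVW/MOVT:

```cpp
#include <cstdint>
#include <cstdio>

// Simplified cost model: how many instructions to put a 32-bit constant in
// a register on a Thumb2-class core? Only the easy cases are modelled.
unsigned constantCostThumb2(uint32_t Val) {
  if (Val <= 0xff)   return 1; // MOVS rd, #imm8
  if (Val <= 0xffff) return 1; // MOVW rd, #imm16
  if (~Val <= 0xffu) return 1; // MVN  rd, #imm8 (materialise the complement)
  return 2;                    // MOVW + MOVT pair
}

int main() {
  std::printf("%u %u %u\n",
              constantCostThumb2(200),         // 1
              constantCostThumb2(0x12345678),  // 2
              constantCostThumb2(0xffffff00)); // 1 (MVN of 0xff)
}
```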
+ Base = N; + OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32); + return true; +} + //===----------------------------------------------------------------------===// // Thumb 2 Addressing Modes @@ -1278,35 +1281,59 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm8Offset(SDNode *Op, SDValue N, return false; } -template<unsigned Shift> -bool ARMDAGToDAGISel::SelectT2AddrModeImm7(SDValue N, - SDValue &Base, SDValue &OffImm) { - if (N.getOpcode() == ISD::SUB || - CurDAG->isBaseWithConstantOffset(N)) { - if (auto RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { - int RHSC = (int)RHS->getZExtValue(); - if (N.getOpcode() == ISD::SUB) - RHSC = -RHSC; - - if (isShiftedInt<7, Shift>(RHSC)) { - Base = N.getOperand(0); - if (Base.getOpcode() == ISD::FrameIndex) { - int FI = cast<FrameIndexSDNode>(Base)->getIndex(); - Base = CurDAG->getTargetFrameIndex( +template <unsigned Shift> +bool ARMDAGToDAGISel::SelectT2AddrModeImm7(SDValue N, SDValue &Base, + SDValue &OffImm) { + if (N.getOpcode() == ISD::SUB || CurDAG->isBaseWithConstantOffset(N)) { + int RHSC; + if (isScaledConstantInRange(N.getOperand(1), 1 << Shift, -0x7f, 0x80, + RHSC)) { + Base = N.getOperand(0); + if (Base.getOpcode() == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(Base)->getIndex(); + Base = CurDAG->getTargetFrameIndex( FI, TLI->getPointerTy(CurDAG->getDataLayout())); - } - OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i32); - return true; } + + if (N.getOpcode() == ISD::SUB) + RHSC = -RHSC; + OffImm = + CurDAG->getTargetConstant(RHSC * (1 << Shift), SDLoc(N), MVT::i32); + return true; } } // Base only. Base = N; - OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32); + OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32); return true; } +template <unsigned Shift> +bool ARMDAGToDAGISel::SelectT2AddrModeImm7Offset(SDNode *Op, SDValue N, + SDValue &OffImm) { + return SelectT2AddrModeImm7Offset(Op, N, OffImm, Shift); +} + +bool ARMDAGToDAGISel::SelectT2AddrModeImm7Offset(SDNode *Op, SDValue N, + SDValue &OffImm, + unsigned Shift) { + unsigned Opcode = Op->getOpcode(); + ISD::MemIndexedMode AM = (Opcode == ISD::LOAD) + ? cast<LoadSDNode>(Op)->getAddressingMode() + : cast<StoreSDNode>(Op)->getAddressingMode(); + int RHSC; + if (isScaledConstantInRange(N, 1 << Shift, 0, 0x80, RHSC)) { // 7 bits. + OffImm = + ((AM == ISD::PRE_INC) || (AM == ISD::POST_INC)) + ? CurDAG->getTargetConstant(RHSC * (1 << Shift), SDLoc(N), MVT::i32) + : CurDAG->getTargetConstant(-RHSC * (1 << Shift), SDLoc(N), + MVT::i32); + return true; + } + return false; +} + bool ARMDAGToDAGISel::SelectT2AddrModeSoReg(SDValue N, SDValue &Base, SDValue &OffReg, SDValue &ShImm) { @@ -1565,6 +1592,68 @@ bool ARMDAGToDAGISel::tryT2IndexedLoad(SDNode *N) { return false; } +bool ARMDAGToDAGISel::tryMVEIndexedLoad(SDNode *N) { + LoadSDNode *LD = cast<LoadSDNode>(N); + ISD::MemIndexedMode AM = LD->getAddressingMode(); + if (AM == ISD::UNINDEXED) + return false; + EVT LoadedVT = LD->getMemoryVT(); + if (!LoadedVT.isVector()) + return false; + bool isSExtLd = LD->getExtensionType() == ISD::SEXTLOAD; + SDValue Offset; + bool isPre = (AM == ISD::PRE_INC) || (AM == ISD::PRE_DEC); + unsigned Opcode = 0; + unsigned Align = LD->getAlignment(); + bool IsLE = Subtarget->isLittle(); + + if (Align >= 2 && LoadedVT == MVT::v4i16 && + SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 1)) { + if (isSExtLd) + Opcode = isPre ? ARM::MVE_VLDRHS32_pre : ARM::MVE_VLDRHS32_post; + else + Opcode = isPre ? 
ARM::MVE_VLDRHU32_pre : ARM::MVE_VLDRHU32_post; + } else if (LoadedVT == MVT::v8i8 && + SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 0)) { + if (isSExtLd) + Opcode = isPre ? ARM::MVE_VLDRBS16_pre : ARM::MVE_VLDRBS16_post; + else + Opcode = isPre ? ARM::MVE_VLDRBU16_pre : ARM::MVE_VLDRBU16_post; + } else if (LoadedVT == MVT::v4i8 && + SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 0)) { + if (isSExtLd) + Opcode = isPre ? ARM::MVE_VLDRBS32_pre : ARM::MVE_VLDRBS32_post; + else + Opcode = isPre ? ARM::MVE_VLDRBU32_pre : ARM::MVE_VLDRBU32_post; + } else if (Align >= 4 && + (IsLE || LoadedVT == MVT::v4i32 || LoadedVT == MVT::v4f32) && + SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 2)) + Opcode = isPre ? ARM::MVE_VLDRWU32_pre : ARM::MVE_VLDRWU32_post; + else if (Align >= 2 && + (IsLE || LoadedVT == MVT::v8i16 || LoadedVT == MVT::v8f16) && + SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 1)) + Opcode = isPre ? ARM::MVE_VLDRHU16_pre : ARM::MVE_VLDRHU16_post; + else if ((IsLE || LoadedVT == MVT::v16i8) && + SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 0)) + Opcode = isPre ? ARM::MVE_VLDRBU8_pre : ARM::MVE_VLDRBU8_post; + else + return false; + + SDValue Chain = LD->getChain(); + SDValue Base = LD->getBasePtr(); + SDValue Ops[] = {Base, Offset, + CurDAG->getTargetConstant(ARMVCC::None, SDLoc(N), MVT::i32), + CurDAG->getRegister(0, MVT::i32), Chain}; + SDNode *New = CurDAG->getMachineNode(Opcode, SDLoc(N), LD->getValueType(0), + MVT::i32, MVT::Other, Ops); + transferMemOperands(N, New); + ReplaceUses(SDValue(N, 0), SDValue(New, 1)); + ReplaceUses(SDValue(N, 1), SDValue(New, 0)); + ReplaceUses(SDValue(N, 2), SDValue(New, 2)); + CurDAG->RemoveDeadNode(N); + return true; +} + /// Form a GPRPair pseudo register from a pair of GPR regs. SDNode *ARMDAGToDAGISel::createGPRPairNode(EVT VT, SDValue V0, SDValue V1) { SDLoc dl(V0.getNode()); @@ -2701,7 +2790,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) { case ISD::Constant: { unsigned Val = cast<ConstantSDNode>(N)->getZExtValue(); // If we can't materialize the constant we need to use a literal pool - if (ConstantMaterializationCost(Val) > 2) { + if (ConstantMaterializationCost(Val, Subtarget) > 2) { SDValue CPIdx = CurDAG->getTargetConstantPool( ConstantInt::get(Type::getInt32Ty(*CurDAG->getContext()), Val), TLI->getPointerTy(CurDAG->getDataLayout())); @@ -2842,8 +2931,8 @@ void ARMDAGToDAGISel::Select(SDNode *N) { bool PreferImmediateEncoding = Subtarget->hasThumb2() && (is_t2_so_imm(Imm) || is_t2_so_imm_not(Imm)); if (!PreferImmediateEncoding && - ConstantMaterializationCost(Imm) > - ConstantMaterializationCost(~Imm)) { + ConstantMaterializationCost(Imm, Subtarget) > + ConstantMaterializationCost(~Imm, Subtarget)) { // The current immediate costs more to materialize than a negated // immediate, so negate the immediate and use a BIC. SDValue NewImm = @@ -2987,6 +3076,8 @@ void ARMDAGToDAGISel::Select(SDNode *N) { return; } case ISD::LOAD: { + if (Subtarget->hasMVEIntegerOps() && tryMVEIndexedLoad(N)) + return; if (Subtarget->isThumb() && Subtarget->hasThumb2()) { if (tryT2IndexedLoad(N)) return; @@ -2998,13 +3089,26 @@ void ARMDAGToDAGISel::Select(SDNode *N) { // Other cases are autogenerated. break; } - case ARMISD::WLS: { - SDValue Ops[] = { N->getOperand(1), // Loop count - N->getOperand(2), // Exit target + case ARMISD::WLS: + case ARMISD::LE: { + SDValue Ops[] = { N->getOperand(1), + N->getOperand(2), + N->getOperand(0) }; + unsigned Opc = N->getOpcode() == ARMISD::WLS ? 
+ ARM::t2WhileLoopStart : ARM::t2LoopEnd; + SDNode *New = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops); + ReplaceUses(N, New); + CurDAG->RemoveDeadNode(N); + return; + } + case ARMISD::LOOP_DEC: { + SDValue Ops[] = { N->getOperand(1), + N->getOperand(2), N->getOperand(0) }; - SDNode *LoopStart = - CurDAG->getMachineNode(ARM::t2WhileLoopStart, dl, MVT::Other, Ops); - ReplaceUses(N, LoopStart); + SDNode *Dec = + CurDAG->getMachineNode(ARM::t2LoopDec, dl, + CurDAG->getVTList(MVT::i32, MVT::Other), Ops); + ReplaceUses(N, Dec); CurDAG->RemoveDeadNode(N); return; } @@ -4365,7 +4469,7 @@ bool ARMDAGToDAGISel::tryInlineAsm(SDNode *N){ // Replace the two GPRs with 1 GPRPair and copy values from GPRPair to // the original GPRs. - unsigned GPVR = MRI.createVirtualRegister(&ARM::GPRPairRegClass); + Register GPVR = MRI.createVirtualRegister(&ARM::GPRPairRegClass); PairedReg = CurDAG->getRegister(GPVR, MVT::Untyped); SDValue Chain = SDValue(N,0); @@ -4401,7 +4505,7 @@ bool ARMDAGToDAGISel::tryInlineAsm(SDNode *N){ // Copy REG_SEQ into a GPRPair-typed VR and replace the original two // i32 VRs of inline asm with it. - unsigned GPVR = MRI.createVirtualRegister(&ARM::GPRPairRegClass); + Register GPVR = MRI.createVirtualRegister(&ARM::GPRPairRegClass); PairedReg = CurDAG->getRegister(GPVR, MVT::Untyped); Chain = CurDAG->getCopyToReg(T1, dl, GPVR, Pair, T1.getValue(1)); diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 18bb9bf3eccc..db26feb57010 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -245,7 +245,7 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { const MVT IntTypes[] = { MVT::v16i8, MVT::v8i16, MVT::v4i32 }; for (auto VT : IntTypes) { - addRegisterClass(VT, &ARM::QPRRegClass); + addRegisterClass(VT, &ARM::MQPRRegClass); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); @@ -258,12 +258,31 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { setOperationAction(ISD::UMIN, VT, Legal); setOperationAction(ISD::UMAX, VT, Legal); setOperationAction(ISD::ABS, VT, Legal); + setOperationAction(ISD::SETCC, VT, Custom); + setOperationAction(ISD::MLOAD, VT, Custom); + setOperationAction(ISD::MSTORE, VT, Legal); + setOperationAction(ISD::CTLZ, VT, Legal); + setOperationAction(ISD::CTTZ, VT, Custom); + setOperationAction(ISD::BITREVERSE, VT, Legal); + setOperationAction(ISD::BSWAP, VT, Legal); + setOperationAction(ISD::SADDSAT, VT, Legal); + setOperationAction(ISD::UADDSAT, VT, Legal); + setOperationAction(ISD::SSUBSAT, VT, Legal); + setOperationAction(ISD::USUBSAT, VT, Legal); // No native support for these. 
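The WLS / LOOP_DEC / LE selection above maps the generic low-overhead-loop nodes onto the t2WhileLoopStart, t2LoopDec and t2LoopEnd pseudos. A conceptual sketch of the loop shape those pseudos model, written as plain C++ with comments naming the step each line corresponds to; this is an illustration of the control flow, not how the backend emits it:

```cpp
#include <cstdio>

void process(int *dst, const int *src, unsigned n) {
  unsigned count = n;     // t2WhileLoopStart: test the trip count and branch
  if (count == 0)         // straight past the loop when it is zero, otherwise
    return;               // enter the loop with the count live in a register.
  do {
    *dst++ = *src++ + 1;  // loop body
    count = count - 1;    // t2LoopDec: decrement the counter
  } while (count != 0);   // t2LoopEnd: branch back while non-zero
}

int main() {
  int in[4] = {1, 2, 3, 4}, out[4] = {0};
  process(out, in, 4);
  std::printf("%d %d\n", out[0], out[3]); // 2 5
}
```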
setOperationAction(ISD::UDIV, VT, Expand); setOperationAction(ISD::SDIV, VT, Expand); setOperationAction(ISD::UREM, VT, Expand); setOperationAction(ISD::SREM, VT, Expand); + setOperationAction(ISD::CTPOP, VT, Expand); + + // Vector reductions + setOperationAction(ISD::VECREDUCE_ADD, VT, Legal); + setOperationAction(ISD::VECREDUCE_SMAX, VT, Legal); + setOperationAction(ISD::VECREDUCE_UMAX, VT, Legal); + setOperationAction(ISD::VECREDUCE_SMIN, VT, Legal); + setOperationAction(ISD::VECREDUCE_UMIN, VT, Legal); if (!HasMVEFP) { setOperationAction(ISD::SINT_TO_FP, VT, Expand); @@ -271,11 +290,18 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { setOperationAction(ISD::FP_TO_SINT, VT, Expand); setOperationAction(ISD::FP_TO_UINT, VT, Expand); } + + // Pre and Post inc are supported on loads and stores + for (unsigned im = (unsigned)ISD::PRE_INC; + im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { + setIndexedLoadAction(im, VT, Legal); + setIndexedStoreAction(im, VT, Legal); + } } const MVT FloatTypes[] = { MVT::v8f16, MVT::v4f32 }; for (auto VT : FloatTypes) { - addRegisterClass(VT, &ARM::QPRRegClass); + addRegisterClass(VT, &ARM::MQPRRegClass); if (!HasMVEFP) setAllExpand(VT); @@ -287,6 +313,16 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT.getVectorElementType(), Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Legal); + setOperationAction(ISD::SETCC, VT, Custom); + setOperationAction(ISD::MLOAD, VT, Custom); + setOperationAction(ISD::MSTORE, VT, Legal); + + // Pre and Post inc are supported on loads and stores + for (unsigned im = (unsigned)ISD::PRE_INC; + im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { + setIndexedLoadAction(im, VT, Legal); + setIndexedStoreAction(im, VT, Legal); + } if (HasMVEFP) { setOperationAction(ISD::FMINNUM, VT, Legal); @@ -314,7 +350,7 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { // vector types is inhibited at integer-only level. 
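The setIndexedLoadAction/setIndexedStoreAction calls above declare pre- and post-incremented MVE memory accesses legal, which lets a strided walk fold the pointer update into the load or store. A plain C++ illustration of what a post-indexed access combines; no backend code is involved:

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

struct Vec4 { int32_t lane[4]; };

// Separate load and pointer bump, written out as two conceptual operations;
// a post-indexed vector load performs both in one instruction.
Vec4 loadThenAdvance(const int32_t *&p) {
  Vec4 v;
  std::memcpy(v.lane, p, sizeof v.lane); // the load
  p += 4;                                // the address update (post-increment)
  return v;
}

int main() {
  int32_t buf[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  const int32_t *p = buf;
  Vec4 a = loadThenAdvance(p);
  Vec4 b = loadThenAdvance(p);
  std::printf("%d %d\n", a.lane[0], b.lane[0]); // 1 5
}
```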
const MVT LongTypes[] = { MVT::v2i64, MVT::v2f64 }; for (auto VT : LongTypes) { - addRegisterClass(VT, &ARM::QPRRegClass); + addRegisterClass(VT, &ARM::MQPRRegClass); setAllExpand(VT); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); @@ -334,6 +370,33 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal); setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal); setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal); + + // Pre and Post inc on these are legal, given the correct extends + for (unsigned im = (unsigned)ISD::PRE_INC; + im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { + setIndexedLoadAction(im, MVT::v8i8, Legal); + setIndexedStoreAction(im, MVT::v8i8, Legal); + setIndexedLoadAction(im, MVT::v4i8, Legal); + setIndexedStoreAction(im, MVT::v4i8, Legal); + setIndexedLoadAction(im, MVT::v4i16, Legal); + setIndexedStoreAction(im, MVT::v4i16, Legal); + } + + // Predicate types + const MVT pTypes[] = {MVT::v16i1, MVT::v8i1, MVT::v4i1}; + for (auto VT : pTypes) { + addRegisterClass(VT, &ARM::VCCRRegClass); + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); + setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::SETCC, VT, Custom); + setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand); + setOperationAction(ISD::LOAD, VT, Custom); + setOperationAction(ISD::STORE, VT, Custom); + } } ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, @@ -645,8 +708,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FMAXNUM, MVT::f16, Legal); } - for (MVT VT : MVT::vector_valuetypes()) { - for (MVT InnerVT : MVT::vector_valuetypes()) { + for (MVT VT : MVT::fixedlen_vector_valuetypes()) { + for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) { setTruncStoreAction(VT, InnerVT, Expand); addAllExtLoads(VT, InnerVT, Expand); } @@ -669,8 +732,10 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, addMVEVectorTypes(Subtarget->hasMVEFloatOps()); // Combine low-overhead loop intrinsics so that we can lower i1 types. - if (Subtarget->hasLOB()) + if (Subtarget->hasLOB()) { setTargetDAGCombine(ISD::BRCOND); + setTargetDAGCombine(ISD::BR_CC); + } if (Subtarget->hasNEON()) { addDRTypeForNEON(MVT::v2f32); @@ -837,10 +902,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::SHL); setTargetDAGCombine(ISD::SRL); setTargetDAGCombine(ISD::SRA); - setTargetDAGCombine(ISD::SIGN_EXTEND); - setTargetDAGCombine(ISD::ZERO_EXTEND); - setTargetDAGCombine(ISD::ANY_EXTEND); - setTargetDAGCombine(ISD::STORE); setTargetDAGCombine(ISD::FP_TO_SINT); setTargetDAGCombine(ISD::FP_TO_UINT); setTargetDAGCombine(ISD::FDIV); @@ -849,7 +910,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, // It is legal to extload from v4i8 to v4i16 or v4i32. 
for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16, MVT::v2i32}) { - for (MVT VT : MVT::integer_vector_valuetypes()) { + for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) { setLoadExtAction(ISD::EXTLOAD, VT, Ty, Legal); setLoadExtAction(ISD::ZEXTLOAD, VT, Ty, Legal); setLoadExtAction(ISD::SEXTLOAD, VT, Ty, Legal); @@ -861,6 +922,10 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::BUILD_VECTOR); setTargetDAGCombine(ISD::VECTOR_SHUFFLE); setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); + setTargetDAGCombine(ISD::STORE); + setTargetDAGCombine(ISD::SIGN_EXTEND); + setTargetDAGCombine(ISD::ZERO_EXTEND); + setTargetDAGCombine(ISD::ANY_EXTEND); } if (!Subtarget->hasFP64()) { @@ -901,9 +966,10 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FP_ROUND, MVT::f32, Custom); } - if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()){ + if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) { setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom); - setOperationAction(ISD::FP_ROUND, MVT::f16, Custom); + if (Subtarget->hasFullFP16()) + setOperationAction(ISD::FP_ROUND, MVT::f16, Custom); } if (!Subtarget->hasFP16()) @@ -955,6 +1021,16 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::ADDCARRY, MVT::i32, Custom); setOperationAction(ISD::SUBCARRY, MVT::i32, Custom); + if (Subtarget->hasDSP()) { + setOperationAction(ISD::SADDSAT, MVT::i8, Custom); + setOperationAction(ISD::SSUBSAT, MVT::i8, Custom); + setOperationAction(ISD::SADDSAT, MVT::i16, Custom); + setOperationAction(ISD::SSUBSAT, MVT::i16, Custom); + } + if (Subtarget->hasBaseDSP()) { + setOperationAction(ISD::SADDSAT, MVT::i32, Legal); + setOperationAction(ISD::SSUBSAT, MVT::i32, Legal); + } // i64 operation support. setOperationAction(ISD::MUL, MVT::i64, Expand); @@ -972,6 +1048,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom); setOperationAction(ISD::SRL, MVT::i64, Custom); setOperationAction(ISD::SRA, MVT::i64, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom); // MVE lowers 64 bit shifts to lsll and lsrl @@ -991,7 +1068,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, // ARM does not have ROTL. setOperationAction(ISD::ROTL, MVT::i32, Expand); - for (MVT VT : MVT::vector_valuetypes()) { + for (MVT VT : MVT::fixedlen_vector_valuetypes()) { setOperationAction(ISD::ROTL, VT, Expand); setOperationAction(ISD::ROTR, VT, Expand); } @@ -1365,14 +1442,14 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, // On ARM arguments smaller than 4 bytes are extended, so all arguments // are at least 4 bytes aligned. - setMinStackArgumentAlignment(4); + setMinStackArgumentAlignment(Align(4)); // Prefer likely predicted branches to selects on out-of-order cores. PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder(); - setPrefLoopAlignment(Subtarget->getPrefLoopAlignment()); + setPrefLoopAlignment(Align(1ULL << Subtarget->getPrefLoopLogAlignment())); - setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2); + setMinFunctionAlignment(Subtarget->isThumb() ? 
Align(2) : Align(4)); if (Subtarget->isThumb() || Subtarget->isThumb2()) setTargetDAGCombine(ISD::ABS); @@ -1472,6 +1549,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::ADDE: return "ARMISD::ADDE"; case ARMISD::SUBC: return "ARMISD::SUBC"; case ARMISD::SUBE: return "ARMISD::SUBE"; + case ARMISD::LSLS: return "ARMISD::LSLS"; case ARMISD::VMOVRRD: return "ARMISD::VMOVRRD"; case ARMISD::VMOVDRR: return "ARMISD::VMOVDRR"; @@ -1496,16 +1574,9 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::WIN__CHKSTK: return "ARMISD::WIN__CHKSTK"; case ARMISD::WIN__DBZCHK: return "ARMISD::WIN__DBZCHK"; - case ARMISD::VCEQ: return "ARMISD::VCEQ"; - case ARMISD::VCEQZ: return "ARMISD::VCEQZ"; - case ARMISD::VCGE: return "ARMISD::VCGE"; - case ARMISD::VCGEZ: return "ARMISD::VCGEZ"; - case ARMISD::VCLEZ: return "ARMISD::VCLEZ"; - case ARMISD::VCGEU: return "ARMISD::VCGEU"; - case ARMISD::VCGT: return "ARMISD::VCGT"; - case ARMISD::VCGTZ: return "ARMISD::VCGTZ"; - case ARMISD::VCLTZ: return "ARMISD::VCLTZ"; - case ARMISD::VCGTU: return "ARMISD::VCGTU"; + case ARMISD::PREDICATE_CAST: return "ARMISD::PREDICATE_CAST"; + case ARMISD::VCMP: return "ARMISD::VCMP"; + case ARMISD::VCMPZ: return "ARMISD::VCMPZ"; case ARMISD::VTST: return "ARMISD::VTST"; case ARMISD::VSHLs: return "ARMISD::VSHLs"; @@ -1543,6 +1614,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::VTRN: return "ARMISD::VTRN"; case ARMISD::VTBL1: return "ARMISD::VTBL1"; case ARMISD::VTBL2: return "ARMISD::VTBL2"; + case ARMISD::VMOVN: return "ARMISD::VMOVN"; case ARMISD::VMULLs: return "ARMISD::VMULLs"; case ARMISD::VMULLu: return "ARMISD::VMULLu"; case ARMISD::UMAAL: return "ARMISD::UMAAL"; @@ -1560,6 +1632,10 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::SMLSLDX: return "ARMISD::SMLSLDX"; case ARMISD::SMMLAR: return "ARMISD::SMMLAR"; case ARMISD::SMMLSR: return "ARMISD::SMMLSR"; + case ARMISD::QADD16b: return "ARMISD::QADD16b"; + case ARMISD::QSUB16b: return "ARMISD::QSUB16b"; + case ARMISD::QADD8b: return "ARMISD::QADD8b"; + case ARMISD::QSUB8b: return "ARMISD::QSUB8b"; case ARMISD::BUILD_VECTOR: return "ARMISD::BUILD_VECTOR"; case ARMISD::BFI: return "ARMISD::BFI"; case ARMISD::VORRIMM: return "ARMISD::VORRIMM"; @@ -1589,6 +1665,11 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::VST3LN_UPD: return "ARMISD::VST3LN_UPD"; case ARMISD::VST4LN_UPD: return "ARMISD::VST4LN_UPD"; case ARMISD::WLS: return "ARMISD::WLS"; + case ARMISD::LE: return "ARMISD::LE"; + case ARMISD::LOOP_DEC: return "ARMISD::LOOP_DEC"; + case ARMISD::CSINV: return "ARMISD::CSINV"; + case ARMISD::CSNEG: return "ARMISD::CSNEG"; + case ARMISD::CSINC: return "ARMISD::CSINC"; } return nullptr; } @@ -1597,6 +1678,11 @@ EVT ARMTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &, EVT VT) const { if (!VT.isVector()) return getPointerTy(DL); + + // MVE has a predicate register. + if (Subtarget->hasMVEIntegerOps() && + (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8)) + return MVT::getVectorVT(MVT::i1, VT.getVectorElementCount()); return VT.changeVectorElementTypeToInteger(); } @@ -1726,34 +1812,22 @@ static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) { /// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC. 
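The getSetCCResultType change above makes MVE vector compares produce i1-per-lane predicate types (v4i1, v8i1, v16i1) instead of the NEON-style lane-sized masks. A toy model of that decision, with types represented as strings purely for illustration:

```cpp
#include <cstdio>
#include <string>

// With MVE, a 128-bit integer vector compare yields a predicate of i1 lanes;
// otherwise the result keeps lane-sized integer masks (all-ones/all-zeros).
std::string setccResultType(unsigned NumElts, unsigned EltBits, bool HasMVE) {
  if (HasMVE && NumElts * EltBits == 128)         // v16i8 / v8i16 / v4i32
    return "v" + std::to_string(NumElts) + "i1";  // predicate register
  return "v" + std::to_string(NumElts) + "i" + std::to_string(EltBits);
}

int main() {
  std::printf("%s\n", setccResultType(4, 32, true).c_str());  // v4i1
  std::printf("%s\n", setccResultType(4, 32, false).c_str()); // v4i32
}
```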
static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode, - ARMCC::CondCodes &CondCode2, bool &InvalidOnQNaN) { + ARMCC::CondCodes &CondCode2) { CondCode2 = ARMCC::AL; - InvalidOnQNaN = true; switch (CC) { default: llvm_unreachable("Unknown FP condition!"); case ISD::SETEQ: - case ISD::SETOEQ: - CondCode = ARMCC::EQ; - InvalidOnQNaN = false; - break; + case ISD::SETOEQ: CondCode = ARMCC::EQ; break; case ISD::SETGT: case ISD::SETOGT: CondCode = ARMCC::GT; break; case ISD::SETGE: case ISD::SETOGE: CondCode = ARMCC::GE; break; case ISD::SETOLT: CondCode = ARMCC::MI; break; case ISD::SETOLE: CondCode = ARMCC::LS; break; - case ISD::SETONE: - CondCode = ARMCC::MI; - CondCode2 = ARMCC::GT; - InvalidOnQNaN = false; - break; + case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break; case ISD::SETO: CondCode = ARMCC::VC; break; case ISD::SETUO: CondCode = ARMCC::VS; break; - case ISD::SETUEQ: - CondCode = ARMCC::EQ; - CondCode2 = ARMCC::VS; - InvalidOnQNaN = false; - break; + case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break; case ISD::SETUGT: CondCode = ARMCC::HI; break; case ISD::SETUGE: CondCode = ARMCC::PL; break; case ISD::SETLT: @@ -1761,10 +1835,7 @@ static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode, case ISD::SETLE: case ISD::SETULE: CondCode = ARMCC::LE; break; case ISD::SETNE: - case ISD::SETUNE: - CondCode = ARMCC::NE; - InvalidOnQNaN = false; - break; + case ISD::SETUNE: CondCode = ARMCC::NE; break; } } @@ -1988,6 +2059,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, bool isVarArg = CLI.IsVarArg; MachineFunction &MF = DAG.getMachineFunction(); + MachineFunction::CallSiteInfo CSInfo; bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); bool isThisReturn = false; auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls"); @@ -2112,6 +2184,9 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, "unexpected use of 'returned'"); isThisReturn = true; } + const TargetOptions &Options = DAG.getTarget().Options; + if (Options.EnableDebugEntryValues) + CSInfo.emplace_back(VA.getLocReg(), i); RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); } else if (isByVal) { assert(VA.isMemLoc()); @@ -2347,12 +2422,15 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); if (isTailCall) { MF.getFrameInfo().setHasTailCall(); - return DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, Ops); + SDValue Ret = DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, Ops); + DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo)); + return Ret; } // Returns a chain and a flag for retval copy to use. 
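FPCCToARMCC, rewritten just below, maps IEEE predicates to ARM condition codes, and a few predicates need a second condition (and therefore a second compare). A few rows of that table as a standalone sketch, using string names instead of the real enums:

```cpp
#include <cstdio>
#include <string>
#include <utility>

// "ordered and not equal" has no single ARM condition, so it becomes
// MI-or-GT; "unordered or equal" becomes EQ-or-VS; the rest map directly.
std::pair<const char *, const char *> fpccToARM(const std::string &CC) {
  if (CC == "oeq") return {"EQ", "AL"}; // ordered equal
  if (CC == "ogt") return {"GT", "AL"}; // ordered greater than
  if (CC == "olt") return {"MI", "AL"}; // ordered less than
  if (CC == "ole") return {"LS", "AL"}; // ordered less or equal
  if (CC == "one") return {"MI", "GT"}; // less OR greater: two conditions
  if (CC == "ueq") return {"EQ", "VS"}; // equal OR unordered: two conditions
  if (CC == "o")   return {"VC", "AL"}; // ordered   (overflow clear)
  if (CC == "uo")  return {"VS", "AL"}; // unordered (overflow set)
  return {"AL", "AL"};                  // always; second slot unused
}

int main() {
  auto R = fpccToARM("one");
  std::printf("%s %s\n", R.first, R.second); // MI GT
}
```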
Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops); InFlag = Chain.getValue(1); + DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo)); Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true), DAG.getIntPtrConstant(0, dl, true), InFlag, dl); @@ -2431,7 +2509,7 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, int FI = std::numeric_limits<int>::max(); if (Arg.getOpcode() == ISD::CopyFromReg) { unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); - if (!TargetRegisterInfo::isVirtualRegister(VR)) + if (!Register::isVirtualRegister(VR)) return false; MachineInstr *Def = MRI->getVRegDef(VR); if (!Def) @@ -3047,12 +3125,12 @@ ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op, // Load the current TEB (thread environment block) SDValue Ops[] = {Chain, - DAG.getConstant(Intrinsic::arm_mrc, DL, MVT::i32), - DAG.getConstant(15, DL, MVT::i32), - DAG.getConstant(0, DL, MVT::i32), - DAG.getConstant(13, DL, MVT::i32), - DAG.getConstant(0, DL, MVT::i32), - DAG.getConstant(2, DL, MVT::i32)}; + DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32), + DAG.getTargetConstant(15, DL, MVT::i32), + DAG.getTargetConstant(0, DL, MVT::i32), + DAG.getTargetConstant(13, DL, MVT::i32), + DAG.getTargetConstant(0, DL, MVT::i32), + DAG.getTargetConstant(2, DL, MVT::i32)}; SDValue CurrentTEB = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, DAG.getVTList(MVT::i32, MVT::Other), Ops); @@ -3498,6 +3576,48 @@ SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, Op.getOperand(0)); } +SDValue ARMTargetLowering::LowerINTRINSIC_VOID( + SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const { + unsigned IntNo = + cast<ConstantSDNode>( + Op.getOperand(Op.getOperand(0).getValueType() == MVT::Other)) + ->getZExtValue(); + switch (IntNo) { + default: + return SDValue(); // Don't custom lower most intrinsics. + case Intrinsic::arm_gnu_eabi_mcount: { + MachineFunction &MF = DAG.getMachineFunction(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + SDLoc dl(Op); + SDValue Chain = Op.getOperand(0); + // call "\01__gnu_mcount_nc" + const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo(); + const uint32_t *Mask = + ARI->getCallPreservedMask(DAG.getMachineFunction(), CallingConv::C); + assert(Mask && "Missing call preserved mask for calling convention"); + // Mark LR an implicit live-in. + unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32)); + SDValue ReturnAddress = + DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, PtrVT); + std::vector<EVT> ResultTys = {MVT::Other, MVT::Glue}; + SDValue Callee = + DAG.getTargetExternalSymbol("\01__gnu_mcount_nc", PtrVT, 0); + SDValue RegisterMask = DAG.getRegisterMask(Mask); + if (Subtarget->isThumb()) + return SDValue( + DAG.getMachineNode( + ARM::tBL_PUSHLR, dl, ResultTys, + {ReturnAddress, DAG.getTargetConstant(ARMCC::AL, dl, PtrVT), + DAG.getRegister(0, PtrVT), Callee, RegisterMask, Chain}), + 0); + return SDValue( + DAG.getMachineNode(ARM::BL_PUSHLR, dl, ResultTys, + {ReturnAddress, Callee, RegisterMask, Chain}), + 0); + } + } +} + SDValue ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const { @@ -3898,6 +4018,12 @@ SDValue ARMTargetLowering::LowerFormalArguments( // Transform the arguments in physical registers into virtual ones. unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); + + // If this value is passed in r0 and has the returned attribute (e.g. 
+ // C++ 'structors), record this fact for later use. + if (VA.getLocReg() == ARM::R0 && Ins[VA.getValNo()].Flags.isReturned()) { + AFI->setPreservesR0(); + } } // If this is an 8 or 16-bit value, it is really passed promoted @@ -4049,6 +4175,67 @@ SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, std::swap(LHS, RHS); } + // Thumb1 has very limited immediate modes, so turning an "and" into a + // shift can save multiple instructions. + // + // If we have (x & C1), and C1 is an appropriate mask, we can transform it + // into "((x << n) >> n)". But that isn't necessarily profitable on its + // own. If it's the operand to an unsigned comparison with an immediate, + // we can eliminate one of the shifts: we transform + // "((x << n) >> n) == C2" to "(x << n) == (C2 << n)". + // + // We avoid transforming cases which aren't profitable due to encoding + // details: + // + // 1. C2 fits into the immediate field of a cmp, and the transformed version + // would not; in that case, we're essentially trading one immediate load for + // another. + // 2. C1 is 255 or 65535, so we can use uxtb or uxth. + // 3. C2 is zero; we have other code for this special case. + // + // FIXME: Figure out profitability for Thumb2; we usually can't save an + // instruction, since the AND is always one instruction anyway, but we could + // use narrow instructions in some cases. + if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::AND && + LHS->hasOneUse() && isa<ConstantSDNode>(LHS.getOperand(1)) && + LHS.getValueType() == MVT::i32 && isa<ConstantSDNode>(RHS) && + !isSignedIntSetCC(CC)) { + unsigned Mask = cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue(); + auto *RHSC = cast<ConstantSDNode>(RHS.getNode()); + uint64_t RHSV = RHSC->getZExtValue(); + if (isMask_32(Mask) && (RHSV & ~Mask) == 0 && Mask != 255 && Mask != 65535) { + unsigned ShiftBits = countLeadingZeros(Mask); + if (RHSV && (RHSV > 255 || (RHSV << ShiftBits) <= 255)) { + SDValue ShiftAmt = DAG.getConstant(ShiftBits, dl, MVT::i32); + LHS = DAG.getNode(ISD::SHL, dl, MVT::i32, LHS.getOperand(0), ShiftAmt); + RHS = DAG.getConstant(RHSV << ShiftBits, dl, MVT::i32); + } + } + } + + // The specific comparison "(x<<c) > 0x80000000U" can be optimized to a + // single "lsls x, c+1". The shift sets the "C" and "Z" flags the same + // way a cmp would. + // FIXME: Add support for ARM/Thumb2; this would need isel patterns, and + // some tweaks to the heuristics for the previous and->shift transform. + // FIXME: Optimize cases where the LHS isn't a shift. 
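The two comment blocks above describe a pair of Thumb1 comparison rewrites: an AND with a low mask becomes a left shift compared against a shifted constant, and "(x << c) > 0x80000000U" becomes a single flag-setting shift tested with HI. A standalone brute-force check of both equivalences over sampled values (a demo of the identities the comments state, not backend code):

```cpp
#include <cassert>
#include <cstdint>
#include <cstdio>

// 1) For a low mask M = (1 << k) - 1 and a constant C2 with (C2 & ~M) == 0:
//      (x & M) == C2   <=>   (x << (32 - k)) == (C2 << (32 - k))
// 2) (x << c) > 0x80000000u  <=>  "lsls x, #c+1; bhi":
//      carry = bit 31 of (x << c), and HI also needs (x << (c + 1)) != 0.
int main() {
  for (uint64_t seed = 1; seed < 2000000; seed += 7919) {
    uint32_t x = (uint32_t)(seed * 2654435761u);

    // (1) mask-to-shift rewrite, k = 10 bits as an example
    const unsigned k = 10;
    const uint32_t M = (1u << k) - 1;
    const uint32_t C2 = 0x2a5 & M;
    bool lhs1 = (x & M) == C2;
    bool rhs1 = (uint32_t)(x << (32 - k)) == (uint32_t)(C2 << (32 - k));
    assert(lhs1 == rhs1);

    // (2) lsls rewrite, c = 3 as an example
    const unsigned c = 3;
    uint32_t v = x << c;
    bool lhs2 = v > 0x80000000u;
    bool carry = (v & 0x80000000u) != 0;        // bit shifted out by lsls #c+1
    bool nonzero = (uint32_t)(x << (c + 1)) != 0;
    assert(lhs2 == (carry && nonzero));         // HI = C set and Z clear
  }
  std::printf("both rewrites hold on the sampled values\n");
}
```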
+ if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::SHL && + isa<ConstantSDNode>(RHS) && + cast<ConstantSDNode>(RHS)->getZExtValue() == 0x80000000U && + CC == ISD::SETUGT && isa<ConstantSDNode>(LHS.getOperand(1)) && + cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() < 31) { + unsigned ShiftAmt = + cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() + 1; + SDValue Shift = DAG.getNode(ARMISD::LSLS, dl, + DAG.getVTList(MVT::i32, MVT::i32), + LHS.getOperand(0), + DAG.getConstant(ShiftAmt, dl, MVT::i32)); + SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR, + Shift.getValue(1), SDValue()); + ARMcc = DAG.getConstant(ARMCC::HI, dl, MVT::i32); + return Chain.getValue(1); + } + ARMCC::CondCodes CondCode = IntCCToARMCC(CC); // If the RHS is a constant zero then the V (overflow) flag will never be @@ -4083,15 +4270,13 @@ SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, /// Returns a appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands. SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS, - SelectionDAG &DAG, const SDLoc &dl, - bool InvalidOnQNaN) const { + SelectionDAG &DAG, const SDLoc &dl) const { assert(Subtarget->hasFP64() || RHS.getValueType() != MVT::f64); SDValue Cmp; - SDValue C = DAG.getConstant(InvalidOnQNaN, dl, MVT::i32); if (!isFloatingPointZero(RHS)) - Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Glue, LHS, RHS, C); + Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Glue, LHS, RHS); else - Cmp = DAG.getNode(ARMISD::CMPFPw0, dl, MVT::Glue, LHS, C); + Cmp = DAG.getNode(ARMISD::CMPFPw0, dl, MVT::Glue, LHS); return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Cmp); } @@ -4108,12 +4293,10 @@ ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const { Cmp = Cmp.getOperand(0); Opc = Cmp.getOpcode(); if (Opc == ARMISD::CMPFP) - Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0), - Cmp.getOperand(1), Cmp.getOperand(2)); + Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1)); else { assert(Opc == ARMISD::CMPFPw0 && "unexpected operand of FMSTAT"); - Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0), - Cmp.getOperand(1)); + Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0)); } return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp); } @@ -4276,6 +4459,35 @@ SDValue ARMTargetLowering::LowerUnsignedALUO(SDValue Op, return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow); } +static SDValue LowerSADDSUBSAT(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *Subtarget) { + EVT VT = Op.getValueType(); + if (!Subtarget->hasDSP()) + return SDValue(); + if (!VT.isSimple()) + return SDValue(); + + unsigned NewOpcode; + bool IsAdd = Op->getOpcode() == ISD::SADDSAT; + switch (VT.getSimpleVT().SimpleTy) { + default: + return SDValue(); + case MVT::i8: + NewOpcode = IsAdd ? ARMISD::QADD8b : ARMISD::QSUB8b; + break; + case MVT::i16: + NewOpcode = IsAdd ? 
ARMISD::QADD16b : ARMISD::QSUB16b; + break; + } + + SDLoc dl(Op); + SDValue Add = + DAG.getNode(NewOpcode, dl, MVT::i32, + DAG.getSExtOrTrunc(Op->getOperand(0), dl, MVT::i32), + DAG.getSExtOrTrunc(Op->getOperand(1), dl, MVT::i32)); + return DAG.getNode(ISD::TRUNCATE, dl, VT, Add); +} + SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { SDValue Cond = Op.getOperand(0); SDValue SelectTrue = Op.getOperand(1); @@ -4656,10 +4868,62 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get(); SDValue TrueVal = Op.getOperand(2); SDValue FalseVal = Op.getOperand(3); + ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FalseVal); + ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TrueVal); + + if (Subtarget->hasV8_1MMainlineOps() && CFVal && CTVal && + LHS.getValueType() == MVT::i32 && RHS.getValueType() == MVT::i32) { + unsigned TVal = CTVal->getZExtValue(); + unsigned FVal = CFVal->getZExtValue(); + unsigned Opcode = 0; + + if (TVal == ~FVal) { + Opcode = ARMISD::CSINV; + } else if (TVal == ~FVal + 1) { + Opcode = ARMISD::CSNEG; + } else if (TVal + 1 == FVal) { + Opcode = ARMISD::CSINC; + } else if (TVal == FVal + 1) { + Opcode = ARMISD::CSINC; + std::swap(TrueVal, FalseVal); + std::swap(TVal, FVal); + CC = ISD::getSetCCInverse(CC, true); + } + + if (Opcode) { + // If one of the constants is cheaper than another, materialise the + // cheaper one and let the csel generate the other. + if (Opcode != ARMISD::CSINC && + HasLowerConstantMaterializationCost(FVal, TVal, Subtarget)) { + std::swap(TrueVal, FalseVal); + std::swap(TVal, FVal); + CC = ISD::getSetCCInverse(CC, true); + } + + // Attempt to use ZR checking TVal is 0, possibly inverting the condition + // to get there. CSINC not is invertable like the other two (~(~a) == a, + // -(-a) == a, but (a+1)+1 != a). + if (FVal == 0 && Opcode != ARMISD::CSINC) { + std::swap(TrueVal, FalseVal); + std::swap(TVal, FVal); + CC = ISD::getSetCCInverse(CC, true); + } + if (TVal == 0) + TrueVal = DAG.getRegister(ARM::ZR, MVT::i32); + + // Drops F's value because we can get it by inverting/negating TVal. + FalseVal = TrueVal; + + SDValue ARMcc; + SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl); + EVT VT = TrueVal.getValueType(); + return DAG.getNode(Opcode, dl, VT, TrueVal, FalseVal, ARMcc, Cmp); + } + } if (isUnsupportedFloatingType(LHS.getValueType())) { DAG.getTargetLoweringInfo().softenSetCCOperands( - DAG, LHS.getValueType(), LHS, RHS, CC, dl); + DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS); // If softenSetCCOperands only returned one value, we should compare it to // zero. @@ -4701,8 +4965,7 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { } ARMCC::CondCodes CondCode, CondCode2; - bool InvalidOnQNaN; - FPCCToARMCC(CC, CondCode, CondCode2, InvalidOnQNaN); + FPCCToARMCC(CC, CondCode, CondCode2); // Normalize the fp compare. 
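The LowerSELECT_CC block above picks CSINV, CSNEG or CSINC when the two select constants are related, so only one of them has to be materialised and the instruction derives the other. A standalone classification of the same constant-pair relationships (illustration only, names mirror the node names in the hunk):

```cpp
#include <cstdint>
#include <cstdio>

// Each instruction produces the "false" value by transforming the register
// holding the "true" value: inverting it, negating it, or adding one.
const char *classifySelectPair(uint32_t TVal, uint32_t FVal) {
  if (TVal == ~FVal)     return "CSINV";  // false = ~true
  if (TVal == 0u - FVal) return "CSNEG";  // false = -true
  if (TVal + 1 == FVal)  return "CSINC";  // false = true + 1
  if (TVal == FVal + 1)  return "CSINC (operands swapped, condition inverted)";
  return "plain CMOV / separate constants";
}

int main() {
  std::printf("%s\n", classifySelectPair(5, ~5u));           // CSINV
  std::printf("%s\n", classifySelectPair(7, (uint32_t)-7));  // CSNEG
  std::printf("%s\n", classifySelectPair(0, 1));             // CSINC
  std::printf("%s\n", classifySelectPair(3, 10));            // plain
}
```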
If RHS is zero we prefer to keep it there so we // match CMPFPw0 instead of CMPFP, though we don't do this for f16 because we @@ -4727,13 +4990,13 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { } SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); - SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, InvalidOnQNaN); + SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl); SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG); if (CondCode2 != ARMCC::AL) { SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32); // FIXME: Needs another CMP because flag can have but one use. - SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl, InvalidOnQNaN); + SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl); Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, CCR, Cmp2, DAG); } return Result; @@ -4903,7 +5166,7 @@ SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { if (isUnsupportedFloatingType(LHS.getValueType())) { DAG.getTargetLoweringInfo().softenSetCCOperands( - DAG, LHS.getValueType(), LHS, RHS, CC, dl); + DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS); // If softenSetCCOperands only returned one value, we should compare it to // zero. @@ -4960,11 +5223,10 @@ SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { } ARMCC::CondCodes CondCode, CondCode2; - bool InvalidOnQNaN; - FPCCToARMCC(CC, CondCode, CondCode2, InvalidOnQNaN); + FPCCToARMCC(CC, CondCode, CondCode2); SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); - SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, InvalidOnQNaN); + SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl); SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue); SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp }; @@ -5056,8 +5318,9 @@ SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { else LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType()); + MakeLibCallOptions CallOptions; return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0), - /*isSigned*/ false, SDLoc(Op)).first; + CallOptions, SDLoc(Op)).first; } return Op; @@ -5120,8 +5383,9 @@ SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { else LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); + MakeLibCallOptions CallOptions; return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0), - /*isSigned*/ false, SDLoc(Op)).first; + CallOptions, SDLoc(Op)).first; } return Op; @@ -5140,7 +5404,7 @@ SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { if (UseNEON) { // Use VBSL to copy the sign bit. - unsigned EncodedVal = ARM_AM::createNEONModImm(0x6, 0x80); + unsigned EncodedVal = ARM_AM::createVMOVModImm(0x6, 0x80); SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32, DAG.getTargetConstant(EncodedVal, dl, MVT::i32)); EVT OpVT = (VT == MVT::f32) ? 
MVT::v2i32 : MVT::v1i64; @@ -5163,7 +5427,7 @@ SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0); Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1); - SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createNEONModImm(0xe, 0xff), + SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff), dl, MVT::i32); AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes); SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask, @@ -5243,7 +5507,7 @@ SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); SDLoc dl(Op); // FIXME probably not meaningful unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); - unsigned FrameReg = ARI.getFrameRegister(MF); + Register FrameReg = ARI.getFrameRegister(MF); SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); while (Depth--) FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, @@ -5253,9 +5517,9 @@ SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { // FIXME? Maybe this could be a TableGen attribute on some registers and // this table could be generated automatically from RegInfo. -unsigned ARMTargetLowering::getRegisterByName(const char* RegName, EVT VT, - SelectionDAG &DAG) const { - unsigned Reg = StringSwitch<unsigned>(RegName) +Register ARMTargetLowering::getRegisterByName(const char* RegName, EVT VT, + const MachineFunction &MF) const { + Register Reg = StringSwitch<unsigned>(RegName) .Case("sp", ARM::SP) .Default(0); if (Reg) @@ -5576,8 +5840,7 @@ static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST) { SDLoc dl(N); EVT VT = N->getValueType(0); - if (VT.isVector()) { - assert(ST->hasNEON()); + if (VT.isVector() && ST->hasNEON()) { // Compute the least significant set bit: LSB = X & -X SDValue X = N->getOperand(0); @@ -5777,14 +6040,15 @@ static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG, unsigned ShPartsOpc = ARMISD::LSLL; ConstantSDNode *Con = dyn_cast<ConstantSDNode>(ShAmt); - // If the shift amount is greater than 32 then do the default optimisation - if (Con && Con->getZExtValue() > 32) + // If the shift amount is greater than 32 or has a greater bitwidth than 64 + // then do the default optimisation + if (ShAmt->getValueType(0).getSizeInBits() > 64 || + (Con && (Con->getZExtValue() == 0 || Con->getZExtValue() >= 32))) return SDValue(); - // Extract the lower 32 bits of the shift amount if it's an i64 - if (ShAmt->getValueType(0) == MVT::i64) - ShAmt = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, ShAmt, - DAG.getConstant(0, dl, MVT::i32)); + // Extract the lower 32 bits of the shift amount if it's not an i32 + if (ShAmt->getValueType(0) != MVT::i32) + ShAmt = DAG.getZExtOrTrunc(ShAmt, dl, MVT::i32); if (ShOpc == ISD::SRL) { if (!Con) @@ -5839,20 +6103,37 @@ static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG, return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi); } -static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) { - SDValue TmpOp0, TmpOp1; +static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *ST) { bool Invert = false; bool Swap = false; - unsigned Opc = 0; + unsigned Opc = ARMCC::AL; SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); SDValue CC = Op.getOperand(2); - EVT CmpVT = Op0.getValueType().changeVectorElementTypeToInteger(); EVT VT = Op.getValueType(); ISD::CondCode SetCCOpcode = 
cast<CondCodeSDNode>(CC)->get(); SDLoc dl(Op); + EVT CmpVT; + if (ST->hasNEON()) + CmpVT = Op0.getValueType().changeVectorElementTypeToInteger(); + else { + assert(ST->hasMVEIntegerOps() && + "No hardware support for integer vector comparison!"); + + if (Op.getValueType().getVectorElementType() != MVT::i1) + return SDValue(); + + // Make sure we expand floating point setcc to scalar if we do not have + // mve.fp, so that we can handle them from there. + if (Op0.getValueType().isFloatingPoint() && !ST->hasMVEFloatOps()) + return SDValue(); + + CmpVT = VT; + } + if (Op0.getValueType().getVectorElementType() == MVT::i64 && (SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE)) { // Special-case integer 64-bit equality comparisons. They aren't legal, @@ -5880,60 +6161,74 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) { switch (SetCCOpcode) { default: llvm_unreachable("Illegal FP comparison"); case ISD::SETUNE: - case ISD::SETNE: Invert = true; LLVM_FALLTHROUGH; + case ISD::SETNE: + if (ST->hasMVEFloatOps()) { + Opc = ARMCC::NE; break; + } else { + Invert = true; LLVM_FALLTHROUGH; + } case ISD::SETOEQ: - case ISD::SETEQ: Opc = ARMISD::VCEQ; break; + case ISD::SETEQ: Opc = ARMCC::EQ; break; case ISD::SETOLT: case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH; case ISD::SETOGT: - case ISD::SETGT: Opc = ARMISD::VCGT; break; + case ISD::SETGT: Opc = ARMCC::GT; break; case ISD::SETOLE: case ISD::SETLE: Swap = true; LLVM_FALLTHROUGH; case ISD::SETOGE: - case ISD::SETGE: Opc = ARMISD::VCGE; break; + case ISD::SETGE: Opc = ARMCC::GE; break; case ISD::SETUGE: Swap = true; LLVM_FALLTHROUGH; - case ISD::SETULE: Invert = true; Opc = ARMISD::VCGT; break; + case ISD::SETULE: Invert = true; Opc = ARMCC::GT; break; case ISD::SETUGT: Swap = true; LLVM_FALLTHROUGH; - case ISD::SETULT: Invert = true; Opc = ARMISD::VCGE; break; + case ISD::SETULT: Invert = true; Opc = ARMCC::GE; break; case ISD::SETUEQ: Invert = true; LLVM_FALLTHROUGH; - case ISD::SETONE: + case ISD::SETONE: { // Expand this to (OLT | OGT). - TmpOp0 = Op0; - TmpOp1 = Op1; - Opc = ISD::OR; - Op0 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp1, TmpOp0); - Op1 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp0, TmpOp1); - break; - case ISD::SETUO: - Invert = true; - LLVM_FALLTHROUGH; - case ISD::SETO: + SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0, + DAG.getConstant(ARMCC::GT, dl, MVT::i32)); + SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1, + DAG.getConstant(ARMCC::GT, dl, MVT::i32)); + SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1); + if (Invert) + Result = DAG.getNOT(dl, Result, VT); + return Result; + } + case ISD::SETUO: Invert = true; LLVM_FALLTHROUGH; + case ISD::SETO: { // Expand this to (OLT | OGE). - TmpOp0 = Op0; - TmpOp1 = Op1; - Opc = ISD::OR; - Op0 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp1, TmpOp0); - Op1 = DAG.getNode(ARMISD::VCGE, dl, CmpVT, TmpOp0, TmpOp1); - break; + SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0, + DAG.getConstant(ARMCC::GT, dl, MVT::i32)); + SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1, + DAG.getConstant(ARMCC::GE, dl, MVT::i32)); + SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1); + if (Invert) + Result = DAG.getNOT(dl, Result, VT); + return Result; + } } } else { // Integer comparisons. 
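The SETONE and SETO cases above expand into an OR of two VCMP nodes, with the unordered variants inverting the result. The same expansions in scalar form, as a quick NaN-aware demo (plain C++, not backend code):

```cpp
#include <cmath>
#include <cstdio>

bool cmpONE(float a, float b) { return (b > a) || (a > b); }  // OLT | OGT
bool cmpO(float a, float b)   { return (b > a) || (a >= b); } // OLT | OGE
bool cmpUO(float a, float b)  { return !cmpO(a, b); }         // invert SETO
bool cmpUEQ(float a, float b) { return !cmpONE(a, b); }       // invert SETONE

int main() {
  float nan = std::nanf("");
  std::printf("%d %d\n", cmpONE(1.0f, 2.0f), cmpONE(1.0f, nan)); // 1 0
  std::printf("%d %d\n", cmpUO(1.0f, 2.0f), cmpUO(nan, 2.0f));   // 0 1
  std::printf("%d\n", cmpUEQ(nan, nan));                         // 1
}
```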
switch (SetCCOpcode) { default: llvm_unreachable("Illegal integer comparison"); - case ISD::SETNE: Invert = true; LLVM_FALLTHROUGH; - case ISD::SETEQ: Opc = ARMISD::VCEQ; break; + case ISD::SETNE: + if (ST->hasMVEIntegerOps()) { + Opc = ARMCC::NE; break; + } else { + Invert = true; LLVM_FALLTHROUGH; + } + case ISD::SETEQ: Opc = ARMCC::EQ; break; case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH; - case ISD::SETGT: Opc = ARMISD::VCGT; break; + case ISD::SETGT: Opc = ARMCC::GT; break; case ISD::SETLE: Swap = true; LLVM_FALLTHROUGH; - case ISD::SETGE: Opc = ARMISD::VCGE; break; + case ISD::SETGE: Opc = ARMCC::GE; break; case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH; - case ISD::SETUGT: Opc = ARMISD::VCGTU; break; + case ISD::SETUGT: Opc = ARMCC::HI; break; case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH; - case ISD::SETUGE: Opc = ARMISD::VCGEU; break; + case ISD::SETUGE: Opc = ARMCC::HS; break; } // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero). - if (Opc == ARMISD::VCEQ) { + if (ST->hasNEON() && Opc == ARMCC::EQ) { SDValue AndOp; if (ISD::isBuildVectorAllZeros(Op1.getNode())) AndOp = Op0; @@ -5945,10 +6240,12 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) { AndOp = AndOp.getOperand(0); if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) { - Opc = ARMISD::VTST; Op0 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(0)); Op1 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(1)); - Invert = !Invert; + SDValue Result = DAG.getNode(ARMISD::VTST, dl, CmpVT, Op0, Op1); + if (!Invert) + Result = DAG.getNOT(dl, Result, VT); + return Result; } } } @@ -5962,31 +6259,20 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) { if (ISD::isBuildVectorAllZeros(Op1.getNode())) SingleOp = Op0; else if (ISD::isBuildVectorAllZeros(Op0.getNode())) { - if (Opc == ARMISD::VCGE) - Opc = ARMISD::VCLEZ; - else if (Opc == ARMISD::VCGT) - Opc = ARMISD::VCLTZ; + if (Opc == ARMCC::GE) + Opc = ARMCC::LE; + else if (Opc == ARMCC::GT) + Opc = ARMCC::LT; SingleOp = Op1; } SDValue Result; if (SingleOp.getNode()) { - switch (Opc) { - case ARMISD::VCEQ: - Result = DAG.getNode(ARMISD::VCEQZ, dl, CmpVT, SingleOp); break; - case ARMISD::VCGE: - Result = DAG.getNode(ARMISD::VCGEZ, dl, CmpVT, SingleOp); break; - case ARMISD::VCLEZ: - Result = DAG.getNode(ARMISD::VCLEZ, dl, CmpVT, SingleOp); break; - case ARMISD::VCGT: - Result = DAG.getNode(ARMISD::VCGTZ, dl, CmpVT, SingleOp); break; - case ARMISD::VCLTZ: - Result = DAG.getNode(ARMISD::VCLTZ, dl, CmpVT, SingleOp); break; - default: - Result = DAG.getNode(Opc, dl, CmpVT, Op0, Op1); - } + Result = DAG.getNode(ARMISD::VCMPZ, dl, CmpVT, SingleOp, + DAG.getConstant(Opc, dl, MVT::i32)); } else { - Result = DAG.getNode(Opc, dl, CmpVT, Op0, Op1); + Result = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1, + DAG.getConstant(Opc, dl, MVT::i32)); } Result = DAG.getSExtOrTrunc(Result, dl, VT); @@ -6027,13 +6313,13 @@ static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) { CCR, Chain.getValue(1)); } -/// isNEONModifiedImm - Check if the specified splat value corresponds to a -/// valid vector constant for a NEON or MVE instruction with a "modified immediate" -/// operand (e.g., VMOV). If so, return the encoded value. -static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, +/// isVMOVModifiedImm - Check if the specified splat value corresponds to a +/// valid vector constant for a NEON or MVE instruction with a "modified +/// immediate" operand (e.g., VMOV). If so, return the encoded value. 
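The VTST detection above recognises "icmp ne (and a, b), zero" as a test-bits operation. The same pattern written out per lane in scalar form, for illustration only:

```cpp
#include <cstdint>
#include <cstdio>

// A lane is all-ones when the two inputs share any set bit, which is
// exactly what the matched (and != 0) pattern computes.
void vtst8(const uint8_t a[8], const uint8_t b[8], uint8_t out[8]) {
  for (int i = 0; i < 8; ++i)
    out[i] = (a[i] & b[i]) != 0 ? 0xff : 0x00;
}

int main() {
  uint8_t a[8] = {0x01, 0x02, 0x0f, 0x80, 0x00, 0xff, 0x10, 0x08};
  uint8_t b[8] = {0x01, 0x04, 0xf0, 0x80, 0xff, 0x00, 0x30, 0x08};
  uint8_t r[8];
  vtst8(a, b, r);
  for (uint8_t v : r) std::printf("%02x ", v); // ff 00 00 ff 00 00 ff ff
  std::printf("\n");
}
```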
+static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, unsigned SplatBitSize, SelectionDAG &DAG, const SDLoc &dl, EVT &VT, bool is128Bits, - NEONModImmType type) { + VMOVModImmType type) { unsigned OpCmode, Imm; // SplatBitSize is set to the smallest size that splats the vector, so a @@ -6163,10 +6449,10 @@ static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, } default: - llvm_unreachable("unexpected size for isNEONModifiedImm"); + llvm_unreachable("unexpected size for isVMOVModifiedImm"); } - unsigned EncodedVal = ARM_AM::createNEONModImm(OpCmode, Imm); + unsigned EncodedVal = ARM_AM::createVMOVModImm(OpCmode, Imm); return DAG.getTargetConstant(EncodedVal, dl, MVT::i32); } @@ -6246,7 +6532,7 @@ SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG, return SDValue(); // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too). - SDValue NewVal = isNEONModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), + SDValue NewVal = isVMOVModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT, false, VMOVModImm); if (NewVal != SDValue()) { SDLoc DL(Op); @@ -6263,7 +6549,7 @@ SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG, } // Finally, try a VMVN.i32 - NewVal = isNEONModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT, + NewVal = isVMOVModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT, false, VMVNModImm); if (NewVal != SDValue()) { SDLoc DL(Op); @@ -6649,6 +6935,29 @@ static bool isReverseMask(ArrayRef<int> M, EVT VT) { return true; } +static bool isVMOVNMask(ArrayRef<int> M, EVT VT, bool Top) { + unsigned NumElts = VT.getVectorNumElements(); + // Make sure the mask has the right size. + if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8)) + return false; + + // If Top + // Look for <0, N, 2, N+2, 4, N+4, ..>. + // This inserts Input2 into Input1 + // else if not Top + // Look for <0, N+1, 2, N+3, 4, N+5, ..> + // This inserts Input1 into Input2 + unsigned Offset = Top ? 0 : 1; + for (unsigned i = 0; i < NumElts; i+=2) { + if (M[i] >= 0 && M[i] != (int)i) + return false; + if (M[i+1] >= 0 && M[i+1] != (int)(NumElts + i + Offset)) + return false; + } + + return true; +} + // If N is an integer constant that can be moved into a register in one // instruction, return an SDValue of such a constant (will become a MOV // instruction). Otherwise return null. 
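isVMOVNMask above accepts exactly the two interleavings that a VMOVN-style insert into the top or bottom halves can produce. A standalone copy of the lane test over a plain mask array, with concrete v8i16 masks (the helper name is mine, for illustration only):

#include <cassert>
#include <vector>

static bool vmovnLikeMask(const std::vector<int> &M, unsigned NumElts, bool Top) {
  if (M.size() != NumElts)
    return false;
  unsigned Offset = Top ? 0 : 1;
  for (unsigned i = 0; i < NumElts; i += 2) {
    if (M[i] >= 0 && M[i] != (int)i)                              // even lanes stay in input 1
      return false;
    if (M[i + 1] >= 0 && M[i + 1] != (int)(NumElts + i + Offset)) // odd lanes come from input 2
      return false;
  }
  return true;
}

int main() {
  // v8i16: <0,8,2,10,4,12,6,14> inserts input 2 into input 1 (Top form).
  assert(vmovnLikeMask({0, 8, 2, 10, 4, 12, 6, 14}, 8, /*Top=*/true));
  // <0,9,2,11,4,13,6,15> is the non-Top form.
  assert(vmovnLikeMask({0, 9, 2, 11, 4, 13, 6, 15}, 8, /*Top=*/false));
  // Anything else is rejected.
  assert(!vmovnLikeMask({0, 8, 2, 10, 4, 12, 6, 15}, 8, /*Top=*/true));
  return 0;
}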
@@ -6669,6 +6978,66 @@ static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG, return SDValue(); } +static SDValue LowerBUILD_VECTOR_i1(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *ST) { + SDLoc dl(Op); + EVT VT = Op.getValueType(); + + assert(ST->hasMVEIntegerOps() && "LowerBUILD_VECTOR_i1 called without MVE!"); + + unsigned NumElts = VT.getVectorNumElements(); + unsigned BoolMask; + unsigned BitsPerBool; + if (NumElts == 4) { + BitsPerBool = 4; + BoolMask = 0xf; + } else if (NumElts == 8) { + BitsPerBool = 2; + BoolMask = 0x3; + } else if (NumElts == 16) { + BitsPerBool = 1; + BoolMask = 0x1; + } else + return SDValue(); + + // If this is a single value copied into all lanes (a splat), we can just sign + // extend that single value + SDValue FirstOp = Op.getOperand(0); + if (!isa<ConstantSDNode>(FirstOp) && + std::all_of(std::next(Op->op_begin()), Op->op_end(), + [&FirstOp](SDUse &U) { + return U.get().isUndef() || U.get() == FirstOp; + })) { + SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32, FirstOp, + DAG.getValueType(MVT::i1)); + return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), Ext); + } + + // First create base with bits set where known + unsigned Bits32 = 0; + for (unsigned i = 0; i < NumElts; ++i) { + SDValue V = Op.getOperand(i); + if (!isa<ConstantSDNode>(V) && !V.isUndef()) + continue; + bool BitSet = V.isUndef() ? false : cast<ConstantSDNode>(V)->getZExtValue(); + if (BitSet) + Bits32 |= BoolMask << (i * BitsPerBool); + } + + // Add in unknown nodes + SDValue Base = DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, + DAG.getConstant(Bits32, dl, MVT::i32)); + for (unsigned i = 0; i < NumElts; ++i) { + SDValue V = Op.getOperand(i); + if (isa<ConstantSDNode>(V) || V.isUndef()) + continue; + Base = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Base, V, + DAG.getConstant(i, dl, MVT::i32)); + } + + return Base; +} + // If this is a case we can't handle, return null and let the default // expansion code take care of it. SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, @@ -6677,6 +7046,9 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, SDLoc dl(Op); EVT VT = Op.getValueType(); + if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1) + return LowerBUILD_VECTOR_i1(Op, DAG, ST); + APInt SplatBits, SplatUndef; unsigned SplatBitSize; bool HasAnyUndefs; @@ -6688,7 +7060,7 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, (ST->hasMVEIntegerOps() && SplatBitSize <= 32)) { // Check if an immediate VMOV works. EVT VmovVT; - SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(), + SDValue Val = isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(), SplatBitSize, DAG, dl, VmovVT, VT.is128BitVector(), VMOVModImm); @@ -6700,7 +7072,7 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, // Try an immediate VMVN. uint64_t NegatedImm = (~SplatBits).getZExtValue(); - Val = isNEONModifiedImm( + Val = isVMOVModifiedImm( NegatedImm, SplatUndef.getZExtValue(), SplatBitSize, DAG, dl, VmovVT, VT.is128BitVector(), ST->hasMVEIntegerOps() ? MVEVMVNModImm : VMVNModImm); @@ -7088,9 +7460,6 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op, LaneMask[j] = ExtractBase + j; } - // Final check before we try to produce nonsense... - if (!isShuffleMaskLegal(Mask, ShuffleVT)) - return SDValue(); // We can't handle more than two sources. This should have already // been checked before this point. 
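LowerBUILD_VECTOR_i1 above packs the known lanes of a v4i1/v8i1/v16i1 into the 16 predicate bits, with each lane owning 16/NumElts bits; unknown lanes are inserted afterwards with INSERT_VECTOR_ELT. A minimal standalone sketch of just the constant-packing step (assuming NumElts is 4, 8 or 16; the helper name is illustrative):

#include <cassert>
#include <cstdint>
#include <vector>

static uint32_t packPredicateConstant(const std::vector<int> &Lanes /* 1, 0, or -1 for unknown */) {
  unsigned NumElts = Lanes.size();
  unsigned BitsPerBool = 16 / NumElts;          // 4, 2 or 1 for v4i1/v8i1/v16i1
  uint32_t BoolMask = (1u << BitsPerBool) - 1;  // 0xf, 0x3 or 0x1
  uint32_t Bits = 0;
  for (unsigned i = 0; i < NumElts; ++i)
    if (Lanes[i] == 1)                          // only known-true lanes set bits here;
      Bits |= BoolMask << (i * BitsPerBool);    // unknown lanes are inserted afterwards
  return Bits;
}

int main() {
  // v4i1 <1,0,1,0>: lanes 0 and 2 each set their four bits -> 0x0f0f.
  assert(packPredicateConstant({1, 0, 1, 0}) == 0x0f0f);
  // v8i1 with only lane 7 true -> the top two bits: 0xc000.
  assert(packPredicateConstant({0, 0, 0, 0, 0, 0, 0, 1}) == 0xc000);
  return 0;
}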
@@ -7100,8 +7469,10 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op, for (unsigned i = 0; i < Sources.size(); ++i) ShuffleOps[i] = Sources[i].ShuffleVec; - SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0], - ShuffleOps[1], Mask); + SDValue Shuffle = buildLegalVectorShuffle(ShuffleVT, dl, ShuffleOps[0], + ShuffleOps[1], Mask, DAG); + if (!Shuffle) + return SDValue(); return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle); } @@ -7168,6 +7539,7 @@ bool ARMTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const { unsigned EltSize = VT.getScalarSizeInBits(); if (EltSize >= 32 || ShuffleVectorSDNode::isSplatMask(&M[0], VT) || + ShuffleVectorInst::isIdentityMask(M) || isVREVMask(M, VT, 64) || isVREVMask(M, VT, 32) || isVREVMask(M, VT, 16)) @@ -7180,6 +7552,9 @@ bool ARMTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const { else if (Subtarget->hasNEON() && (VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(M, VT)) return true; + else if (Subtarget->hasMVEIntegerOps() && + (isVMOVNMask(M, VT, 0) || isVMOVNMask(M, VT, 1))) + return true; else return false; } @@ -7282,6 +7657,94 @@ static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op, DAG.getConstant(ExtractNum, DL, MVT::i32)); } +static EVT getVectorTyFromPredicateVector(EVT VT) { + switch (VT.getSimpleVT().SimpleTy) { + case MVT::v4i1: + return MVT::v4i32; + case MVT::v8i1: + return MVT::v8i16; + case MVT::v16i1: + return MVT::v16i8; + default: + llvm_unreachable("Unexpected vector predicate type"); + } +} + +static SDValue PromoteMVEPredVector(SDLoc dl, SDValue Pred, EVT VT, + SelectionDAG &DAG) { + // Converting from boolean predicates to integers involves creating a vector + // of all ones or all zeroes and selecting the lanes based upon the real + // predicate. + SDValue AllOnes = + DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff), dl, MVT::i32); + AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllOnes); + + SDValue AllZeroes = + DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0x0), dl, MVT::i32); + AllZeroes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllZeroes); + + // Get full vector type from predicate type + EVT NewVT = getVectorTyFromPredicateVector(VT); + + SDValue RecastV1; + // If the real predicate is an v8i1 or v4i1 (not v16i1) then we need to recast + // this to a v16i1. This cannot be done with an ordinary bitcast because the + // sizes are not the same. We have to use a MVE specific PREDICATE_CAST node, + // since we know in hardware the sizes are really the same. + if (VT != MVT::v16i1) + RecastV1 = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Pred); + else + RecastV1 = Pred; + + // Select either all ones or zeroes depending upon the real predicate bits. + SDValue PredAsVector = + DAG.getNode(ISD::VSELECT, dl, MVT::v16i8, RecastV1, AllOnes, AllZeroes); + + // Recast our new predicate-as-integer v16i8 vector into something + // appropriate for the shuffle, i.e. v4i32 for a real v4i1 predicate. 
+ return DAG.getNode(ISD::BITCAST, dl, NewVT, PredAsVector); +} + +static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *ST) { + EVT VT = Op.getValueType(); + ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode()); + ArrayRef<int> ShuffleMask = SVN->getMask(); + + assert(ST->hasMVEIntegerOps() && + "No support for vector shuffle of boolean predicates"); + + SDValue V1 = Op.getOperand(0); + SDLoc dl(Op); + if (isReverseMask(ShuffleMask, VT)) { + SDValue cast = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, V1); + SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, cast); + SDValue srl = DAG.getNode(ISD::SRL, dl, MVT::i32, rbit, + DAG.getConstant(16, dl, MVT::i32)); + return DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, srl); + } + + // Until we can come up with optimised cases for every single vector + // shuffle in existence we have chosen the least painful strategy. This is + // to essentially promote the boolean predicate to a 8-bit integer, where + // each predicate represents a byte. Then we fall back on a normal integer + // vector shuffle and convert the result back into a predicate vector. In + // many cases the generated code might be even better than scalar code + // operating on bits. Just imagine trying to shuffle 8 arbitrary 2-bit + // fields in a register into 8 other arbitrary 2-bit fields! + SDValue PredAsVector = PromoteMVEPredVector(dl, V1, VT, DAG); + EVT NewVT = PredAsVector.getValueType(); + + // Do the shuffle! + SDValue Shuffled = DAG.getVectorShuffle(NewVT, dl, PredAsVector, + DAG.getUNDEF(NewVT), ShuffleMask); + + // Now return the result of comparing the shuffled vector with zero, + // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. + return DAG.getNode(ARMISD::VCMPZ, dl, VT, Shuffled, + DAG.getConstant(ARMCC::NE, dl, MVT::i32)); +} + static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST) { SDValue V1 = Op.getOperand(0); @@ -7289,6 +7752,10 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, SDLoc dl(Op); EVT VT = Op.getValueType(); ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode()); + unsigned EltSize = VT.getScalarSizeInBits(); + + if (ST->hasMVEIntegerOps() && EltSize == 1) + return LowerVECTOR_SHUFFLE_i1(Op, DAG, ST); // Convert shuffles that are directly supported on NEON to target-specific // DAG nodes, instead of keeping them as shuffles and matching them again @@ -7298,7 +7765,6 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, // of the same time so that they get CSEd properly. 
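LowerVECTOR_SHUFFLE_i1 above has two paths: a reverse mask is handled directly on the 16 predicate bits via PREDICATE_CAST, BITREVERSE and a shift, while everything else is promoted to a byte vector, shuffled normally, and compared against zero to recover a predicate. A small standalone model of the bit trick in the reverse case, for a v16i1 where each lane is a single bit (helper names mine):

#include <cassert>
#include <cstdint>

static uint32_t bitreverse32(uint32_t V) {
  uint32_t R = 0;
  for (int i = 0; i < 32; ++i)
    R |= ((V >> i) & 1u) << (31 - i);
  return R;
}

static uint16_t reversePredicate(uint16_t P) {
  // Mirrors the BITREVERSE on i32 followed by the SRL-by-16 in the code above:
  // reversing 16 bits inside a 32-bit value leaves the result in the high half.
  return static_cast<uint16_t>(bitreverse32(P) >> 16);
}

int main() {
  assert(reversePredicate(0x0001) == 0x8000);  // lane 0 -> lane 15
  assert(reversePredicate(0x000f) == 0xf000);  // low group of bits moves to the top
  return 0;
}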
ArrayRef<int> ShuffleMask = SVN->getMask(); - unsigned EltSize = VT.getScalarSizeInBits(); if (EltSize <= 32) { if (SVN->isSplat()) { int Lane = SVN->getSplatIndex(); @@ -7364,6 +7830,14 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, .getValue(WhichResult); } } + if (ST->hasMVEIntegerOps()) { + if (isVMOVNMask(ShuffleMask, VT, 0)) + return DAG.getNode(ARMISD::VMOVN, dl, VT, V2, V1, + DAG.getConstant(0, dl, MVT::i32)); + if (isVMOVNMask(ShuffleMask, VT, 1)) + return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V2, + DAG.getConstant(1, dl, MVT::i32)); + } // Also check for these shuffles through CONCAT_VECTORS: we canonicalize // shuffles that produce a result larger than their operands with: @@ -7468,8 +7942,29 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, return SDValue(); } -SDValue ARMTargetLowering:: -LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { +static SDValue LowerINSERT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *ST) { + EVT VecVT = Op.getOperand(0).getValueType(); + SDLoc dl(Op); + + assert(ST->hasMVEIntegerOps() && + "LowerINSERT_VECTOR_ELT_i1 called without MVE!"); + + SDValue Conv = + DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0)); + unsigned Lane = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); + unsigned LaneWidth = + getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8; + unsigned Mask = ((1 << LaneWidth) - 1) << Lane * LaneWidth; + SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32, + Op.getOperand(1), DAG.getValueType(MVT::i1)); + SDValue BFI = DAG.getNode(ARMISD::BFI, dl, MVT::i32, Conv, Ext, + DAG.getConstant(~Mask, dl, MVT::i32)); + return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), BFI); +} + +SDValue ARMTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, + SelectionDAG &DAG) const { // INSERT_VECTOR_ELT is legal only for immediate indexes. SDValue Lane = Op.getOperand(2); if (!isa<ConstantSDNode>(Lane)) @@ -7477,6 +7972,11 @@ LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { SDValue Elt = Op.getOperand(1); EVT EltVT = Elt.getValueType(); + + if (Subtarget->hasMVEIntegerOps() && + Op.getValueType().getScalarSizeInBits() == 1) + return LowerINSERT_VECTOR_ELT_i1(Op, DAG, Subtarget); + if (getTypeAction(*DAG.getContext(), EltVT) == TargetLowering::TypePromoteFloat) { // INSERT_VECTOR_ELT doesn't want f16 operands promoting to f32, @@ -7505,13 +8005,37 @@ LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { return Op; } -static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { +static SDValue LowerEXTRACT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *ST) { + EVT VecVT = Op.getOperand(0).getValueType(); + SDLoc dl(Op); + + assert(ST->hasMVEIntegerOps() && + "LowerINSERT_VECTOR_ELT_i1 called without MVE!"); + + SDValue Conv = + DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0)); + unsigned Lane = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); + unsigned LaneWidth = + getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8; + SDValue Shift = DAG.getNode(ISD::SRL, dl, MVT::i32, Conv, + DAG.getConstant(Lane * LaneWidth, dl, MVT::i32)); + return Shift; +} + +static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *ST) { // EXTRACT_VECTOR_ELT is legal only for immediate indexes. 
SDValue Lane = Op.getOperand(1); if (!isa<ConstantSDNode>(Lane)) return SDValue(); SDValue Vec = Op.getOperand(0); + EVT VT = Vec.getValueType(); + + if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1) + return LowerEXTRACT_VECTOR_ELT_i1(Op, DAG, ST); + if (Op.getValueType() == MVT::i32 && Vec.getScalarValueSizeInBits() < 32) { SDLoc dl(Op); return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane); @@ -7520,7 +8044,64 @@ static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { return Op; } -static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { +static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *ST) { + SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); + SDLoc dl(Op); + EVT VT = Op.getValueType(); + EVT Op1VT = V1.getValueType(); + EVT Op2VT = V2.getValueType(); + unsigned NumElts = VT.getVectorNumElements(); + + assert(Op1VT == Op2VT && "Operand types don't match!"); + assert(VT.getScalarSizeInBits() == 1 && + "Unexpected custom CONCAT_VECTORS lowering"); + assert(ST->hasMVEIntegerOps() && + "CONCAT_VECTORS lowering only supported for MVE"); + + SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG); + SDValue NewV2 = PromoteMVEPredVector(dl, V2, Op2VT, DAG); + + // We now have Op1 + Op2 promoted to vectors of integers, where v8i1 gets + // promoted to v8i16, etc. + + MVT ElType = getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT(); + + // Extract the vector elements from Op1 and Op2 one by one and truncate them + // to be the right size for the destination. For example, if Op1 is v4i1 then + // the promoted vector is v4i32. The result of concatentation gives a v8i1, + // which when promoted is v8i16. That means each i32 element from Op1 needs + // truncating to i16 and inserting in the result. + EVT ConcatVT = MVT::getVectorVT(ElType, NumElts); + SDValue ConVec = DAG.getNode(ISD::UNDEF, dl, ConcatVT); + auto ExractInto = [&DAG, &dl](SDValue NewV, SDValue ConVec, unsigned &j) { + EVT NewVT = NewV.getValueType(); + EVT ConcatVT = ConVec.getValueType(); + for (unsigned i = 0, e = NewVT.getVectorNumElements(); i < e; i++, j++) { + SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV, + DAG.getIntPtrConstant(i, dl)); + ConVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ConcatVT, ConVec, Elt, + DAG.getConstant(j, dl, MVT::i32)); + } + return ConVec; + }; + unsigned j = 0; + ConVec = ExractInto(NewV1, ConVec, j); + ConVec = ExractInto(NewV2, ConVec, j); + + // Now return the result of comparing the subvector with zero, + // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. + return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec, + DAG.getConstant(ARMCC::NE, dl, MVT::i32)); +} + +static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *ST) { + EVT VT = Op->getValueType(0); + if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1) + return LowerCONCAT_VECTORS_i1(Op, DAG, ST); + // The only time a CONCAT_VECTORS operation can have legal types is when // two 64-bit vectors are concatenated to a 128-bit vector. 
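The i1 insert and extract lowerings above treat the predicate as a 16-bit integer in which each lane owns LaneWidth = 4/2/1 bits; inserting a lane is a BFI-style masked merge and extracting is a shift. A standalone sketch of that bit arithmetic (helper names are illustrative only):

#include <cassert>
#include <cstdint>

static uint32_t insertPredLane(uint32_t Pred, unsigned Lane, bool Val,
                               unsigned LaneWidth) {
  uint32_t Mask = ((1u << LaneWidth) - 1) << (Lane * LaneWidth);
  uint32_t Ext = Val ? ~0u : 0u;            // the i1 sign-extended to all-ones/zero
  return (Pred & ~Mask) | (Ext & Mask);     // what the ARMISD::BFI node computes
}

static bool extractPredLane(uint32_t Pred, unsigned Lane, unsigned LaneWidth) {
  return (Pred >> (Lane * LaneWidth)) & 1u; // low bit of the lane's field
}

int main() {
  uint32_t P = 0;
  P = insertPredLane(P, /*Lane=*/2, true, /*LaneWidth=*/4);   // v4i1 lane 2
  assert(P == 0x0f00 && extractPredLane(P, 2, 4) && !extractPredLane(P, 1, 4));
  P = insertPredLane(P, 2, false, 4);
  assert(P == 0);
  return 0;
}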
assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 && @@ -7540,6 +8121,43 @@ static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val); } +static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *ST) { + SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); + SDLoc dl(Op); + EVT VT = Op.getValueType(); + EVT Op1VT = V1.getValueType(); + unsigned NumElts = VT.getVectorNumElements(); + unsigned Index = cast<ConstantSDNode>(V2)->getZExtValue(); + + assert(VT.getScalarSizeInBits() == 1 && + "Unexpected custom EXTRACT_SUBVECTOR lowering"); + assert(ST->hasMVEIntegerOps() && + "EXTRACT_SUBVECTOR lowering only supported for MVE"); + + SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG); + + // We now have Op1 promoted to a vector of integers, where v8i1 gets + // promoted to v8i16, etc. + + MVT ElType = getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT(); + + EVT SubVT = MVT::getVectorVT(ElType, NumElts); + SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT); + for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j++) { + SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1, + DAG.getIntPtrConstant(i, dl)); + SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt, + DAG.getConstant(j, dl, MVT::i32)); + } + + // Now return the result of comparing the subvector with zero, + // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. + return DAG.getNode(ARMISD::VCMPZ, dl, VT, SubVec, + DAG.getConstant(ARMCC::NE, dl, MVT::i32)); +} + /// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each /// element has been zero/sign-extended, depending on the isSigned parameter, /// from an integer type half its size. @@ -7897,7 +8515,8 @@ static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl, return N0; } -static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) { +static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *ST) { EVT VT = Op.getValueType(); assert((VT == MVT::v4i16 || VT == MVT::v8i8) && "unexpected type for custom-lowering ISD::SDIV"); @@ -7924,7 +8543,7 @@ static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) { N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2); - N0 = LowerCONCAT_VECTORS(N0, DAG); + N0 = LowerCONCAT_VECTORS(N0, DAG, ST); N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0); return N0; @@ -7932,7 +8551,8 @@ static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) { return LowerSDIV_v4i16(N0, N1, dl, DAG); } -static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) { +static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *ST) { // TODO: Should this propagate fast-math-flags? 
EVT VT = Op.getValueType(); assert((VT == MVT::v4i16 || VT == MVT::v8i8) && @@ -7960,7 +8580,7 @@ static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) { N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2); - N0 = LowerCONCAT_VECTORS(N0, DAG); + N0 = LowerCONCAT_VECTORS(N0, DAG, ST); N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8, DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, dl, @@ -8255,6 +8875,96 @@ void ARMTargetLowering::ExpandDIV_Windows( Results.push_back(Upper); } +static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG) { + LoadSDNode *LD = cast<LoadSDNode>(Op.getNode()); + EVT MemVT = LD->getMemoryVT(); + assert((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || MemVT == MVT::v16i1) && + "Expected a predicate type!"); + assert(MemVT == Op.getValueType()); + assert(LD->getExtensionType() == ISD::NON_EXTLOAD && + "Expected a non-extending load"); + assert(LD->isUnindexed() && "Expected a unindexed load"); + + // The basic MVE VLDR on a v4i1/v8i1 actually loads the entire 16bit + // predicate, with the "v4i1" bits spread out over the 16 bits loaded. We + // need to make sure that 8/4 bits are actually loaded into the correct + // place, which means loading the value and then shuffling the values into + // the bottom bits of the predicate. + // Equally, VLDR for an v16i1 will actually load 32bits (so will be incorrect + // for BE). + + SDLoc dl(Op); + SDValue Load = DAG.getExtLoad( + ISD::EXTLOAD, dl, MVT::i32, LD->getChain(), LD->getBasePtr(), + EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()), + LD->getMemOperand()); + SDValue Pred = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Load); + if (MemVT != MVT::v16i1) + Pred = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MemVT, Pred, + DAG.getConstant(0, dl, MVT::i32)); + return DAG.getMergeValues({Pred, Load.getValue(1)}, dl); +} + +static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG) { + StoreSDNode *ST = cast<StoreSDNode>(Op.getNode()); + EVT MemVT = ST->getMemoryVT(); + assert((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || MemVT == MVT::v16i1) && + "Expected a predicate type!"); + assert(MemVT == ST->getValue().getValueType()); + assert(!ST->isTruncatingStore() && "Expected a non-extending store"); + assert(ST->isUnindexed() && "Expected a unindexed store"); + + // Only store the v4i1 or v8i1 worth of bits, via a buildvector with top bits + // unset and a scalar store. 
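For the predicate load above (and the store whose body follows), the in-memory form of a v4i1/v8i1/v16i1 is one bit per lane (an i4/i8/i16), while the in-register form replicates each lane across 16/NumElts bits. A standalone sketch of the two layouts and the conversion these lowerings effectively perform (names and types are illustrative, not DAG code):

#include <cassert>
#include <cstdint>

static uint16_t memToReg(uint16_t MemBits, unsigned NumElts) {
  unsigned LaneWidth = 16 / NumElts;
  uint16_t Reg = 0;
  for (unsigned i = 0; i < NumElts; ++i)
    if ((MemBits >> i) & 1u)
      Reg |= ((1u << LaneWidth) - 1u) << (i * LaneWidth); // replicate the lane bit
  return Reg;
}

static uint16_t regToMem(uint16_t RegBits, unsigned NumElts) {
  unsigned LaneWidth = 16 / NumElts;
  uint16_t Mem = 0;
  for (unsigned i = 0; i < NumElts; ++i)
    if ((RegBits >> (i * LaneWidth)) & 1u)                // low bit of each lane field
      Mem |= 1u << i;
  return Mem;
}

int main() {
  // v4i1 <1,0,1,1> is the 4-bit value 0b1101 in memory and 0xff0f in the register.
  assert(memToReg(0xd, 4) == 0xff0f);
  assert(regToMem(0xff0f, 4) == 0xd);
  // v16i1 needs no conversion: one bit per lane in both forms.
  assert(memToReg(0x1234, 16) == 0x1234 && regToMem(0x1234, 16) == 0x1234);
  return 0;
}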
+ SDLoc dl(Op); + SDValue Build = ST->getValue(); + if (MemVT != MVT::v16i1) { + SmallVector<SDValue, 16> Ops; + for (unsigned I = 0; I < MemVT.getVectorNumElements(); I++) + Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Build, + DAG.getConstant(I, dl, MVT::i32))); + for (unsigned I = MemVT.getVectorNumElements(); I < 16; I++) + Ops.push_back(DAG.getUNDEF(MVT::i32)); + Build = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i1, Ops); + } + SDValue GRP = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Build); + return DAG.getTruncStore( + ST->getChain(), dl, GRP, ST->getBasePtr(), + EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()), + ST->getMemOperand()); +} + +static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) { + MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode()); + MVT VT = Op.getSimpleValueType(); + SDValue Mask = N->getMask(); + SDValue PassThru = N->getPassThru(); + SDLoc dl(Op); + + auto IsZero = [](SDValue PassThru) { + return (ISD::isBuildVectorAllZeros(PassThru.getNode()) || + (PassThru->getOpcode() == ARMISD::VMOVIMM && + isNullConstant(PassThru->getOperand(0)))); + }; + + if (IsZero(PassThru)) + return Op; + + // MVE Masked loads use zero as the passthru value. Here we convert undef to + // zero too, and other values are lowered to a select. + SDValue ZeroVec = DAG.getNode(ARMISD::VMOVIMM, dl, VT, + DAG.getTargetConstant(0, dl, MVT::i32)); + SDValue NewLoad = DAG.getMaskedLoad( + VT, dl, N->getChain(), N->getBasePtr(), Mask, ZeroVec, N->getMemoryVT(), + N->getMemOperand(), N->getExtensionType(), N->isExpandingLoad()); + SDValue Combo = NewLoad; + if (!PassThru.isUndef() && + (PassThru.getOpcode() != ISD::BITCAST || + !IsZero(PassThru->getOperand(0)))) + Combo = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru); + return DAG.getMergeValues({Combo, NewLoad.getValue(1)}, dl); +} + static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) { if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getOrdering())) // Acquire/Release load/store is not legal for targets without a dmb or @@ -8273,12 +8983,12 @@ static void ReplaceREADCYCLECOUNTER(SDNode *N, // Under Power Management extensions, the cycle-count is: // mrc p15, #0, <Rt>, c9, c13, #0 SDValue Ops[] = { N->getOperand(0), // Chain - DAG.getConstant(Intrinsic::arm_mrc, DL, MVT::i32), - DAG.getConstant(15, DL, MVT::i32), - DAG.getConstant(0, DL, MVT::i32), - DAG.getConstant(9, DL, MVT::i32), - DAG.getConstant(13, DL, MVT::i32), - DAG.getConstant(0, DL, MVT::i32) + DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32), + DAG.getTargetConstant(15, DL, MVT::i32), + DAG.getTargetConstant(0, DL, MVT::i32), + DAG.getTargetConstant(9, DL, MVT::i32), + DAG.getTargetConstant(13, DL, MVT::i32), + DAG.getTargetConstant(0, DL, MVT::i32) }; SDValue Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, @@ -8412,6 +9122,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG); case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG); case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG); + case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG, Subtarget); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG, Subtarget); case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG, Subtarget); @@ -8426,24 +9137,25 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::CTTZ: case ISD::CTTZ_ZERO_UNDEF: return 
LowerCTTZ(Op.getNode(), DAG, Subtarget); case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget); - case ISD::SETCC: return LowerVSETCC(Op, DAG); + case ISD::SETCC: return LowerVSETCC(Op, DAG, Subtarget); case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG); case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget); case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget); case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG, Subtarget); + case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG, Subtarget); case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); - case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); - case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); + case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG, Subtarget); + case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG, Subtarget); case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); case ISD::MUL: return LowerMUL(Op, DAG); case ISD::SDIV: if (Subtarget->isTargetWindows() && !Op.getValueType().isVector()) return LowerDIV_Windows(Op, DAG, /* Signed */ true); - return LowerSDIV(Op, DAG); + return LowerSDIV(Op, DAG, Subtarget); case ISD::UDIV: if (Subtarget->isTargetWindows() && !Op.getValueType().isVector()) return LowerDIV_Windows(Op, DAG, /* Signed */ false); - return LowerUDIV(Op, DAG); + return LowerUDIV(Op, DAG, Subtarget); case ISD::ADDCARRY: case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG); case ISD::SADDO: @@ -8452,6 +9164,15 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::UADDO: case ISD::USUBO: return LowerUnsignedALUO(Op, DAG); + case ISD::SADDSAT: + case ISD::SSUBSAT: + return LowerSADDSUBSAT(Op, DAG, Subtarget); + case ISD::LOAD: + return LowerPredicateLoad(Op, DAG); + case ISD::STORE: + return LowerPredicateStore(Op, DAG); + case ISD::MLOAD: + return LowerMLOAD(Op, DAG); case ISD::ATOMIC_LOAD: case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG); case ISD::FSINCOS: return LowerFSINCOS(Op, DAG); @@ -8530,6 +9251,10 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(Res.getValue(0)); Results.push_back(Res.getValue(1)); return; + case ISD::SADDSAT: + case ISD::SSUBSAT: + Res = LowerSADDSUBSAT(SDValue(N, 0), DAG, Subtarget); + break; case ISD::READCYCLECOUNTER: ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget); return; @@ -8600,19 +9325,19 @@ void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI, // orr r5, r5, #1 // add r5, pc // str r5, [$jbuf, #+4] ; &jbuf[1] - unsigned NewVReg1 = MRI->createVirtualRegister(TRC); + Register NewVReg1 = MRI->createVirtualRegister(TRC); BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1) .addConstantPoolIndex(CPI) .addMemOperand(CPMMO) .add(predOps(ARMCC::AL)); // Set the low bit because of thumb mode. 
- unsigned NewVReg2 = MRI->createVirtualRegister(TRC); + Register NewVReg2 = MRI->createVirtualRegister(TRC); BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2) .addReg(NewVReg1, RegState::Kill) .addImm(0x01) .add(predOps(ARMCC::AL)) .add(condCodeOp()); - unsigned NewVReg3 = MRI->createVirtualRegister(TRC); + Register NewVReg3 = MRI->createVirtualRegister(TRC); BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3) .addReg(NewVReg2, RegState::Kill) .addImm(PCLabelId); @@ -8630,28 +9355,28 @@ void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI, // orrs r1, r2 // add r2, $jbuf, #+4 ; &jbuf[1] // str r1, [r2] - unsigned NewVReg1 = MRI->createVirtualRegister(TRC); + Register NewVReg1 = MRI->createVirtualRegister(TRC); BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1) .addConstantPoolIndex(CPI) .addMemOperand(CPMMO) .add(predOps(ARMCC::AL)); - unsigned NewVReg2 = MRI->createVirtualRegister(TRC); + Register NewVReg2 = MRI->createVirtualRegister(TRC); BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2) .addReg(NewVReg1, RegState::Kill) .addImm(PCLabelId); // Set the low bit because of thumb mode. - unsigned NewVReg3 = MRI->createVirtualRegister(TRC); + Register NewVReg3 = MRI->createVirtualRegister(TRC); BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3) .addReg(ARM::CPSR, RegState::Define) .addImm(1) .add(predOps(ARMCC::AL)); - unsigned NewVReg4 = MRI->createVirtualRegister(TRC); + Register NewVReg4 = MRI->createVirtualRegister(TRC); BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4) .addReg(ARM::CPSR, RegState::Define) .addReg(NewVReg2, RegState::Kill) .addReg(NewVReg3, RegState::Kill) .add(predOps(ARMCC::AL)); - unsigned NewVReg5 = MRI->createVirtualRegister(TRC); + Register NewVReg5 = MRI->createVirtualRegister(TRC); BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5) .addFrameIndex(FI) .addImm(36); // &jbuf[1] :: pc @@ -8666,13 +9391,13 @@ void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI, // ldr r1, LCPI1_1 // add r1, pc, r1 // str r1, [$jbuf, #+4] ; &jbuf[1] - unsigned NewVReg1 = MRI->createVirtualRegister(TRC); + Register NewVReg1 = MRI->createVirtualRegister(TRC); BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1) .addConstantPoolIndex(CPI) .addImm(0) .addMemOperand(CPMMO) .add(predOps(ARMCC::AL)); - unsigned NewVReg2 = MRI->createVirtualRegister(TRC); + Register NewVReg2 = MRI->createVirtualRegister(TRC); BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2) .addReg(NewVReg1, RegState::Kill) .addImm(PCLabelId) @@ -8794,7 +9519,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, bool IsPositionIndependent = isPositionIndependent(); unsigned NumLPads = LPadList.size(); if (Subtarget->isThumb2()) { - unsigned NewVReg1 = MRI->createVirtualRegister(TRC); + Register NewVReg1 = MRI->createVirtualRegister(TRC); BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1) .addFrameIndex(FI) .addImm(4) @@ -8807,7 +9532,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, .addImm(LPadList.size()) .add(predOps(ARMCC::AL)); } else { - unsigned VReg1 = MRI->createVirtualRegister(TRC); + Register VReg1 = MRI->createVirtualRegister(TRC); BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1) .addImm(NumLPads & 0xFFFF) .add(predOps(ARMCC::AL)); @@ -8832,12 +9557,12 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, .addImm(ARMCC::HI) .addReg(ARM::CPSR); - unsigned NewVReg3 = MRI->createVirtualRegister(TRC); + Register NewVReg3 = MRI->createVirtualRegister(TRC); BuildMI(DispContBB, dl, 
TII->get(ARM::t2LEApcrelJT), NewVReg3) .addJumpTableIndex(MJTI) .add(predOps(ARMCC::AL)); - unsigned NewVReg4 = MRI->createVirtualRegister(TRC); + Register NewVReg4 = MRI->createVirtualRegister(TRC); BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4) .addReg(NewVReg3, RegState::Kill) .addReg(NewVReg1) @@ -8850,7 +9575,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, .addReg(NewVReg1) .addJumpTableIndex(MJTI); } else if (Subtarget->isThumb()) { - unsigned NewVReg1 = MRI->createVirtualRegister(TRC); + Register NewVReg1 = MRI->createVirtualRegister(TRC); BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1) .addFrameIndex(FI) .addImm(1) @@ -8873,7 +9598,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, Align = MF->getDataLayout().getTypeAllocSize(C->getType()); unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); - unsigned VReg1 = MRI->createVirtualRegister(TRC); + Register VReg1 = MRI->createVirtualRegister(TRC); BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci)) .addReg(VReg1, RegState::Define) .addConstantPoolIndex(Idx) @@ -8889,19 +9614,19 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, .addImm(ARMCC::HI) .addReg(ARM::CPSR); - unsigned NewVReg2 = MRI->createVirtualRegister(TRC); + Register NewVReg2 = MRI->createVirtualRegister(TRC); BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2) .addReg(ARM::CPSR, RegState::Define) .addReg(NewVReg1) .addImm(2) .add(predOps(ARMCC::AL)); - unsigned NewVReg3 = MRI->createVirtualRegister(TRC); + Register NewVReg3 = MRI->createVirtualRegister(TRC); BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3) .addJumpTableIndex(MJTI) .add(predOps(ARMCC::AL)); - unsigned NewVReg4 = MRI->createVirtualRegister(TRC); + Register NewVReg4 = MRI->createVirtualRegister(TRC); BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4) .addReg(ARM::CPSR, RegState::Define) .addReg(NewVReg2, RegState::Kill) @@ -8911,7 +9636,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, MachineMemOperand *JTMMOLd = MF->getMachineMemOperand( MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4); - unsigned NewVReg5 = MRI->createVirtualRegister(TRC); + Register NewVReg5 = MRI->createVirtualRegister(TRC); BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5) .addReg(NewVReg4, RegState::Kill) .addImm(0) @@ -8932,7 +9657,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, .addReg(NewVReg6, RegState::Kill) .addJumpTableIndex(MJTI); } else { - unsigned NewVReg1 = MRI->createVirtualRegister(TRC); + Register NewVReg1 = MRI->createVirtualRegister(TRC); BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1) .addFrameIndex(FI) .addImm(4) @@ -8945,7 +9670,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, .addImm(NumLPads) .add(predOps(ARMCC::AL)); } else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) { - unsigned VReg1 = MRI->createVirtualRegister(TRC); + Register VReg1 = MRI->createVirtualRegister(TRC); BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1) .addImm(NumLPads & 0xFFFF) .add(predOps(ARMCC::AL)); @@ -8974,7 +9699,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, Align = MF->getDataLayout().getTypeAllocSize(C->getType()); unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); - unsigned VReg1 = MRI->createVirtualRegister(TRC); + Register VReg1 = MRI->createVirtualRegister(TRC); BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp)) .addReg(VReg1, RegState::Define) .addConstantPoolIndex(Idx) @@ -8991,20 
+9716,20 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, .addImm(ARMCC::HI) .addReg(ARM::CPSR); - unsigned NewVReg3 = MRI->createVirtualRegister(TRC); + Register NewVReg3 = MRI->createVirtualRegister(TRC); BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3) .addReg(NewVReg1) .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2)) .add(predOps(ARMCC::AL)) .add(condCodeOp()); - unsigned NewVReg4 = MRI->createVirtualRegister(TRC); + Register NewVReg4 = MRI->createVirtualRegister(TRC); BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4) .addJumpTableIndex(MJTI) .add(predOps(ARMCC::AL)); MachineMemOperand *JTMMOLd = MF->getMachineMemOperand( MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4); - unsigned NewVReg5 = MRI->createVirtualRegister(TRC); + Register NewVReg5 = MRI->createVirtualRegister(TRC); BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5) .addReg(NewVReg3, RegState::Kill) .addReg(NewVReg4) @@ -9239,8 +9964,8 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI, const BasicBlock *LLVM_BB = BB->getBasicBlock(); MachineFunction::iterator It = ++BB->getIterator(); - unsigned dest = MI.getOperand(0).getReg(); - unsigned src = MI.getOperand(1).getReg(); + Register dest = MI.getOperand(0).getReg(); + Register src = MI.getOperand(1).getReg(); unsigned SizeVal = MI.getOperand(2).getImm(); unsigned Align = MI.getOperand(3).getImm(); DebugLoc dl = MI.getDebugLoc(); @@ -9291,9 +10016,9 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI, unsigned srcIn = src; unsigned destIn = dest; for (unsigned i = 0; i < LoopSize; i+=UnitSize) { - unsigned srcOut = MRI.createVirtualRegister(TRC); - unsigned destOut = MRI.createVirtualRegister(TRC); - unsigned scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC); + Register srcOut = MRI.createVirtualRegister(TRC); + Register destOut = MRI.createVirtualRegister(TRC); + Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC); emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut, IsThumb1, IsThumb2); emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut, @@ -9306,9 +10031,9 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI, // [scratch, srcOut] = LDRB_POST(srcIn, 1) // [destOut] = STRB_POST(scratch, destIn, 1) for (unsigned i = 0; i < BytesLeft; i++) { - unsigned srcOut = MRI.createVirtualRegister(TRC); - unsigned destOut = MRI.createVirtualRegister(TRC); - unsigned scratch = MRI.createVirtualRegister(TRC); + Register srcOut = MRI.createVirtualRegister(TRC); + Register destOut = MRI.createVirtualRegister(TRC); + Register scratch = MRI.createVirtualRegister(TRC); emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut, IsThumb1, IsThumb2); emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut, @@ -9351,7 +10076,7 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI, exitMBB->transferSuccessorsAndUpdatePHIs(BB); // Load an immediate to varEnd. 
- unsigned varEnd = MRI.createVirtualRegister(TRC); + Register varEnd = MRI.createVirtualRegister(TRC); if (Subtarget->useMovt()) { unsigned Vtmp = varEnd; if ((LoopSize & 0xFFFF0000) != 0) @@ -9401,12 +10126,12 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI, // destPhi = PHI(destLoop, dst) MachineBasicBlock *entryBB = BB; BB = loopMBB; - unsigned varLoop = MRI.createVirtualRegister(TRC); - unsigned varPhi = MRI.createVirtualRegister(TRC); - unsigned srcLoop = MRI.createVirtualRegister(TRC); - unsigned srcPhi = MRI.createVirtualRegister(TRC); - unsigned destLoop = MRI.createVirtualRegister(TRC); - unsigned destPhi = MRI.createVirtualRegister(TRC); + Register varLoop = MRI.createVirtualRegister(TRC); + Register varPhi = MRI.createVirtualRegister(TRC); + Register srcLoop = MRI.createVirtualRegister(TRC); + Register srcPhi = MRI.createVirtualRegister(TRC); + Register destLoop = MRI.createVirtualRegister(TRC); + Register destPhi = MRI.createVirtualRegister(TRC); BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi) .addReg(varLoop).addMBB(loopMBB) @@ -9420,7 +10145,7 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI, // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize) // [destLoop] = STR_POST(scratch, destPhi, UnitSiz) - unsigned scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC); + Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC); emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop, IsThumb1, IsThumb2); emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop, @@ -9461,9 +10186,9 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI, unsigned srcIn = srcLoop; unsigned destIn = destLoop; for (unsigned i = 0; i < BytesLeft; i++) { - unsigned srcOut = MRI.createVirtualRegister(TRC); - unsigned destOut = MRI.createVirtualRegister(TRC); - unsigned scratch = MRI.createVirtualRegister(TRC); + Register srcOut = MRI.createVirtualRegister(TRC); + Register destOut = MRI.createVirtualRegister(TRC); + Register scratch = MRI.createVirtualRegister(TRC); emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut, IsThumb1, IsThumb2); emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut, @@ -9523,7 +10248,7 @@ ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI, break; case CodeModel::Large: { MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); - unsigned Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass); + Register Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass); BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg) .addExternalSymbol("__chkstk"); @@ -9771,8 +10496,8 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, // equality. bool RHSisZero = MI.getOpcode() == ARM::BCCZi64; - unsigned LHS1 = MI.getOperand(1).getReg(); - unsigned LHS2 = MI.getOperand(2).getReg(); + Register LHS1 = MI.getOperand(1).getReg(); + Register LHS2 = MI.getOperand(2).getReg(); if (RHSisZero) { BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) .addReg(LHS1) @@ -9782,8 +10507,8 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, .addReg(LHS2).addImm(0) .addImm(ARMCC::EQ).addReg(ARM::CPSR); } else { - unsigned RHS1 = MI.getOperand(3).getReg(); - unsigned RHS2 = MI.getOperand(4).getReg(); + Register RHS1 = MI.getOperand(3).getReg(); + Register RHS2 = MI.getOperand(4).getReg(); BuildMI(BB, dl, TII->get(isThumb2 ? 
ARM::t2CMPrr : ARM::CMPrr)) .addReg(LHS1) .addReg(RHS1) @@ -9844,15 +10569,15 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, Fn->insert(BBI, RSBBB); Fn->insert(BBI, SinkBB); - unsigned int ABSSrcReg = MI.getOperand(1).getReg(); - unsigned int ABSDstReg = MI.getOperand(0).getReg(); + Register ABSSrcReg = MI.getOperand(1).getReg(); + Register ABSDstReg = MI.getOperand(0).getReg(); bool ABSSrcKIll = MI.getOperand(1).isKill(); bool isThumb2 = Subtarget->isThumb2(); MachineRegisterInfo &MRI = Fn->getRegInfo(); // In Thumb mode S must not be specified if source register is the SP or // PC and if destination register is the SP, so restrict register class - unsigned NewRsbDstReg = - MRI.createVirtualRegister(isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass); + Register NewRsbDstReg = MRI.createVirtualRegister( + isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass); // Transfer the remainder of BB and its successor edges to sinkMBB. SinkBB->splice(SinkBB->begin(), BB, @@ -9931,7 +10656,7 @@ static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget, // The MEMCPY both defines and kills the scratch registers. for (unsigned I = 0; I != MI.getOperand(4).getImm(); ++I) { - unsigned TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass + Register TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass : &ARM::GPRRegClass); MIB.addReg(TmpReg, RegState::Define|RegState::Dead); } @@ -10369,10 +11094,7 @@ static SDValue findMUL_LOHI(SDValue V) { static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { - if (Subtarget->isThumb()) { - if (!Subtarget->hasDSP()) - return SDValue(); - } else if (!Subtarget->hasV5TEOps()) + if (!Subtarget->hasBaseDSP()) return SDValue(); // SMLALBB, SMLALBT, SMLALTB, SMLALTT multiply two 16-bit values and @@ -11253,7 +11975,7 @@ static SDValue PerformANDCombine(SDNode *N, BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { if (SplatBitSize <= 64) { EVT VbicVT; - SDValue Val = isNEONModifiedImm((~SplatBits).getZExtValue(), + SDValue Val = isVMOVModifiedImm((~SplatBits).getZExtValue(), SplatUndef.getZExtValue(), SplatBitSize, DAG, dl, VbicVT, VT.is128BitVector(), OtherModImm); @@ -11469,6 +12191,77 @@ static SDValue PerformORCombineToBFI(SDNode *N, return SDValue(); } +static bool isValidMVECond(unsigned CC, bool IsFloat) { + switch (CC) { + case ARMCC::EQ: + case ARMCC::NE: + case ARMCC::LE: + case ARMCC::GT: + case ARMCC::GE: + case ARMCC::LT: + return true; + case ARMCC::HS: + case ARMCC::HI: + return !IsFloat; + default: + return false; + }; +} + +static SDValue PerformORCombine_i1(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { + // Try to invert "or A, B" -> "and ~A, ~B", as the "and" is easier to chain + // together with predicates + EVT VT = N->getValueType(0); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + ARMCC::CondCodes CondCode0 = ARMCC::AL; + ARMCC::CondCodes CondCode1 = ARMCC::AL; + if (N0->getOpcode() == ARMISD::VCMP) + CondCode0 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N0->getOperand(2)) + ->getZExtValue(); + else if (N0->getOpcode() == ARMISD::VCMPZ) + CondCode0 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N0->getOperand(1)) + ->getZExtValue(); + if (N1->getOpcode() == ARMISD::VCMP) + CondCode1 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N1->getOperand(2)) + ->getZExtValue(); + else if (N1->getOpcode() == ARMISD::VCMPZ) + CondCode1 = 
(ARMCC::CondCodes)cast<const ConstantSDNode>(N1->getOperand(1)) + ->getZExtValue(); + + if (CondCode0 == ARMCC::AL || CondCode1 == ARMCC::AL) + return SDValue(); + + unsigned Opposite0 = ARMCC::getOppositeCondition(CondCode0); + unsigned Opposite1 = ARMCC::getOppositeCondition(CondCode1); + + if (!isValidMVECond(Opposite0, + N0->getOperand(0)->getValueType(0).isFloatingPoint()) || + !isValidMVECond(Opposite1, + N1->getOperand(0)->getValueType(0).isFloatingPoint())) + return SDValue(); + + SmallVector<SDValue, 4> Ops0; + Ops0.push_back(N0->getOperand(0)); + if (N0->getOpcode() == ARMISD::VCMP) + Ops0.push_back(N0->getOperand(1)); + Ops0.push_back(DCI.DAG.getConstant(Opposite0, SDLoc(N0), MVT::i32)); + SmallVector<SDValue, 4> Ops1; + Ops1.push_back(N1->getOperand(0)); + if (N1->getOpcode() == ARMISD::VCMP) + Ops1.push_back(N1->getOperand(1)); + Ops1.push_back(DCI.DAG.getConstant(Opposite1, SDLoc(N1), MVT::i32)); + + SDValue NewN0 = DCI.DAG.getNode(N0->getOpcode(), SDLoc(N0), VT, Ops0); + SDValue NewN1 = DCI.DAG.getNode(N1->getOpcode(), SDLoc(N1), VT, Ops1); + SDValue And = DCI.DAG.getNode(ISD::AND, SDLoc(N), VT, NewN0, NewN1); + return DCI.DAG.getNode(ISD::XOR, SDLoc(N), VT, And, + DCI.DAG.getAllOnesConstant(SDLoc(N), VT)); +} + /// PerformORCombine - Target-specific dag combine xforms for ISD::OR static SDValue PerformORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, @@ -11489,7 +12282,7 @@ static SDValue PerformORCombine(SDNode *N, BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { if (SplatBitSize <= 64) { EVT VorrVT; - SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(), + SDValue Val = isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(), SplatBitSize, DAG, dl, VorrVT, VT.is128BitVector(), OtherModImm); @@ -11553,6 +12346,10 @@ static SDValue PerformORCombine(SDNode *N, } } + if (Subtarget->hasMVEIntegerOps() && + (VT == MVT::v4i1 || VT == MVT::v8i1 || VT == MVT::v16i1)) + return PerformORCombine_i1(N, DCI, Subtarget); + // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when // reasonable. if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) { @@ -11921,6 +12718,24 @@ PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { return Vec; } +static SDValue +PerformPREDICATE_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { + EVT VT = N->getValueType(0); + SDValue Op = N->getOperand(0); + SDLoc dl(N); + + // PREDICATE_CAST(PREDICATE_CAST(x)) == PREDICATE_CAST(x) + if (Op->getOpcode() == ARMISD::PREDICATE_CAST) { + // If the valuetypes are the same, we can remove the cast entirely. + if (Op->getOperand(0).getValueType() == VT) + return Op->getOperand(0); + return DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, + Op->getOperand(0).getValueType(), Op->getOperand(0)); + } + + return SDValue(); +} + /// PerformInsertEltCombine - Target-specific dag combine xforms for /// ISD::INSERT_VECTOR_ELT. static SDValue PerformInsertEltCombine(SDNode *N, @@ -12332,7 +13147,7 @@ static SDValue PerformVDUPLANECombine(SDNode *N, // The canonical VMOV for a zero vector uses a 32-bit element size. 
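PerformORCombine_i1 above is De Morgan's law on predicates: or(cmp0, cmp1) is rewritten as the NOT of the AND of the two compares re-emitted with their opposite conditions (the trailing XOR with all-ones). A standalone sketch with integer-style predicates, where the opposite condition is exactly the lane-wise negation (helper name mine):

#include <cassert>
#include <cstdint>

static uint16_t predOr(uint16_t CmpOpp0, uint16_t CmpOpp1) {
  // CmpOpp0/CmpOpp1 are the predicates produced with the *opposite* conditions.
  return static_cast<uint16_t>((CmpOpp0 & CmpOpp1) ^ 0xffffu);
}

int main() {
  // Example v16i1 predicates A and B and their lane-wise negations.
  uint16_t A = 0x00ff, NotA = 0xff00;
  uint16_t B = 0xf000, NotB = 0x0fff;
  assert(predOr(NotA, NotB) == (A | B));   // the rewritten form equals A | B
  return 0;
}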
unsigned Imm = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); unsigned EltBits; - if (ARM_AM::decodeNEONModImm(Imm, EltBits) == 0) + if (ARM_AM::decodeVMOVModImm(Imm, EltBits) == 0) EltSize = 8; EVT VT = N->getValueType(0); if (EltSize > VT.getScalarSizeInBits()) @@ -12382,95 +13197,163 @@ static SDValue PerformLOADCombine(SDNode *N, return SDValue(); } -/// PerformSTORECombine - Target-specific dag combine xforms for -/// ISD::STORE. -static SDValue PerformSTORECombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI) { - StoreSDNode *St = cast<StoreSDNode>(N); - if (St->isVolatile()) - return SDValue(); - - // Optimize trunc store (of multiple scalars) to shuffle and store. First, - // pack all of the elements in one place. Next, store to memory in fewer - // chunks. +// Optimize trunc store (of multiple scalars) to shuffle and store. First, +// pack all of the elements in one place. Next, store to memory in fewer +// chunks. +static SDValue PerformTruncatingStoreCombine(StoreSDNode *St, + SelectionDAG &DAG) { SDValue StVal = St->getValue(); EVT VT = StVal.getValueType(); - if (St->isTruncatingStore() && VT.isVector()) { - SelectionDAG &DAG = DCI.DAG; - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - EVT StVT = St->getMemoryVT(); - unsigned NumElems = VT.getVectorNumElements(); - assert(StVT != VT && "Cannot truncate to the same type"); - unsigned FromEltSz = VT.getScalarSizeInBits(); - unsigned ToEltSz = StVT.getScalarSizeInBits(); + if (!St->isTruncatingStore() || !VT.isVector()) + return SDValue(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT StVT = St->getMemoryVT(); + unsigned NumElems = VT.getVectorNumElements(); + assert(StVT != VT && "Cannot truncate to the same type"); + unsigned FromEltSz = VT.getScalarSizeInBits(); + unsigned ToEltSz = StVT.getScalarSizeInBits(); + + // From, To sizes and ElemCount must be pow of two + if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz)) + return SDValue(); - // From, To sizes and ElemCount must be pow of two - if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz)) return SDValue(); + // We are going to use the original vector elt for storing. + // Accumulated smaller vector elements must be a multiple of the store size. + if (0 != (NumElems * FromEltSz) % ToEltSz) + return SDValue(); - // We are going to use the original vector elt for storing. - // Accumulated smaller vector elements must be a multiple of the store size. - if (0 != (NumElems * FromEltSz) % ToEltSz) return SDValue(); + unsigned SizeRatio = FromEltSz / ToEltSz; + assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits()); - unsigned SizeRatio = FromEltSz / ToEltSz; - assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits()); + // Create a type on which we perform the shuffle. + EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(), + NumElems * SizeRatio); + assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); - // Create a type on which we perform the shuffle. - EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(), - NumElems*SizeRatio); - assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); + SDLoc DL(St); + SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal); + SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); + for (unsigned i = 0; i < NumElems; ++i) + ShuffleVec[i] = DAG.getDataLayout().isBigEndian() ? 
(i + 1) * SizeRatio - 1 + : i * SizeRatio; - SDLoc DL(St); - SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal); - SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); - for (unsigned i = 0; i < NumElems; ++i) - ShuffleVec[i] = DAG.getDataLayout().isBigEndian() - ? (i + 1) * SizeRatio - 1 - : i * SizeRatio; - - // Can't shuffle using an illegal type. - if (!TLI.isTypeLegal(WideVecVT)) return SDValue(); - - SDValue Shuff = DAG.getVectorShuffle(WideVecVT, DL, WideVec, - DAG.getUNDEF(WideVec.getValueType()), - ShuffleVec); - // At this point all of the data is stored at the bottom of the - // register. We now need to save it to mem. - - // Find the largest store unit - MVT StoreType = MVT::i8; - for (MVT Tp : MVT::integer_valuetypes()) { - if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz) - StoreType = Tp; - } - // Didn't find a legal store type. - if (!TLI.isTypeLegal(StoreType)) - return SDValue(); + // Can't shuffle using an illegal type. + if (!TLI.isTypeLegal(WideVecVT)) + return SDValue(); - // Bitcast the original vector into a vector of store-size units - EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(), - StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits()); - assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); - SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff); - SmallVector<SDValue, 8> Chains; - SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL, - TLI.getPointerTy(DAG.getDataLayout())); - SDValue BasePtr = St->getBasePtr(); + SDValue Shuff = DAG.getVectorShuffle( + WideVecVT, DL, WideVec, DAG.getUNDEF(WideVec.getValueType()), ShuffleVec); + // At this point all of the data is stored at the bottom of the + // register. We now need to save it to mem. - // Perform one or more big stores into memory. - unsigned E = (ToEltSz*NumElems)/StoreType.getSizeInBits(); - for (unsigned I = 0; I < E; I++) { - SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, - StoreType, ShuffWide, - DAG.getIntPtrConstant(I, DL)); - SDValue Ch = DAG.getStore(St->getChain(), DL, SubVec, BasePtr, - St->getPointerInfo(), St->getAlignment(), - St->getMemOperand()->getFlags()); - BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, - Increment); - Chains.push_back(Ch); - } - return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); + // Find the largest store unit + MVT StoreType = MVT::i8; + for (MVT Tp : MVT::integer_valuetypes()) { + if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz) + StoreType = Tp; } + // Didn't find a legal store type. + if (!TLI.isTypeLegal(StoreType)) + return SDValue(); + + // Bitcast the original vector into a vector of store-size units + EVT StoreVecVT = + EVT::getVectorVT(*DAG.getContext(), StoreType, + VT.getSizeInBits() / EVT(StoreType).getSizeInBits()); + assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); + SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff); + SmallVector<SDValue, 8> Chains; + SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL, + TLI.getPointerTy(DAG.getDataLayout())); + SDValue BasePtr = St->getBasePtr(); + + // Perform one or more big stores into memory. 
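The shuffle mask built just above views the source vector through the narrower store element type: kept element i maps to sub-element i * SizeRatio (little-endian) or to the last sub-element of its group (big-endian), so the truncated data ends up packed at the bottom of the register before the store loop below. A standalone sketch of that index computation (helper name mine):

#include <cassert>
#include <vector>

static std::vector<int> truncStoreMask(unsigned NumElems, unsigned SizeRatio,
                                       bool BigEndian) {
  std::vector<int> Mask(NumElems * SizeRatio, -1);       // unused lanes stay undef
  for (unsigned i = 0; i < NumElems; ++i)
    Mask[i] = BigEndian ? (i + 1) * SizeRatio - 1 : i * SizeRatio;
  return Mask;
}

int main() {
  // v4i32 truncated and stored as four i8s: SizeRatio = 4, wide type is v16i8.
  std::vector<int> LE = truncStoreMask(4, 4, /*BigEndian=*/false);
  assert(LE[0] == 0 && LE[1] == 4 && LE[2] == 8 && LE[3] == 12 && LE[4] == -1);
  std::vector<int> BE = truncStoreMask(4, 4, /*BigEndian=*/true);
  assert(BE[0] == 3 && BE[1] == 7 && BE[2] == 11 && BE[3] == 15);
  return 0;
}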
+ unsigned E = (ToEltSz * NumElems) / StoreType.getSizeInBits(); + for (unsigned I = 0; I < E; I++) { + SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreType, + ShuffWide, DAG.getIntPtrConstant(I, DL)); + SDValue Ch = + DAG.getStore(St->getChain(), DL, SubVec, BasePtr, St->getPointerInfo(), + St->getAlignment(), St->getMemOperand()->getFlags()); + BasePtr = + DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, Increment); + Chains.push_back(Ch); + } + return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); +} + +// Try taking a single vector store from an truncate (which would otherwise turn +// into an expensive buildvector) and splitting it into a series of narrowing +// stores. +static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St, + SelectionDAG &DAG) { + if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed()) + return SDValue(); + SDValue Trunc = St->getValue(); + if (Trunc->getOpcode() != ISD::TRUNCATE) + return SDValue(); + EVT FromVT = Trunc->getOperand(0).getValueType(); + EVT ToVT = Trunc.getValueType(); + if (!ToVT.isVector()) + return SDValue(); + assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements()); + EVT ToEltVT = ToVT.getVectorElementType(); + EVT FromEltVT = FromVT.getVectorElementType(); + + unsigned NumElements = 0; + if (FromEltVT == MVT::i32 && (ToEltVT == MVT::i16 || ToEltVT == MVT::i8)) + NumElements = 4; + if (FromEltVT == MVT::i16 && ToEltVT == MVT::i8) + NumElements = 8; + if (NumElements == 0 || FromVT.getVectorNumElements() == NumElements || + FromVT.getVectorNumElements() % NumElements != 0) + return SDValue(); + + SDLoc DL(St); + // Details about the old store + SDValue Ch = St->getChain(); + SDValue BasePtr = St->getBasePtr(); + unsigned Alignment = St->getOriginalAlignment(); + MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags(); + AAMDNodes AAInfo = St->getAAInfo(); + + EVT NewFromVT = EVT::getVectorVT(*DAG.getContext(), FromEltVT, NumElements); + EVT NewToVT = EVT::getVectorVT(*DAG.getContext(), ToEltVT, NumElements); + + SmallVector<SDValue, 4> Stores; + for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) { + unsigned NewOffset = i * NumElements * ToEltVT.getSizeInBits() / 8; + SDValue NewPtr = DAG.getObjectPtrOffset(DL, BasePtr, NewOffset); + + SDValue Extract = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewFromVT, Trunc.getOperand(0), + DAG.getConstant(i * NumElements, DL, MVT::i32)); + SDValue Store = DAG.getTruncStore( + Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset), + NewToVT, Alignment, MMOFlags, AAInfo); + Stores.push_back(Store); + } + return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores); +} + +/// PerformSTORECombine - Target-specific dag combine xforms for +/// ISD::STORE. +static SDValue PerformSTORECombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { + StoreSDNode *St = cast<StoreSDNode>(N); + if (St->isVolatile()) + return SDValue(); + SDValue StVal = St->getValue(); + EVT VT = StVal.getValueType(); + + if (Subtarget->hasNEON()) + if (SDValue Store = PerformTruncatingStoreCombine(St, DCI.DAG)) + return Store; + + if (Subtarget->hasMVEIntegerOps()) + if (SDValue NewToken = PerformSplittingToNarrowingStores(St, DCI.DAG)) + return NewToken; if (!ISD::isNormalStore(St)) return SDValue(); @@ -12522,7 +13405,7 @@ static SDValue PerformSTORECombine(SDNode *N, } // If this is a legal vector store, try to combine it into a VST1_UPD. 
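PerformSplittingToNarrowingStores above splits one store of a truncate into several truncating stores of legal width: piece i covers source lanes starting at i * NumElements and lands at byte offset i * NumElements * ToEltBits / 8. A standalone sketch of that bookkeeping (types and names are illustrative):

#include <cassert>

struct Piece { unsigned FirstLane; unsigned ByteOffset; };

static Piece narrowingStorePiece(unsigned i, unsigned NumElements,
                                 unsigned ToEltBits) {
  return {i * NumElements, i * NumElements * ToEltBits / 8};
}

int main() {
  // A v8i32 truncated to v8i16, split into two v4i32 -> v4i16 truncating stores.
  Piece P0 = narrowingStorePiece(0, 4, 16);
  Piece P1 = narrowingStorePiece(1, 4, 16);
  assert(P0.FirstLane == 0 && P0.ByteOffset == 0);
  assert(P1.FirstLane == 4 && P1.ByteOffset == 8);   // 4 lanes * 2 bytes
  return 0;
}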
- if (ISD::isNormalStore(N) && VT.isVector() && + if (Subtarget->hasNEON() && ISD::isNormalStore(N) && VT.isVector() && DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT)) return CombineBaseUpdate(N, DCI); @@ -12890,6 +13773,71 @@ static SDValue PerformShiftCombine(SDNode *N, return SDValue(); } +// Look for a sign/zero extend of a larger than legal load. This can be split +// into two extending loads, which are simpler to deal with than an arbitrary +// sign extend. +static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG) { + SDValue N0 = N->getOperand(0); + if (N0.getOpcode() != ISD::LOAD) + return SDValue(); + LoadSDNode *LD = cast<LoadSDNode>(N0.getNode()); + if (!LD->isSimple() || !N0.hasOneUse() || LD->isIndexed() || + LD->getExtensionType() != ISD::NON_EXTLOAD) + return SDValue(); + EVT FromVT = LD->getValueType(0); + EVT ToVT = N->getValueType(0); + if (!ToVT.isVector()) + return SDValue(); + assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements()); + EVT ToEltVT = ToVT.getVectorElementType(); + EVT FromEltVT = FromVT.getVectorElementType(); + + unsigned NumElements = 0; + if (ToEltVT == MVT::i32 && (FromEltVT == MVT::i16 || FromEltVT == MVT::i8)) + NumElements = 4; + if (ToEltVT == MVT::i16 && FromEltVT == MVT::i8) + NumElements = 8; + if (NumElements == 0 || + FromVT.getVectorNumElements() == NumElements || + FromVT.getVectorNumElements() % NumElements != 0 || + !isPowerOf2_32(NumElements)) + return SDValue(); + + SDLoc DL(LD); + // Details about the old load + SDValue Ch = LD->getChain(); + SDValue BasePtr = LD->getBasePtr(); + unsigned Alignment = LD->getOriginalAlignment(); + MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags(); + AAMDNodes AAInfo = LD->getAAInfo(); + + ISD::LoadExtType NewExtType = + N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD : ISD::ZEXTLOAD; + SDValue Offset = DAG.getUNDEF(BasePtr.getValueType()); + EVT NewFromVT = FromVT.getHalfNumVectorElementsVT(*DAG.getContext()); + EVT NewToVT = ToVT.getHalfNumVectorElementsVT(*DAG.getContext()); + unsigned NewOffset = NewFromVT.getSizeInBits() / 8; + SDValue NewPtr = DAG.getObjectPtrOffset(DL, BasePtr, NewOffset); + + // Split the load in half, each side of which is extended separately. This + // is good enough, as legalisation will take it from there. They are either + // already legal or they will be split further into something that is + // legal. + SDValue NewLoad1 = + DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, BasePtr, Offset, + LD->getPointerInfo(), NewFromVT, Alignment, MMOFlags, AAInfo); + SDValue NewLoad2 = + DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset, + LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT, + Alignment, MMOFlags, AAInfo); + + SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, + SDValue(NewLoad1.getNode(), 1), + SDValue(NewLoad2.getNode(), 1)); + DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, NewLoad1, NewLoad2); +} + /// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND, /// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND. 
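The load-splitting combine above has the same kind of source-level trigger, assuming vectorization to a wider-than-legal extend such as v8i16 -> v8i32 (again an assumption about typical codegen, not stated by the patch):

#include <cstdint>

void widen_copy(const int16_t *src, int32_t *dst, int n) {
  for (int i = 0; i < n; ++i)
    dst[i] = src[i]; // i16 load sign-extended to i32
}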
static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG, @@ -12927,6 +13875,10 @@ static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG, } } + if (ST->hasMVEIntegerOps()) + if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG)) + return NewLoad; + return SDValue(); } @@ -13028,43 +13980,169 @@ SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &D return V; } +// Given N, the value controlling the conditional branch, search for the loop +// intrinsic, returning it, along with how the value is used. We need to handle +// patterns such as the following: +// (brcond (xor (setcc (loop.decrement), 0, ne), 1), exit) +// (brcond (setcc (loop.decrement), 0, eq), exit) +// (brcond (setcc (loop.decrement), 0, ne), header) +static SDValue SearchLoopIntrinsic(SDValue N, ISD::CondCode &CC, int &Imm, + bool &Negate) { + switch (N->getOpcode()) { + default: + break; + case ISD::XOR: { + if (!isa<ConstantSDNode>(N.getOperand(1))) + return SDValue(); + if (!cast<ConstantSDNode>(N.getOperand(1))->isOne()) + return SDValue(); + Negate = !Negate; + return SearchLoopIntrinsic(N.getOperand(0), CC, Imm, Negate); + } + case ISD::SETCC: { + auto *Const = dyn_cast<ConstantSDNode>(N.getOperand(1)); + if (!Const) + return SDValue(); + if (Const->isNullValue()) + Imm = 0; + else if (Const->isOne()) + Imm = 1; + else + return SDValue(); + CC = cast<CondCodeSDNode>(N.getOperand(2))->get(); + return SearchLoopIntrinsic(N->getOperand(0), CC, Imm, Negate); + } + case ISD::INTRINSIC_W_CHAIN: { + unsigned IntOp = cast<ConstantSDNode>(N.getOperand(1))->getZExtValue(); + if (IntOp != Intrinsic::test_set_loop_iterations && + IntOp != Intrinsic::loop_decrement_reg) + return SDValue(); + return N; + } + } + return SDValue(); +} + static SDValue PerformHWLoopCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST) { - // Look for (brcond (xor test.set.loop.iterations, -1) - SDValue CC = N->getOperand(1); - unsigned Opc = CC->getOpcode(); - SDValue Int; - if ((Opc == ISD::XOR || Opc == ISD::SETCC) && - (CC->getOperand(0)->getOpcode() == ISD::INTRINSIC_W_CHAIN)) { + // The hwloop intrinsics that we're interested are used for control-flow, + // either for entering or exiting the loop: + // - test.set.loop.iterations will test whether its operand is zero. If it + // is zero, the proceeding branch should not enter the loop. + // - loop.decrement.reg also tests whether its operand is zero. If it is + // zero, the proceeding branch should not branch back to the beginning of + // the loop. + // So here, we need to check that how the brcond is using the result of each + // of the intrinsics to ensure that we're branching to the right place at the + // right time. 
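A toy restatement of the search done by SearchLoopIntrinsic above (plain C++ rather than SelectionDAG; the Node struct and its names are assumptions made for illustration): xor-with-1 wrappers flip the negation flag, setcc-against-0/1 wrappers record the immediate and condition, and the walk stops at the loop intrinsic.

#include <cstdio>

struct Node {
  enum Kind { Xor1, SetCC, LoopDecrement };
  Kind K;
  int Imm;        // constant compared against, for SetCC (0 or 1)
  bool IsEQ;      // SetCC condition: equal (true) or not-equal (false)
  const Node *Op; // operand 0; null for the intrinsic itself
};

// Walk through the wrappers, in the spirit of SearchLoopIntrinsic above.
static const Node *search(const Node *N, bool &IsEQ, int &Imm, bool &Negate) {
  switch (N->K) {
  case Node::Xor1: // (xor x, 1) inverts the condition
    Negate = !Negate;
    return search(N->Op, IsEQ, Imm, Negate);
  case Node::SetCC: // record how the comparison reads the counter
    Imm = N->Imm;
    IsEQ = N->IsEQ;
    return search(N->Op, IsEQ, Imm, Negate);
  case Node::LoopDecrement: // found the loop intrinsic
    return N;
  }
  return nullptr;
}

int main() {
  // Models (brcond (xor (setcc (loop.decrement), 0, ne), 1), exit).
  Node Dec{Node::LoopDecrement, 0, false, nullptr};
  Node Cmp{Node::SetCC, 0, /*IsEQ=*/false, &Dec};
  Node Xor{Node::Xor1, 0, false, &Cmp};

  bool IsEQ = false, Negate = false;
  int Imm = 1;
  const Node *Found = search(&Xor, IsEQ, Imm, Negate);
  std::printf("found=%d imm=%d eq=%d negate=%d\n", Found != nullptr, Imm, IsEQ, Negate);
  return 0;
}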
+ + ISD::CondCode CC; + SDValue Cond; + int Imm = 1; + bool Negate = false; + SDValue Chain = N->getOperand(0); + SDValue Dest; - assert((isa<ConstantSDNode>(CC->getOperand(1)) && - cast<ConstantSDNode>(CC->getOperand(1))->isOne()) && - "Expected to compare against 1"); + if (N->getOpcode() == ISD::BRCOND) { + CC = ISD::SETEQ; + Cond = N->getOperand(1); + Dest = N->getOperand(2); + } else { + assert(N->getOpcode() == ISD::BR_CC && "Expected BRCOND or BR_CC!"); + CC = cast<CondCodeSDNode>(N->getOperand(1))->get(); + Cond = N->getOperand(2); + Dest = N->getOperand(4); + if (auto *Const = dyn_cast<ConstantSDNode>(N->getOperand(3))) { + if (!Const->isOne() && !Const->isNullValue()) + return SDValue(); + Imm = Const->getZExtValue(); + } else + return SDValue(); + } - Int = CC->getOperand(0); - } else if (CC->getOpcode() == ISD::INTRINSIC_W_CHAIN) - Int = CC; - else + SDValue Int = SearchLoopIntrinsic(Cond, CC, Imm, Negate); + if (!Int) return SDValue(); - unsigned IntOp = cast<ConstantSDNode>(Int.getOperand(1))->getZExtValue(); - if (IntOp != Intrinsic::test_set_loop_iterations) - return SDValue(); + if (Negate) + CC = ISD::getSetCCInverse(CC, true); + + auto IsTrueIfZero = [](ISD::CondCode CC, int Imm) { + return (CC == ISD::SETEQ && Imm == 0) || + (CC == ISD::SETNE && Imm == 1) || + (CC == ISD::SETLT && Imm == 1) || + (CC == ISD::SETULT && Imm == 1); + }; + + auto IsFalseIfZero = [](ISD::CondCode CC, int Imm) { + return (CC == ISD::SETEQ && Imm == 1) || + (CC == ISD::SETNE && Imm == 0) || + (CC == ISD::SETGT && Imm == 0) || + (CC == ISD::SETUGT && Imm == 0) || + (CC == ISD::SETGE && Imm == 1) || + (CC == ISD::SETUGE && Imm == 1); + }; + + assert((IsTrueIfZero(CC, Imm) || IsFalseIfZero(CC, Imm)) && + "unsupported condition"); SDLoc dl(Int); - SDValue Chain = N->getOperand(0); + SelectionDAG &DAG = DCI.DAG; SDValue Elements = Int.getOperand(2); - SDValue ExitBlock = N->getOperand(2); + unsigned IntOp = cast<ConstantSDNode>(Int->getOperand(1))->getZExtValue(); + assert((N->hasOneUse() && N->use_begin()->getOpcode() == ISD::BR) + && "expected single br user"); + SDNode *Br = *N->use_begin(); + SDValue OtherTarget = Br->getOperand(1); + + // Update the unconditional branch to branch to the given Dest. + auto UpdateUncondBr = [](SDNode *Br, SDValue Dest, SelectionDAG &DAG) { + SDValue NewBrOps[] = { Br->getOperand(0), Dest }; + SDValue NewBr = DAG.getNode(ISD::BR, SDLoc(Br), MVT::Other, NewBrOps); + DAG.ReplaceAllUsesOfValueWith(SDValue(Br, 0), NewBr); + }; - // TODO: Once we start supporting tail predication, we can add another - // operand to WLS for the number of elements processed in a vector loop. + if (IntOp == Intrinsic::test_set_loop_iterations) { + SDValue Res; + // We expect this 'instruction' to branch when the counter is zero. + if (IsTrueIfZero(CC, Imm)) { + SDValue Ops[] = { Chain, Elements, Dest }; + Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops); + } else { + // The logic is the reverse of what we need for WLS, so find the other + // basic block target: the target of the proceeding br. 
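The two classifiers above are small enough to restate as a standalone, compilable check (the local CondCode enum is an assumption for illustration); the pairs exercised below correspond to the first two example patterns listed for SearchLoopIntrinsic.

#include <cstdio>

enum CondCode { SETEQ, SETNE, SETLT, SETULT, SETGT, SETUGT, SETGE, SETUGE };

static bool isTrueIfZero(CondCode CC, int Imm) {
  return (CC == SETEQ && Imm == 0) || (CC == SETNE && Imm == 1) ||
         (CC == SETLT && Imm == 1) || (CC == SETULT && Imm == 1);
}

static bool isFalseIfZero(CondCode CC, int Imm) {
  return (CC == SETEQ && Imm == 1) || (CC == SETNE && Imm == 0) ||
         (CC == SETGT && Imm == 0) || (CC == SETUGT && Imm == 0) ||
         (CC == SETGE && Imm == 1) || (CC == SETUGE && Imm == 1);
}

int main() {
  // (setcc (loop.decrement), 0, eq): matches the WLS expectation above,
  // i.e. the branch is taken when the counter is zero.
  std::printf("eq,0 -> trueIfZero=%d\n", isTrueIfZero(SETEQ, 0));
  // (setcc (loop.decrement), 0, ne): matches the LE expectation,
  // i.e. the branch goes back to the header while the counter is non-zero.
  std::printf("ne,0 -> falseIfZero=%d\n", isFalseIfZero(SETNE, 0));
  return 0;
}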
+ UpdateUncondBr(Br, Dest, DAG); - SDValue Ops[] = { Chain, Elements, ExitBlock }; - SDValue Res = DCI.DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops); - DCI.DAG.ReplaceAllUsesOfValueWith(Int.getValue(1), Int.getOperand(0)); - return Res; + SDValue Ops[] = { Chain, Elements, OtherTarget }; + Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops); + } + DAG.ReplaceAllUsesOfValueWith(Int.getValue(1), Int.getOperand(0)); + return Res; + } else { + SDValue Size = DAG.getTargetConstant( + cast<ConstantSDNode>(Int.getOperand(3))->getZExtValue(), dl, MVT::i32); + SDValue Args[] = { Int.getOperand(0), Elements, Size, }; + SDValue LoopDec = DAG.getNode(ARMISD::LOOP_DEC, dl, + DAG.getVTList(MVT::i32, MVT::Other), Args); + DAG.ReplaceAllUsesWith(Int.getNode(), LoopDec.getNode()); + + // We expect this instruction to branch when the count is not zero. + SDValue Target = IsFalseIfZero(CC, Imm) ? Dest : OtherTarget; + + // Update the unconditional branch to target the loop preheader if we've + // found the condition has been reversed. + if (Target == OtherTarget) + UpdateUncondBr(Br, Dest, DAG); + + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + SDValue(LoopDec.getNode(), 1), Chain); + + SDValue EndArgs[] = { Chain, SDValue(LoopDec.getNode(), 0), Target }; + return DAG.getNode(ARMISD::LE, dl, MVT::Other, EndArgs); + } + return SDValue(); } /// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND. @@ -13298,14 +14376,15 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, case ISD::OR: return PerformORCombine(N, DCI, Subtarget); case ISD::XOR: return PerformXORCombine(N, DCI, Subtarget); case ISD::AND: return PerformANDCombine(N, DCI, Subtarget); - case ISD::BRCOND: return PerformHWLoopCombine(N, DCI, Subtarget); + case ISD::BRCOND: + case ISD::BR_CC: return PerformHWLoopCombine(N, DCI, Subtarget); case ARMISD::ADDC: case ARMISD::SUBC: return PerformAddcSubcCombine(N, DCI, Subtarget); case ARMISD::SUBE: return PerformAddeSubeCombine(N, DCI, Subtarget); case ARMISD::BFI: return PerformBFICombine(N, DCI); case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget); case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG); - case ISD::STORE: return PerformSTORECombine(N, DCI); + case ISD::STORE: return PerformSTORECombine(N, DCI, Subtarget); case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget); case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI); case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG); @@ -13334,6 +14413,8 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, return PerformVLDCombine(N, DCI); case ARMISD::BUILD_VECTOR: return PerformARMBUILD_VECTORCombine(N, DCI); + case ARMISD::PREDICATE_CAST: + return PerformPREDICATE_CASTCombine(N, DCI); case ARMISD::SMULWB: { unsigned BitWidth = N->getValueType(0).getSizeInBits(); APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16); @@ -13348,7 +14429,9 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, return SDValue(); break; } - case ARMISD::SMLALBB: { + case ARMISD::SMLALBB: + case ARMISD::QADD16b: + case ARMISD::QSUB16b: { unsigned BitWidth = N->getValueType(0).getSizeInBits(); APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16); if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) || @@ -13384,6 +14467,15 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, return SDValue(); break; } + case ARMISD::QADD8b: + case ARMISD::QSUB8b: { + unsigned BitWidth = N->getValueType(0).getSizeInBits(); + APInt DemandedMask 
= APInt::getLowBitsSet(BitWidth, 8); + if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) || + (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))) + return SDValue(); + break; + } case ISD::INTRINSIC_VOID: case ISD::INTRINSIC_W_CHAIN: switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { @@ -13457,47 +14549,38 @@ bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned, if (!Subtarget->hasMVEIntegerOps()) return false; - if (Ty != MVT::v16i8 && Ty != MVT::v8i16 && Ty != MVT::v8f16 && - Ty != MVT::v4i32 && Ty != MVT::v4f32 && Ty != MVT::v2i64 && - Ty != MVT::v2f64 && - // These are for truncated stores - Ty != MVT::v4i8 && Ty != MVT::v8i8 && Ty != MVT::v4i16) - return false; - if (Subtarget->isLittle()) { - // In little-endian MVE, the store instructions VSTRB.U8, - // VSTRH.U16 and VSTRW.U32 all store the vector register in - // exactly the same format, and differ only in the range of - // their immediate offset field and the required alignment. - // - // In particular, VSTRB.U8 can store a vector at byte alignment. - // So at this stage we can simply say that loads/stores of all - // 128-bit wide vector types are permitted at any alignment, - // because we know at least _one_ instruction can manage that. - // - // Later on we might find that some of those loads are better - // generated as VLDRW.U32 if alignment permits, to take - // advantage of the larger immediate range. But for the moment, - // all that matters is that if we don't lower the load then - // _some_ instruction can handle it. + // These are for predicates + if ((Ty == MVT::v16i1 || Ty == MVT::v8i1 || Ty == MVT::v4i1)) { + if (Fast) + *Fast = true; + return true; + } + + // These are for truncated stores/narrowing loads. They are fine so long as + // the alignment is at least the size of the item being loaded + if ((Ty == MVT::v4i8 || Ty == MVT::v8i8 || Ty == MVT::v4i16) && + Alignment >= VT.getScalarSizeInBits() / 8) { + if (Fast) + *Fast = true; + return true; + } + + // In little-endian MVE, the store instructions VSTRB.U8, VSTRH.U16 and + // VSTRW.U32 all store the vector register in exactly the same format, and + // differ only in the range of their immediate offset field and the required + // alignment. So there is always a store that can be used, regardless of + // actual type. + // + // For big endian, that is not the case. But can still emit a (VSTRB.U8; + // VREV64.8) pair and get the same effect. This will likely be better than + // aligning the vector through the stack. + if (Ty == MVT::v16i8 || Ty == MVT::v8i16 || Ty == MVT::v8f16 || + Ty == MVT::v4i32 || Ty == MVT::v4f32 || Ty == MVT::v2i64 || + Ty == MVT::v2f64) { if (Fast) *Fast = true; return true; - } else { - // In big-endian MVE, those instructions aren't so similar - // after all, because they reorder the bytes of the vector - // differently. So this time we can only store a particular - // kind of vector if its alignment is at least the element - // type. And we can't store vectors of i64 or f64 at all - // without having to do some postprocessing, because there's - // no VSTRD.U64. - if (Ty == MVT::v16i8 || - ((Ty == MVT::v8i16 || Ty == MVT::v8f16) && Alignment >= 2) || - ((Ty == MVT::v4i32 || Ty == MVT::v4f32) && Alignment >= 4)) { - if (Fast) - *Fast = true; - return true; - } } return false; @@ -13617,22 +14700,60 @@ static bool areExtractExts(Value *Ext1, Value *Ext2) { /// sext/zext can be folded into vsubl. 
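The new MVE rule in allowsMisalignedMemoryAccesses condenses to a small decision table. A standalone restatement (string-keyed simplified types; the helper name is an assumption, not LLVM API):

#include <cassert>
#include <string>

// Whether a misaligned MVE access of the given memory type is accepted, per
// the rules above. Ty is the memory VT spelled as text, ElemBytes its element
// size in bytes.
static bool mveAllowsMisaligned(const std::string &Ty, unsigned ElemBytes,
                                unsigned Alignment) {
  // Predicate types are always fine.
  if (Ty == "v16i1" || Ty == "v8i1" || Ty == "v4i1")
    return true;
  // Truncating-store / narrowing-load types need element-sized alignment.
  if (Ty == "v4i8" || Ty == "v8i8" || Ty == "v4i16")
    return Alignment >= ElemBytes;
  // Full 128-bit vectors: some VSTRB/VSTRH/VSTRW form always fits.
  return Ty == "v16i8" || Ty == "v8i16" || Ty == "v8f16" || Ty == "v4i32" ||
         Ty == "v4f32" || Ty == "v2i64" || Ty == "v2f64";
}

int main() {
  assert(mveAllowsMisaligned("v4i32", 4, 1));  // a byte-aligned VSTRB form still works
  assert(mveAllowsMisaligned("v4i16", 2, 2));  // aligned enough for a narrowing store
  assert(!mveAllowsMisaligned("v4i16", 2, 1)); // under-aligned narrowing store
  return 0;
}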
bool ARMTargetLowering::shouldSinkOperands(Instruction *I, SmallVectorImpl<Use *> &Ops) const { - if (!Subtarget->hasNEON() || !I->getType()->isVectorTy()) + if (!I->getType()->isVectorTy()) return false; - switch (I->getOpcode()) { - case Instruction::Sub: - case Instruction::Add: { - if (!areExtractExts(I->getOperand(0), I->getOperand(1))) + if (Subtarget->hasNEON()) { + switch (I->getOpcode()) { + case Instruction::Sub: + case Instruction::Add: { + if (!areExtractExts(I->getOperand(0), I->getOperand(1))) + return false; + Ops.push_back(&I->getOperandUse(0)); + Ops.push_back(&I->getOperandUse(1)); + return true; + } + default: return false; - Ops.push_back(&I->getOperandUse(0)); - Ops.push_back(&I->getOperandUse(1)); - return true; + } } - default: + + if (!Subtarget->hasMVEIntegerOps()) + return false; + + auto IsSinker = [](Instruction *I, int Operand) { + switch (I->getOpcode()) { + case Instruction::Add: + case Instruction::Mul: + return true; + case Instruction::Sub: + return Operand == 1; + default: + return false; + } + }; + + int Op = 0; + if (!isa<ShuffleVectorInst>(I->getOperand(Op))) + Op = 1; + if (!IsSinker(I, Op)) + return false; + if (!match(I->getOperand(Op), + m_ShuffleVector(m_InsertElement(m_Undef(), m_Value(), m_ZeroInt()), + m_Undef(), m_Zero()))) { return false; } - return false; + Instruction *Shuffle = cast<Instruction>(I->getOperand(Op)); + // All uses of the shuffle should be sunk to avoid duplicating it across gpr + // and vector registers + for (Use &U : Shuffle->uses()) { + Instruction *Insn = cast<Instruction>(U.getUser()); + if (!IsSinker(Insn, U.getOperandNo())) + return false; + } + Ops.push_back(&Shuffle->getOperandUse(0)); + Ops.push_back(&I->getOperandUse(Op)); + return true; } bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const { @@ -13641,6 +14762,11 @@ bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const { if (!isTypeLegal(VT)) return false; + if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal.getOperand(0))) { + if (Ld->isExpandingLoad()) + return false; + } + // Don't create a loadext if we can fold the extension into a wide/long // instruction. // If there's more than one user instruction, the loadext is desirable no @@ -14028,6 +15154,52 @@ static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT, return false; } +static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, unsigned Align, + bool isSEXTLoad, bool isLE, SDValue &Base, + SDValue &Offset, bool &isInc, + SelectionDAG &DAG) { + if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB) + return false; + if (!isa<ConstantSDNode>(Ptr->getOperand(1))) + return false; + + ConstantSDNode *RHS = cast<ConstantSDNode>(Ptr->getOperand(1)); + int RHSC = (int)RHS->getZExtValue(); + + auto IsInRange = [&](int RHSC, int Limit, int Scale) { + if (RHSC < 0 && RHSC > -Limit * Scale && RHSC % Scale == 0) { + assert(Ptr->getOpcode() == ISD::ADD); + isInc = false; + Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0)); + return true; + } else if (RHSC > 0 && RHSC < Limit * Scale && RHSC % Scale == 0) { + isInc = Ptr->getOpcode() == ISD::ADD; + Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0)); + return true; + } + return false; + }; + + // Try to find a matching instruction based on s/zext, Alignment, Offset and + // (in BE) type. 
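For context on the shouldSinkOperands change above: at the source level the sunk splat typically comes from a loop-invariant scalar operand, as in the assumed (not taken from the patch) example below, where the insertelement + shufflevector splat of the scalar is an operand of every multiply and sinking it lets MVE use a vector-by-scalar form.

#include <cstdint>

void scale(int32_t *dst, const int32_t *src, int32_t scalar, int n) {
  for (int i = 0; i < n; ++i)
    dst[i] = src[i] * scalar; // splat(scalar) feeds each vectorized multiply
}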
+ Base = Ptr->getOperand(0); + if (VT == MVT::v4i16) { + if (Align >= 2 && IsInRange(RHSC, 0x80, 2)) + return true; + } else if (VT == MVT::v4i8 || VT == MVT::v8i8) { + if (IsInRange(RHSC, 0x80, 1)) + return true; + } else if (Align >= 4 && (isLE || VT == MVT::v4i32 || VT == MVT::v4f32) && + IsInRange(RHSC, 0x80, 4)) + return true; + else if (Align >= 2 && (isLE || VT == MVT::v8i16 || VT == MVT::v8f16) && + IsInRange(RHSC, 0x80, 2)) + return true; + else if ((isLE || VT == MVT::v16i8) && IsInRange(RHSC, 0x80, 1)) + return true; + return false; +} + /// getPreIndexedAddressParts - returns true by value, base pointer and /// offset pointer and addressing mode by reference if the node's address /// can be legally represented as pre-indexed load / store address. @@ -14041,25 +15213,35 @@ ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, EVT VT; SDValue Ptr; + unsigned Align; bool isSEXTLoad = false; if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { Ptr = LD->getBasePtr(); - VT = LD->getMemoryVT(); + VT = LD->getMemoryVT(); + Align = LD->getAlignment(); isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { Ptr = ST->getBasePtr(); - VT = ST->getMemoryVT(); + VT = ST->getMemoryVT(); + Align = ST->getAlignment(); } else return false; bool isInc; bool isLegal = false; - if (Subtarget->isThumb2()) - isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base, - Offset, isInc, DAG); - else - isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base, - Offset, isInc, DAG); + if (VT.isVector()) + isLegal = Subtarget->hasMVEIntegerOps() && + getMVEIndexedAddressParts(Ptr.getNode(), VT, Align, isSEXTLoad, + Subtarget->isLittle(), Base, Offset, + isInc, DAG); + else { + if (Subtarget->isThumb2()) + isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base, + Offset, isInc, DAG); + else + isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base, + Offset, isInc, DAG); + } if (!isLegal) return false; @@ -14077,15 +15259,18 @@ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op, SelectionDAG &DAG) const { EVT VT; SDValue Ptr; + unsigned Align; bool isSEXTLoad = false, isNonExt; if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { - VT = LD->getMemoryVT(); + VT = LD->getMemoryVT(); Ptr = LD->getBasePtr(); + Align = LD->getAlignment(); isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD; } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { - VT = ST->getMemoryVT(); + VT = ST->getMemoryVT(); Ptr = ST->getBasePtr(); + Align = ST->getAlignment(); isNonExt = !ST->isTruncatingStore(); } else return false; @@ -14108,12 +15293,19 @@ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op, bool isInc; bool isLegal = false; - if (Subtarget->isThumb2()) - isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset, - isInc, DAG); - else - isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset, + if (VT.isVector()) + isLegal = Subtarget->hasMVEIntegerOps() && + getMVEIndexedAddressParts(Op, VT, Align, isSEXTLoad, + Subtarget->isLittle(), Base, Offset, isInc, DAG); + else { + if (Subtarget->isThumb2()) + isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset, + isInc, DAG); + else + isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset, + isInc, DAG); + } if (!isLegal) return false; @@ -14369,7 +15561,8 @@ const char 
*ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const { /// constraint it is for this target. ARMTargetLowering::ConstraintType ARMTargetLowering::getConstraintType(StringRef Constraint) const { - if (Constraint.size() == 1) { + unsigned S = Constraint.size(); + if (S == 1) { switch (Constraint[0]) { default: break; case 'l': return C_RegisterClass; @@ -14377,12 +15570,12 @@ ARMTargetLowering::getConstraintType(StringRef Constraint) const { case 'h': return C_RegisterClass; case 'x': return C_RegisterClass; case 't': return C_RegisterClass; - case 'j': return C_Other; // Constant for movw. - // An address with a single base register. Due to the way we - // currently handle addresses it is the same as an 'r' memory constraint. + case 'j': return C_Immediate; // Constant for movw. + // An address with a single base register. Due to the way we + // currently handle addresses it is the same as an 'r' memory constraint. case 'Q': return C_Memory; } - } else if (Constraint.size() == 2) { + } else if (S == 2) { switch (Constraint[0]) { default: break; case 'T': return C_RegisterClass; @@ -14535,7 +15728,7 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op, case 'j': // Constant suitable for movw, must be between 0 and // 65535. - if (Subtarget->hasV6T2Ops()) + if (Subtarget->hasV6T2Ops() || (Subtarget->hasV8MBaselineOps())) if (CVal >= 0 && CVal <= 65535) break; return; @@ -14643,7 +15836,7 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op, return; case 'N': - if (Subtarget->isThumb()) { // FIXME thumb2 + if (Subtarget->isThumb1Only()) { // This must be a constant between 0 and 31, for shift amounts. if (CVal >= 0 && CVal <= 31) break; @@ -14651,7 +15844,7 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op, return; case 'O': - if (Subtarget->isThumb()) { // FIXME thumb2 + if (Subtarget->isThumb1Only()) { // This must be a multiple of 4 between -508 and 508, for // ADD/SUB sp = sp + immediate. if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0)) @@ -14874,6 +16067,7 @@ SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { // without FP16. So we must do a function call. 
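On the 'j' constraint reclassified above: it is the ARM inline-asm constraint for a MOVW-encodable immediate (0..65535), now also accepted with v8-M Baseline per the operand check. A minimal use, assuming an Armv6T2 or v8-M Baseline (or later) target:

static inline unsigned load_constant(void) {
  unsigned r;
  asm("movw %0, %1" : "=r"(r) : "j"(0x1234)); // 'j': immediate 0..65535 for MOVW
  return r;
}

int main() { return load_constant() == 0x1234 ? 0 : 1; }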
SDLoc Loc(Op); RTLIB::Libcall LC; + MakeLibCallOptions CallOptions; if (SrcSz == 16) { // Instruction from 16 -> 32 if (Subtarget->hasFP16()) @@ -14884,7 +16078,7 @@ SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected type for custom-lowering FP_EXTEND"); SrcVal = - makeLibCall(DAG, LC, MVT::f32, SrcVal, /*isSigned*/ false, Loc).first; + makeLibCall(DAG, LC, MVT::f32, SrcVal, CallOptions, Loc).first; } } @@ -14897,7 +16091,7 @@ SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { LC = RTLIB::getFPEXT(MVT::f32, MVT::f64); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected type for custom-lowering FP_EXTEND"); - return makeLibCall(DAG, LC, MVT::f64, SrcVal, /*isSigned*/ false, Loc).first; + return makeLibCall(DAG, LC, MVT::f64, SrcVal, CallOptions, Loc).first; } SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { @@ -14923,7 +16117,8 @@ SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { RTLIB::Libcall LC = RTLIB::getFPROUND(SrcVT, DstVT); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected type for custom-lowering FP_ROUND"); - return makeLibCall(DAG, LC, DstVT, SrcVal, /*isSigned*/ false, Loc).first; + MakeLibCallOptions CallOptions; + return makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions, Loc).first; } void ARMTargetLowering::lowerABS(SDNode *N, SmallVectorImpl<SDValue> &Results, @@ -15015,7 +16210,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1); - Info.align = cast<ConstantInt>(AlignArg)->getZExtValue(); + Info.align = MaybeAlign(cast<ConstantInt>(AlignArg)->getZExtValue()); // volatile loads with NEON intrinsics not supported Info.flags = MachineMemOperand::MOLoad; return true; @@ -15030,7 +16225,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); Info.offset = 0; - Info.align = 0; + Info.align.reset(); // volatile loads with NEON intrinsics not supported Info.flags = MachineMemOperand::MOLoad; return true; @@ -15056,7 +16251,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1); - Info.align = cast<ConstantInt>(AlignArg)->getZExtValue(); + Info.align = MaybeAlign(cast<ConstantInt>(AlignArg)->getZExtValue()); // volatile stores with NEON intrinsics not supported Info.flags = MachineMemOperand::MOStore; return true; @@ -15077,7 +16272,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; - Info.align = 0; + Info.align.reset(); // volatile stores with NEON intrinsics not supported Info.flags = MachineMemOperand::MOStore; return true; @@ -15090,7 +16285,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = MVT::getVT(PtrTy->getElementType()); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; - Info.align = DL.getABITypeAlignment(PtrTy->getElementType()); + Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType())); Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile; return true; } @@ -15102,7 +16297,7 @@ bool 
ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = MVT::getVT(PtrTy->getElementType()); Info.ptrVal = I.getArgOperand(1); Info.offset = 0; - Info.align = DL.getABITypeAlignment(PtrTy->getElementType()); + Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType())); Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; return true; } @@ -15112,7 +16307,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = MVT::i64; Info.ptrVal = I.getArgOperand(2); Info.offset = 0; - Info.align = 8; + Info.align = Align(8); Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; return true; @@ -15122,7 +16317,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = MVT::i64; Info.ptrVal = I.getArgOperand(0); Info.offset = 0; - Info.align = 8; + Info.align = Align(8); Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile; return true; @@ -15473,6 +16668,12 @@ bool ARMTargetLowering::isLegalInterleavedAccessType( return VecSize == 64 || VecSize % 128 == 0; } +unsigned ARMTargetLowering::getMaxSupportedInterleaveFactor() const { + if (Subtarget->hasNEON()) + return 4; + return TargetLoweringBase::getMaxSupportedInterleaveFactor(); +} + /// Lower an interleaved load into a vldN intrinsic. /// /// E.g. Lower an interleaved load (Factor = 2): @@ -15792,15 +16993,15 @@ static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base, } /// Return the correct alignment for the current calling convention. -unsigned -ARMTargetLowering::getABIAlignmentForCallingConv(Type *ArgTy, - DataLayout DL) const { +Align ARMTargetLowering::getABIAlignmentForCallingConv(Type *ArgTy, + DataLayout DL) const { + const Align ABITypeAlign(DL.getABITypeAlignment(ArgTy)); if (!ArgTy->isVectorTy()) - return DL.getABITypeAlignment(ArgTy); + return ABITypeAlign; // Avoid over-aligning vector parameters. It would require realigning the // stack and waste space for no real benefit. - return std::min(DL.getABITypeAlignment(ArgTy), DL.getStackAlignment()); + return std::min(ABITypeAlign, DL.getStackAlignment()); } /// Return true if a type is an AAPCS-VFP homogeneous aggregate or one of @@ -15861,7 +17062,7 @@ void ARMTargetLowering::insertCopiesSplitCSR( else llvm_unreachable("Unexpected register class in CSRsViaCopy!"); - unsigned NewVR = MRI->createVirtualRegister(RC); + Register NewVR = MRI->createVirtualRegister(RC); // Create copy from CSR to a virtual register. // FIXME: this currently does not emit CFI pseudo-instructions, it works // fine for CXX_FAST_TLS since the C++-style TLS access functions should be diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h index 1675ec59a354..53813fad5afd 100644 --- a/lib/Target/ARM/ARMISelLowering.h +++ b/lib/Target/ARM/ARMISelLowering.h @@ -103,6 +103,7 @@ class VectorType; ADDE, // Add using carry SUBC, // Sub with carry SUBE, // Sub using carry + LSLS, // Shift left producing carry VMOVRRD, // double to two gprs. VMOVDRR, // Two gprs to double. @@ -126,17 +127,13 @@ class VectorType; WIN__DBZCHK, // Windows' divide by zero check WLS, // Low-overhead loops, While Loop Start + LOOP_DEC, // Really a part of LE, performs the sub + LE, // Low-overhead loops, Loop End - VCEQ, // Vector compare equal. - VCEQZ, // Vector compare equal to zero. - VCGE, // Vector compare greater than or equal. - VCGEZ, // Vector compare greater than or equal to zero. - VCLEZ, // Vector compare less than or equal to zero. 
- VCGEU, // Vector compare unsigned greater than or equal. - VCGT, // Vector compare greater than. - VCGTZ, // Vector compare greater than zero. - VCLTZ, // Vector compare less than zero. - VCGTU, // Vector compare unsigned greater than. + PREDICATE_CAST, // Predicate cast for MVE i1 types + + VCMP, // Vector compare. + VCMPZ, // Vector compare to zero. VTST, // Vector test bits. // Vector shift by vector @@ -200,6 +197,7 @@ class VectorType; VTRN, // transpose VTBL1, // 1-register shuffle with mask VTBL2, // 2-register shuffle with mask + VMOVN, // MVE vmovn // Vector multiply long: VMULLs, // ...signed @@ -221,6 +219,12 @@ class VectorType; SMMLAR, // Signed multiply long, round and add SMMLSR, // Signed multiply long, subtract and round + // Single Lane QADD8 and QADD16. Only the bottom lane. That's what the b stands for. + QADD8b, + QSUB8b, + QADD16b, + QSUB16b, + // Operands of the standard BUILD_VECTOR node are not legalized, which // is fine if BUILD_VECTORs are always lowered to shuffles or other // operations, but for ARM some BUILD_VECTORs are legal as-is and their @@ -243,6 +247,11 @@ class VectorType; // instructions. MEMCPY, + // V8.1MMainline condition select + CSINV, // Conditional select invert. + CSNEG, // Conditional select negate. + CSINC, // Conditional select increment. + // Vector load N-element structure to all lanes: VLD1DUP = ISD::FIRST_TARGET_MEMORY_OPCODE, VLD2DUP, @@ -539,7 +548,7 @@ class VectorType; Instruction *emitTrailingFence(IRBuilder<> &Builder, Instruction *Inst, AtomicOrdering Ord) const override; - unsigned getMaxSupportedInterleaveFactor() const override { return 4; } + unsigned getMaxSupportedInterleaveFactor() const override; bool lowerInterleavedLoad(LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles, @@ -608,8 +617,8 @@ class VectorType; void finalizeLowering(MachineFunction &MF) const override; /// Return the correct alignment for the current calling convention. 
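The CSINV/CSNEG/CSINC entries added above mirror the Armv8.1-M Mainline conditional-select instructions (result = cond ? Rn : ~Rm / -Rm / Rm+1). Source idioms of the kind they are intended for are sketched below; whether a given compiler build selects them for these exact functions is an assumption, not something the header states.

#include <cstdint>

uint32_t sel_inc(bool c, uint32_t a, uint32_t b) { return c ? a : b + 1; } // CSINC
uint32_t sel_inv(bool c, uint32_t a, uint32_t b) { return c ? a : ~b; }    // CSINV
int32_t  sel_neg(bool c, int32_t a, int32_t b)   { return c ? a : -b; }    // CSNEG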
- unsigned getABIAlignmentForCallingConv(Type *ArgTy, - DataLayout DL) const override; + Align getABIAlignmentForCallingConv(Type *ArgTy, + DataLayout DL) const override; bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override; @@ -670,6 +679,8 @@ class VectorType; SDValue LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *Subtarget) const; SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const; SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; @@ -721,8 +732,8 @@ class VectorType; void lowerABS(SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const; - unsigned getRegisterByName(const char* RegName, EVT VT, - SelectionDAG &DAG) const override; + Register getRegisterByName(const char* RegName, EVT VT, + const MachineFunction &MF) const override; SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl<SDNode *> &Created) const override; @@ -814,7 +825,7 @@ class VectorType; SDValue getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue &ARMcc, SelectionDAG &DAG, const SDLoc &dl) const; SDValue getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG, - const SDLoc &dl, bool InvalidOnQNaN) const; + const SDLoc &dl) const; SDValue duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const; SDValue OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const; @@ -838,7 +849,7 @@ class VectorType; void setAllExpand(MVT VT); }; - enum NEONModImmType { + enum VMOVModImmType { VMOVModImm, VMVNModImm, MVEVMVNModImm, diff --git a/lib/Target/ARM/ARMInstrFormats.td b/lib/Target/ARM/ARMInstrFormats.td index bc93a058720c..1da32ad2af6c 100644 --- a/lib/Target/ARM/ARMInstrFormats.td +++ b/lib/Target/ARM/ARMInstrFormats.td @@ -188,6 +188,13 @@ def s_cc_out : OptionalDefOperand<OtherVT, (ops CCR), (ops (i32 CPSR))> { let DecoderMethod = "DecodeCCOutOperand"; } +// Transform to generate the inverse of a condition code during ISel +def inv_cond_XFORM : SDNodeXForm<imm, [{ + ARMCC::CondCodes CC = static_cast<ARMCC::CondCodes>(N->getZExtValue()); + return CurDAG->getTargetConstant(ARMCC::getOppositeCondition(CC), SDLoc(N), + MVT::i32); +}]>; + // VPT predicate def VPTPredNOperand : AsmOperandClass { @@ -401,6 +408,8 @@ class InstTemplate<AddrMode am, int sz, IndexMode im, // mnemonic (when not in an IT block) or preclude it (when in an IT block). bit thumbArithFlagSetting = 0; + bit validForTailPredication = 0; + // If this is a pseudo instruction, mark it isCodeGenOnly. let isCodeGenOnly = !eq(!cast<string>(f), "Pseudo"); @@ -412,6 +421,7 @@ class InstTemplate<AddrMode am, int sz, IndexMode im, let TSFlags{14} = canXformTo16Bit; let TSFlags{18-15} = D.Value; let TSFlags{19} = thumbArithFlagSetting; + let TSFlags{20} = validForTailPredication; let Constraints = cstr; let Itinerary = itin; @@ -455,6 +465,7 @@ class AsmPseudoInst<string asm, dag iops, dag oops = (outs)> let isCodeGenOnly = 0; // So we get asm matcher for it. 
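The validForTailPredication flag added above lands in TSFlags bit 20, alongside the existing bits. A minimal sketch of reading it back (the helper name is an assumption for illustration):

#include <cstdint>

// Mirrors "let TSFlags{20} = validForTailPredication" above.
static bool isValidForTailPredication(uint64_t TSFlags) {
  return (TSFlags >> 20) & 1;
}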
let AsmString = asm; let isPseudo = 1; + let hasNoSchedulingInfo = 1; } class ARMAsmPseudo<string asm, dag iops, dag oops = (outs)> @@ -2282,7 +2293,7 @@ class N1ModImm<bit op23, bits<3> op21_19, bits<4> op11_8, bit op7, bit op6, let Inst{24} = SIMM{7}; let Inst{18-16} = SIMM{6-4}; let Inst{3-0} = SIMM{3-0}; - let DecoderMethod = "DecodeNEONModImmInstruction"; + let DecoderMethod = "DecodeVMOVModImmInstruction"; } // NEON 2 vector register format. @@ -2724,6 +2735,16 @@ def complexrotateopodd : Operand<i32> { let PrintMethod = "printComplexRotationOp<180, 90>"; } +def MveSaturateOperand : AsmOperandClass { + let PredicateMethod = "isMveSaturateOp"; + let DiagnosticString = "saturate operand must be 48 or 64"; + let Name = "MveSaturate"; +} +def saturateop : Operand<i32> { + let ParserMatchClass = MveSaturateOperand; + let PrintMethod = "printMveSaturateOp"; +} + // Data type suffix token aliases. Implements Table A7-3 in the ARM ARM. def : TokenAlias<".s8", ".i8">; def : TokenAlias<".u8", ".i8">; diff --git a/lib/Target/ARM/ARMInstrInfo.cpp b/lib/Target/ARM/ARMInstrInfo.cpp index 388c889349b7..a802d5a06f07 100644 --- a/lib/Target/ARM/ARMInstrInfo.cpp +++ b/lib/Target/ARM/ARMInstrInfo.cpp @@ -117,7 +117,7 @@ void ARMInstrInfo::expandLoadStackGuard(MachineBasicBlock::iterator MI) const { MachineBasicBlock &MBB = *MI->getParent(); DebugLoc DL = MI->getDebugLoc(); - unsigned Reg = MI->getOperand(0).getReg(); + Register Reg = MI->getOperand(0).getReg(); MachineInstrBuilder MIB; MIB = BuildMI(MBB, MI, DL, get(ARM::MOV_ga_pcrel_ldr), Reg) diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td index e35145463852..fe696222ec70 100644 --- a/lib/Target/ARM/ARMInstrInfo.td +++ b/lib/Target/ARM/ARMInstrInfo.td @@ -51,8 +51,6 @@ def SDT_ARMAnd : SDTypeProfile<1, 2, SDTCisVT<2, i32>]>; def SDT_ARMCmp : SDTypeProfile<0, 2, [SDTCisSameAs<0, 1>]>; -def SDT_ARMFCmp : SDTypeProfile<0, 3, [SDTCisSameAs<0, 1>, - SDTCisVT<2, i32>]>; def SDT_ARMPICAdd : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, SDTCisPtrTy<1>, SDTCisVT<2, i32>]>; @@ -108,14 +106,24 @@ def SDT_ARMIntShiftParts : SDTypeProfile<2, 3, [SDTCisSameAs<0, 1>, // TODO Add another operand for 'Size' so that we can re-use this node when we // start supporting *TP versions. 
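The MveSaturate operand class added above only admits the values 48 and 64 (the saturation position used by the long shifts that take it). A stand-in for the check its PredicateMethod implies (the function name is an assumption, not the actual asm-parser code):

#include <cstdint>

static bool isMveSaturateValue(int64_t V) { return V == 48 || V == 64; }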
-def SDT_ARMWhileLoop : SDTypeProfile<0, 2, [SDTCisVT<0, i32>, - SDTCisVT<1, OtherVT>]>; +def SDT_ARMLoLoop : SDTypeProfile<0, 2, [SDTCisVT<0, i32>, + SDTCisVT<1, OtherVT>]>; def ARMSmlald : SDNode<"ARMISD::SMLALD", SDT_LongMac>; def ARMSmlaldx : SDNode<"ARMISD::SMLALDX", SDT_LongMac>; def ARMSmlsld : SDNode<"ARMISD::SMLSLD", SDT_LongMac>; def ARMSmlsldx : SDNode<"ARMISD::SMLSLDX", SDT_LongMac>; +def SDT_ARMCSel : SDTypeProfile<1, 3, + [SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>, + SDTCisInt<3>, + SDTCisVT<3, i32>]>; + +def ARMcsinv : SDNode<"ARMISD::CSINV", SDT_ARMCSel, [SDNPOptInGlue]>; +def ARMcsneg : SDNode<"ARMISD::CSNEG", SDT_ARMCSel, [SDNPOptInGlue]>; +def ARMcsinc : SDNode<"ARMISD::CSINC", SDT_ARMCSel, [SDNPOptInGlue]>; + def SDT_MulHSR : SDTypeProfile<1, 3, [SDTCisVT<0,i32>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, @@ -194,6 +202,7 @@ def ARMrrx : SDNode<"ARMISD::RRX" , SDTIntUnaryOp, [SDNPInGlue ]>; def ARMaddc : SDNode<"ARMISD::ADDC", SDTBinaryArithWithFlags, [SDNPCommutative]>; def ARMsubc : SDNode<"ARMISD::SUBC", SDTBinaryArithWithFlags>; +def ARMlsls : SDNode<"ARMISD::LSLS", SDTBinaryArithWithFlags>; def ARMadde : SDNode<"ARMISD::ADDE", SDTBinaryArithWithFlagsInOut>; def ARMsube : SDNode<"ARMISD::SUBE", SDTBinaryArithWithFlagsInOut>; @@ -229,6 +238,11 @@ def ARMsmlalbt : SDNode<"ARMISD::SMLALBT", SDT_LongMac, []>; def ARMsmlaltb : SDNode<"ARMISD::SMLALTB", SDT_LongMac, []>; def ARMsmlaltt : SDNode<"ARMISD::SMLALTT", SDT_LongMac, []>; +def ARMqadd8b : SDNode<"ARMISD::QADD8b", SDT_ARMAnd, []>; +def ARMqsub8b : SDNode<"ARMISD::QSUB8b", SDT_ARMAnd, []>; +def ARMqadd16b : SDNode<"ARMISD::QADD16b", SDT_ARMAnd, []>; +def ARMqsub16b : SDNode<"ARMISD::QSUB16b", SDT_ARMAnd, []>; + // Vector operations shared between NEON and MVE def ARMvdup : SDNode<"ARMISD::VDUP", SDTypeProfile<1, 1, [SDTCisVec<0>]>>; @@ -265,8 +279,16 @@ def ARMvshruImm : SDNode<"ARMISD::VSHRuIMM", SDTARMVSHIMM>; def ARMvshls : SDNode<"ARMISD::VSHLs", SDTARMVSH>; def ARMvshlu : SDNode<"ARMISD::VSHLu", SDTARMVSH>; -def ARMWLS : SDNode<"ARMISD::WLS", SDT_ARMWhileLoop, - [SDNPHasChain]>; +def SDTARMVCMP : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<1, 2>, + SDTCisInt<3>]>; +def SDTARMVCMPZ : SDTypeProfile<1, 2, [SDTCisInt<2>]>; + +def ARMvcmp : SDNode<"ARMISD::VCMP", SDTARMVCMP>; +def ARMvcmpz : SDNode<"ARMISD::VCMPZ", SDTARMVCMPZ>; + +def ARMWLS : SDNode<"ARMISD::WLS", SDT_ARMLoLoop, [SDNPHasChain]>; +def ARMLE : SDNode<"ARMISD::LE", SDT_ARMLoLoop, [SDNPHasChain]>; +def ARMLoopDec : SDNode<"ARMISD::LOOP_DEC", SDTIntBinOp, [SDNPHasChain]>; //===----------------------------------------------------------------------===// // ARM Flag Definitions. @@ -1948,7 +1970,7 @@ multiclass AI_str1nopc<bit isByte, string opc, InstrItinClass iii, /// the function. The first operand is the ID# for this instruction, the second /// is the index into the MachineConstantPool that this is, the third is the /// size in bytes of this constant pool entry. 
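The ARMqadd8b/ARMqsub16b-style nodes defined above model saturating add/sub on a single bottom lane and are matched to QADD8/QADD16/QSUB8/QSUB16 further down. At the source level that corresponds to clamped narrow arithmetic such as the following (whether these exact functions end up selected to those instructions is an assumption):

#include <cstdint>

int16_t sat_add16(int16_t a, int16_t b) {
  int32_t s = int32_t(a) + int32_t(b);
  if (s > INT16_MAX) s = INT16_MAX;
  if (s < INT16_MIN) s = INT16_MIN;
  return int16_t(s); // candidate for QADD16 on the bottom lane
}

int8_t sat_sub8(int8_t a, int8_t b) {
  int32_t s = int32_t(a) - int32_t(b);
  if (s > INT8_MAX) s = INT8_MAX;
  if (s < INT8_MIN) s = INT8_MIN;
  return int8_t(s); // candidate for QSUB8 on the bottom lane
}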
-let hasSideEffects = 0, isNotDuplicable = 1 in +let hasSideEffects = 0, isNotDuplicable = 1, hasNoSchedulingInfo = 1 in def CONSTPOOL_ENTRY : PseudoInst<(outs), (ins cpinst_operand:$instid, cpinst_operand:$cpidx, i32imm:$size), NoItinerary, []>; @@ -2361,6 +2383,12 @@ let isCall = 1, def BMOVPCB_CALL : ARMPseudoInst<(outs), (ins arm_bl_target:$func), 8, IIC_Br, [(ARMcall_nolink tglobaladdr:$func)]>, Requires<[IsARM]>, Sched<[WriteBr]>; + + // push lr before the call + def BL_PUSHLR : ARMPseudoInst<(outs), (ins GPRlr:$ra, arm_bl_target:$func), + 4, IIC_Br, + []>, + Requires<[IsARM]>, Sched<[WriteBr]>; } let isBranch = 1, isTerminator = 1 in { @@ -3727,6 +3755,23 @@ let DecoderMethod = "DecodeQADDInstruction" in [(set GPRnopc:$Rd, (int_arm_qadd GPRnopc:$Rm, GPRnopc:$Rn))]>; } +def : ARMV5TEPat<(saddsat GPR:$a, GPR:$b), + (QADD GPR:$a, GPR:$b)>; +def : ARMV5TEPat<(ssubsat GPR:$a, GPR:$b), + (QSUB GPR:$a, GPR:$b)>; +def : ARMV5TEPat<(saddsat(saddsat rGPR:$Rm, rGPR:$Rm), rGPR:$Rn), + (QDADD rGPR:$Rm, rGPR:$Rn)>; +def : ARMV5TEPat<(ssubsat rGPR:$Rm, (saddsat rGPR:$Rn, rGPR:$Rn)), + (QDSUB rGPR:$Rm, rGPR:$Rn)>; +def : ARMV6Pat<(ARMqadd8b rGPR:$Rm, rGPR:$Rn), + (QADD8 rGPR:$Rm, rGPR:$Rn)>; +def : ARMV6Pat<(ARMqsub8b rGPR:$Rm, rGPR:$Rn), + (QSUB8 rGPR:$Rm, rGPR:$Rn)>; +def : ARMV6Pat<(ARMqadd16b rGPR:$Rm, rGPR:$Rn), + (QADD16 rGPR:$Rm, rGPR:$Rn)>; +def : ARMV6Pat<(ARMqsub16b rGPR:$Rm, rGPR:$Rn), + (QSUB16 rGPR:$Rm, rGPR:$Rn)>; + def UQADD16 : AAIIntrinsic<0b01100110, 0b11110001, "uqadd16", int_arm_uqadd16>; def UQADD8 : AAIIntrinsic<0b01100110, 0b11111001, "uqadd8", int_arm_uqadd8>; def UQSUB16 : AAIIntrinsic<0b01100110, 0b11110111, "uqsub16", int_arm_uqsub16>; @@ -4870,14 +4915,13 @@ def SB : AInoP<(outs), (ins), MiscFrm, NoItinerary, "sb", "", []>, let hasSideEffects = 1; } -let usesCustomInserter = 1, Defs = [CPSR] in { - -// Pseudo instruction that combines movs + predicated rsbmi -// to implement integer ABS +let usesCustomInserter = 1, Defs = [CPSR], hasNoSchedulingInfo = 1 in { + // Pseudo instruction that combines movs + predicated rsbmi + // to implement integer ABS def ABS : ARMPseudoInst<(outs GPR:$dst), (ins GPR:$src), 8, NoItinerary, []>; } -let usesCustomInserter = 1, Defs = [CPSR] in { +let usesCustomInserter = 1, Defs = [CPSR], hasNoSchedulingInfo = 1 in { def COPY_STRUCT_BYVAL_I32 : PseudoInst< (outs), (ins GPR:$dst, GPR:$src, i32imm:$size, i32imm:$alignment), NoItinerary, @@ -5085,8 +5129,8 @@ def SWPB: AIswp<1, (outs GPRnopc:$Rt), def CDP : ABI<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1, c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2), NoItinerary, "cdp", "\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2", - [(int_arm_cdp imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn, - imm:$CRm, imm:$opc2)]>, + [(int_arm_cdp timm:$cop, timm:$opc1, timm:$CRd, timm:$CRn, + timm:$CRm, timm:$opc2)]>, Requires<[IsARM,PreV8]> { bits<4> opc1; bits<4> CRn; @@ -5109,8 +5153,8 @@ def CDP : ABI<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1, def CDP2 : ABXI<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1, c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2), NoItinerary, "cdp2\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2", - [(int_arm_cdp2 imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn, - imm:$CRm, imm:$opc2)]>, + [(int_arm_cdp2 timm:$cop, timm:$opc1, timm:$CRd, timm:$CRn, + timm:$CRm, timm:$opc2)]>, Requires<[IsARM,PreV8]> { let Inst{31-28} = 0b1111; bits<4> opc1; @@ -5289,15 +5333,15 @@ multiclass LdSt2Cop<bit load, bit Dbit, string asm, list<dag> pattern> { } } -defm LDC : LdStCop <1, 0, "ldc", [(int_arm_ldc imm:$cop, imm:$CRd, 
addrmode5:$addr)]>; -defm LDCL : LdStCop <1, 1, "ldcl", [(int_arm_ldcl imm:$cop, imm:$CRd, addrmode5:$addr)]>; -defm LDC2 : LdSt2Cop<1, 0, "ldc2", [(int_arm_ldc2 imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[IsARM,PreV8]>; -defm LDC2L : LdSt2Cop<1, 1, "ldc2l", [(int_arm_ldc2l imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[IsARM,PreV8]>; +defm LDC : LdStCop <1, 0, "ldc", [(int_arm_ldc timm:$cop, timm:$CRd, addrmode5:$addr)]>; +defm LDCL : LdStCop <1, 1, "ldcl", [(int_arm_ldcl timm:$cop, timm:$CRd, addrmode5:$addr)]>; +defm LDC2 : LdSt2Cop<1, 0, "ldc2", [(int_arm_ldc2 timm:$cop, timm:$CRd, addrmode5:$addr)]>, Requires<[IsARM,PreV8]>; +defm LDC2L : LdSt2Cop<1, 1, "ldc2l", [(int_arm_ldc2l timm:$cop, timm:$CRd, addrmode5:$addr)]>, Requires<[IsARM,PreV8]>; -defm STC : LdStCop <0, 0, "stc", [(int_arm_stc imm:$cop, imm:$CRd, addrmode5:$addr)]>; -defm STCL : LdStCop <0, 1, "stcl", [(int_arm_stcl imm:$cop, imm:$CRd, addrmode5:$addr)]>; -defm STC2 : LdSt2Cop<0, 0, "stc2", [(int_arm_stc2 imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[IsARM,PreV8]>; -defm STC2L : LdSt2Cop<0, 1, "stc2l", [(int_arm_stc2l imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[IsARM,PreV8]>; +defm STC : LdStCop <0, 0, "stc", [(int_arm_stc timm:$cop, timm:$CRd, addrmode5:$addr)]>; +defm STCL : LdStCop <0, 1, "stcl", [(int_arm_stcl timm:$cop, timm:$CRd, addrmode5:$addr)]>; +defm STC2 : LdSt2Cop<0, 0, "stc2", [(int_arm_stc2 timm:$cop, timm:$CRd, addrmode5:$addr)]>, Requires<[IsARM,PreV8]>; +defm STC2L : LdSt2Cop<0, 1, "stc2l", [(int_arm_stc2l timm:$cop, timm:$CRd, addrmode5:$addr)]>, Requires<[IsARM,PreV8]>; } // DecoderNamespace = "CoProc" @@ -5333,8 +5377,8 @@ def MCR : MovRCopro<"mcr", 0 /* from ARM core register to coprocessor */, (outs), (ins p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2), - [(int_arm_mcr imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn, - imm:$CRm, imm:$opc2)]>, + [(int_arm_mcr timm:$cop, timm:$opc1, GPR:$Rt, timm:$CRn, + timm:$CRm, timm:$opc2)]>, ComplexDeprecationPredicate<"MCR">; def : ARMInstAlias<"mcr${p} $cop, $opc1, $Rt, $CRn, $CRm", (MCR p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn, @@ -5347,8 +5391,8 @@ def : ARMInstAlias<"mrc${p} $cop, $opc1, $Rt, $CRn, $CRm", (MRC GPRwithAPSR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm, 0, pred:$p)>; -def : ARMPat<(int_arm_mrc imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2), - (MRC imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>; +def : ARMPat<(int_arm_mrc timm:$cop, timm:$opc1, timm:$CRn, timm:$CRm, timm:$opc2), + (MRC p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2)>; class MovRCopro2<string opc, bit direction, dag oops, dag iops, list<dag> pattern> @@ -5379,8 +5423,8 @@ def MCR2 : MovRCopro2<"mcr2", 0 /* from ARM core register to coprocessor */, (outs), (ins p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2), - [(int_arm_mcr2 imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn, - imm:$CRm, imm:$opc2)]>, + [(int_arm_mcr2 timm:$cop, timm:$opc1, GPR:$Rt, timm:$CRn, + timm:$CRm, timm:$opc2)]>, Requires<[IsARM,PreV8]>; def : ARMInstAlias<"mcr2 $cop, $opc1, $Rt, $CRn, $CRm", (MCR2 p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn, @@ -5394,9 +5438,9 @@ def : ARMInstAlias<"mrc2 $cop, $opc1, $Rt, $CRn, $CRm", (MRC2 GPRwithAPSR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm, 0)>; -def : ARMV5TPat<(int_arm_mrc2 imm:$cop, imm:$opc1, imm:$CRn, - imm:$CRm, imm:$opc2), - (MRC2 imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>; +def : ARMV5TPat<(int_arm_mrc2 timm:$cop, timm:$opc1, timm:$CRn, + 
timm:$CRm, timm:$opc2), + (MRC2 p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2)>; class MovRRCopro<string opc, bit direction, dag oops, dag iops, list<dag> pattern = []> @@ -5422,8 +5466,8 @@ class MovRRCopro<string opc, bit direction, dag oops, dag iops, list<dag> def MCRR : MovRRCopro<"mcrr", 0 /* from ARM core register to coprocessor */, (outs), (ins p_imm:$cop, imm0_15:$opc1, GPRnopc:$Rt, GPRnopc:$Rt2, c_imm:$CRm), - [(int_arm_mcrr imm:$cop, imm:$opc1, GPRnopc:$Rt, - GPRnopc:$Rt2, imm:$CRm)]>; + [(int_arm_mcrr timm:$cop, timm:$opc1, GPRnopc:$Rt, + GPRnopc:$Rt2, timm:$CRm)]>; def MRRC : MovRRCopro<"mrrc", 1 /* from coprocessor to ARM core register */, (outs GPRnopc:$Rt, GPRnopc:$Rt2), (ins p_imm:$cop, imm0_15:$opc1, c_imm:$CRm), []>; @@ -5455,8 +5499,8 @@ class MovRRCopro2<string opc, bit direction, dag oops, dag iops, def MCRR2 : MovRRCopro2<"mcrr2", 0 /* from ARM core register to coprocessor */, (outs), (ins p_imm:$cop, imm0_15:$opc1, GPRnopc:$Rt, GPRnopc:$Rt2, c_imm:$CRm), - [(int_arm_mcrr2 imm:$cop, imm:$opc1, GPRnopc:$Rt, - GPRnopc:$Rt2, imm:$CRm)]>; + [(int_arm_mcrr2 timm:$cop, timm:$opc1, GPRnopc:$Rt, + GPRnopc:$Rt2, timm:$CRm)]>; def MRRC2 : MovRRCopro2<"mrrc2", 1 /* from coprocessor to ARM core register */, (outs GPRnopc:$Rt, GPRnopc:$Rt2), @@ -5579,12 +5623,12 @@ def MSRbanked : ABI<0b0001, (outs), (ins banked_reg:$banked, GPRnopc:$Rn), def win__chkstk : SDNode<"ARMISD::WIN__CHKSTK", SDTNone, [SDNPHasChain, SDNPSideEffect]>; -let usesCustomInserter = 1, Uses = [R4], Defs = [R4, SP] in +let usesCustomInserter = 1, Uses = [R4], Defs = [R4, SP], hasNoSchedulingInfo = 1 in def WIN__CHKSTK : PseudoInst<(outs), (ins), NoItinerary, [(win__chkstk)]>; def win__dbzchk : SDNode<"ARMISD::WIN__DBZCHK", SDT_WIN__DBZCHK, [SDNPHasChain, SDNPSideEffect, SDNPOutGlue]>; -let usesCustomInserter = 1, Defs = [CPSR] in +let usesCustomInserter = 1, Defs = [CPSR], hasNoSchedulingInfo = 1 in def WIN__DBZCHK : PseudoInst<(outs), (ins tGPR:$divisor), NoItinerary, [(win__dbzchk tGPR:$divisor)]>; @@ -6131,10 +6175,10 @@ def : InstAlias<"umull${s}${p} $RdLo, $RdHi, $Rn, $Rm", def ITasm : ARMAsmPseudo<"it$mask $cc", (ins it_pred:$cc, it_mask:$mask)>, ComplexDeprecationPredicate<"IT">; -let mayLoad = 1, mayStore =1, hasSideEffects = 1 in +let mayLoad = 1, mayStore =1, hasSideEffects = 1, hasNoSchedulingInfo = 1 in def SPACE : PseudoInst<(outs GPR:$Rd), (ins i32imm:$size, GPR:$Rn), NoItinerary, - [(set GPR:$Rd, (int_arm_space imm:$size, GPR:$Rn))]>; + [(set GPR:$Rd, (int_arm_space timm:$size, GPR:$Rn))]>; //===---------------------------------- // Atomic cmpxchg for -O0 @@ -6174,4 +6218,5 @@ def CompilerBarrier : PseudoInst<(outs), (ins i32imm:$ordering), NoItinerary, let hasSideEffects = 1; let Size = 0; let AsmString = "@ COMPILER BARRIER"; + let hasNoSchedulingInfo = 1; } diff --git a/lib/Target/ARM/ARMInstrMVE.td b/lib/Target/ARM/ARMInstrMVE.td index 3e7ae55c7fc8..4f67cd6e47cc 100644 --- a/lib/Target/ARM/ARMInstrMVE.td +++ b/lib/Target/ARM/ARMInstrMVE.td @@ -160,7 +160,8 @@ class TMemImm7ShiftOffsetAsmOperand<int shift> : AsmOperandClass { let RenderMethod = "addMemImmOffsetOperands"; } -class taddrmode_imm7<int shift> : MemOperand { +class taddrmode_imm7<int shift> : MemOperand, + ComplexPattern<i32, 2, "SelectTAddrModeImm7<"#shift#">", []> { let ParserMatchClass = TMemImm7ShiftOffsetAsmOperand<shift>; // They are printed the same way as the T2 imm8 version let PrintMethod = "printT2AddrModeImm8Operand<false>"; @@ -221,7 +222,9 @@ def t2am_imm7shift0OffsetAsmOperand : 
t2am_imm7shiftOffsetAsmOperand<0>; def t2am_imm7shift1OffsetAsmOperand : t2am_imm7shiftOffsetAsmOperand<1>; def t2am_imm7shift2OffsetAsmOperand : t2am_imm7shiftOffsetAsmOperand<2>; -class t2am_imm7_offset<int shift> : MemOperand { +class t2am_imm7_offset<int shift> : MemOperand, + ComplexPattern<i32, 1, "SelectT2AddrModeImm7Offset<"#shift#">", + [], [SDNPWantRoot]> { // They are printed the same way as the imm8 version let PrintMethod = "printT2AddrModeImm8OffsetOperand"; let ParserMatchClass = @@ -371,6 +374,8 @@ class MVE_ScalarShiftSRegReg<string iname, bits<2> op5_4, list<dag> pattern=[]> let Inst{7-6} = 0b00; let Inst{5-4} = op5_4{1-0}; let Inst{3-0} = 0b1101; + + let Unpredictable{8-6} = 0b111; } def MVE_SQRSHR : MVE_ScalarShiftSRegReg<"sqrshr", 0b10>; @@ -403,18 +408,17 @@ class MVE_ScalarShiftDRegImm<string iname, bits<2> op5_4, bit op16, let Inst{3-0} = 0b1111; } -class MVE_ScalarShiftDRegReg<string iname, bit op5, bit op16, - list<dag> pattern=[]> +class MVE_ScalarShiftDRegRegBase<string iname, dag iops, string asm, + bit op5, bit op16, list<dag> pattern=[]> : MVE_ScalarShiftDoubleReg< - iname, (ins tGPREven:$RdaLo_src, tGPROdd:$RdaHi_src, rGPR:$Rm), - "$RdaLo, $RdaHi, $Rm", "@earlyclobber $RdaHi,@earlyclobber $RdaLo," - "$RdaLo = $RdaLo_src,$RdaHi = $RdaHi_src", + iname, iops, asm, "@earlyclobber $RdaHi,@earlyclobber $RdaLo," + "$RdaLo = $RdaLo_src,$RdaHi = $RdaHi_src", pattern> { bits<4> Rm; let Inst{16} = op16; let Inst{15-12} = Rm{3-0}; - let Inst{7-6} = 0b00; + let Inst{6} = 0b0; let Inst{5} = op5; let Inst{4} = 0b0; let Inst{3-0} = 0b1101; @@ -427,27 +431,44 @@ class MVE_ScalarShiftDRegReg<string iname, bit op5, bit op16, let DecoderMethod = "DecodeMVEOverlappingLongShift"; } -def MVE_ASRLr : MVE_ScalarShiftDRegReg<"asrl", 0b1, 0b0, [(set tGPREven:$RdaLo, tGPROdd:$RdaHi, +class MVE_ScalarShiftDRegReg<string iname, bit op5, list<dag> pattern=[]> + : MVE_ScalarShiftDRegRegBase< + iname, (ins tGPREven:$RdaLo_src, tGPROdd:$RdaHi_src, rGPR:$Rm), + "$RdaLo, $RdaHi, $Rm", op5, 0b0, pattern> { + + let Inst{7} = 0b0; +} + +class MVE_ScalarShiftDRegRegWithSat<string iname, bit op5, list<dag> pattern=[]> + : MVE_ScalarShiftDRegRegBase< + iname, (ins tGPREven:$RdaLo_src, tGPROdd:$RdaHi_src, rGPR:$Rm, saturateop:$sat), + "$RdaLo, $RdaHi, $sat, $Rm", op5, 0b1, pattern> { + bit sat; + + let Inst{7} = sat; +} + +def MVE_ASRLr : MVE_ScalarShiftDRegReg<"asrl", 0b1, [(set tGPREven:$RdaLo, tGPROdd:$RdaHi, (ARMasrl tGPREven:$RdaLo_src, tGPROdd:$RdaHi_src, rGPR:$Rm))]>; def MVE_ASRLi : MVE_ScalarShiftDRegImm<"asrl", 0b10, ?, [(set tGPREven:$RdaLo, tGPROdd:$RdaHi, (ARMasrl tGPREven:$RdaLo_src, - tGPROdd:$RdaHi_src, (i32 imm:$imm)))]>; -def MVE_LSLLr : MVE_ScalarShiftDRegReg<"lsll", 0b0, 0b0, [(set tGPREven:$RdaLo, tGPROdd:$RdaHi, + tGPROdd:$RdaHi_src, (i32 long_shift:$imm)))]>; +def MVE_LSLLr : MVE_ScalarShiftDRegReg<"lsll", 0b0, [(set tGPREven:$RdaLo, tGPROdd:$RdaHi, (ARMlsll tGPREven:$RdaLo_src, tGPROdd:$RdaHi_src, rGPR:$Rm))]>; def MVE_LSLLi : MVE_ScalarShiftDRegImm<"lsll", 0b00, ?, [(set tGPREven:$RdaLo, tGPROdd:$RdaHi, (ARMlsll tGPREven:$RdaLo_src, - tGPROdd:$RdaHi_src, (i32 imm:$imm)))]>; + tGPROdd:$RdaHi_src, (i32 long_shift:$imm)))]>; def MVE_LSRL : MVE_ScalarShiftDRegImm<"lsrl", 0b01, ?, [(set tGPREven:$RdaLo, tGPROdd:$RdaHi, (ARMlsrl tGPREven:$RdaLo_src, - tGPROdd:$RdaHi_src, (i32 imm:$imm)))]>; + tGPROdd:$RdaHi_src, (i32 long_shift:$imm)))]>; -def MVE_SQRSHRL : MVE_ScalarShiftDRegReg<"sqrshrl", 0b1, 0b1>; +def MVE_SQRSHRL : MVE_ScalarShiftDRegRegWithSat<"sqrshrl", 0b1>; def MVE_SQSHLL 
: MVE_ScalarShiftDRegImm<"sqshll", 0b11, 0b1>; def MVE_SRSHRL : MVE_ScalarShiftDRegImm<"srshrl", 0b10, 0b1>; -def MVE_UQRSHLL : MVE_ScalarShiftDRegReg<"uqrshll", 0b0, 0b1>; +def MVE_UQRSHLL : MVE_ScalarShiftDRegRegWithSat<"uqrshll", 0b0>; def MVE_UQSHLL : MVE_ScalarShiftDRegImm<"uqshll", 0b00, 0b1>; def MVE_URSHRL : MVE_ScalarShiftDRegImm<"urshrl", 0b01, 0b1>; @@ -531,6 +552,19 @@ defm MVE_VADDVu8 : MVE_VADDV_A<"u8", 0b1, 0b00>; defm MVE_VADDVu16 : MVE_VADDV_A<"u16", 0b1, 0b01>; defm MVE_VADDVu32 : MVE_VADDV_A<"u32", 0b1, 0b10>; +let Predicates = [HasMVEInt] in { + def : Pat<(i32 (vecreduce_add (v4i32 MQPR:$src))), (i32 (MVE_VADDVu32no_acc $src))>; + def : Pat<(i32 (vecreduce_add (v8i16 MQPR:$src))), (i32 (MVE_VADDVu16no_acc $src))>; + def : Pat<(i32 (vecreduce_add (v16i8 MQPR:$src))), (i32 (MVE_VADDVu8no_acc $src))>; + def : Pat<(i32 (add (i32 (vecreduce_add (v4i32 MQPR:$src1))), (i32 tGPR:$src2))), + (i32 (MVE_VADDVu32acc $src2, $src1))>; + def : Pat<(i32 (add (i32 (vecreduce_add (v8i16 MQPR:$src1))), (i32 tGPR:$src2))), + (i32 (MVE_VADDVu16acc $src2, $src1))>; + def : Pat<(i32 (add (i32 (vecreduce_add (v16i8 MQPR:$src1))), (i32 tGPR:$src2))), + (i32 (MVE_VADDVu8acc $src2, $src1))>; + +} + class MVE_VADDLV<string iname, string suffix, dag iops, string cstr, bit A, bit U, list<dag> pattern=[]> : MVE_rDest<(outs tGPREven:$RdaLo, tGPROdd:$RdaHi), iops, NoItinerary, iname, @@ -636,6 +670,35 @@ multiclass MVE_VMINMAXV_ty<string iname, bit bit_7, list<dag> pattern=[]> { defm MVE_VMINV : MVE_VMINMAXV_ty<"vminv", 0b1>; defm MVE_VMAXV : MVE_VMINMAXV_ty<"vmaxv", 0b0>; +let Predicates = [HasMVEInt] in { + def : Pat<(i32 (vecreduce_smax (v16i8 MQPR:$src))), + (i32 (MVE_VMAXVs8 (t2MVNi (i32 127)), $src))>; + def : Pat<(i32 (vecreduce_smax (v8i16 MQPR:$src))), + (i32 (MVE_VMAXVs16 (t2MOVi32imm (i32 -32768)), $src))>; + def : Pat<(i32 (vecreduce_smax (v4i32 MQPR:$src))), + (i32 (MVE_VMAXVs32 (t2MOVi (i32 -2147483648)), $src))>; + def : Pat<(i32 (vecreduce_umax (v16i8 MQPR:$src))), + (i32 (MVE_VMAXVu8 (t2MOVi (i32 0)), $src))>; + def : Pat<(i32 (vecreduce_umax (v8i16 MQPR:$src))), + (i32 (MVE_VMAXVu16 (t2MOVi (i32 0)), $src))>; + def : Pat<(i32 (vecreduce_umax (v4i32 MQPR:$src))), + (i32 (MVE_VMAXVu32 (t2MOVi (i32 0)), $src))>; + + def : Pat<(i32 (vecreduce_smin (v16i8 MQPR:$src))), + (i32 (MVE_VMINVs8 (t2MOVi (i32 127)), $src))>; + def : Pat<(i32 (vecreduce_smin (v8i16 MQPR:$src))), + (i32 (MVE_VMINVs16 (t2MOVi16 (i32 32767)), $src))>; + def : Pat<(i32 (vecreduce_smin (v4i32 MQPR:$src))), + (i32 (MVE_VMINVs32 (t2MVNi (i32 -2147483648)), $src))>; + def : Pat<(i32 (vecreduce_umin (v16i8 MQPR:$src))), + (i32 (MVE_VMINVu8 (t2MOVi (i32 255)), $src))>; + def : Pat<(i32 (vecreduce_umin (v8i16 MQPR:$src))), + (i32 (MVE_VMINVu16 (t2MOVi16 (i32 65535)), $src))>; + def : Pat<(i32 (vecreduce_umin (v4i32 MQPR:$src))), + (i32 (MVE_VMINVu32 (t2MOVi (i32 4294967295)), $src))>; + +} + multiclass MVE_VMINMAXAV_ty<string iname, bit bit_7, list<dag> pattern=[]> { def s8 : MVE_VMINMAXV<iname, "s8", 0b0, 0b00, 0b0, bit_7>; def s16 : MVE_VMINMAXV<iname, "s16", 0b0, 0b01, 0b0, bit_7>; @@ -667,57 +730,57 @@ class MVE_VMLAMLSDAV<string iname, string suffix, dag iops, string cstr, let Inst{0} = bit_0; } -multiclass MVE_VMLAMLSDAV_X<string iname, string suffix, dag iops, string cstr, - bit sz, bit bit_28, bit A, bit bit_8, bit bit_0, - list<dag> pattern=[]> { - def _noexch : MVE_VMLAMLSDAV<iname, suffix, iops, cstr, sz, - bit_28, A, 0b0, bit_8, bit_0, pattern>; - def _exch : MVE_VMLAMLSDAV<iname # "x", suffix, iops, cstr, sz, - 
bit_28, A, 0b1, bit_8, bit_0, pattern>; +multiclass MVE_VMLAMLSDAV_A<string iname, string x, string suffix, + bit sz, bit bit_28, bit X, bit bit_8, bit bit_0, + list<dag> pattern=[]> { + def ""#x#suffix : MVE_VMLAMLSDAV<iname # x, suffix, + (ins MQPR:$Qn, MQPR:$Qm), "", + sz, bit_28, 0b0, X, bit_8, bit_0, pattern>; + def "a"#x#suffix : MVE_VMLAMLSDAV<iname # "a" # x, suffix, + (ins tGPREven:$RdaSrc, MQPR:$Qn, MQPR:$Qm), + "$RdaDest = $RdaSrc", + sz, bit_28, 0b1, X, bit_8, bit_0, pattern>; +} + +multiclass MVE_VMLAMLSDAV_AX<string iname, string suffix, bit sz, bit bit_28, + bit bit_8, bit bit_0, list<dag> pattern=[]> { + defm "" : MVE_VMLAMLSDAV_A<iname, "", suffix, sz, bit_28, + 0b0, bit_8, bit_0, pattern>; + defm "" : MVE_VMLAMLSDAV_A<iname, "x", suffix, sz, bit_28, + 0b1, bit_8, bit_0, pattern>; } -multiclass MVE_VMLAMLSDAV_XA<string iname, string suffix, bit sz, bit bit_28, - bit bit_8, bit bit_0, list<dag> pattern=[]> { - defm _noacc : MVE_VMLAMLSDAV_X<iname, suffix, (ins MQPR:$Qn, MQPR:$Qm), "", - sz, bit_28, 0b0, bit_8, bit_0, pattern>; - defm _acc : MVE_VMLAMLSDAV_X<iname # "a", suffix, - (ins tGPREven:$RdaSrc, MQPR:$Qn, MQPR:$Qm), - "$RdaDest = $RdaSrc", - sz, bit_28, 0b1, bit_8, bit_0, pattern>; +multiclass MVE_VMLADAV_multi<string suffix, bit sz, bit bit_8, + list<dag> pattern=[]> { + defm "" : MVE_VMLAMLSDAV_AX<"vmladav", "s"#suffix, + sz, 0b0, bit_8, 0b0, pattern>; + defm "" : MVE_VMLAMLSDAV_A<"vmladav", "", "u"#suffix, + sz, 0b1, 0b0, bit_8, 0b0, pattern>; } -multiclass MVE_VMLADAV_multi<string suffix, bit sz, bit U, bit bit_8, - list<dag> pattern=[]> { - defm "" : MVE_VMLAMLSDAV_XA<"vmladav", suffix, sz, U, bit_8, 0b0, pattern>; +multiclass MVE_VMLSDAV_multi<string suffix, bit sz, bit bit_28, + list<dag> pattern=[]> { + defm "" : MVE_VMLAMLSDAV_AX<"vmlsdav", "s"#suffix, + sz, bit_28, 0b0, 0b1, pattern>; } -defm MVE_VMLADAVs16 : MVE_VMLADAV_multi<"s16", 0b0, 0b0, 0b0>; -defm MVE_VMLADAVs32 : MVE_VMLADAV_multi<"s32", 0b1, 0b0, 0b0>; -defm MVE_VMLADAVu16 : MVE_VMLADAV_multi<"u16", 0b0, 0b1, 0b0>; -defm MVE_VMLADAVu32 : MVE_VMLADAV_multi<"u32", 0b1, 0b1, 0b0>; +defm MVE_VMLADAV : MVE_VMLADAV_multi< "8", 0b0, 0b1>; +defm MVE_VMLADAV : MVE_VMLADAV_multi<"16", 0b0, 0b0>; +defm MVE_VMLADAV : MVE_VMLADAV_multi<"32", 0b1, 0b0>; -defm MVE_VMLADAVs8 : MVE_VMLADAV_multi<"s8", 0b0, 0b0, 0b1>; -defm MVE_VMLADAVu8 : MVE_VMLADAV_multi<"u8", 0b0, 0b1, 0b1>; +defm MVE_VMLSDAV : MVE_VMLSDAV_multi< "8", 0b0, 0b1>; +defm MVE_VMLSDAV : MVE_VMLSDAV_multi<"16", 0b0, 0b0>; +defm MVE_VMLSDAV : MVE_VMLSDAV_multi<"32", 0b1, 0b0>; // vmlav aliases vmladav -foreach acc = ["_acc", "_noacc"] in { +foreach acc = ["", "a"] in { foreach suffix = ["s8", "s16", "s32", "u8", "u16", "u32"] in { - def : MVEInstAlias<!strconcat("vmlav", !if(!eq(acc, "_acc"), "a", ""), - "${vp}.", suffix, "\t$RdaDest, $Qn, $Qm"), - (!cast<Instruction>("MVE_VMLADAV"#suffix#acc#"_noexch") + def : MVEInstAlias<"vmlav"#acc#"${vp}."#suffix#"\t$RdaDest, $Qn, $Qm", + (!cast<Instruction>("MVE_VMLADAV"#acc#suffix) tGPREven:$RdaDest, MQPR:$Qn, MQPR:$Qm, vpred_n:$vp)>; } } -multiclass MVE_VMLSDAV_multi<string suffix, bit sz, bit bit_28, - list<dag> pattern=[]> { - defm "" : MVE_VMLAMLSDAV_XA<"vmlsdav", suffix, sz, bit_28, 0b0, 0b1, pattern>; -} - -defm MVE_VMLSDAVs8 : MVE_VMLSDAV_multi<"s8", 0, 0b1>; -defm MVE_VMLSDAVs16 : MVE_VMLSDAV_multi<"s16", 0, 0b0>; -defm MVE_VMLSDAVs32 : MVE_VMLSDAV_multi<"s32", 1, 0b0>; - // Base class for VMLALDAV and VMLSLDAV, VRMLALDAVH, VRMLSLDAVH class MVE_VMLALDAVBase<string iname, string suffix, dag iops, string 
cstr, bit sz, bit bit_28, bit A, bit X, bit bit_8, bit bit_0, @@ -742,82 +805,83 @@ class MVE_VMLALDAVBase<string iname, string suffix, dag iops, string cstr, let Inst{0} = bit_0; } -multiclass MVE_VMLALDAVBase_X<string iname, string suffix, dag iops, - string cstr, bit sz, bit bit_28, bit A, - bit bit_8, bit bit_0, list<dag> pattern=[]> { - def _noexch : MVE_VMLALDAVBase<iname, suffix, iops, cstr, sz, - bit_28, A, 0b0, bit_8, bit_0, pattern>; - def _exch : MVE_VMLALDAVBase<iname # "x", suffix, iops, cstr, sz, - bit_28, A, 0b1, bit_8, bit_0, pattern>; +multiclass MVE_VMLALDAVBase_A<string iname, string x, string suffix, + bit sz, bit bit_28, bit X, bit bit_8, bit bit_0, + list<dag> pattern=[]> { + def ""#x#suffix : MVE_VMLALDAVBase< + iname # x, suffix, (ins MQPR:$Qn, MQPR:$Qm), "", + sz, bit_28, 0b0, X, bit_8, bit_0, pattern>; + def "a"#x#suffix : MVE_VMLALDAVBase< + iname # "a" # x, suffix, + (ins tGPREven:$RdaLoSrc, tGPROdd:$RdaHiSrc, MQPR:$Qn, MQPR:$Qm), + "$RdaLoDest = $RdaLoSrc,$RdaHiDest = $RdaHiSrc", + sz, bit_28, 0b1, X, bit_8, bit_0, pattern>; } -multiclass MVE_VMLALDAVBase_XA<string iname, string suffix, bit sz, bit bit_28, - bit bit_8, bit bit_0, list<dag> pattern=[]> { - defm _noacc : MVE_VMLALDAVBase_X< - iname, suffix, (ins MQPR:$Qn, MQPR:$Qm), "", - sz, bit_28, 0b0, bit_8, bit_0, pattern>; - defm _acc : MVE_VMLALDAVBase_X< - iname # "a", suffix, (ins tGPREven:$RdaLoSrc, tGPROdd:$RdaHiSrc, - MQPR:$Qn, MQPR:$Qm), - "$RdaLoDest = $RdaLoSrc,$RdaHiDest = $RdaHiSrc", - sz, bit_28, 0b1, bit_8, bit_0, pattern>; + +multiclass MVE_VMLALDAVBase_AX<string iname, string suffix, bit sz, bit bit_28, + bit bit_8, bit bit_0, list<dag> pattern=[]> { + defm "" : MVE_VMLALDAVBase_A<iname, "", suffix, sz, + bit_28, 0b0, bit_8, bit_0, pattern>; + defm "" : MVE_VMLALDAVBase_A<iname, "x", suffix, sz, + bit_28, 0b1, bit_8, bit_0, pattern>; } -multiclass MVE_VRMLALDAVH_multi<string suffix, bit U, list<dag> pattern=[]> { - defm "" : MVE_VMLALDAVBase_XA< - "vrmlaldavh", suffix, 0b0, U, 0b1, 0b0, pattern>; +multiclass MVE_VRMLALDAVH_multi<string suffix, list<dag> pattern=[]> { + defm "" : MVE_VMLALDAVBase_AX<"vrmlaldavh", "s"#suffix, + 0b0, 0b0, 0b1, 0b0, pattern>; + defm "" : MVE_VMLALDAVBase_A<"vrmlaldavh", "", "u"#suffix, + 0b0, 0b1, 0b0, 0b1, 0b0, pattern>; } -defm MVE_VRMLALDAVHs32 : MVE_VRMLALDAVH_multi<"s32", 0>; -defm MVE_VRMLALDAVHu32 : MVE_VRMLALDAVH_multi<"u32", 1>; +defm MVE_VRMLALDAVH : MVE_VRMLALDAVH_multi<"32">; // vrmlalvh aliases for vrmlaldavh def : MVEInstAlias<"vrmlalvh${vp}.s32\t$RdaLo, $RdaHi, $Qn, $Qm", - (MVE_VRMLALDAVHs32_noacc_noexch + (MVE_VRMLALDAVHs32 tGPREven:$RdaLo, tGPROdd:$RdaHi, MQPR:$Qn, MQPR:$Qm, vpred_n:$vp)>; def : MVEInstAlias<"vrmlalvha${vp}.s32\t$RdaLo, $RdaHi, $Qn, $Qm", - (MVE_VRMLALDAVHs32_acc_noexch + (MVE_VRMLALDAVHas32 tGPREven:$RdaLo, tGPROdd:$RdaHi, MQPR:$Qn, MQPR:$Qm, vpred_n:$vp)>; def : MVEInstAlias<"vrmlalvh${vp}.u32\t$RdaLo, $RdaHi, $Qn, $Qm", - (MVE_VRMLALDAVHu32_noacc_noexch + (MVE_VRMLALDAVHu32 tGPREven:$RdaLo, tGPROdd:$RdaHi, MQPR:$Qn, MQPR:$Qm, vpred_n:$vp)>; def : MVEInstAlias<"vrmlalvha${vp}.u32\t$RdaLo, $RdaHi, $Qn, $Qm", - (MVE_VRMLALDAVHu32_acc_noexch + (MVE_VRMLALDAVHau32 tGPREven:$RdaLo, tGPROdd:$RdaHi, MQPR:$Qn, MQPR:$Qm, vpred_n:$vp)>; -multiclass MVE_VMLALDAV_multi<string suffix, bit sz, bit U, - list<dag> pattern=[]> { - defm "" : MVE_VMLALDAVBase_XA<"vmlaldav", suffix, sz, U, 0b0, 0b0, pattern>; +multiclass MVE_VMLALDAV_multi<string suffix, bit sz, list<dag> pattern=[]> { + defm "" : MVE_VMLALDAVBase_AX<"vmlaldav", "s"#suffix, sz, 
0b0, 0b0, 0b0, pattern>; + defm "" : MVE_VMLALDAVBase_A<"vmlaldav", "", "u"#suffix, + sz, 0b1, 0b0, 0b0, 0b0, pattern>; } -defm MVE_VMLALDAVs16 : MVE_VMLALDAV_multi<"s16", 0b0, 0b0>; -defm MVE_VMLALDAVs32 : MVE_VMLALDAV_multi<"s32", 0b1, 0b0>; -defm MVE_VMLALDAVu16 : MVE_VMLALDAV_multi<"u16", 0b0, 0b1>; -defm MVE_VMLALDAVu32 : MVE_VMLALDAV_multi<"u32", 0b1, 0b1>; +defm MVE_VMLALDAV : MVE_VMLALDAV_multi<"16", 0b0>; +defm MVE_VMLALDAV : MVE_VMLALDAV_multi<"32", 0b1>; // vmlalv aliases vmlaldav -foreach acc = ["_acc", "_noacc"] in { +foreach acc = ["", "a"] in { foreach suffix = ["s16", "s32", "u16", "u32"] in { - def : MVEInstAlias<!strconcat("vmlalv", !if(!eq(acc, "_acc"), "a", ""), - "${vp}.", suffix, "\t$RdaLoDest, $RdaHiDest, $Qn, $Qm"), - (!cast<Instruction>("MVE_VMLALDAV"#suffix#acc#"_noexch") + def : MVEInstAlias<"vmlalv" # acc # "${vp}." # suffix # + "\t$RdaLoDest, $RdaHiDest, $Qn, $Qm", + (!cast<Instruction>("MVE_VMLALDAV"#acc#suffix) tGPREven:$RdaLoDest, tGPROdd:$RdaHiDest, MQPR:$Qn, MQPR:$Qm, vpred_n:$vp)>; } } multiclass MVE_VMLSLDAV_multi<string iname, string suffix, bit sz, - bit bit_28, list<dag> pattern=[]> { - defm "" : MVE_VMLALDAVBase_XA<iname, suffix, sz, bit_28, 0b0, 0b1, pattern>; + bit bit_28, list<dag> pattern=[]> { + defm "" : MVE_VMLALDAVBase_AX<iname, suffix, sz, bit_28, 0b0, 0b1, pattern>; } -defm MVE_VMLSLDAVs16 : MVE_VMLSLDAV_multi<"vmlsldav", "s16", 0b0, 0b0>; -defm MVE_VMLSLDAVs32 : MVE_VMLSLDAV_multi<"vmlsldav", "s32", 0b1, 0b0>; -defm MVE_VRMLSLDAVHs32 : MVE_VMLSLDAV_multi<"vrmlsldavh", "s32", 0b0, 0b1>; +defm MVE_VMLSLDAV : MVE_VMLSLDAV_multi<"vmlsldav", "s16", 0b0, 0b0>; +defm MVE_VMLSLDAV : MVE_VMLSLDAV_multi<"vmlsldav", "s32", 0b1, 0b0>; +defm MVE_VRMLSLDAVH : MVE_VMLSLDAV_multi<"vrmlsldavh", "s32", 0b0, 0b1>; // end of mve_rDest instructions @@ -967,11 +1031,12 @@ def MVE_VBIC : MVE_bit_arith<(outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm), let Inst{6} = 0b1; let Inst{4} = 0b1; let Inst{0} = 0b0; + let validForTailPredication = 1; } -class MVE_VREV<string iname, string suffix, bits<2> size, bits<2> bit_8_7> +class MVE_VREV<string iname, string suffix, bits<2> size, bits<2> bit_8_7, string cstr=""> : MVE_bit_arith<(outs MQPR:$Qd), (ins MQPR:$Qm), iname, - suffix, "$Qd, $Qm", ""> { + suffix, "$Qd, $Qm", cstr> { let Inst{28} = 0b1; let Inst{25-23} = 0b111; @@ -985,9 +1050,9 @@ class MVE_VREV<string iname, string suffix, bits<2> size, bits<2> bit_8_7> let Inst{0} = 0b0; } -def MVE_VREV64_8 : MVE_VREV<"vrev64", "8", 0b00, 0b00>; -def MVE_VREV64_16 : MVE_VREV<"vrev64", "16", 0b01, 0b00>; -def MVE_VREV64_32 : MVE_VREV<"vrev64", "32", 0b10, 0b00>; +def MVE_VREV64_8 : MVE_VREV<"vrev64", "8", 0b00, 0b00, "@earlyclobber $Qd">; +def MVE_VREV64_16 : MVE_VREV<"vrev64", "16", 0b01, 0b00, "@earlyclobber $Qd">; +def MVE_VREV64_32 : MVE_VREV<"vrev64", "32", 0b10, 0b00, "@earlyclobber $Qd">; def MVE_VREV32_8 : MVE_VREV<"vrev32", "8", 0b00, 0b01>; def MVE_VREV32_16 : MVE_VREV<"vrev32", "16", 0b01, 0b01>; @@ -995,6 +1060,13 @@ def MVE_VREV32_16 : MVE_VREV<"vrev32", "16", 0b01, 0b01>; def MVE_VREV16_8 : MVE_VREV<"vrev16", "8", 0b00, 0b10>; let Predicates = [HasMVEInt] in { + def : Pat<(v8i16 (bswap (v8i16 MQPR:$src))), + (v8i16 (MVE_VREV16_8 (v8i16 MQPR:$src)))>; + def : Pat<(v4i32 (bswap (v4i32 MQPR:$src))), + (v4i32 (MVE_VREV32_8 (v4i32 MQPR:$src)))>; +} + +let Predicates = [HasMVEInt] in { def : Pat<(v4i32 (ARMvrev64 (v4i32 MQPR:$src))), (v4i32 (MVE_VREV64_32 (v4i32 MQPR:$src)))>; def : Pat<(v8i16 (ARMvrev64 (v8i16 MQPR:$src))), @@ -1026,6 +1098,7 @@ def MVE_VMVN : 
MVE_bit_arith<(outs MQPR:$Qd), (ins MQPR:$Qm), let Inst{12-6} = 0b0010111; let Inst{4} = 0b0; let Inst{0} = 0b0; + let validForTailPredication = 1; } let Predicates = [HasMVEInt] in { @@ -1054,6 +1127,7 @@ class MVE_bit_ops<string iname, bits<2> bit_21_20, bit bit_28> let Inst{6} = 0b1; let Inst{4} = 0b1; let Inst{0} = 0b0; + let validForTailPredication = 1; } def MVE_VEOR : MVE_bit_ops<"veor", 0b00, 0b1>; @@ -1145,6 +1219,7 @@ class MVE_bit_cmode<string iname, string suffix, bits<4> cmode, dag inOps> class MVE_VORR<string suffix, bits<4> cmode, ExpandImm imm_type> : MVE_bit_cmode<"vorr", suffix, cmode, (ins MQPR:$Qd_src, imm_type:$imm)> { let Inst{5} = 0b0; + let validForTailPredication = 1; } def MVE_VORRIZ0v4i32 : MVE_VORR<"i32", 0b0001, expzero00>; @@ -1173,6 +1248,7 @@ def MVE_VMOV : MVEInstAlias<"vmov${vp}\t$Qd, $Qm", class MVE_VBIC<string suffix, bits<4> cmode, ExpandImm imm_type> : MVE_bit_cmode<"vbic", suffix, cmode, (ins MQPR:$Qd_src, imm_type:$imm)> { let Inst{5} = 0b1; + let validForTailPredication = 1; } def MVE_VBICIZ0v4i32 : MVE_VBIC<"i32", 0b0001, expzero00>; @@ -1315,8 +1391,12 @@ let Predicates = [HasMVEInt] in { def : Pat<(insertelt (v8f16 MQPR:$src1), HPR:$src2, imm:$lane), (MVE_VMOV_to_lane_16 MQPR:$src1, (COPY_TO_REGCLASS HPR:$src2, rGPR), imm:$lane)>; - def : Pat<(extractelt (v8f16 MQPR:$src), imm:$lane), - (COPY_TO_REGCLASS (MVE_VMOV_from_lane_u16 MQPR:$src, imm:$lane), HPR)>; + def : Pat<(extractelt (v8f16 MQPR:$src), imm_even:$lane), + (EXTRACT_SUBREG MQPR:$src, (SSubReg_f16_reg imm_even:$lane))>; + def : Pat<(extractelt (v8f16 MQPR:$src), imm_odd:$lane), + (COPY_TO_REGCLASS + (VMOVH (EXTRACT_SUBREG MQPR:$src, (SSubReg_f16_reg imm_odd:$lane))), + HPR)>; def : Pat<(v4f32 (scalar_to_vector SPR:$src)), (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), SPR:$src, ssub_0)>; @@ -1408,6 +1488,7 @@ class MVE_VADDSUB<string iname, string suffix, bits<2> size, bit subtract, let Inst{12-8} = 0b01000; let Inst{4} = 0b0; let Inst{0} = 0b0; + let validForTailPredication = 1; } class MVE_VADD<string suffix, bits<2> size, list<dag> pattern=[]> @@ -1442,8 +1523,8 @@ let Predicates = [HasMVEInt] in { } class MVE_VQADDSUB<string iname, string suffix, bit U, bit subtract, - bits<2> size, list<dag> pattern=[]> - : MVE_int<iname, suffix, size, pattern> { + bits<2> size, ValueType vt> + : MVE_int<iname, suffix, size, []> { let Inst{28} = U; let Inst{25-23} = 0b110; @@ -1453,26 +1534,49 @@ class MVE_VQADDSUB<string iname, string suffix, bit U, bit subtract, let Inst{8} = 0b0; let Inst{4} = 0b1; let Inst{0} = 0b0; + let validForTailPredication = 1; + + ValueType VT = vt; } -class MVE_VQADD<string suffix, bit U, bits<2> size, list<dag> pattern=[]> - : MVE_VQADDSUB<"vqadd", suffix, U, 0b0, size, pattern>; -class MVE_VQSUB<string suffix, bit U, bits<2> size, list<dag> pattern=[]> - : MVE_VQADDSUB<"vqsub", suffix, U, 0b1, size, pattern>; +class MVE_VQADD<string suffix, bit U, bits<2> size, ValueType VT> + : MVE_VQADDSUB<"vqadd", suffix, U, 0b0, size, VT>; +class MVE_VQSUB<string suffix, bit U, bits<2> size, ValueType VT> + : MVE_VQADDSUB<"vqsub", suffix, U, 0b1, size, VT>; -def MVE_VQADDs8 : MVE_VQADD<"s8", 0b0, 0b00>; -def MVE_VQADDs16 : MVE_VQADD<"s16", 0b0, 0b01>; -def MVE_VQADDs32 : MVE_VQADD<"s32", 0b0, 0b10>; -def MVE_VQADDu8 : MVE_VQADD<"u8", 0b1, 0b00>; -def MVE_VQADDu16 : MVE_VQADD<"u16", 0b1, 0b01>; -def MVE_VQADDu32 : MVE_VQADD<"u32", 0b1, 0b10>; +def MVE_VQADDs8 : MVE_VQADD<"s8", 0b0, 0b00, v16i8>; +def MVE_VQADDs16 : MVE_VQADD<"s16", 0b0, 0b01, v8i16>; +def MVE_VQADDs32 : MVE_VQADD<"s32", 
0b0, 0b10, v4i32>; +def MVE_VQADDu8 : MVE_VQADD<"u8", 0b1, 0b00, v16i8>; +def MVE_VQADDu16 : MVE_VQADD<"u16", 0b1, 0b01, v8i16>; +def MVE_VQADDu32 : MVE_VQADD<"u32", 0b1, 0b10, v4i32>; + +def MVE_VQSUBs8 : MVE_VQSUB<"s8", 0b0, 0b00, v16i8>; +def MVE_VQSUBs16 : MVE_VQSUB<"s16", 0b0, 0b01, v8i16>; +def MVE_VQSUBs32 : MVE_VQSUB<"s32", 0b0, 0b10, v4i32>; +def MVE_VQSUBu8 : MVE_VQSUB<"u8", 0b1, 0b00, v16i8>; +def MVE_VQSUBu16 : MVE_VQSUB<"u16", 0b1, 0b01, v8i16>; +def MVE_VQSUBu32 : MVE_VQSUB<"u32", 0b1, 0b10, v4i32>; + +let Predicates = [HasMVEInt] in { + foreach instr = [MVE_VQADDu8, MVE_VQADDu16, MVE_VQADDu32] in + foreach VT = [instr.VT] in + def : Pat<(VT (uaddsat (VT MQPR:$Qm), (VT MQPR:$Qn))), + (VT (instr (VT MQPR:$Qm), (VT MQPR:$Qn)))>; + foreach instr = [MVE_VQADDs8, MVE_VQADDs16, MVE_VQADDs32] in + foreach VT = [instr.VT] in + def : Pat<(VT (saddsat (VT MQPR:$Qm), (VT MQPR:$Qn))), + (VT (instr (VT MQPR:$Qm), (VT MQPR:$Qn)))>; + foreach instr = [MVE_VQSUBu8, MVE_VQSUBu16, MVE_VQSUBu32] in + foreach VT = [instr.VT] in + def : Pat<(VT (usubsat (VT MQPR:$Qm), (VT MQPR:$Qn))), + (VT (instr (VT MQPR:$Qm), (VT MQPR:$Qn)))>; + foreach instr = [MVE_VQSUBs8, MVE_VQSUBs16, MVE_VQSUBs32] in + foreach VT = [instr.VT] in + def : Pat<(VT (ssubsat (VT MQPR:$Qm), (VT MQPR:$Qn))), + (VT (instr (VT MQPR:$Qm), (VT MQPR:$Qn)))>; +} -def MVE_VQSUBs8 : MVE_VQSUB<"s8", 0b0, 0b00>; -def MVE_VQSUBs16 : MVE_VQSUB<"s16", 0b0, 0b01>; -def MVE_VQSUBs32 : MVE_VQSUB<"s32", 0b0, 0b10>; -def MVE_VQSUBu8 : MVE_VQSUB<"u8", 0b1, 0b00>; -def MVE_VQSUBu16 : MVE_VQSUB<"u16", 0b1, 0b01>; -def MVE_VQSUBu32 : MVE_VQSUB<"u32", 0b1, 0b10>; class MVE_VABD_int<string suffix, bit U, bits<2> size, list<dag> pattern=[]> : MVE_int<"vabd", suffix, size, pattern> { @@ -1483,6 +1587,7 @@ class MVE_VABD_int<string suffix, bit U, bits<2> size, list<dag> pattern=[]> let Inst{12-8} = 0b00111; let Inst{4} = 0b0; let Inst{0} = 0b0; + let validForTailPredication = 1; } def MVE_VABDs8 : MVE_VABD_int<"s8", 0b0, 0b00>; @@ -1501,6 +1606,7 @@ class MVE_VRHADD<string suffix, bit U, bits<2> size, list<dag> pattern=[]> let Inst{12-8} = 0b00001; let Inst{4} = 0b0; let Inst{0} = 0b0; + let validForTailPredication = 1; } def MVE_VRHADDs8 : MVE_VRHADD<"s8", 0b0, 0b00>; @@ -1522,6 +1628,7 @@ class MVE_VHADDSUB<string iname, string suffix, bit U, bit subtract, let Inst{8} = 0b0; let Inst{4} = 0b0; let Inst{0} = 0b0; + let validForTailPredication = 1; } class MVE_VHADD<string suffix, bit U, bits<2> size, @@ -1545,6 +1652,60 @@ def MVE_VHSUBu8 : MVE_VHSUB<"u8", 0b1, 0b00>; def MVE_VHSUBu16 : MVE_VHSUB<"u16", 0b1, 0b01>; def MVE_VHSUBu32 : MVE_VHSUB<"u32", 0b1, 0b10>; +let Predicates = [HasMVEInt] in { + def : Pat<(v16i8 (ARMvshrsImm + (v16i8 (add (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))), 1)), + (v16i8 (MVE_VHADDs8 + (v16i8 MQPR:$v1), (v16i8 MQPR:$v2)))>; + def : Pat<(v8i16 (ARMvshrsImm + (v8i16 (add (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))), 1)), + (v8i16 (MVE_VHADDs16 + (v8i16 MQPR:$v1), (v8i16 MQPR:$v2)))>; + def : Pat<(v4i32 (ARMvshrsImm + (v4i32 (add (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))), 1)), + (v4i32 (MVE_VHADDs32 + (v4i32 MQPR:$v1), (v4i32 MQPR:$v2)))>; + + def : Pat<(v16i8 (ARMvshruImm + (v16i8 (add (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))), 1)), + (v16i8 (MVE_VHADDu8 + (v16i8 MQPR:$v1), (v16i8 MQPR:$v2)))>; + def : Pat<(v8i16 (ARMvshruImm + (v8i16 (add (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))), 1)), + (v8i16 (MVE_VHADDu16 + (v8i16 MQPR:$v1), (v8i16 MQPR:$v2)))>; + def : Pat<(v4i32 (ARMvshruImm + (v4i32 (add (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))), 1)), + (v4i32 
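(Illustrative aside, not part of the patch: the uaddsat/saddsat/usubsat/ssubsat patterns added above give VQADD/VQSUB a DAG selection. The C below is one plain-C idiom LLVM can fold into those nodes; whether the fold fires depends on optimizer version and flags, and the helper names are invented.)

    #include <stdint.h>

    /* Unsigned saturating add/sub written as plain C; when folded to the
       uaddsat/usubsat nodes, the new VQADDu8/VQSUBu8 patterns select them. */
    static inline uint8_t qadd_u8(uint8_t a, uint8_t b) {
      uint8_t r = (uint8_t)(a + b);
      return r < a ? 0xff : r;             /* wrapped -> saturate to 255 */
    }

    static inline uint8_t qsub_u8(uint8_t a, uint8_t b) {
      return a > b ? (uint8_t)(a - b) : 0; /* would underflow -> clamp at 0 */
    }

    void qadd_loop(uint8_t *d, const uint8_t *a, const uint8_t *b, int n) {
      for (int i = 0; i < n; i++)
        d[i] = qadd_u8(a[i], b[i]);
    }
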
(MVE_VHADDu32 + (v4i32 MQPR:$v1), (v4i32 MQPR:$v2)))>; + + def : Pat<(v16i8 (ARMvshrsImm + (v16i8 (sub (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))), 1)), + (v16i8 (MVE_VHSUBs8 + (v16i8 MQPR:$v1), (v16i8 MQPR:$v2)))>; + def : Pat<(v8i16 (ARMvshrsImm + (v8i16 (sub (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))), 1)), + (v8i16 (MVE_VHSUBs16 + (v8i16 MQPR:$v1), (v8i16 MQPR:$v2)))>; + def : Pat<(v4i32 (ARMvshrsImm + (v4i32 (sub (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))), 1)), + (v4i32 (MVE_VHSUBs32 + (v4i32 MQPR:$v1), (v4i32 MQPR:$v2)))>; + + def : Pat<(v16i8 (ARMvshruImm + (v16i8 (sub (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))), 1)), + (v16i8 (MVE_VHSUBu8 + (v16i8 MQPR:$v1), (v16i8 MQPR:$v2)))>; + def : Pat<(v8i16 (ARMvshruImm + (v8i16 (sub (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))), 1)), + (v8i16 (MVE_VHSUBu16 + (v8i16 MQPR:$v1), (v8i16 MQPR:$v2)))>; + def : Pat<(v4i32 (ARMvshruImm + (v4i32 (sub (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))), 1)), + (v4i32 (MVE_VHSUBu32 + (v4i32 MQPR:$v1), (v4i32 MQPR:$v2)))>; +} + class MVE_VDUP<string suffix, bit B, bit E, list<dag> pattern=[]> : MVE_p<(outs MQPR:$Qd), (ins rGPR:$Rt), NoItinerary, "vdup", suffix, "$Qd, $Rt", vpred_r, "", pattern> { @@ -1563,6 +1724,7 @@ class MVE_VDUP<string suffix, bit B, bit E, list<dag> pattern=[]> let Inst{6} = 0b0; let Inst{5} = E; let Inst{4-0} = 0b10000; + let validForTailPredication = 1; } def MVE_VDUP32 : MVE_VDUP<"32", 0b0, 0b0>; @@ -1625,6 +1787,7 @@ class MVE_VCLSCLZ<string iname, string suffix, bits<2> size, let Inst{6} = 0b1; let Inst{4} = 0b0; let Inst{0} = 0b0; + let validForTailPredication = 1; } def MVE_VCLSs8 : MVE_VCLSCLZ<"vcls", "s8", 0b00, 0b0>; @@ -1635,6 +1798,15 @@ def MVE_VCLZs8 : MVE_VCLSCLZ<"vclz", "i8", 0b00, 0b1>; def MVE_VCLZs16 : MVE_VCLSCLZ<"vclz", "i16", 0b01, 0b1>; def MVE_VCLZs32 : MVE_VCLSCLZ<"vclz", "i32", 0b10, 0b1>; +let Predicates = [HasMVEInt] in { + def : Pat<(v16i8 ( ctlz (v16i8 MQPR:$val1))), + (v16i8 ( MVE_VCLZs8 (v16i8 MQPR:$val1)))>; + def : Pat<(v4i32 ( ctlz (v4i32 MQPR:$val1))), + (v4i32 ( MVE_VCLZs32 (v4i32 MQPR:$val1)))>; + def : Pat<(v8i16 ( ctlz (v8i16 MQPR:$val1))), + (v8i16 ( MVE_VCLZs16 (v8i16 MQPR:$val1)))>; +} + class MVE_VABSNEG_int<string iname, string suffix, bits<2> size, bit negate, list<dag> pattern=[]> : MVEIntSingleSrc<iname, suffix, size, pattern> { @@ -1648,6 +1820,7 @@ class MVE_VABSNEG_int<string iname, string suffix, bits<2> size, bit negate, let Inst{6} = 0b1; let Inst{4} = 0b0; let Inst{0} = 0b0; + let validForTailPredication = 1; } def MVE_VABSs8 : MVE_VABSNEG_int<"vabs", "s8", 0b00, 0b0>; @@ -1689,6 +1862,7 @@ class MVE_VQABSNEG<string iname, string suffix, bits<2> size, let Inst{6} = 0b1; let Inst{4} = 0b0; let Inst{0} = 0b0; + let validForTailPredication = 1; } def MVE_VQABSs8 : MVE_VQABSNEG<"vqabs", "s8", 0b00, 0b0>; @@ -1720,6 +1894,7 @@ class MVE_mod_imm<string iname, string suffix, bits<4> cmode, bit op, let Inst{3-0} = imm{3-0}; let DecoderMethod = "DecodeMVEModImmInstruction"; + let validForTailPredication = 1; } let isReMaterializable = 1 in { @@ -2115,6 +2290,7 @@ class MVE_shift_by_vec<string iname, string suffix, bit U, let Inst{4} = bit_4; let Inst{3-1} = Qm{2-0}; let Inst{0} = 0b0; + let validForTailPredication = 1; } multiclass mve_shift_by_vec_multi<string iname, bit bit_4, bit bit_8> { @@ -2163,6 +2339,7 @@ class MVE_shift_with_imm<string iname, string suffix, dag oops, dag iops, let Inst{4} = 0b1; let Inst{3-1} = Qm{2-0}; let Inst{0} = 0b0; + let validForTailPredication = 1; } class MVE_VSxI_imm<string iname, string suffix, bit bit_8, dag imm> @@ -2175,6 +2352,7 @@ class 
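(Illustrative aside, not part of the patch: the ARMvshrsImm/ARMvshruImm-of-add patterns above match a same-width "(a + b) >> 1" for VHADD, and the ctlz patterns map __builtin_clz-style code onto VCLZ. Whether the midend narrows the arithmetic to exactly that form is not guaranteed; sketch only, invented function names.)

    #include <stdint.h>

    /* Halving add ("average") and count-leading-zeros loops. */
    void avg_u8(uint8_t *d, const uint8_t *a, const uint8_t *b, int n) {
      for (int i = 0; i < n; i++)
        d[i] = (uint8_t)(((unsigned)a[i] + b[i]) >> 1);
    }

    void clz_u32(uint32_t *d, const uint32_t *s, int n) {
      for (int i = 0; i < n; i++)
        d[i] = s[i] ? (uint32_t)__builtin_clz(s[i]) : 32u; /* clz(0) is UB */
    }
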
MVE_VSxI_imm<string iname, string suffix, bit bit_8, dag imm> let Inst{21-16} = imm; let Inst{10-9} = 0b10; let Inst{8} = bit_8; + let validForTailPredication = 1; } def MVE_VSRIimm8 : MVE_VSxI_imm<"vsri", "8", 0b0, (ins shr_imm8:$imm)> { @@ -2427,6 +2605,7 @@ class MVE_VRINT<string rmode, bits<3> op, string suffix, bits<2> size, let Inst{11-10} = 0b01; let Inst{9-7} = op{2-0}; let Inst{4} = 0b0; + let validForTailPredication = 1; } @@ -2489,6 +2668,7 @@ class MVE_VMUL_fp<string suffix, bit size, list<dag> pattern=[]> let Inst{12-8} = 0b01101; let Inst{7} = Qn{3}; let Inst{4} = 0b1; + let validForTailPredication = 1; } def MVE_VMULf32 : MVE_VMUL_fp<"f32", 0b0>; @@ -2556,8 +2736,38 @@ def MVE_VFMSf32 : MVE_VADDSUBFMA_fp<"vfms", "f32", 0b0, 0b1, 0b0, 0b1, def MVE_VFMSf16 : MVE_VADDSUBFMA_fp<"vfms", "f16", 0b1, 0b1, 0b0, 0b1, (ins MQPR:$Qd_src), vpred_n, "$Qd = $Qd_src">; -def MVE_VADDf32 : MVE_VADDSUBFMA_fp<"vadd", "f32", 0b0, 0b0, 0b1, 0b0>; -def MVE_VADDf16 : MVE_VADDSUBFMA_fp<"vadd", "f16", 0b1, 0b0, 0b1, 0b0>; +let Predicates = [HasMVEFloat, UseFusedMAC] in { + def : Pat<(v8f16 (fadd (v8f16 MQPR:$src1), + (fmul (v8f16 MQPR:$src2), + (v8f16 MQPR:$src3)))), + (v8f16 (MVE_VFMAf16 $src1, $src2, $src3))>; + def : Pat<(v4f32 (fadd (v4f32 MQPR:$src1), + (fmul (v4f32 MQPR:$src2), + (v4f32 MQPR:$src3)))), + (v4f32 (MVE_VFMAf32 $src1, $src2, $src3))>; + + def : Pat<(v8f16 (fsub (v8f16 MQPR:$src1), + (fmul (v8f16 MQPR:$src2), + (v8f16 MQPR:$src3)))), + (v8f16 (MVE_VFMSf16 $src1, $src2, $src3))>; + def : Pat<(v4f32 (fsub (v4f32 MQPR:$src1), + (fmul (v4f32 MQPR:$src2), + (v4f32 MQPR:$src3)))), + (v4f32 (MVE_VFMSf32 $src1, $src2, $src3))>; +} + +let Predicates = [HasMVEFloat] in { + def : Pat<(v8f16 (fma (v8f16 MQPR:$src1), (v8f16 MQPR:$src2), (v8f16 MQPR:$src3))), + (v8f16 (MVE_VFMAf16 $src3, $src1, $src2))>; + def : Pat<(v4f32 (fma (v4f32 MQPR:$src1), (v4f32 MQPR:$src2), (v4f32 MQPR:$src3))), + (v4f32 (MVE_VFMAf32 $src3, $src1, $src2))>; +} + + +let validForTailPredication = 1 in { + def MVE_VADDf32 : MVE_VADDSUBFMA_fp<"vadd", "f32", 0b0, 0b0, 0b1, 0b0>; + def MVE_VADDf16 : MVE_VADDSUBFMA_fp<"vadd", "f16", 0b1, 0b0, 0b1, 0b0>; +} let Predicates = [HasMVEFloat] in { def : Pat<(v4f32 (fadd (v4f32 MQPR:$val1), (v4f32 MQPR:$val2))), @@ -2566,8 +2776,11 @@ let Predicates = [HasMVEFloat] in { (v8f16 (MVE_VADDf16 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2)))>; } -def MVE_VSUBf32 : MVE_VADDSUBFMA_fp<"vsub", "f32", 0b0, 0b0, 0b1, 0b1>; -def MVE_VSUBf16 : MVE_VADDSUBFMA_fp<"vsub", "f16", 0b1, 0b0, 0b1, 0b1>; + +let validForTailPredication = 1 in { + def MVE_VSUBf32 : MVE_VADDSUBFMA_fp<"vsub", "f32", 0b0, 0b0, 0b1, 0b1>; + def MVE_VSUBf16 : MVE_VADDSUBFMA_fp<"vsub", "f16", 0b1, 0b0, 0b1, 0b1>; +} let Predicates = [HasMVEFloat] in { def : Pat<(v4f32 (fsub (v4f32 MQPR:$val1), (v4f32 MQPR:$val2))), @@ -2576,10 +2789,10 @@ let Predicates = [HasMVEFloat] in { (v8f16 (MVE_VSUBf16 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2)))>; } -class MVE_VCADD<string suffix, bit size, list<dag> pattern=[]> +class MVE_VCADD<string suffix, bit size, string cstr="", list<dag> pattern=[]> : MVEFloatArithNeon<"vcadd", suffix, size, (outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm, complexrotateopodd:$rot), - "$Qd, $Qn, $Qm, $rot", vpred_r, "", pattern> { + "$Qd, $Qn, $Qm, $rot", vpred_r, cstr, pattern> { bits<4> Qd; bits<4> Qn; bit rot; @@ -2598,7 +2811,7 @@ class MVE_VCADD<string suffix, bit size, list<dag> pattern=[]> } def MVE_VCADDf16 : MVE_VCADD<"f16", 0b0>; -def MVE_VCADDf32 : MVE_VCADD<"f32", 0b1>; +def MVE_VCADDf32 : MVE_VCADD<"f32", 0b1, 
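(Illustrative aside, not part of the patch: the fma patterns above select MVE_VFMAf16/f32 directly, while the fadd(fmul)/fsub(fmul) patterns only apply under the backend's fused-MAC setting, the UseFusedMAC predicate, e.g. with something like -ffp-contract=fast. A minimal sketch; the function name is invented.)

    #include <math.h>

    /* fmaf produces the FMA node matched by the new MVE_VFMAf32 pattern. */
    void fma_f32(float *acc, const float *a, const float *b, int n) {
      for (int i = 0; i < n; i++)
        acc[i] = fmaf(a[i], b[i], acc[i]);
    }
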
"@earlyclobber $Qd">; class MVE_VABD_fp<string suffix, bit size> : MVE_float<"vabd", suffix, (outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm), @@ -2617,6 +2830,7 @@ class MVE_VABD_fp<string suffix, bit size> let Inst{11-8} = 0b1101; let Inst{7} = Qn{3}; let Inst{4} = 0b0; + let validForTailPredication = 1; } def MVE_VABDf32 : MVE_VABD_fp<"f32", 0b0>; @@ -2643,6 +2857,7 @@ class MVE_VCVT_fix<string suffix, bit fsi, bit U, bit op, let Inst{4} = 0b1; let DecoderMethod = "DecodeMVEVCVTt1fp"; + let validForTailPredication = 1; } class MVE_VCVT_imm_asmop<int Bits> : AsmOperandClass { @@ -2693,6 +2908,7 @@ class MVE_VCVT_fp_int_anpm<string suffix, bits<2> size, bit op, string anpm, let Inst{9-8} = rm; let Inst{7} = op; let Inst{4} = 0b0; + let validForTailPredication = 1; } multiclass MVE_VCVT_fp_int_anpm_multi<string suffix, bits<2> size, bit op, @@ -2727,6 +2943,7 @@ class MVE_VCVT_fp_int<string suffix, bits<2> size, bits<2> op, let Inst{12-9} = 0b0011; let Inst{8-7} = op; let Inst{4} = 0b0; + let validForTailPredication = 1; } // The unsuffixed VCVT for float->int implicitly rounds toward zero, @@ -2776,6 +2993,7 @@ class MVE_VABSNEG_fp<string iname, string suffix, bits<2> size, bit negate, let Inst{11-8} = 0b0111; let Inst{7} = negate; let Inst{4} = 0b0; + let validForTailPredication = 1; } def MVE_VABSf16 : MVE_VABSNEG_fp<"vabs", "f16", 0b01, 0b0>; @@ -2863,6 +3081,7 @@ class MVE_VCMPqq<string suffix, bit bit_28, bits<2> bits_21_20, // decoder to emit an operand that isn't affected by any instruction // bit. let DecoderMethod = "DecodeMVEVCMP<false," # predtype.DecoderMethod # ">"; + let validForTailPredication = 1; } class MVE_VCMPqqf<string suffix, bit size> @@ -2927,6 +3146,7 @@ class MVE_VCMPqr<string suffix, bit bit_28, bits<2> bits_21_20, let Constraints = ""; // Custom decoder method, for the same reason as MVE_VCMPqq let DecoderMethod = "DecodeMVEVCMP<true," # predtype.DecoderMethod # ">"; + let validForTailPredication = 1; } class MVE_VCMPqrf<string suffix, bit size> @@ -2966,6 +3186,168 @@ def MVE_VCMPs8r : MVE_VCMPqrs<"s8", 0b00>; def MVE_VCMPs16r : MVE_VCMPqrs<"s16", 0b01>; def MVE_VCMPs32r : MVE_VCMPqrs<"s32", 0b10>; +multiclass unpred_vcmp_z<string suffix, int fc> { + def i8 : Pat<(v16i1 (ARMvcmpz (v16i8 MQPR:$v1), (i32 fc))), + (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), ZR, fc))>; + def i16 : Pat<(v8i1 (ARMvcmpz (v8i16 MQPR:$v1), (i32 fc))), + (v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), ZR, fc))>; + def i32 : Pat<(v4i1 (ARMvcmpz (v4i32 MQPR:$v1), (i32 fc))), + (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), ZR, fc))>; + + def : Pat<(v16i1 (and (v16i1 VCCR:$p1), (v16i1 (ARMvcmpz (v16i8 MQPR:$v1), (i32 fc))))), + (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), ZR, fc, 1, VCCR:$p1))>; + def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmpz (v8i16 MQPR:$v1), (i32 fc))))), + (v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), ZR, fc, 1, VCCR:$p1))>; + def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmpz (v4i32 MQPR:$v1), (i32 fc))))), + (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), ZR, fc, 1, VCCR:$p1))>; +} + +multiclass unpred_vcmp_r<string suffix, int fc> { + def i8 : Pat<(v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), (i32 fc))), + (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8") (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), fc))>; + def i16 : Pat<(v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 MQPR:$v2), (i32 fc))), + (v8i1 
(!cast<Instruction>("MVE_VCMP"#suffix#"16") (v8i16 MQPR:$v1), (v8i16 MQPR:$v2), fc))>; + def i32 : Pat<(v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), (i32 fc))), + (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32") (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), fc))>; + + def i8r : Pat<(v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 (ARMvdup GPR:$v2)), (i32 fc))), + (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), (i32 GPR:$v2), fc))>; + def i16r : Pat<(v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 (ARMvdup GPR:$v2)), (i32 fc))), + (v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), (i32 GPR:$v2), fc))>; + def i32r : Pat<(v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 (ARMvdup GPR:$v2)), (i32 fc))), + (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), (i32 GPR:$v2), fc))>; + + def : Pat<(v16i1 (and (v16i1 VCCR:$p1), (v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), (i32 fc))))), + (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8") (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), fc, 1, VCCR:$p1))>; + def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 MQPR:$v2), (i32 fc))))), + (v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16") (v8i16 MQPR:$v1), (v8i16 MQPR:$v2), fc, 1, VCCR:$p1))>; + def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), (i32 fc))))), + (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32") (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), fc, 1, VCCR:$p1))>; + + def : Pat<(v16i1 (and (v16i1 VCCR:$p1), (v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 (ARMvdup GPR:$v2)), (i32 fc))))), + (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), (i32 GPR:$v2), fc, 1, VCCR:$p1))>; + def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 (ARMvdup GPR:$v2)), (i32 fc))))), + (v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), (i32 GPR:$v2), fc, 1, VCCR:$p1))>; + def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 (ARMvdup GPR:$v2)), (i32 fc))))), + (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), (i32 GPR:$v2), fc, 1, VCCR:$p1))>; +} + +multiclass unpred_vcmpf_z<int fc> { + def f16 : Pat<(v8i1 (ARMvcmpz (v8f16 MQPR:$v1), (i32 fc))), + (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), ZR, fc))>; + def f32 : Pat<(v4i1 (ARMvcmpz (v4f32 MQPR:$v1), (i32 fc))), + (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), ZR, fc))>; + + def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmpz (v8f16 MQPR:$v1), (i32 fc))))), + (v8i1 (MVE_VCMPf32r (v8f16 MQPR:$v1), ZR, fc, 1, VCCR:$p1))>; + def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmpz (v4f32 MQPR:$v1), (i32 fc))))), + (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), ZR, fc, 1, VCCR:$p1))>; +} + +multiclass unpred_vcmpf_r<int fc> { + def f16 : Pat<(v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), (i32 fc))), + (v8i1 (MVE_VCMPf16 (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc))>; + def f32 : Pat<(v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), (i32 fc))), + (v4i1 (MVE_VCMPf32 (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc))>; + + def f16r : Pat<(v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 (ARMvdup HPR:$v2)), (i32 fc))), + (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f16 HPR:$v2), rGPR)), fc))>; + def f32r : Pat<(v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 (ARMvdup SPR:$v2)), (i32 fc))), + (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f32 SPR:$v2), rGPR)), fc))>; + + def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), (i32 fc))))), + (v8i1 (MVE_VCMPf16 (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc, 
1, VCCR:$p1))>; + def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), (i32 fc))))), + (v4i1 (MVE_VCMPf32 (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc, 1, VCCR:$p1))>; + + def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 (ARMvdup HPR:$v2)), (i32 fc))))), + (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f16 HPR:$v2), rGPR)), fc, 1, VCCR:$p1))>; + def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 (ARMvdup SPR:$v2)), (i32 fc))))), + (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f32 SPR:$v2), rGPR)), fc, 1, VCCR:$p1))>; +} + +let Predicates = [HasMVEInt] in { + defm MVE_VCEQZ : unpred_vcmp_z<"i", 0>; + defm MVE_VCNEZ : unpred_vcmp_z<"i", 1>; + defm MVE_VCGEZ : unpred_vcmp_z<"s", 10>; + defm MVE_VCLTZ : unpred_vcmp_z<"s", 11>; + defm MVE_VCGTZ : unpred_vcmp_z<"s", 12>; + defm MVE_VCLEZ : unpred_vcmp_z<"s", 13>; + defm MVE_VCGTUZ : unpred_vcmp_z<"u", 8>; + defm MVE_VCGEUZ : unpred_vcmp_z<"u", 2>; + + defm MVE_VCEQ : unpred_vcmp_r<"i", 0>; + defm MVE_VCNE : unpred_vcmp_r<"i", 1>; + defm MVE_VCGE : unpred_vcmp_r<"s", 10>; + defm MVE_VCLT : unpred_vcmp_r<"s", 11>; + defm MVE_VCGT : unpred_vcmp_r<"s", 12>; + defm MVE_VCLE : unpred_vcmp_r<"s", 13>; + defm MVE_VCGTU : unpred_vcmp_r<"u", 8>; + defm MVE_VCGEU : unpred_vcmp_r<"u", 2>; +} + +let Predicates = [HasMVEFloat] in { + defm MVE_VFCEQZ : unpred_vcmpf_z<0>; + defm MVE_VFCNEZ : unpred_vcmpf_z<1>; + defm MVE_VFCGEZ : unpred_vcmpf_z<10>; + defm MVE_VFCLTZ : unpred_vcmpf_z<11>; + defm MVE_VFCGTZ : unpred_vcmpf_z<12>; + defm MVE_VFCLEZ : unpred_vcmpf_z<13>; + + defm MVE_VFCEQ : unpred_vcmpf_r<0>; + defm MVE_VFCNE : unpred_vcmpf_r<1>; + defm MVE_VFCGE : unpred_vcmpf_r<10>; + defm MVE_VFCLT : unpred_vcmpf_r<11>; + defm MVE_VFCGT : unpred_vcmpf_r<12>; + defm MVE_VFCLE : unpred_vcmpf_r<13>; +} + + +// Extra "worst case" and/or/xor partterns, going into and out of GRP +multiclass two_predops<SDPatternOperator opnode, Instruction insn> { + def v16i1 : Pat<(v16i1 (opnode (v16i1 VCCR:$p1), (v16i1 VCCR:$p2))), + (v16i1 (COPY_TO_REGCLASS + (insn (i32 (COPY_TO_REGCLASS (v16i1 VCCR:$p1), rGPR)), + (i32 (COPY_TO_REGCLASS (v16i1 VCCR:$p2), rGPR))), + VCCR))>; + def v8i1 : Pat<(v8i1 (opnode (v8i1 VCCR:$p1), (v8i1 VCCR:$p2))), + (v8i1 (COPY_TO_REGCLASS + (insn (i32 (COPY_TO_REGCLASS (v8i1 VCCR:$p1), rGPR)), + (i32 (COPY_TO_REGCLASS (v8i1 VCCR:$p2), rGPR))), + VCCR))>; + def v4i1 : Pat<(v4i1 (opnode (v4i1 VCCR:$p1), (v4i1 VCCR:$p2))), + (v4i1 (COPY_TO_REGCLASS + (insn (i32 (COPY_TO_REGCLASS (v4i1 VCCR:$p1), rGPR)), + (i32 (COPY_TO_REGCLASS (v4i1 VCCR:$p2), rGPR))), + VCCR))>; +} + +let Predicates = [HasMVEInt] in { + defm POR : two_predops<or, t2ORRrr>; + defm PAND : two_predops<and, t2ANDrr>; + defm PEOR : two_predops<xor, t2EORrr>; +} + +// Occasionally we need to cast between a i32 and a boolean vector, for +// example when moving between rGPR and VPR.P0 as part of predicate vector +// shuffles. We also sometimes need to cast between different predicate +// vector types (v4i1<>v8i1, etc.) also as part of lowering vector shuffles. 
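(Illustrative aside, not part of the patch: the two_predops patterns above let an AND/OR/XOR of i1 vectors be done as a t2ANDrr/t2ORRrr/t2EORrr on the predicate value moved out to a GPR, rather than element-wise in a Q register. A sketch of the source shape only, with an invented function name; the lowering depends on how the vectorizer forms the masks.)

    #include <stdint.h>

    /* Two compares combined with a logical AND of their masks. */
    void select_band(int32_t *d, const int32_t *a, const int32_t *b, int n) {
      for (int i = 0; i < n; i++)
        if (a[i] > 0 && b[i] < 16)
          d[i] = a[i] + b[i];
    }
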
+ +def predicate_cast : SDNode<"ARMISD::PREDICATE_CAST", SDTUnaryOp>; + +let Predicates = [HasMVEInt] in { + foreach VT = [ v4i1, v8i1, v16i1 ] in { + def : Pat<(i32 (predicate_cast (VT VCCR:$src))), + (i32 (COPY_TO_REGCLASS (VT VCCR:$src), VCCR))>; + def : Pat<(VT (predicate_cast (i32 VCCR:$src))), + (VT (COPY_TO_REGCLASS (i32 VCCR:$src), VCCR))>; + + foreach VT2 = [ v4i1, v8i1, v16i1 ] in + def : Pat<(VT (predicate_cast (VT2 VCCR:$src))), + (VT (COPY_TO_REGCLASS (VT2 VCCR:$src), VCCR))>; + } +} + // end of MVE compares // start of MVE_qDest_qSrc @@ -2989,10 +3371,10 @@ class MVE_qDest_qSrc<string iname, string suffix, dag oops, dag iops, } class MVE_VQxDMLxDH<string iname, bit exch, bit round, bit subtract, - string suffix, bits<2> size, list<dag> pattern=[]> + string suffix, bits<2> size, string cstr="", list<dag> pattern=[]> : MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd), (ins MQPR:$Qd_src, MQPR:$Qn, MQPR:$Qm), "$Qd, $Qn, $Qm", - vpred_n, "$Qd = $Qd_src", pattern> { + vpred_n, "$Qd = $Qd_src"#cstr, pattern> { bits<4> Qn; let Inst{28} = subtract; @@ -3009,7 +3391,7 @@ multiclass MVE_VQxDMLxDH_multi<string iname, bit exch, bit round, bit subtract> { def s8 : MVE_VQxDMLxDH<iname, exch, round, subtract, "s8", 0b00>; def s16 : MVE_VQxDMLxDH<iname, exch, round, subtract, "s16", 0b01>; - def s32 : MVE_VQxDMLxDH<iname, exch, round, subtract, "s32", 0b10>; + def s32 : MVE_VQxDMLxDH<iname, exch, round, subtract, "s32", 0b10, ",@earlyclobber $Qd">; } defm MVE_VQDMLADH : MVE_VQxDMLxDH_multi<"vqdmladh", 0b0, 0b0, 0b0>; @@ -3021,10 +3403,10 @@ defm MVE_VQDMLSDHX : MVE_VQxDMLxDH_multi<"vqdmlsdhx", 0b1, 0b0, 0b1>; defm MVE_VQRDMLSDH : MVE_VQxDMLxDH_multi<"vqrdmlsdh", 0b0, 0b1, 0b1>; defm MVE_VQRDMLSDHX : MVE_VQxDMLxDH_multi<"vqrdmlsdhx", 0b1, 0b1, 0b1>; -class MVE_VCMUL<string iname, string suffix, bit size, list<dag> pattern=[]> +class MVE_VCMUL<string iname, string suffix, bit size, string cstr="", list<dag> pattern=[]> : MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm, complexrotateop:$rot), - "$Qd, $Qn, $Qm, $rot", vpred_r, "", pattern> { + "$Qd, $Qn, $Qm, $rot", vpred_r, cstr, pattern> { bits<4> Qn; bits<2> rot; @@ -3041,13 +3423,13 @@ class MVE_VCMUL<string iname, string suffix, bit size, list<dag> pattern=[]> } def MVE_VCMULf16 : MVE_VCMUL<"vcmul", "f16", 0b0>; -def MVE_VCMULf32 : MVE_VCMUL<"vcmul", "f32", 0b1>; +def MVE_VCMULf32 : MVE_VCMUL<"vcmul", "f32", 0b1, "@earlyclobber $Qd">; class MVE_VMULL<string iname, string suffix, bit bit_28, bits<2> bits_21_20, - bit T, list<dag> pattern=[]> + bit T, string cstr, list<dag> pattern=[]> : MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm), "$Qd, $Qn, $Qm", - vpred_r, "", pattern> { + vpred_r, cstr, pattern> { bits<4> Qd; bits<4> Qn; bits<4> Qm; @@ -3063,9 +3445,9 @@ class MVE_VMULL<string iname, string suffix, bit bit_28, bits<2> bits_21_20, } multiclass MVE_VMULL_multi<string iname, string suffix, - bit bit_28, bits<2> bits_21_20> { - def bh : MVE_VMULL<iname # "b", suffix, bit_28, bits_21_20, 0b0>; - def th : MVE_VMULL<iname # "t", suffix, bit_28, bits_21_20, 0b1>; + bit bit_28, bits<2> bits_21_20, string cstr=""> { + def bh : MVE_VMULL<iname # "b", suffix, bit_28, bits_21_20, 0b0, cstr>; + def th : MVE_VMULL<iname # "t", suffix, bit_28, bits_21_20, 0b1, cstr>; } // For integer multiplies, bits 21:20 encode size, and bit 28 signedness. 
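(Illustrative aside, not part of the patch: vmullb/vmullt, whose classes are defined above and instantiated just below, compute 16x16->32-bit products from the bottom/top lanes of their inputs, and the MVEvmovn patterns added a little further down cover the matching top/bottom-half narrowing moves. The C only shows the arithmetic; how the vectorizer chooses between widening loads and vmull is up to it, and the function name is invented.)

    #include <stdint.h>

    /* Widening multiply: each product is a 16x16 -> 32-bit multiply. */
    void widen_mul(int32_t *d, const int16_t *a, const int16_t *b, int n) {
      for (int i = 0; i < n; i++)
        d[i] = (int32_t)a[i] * b[i];
    }
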
@@ -3074,10 +3456,10 @@ multiclass MVE_VMULL_multi<string iname, string suffix, defm MVE_VMULLs8 : MVE_VMULL_multi<"vmull", "s8", 0b0, 0b00>; defm MVE_VMULLs16 : MVE_VMULL_multi<"vmull", "s16", 0b0, 0b01>; -defm MVE_VMULLs32 : MVE_VMULL_multi<"vmull", "s32", 0b0, 0b10>; +defm MVE_VMULLs32 : MVE_VMULL_multi<"vmull", "s32", 0b0, 0b10, "@earlyclobber $Qd">; defm MVE_VMULLu8 : MVE_VMULL_multi<"vmull", "u8", 0b1, 0b00>; defm MVE_VMULLu16 : MVE_VMULL_multi<"vmull", "u16", 0b1, 0b01>; -defm MVE_VMULLu32 : MVE_VMULL_multi<"vmull", "u32", 0b1, 0b10>; +defm MVE_VMULLu32 : MVE_VMULL_multi<"vmull", "u32", 0b1, 0b10, "@earlyclobber $Qd">; defm MVE_VMULLp8 : MVE_VMULL_multi<"vmull", "p8", 0b0, 0b11>; defm MVE_VMULLp16 : MVE_VMULL_multi<"vmull", "p16", 0b1, 0b11>; @@ -3144,6 +3526,18 @@ defm MVE_VQMOVNu32 : MVE_VxMOVxN_halves<"vqmovn", "u32", 0b1, 0b1, 0b01>; defm MVE_VQMOVUNs16 : MVE_VxMOVxN_halves<"vqmovun", "s16", 0b0, 0b0, 0b00>; defm MVE_VQMOVUNs32 : MVE_VxMOVxN_halves<"vqmovun", "s32", 0b0, 0b0, 0b01>; +def MVEvmovn : SDNode<"ARMISD::VMOVN", SDTARMVEXT>; +let Predicates = [HasMVEInt] in { + def : Pat<(v8i16 (MVEvmovn (v8i16 MQPR:$Qd_src), (v8i16 MQPR:$Qm), (i32 0))), + (v8i16 (MVE_VMOVNi32bh (v8i16 MQPR:$Qd_src), (v8i16 MQPR:$Qm)))>; + def : Pat<(v8i16 (MVEvmovn (v8i16 MQPR:$Qd_src), (v8i16 MQPR:$Qm), (i32 1))), + (v8i16 (MVE_VMOVNi32th (v8i16 MQPR:$Qd_src), (v8i16 MQPR:$Qm)))>; + def : Pat<(v16i8 (MVEvmovn (v16i8 MQPR:$Qd_src), (v16i8 MQPR:$Qm), (i32 0))), + (v16i8 (MVE_VMOVNi16bh (v16i8 MQPR:$Qd_src), (v16i8 MQPR:$Qm)))>; + def : Pat<(v16i8 (MVEvmovn (v16i8 MQPR:$Qd_src), (v16i8 MQPR:$Qm), (i32 1))), + (v16i8 (MVE_VMOVNi16th (v16i8 MQPR:$Qd_src), (v16i8 MQPR:$Qm)))>; +} + class MVE_VCVT_ff<string iname, string suffix, bit op, bit T, list<dag> pattern=[]> : MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd), (ins MQPR:$Qd_src, MQPR:$Qm), @@ -3166,11 +3560,10 @@ defm MVE_VCVTf16f32 : MVE_VCVT_ff_halves<"f16.f32", 0b0>; defm MVE_VCVTf32f16 : MVE_VCVT_ff_halves<"f32.f16", 0b1>; class MVE_VxCADD<string iname, string suffix, bits<2> size, bit halve, - list<dag> pattern=[]> + string cstr="", list<dag> pattern=[]> : MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm, complexrotateopodd:$rot), - "$Qd, $Qn, $Qm, $rot", vpred_r, "", - pattern> { + "$Qd, $Qn, $Qm, $rot", vpred_r, cstr, pattern> { bits<4> Qn; bit rot; @@ -3186,11 +3579,11 @@ class MVE_VxCADD<string iname, string suffix, bits<2> size, bit halve, def MVE_VCADDi8 : MVE_VxCADD<"vcadd", "i8", 0b00, 0b1>; def MVE_VCADDi16 : MVE_VxCADD<"vcadd", "i16", 0b01, 0b1>; -def MVE_VCADDi32 : MVE_VxCADD<"vcadd", "i32", 0b10, 0b1>; +def MVE_VCADDi32 : MVE_VxCADD<"vcadd", "i32", 0b10, 0b1, "@earlyclobber $Qd">; def MVE_VHCADDs8 : MVE_VxCADD<"vhcadd", "s8", 0b00, 0b0>; def MVE_VHCADDs16 : MVE_VxCADD<"vhcadd", "s16", 0b01, 0b0>; -def MVE_VHCADDs32 : MVE_VxCADD<"vhcadd", "s32", 0b10, 0b0>; +def MVE_VHCADDs32 : MVE_VxCADD<"vhcadd", "s32", 0b10, 0b0, "@earlyclobber $Qd">; class MVE_VADCSBC<string iname, bit I, bit subtract, dag carryin, list<dag> pattern=[]> @@ -3220,10 +3613,10 @@ def MVE_VSBC : MVE_VADCSBC<"vsbc", 0b0, 0b1, (ins cl_FPSCR_NZCV:$carryin)>; def MVE_VSBCI : MVE_VADCSBC<"vsbci", 0b1, 0b1, (ins)>; class MVE_VQDMULL<string iname, string suffix, bit size, bit T, - list<dag> pattern=[]> + string cstr="", list<dag> pattern=[]> : MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm), "$Qd, $Qn, $Qm", - vpred_r, "", pattern> { + vpred_r, cstr, pattern> { bits<4> Qn; let Inst{28} = size; @@ -3236,13 +3629,13 @@ class 
MVE_VQDMULL<string iname, string suffix, bit size, bit T, let Inst{0} = 0b1; } -multiclass MVE_VQDMULL_halves<string suffix, bit size> { - def bh : MVE_VQDMULL<"vqdmullb", suffix, size, 0b0>; - def th : MVE_VQDMULL<"vqdmullt", suffix, size, 0b1>; +multiclass MVE_VQDMULL_halves<string suffix, bit size, string cstr=""> { + def bh : MVE_VQDMULL<"vqdmullb", suffix, size, 0b0, cstr>; + def th : MVE_VQDMULL<"vqdmullt", suffix, size, 0b1, cstr>; } defm MVE_VQDMULLs16 : MVE_VQDMULL_halves<"s16", 0b0>; -defm MVE_VQDMULLs32 : MVE_VQDMULL_halves<"s32", 0b1>; +defm MVE_VQDMULLs32 : MVE_VQDMULL_halves<"s32", 0b1, "@earlyclobber $Qd">; // end of mve_qDest_qSrc @@ -3267,9 +3660,9 @@ class MVE_qr_base<dag oops, dag iops, InstrItinClass itin, string iname, let Inst{3-0} = Rm{3-0}; } -class MVE_qDest_rSrc<string iname, string suffix, list<dag> pattern=[]> +class MVE_qDest_rSrc<string iname, string suffix, string cstr="", list<dag> pattern=[]> : MVE_qr_base<(outs MQPR:$Qd), (ins MQPR:$Qn, rGPR:$Rm), - NoItinerary, iname, suffix, "$Qd, $Qn, $Rm", vpred_r, "", + NoItinerary, iname, suffix, "$Qd, $Qn, $Rm", vpred_r, cstr, pattern>; class MVE_qDestSrc_rSrc<string iname, string suffix, list<dag> pattern=[]> @@ -3291,7 +3684,7 @@ class MVE_qDest_single_rSrc<string iname, string suffix, list<dag> pattern=[]> class MVE_VADDSUB_qr<string iname, string suffix, bits<2> size, bit bit_5, bit bit_12, bit bit_16, bit bit_28, list<dag> pattern=[]> - : MVE_qDest_rSrc<iname, suffix, pattern> { + : MVE_qDest_rSrc<iname, suffix, "", pattern> { let Inst{28} = bit_28; let Inst{21-20} = size; @@ -3299,6 +3692,7 @@ class MVE_VADDSUB_qr<string iname, string suffix, bits<2> size, let Inst{12} = bit_12; let Inst{8} = 0b1; let Inst{5} = bit_5; + let validForTailPredication = 1; } multiclass MVE_VADDSUB_qr_sizes<string iname, string suffix, @@ -3320,9 +3714,27 @@ defm MVE_VSUB_qr_i : MVE_VADDSUB_qr_sizes<"vsub", "i", 0b0, 0b1, 0b1, 0b0>; defm MVE_VQSUB_qr_s : MVE_VADDSUB_qr_sizes<"vqsub", "s", 0b1, 0b1, 0b0, 0b0>; defm MVE_VQSUB_qr_u : MVE_VADDSUB_qr_sizes<"vqsub", "u", 0b1, 0b1, 0b0, 0b1>; +let Predicates = [HasMVEInt] in { + def : Pat<(v16i8 (add (v16i8 MQPR:$val1), (v16i8 (ARMvdup GPR:$val2)))), + (v16i8 (MVE_VADD_qr_i8 (v16i8 MQPR:$val1), (i32 GPR:$val2)))>; + def : Pat<(v8i16 (add (v8i16 MQPR:$val1), (v8i16 (ARMvdup GPR:$val2)))), + (v8i16 (MVE_VADD_qr_i16 (v8i16 MQPR:$val1), (i32 GPR:$val2)))>; + def : Pat<(v4i32 (add (v4i32 MQPR:$val1), (v4i32 (ARMvdup GPR:$val2)))), + (v4i32 (MVE_VADD_qr_i32 (v4i32 MQPR:$val1), (i32 GPR:$val2)))>; +} + +let Predicates = [HasMVEInt] in { + def : Pat<(v16i8 (sub (v16i8 MQPR:$val1), (v16i8 (ARMvdup GPR:$val2)))), + (v16i8 (MVE_VSUB_qr_i8 (v16i8 MQPR:$val1), (i32 GPR:$val2)))>; + def : Pat<(v8i16 (sub (v8i16 MQPR:$val1), (v8i16 (ARMvdup GPR:$val2)))), + (v8i16 (MVE_VSUB_qr_i16 (v8i16 MQPR:$val1), (i32 GPR:$val2)))>; + def : Pat<(v4i32 (sub (v4i32 MQPR:$val1), (v4i32 (ARMvdup GPR:$val2)))), + (v4i32 (MVE_VSUB_qr_i32 (v4i32 MQPR:$val1), (i32 GPR:$val2)))>; +} + class MVE_VQDMULL_qr<string iname, string suffix, bit size, - bit T, list<dag> pattern=[]> - : MVE_qDest_rSrc<iname, suffix, pattern> { + bit T, string cstr="", list<dag> pattern=[]> + : MVE_qDest_rSrc<iname, suffix, cstr, pattern> { let Inst{28} = size; let Inst{21-20} = 0b11; @@ -3332,18 +3744,18 @@ class MVE_VQDMULL_qr<string iname, string suffix, bit size, let Inst{5} = 0b1; } -multiclass MVE_VQDMULL_qr_halves<string suffix, bit size> { - def bh : MVE_VQDMULL_qr<"vqdmullb", suffix, size, 0b0>; - def th : MVE_VQDMULL_qr<"vqdmullt", suffix, 
size, 0b1>; +multiclass MVE_VQDMULL_qr_halves<string suffix, bit size, string cstr=""> { + def bh : MVE_VQDMULL_qr<"vqdmullb", suffix, size, 0b0, cstr>; + def th : MVE_VQDMULL_qr<"vqdmullt", suffix, size, 0b1, cstr>; } defm MVE_VQDMULL_qr_s16 : MVE_VQDMULL_qr_halves<"s16", 0b0>; -defm MVE_VQDMULL_qr_s32 : MVE_VQDMULL_qr_halves<"s32", 0b1>; +defm MVE_VQDMULL_qr_s32 : MVE_VQDMULL_qr_halves<"s32", 0b1, "@earlyclobber $Qd">; class MVE_VxADDSUB_qr<string iname, string suffix, bit bit_28, bits<2> bits_21_20, bit subtract, list<dag> pattern=[]> - : MVE_qDest_rSrc<iname, suffix, pattern> { + : MVE_qDest_rSrc<iname, suffix, "", pattern> { let Inst{28} = bit_28; let Inst{21-20} = bits_21_20; @@ -3351,6 +3763,7 @@ class MVE_VxADDSUB_qr<string iname, string suffix, let Inst{12} = subtract; let Inst{8} = 0b1; let Inst{5} = 0b0; + let validForTailPredication = 1; } def MVE_VHADD_qr_s8 : MVE_VxADDSUB_qr<"vhadd", "s8", 0b0, 0b00, 0b0>; @@ -3388,6 +3801,7 @@ class MVE_VxSHL_qr<string iname, string suffix, bit U, bits<2> size, let Inst{12-8} = 0b11110; let Inst{7} = bit_7; let Inst{6-4} = 0b110; + let validForTailPredication = 1; } multiclass MVE_VxSHL_qr_types<string iname, bit bit_7, bit bit_17> { @@ -3421,7 +3835,7 @@ let Predicates = [HasMVEInt] in { } class MVE_VBRSR<string iname, string suffix, bits<2> size, list<dag> pattern=[]> - : MVE_qDest_rSrc<iname, suffix, pattern> { + : MVE_qDest_rSrc<iname, suffix, "", pattern> { let Inst{28} = 0b1; let Inst{21-20} = size; @@ -3429,15 +3843,27 @@ class MVE_VBRSR<string iname, string suffix, bits<2> size, list<dag> pattern=[]> let Inst{12} = 0b1; let Inst{8} = 0b0; let Inst{5} = 0b1; + let validForTailPredication = 1; } def MVE_VBRSR8 : MVE_VBRSR<"vbrsr", "8", 0b00>; def MVE_VBRSR16 : MVE_VBRSR<"vbrsr", "16", 0b01>; def MVE_VBRSR32 : MVE_VBRSR<"vbrsr", "32", 0b10>; +let Predicates = [HasMVEInt] in { + def : Pat<(v16i8 ( bitreverse (v16i8 MQPR:$val1))), + (v16i8 ( MVE_VBRSR8 (v16i8 MQPR:$val1), (t2MOVi (i32 8)) ))>; + + def : Pat<(v4i32 ( bitreverse (v4i32 MQPR:$val1))), + (v4i32 ( MVE_VBRSR32 (v4i32 MQPR:$val1), (t2MOVi (i32 32)) ))>; + + def : Pat<(v8i16 ( bitreverse (v8i16 MQPR:$val1))), + (v8i16 ( MVE_VBRSR16 (v8i16 MQPR:$val1), (t2MOVi (i32 16)) ))>; +} + class MVE_VMUL_qr_int<string iname, string suffix, bits<2> size, list<dag> pattern=[]> - : MVE_qDest_rSrc<iname, suffix, pattern> { + : MVE_qDest_rSrc<iname, suffix, "", pattern> { let Inst{28} = 0b0; let Inst{21-20} = size; @@ -3445,15 +3871,25 @@ class MVE_VMUL_qr_int<string iname, string suffix, let Inst{12} = 0b1; let Inst{8} = 0b0; let Inst{5} = 0b1; + let validForTailPredication = 1; } def MVE_VMUL_qr_i8 : MVE_VMUL_qr_int<"vmul", "i8", 0b00>; def MVE_VMUL_qr_i16 : MVE_VMUL_qr_int<"vmul", "i16", 0b01>; def MVE_VMUL_qr_i32 : MVE_VMUL_qr_int<"vmul", "i32", 0b10>; +let Predicates = [HasMVEInt] in { + def : Pat<(v16i8 (mul (v16i8 MQPR:$val1), (v16i8 (ARMvdup GPR:$val2)))), + (v16i8 (MVE_VMUL_qr_i8 (v16i8 MQPR:$val1), (i32 GPR:$val2)))>; + def : Pat<(v8i16 (mul (v8i16 MQPR:$val1), (v8i16 (ARMvdup GPR:$val2)))), + (v8i16 (MVE_VMUL_qr_i16 (v8i16 MQPR:$val1), (i32 GPR:$val2)))>; + def : Pat<(v4i32 (mul (v4i32 MQPR:$val1), (v4i32 (ARMvdup GPR:$val2)))), + (v4i32 (MVE_VMUL_qr_i32 (v4i32 MQPR:$val1), (i32 GPR:$val2)))>; +} + class MVE_VxxMUL_qr<string iname, string suffix, bit bit_28, bits<2> bits_21_20, list<dag> pattern=[]> - : MVE_qDest_rSrc<iname, suffix, pattern> { + : MVE_qDest_rSrc<iname, suffix, "", pattern> { let Inst{28} = bit_28; let Inst{21-20} = bits_21_20; @@ -3471,14 +3907,14 @@ def 
MVE_VQRDMULH_qr_s8 : MVE_VxxMUL_qr<"vqrdmulh", "s8", 0b1, 0b00>; def MVE_VQRDMULH_qr_s16 : MVE_VxxMUL_qr<"vqrdmulh", "s16", 0b1, 0b01>; def MVE_VQRDMULH_qr_s32 : MVE_VxxMUL_qr<"vqrdmulh", "s32", 0b1, 0b10>; -let Predicates = [HasMVEFloat] in { +let Predicates = [HasMVEFloat], validForTailPredication = 1 in { def MVE_VMUL_qr_f16 : MVE_VxxMUL_qr<"vmul", "f16", 0b1, 0b11>; def MVE_VMUL_qr_f32 : MVE_VxxMUL_qr<"vmul", "f32", 0b0, 0b11>; } class MVE_VFMAMLA_qr<string iname, string suffix, - bit bit_28, bits<2> bits_21_20, bit S, - list<dag> pattern=[]> + bit bit_28, bits<2> bits_21_20, bit S, + list<dag> pattern=[]> : MVE_qDestSrc_rSrc<iname, suffix, pattern> { let Inst{28} = bit_28; @@ -3487,6 +3923,7 @@ class MVE_VFMAMLA_qr<string iname, string suffix, let Inst{12} = S; let Inst{8} = 0b0; let Inst{5} = 0b0; + let validForTailPredication = 1; } def MVE_VMLA_qr_s8 : MVE_VFMAMLA_qr<"vmla", "s8", 0b0, 0b00, 0b0>; @@ -3503,6 +3940,21 @@ def MVE_VMLAS_qr_u8 : MVE_VFMAMLA_qr<"vmlas", "u8", 0b1, 0b00, 0b1>; def MVE_VMLAS_qr_u16 : MVE_VFMAMLA_qr<"vmlas", "u16", 0b1, 0b01, 0b1>; def MVE_VMLAS_qr_u32 : MVE_VFMAMLA_qr<"vmlas", "u32", 0b1, 0b10, 0b1>; +let Predicates = [HasMVEInt] in { + def : Pat<(v4i32 (add (v4i32 MQPR:$src1), + (v4i32 (mul (v4i32 MQPR:$src2), + (v4i32 (ARMvdup (i32 rGPR:$x))))))), + (v4i32 (MVE_VMLA_qr_u32 $src1, $src2, $x))>; + def : Pat<(v8i16 (add (v8i16 MQPR:$src1), + (v8i16 (mul (v8i16 MQPR:$src2), + (v8i16 (ARMvdup (i32 rGPR:$x))))))), + (v8i16 (MVE_VMLA_qr_u16 $src1, $src2, $x))>; + def : Pat<(v16i8 (add (v16i8 MQPR:$src1), + (v16i8 (mul (v16i8 MQPR:$src2), + (v16i8 (ARMvdup (i32 rGPR:$x))))))), + (v16i8 (MVE_VMLA_qr_u8 $src1, $src2, $x))>; +} + let Predicates = [HasMVEFloat] in { def MVE_VFMA_qr_f16 : MVE_VFMAMLA_qr<"vfma", "f16", 0b1, 0b11, 0b0>; def MVE_VFMA_qr_f32 : MVE_VFMAMLA_qr<"vfma", "f32", 0b0, 0b11, 0b0>; @@ -3555,6 +4007,7 @@ class MVE_VxDUP<string iname, string suffix, bits<2> size, bit bit_12, let Inst{7} = imm{1}; let Inst{6-1} = 0b110111; let Inst{0} = imm{0}; + let validForTailPredication = 1; } def MVE_VIDUPu8 : MVE_VxDUP<"vidup", "u8", 0b00, 0b0>; @@ -3589,6 +4042,7 @@ class MVE_VxWDUP<string iname, string suffix, bits<2> size, bit bit_12, let Inst{6-4} = 0b110; let Inst{3-1} = Rm{3-1}; let Inst{0} = imm{0}; + let validForTailPredication = 1; } def MVE_VIWDUPu8 : MVE_VxWDUP<"viwdup", "u8", 0b00, 0b0>; @@ -3599,6 +4053,7 @@ def MVE_VDWDUPu8 : MVE_VxWDUP<"vdwdup", "u8", 0b00, 0b1>; def MVE_VDWDUPu16 : MVE_VxWDUP<"vdwdup", "u16", 0b01, 0b1>; def MVE_VDWDUPu32 : MVE_VxWDUP<"vdwdup", "u32", 0b10, 0b1>; +let hasSideEffects = 1 in class MVE_VCTP<string suffix, bits<2> size, list<dag> pattern=[]> : MVE_p<(outs VCCR:$P0), (ins rGPR:$Rn), NoItinerary, "vctp", suffix, "$Rn", vpred_n, "", pattern> { @@ -3614,6 +4069,7 @@ class MVE_VCTP<string suffix, bits<2> size, list<dag> pattern=[]> let Constraints = ""; let DecoderMethod = "DecodeMveVCTP"; + let validForTailPredication = 1; } def MVE_VCTP8 : MVE_VCTP<"8", 0b00>; @@ -3621,6 +4077,15 @@ def MVE_VCTP16 : MVE_VCTP<"16", 0b01>; def MVE_VCTP32 : MVE_VCTP<"32", 0b10>; def MVE_VCTP64 : MVE_VCTP<"64", 0b11>; +let Predicates = [HasMVEInt] in { + def : Pat<(int_arm_vctp8 rGPR:$Rn), + (v16i1 (MVE_VCTP8 rGPR:$Rn))>; + def : Pat<(int_arm_vctp16 rGPR:$Rn), + (v8i1 (MVE_VCTP16 rGPR:$Rn))>; + def : Pat<(int_arm_vctp32 rGPR:$Rn), + (v4i1 (MVE_VCTP32 rGPR:$Rn))>; +} + // end of mve_qDest_rSrc // start of coproc mov @@ -3863,6 +4328,7 @@ class MVE_VLDRSTR_base<MVE_ldst_direction dir, bit U, bit P, bit W, bit opc, let mayLoad = dir.load; 
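(Illustrative aside, not part of the patch: add(acc, mul(x, vdup(a))) is the shape the new MVE_VMLA_qr patterns above match, and the int_arm_vctp* patterns above give tail-predicated loops their per-iteration element count. Sketch only; tail predication also needs the low-overhead-loop machinery to fire, and the function name is invented.)

    #include <stdint.h>

    /* Multiply-accumulate with a loop-invariant scalar. */
    void axpy_i32(int32_t *y, const int32_t *x, int32_t a, int n) {
      for (int i = 0; i < n; i++)
        y[i] += x[i] * a;      /* add(y, mul(x, vdup(a))) per vector block */
    }
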
let mayStore = !eq(dir.load,0); + let validForTailPredication = 1; } // Contiguous load and store instructions. These come in two main @@ -4165,7 +4631,8 @@ class MVE_VPT<string suffix, bits<2> size, dag iops, string asm, list<dag> patte let Inst{7} = fc{0}; let Inst{4} = 0b0; - let Defs = [VPR, P0]; + let Defs = [VPR]; + let validForTailPredication = 1; } class MVE_VPTt1<string suffix, bits<2> size, dag iops> @@ -4177,11 +4644,12 @@ class MVE_VPTt1<string suffix, bits<2> size, dag iops> let Inst{5} = Qm{3}; let Inst{3-1} = Qm{2-0}; let Inst{0} = fc{1}; + let validForTailPredication = 1; } class MVE_VPTt1i<string suffix, bits<2> size> : MVE_VPTt1<suffix, size, - (ins vpt_mask:$Mk, pred_basic_i:$fc, MQPR:$Qn, MQPR:$Qm)> { + (ins vpt_mask:$Mk, MQPR:$Qn, MQPR:$Qm, pred_basic_i:$fc)> { let Inst{12} = 0b0; let Inst{0} = 0b0; } @@ -4192,7 +4660,7 @@ def MVE_VPTv16i8 : MVE_VPTt1i<"i8", 0b00>; class MVE_VPTt1u<string suffix, bits<2> size> : MVE_VPTt1<suffix, size, - (ins vpt_mask:$Mk, pred_basic_u:$fc, MQPR:$Qn, MQPR:$Qm)> { + (ins vpt_mask:$Mk, MQPR:$Qn, MQPR:$Qm, pred_basic_u:$fc)> { let Inst{12} = 0b0; let Inst{0} = 0b1; } @@ -4203,7 +4671,7 @@ def MVE_VPTv16u8 : MVE_VPTt1u<"u8", 0b00>; class MVE_VPTt1s<string suffix, bits<2> size> : MVE_VPTt1<suffix, size, - (ins vpt_mask:$Mk, pred_basic_s:$fc, MQPR:$Qn, MQPR:$Qm)> { + (ins vpt_mask:$Mk, MQPR:$Qn, MQPR:$Qm, pred_basic_s:$fc)> { let Inst{12} = 0b1; } @@ -4225,7 +4693,7 @@ class MVE_VPTt2<string suffix, bits<2> size, dag iops> class MVE_VPTt2i<string suffix, bits<2> size> : MVE_VPTt2<suffix, size, - (ins vpt_mask:$Mk, pred_basic_i:$fc, MQPR:$Qn, GPRwithZR:$Rm)> { + (ins vpt_mask:$Mk, MQPR:$Qn, GPRwithZR:$Rm, pred_basic_i:$fc)> { let Inst{12} = 0b0; let Inst{5} = 0b0; } @@ -4236,7 +4704,7 @@ def MVE_VPTv16i8r : MVE_VPTt2i<"i8", 0b00>; class MVE_VPTt2u<string suffix, bits<2> size> : MVE_VPTt2<suffix, size, - (ins vpt_mask:$Mk, pred_basic_u:$fc, MQPR:$Qn, GPRwithZR:$Rm)> { + (ins vpt_mask:$Mk, MQPR:$Qn, GPRwithZR:$Rm, pred_basic_u:$fc)> { let Inst{12} = 0b0; let Inst{5} = 0b1; } @@ -4247,7 +4715,7 @@ def MVE_VPTv16u8r : MVE_VPTt2u<"u8", 0b00>; class MVE_VPTt2s<string suffix, bits<2> size> : MVE_VPTt2<suffix, size, - (ins vpt_mask:$Mk, pred_basic_s:$fc, MQPR:$Qn, GPRwithZR:$Rm)> { + (ins vpt_mask:$Mk, MQPR:$Qn, GPRwithZR:$Rm, pred_basic_s:$fc)> { let Inst{12} = 0b1; } @@ -4276,12 +4744,13 @@ class MVE_VPTf<string suffix, bit size, dag iops, string asm, list<dag> pattern= let Inst{7} = fc{0}; let Inst{4} = 0b0; - let Defs = [P0]; + let Defs = [VPR]; let Predicates = [HasMVEFloat]; + let validForTailPredication = 1; } class MVE_VPTft1<string suffix, bit size> - : MVE_VPTf<suffix, size, (ins vpt_mask:$Mk, pred_basic_fp:$fc, MQPR:$Qn, MQPR:$Qm), + : MVE_VPTf<suffix, size, (ins vpt_mask:$Mk, MQPR:$Qn, MQPR:$Qm, pred_basic_fp:$fc), "$fc, $Qn, $Qm"> { bits<3> fc; bits<4> Qm; @@ -4296,7 +4765,7 @@ def MVE_VPTv4f32 : MVE_VPTft1<"f32", 0b0>; def MVE_VPTv8f16 : MVE_VPTft1<"f16", 0b1>; class MVE_VPTft2<string suffix, bit size> - : MVE_VPTf<suffix, size, (ins vpt_mask:$Mk, pred_basic_fp:$fc, MQPR:$Qn, GPRwithZR:$Rm), + : MVE_VPTf<suffix, size, (ins vpt_mask:$Mk, MQPR:$Qn, GPRwithZR:$Rm, pred_basic_fp:$fc), "$fc, $Qn, $Rm"> { bits<3> fc; bits<4> Rm; @@ -4322,7 +4791,8 @@ def MVE_VPST : MVE_MI<(outs ), (ins vpt_mask:$Mk), NoItinerary, let Unpredictable{7} = 0b1; let Unpredictable{5} = 0b1; - let Defs = [P0]; + let Uses = [VPR]; + let validForTailPredication = 1; } def MVE_VPSEL : MVE_p<(outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm), NoItinerary, @@ -4346,6 +4816,7 @@ 
def MVE_VPSEL : MVE_p<(outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm), NoItinerary, let Inst{4} = 0b0; let Inst{3-1} = Qm{2-0}; let Inst{0} = 0b1; + let validForTailPredication = 1; } foreach suffix = ["s8", "s16", "s32", "u8", "u16", "u32", @@ -4353,19 +4824,113 @@ foreach suffix = ["s8", "s16", "s32", "u8", "u16", "u32", def : MVEInstAlias<"vpsel${vp}." # suffix # "\t$Qd, $Qn, $Qm", (MVE_VPSEL MQPR:$Qd, MQPR:$Qn, MQPR:$Qm, vpred_n:$vp)>; -def MVE_VPNOT : MVE_p<(outs), (ins), NoItinerary, +let Predicates = [HasMVEInt] in { + def : Pat<(v16i8 (vselect (v16i1 VCCR:$pred), (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))), + (v16i8 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, VCCR:$pred))>; + def : Pat<(v8i16 (vselect (v8i1 VCCR:$pred), (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))), + (v8i16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, VCCR:$pred))>; + def : Pat<(v4i32 (vselect (v4i1 VCCR:$pred), (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))), + (v4i32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, VCCR:$pred))>; + + def : Pat<(v8f16 (vselect (v8i1 VCCR:$pred), (v8f16 MQPR:$v1), (v8f16 MQPR:$v2))), + (v8f16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, VCCR:$pred))>; + def : Pat<(v4f32 (vselect (v4i1 VCCR:$pred), (v4f32 MQPR:$v1), (v4f32 MQPR:$v2))), + (v4f32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, VCCR:$pred))>; + + def : Pat<(v16i8 (vselect (v16i8 MQPR:$pred), (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))), + (v16i8 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, + (MVE_VCMPi8 (v16i8 MQPR:$pred), (MVE_VMOVimmi8 0), 1)))>; + def : Pat<(v8i16 (vselect (v8i16 MQPR:$pred), (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))), + (v8i16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, + (MVE_VCMPi16 (v8i16 MQPR:$pred), (MVE_VMOVimmi16 0), 1)))>; + def : Pat<(v4i32 (vselect (v4i32 MQPR:$pred), (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))), + (v4i32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, + (MVE_VCMPi32 (v4i32 MQPR:$pred), (MVE_VMOVimmi32 0), 1)))>; + + def : Pat<(v8f16 (vselect (v8i16 MQPR:$pred), (v8f16 MQPR:$v1), (v8f16 MQPR:$v2))), + (v8f16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, + (MVE_VCMPi16 (v8i16 MQPR:$pred), (MVE_VMOVimmi16 0), 1)))>; + def : Pat<(v4f32 (vselect (v4i32 MQPR:$pred), (v4f32 MQPR:$v1), (v4f32 MQPR:$v2))), + (v4f32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, + (MVE_VCMPi32 (v4i32 MQPR:$pred), (MVE_VMOVimmi32 0), 1)))>; + + // Pred <-> Int + def : Pat<(v16i8 (zext (v16i1 VCCR:$pred))), + (v16i8 (MVE_VPSEL (MVE_VMOVimmi8 1), (MVE_VMOVimmi8 0), 0, VCCR:$pred))>; + def : Pat<(v8i16 (zext (v8i1 VCCR:$pred))), + (v8i16 (MVE_VPSEL (MVE_VMOVimmi16 1), (MVE_VMOVimmi16 0), 0, VCCR:$pred))>; + def : Pat<(v4i32 (zext (v4i1 VCCR:$pred))), + (v4i32 (MVE_VPSEL (MVE_VMOVimmi32 1), (MVE_VMOVimmi32 0), 0, VCCR:$pred))>; + + def : Pat<(v16i8 (sext (v16i1 VCCR:$pred))), + (v16i8 (MVE_VPSEL (MVE_VMOVimmi8 255), (MVE_VMOVimmi8 0), 0, VCCR:$pred))>; + def : Pat<(v8i16 (sext (v8i1 VCCR:$pred))), + (v8i16 (MVE_VPSEL (MVE_VMOVimmi8 255), (MVE_VMOVimmi16 0), 0, VCCR:$pred))>; + def : Pat<(v4i32 (sext (v4i1 VCCR:$pred))), + (v4i32 (MVE_VPSEL (MVE_VMOVimmi8 255), (MVE_VMOVimmi32 0), 0, VCCR:$pred))>; + + def : Pat<(v16i8 (anyext (v16i1 VCCR:$pred))), + (v16i8 (MVE_VPSEL (MVE_VMOVimmi8 1), (MVE_VMOVimmi8 0), 0, VCCR:$pred))>; + def : Pat<(v8i16 (anyext (v8i1 VCCR:$pred))), + (v8i16 (MVE_VPSEL (MVE_VMOVimmi16 1), (MVE_VMOVimmi16 0), 0, VCCR:$pred))>; + def : Pat<(v4i32 (anyext (v4i1 VCCR:$pred))), + (v4i32 (MVE_VPSEL (MVE_VMOVimmi32 1), (MVE_VMOVimmi32 0), 0, VCCR:$pred))>; + + def : Pat<(v16i1 (trunc (v16i8 MQPR:$v1))), + (v16i1 (MVE_VCMPi32r (v16i8 MQPR:$v1), ZR, 1))>; + def : Pat<(v8i1 (trunc (v8i16 MQPR:$v1))), + (v8i1 (MVE_VCMPi32r (v8i16 MQPR:$v1), ZR, 1))>; + def : 
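(Illustrative aside, not part of the patch: a compare feeding a select becomes a vselect, which the VPSEL patterns above lower to vcmp + vpsel. Sketch only; with a scalar bound the compare would go through vdup or the _qr compare forms, and the function name is invented.)

    #include <stdint.h>

    /* Element-wise select: clamp each element to a lower bound. */
    void clamp_lo(int32_t *d, const int32_t *a, int32_t lo, int n) {
      for (int i = 0; i < n; i++)
        d[i] = a[i] < lo ? lo : a[i];
    }
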
Pat<(v4i1 (trunc (v4i32 MQPR:$v1))), + (v4i1 (MVE_VCMPi32r (v4i32 MQPR:$v1), ZR, 1))>; +} + +let Predicates = [HasMVEFloat] in { + // Pred <-> Float + // 112 is 1.0 in float + def : Pat<(v4f32 (uint_to_fp (v4i1 VCCR:$pred))), + (v4f32 (MVE_VPSEL (v4f32 (MVE_VMOVimmf32 112)), (v4f32 (MVE_VMOVimmi32 0)), 0, VCCR:$pred))>; + // 2620 in 1.0 in half + def : Pat<(v8f16 (uint_to_fp (v8i1 VCCR:$pred))), + (v8f16 (MVE_VPSEL (v8f16 (MVE_VMOVimmi16 2620)), (v8f16 (MVE_VMOVimmi16 0)), 0, VCCR:$pred))>; + // 240 is -1.0 in float + def : Pat<(v4f32 (sint_to_fp (v4i1 VCCR:$pred))), + (v4f32 (MVE_VPSEL (v4f32 (MVE_VMOVimmf32 240)), (v4f32 (MVE_VMOVimmi32 0)), 0, VCCR:$pred))>; + // 2748 is -1.0 in half + def : Pat<(v8f16 (sint_to_fp (v8i1 VCCR:$pred))), + (v8f16 (MVE_VPSEL (v8f16 (MVE_VMOVimmi16 2748)), (v8f16 (MVE_VMOVimmi16 0)), 0, VCCR:$pred))>; + + def : Pat<(v4i1 (fp_to_uint (v4f32 MQPR:$v1))), + (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), ZR, 1))>; + def : Pat<(v8i1 (fp_to_uint (v8f16 MQPR:$v1))), + (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), ZR, 1))>; + def : Pat<(v4i1 (fp_to_sint (v4f32 MQPR:$v1))), + (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), ZR, 1))>; + def : Pat<(v8i1 (fp_to_sint (v8f16 MQPR:$v1))), + (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), ZR, 1))>; +} + +def MVE_VPNOT : MVE_p<(outs VCCR:$P0), (ins VCCR:$P0_in), NoItinerary, "vpnot", "", "", vpred_n, "", []> { let Inst{31-0} = 0b11111110001100010000111101001101; let Unpredictable{19-17} = 0b111; let Unpredictable{12} = 0b1; let Unpredictable{7} = 0b1; let Unpredictable{5} = 0b1; - let Defs = [P0]; - let Uses = [P0]; let Constraints = ""; + let DecoderMethod = "DecodeMVEVPNOT"; } +let Predicates = [HasMVEInt] in { + def : Pat<(v4i1 (xor (v4i1 VCCR:$pred), (v4i1 (predicate_cast (i32 65535))))), + (v4i1 (MVE_VPNOT (v4i1 VCCR:$pred)))>; + def : Pat<(v8i1 (xor (v8i1 VCCR:$pred), (v8i1 (predicate_cast (i32 65535))))), + (v8i1 (MVE_VPNOT (v8i1 VCCR:$pred)))>; + def : Pat<(v16i1 (xor (v16i1 VCCR:$pred), (v16i1 (predicate_cast (i32 65535))))), + (v16i1 (MVE_VPNOT (v16i1 VCCR:$pred)))>; +} + + class MVE_loltp_start<dag iops, string asm, string ops, bits<2> size> : t2LOL<(outs GPRlr:$LR), iops, asm, ops> { bits<4> Rn; @@ -4433,159 +4998,440 @@ def MVE_LCTP : MVE_loltp_end<(outs), (ins pred:$p), "lctp${p}", ""> { // Patterns //===----------------------------------------------------------------------===// -class MVE_unpred_vector_store_typed<ValueType Ty, Instruction RegImmInst, +class MVE_vector_store_typed<ValueType Ty, Instruction RegImmInst, + PatFrag StoreKind, int shift> + : Pat<(StoreKind (Ty MQPR:$val), t2addrmode_imm7<shift>:$addr), + (RegImmInst (Ty MQPR:$val), t2addrmode_imm7<shift>:$addr)>; +class MVE_vector_maskedstore_typed<ValueType Ty, Instruction RegImmInst, + PatFrag StoreKind, int shift> + : Pat<(StoreKind (Ty MQPR:$val), t2addrmode_imm7<shift>:$addr, VCCR:$pred), + (RegImmInst (Ty MQPR:$val), t2addrmode_imm7<shift>:$addr, (i32 1), VCCR:$pred)>; + +multiclass MVE_vector_store<Instruction RegImmInst, PatFrag StoreKind, + int shift> { + def : MVE_vector_store_typed<v16i8, RegImmInst, StoreKind, shift>; + def : MVE_vector_store_typed<v8i16, RegImmInst, StoreKind, shift>; + def : MVE_vector_store_typed<v8f16, RegImmInst, StoreKind, shift>; + def : MVE_vector_store_typed<v4i32, RegImmInst, StoreKind, shift>; + def : MVE_vector_store_typed<v4f32, RegImmInst, StoreKind, shift>; + def : MVE_vector_store_typed<v2i64, RegImmInst, StoreKind, shift>; + def : MVE_vector_store_typed<v2f64, RegImmInst, StoreKind, shift>; +} + +class MVE_vector_load_typed<ValueType Ty, 
Instruction RegImmInst, + PatFrag LoadKind, int shift> + : Pat<(Ty (LoadKind t2addrmode_imm7<shift>:$addr)), + (Ty (RegImmInst t2addrmode_imm7<shift>:$addr))>; +class MVE_vector_maskedload_typed<ValueType Ty, Instruction RegImmInst, + PatFrag LoadKind, int shift> + : Pat<(Ty (LoadKind t2addrmode_imm7<shift>:$addr, VCCR:$pred, (Ty NEONimmAllZerosV))), + (Ty (RegImmInst t2addrmode_imm7<shift>:$addr, (i32 1), VCCR:$pred))>; + +multiclass MVE_vector_load<Instruction RegImmInst, PatFrag LoadKind, + int shift> { + def : MVE_vector_load_typed<v16i8, RegImmInst, LoadKind, shift>; + def : MVE_vector_load_typed<v8i16, RegImmInst, LoadKind, shift>; + def : MVE_vector_load_typed<v8f16, RegImmInst, LoadKind, shift>; + def : MVE_vector_load_typed<v4i32, RegImmInst, LoadKind, shift>; + def : MVE_vector_load_typed<v4f32, RegImmInst, LoadKind, shift>; + def : MVE_vector_load_typed<v2i64, RegImmInst, LoadKind, shift>; + def : MVE_vector_load_typed<v2f64, RegImmInst, LoadKind, shift>; +} + +class MVE_vector_offset_store_typed<ValueType Ty, Instruction Opcode, PatFrag StoreKind, int shift> - : Pat<(StoreKind (Ty MQPR:$val), t2addrmode_imm7<shift>:$addr), - (RegImmInst (Ty MQPR:$val), t2addrmode_imm7<shift>:$addr)>; + : Pat<(StoreKind (Ty MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<shift>:$addr), + (Opcode MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<shift>:$addr)>; -multiclass MVE_unpred_vector_store<Instruction RegImmInst, PatFrag StoreKind, +multiclass MVE_vector_offset_store<Instruction RegImmInst, PatFrag StoreKind, int shift> { - def : MVE_unpred_vector_store_typed<v16i8, RegImmInst, StoreKind, shift>; - def : MVE_unpred_vector_store_typed<v8i16, RegImmInst, StoreKind, shift>; - def : MVE_unpred_vector_store_typed<v8f16, RegImmInst, StoreKind, shift>; - def : MVE_unpred_vector_store_typed<v4i32, RegImmInst, StoreKind, shift>; - def : MVE_unpred_vector_store_typed<v4f32, RegImmInst, StoreKind, shift>; - def : MVE_unpred_vector_store_typed<v2i64, RegImmInst, StoreKind, shift>; - def : MVE_unpred_vector_store_typed<v2f64, RegImmInst, StoreKind, shift>; -} - -class MVE_unpred_vector_load_typed<ValueType Ty, Instruction RegImmInst, - PatFrag LoadKind, int shift> - : Pat<(Ty (LoadKind t2addrmode_imm7<shift>:$addr)), - (Ty (RegImmInst t2addrmode_imm7<shift>:$addr))>; - -multiclass MVE_unpred_vector_load<Instruction RegImmInst, PatFrag LoadKind, - int shift> { - def : MVE_unpred_vector_load_typed<v16i8, RegImmInst, LoadKind, shift>; - def : MVE_unpred_vector_load_typed<v8i16, RegImmInst, LoadKind, shift>; - def : MVE_unpred_vector_load_typed<v8f16, RegImmInst, LoadKind, shift>; - def : MVE_unpred_vector_load_typed<v4i32, RegImmInst, LoadKind, shift>; - def : MVE_unpred_vector_load_typed<v4f32, RegImmInst, LoadKind, shift>; - def : MVE_unpred_vector_load_typed<v2i64, RegImmInst, LoadKind, shift>; - def : MVE_unpred_vector_load_typed<v2f64, RegImmInst, LoadKind, shift>; -} + def : MVE_vector_offset_store_typed<v16i8, RegImmInst, StoreKind, shift>; + def : MVE_vector_offset_store_typed<v8i16, RegImmInst, StoreKind, shift>; + def : MVE_vector_offset_store_typed<v8f16, RegImmInst, StoreKind, shift>; + def : MVE_vector_offset_store_typed<v4i32, RegImmInst, StoreKind, shift>; + def : MVE_vector_offset_store_typed<v4f32, RegImmInst, StoreKind, shift>; + def : MVE_vector_offset_store_typed<v2i64, RegImmInst, StoreKind, shift>; + def : MVE_vector_offset_store_typed<v2f64, RegImmInst, StoreKind, shift>; +} + +def aligned32_pre_store : PatFrag<(ops node:$val, node:$ptr, node:$offset), + (pre_store node:$val, node:$ptr, node:$offset), [{ 
+ return cast<StoreSDNode>(N)->getAlignment() >= 4; +}]>; +def aligned32_post_store : PatFrag<(ops node:$val, node:$ptr, node:$offset), + (post_store node:$val, node:$ptr, node:$offset), [{ + return cast<StoreSDNode>(N)->getAlignment() >= 4; +}]>; +def aligned16_pre_store : PatFrag<(ops node:$val, node:$ptr, node:$offset), + (pre_store node:$val, node:$ptr, node:$offset), [{ + return cast<StoreSDNode>(N)->getAlignment() >= 2; +}]>; +def aligned16_post_store : PatFrag<(ops node:$val, node:$ptr, node:$offset), + (post_store node:$val, node:$ptr, node:$offset), [{ + return cast<StoreSDNode>(N)->getAlignment() >= 2; +}]>; + + +def maskedload8 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), + (masked_ld node:$ptr, node:$pred, node:$passthru), [{ + auto *Ld = cast<MaskedLoadSDNode>(N); + return Ld->getMemoryVT().getScalarType() == MVT::i8; +}]>; +def sextmaskedload8 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), + (maskedload8 node:$ptr, node:$pred, node:$passthru), [{ + return cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::SEXTLOAD; +}]>; +def zextmaskedload8 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), + (maskedload8 node:$ptr, node:$pred, node:$passthru), [{ + return cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::ZEXTLOAD; +}]>; +def extmaskedload8 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), + (maskedload8 node:$ptr, node:$pred, node:$passthru), [{ + auto *Ld = cast<MaskedLoadSDNode>(N); + EVT ScalarVT = Ld->getMemoryVT().getScalarType(); + return ScalarVT.isInteger() && Ld->getExtensionType() == ISD::EXTLOAD; +}]>; +def alignedmaskedload16: PatFrag<(ops node:$ptr, node:$pred, node:$passthru), + (masked_ld node:$ptr, node:$pred, node:$passthru), [{ + auto *Ld = cast<MaskedLoadSDNode>(N); + EVT ScalarVT = Ld->getMemoryVT().getScalarType(); + return (ScalarVT == MVT::i16 || ScalarVT == MVT::f16) && Ld->getAlignment() >= 2; +}]>; +def sextmaskedload16 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), + (alignedmaskedload16 node:$ptr, node:$pred, node:$passthru), [{ + return cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::SEXTLOAD; +}]>; +def zextmaskedload16 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), + (alignedmaskedload16 node:$ptr, node:$pred, node:$passthru), [{ + return cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::ZEXTLOAD; +}]>; +def extmaskedload16 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), + (alignedmaskedload16 node:$ptr, node:$pred, node:$passthru), [{ + auto *Ld = cast<MaskedLoadSDNode>(N); + EVT ScalarVT = Ld->getMemoryVT().getScalarType(); + return ScalarVT.isInteger() && Ld->getExtensionType() == ISD::EXTLOAD; +}]>; +def alignedmaskedload32: PatFrag<(ops node:$ptr, node:$pred, node:$passthru), + (masked_ld node:$ptr, node:$pred, node:$passthru), [{ + auto *Ld = cast<MaskedLoadSDNode>(N); + EVT ScalarVT = Ld->getMemoryVT().getScalarType(); + return (ScalarVT == MVT::i32 || ScalarVT == MVT::f32) && Ld->getAlignment() >= 4; +}]>; + +def maskedstore8 : PatFrag<(ops node:$val, node:$ptr, node:$pred), + (masked_st node:$val, node:$ptr, node:$pred), [{ + return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8; +}]>; +def truncatingmaskedstore8 : PatFrag<(ops node:$val, node:$ptr, node:$pred), + (maskedstore8 node:$val, node:$ptr, node:$pred), [{ + return cast<MaskedStoreSDNode>(N)->isTruncatingStore(); +}]>; +def maskedstore16 : PatFrag<(ops node:$val, node:$ptr, node:$pred), + (masked_st node:$val, node:$ptr, node:$pred), [{ + auto *St = cast<MaskedStoreSDNode>(N); + EVT 
ScalarVT = St->getMemoryVT().getScalarType(); + return (ScalarVT == MVT::i16 || ScalarVT == MVT::f16) && St->getAlignment() >= 2; +}]>; + +def truncatingmaskedstore16 : PatFrag<(ops node:$val, node:$ptr, node:$pred), + (maskedstore16 node:$val, node:$ptr, node:$pred), [{ + return cast<MaskedStoreSDNode>(N)->isTruncatingStore(); +}]>; +def maskedstore32 : PatFrag<(ops node:$val, node:$ptr, node:$pred), + (masked_st node:$val, node:$ptr, node:$pred), [{ + auto *St = cast<MaskedStoreSDNode>(N); + EVT ScalarVT = St->getMemoryVT().getScalarType(); + return (ScalarVT == MVT::i32 || ScalarVT == MVT::f32) && St->getAlignment() >= 4; +}]>; let Predicates = [HasMVEInt, IsLE] in { - defm : MVE_unpred_vector_store<MVE_VSTRBU8, byte_alignedstore, 0>; - defm : MVE_unpred_vector_store<MVE_VSTRHU16, hword_alignedstore, 1>; - defm : MVE_unpred_vector_store<MVE_VSTRWU32, alignedstore32, 2>; + // Stores + defm : MVE_vector_store<MVE_VSTRBU8, byte_alignedstore, 0>; + defm : MVE_vector_store<MVE_VSTRHU16, hword_alignedstore, 1>; + defm : MVE_vector_store<MVE_VSTRWU32, alignedstore32, 2>; - defm : MVE_unpred_vector_load<MVE_VLDRBU8, byte_alignedload, 0>; - defm : MVE_unpred_vector_load<MVE_VLDRHU16, hword_alignedload, 1>; - defm : MVE_unpred_vector_load<MVE_VLDRWU32, alignedload32, 2>; + // Loads + defm : MVE_vector_load<MVE_VLDRBU8, byte_alignedload, 0>; + defm : MVE_vector_load<MVE_VLDRHU16, hword_alignedload, 1>; + defm : MVE_vector_load<MVE_VLDRWU32, alignedload32, 2>; - def : Pat<(v16i1 (load t2addrmode_imm7<2>:$addr)), - (v16i1 (VLDR_P0_off t2addrmode_imm7<2>:$addr))>; - def : Pat<(v8i1 (load t2addrmode_imm7<2>:$addr)), - (v8i1 (VLDR_P0_off t2addrmode_imm7<2>:$addr))>; - def : Pat<(v4i1 (load t2addrmode_imm7<2>:$addr)), - (v4i1 (VLDR_P0_off t2addrmode_imm7<2>:$addr))>; + // Pre/post inc stores + defm : MVE_vector_offset_store<MVE_VSTRBU8_pre, pre_store, 0>; + defm : MVE_vector_offset_store<MVE_VSTRBU8_post, post_store, 0>; + defm : MVE_vector_offset_store<MVE_VSTRHU16_pre, aligned16_pre_store, 1>; + defm : MVE_vector_offset_store<MVE_VSTRHU16_post, aligned16_post_store, 1>; + defm : MVE_vector_offset_store<MVE_VSTRWU32_pre, aligned32_pre_store, 2>; + defm : MVE_vector_offset_store<MVE_VSTRWU32_post, aligned32_post_store, 2>; } let Predicates = [HasMVEInt, IsBE] in { - def : MVE_unpred_vector_store_typed<v16i8, MVE_VSTRBU8, store, 0>; - def : MVE_unpred_vector_store_typed<v8i16, MVE_VSTRHU16, alignedstore16, 1>; - def : MVE_unpred_vector_store_typed<v8f16, MVE_VSTRHU16, alignedstore16, 1>; - def : MVE_unpred_vector_store_typed<v4i32, MVE_VSTRWU32, alignedstore32, 2>; - def : MVE_unpred_vector_store_typed<v4f32, MVE_VSTRWU32, alignedstore32, 2>; - - def : MVE_unpred_vector_load_typed<v16i8, MVE_VLDRBU8, load, 0>; - def : MVE_unpred_vector_load_typed<v8i16, MVE_VLDRHU16, alignedload16, 1>; - def : MVE_unpred_vector_load_typed<v8f16, MVE_VLDRHU16, alignedload16, 1>; - def : MVE_unpred_vector_load_typed<v4i32, MVE_VLDRWU32, alignedload32, 2>; - def : MVE_unpred_vector_load_typed<v4f32, MVE_VLDRWU32, alignedload32, 2>; + // Aligned Stores + def : MVE_vector_store_typed<v16i8, MVE_VSTRBU8, store, 0>; + def : MVE_vector_store_typed<v8i16, MVE_VSTRHU16, alignedstore16, 1>; + def : MVE_vector_store_typed<v8f16, MVE_VSTRHU16, alignedstore16, 1>; + def : MVE_vector_store_typed<v4i32, MVE_VSTRWU32, alignedstore32, 2>; + def : MVE_vector_store_typed<v4f32, MVE_VSTRWU32, alignedstore32, 2>; + + // Aligned Loads + def : MVE_vector_load_typed<v16i8, MVE_VLDRBU8, load, 0>; + def : MVE_vector_load_typed<v8i16, MVE_VLDRHU16, 
alignedload16, 1>; + def : MVE_vector_load_typed<v8f16, MVE_VLDRHU16, alignedload16, 1>; + def : MVE_vector_load_typed<v4i32, MVE_VLDRWU32, alignedload32, 2>; + def : MVE_vector_load_typed<v4f32, MVE_VLDRWU32, alignedload32, 2>; + + // Other unaligned loads/stores need to go though a VREV + def : Pat<(v2f64 (load t2addrmode_imm7<0>:$addr)), + (v2f64 (MVE_VREV64_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr)))>; + def : Pat<(v2i64 (load t2addrmode_imm7<0>:$addr)), + (v2i64 (MVE_VREV64_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr)))>; + def : Pat<(v4i32 (load t2addrmode_imm7<0>:$addr)), + (v4i32 (MVE_VREV32_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr)))>; + def : Pat<(v4f32 (load t2addrmode_imm7<0>:$addr)), + (v4f32 (MVE_VREV32_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr)))>; + def : Pat<(v8i16 (load t2addrmode_imm7<0>:$addr)), + (v8i16 (MVE_VREV16_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr)))>; + def : Pat<(v8f16 (load t2addrmode_imm7<0>:$addr)), + (v8f16 (MVE_VREV16_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr)))>; + def : Pat<(store (v2f64 MQPR:$val), t2addrmode_imm7<0>:$addr), + (MVE_VSTRBU8 (MVE_VREV64_8 MQPR:$val), t2addrmode_imm7<0>:$addr)>; + def : Pat<(store (v2i64 MQPR:$val), t2addrmode_imm7<0>:$addr), + (MVE_VSTRBU8 (MVE_VREV64_8 MQPR:$val), t2addrmode_imm7<0>:$addr)>; + def : Pat<(store (v4i32 MQPR:$val), t2addrmode_imm7<0>:$addr), + (MVE_VSTRBU8 (MVE_VREV32_8 MQPR:$val), t2addrmode_imm7<0>:$addr)>; + def : Pat<(store (v4f32 MQPR:$val), t2addrmode_imm7<0>:$addr), + (MVE_VSTRBU8 (MVE_VREV32_8 MQPR:$val), t2addrmode_imm7<0>:$addr)>; + def : Pat<(store (v8i16 MQPR:$val), t2addrmode_imm7<0>:$addr), + (MVE_VSTRBU8 (MVE_VREV16_8 MQPR:$val), t2addrmode_imm7<0>:$addr)>; + def : Pat<(store (v8f16 MQPR:$val), t2addrmode_imm7<0>:$addr), + (MVE_VSTRBU8 (MVE_VREV16_8 MQPR:$val), t2addrmode_imm7<0>:$addr)>; + + // Pre/Post inc stores + def : MVE_vector_offset_store_typed<v16i8, MVE_VSTRBU8_pre, pre_store, 0>; + def : MVE_vector_offset_store_typed<v16i8, MVE_VSTRBU8_post, post_store, 0>; + def : MVE_vector_offset_store_typed<v8i16, MVE_VSTRHU16_pre, aligned16_pre_store, 1>; + def : MVE_vector_offset_store_typed<v8i16, MVE_VSTRHU16_post, aligned16_post_store, 1>; + def : MVE_vector_offset_store_typed<v8f16, MVE_VSTRHU16_pre, aligned16_pre_store, 1>; + def : MVE_vector_offset_store_typed<v8f16, MVE_VSTRHU16_post, aligned16_post_store, 1>; + def : MVE_vector_offset_store_typed<v4i32, MVE_VSTRWU32_pre, aligned32_pre_store, 2>; + def : MVE_vector_offset_store_typed<v4i32, MVE_VSTRWU32_post, aligned32_post_store, 2>; + def : MVE_vector_offset_store_typed<v4f32, MVE_VSTRWU32_pre, aligned32_pre_store, 2>; + def : MVE_vector_offset_store_typed<v4f32, MVE_VSTRWU32_post, aligned32_post_store, 2>; } +let Predicates = [HasMVEInt] in { + // Aligned masked store, shared between LE and BE + def : MVE_vector_maskedstore_typed<v16i8, MVE_VSTRBU8, maskedstore8, 0>; + def : MVE_vector_maskedstore_typed<v8i16, MVE_VSTRHU16, maskedstore16, 1>; + def : MVE_vector_maskedstore_typed<v8f16, MVE_VSTRHU16, maskedstore16, 1>; + def : MVE_vector_maskedstore_typed<v4i32, MVE_VSTRWU32, maskedstore32, 2>; + def : MVE_vector_maskedstore_typed<v4f32, MVE_VSTRWU32, maskedstore32, 2>; + // Truncating stores + def : Pat<(truncatingmaskedstore8 (v8i16 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred), + (MVE_VSTRB16 MQPR:$val, t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>; + def : Pat<(truncatingmaskedstore8 (v4i32 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred), + (MVE_VSTRB32 MQPR:$val, t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>; + def : 
Pat<(truncatingmaskedstore16 (v4i32 MQPR:$val), t2addrmode_imm7<1>:$addr, VCCR:$pred), + (MVE_VSTRH32 MQPR:$val, t2addrmode_imm7<1>:$addr, (i32 1), VCCR:$pred)>; + // Aligned masked loads + def : MVE_vector_maskedload_typed<v16i8, MVE_VLDRBU8, maskedload8, 0>; + def : MVE_vector_maskedload_typed<v8i16, MVE_VLDRHU16, alignedmaskedload16, 1>; + def : MVE_vector_maskedload_typed<v8f16, MVE_VLDRHU16, alignedmaskedload16, 1>; + def : MVE_vector_maskedload_typed<v4i32, MVE_VLDRWU32, alignedmaskedload32, 2>; + def : MVE_vector_maskedload_typed<v4f32, MVE_VLDRWU32, alignedmaskedload32, 2>; + // Extending masked loads. + def : Pat<(v8i16 (sextmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred, + (v8i16 NEONimmAllZerosV))), + (v8i16 (MVE_VLDRBS16 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>; + def : Pat<(v4i32 (sextmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred, + (v4i32 NEONimmAllZerosV))), + (v4i32 (MVE_VLDRBS32 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>; + def : Pat<(v8i16 (zextmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred, + (v8i16 NEONimmAllZerosV))), + (v8i16 (MVE_VLDRBU16 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>; + def : Pat<(v4i32 (zextmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred, + (v4i32 NEONimmAllZerosV))), + (v4i32 (MVE_VLDRBU32 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>; + def : Pat<(v8i16 (extmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred, + (v8i16 NEONimmAllZerosV))), + (v8i16 (MVE_VLDRBU16 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>; + def : Pat<(v4i32 (extmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred, + (v4i32 NEONimmAllZerosV))), + (v4i32 (MVE_VLDRBU32 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>; + def : Pat<(v4i32 (sextmaskedload16 t2addrmode_imm7<1>:$addr, VCCR:$pred, + (v4i32 NEONimmAllZerosV))), + (v4i32 (MVE_VLDRHS32 t2addrmode_imm7<1>:$addr, (i32 1), VCCR:$pred))>; + def : Pat<(v4i32 (zextmaskedload16 t2addrmode_imm7<1>:$addr, VCCR:$pred, + (v4i32 NEONimmAllZerosV))), + (v4i32 (MVE_VLDRHU32 t2addrmode_imm7<1>:$addr, (i32 1), VCCR:$pred))>; + def : Pat<(v4i32 (extmaskedload16 t2addrmode_imm7<1>:$addr, VCCR:$pred, + (v4i32 NEONimmAllZerosV))), + (v4i32 (MVE_VLDRHU32 t2addrmode_imm7<1>:$addr, (i32 1), VCCR:$pred))>; +} // Widening/Narrowing Loads/Stores +let MinAlignment = 2 in { + def truncstorevi16_align2 : PatFrag<(ops node:$val, node:$ptr), + (truncstorevi16 node:$val, node:$ptr)>; + def post_truncstvi16_align2 : PatFrag<(ops node:$val, node:$base, node:$offset), + (post_truncstvi16 node:$val, node:$base, node:$offset)>; + def pre_truncstvi16_align2 : PatFrag<(ops node:$val, node:$base, node:$offset), + (pre_truncstvi16 node:$val, node:$base, node:$offset)>; +} + let Predicates = [HasMVEInt] in { - def : Pat<(truncstorevi8 (v8i16 MQPR:$val), t2addrmode_imm7<1>:$addr), - (MVE_VSTRB16 MQPR:$val, t2addrmode_imm7<1>:$addr)>; - def : Pat<(truncstorevi8 (v4i32 MQPR:$val), t2addrmode_imm7<1>:$addr), - (MVE_VSTRB32 MQPR:$val, t2addrmode_imm7<1>:$addr)>; - def : Pat<(truncstorevi16 (v4i32 MQPR:$val), t2addrmode_imm7<2>:$addr), - (MVE_VSTRH32 MQPR:$val, t2addrmode_imm7<2>:$addr)>; + def : Pat<(truncstorevi8 (v8i16 MQPR:$val), taddrmode_imm7<0>:$addr), + (MVE_VSTRB16 MQPR:$val, taddrmode_imm7<0>:$addr)>; + def : Pat<(truncstorevi8 (v4i32 MQPR:$val), taddrmode_imm7<0>:$addr), + (MVE_VSTRB32 MQPR:$val, taddrmode_imm7<0>:$addr)>; + def : Pat<(truncstorevi16_align2 (v4i32 MQPR:$val), taddrmode_imm7<1>:$addr), + (MVE_VSTRH32 MQPR:$val, taddrmode_imm7<1>:$addr)>; + + def : Pat<(post_truncstvi8 (v8i16 MQPR:$Rt), tGPR:$Rn, 
t2am_imm7_offset<0>:$addr), + (MVE_VSTRB16_post MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<0>:$addr)>; + def : Pat<(post_truncstvi8 (v4i32 MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<0>:$addr), + (MVE_VSTRB32_post MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<0>:$addr)>; + def : Pat<(post_truncstvi16_align2 (v4i32 MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<1>:$addr), + (MVE_VSTRH32_post MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<1>:$addr)>; + + def : Pat<(pre_truncstvi8 (v8i16 MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<0>:$addr), + (MVE_VSTRB16_pre MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<0>:$addr)>; + def : Pat<(pre_truncstvi8 (v4i32 MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<0>:$addr), + (MVE_VSTRB32_pre MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<0>:$addr)>; + def : Pat<(pre_truncstvi16_align2 (v4i32 MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<1>:$addr), + (MVE_VSTRH32_pre MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<1>:$addr)>; +} + + +let MinAlignment = 2 in { + def extloadvi16_align2 : PatFrag<(ops node:$ptr), (extloadvi16 node:$ptr)>; + def sextloadvi16_align2 : PatFrag<(ops node:$ptr), (sextloadvi16 node:$ptr)>; + def zextloadvi16_align2 : PatFrag<(ops node:$ptr), (zextloadvi16 node:$ptr)>; } multiclass MVEExtLoad<string DestLanes, string DestElemBits, string SrcElemBits, string SrcElemType, - Operand am> { + string Align, Operand am> { def _Any : Pat<(!cast<ValueType>("v" # DestLanes # "i" # DestElemBits) - (!cast<PatFrag>("extloadvi" # SrcElemBits) am:$addr)), + (!cast<PatFrag>("extloadvi" # SrcElemBits # Align) am:$addr)), (!cast<Instruction>("MVE_VLDR" # SrcElemType # "U" # DestElemBits) am:$addr)>; def _Z : Pat<(!cast<ValueType>("v" # DestLanes # "i" # DestElemBits) - (!cast<PatFrag>("zextloadvi" # SrcElemBits) am:$addr)), + (!cast<PatFrag>("zextloadvi" # SrcElemBits # Align) am:$addr)), (!cast<Instruction>("MVE_VLDR" # SrcElemType # "U" # DestElemBits) am:$addr)>; def _S : Pat<(!cast<ValueType>("v" # DestLanes # "i" # DestElemBits) - (!cast<PatFrag>("sextloadvi" # SrcElemBits) am:$addr)), + (!cast<PatFrag>("sextloadvi" # SrcElemBits # Align) am:$addr)), (!cast<Instruction>("MVE_VLDR" # SrcElemType # "S" # DestElemBits) am:$addr)>; } let Predicates = [HasMVEInt] in { - defm : MVEExtLoad<"4", "32", "8", "B", t2addrmode_imm7<1>>; - defm : MVEExtLoad<"8", "16", "8", "B", t2addrmode_imm7<1>>; - defm : MVEExtLoad<"4", "32", "16", "H", t2addrmode_imm7<2>>; + defm : MVEExtLoad<"4", "32", "8", "B", "", taddrmode_imm7<0>>; + defm : MVEExtLoad<"8", "16", "8", "B", "", taddrmode_imm7<0>>; + defm : MVEExtLoad<"4", "32", "16", "H", "_align2", taddrmode_imm7<1>>; } // Bit convert patterns let Predicates = [HasMVEInt] in { - def : Pat<(v2f64 (bitconvert (v2i64 QPR:$src))), (v2f64 QPR:$src)>; - def : Pat<(v2i64 (bitconvert (v2f64 QPR:$src))), (v2i64 QPR:$src)>; + def : Pat<(v2f64 (bitconvert (v2i64 MQPR:$src))), (v2f64 MQPR:$src)>; + def : Pat<(v2i64 (bitconvert (v2f64 MQPR:$src))), (v2i64 MQPR:$src)>; - def : Pat<(v4i32 (bitconvert (v4f32 QPR:$src))), (v4i32 QPR:$src)>; - def : Pat<(v4f32 (bitconvert (v4i32 QPR:$src))), (v4f32 QPR:$src)>; + def : Pat<(v4i32 (bitconvert (v4f32 MQPR:$src))), (v4i32 MQPR:$src)>; + def : Pat<(v4f32 (bitconvert (v4i32 MQPR:$src))), (v4f32 MQPR:$src)>; - def : Pat<(v8i16 (bitconvert (v8f16 QPR:$src))), (v8i16 QPR:$src)>; - def : Pat<(v8f16 (bitconvert (v8i16 QPR:$src))), (v8f16 QPR:$src)>; + def : Pat<(v8i16 (bitconvert (v8f16 MQPR:$src))), (v8i16 MQPR:$src)>; + def : Pat<(v8f16 (bitconvert (v8i16 MQPR:$src))), (v8f16 MQPR:$src)>; } let Predicates = [IsLE,HasMVEInt] in { - def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (v2f64 
QPR:$src)>; - def : Pat<(v2f64 (bitconvert (v4i32 QPR:$src))), (v2f64 QPR:$src)>; - def : Pat<(v2f64 (bitconvert (v8f16 QPR:$src))), (v2f64 QPR:$src)>; - def : Pat<(v2f64 (bitconvert (v8i16 QPR:$src))), (v2f64 QPR:$src)>; - def : Pat<(v2f64 (bitconvert (v16i8 QPR:$src))), (v2f64 QPR:$src)>; - - def : Pat<(v2i64 (bitconvert (v4f32 QPR:$src))), (v2i64 QPR:$src)>; - def : Pat<(v2i64 (bitconvert (v4i32 QPR:$src))), (v2i64 QPR:$src)>; - def : Pat<(v2i64 (bitconvert (v8f16 QPR:$src))), (v2i64 QPR:$src)>; - def : Pat<(v2i64 (bitconvert (v8i16 QPR:$src))), (v2i64 QPR:$src)>; - def : Pat<(v2i64 (bitconvert (v16i8 QPR:$src))), (v2i64 QPR:$src)>; - - def : Pat<(v4f32 (bitconvert (v2f64 QPR:$src))), (v4f32 QPR:$src)>; - def : Pat<(v4f32 (bitconvert (v2i64 QPR:$src))), (v4f32 QPR:$src)>; - def : Pat<(v4f32 (bitconvert (v8f16 QPR:$src))), (v4f32 QPR:$src)>; - def : Pat<(v4f32 (bitconvert (v8i16 QPR:$src))), (v4f32 QPR:$src)>; - def : Pat<(v4f32 (bitconvert (v16i8 QPR:$src))), (v4f32 QPR:$src)>; - - def : Pat<(v4i32 (bitconvert (v2f64 QPR:$src))), (v4i32 QPR:$src)>; - def : Pat<(v4i32 (bitconvert (v2i64 QPR:$src))), (v4i32 QPR:$src)>; - def : Pat<(v4i32 (bitconvert (v8f16 QPR:$src))), (v4i32 QPR:$src)>; - def : Pat<(v4i32 (bitconvert (v8i16 QPR:$src))), (v4i32 QPR:$src)>; - def : Pat<(v4i32 (bitconvert (v16i8 QPR:$src))), (v4i32 QPR:$src)>; - - def : Pat<(v8f16 (bitconvert (v2f64 QPR:$src))), (v8f16 QPR:$src)>; - def : Pat<(v8f16 (bitconvert (v2i64 QPR:$src))), (v8f16 QPR:$src)>; - def : Pat<(v8f16 (bitconvert (v4f32 QPR:$src))), (v8f16 QPR:$src)>; - def : Pat<(v8f16 (bitconvert (v4i32 QPR:$src))), (v8f16 QPR:$src)>; - def : Pat<(v8f16 (bitconvert (v16i8 QPR:$src))), (v8f16 QPR:$src)>; - - def : Pat<(v8i16 (bitconvert (v2f64 QPR:$src))), (v8i16 QPR:$src)>; - def : Pat<(v8i16 (bitconvert (v2i64 QPR:$src))), (v8i16 QPR:$src)>; - def : Pat<(v8i16 (bitconvert (v4f32 QPR:$src))), (v8i16 QPR:$src)>; - def : Pat<(v8i16 (bitconvert (v4i32 QPR:$src))), (v8i16 QPR:$src)>; - def : Pat<(v8i16 (bitconvert (v16i8 QPR:$src))), (v8i16 QPR:$src)>; - - def : Pat<(v16i8 (bitconvert (v2f64 QPR:$src))), (v16i8 QPR:$src)>; - def : Pat<(v16i8 (bitconvert (v2i64 QPR:$src))), (v16i8 QPR:$src)>; - def : Pat<(v16i8 (bitconvert (v4f32 QPR:$src))), (v16i8 QPR:$src)>; - def : Pat<(v16i8 (bitconvert (v4i32 QPR:$src))), (v16i8 QPR:$src)>; - def : Pat<(v16i8 (bitconvert (v8f16 QPR:$src))), (v16i8 QPR:$src)>; - def : Pat<(v16i8 (bitconvert (v8i16 QPR:$src))), (v16i8 QPR:$src)>; + def : Pat<(v2f64 (bitconvert (v4f32 MQPR:$src))), (v2f64 MQPR:$src)>; + def : Pat<(v2f64 (bitconvert (v4i32 MQPR:$src))), (v2f64 MQPR:$src)>; + def : Pat<(v2f64 (bitconvert (v8f16 MQPR:$src))), (v2f64 MQPR:$src)>; + def : Pat<(v2f64 (bitconvert (v8i16 MQPR:$src))), (v2f64 MQPR:$src)>; + def : Pat<(v2f64 (bitconvert (v16i8 MQPR:$src))), (v2f64 MQPR:$src)>; + + def : Pat<(v2i64 (bitconvert (v4f32 MQPR:$src))), (v2i64 MQPR:$src)>; + def : Pat<(v2i64 (bitconvert (v4i32 MQPR:$src))), (v2i64 MQPR:$src)>; + def : Pat<(v2i64 (bitconvert (v8f16 MQPR:$src))), (v2i64 MQPR:$src)>; + def : Pat<(v2i64 (bitconvert (v8i16 MQPR:$src))), (v2i64 MQPR:$src)>; + def : Pat<(v2i64 (bitconvert (v16i8 MQPR:$src))), (v2i64 MQPR:$src)>; + + def : Pat<(v4f32 (bitconvert (v2f64 MQPR:$src))), (v4f32 MQPR:$src)>; + def : Pat<(v4f32 (bitconvert (v2i64 MQPR:$src))), (v4f32 MQPR:$src)>; + def : Pat<(v4f32 (bitconvert (v8f16 MQPR:$src))), (v4f32 MQPR:$src)>; + def : Pat<(v4f32 (bitconvert (v8i16 MQPR:$src))), (v4f32 MQPR:$src)>; + def : Pat<(v4f32 (bitconvert (v16i8 MQPR:$src))), (v4f32 
MQPR:$src)>; + + def : Pat<(v4i32 (bitconvert (v2f64 MQPR:$src))), (v4i32 MQPR:$src)>; + def : Pat<(v4i32 (bitconvert (v2i64 MQPR:$src))), (v4i32 MQPR:$src)>; + def : Pat<(v4i32 (bitconvert (v8f16 MQPR:$src))), (v4i32 MQPR:$src)>; + def : Pat<(v4i32 (bitconvert (v8i16 MQPR:$src))), (v4i32 MQPR:$src)>; + def : Pat<(v4i32 (bitconvert (v16i8 MQPR:$src))), (v4i32 MQPR:$src)>; + + def : Pat<(v8f16 (bitconvert (v2f64 MQPR:$src))), (v8f16 MQPR:$src)>; + def : Pat<(v8f16 (bitconvert (v2i64 MQPR:$src))), (v8f16 MQPR:$src)>; + def : Pat<(v8f16 (bitconvert (v4f32 MQPR:$src))), (v8f16 MQPR:$src)>; + def : Pat<(v8f16 (bitconvert (v4i32 MQPR:$src))), (v8f16 MQPR:$src)>; + def : Pat<(v8f16 (bitconvert (v16i8 MQPR:$src))), (v8f16 MQPR:$src)>; + + def : Pat<(v8i16 (bitconvert (v2f64 MQPR:$src))), (v8i16 MQPR:$src)>; + def : Pat<(v8i16 (bitconvert (v2i64 MQPR:$src))), (v8i16 MQPR:$src)>; + def : Pat<(v8i16 (bitconvert (v4f32 MQPR:$src))), (v8i16 MQPR:$src)>; + def : Pat<(v8i16 (bitconvert (v4i32 MQPR:$src))), (v8i16 MQPR:$src)>; + def : Pat<(v8i16 (bitconvert (v16i8 MQPR:$src))), (v8i16 MQPR:$src)>; + + def : Pat<(v16i8 (bitconvert (v2f64 MQPR:$src))), (v16i8 MQPR:$src)>; + def : Pat<(v16i8 (bitconvert (v2i64 MQPR:$src))), (v16i8 MQPR:$src)>; + def : Pat<(v16i8 (bitconvert (v4f32 MQPR:$src))), (v16i8 MQPR:$src)>; + def : Pat<(v16i8 (bitconvert (v4i32 MQPR:$src))), (v16i8 MQPR:$src)>; + def : Pat<(v16i8 (bitconvert (v8f16 MQPR:$src))), (v16i8 MQPR:$src)>; + def : Pat<(v16i8 (bitconvert (v8i16 MQPR:$src))), (v16i8 MQPR:$src)>; +} + +let Predicates = [IsBE,HasMVEInt] in { + def : Pat<(v2f64 (bitconvert (v4f32 MQPR:$src))), (v2f64 (MVE_VREV64_32 MQPR:$src))>; + def : Pat<(v2f64 (bitconvert (v4i32 MQPR:$src))), (v2f64 (MVE_VREV64_32 MQPR:$src))>; + def : Pat<(v2f64 (bitconvert (v8f16 MQPR:$src))), (v2f64 (MVE_VREV64_16 MQPR:$src))>; + def : Pat<(v2f64 (bitconvert (v8i16 MQPR:$src))), (v2f64 (MVE_VREV64_16 MQPR:$src))>; + def : Pat<(v2f64 (bitconvert (v16i8 MQPR:$src))), (v2f64 (MVE_VREV64_8 MQPR:$src))>; + + def : Pat<(v2i64 (bitconvert (v4f32 MQPR:$src))), (v2i64 (MVE_VREV64_32 MQPR:$src))>; + def : Pat<(v2i64 (bitconvert (v4i32 MQPR:$src))), (v2i64 (MVE_VREV64_32 MQPR:$src))>; + def : Pat<(v2i64 (bitconvert (v8f16 MQPR:$src))), (v2i64 (MVE_VREV64_16 MQPR:$src))>; + def : Pat<(v2i64 (bitconvert (v8i16 MQPR:$src))), (v2i64 (MVE_VREV64_16 MQPR:$src))>; + def : Pat<(v2i64 (bitconvert (v16i8 MQPR:$src))), (v2i64 (MVE_VREV64_8 MQPR:$src))>; + + def : Pat<(v4f32 (bitconvert (v2f64 MQPR:$src))), (v4f32 (MVE_VREV64_32 MQPR:$src))>; + def : Pat<(v4f32 (bitconvert (v2i64 MQPR:$src))), (v4f32 (MVE_VREV64_32 MQPR:$src))>; + def : Pat<(v4f32 (bitconvert (v8f16 MQPR:$src))), (v4f32 (MVE_VREV32_16 MQPR:$src))>; + def : Pat<(v4f32 (bitconvert (v8i16 MQPR:$src))), (v4f32 (MVE_VREV32_16 MQPR:$src))>; + def : Pat<(v4f32 (bitconvert (v16i8 MQPR:$src))), (v4f32 (MVE_VREV32_8 MQPR:$src))>; + + def : Pat<(v4i32 (bitconvert (v2f64 MQPR:$src))), (v4i32 (MVE_VREV64_32 MQPR:$src))>; + def : Pat<(v4i32 (bitconvert (v2i64 MQPR:$src))), (v4i32 (MVE_VREV64_32 MQPR:$src))>; + def : Pat<(v4i32 (bitconvert (v8f16 MQPR:$src))), (v4i32 (MVE_VREV32_16 MQPR:$src))>; + def : Pat<(v4i32 (bitconvert (v8i16 MQPR:$src))), (v4i32 (MVE_VREV32_16 MQPR:$src))>; + def : Pat<(v4i32 (bitconvert (v16i8 MQPR:$src))), (v4i32 (MVE_VREV32_8 MQPR:$src))>; + + def : Pat<(v8f16 (bitconvert (v2f64 MQPR:$src))), (v8f16 (MVE_VREV64_16 MQPR:$src))>; + def : Pat<(v8f16 (bitconvert (v2i64 MQPR:$src))), (v8f16 (MVE_VREV64_16 MQPR:$src))>; + def : Pat<(v8f16 (bitconvert 
(v4f32 MQPR:$src))), (v8f16 (MVE_VREV32_16 MQPR:$src))>; + def : Pat<(v8f16 (bitconvert (v4i32 MQPR:$src))), (v8f16 (MVE_VREV32_16 MQPR:$src))>; + def : Pat<(v8f16 (bitconvert (v16i8 MQPR:$src))), (v8f16 (MVE_VREV16_8 MQPR:$src))>; + + def : Pat<(v8i16 (bitconvert (v2f64 MQPR:$src))), (v8i16 (MVE_VREV64_16 MQPR:$src))>; + def : Pat<(v8i16 (bitconvert (v2i64 MQPR:$src))), (v8i16 (MVE_VREV64_16 MQPR:$src))>; + def : Pat<(v8i16 (bitconvert (v4f32 MQPR:$src))), (v8i16 (MVE_VREV32_16 MQPR:$src))>; + def : Pat<(v8i16 (bitconvert (v4i32 MQPR:$src))), (v8i16 (MVE_VREV32_16 MQPR:$src))>; + def : Pat<(v8i16 (bitconvert (v16i8 MQPR:$src))), (v8i16 (MVE_VREV16_8 MQPR:$src))>; + + def : Pat<(v16i8 (bitconvert (v2f64 MQPR:$src))), (v16i8 (MVE_VREV64_8 MQPR:$src))>; + def : Pat<(v16i8 (bitconvert (v2i64 MQPR:$src))), (v16i8 (MVE_VREV64_8 MQPR:$src))>; + def : Pat<(v16i8 (bitconvert (v4f32 MQPR:$src))), (v16i8 (MVE_VREV32_8 MQPR:$src))>; + def : Pat<(v16i8 (bitconvert (v4i32 MQPR:$src))), (v16i8 (MVE_VREV32_8 MQPR:$src))>; + def : Pat<(v16i8 (bitconvert (v8f16 MQPR:$src))), (v16i8 (MVE_VREV16_8 MQPR:$src))>; + def : Pat<(v16i8 (bitconvert (v8i16 MQPR:$src))), (v16i8 (MVE_VREV16_8 MQPR:$src))>; } diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td index 806681df102c..60ca92e58041 100644 --- a/lib/Target/ARM/ARMInstrNEON.td +++ b/lib/Target/ARM/ARMInstrNEON.td @@ -15,22 +15,22 @@ // NEON-specific Operands. //===----------------------------------------------------------------------===// def nModImm : Operand<i32> { - let PrintMethod = "printNEONModImmOperand"; + let PrintMethod = "printVMOVModImmOperand"; } def nImmSplatI8AsmOperand : AsmOperandClass { let Name = "NEONi8splat"; } def nImmSplatI8 : Operand<i32> { - let PrintMethod = "printNEONModImmOperand"; + let PrintMethod = "printVMOVModImmOperand"; let ParserMatchClass = nImmSplatI8AsmOperand; } def nImmSplatI16AsmOperand : AsmOperandClass { let Name = "NEONi16splat"; } def nImmSplatI16 : Operand<i32> { - let PrintMethod = "printNEONModImmOperand"; + let PrintMethod = "printVMOVModImmOperand"; let ParserMatchClass = nImmSplatI16AsmOperand; } def nImmSplatI32AsmOperand : AsmOperandClass { let Name = "NEONi32splat"; } def nImmSplatI32 : Operand<i32> { - let PrintMethod = "printNEONModImmOperand"; + let PrintMethod = "printVMOVModImmOperand"; let ParserMatchClass = nImmSplatI32AsmOperand; } def nImmSplatNotI16AsmOperand : AsmOperandClass { let Name = "NEONi16splatNot"; } @@ -43,7 +43,7 @@ def nImmSplatNotI32 : Operand<i32> { } def nImmVMOVI32AsmOperand : AsmOperandClass { let Name = "NEONi32vmov"; } def nImmVMOVI32 : Operand<i32> { - let PrintMethod = "printNEONModImmOperand"; + let PrintMethod = "printVMOVModImmOperand"; let ParserMatchClass = nImmVMOVI32AsmOperand; } @@ -62,18 +62,18 @@ class nImmVINVIAsmOperandReplicate<ValueType From, ValueType To> } class nImmVMOVIReplicate<ValueType From, ValueType To> : Operand<i32> { - let PrintMethod = "printNEONModImmOperand"; + let PrintMethod = "printVMOVModImmOperand"; let ParserMatchClass = nImmVMOVIAsmOperandReplicate<From, To>; } class nImmVINVIReplicate<ValueType From, ValueType To> : Operand<i32> { - let PrintMethod = "printNEONModImmOperand"; + let PrintMethod = "printVMOVModImmOperand"; let ParserMatchClass = nImmVINVIAsmOperandReplicate<From, To>; } def nImmVMOVI32NegAsmOperand : AsmOperandClass { let Name = "NEONi32vmovNeg"; } def nImmVMOVI32Neg : Operand<i32> { - let PrintMethod = "printNEONModImmOperand"; + let PrintMethod = "printVMOVModImmOperand"; let ParserMatchClass = 
nImmVMOVI32NegAsmOperand; } def nImmVMOVF32 : Operand<i32> { @@ -82,7 +82,7 @@ def nImmVMOVF32 : Operand<i32> { } def nImmSplatI64AsmOperand : AsmOperandClass { let Name = "NEONi64splat"; } def nImmSplatI64 : Operand<i32> { - let PrintMethod = "printNEONModImmOperand"; + let PrintMethod = "printVMOVModImmOperand"; let ParserMatchClass = nImmSplatI64AsmOperand; } @@ -478,20 +478,8 @@ def non_word_alignedstore : PatFrag<(ops node:$val, node:$ptr), // NEON-specific DAG Nodes. //===----------------------------------------------------------------------===// -def SDTARMVCMP : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisSameAs<1, 2>]>; -def SDTARMVCMPZ : SDTypeProfile<1, 1, []>; - -def NEONvceq : SDNode<"ARMISD::VCEQ", SDTARMVCMP>; -def NEONvceqz : SDNode<"ARMISD::VCEQZ", SDTARMVCMPZ>; -def NEONvcge : SDNode<"ARMISD::VCGE", SDTARMVCMP>; -def NEONvcgez : SDNode<"ARMISD::VCGEZ", SDTARMVCMPZ>; -def NEONvclez : SDNode<"ARMISD::VCLEZ", SDTARMVCMPZ>; -def NEONvcgeu : SDNode<"ARMISD::VCGEU", SDTARMVCMP>; -def NEONvcgt : SDNode<"ARMISD::VCGT", SDTARMVCMP>; -def NEONvcgtz : SDNode<"ARMISD::VCGTZ", SDTARMVCMPZ>; -def NEONvcltz : SDNode<"ARMISD::VCLTZ", SDTARMVCMPZ>; -def NEONvcgtu : SDNode<"ARMISD::VCGTU", SDTARMVCMP>; -def NEONvtst : SDNode<"ARMISD::VTST", SDTARMVCMP>; +def SDTARMVTST : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisSameAs<1, 2>]>; +def NEONvtst : SDNode<"ARMISD::VTST", SDTARMVTST>; // Types for vector shift by immediates. The "SHX" version is for long and // narrow operations where the source and destination vectors have different @@ -559,14 +547,14 @@ def NEONvtbl2 : SDNode<"ARMISD::VTBL2", SDTARMVTBL2>; def NEONimmAllZerosV: PatLeaf<(ARMvmovImm (i32 timm)), [{ ConstantSDNode *ConstVal = cast<ConstantSDNode>(N->getOperand(0)); unsigned EltBits = 0; - uint64_t EltVal = ARM_AM::decodeNEONModImm(ConstVal->getZExtValue(), EltBits); + uint64_t EltVal = ARM_AM::decodeVMOVModImm(ConstVal->getZExtValue(), EltBits); return (EltBits == 32 && EltVal == 0); }]>; def NEONimmAllOnesV: PatLeaf<(ARMvmovImm (i32 timm)), [{ ConstantSDNode *ConstVal = cast<ConstantSDNode>(N->getOperand(0)); unsigned EltBits = 0; - uint64_t EltVal = ARM_AM::decodeNEONModImm(ConstVal->getZExtValue(), EltBits); + uint64_t EltVal = ARM_AM::decodeVMOVModImm(ConstVal->getZExtValue(), EltBits); return (EltBits == 8 && EltVal == 0xff); }]>; @@ -3326,30 +3314,30 @@ class N2VCvtQ<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, // source operand element sizes of 8, 16 and 32 bits: multiclass N2V_QHS_cmp<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, bits<5> op11_7, bit op4, string opc, string Dt, - string asm, SDNode OpNode> { + string asm, int fc> { // 64-bit vector types. 
def v8i8 : N2V<op24_23, op21_20, 0b00, op17_16, op11_7, 0, op4, (outs DPR:$Vd), (ins DPR:$Vm), NoItinerary, opc, !strconcat(Dt, "8"), asm, "", - [(set DPR:$Vd, (v8i8 (OpNode (v8i8 DPR:$Vm))))]>; + [(set DPR:$Vd, (v8i8 (ARMvcmpz (v8i8 DPR:$Vm), (i32 fc))))]>; def v4i16 : N2V<op24_23, op21_20, 0b01, op17_16, op11_7, 0, op4, (outs DPR:$Vd), (ins DPR:$Vm), NoItinerary, opc, !strconcat(Dt, "16"), asm, "", - [(set DPR:$Vd, (v4i16 (OpNode (v4i16 DPR:$Vm))))]>; + [(set DPR:$Vd, (v4i16 (ARMvcmpz (v4i16 DPR:$Vm), (i32 fc))))]>; def v2i32 : N2V<op24_23, op21_20, 0b10, op17_16, op11_7, 0, op4, (outs DPR:$Vd), (ins DPR:$Vm), NoItinerary, opc, !strconcat(Dt, "32"), asm, "", - [(set DPR:$Vd, (v2i32 (OpNode (v2i32 DPR:$Vm))))]>; + [(set DPR:$Vd, (v2i32 (ARMvcmpz (v2i32 DPR:$Vm), (i32 fc))))]>; def v2f32 : N2V<op24_23, op21_20, 0b10, op17_16, op11_7, 0, op4, (outs DPR:$Vd), (ins DPR:$Vm), NoItinerary, opc, "f32", asm, "", - [(set DPR:$Vd, (v2i32 (OpNode (v2f32 DPR:$Vm))))]> { + [(set DPR:$Vd, (v2i32 (ARMvcmpz (v2f32 DPR:$Vm), (i32 fc))))]> { let Inst{10} = 1; // overwrite F = 1 } def v4f16 : N2V<op24_23, op21_20, 0b01, op17_16, op11_7, 0, op4, (outs DPR:$Vd), (ins DPR:$Vm), NoItinerary, opc, "f16", asm, "", - [(set DPR:$Vd, (v4i16 (OpNode (v4f16 DPR:$Vm))))]>, + [(set DPR:$Vd, (v4i16 (ARMvcmpz (v4f16 DPR:$Vm), (i32 fc))))]>, Requires<[HasNEON,HasFullFP16]> { let Inst{10} = 1; // overwrite F = 1 } @@ -3358,30 +3346,83 @@ multiclass N2V_QHS_cmp<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, def v16i8 : N2V<op24_23, op21_20, 0b00, op17_16, op11_7, 1, op4, (outs QPR:$Vd), (ins QPR:$Vm), NoItinerary, opc, !strconcat(Dt, "8"), asm, "", - [(set QPR:$Vd, (v16i8 (OpNode (v16i8 QPR:$Vm))))]>; + [(set QPR:$Vd, (v16i8 (ARMvcmpz (v16i8 QPR:$Vm), (i32 fc))))]>; def v8i16 : N2V<op24_23, op21_20, 0b01, op17_16, op11_7, 1, op4, (outs QPR:$Vd), (ins QPR:$Vm), NoItinerary, opc, !strconcat(Dt, "16"), asm, "", - [(set QPR:$Vd, (v8i16 (OpNode (v8i16 QPR:$Vm))))]>; + [(set QPR:$Vd, (v8i16 (ARMvcmpz (v8i16 QPR:$Vm), (i32 fc))))]>; def v4i32 : N2V<op24_23, op21_20, 0b10, op17_16, op11_7, 1, op4, (outs QPR:$Vd), (ins QPR:$Vm), NoItinerary, opc, !strconcat(Dt, "32"), asm, "", - [(set QPR:$Vd, (v4i32 (OpNode (v4i32 QPR:$Vm))))]>; + [(set QPR:$Vd, (v4i32 (ARMvcmpz (v4i32 QPR:$Vm), (i32 fc))))]>; def v4f32 : N2V<op24_23, op21_20, 0b10, op17_16, op11_7, 1, op4, (outs QPR:$Vd), (ins QPR:$Vm), NoItinerary, opc, "f32", asm, "", - [(set QPR:$Vd, (v4i32 (OpNode (v4f32 QPR:$Vm))))]> { + [(set QPR:$Vd, (v4i32 (ARMvcmpz (v4f32 QPR:$Vm), (i32 fc))))]> { let Inst{10} = 1; // overwrite F = 1 } def v8f16 : N2V<op24_23, op21_20, 0b01, op17_16, op11_7, 1, op4, (outs QPR:$Vd), (ins QPR:$Vm), NoItinerary, opc, "f16", asm, "", - [(set QPR:$Vd, (v8i16 (OpNode (v8f16 QPR:$Vm))))]>, + [(set QPR:$Vd, (v8i16 (ARMvcmpz (v8f16 QPR:$Vm), (i32 fc))))]>, Requires<[HasNEON,HasFullFP16]> { let Inst{10} = 1; // overwrite F = 1 } } +// Neon 3-register comparisons. +class N3VQ_cmp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, int fc, bit Commutable> + : N3V<op24, op23, op21_20, op11_8, 1, op4, + (outs QPR:$Vd), (ins QPR:$Vn, QPR:$Vm), N3RegFrm, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm", "", + [(set QPR:$Vd, (ResTy (ARMvcmp (OpTy QPR:$Vn), (OpTy QPR:$Vm), (i32 fc))))]> { + // All of these have a two-operand InstAlias. 
+ let TwoOperandAliasConstraint = "$Vn = $Vd"; + let isCommutable = Commutable; +} + +class N3VD_cmp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, int fc, bit Commutable> + : N3V<op24, op23, op21_20, op11_8, 0, op4, + (outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm", "", + [(set DPR:$Vd, (ResTy (ARMvcmp (OpTy DPR:$Vn), (OpTy DPR:$Vm), (i32 fc))))]> { + // All of these have a two-operand InstAlias. + let TwoOperandAliasConstraint = "$Vn = $Vd"; + let isCommutable = Commutable; +} + +multiclass N3V_QHS_cmp<bit op24, bit op23, bits<4> op11_8, bit op4, + InstrItinClass itinD16, InstrItinClass itinD32, + InstrItinClass itinQ16, InstrItinClass itinQ32, + string OpcodeStr, string Dt, + int fc, bit Commutable = 0> { + // 64-bit vector types. + def v8i8 : N3VD_cmp<op24, op23, 0b00, op11_8, op4, itinD16, + OpcodeStr, !strconcat(Dt, "8"), + v8i8, v8i8, fc, Commutable>; + def v4i16 : N3VD_cmp<op24, op23, 0b01, op11_8, op4, itinD16, + OpcodeStr, !strconcat(Dt, "16"), + v4i16, v4i16, fc, Commutable>; + def v2i32 : N3VD_cmp<op24, op23, 0b10, op11_8, op4, itinD32, + OpcodeStr, !strconcat(Dt, "32"), + v2i32, v2i32, fc, Commutable>; + + // 128-bit vector types. + def v16i8 : N3VQ_cmp<op24, op23, 0b00, op11_8, op4, itinQ16, + OpcodeStr, !strconcat(Dt, "8"), + v16i8, v16i8, fc, Commutable>; + def v8i16 : N3VQ_cmp<op24, op23, 0b01, op11_8, op4, itinQ16, + OpcodeStr, !strconcat(Dt, "16"), + v8i16, v8i16, fc, Commutable>; + def v4i32 : N3VQ_cmp<op24, op23, 0b10, op11_8, op4, itinQ32, + OpcodeStr, !strconcat(Dt, "32"), + v4i32, v4i32, fc, Commutable>; +} + // Neon 2-register vector intrinsics, // element sizes of 8, 16 and 32 bits: @@ -5026,67 +5067,67 @@ def : Pat<(v2i32 (trunc (ARMvshruImm (sub (v2i64 QPR:$Vn), QPR:$Vm), 32))), // Vector Comparisons. 
// VCEQ : Vector Compare Equal -defm VCEQ : N3V_QHS<1, 0, 0b1000, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, - IIC_VSUBi4Q, "vceq", "i", NEONvceq, 1>; -def VCEQfd : N3VD<0,0,0b00,0b1110,0, IIC_VBIND, "vceq", "f32", v2i32, v2f32, - NEONvceq, 1>; -def VCEQfq : N3VQ<0,0,0b00,0b1110,0, IIC_VBINQ, "vceq", "f32", v4i32, v4f32, - NEONvceq, 1>; -def VCEQhd : N3VD<0,0,0b01,0b1110,0, IIC_VBIND, "vceq", "f16", v4i16, v4f16, - NEONvceq, 1>, +defm VCEQ : N3V_QHS_cmp<1, 0, 0b1000, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, + IIC_VSUBi4Q, "vceq", "i", 0, 1>; +def VCEQfd : N3VD_cmp<0,0,0b00,0b1110,0, IIC_VBIND, "vceq", "f32", v2i32, v2f32, + 0, 1>; +def VCEQfq : N3VQ_cmp<0,0,0b00,0b1110,0, IIC_VBINQ, "vceq", "f32", v4i32, v4f32, + 0, 1>; +def VCEQhd : N3VD_cmp<0,0,0b01,0b1110,0, IIC_VBIND, "vceq", "f16", v4i16, v4f16, + 0, 1>, Requires<[HasNEON, HasFullFP16]>; -def VCEQhq : N3VQ<0,0,0b01,0b1110,0, IIC_VBINQ, "vceq", "f16", v8i16, v8f16, - NEONvceq, 1>, +def VCEQhq : N3VQ_cmp<0,0,0b01,0b1110,0, IIC_VBINQ, "vceq", "f16", v8i16, v8f16, + 0, 1>, Requires<[HasNEON, HasFullFP16]>; let TwoOperandAliasConstraint = "$Vm = $Vd" in defm VCEQz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00010, 0, "vceq", "i", - "$Vd, $Vm, #0", NEONvceqz>; + "$Vd, $Vm, #0", 0>; // VCGE : Vector Compare Greater Than or Equal -defm VCGEs : N3V_QHS<0, 0, 0b0011, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, - IIC_VSUBi4Q, "vcge", "s", NEONvcge, 0>; -defm VCGEu : N3V_QHS<1, 0, 0b0011, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, - IIC_VSUBi4Q, "vcge", "u", NEONvcgeu, 0>; -def VCGEfd : N3VD<1,0,0b00,0b1110,0, IIC_VBIND, "vcge", "f32", v2i32, v2f32, - NEONvcge, 0>; -def VCGEfq : N3VQ<1,0,0b00,0b1110,0, IIC_VBINQ, "vcge", "f32", v4i32, v4f32, - NEONvcge, 0>; -def VCGEhd : N3VD<1,0,0b01,0b1110,0, IIC_VBIND, "vcge", "f16", v4i16, v4f16, - NEONvcge, 0>, +defm VCGEs : N3V_QHS_cmp<0, 0, 0b0011, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, + IIC_VSUBi4Q, "vcge", "s", 10, 0>; +defm VCGEu : N3V_QHS_cmp<1, 0, 0b0011, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, + IIC_VSUBi4Q, "vcge", "u", 2, 0>; +def VCGEfd : N3VD_cmp<1,0,0b00,0b1110,0, IIC_VBIND, "vcge", "f32", v2i32, v2f32, + 10, 0>; +def VCGEfq : N3VQ_cmp<1,0,0b00,0b1110,0, IIC_VBINQ, "vcge", "f32", v4i32, v4f32, + 10, 0>; +def VCGEhd : N3VD_cmp<1,0,0b01,0b1110,0, IIC_VBIND, "vcge", "f16", v4i16, v4f16, + 10, 0>, Requires<[HasNEON, HasFullFP16]>; -def VCGEhq : N3VQ<1,0,0b01,0b1110,0, IIC_VBINQ, "vcge", "f16", v8i16, v8f16, - NEONvcge, 0>, +def VCGEhq : N3VQ_cmp<1,0,0b01,0b1110,0, IIC_VBINQ, "vcge", "f16", v8i16, v8f16, + 10, 0>, Requires<[HasNEON, HasFullFP16]>; let TwoOperandAliasConstraint = "$Vm = $Vd" in { defm VCGEz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00001, 0, "vcge", "s", - "$Vd, $Vm, #0", NEONvcgez>; + "$Vd, $Vm, #0", 10>; defm VCLEz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00011, 0, "vcle", "s", - "$Vd, $Vm, #0", NEONvclez>; + "$Vd, $Vm, #0", 13>; } // VCGT : Vector Compare Greater Than -defm VCGTs : N3V_QHS<0, 0, 0b0011, 0, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, - IIC_VSUBi4Q, "vcgt", "s", NEONvcgt, 0>; -defm VCGTu : N3V_QHS<1, 0, 0b0011, 0, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, - IIC_VSUBi4Q, "vcgt", "u", NEONvcgtu, 0>; -def VCGTfd : N3VD<1,0,0b10,0b1110,0, IIC_VBIND, "vcgt", "f32", v2i32, v2f32, - NEONvcgt, 0>; -def VCGTfq : N3VQ<1,0,0b10,0b1110,0, IIC_VBINQ, "vcgt", "f32", v4i32, v4f32, - NEONvcgt, 0>; -def VCGThd : N3VD<1,0,0b11,0b1110,0, IIC_VBIND, "vcgt", "f16", v4i16, v4f16, - NEONvcgt, 0>, +defm VCGTs : N3V_QHS_cmp<0, 0, 0b0011, 0, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, + IIC_VSUBi4Q, "vcgt", "s", 12, 0>; +defm 
VCGTu : N3V_QHS_cmp<1, 0, 0b0011, 0, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, + IIC_VSUBi4Q, "vcgt", "u", 8, 0>; +def VCGTfd : N3VD_cmp<1,0,0b10,0b1110,0, IIC_VBIND, "vcgt", "f32", v2i32, v2f32, + 12, 0>; +def VCGTfq : N3VQ_cmp<1,0,0b10,0b1110,0, IIC_VBINQ, "vcgt", "f32", v4i32, v4f32, + 12, 0>; +def VCGThd : N3VD_cmp<1,0,0b11,0b1110,0, IIC_VBIND, "vcgt", "f16", v4i16, v4f16, + 12, 0>, Requires<[HasNEON, HasFullFP16]>; -def VCGThq : N3VQ<1,0,0b11,0b1110,0, IIC_VBINQ, "vcgt", "f16", v8i16, v8f16, - NEONvcgt, 0>, +def VCGThq : N3VQ_cmp<1,0,0b11,0b1110,0, IIC_VBINQ, "vcgt", "f16", v8i16, v8f16, + 12, 0>, Requires<[HasNEON, HasFullFP16]>; let TwoOperandAliasConstraint = "$Vm = $Vd" in { defm VCGTz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00000, 0, "vcgt", "s", - "$Vd, $Vm, #0", NEONvcgtz>; + "$Vd, $Vm, #0", 12>; defm VCLTz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00100, 0, "vclt", "s", - "$Vd, $Vm, #0", NEONvcltz>; + "$Vd, $Vm, #0", 11>; } // VACGE : Vector Absolute Compare Greater Than or Equal (aka VCAGE) diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td index cfeb13c6acb6..18bcbda44580 100644 --- a/lib/Target/ARM/ARMInstrThumb.td +++ b/lib/Target/ARM/ARMInstrThumb.td @@ -565,6 +565,13 @@ let isCall = 1, 4, IIC_Br, [(ARMcall_nolink tGPR:$func)]>, Requires<[IsThumb, IsThumb1Only]>, Sched<[WriteBr]>; + + // Also used for Thumb2 + // push lr before the call + def tBL_PUSHLR : tPseudoInst<(outs), (ins GPRlr:$ra, pred:$p, thumb_bl_target:$func), + 4, IIC_Br, + []>, + Requires<[IsThumb]>, Sched<[WriteBr]>; } let isBranch = 1, isTerminator = 1, isBarrier = 1 in { @@ -592,6 +599,7 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1 in { [(ARMbrjt tGPR:$target, tjumptable:$jt)]>, Sched<[WriteBrTbl]> { let Size = 2; + let isNotDuplicable = 1; list<Predicate> Predicates = [IsThumb, IsThumb1Only]; } } @@ -1362,6 +1370,12 @@ let hasPostISelHook = 1, Defs = [CPSR] in { [(set tGPR:$Rd, CPSR, (ARMsubc 0, tGPR:$Rn))]>, Requires<[IsThumb1Only]>, Sched<[WriteALU]>; + + def tLSLSri : tPseudoInst<(outs tGPR:$Rd), (ins tGPR:$Rn, imm0_31:$imm5), + 2, IIC_iALUr, + [(set tGPR:$Rd, CPSR, (ARMlsls tGPR:$Rn, imm0_31:$imm5))]>, + Requires<[IsThumb1Only]>, + Sched<[WriteALU]>; } @@ -1465,7 +1479,7 @@ def tLEApcrelJT : tPseudoInst<(outs tGPR:$Rd), // Thumb-1 doesn't have the TBB or TBH instructions, but we can synthesize them // and make use of the same compressed jump table format as Thumb-2. 
let Size = 2, isBranch = 1, isTerminator = 1, isBarrier = 1, - isIndirectBranch = 1 in { + isIndirectBranch = 1, isNotDuplicable = 1 in { def tTBB_JT : tPseudoInst<(outs), (ins tGPRwithpc:$base, tGPR:$index, i32imm:$jt, i32imm:$pclbl), 0, IIC_Br, []>, Sched<[WriteBr]>; diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td index 7cbfaba7a8eb..25a45b39fa0c 100644 --- a/lib/Target/ARM/ARMInstrThumb2.td +++ b/lib/Target/ARM/ARMInstrThumb2.td @@ -45,7 +45,8 @@ def mve_shift_imm : AsmOperandClass { let RenderMethod = "addImmOperands"; let DiagnosticString = "operand must be an immediate in the range [1,32]"; } -def long_shift : Operand<i32> { +def long_shift : Operand<i32>, + ImmLeaf<i32, [{ return Imm > 0 && Imm <= 32; }]> { let ParserMatchClass = mve_shift_imm; let DecoderMethod = "DecodeLongShiftOperand"; } @@ -2394,6 +2395,23 @@ def : Thumb2DSPPat<(int_arm_qadd(int_arm_qadd rGPR:$Rm, rGPR:$Rm), rGPR:$Rn), def : Thumb2DSPPat<(int_arm_qsub rGPR:$Rm, (int_arm_qadd rGPR:$Rn, rGPR:$Rn)), (t2QDSUB rGPR:$Rm, rGPR:$Rn)>; +def : Thumb2DSPPat<(saddsat rGPR:$Rm, rGPR:$Rn), + (t2QADD rGPR:$Rm, rGPR:$Rn)>; +def : Thumb2DSPPat<(ssubsat rGPR:$Rm, rGPR:$Rn), + (t2QSUB rGPR:$Rm, rGPR:$Rn)>; +def : Thumb2DSPPat<(saddsat(saddsat rGPR:$Rm, rGPR:$Rm), rGPR:$Rn), + (t2QDADD rGPR:$Rm, rGPR:$Rn)>; +def : Thumb2DSPPat<(ssubsat rGPR:$Rm, (saddsat rGPR:$Rn, rGPR:$Rn)), + (t2QDSUB rGPR:$Rm, rGPR:$Rn)>; +def : Thumb2DSPPat<(ARMqadd8b rGPR:$Rm, rGPR:$Rn), + (t2QADD8 rGPR:$Rm, rGPR:$Rn)>; +def : Thumb2DSPPat<(ARMqsub8b rGPR:$Rm, rGPR:$Rn), + (t2QSUB8 rGPR:$Rm, rGPR:$Rn)>; +def : Thumb2DSPPat<(ARMqadd16b rGPR:$Rm, rGPR:$Rn), + (t2QADD16 rGPR:$Rm, rGPR:$Rn)>; +def : Thumb2DSPPat<(ARMqsub16b rGPR:$Rm, rGPR:$Rn), + (t2QSUB16 rGPR:$Rm, rGPR:$Rn)>; + // Signed/Unsigned add/subtract def t2SASX : T2I_pam_intrinsics<0b010, 0b0000, "sasx", int_arm_sasx>; @@ -4085,7 +4103,7 @@ def t2LDRpci_pic : PseudoInst<(outs rGPR:$dst), (ins i32imm:$addr, pclabel:$cp), // Pseudo isntruction that combines movs + predicated rsbmi // to implement integer ABS -let usesCustomInserter = 1, Defs = [CPSR] in { +let usesCustomInserter = 1, Defs = [CPSR], hasNoSchedulingInfo = 1 in { def t2ABS : PseudoInst<(outs rGPR:$dst), (ins rGPR:$src), NoItinerary, []>, Requires<[IsThumb2]>; } @@ -4175,15 +4193,15 @@ multiclass t2LdStCop<bits<4> op31_28, bit load, bit Dbit, string asm, list<dag> } let DecoderNamespace = "Thumb2CoProc" in { -defm t2LDC : t2LdStCop<0b1110, 1, 0, "ldc", [(int_arm_ldc imm:$cop, imm:$CRd, addrmode5:$addr)]>; -defm t2LDCL : t2LdStCop<0b1110, 1, 1, "ldcl", [(int_arm_ldcl imm:$cop, imm:$CRd, addrmode5:$addr)]>; -defm t2LDC2 : t2LdStCop<0b1111, 1, 0, "ldc2", [(int_arm_ldc2 imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8,IsThumb2]>; -defm t2LDC2L : t2LdStCop<0b1111, 1, 1, "ldc2l", [(int_arm_ldc2l imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8,IsThumb2]>; +defm t2LDC : t2LdStCop<0b1110, 1, 0, "ldc", [(int_arm_ldc timm:$cop, timm:$CRd, addrmode5:$addr)]>; +defm t2LDCL : t2LdStCop<0b1110, 1, 1, "ldcl", [(int_arm_ldcl timm:$cop, timm:$CRd, addrmode5:$addr)]>; +defm t2LDC2 : t2LdStCop<0b1111, 1, 0, "ldc2", [(int_arm_ldc2 timm:$cop, timm:$CRd, addrmode5:$addr)]>, Requires<[PreV8,IsThumb2]>; +defm t2LDC2L : t2LdStCop<0b1111, 1, 1, "ldc2l", [(int_arm_ldc2l timm:$cop, timm:$CRd, addrmode5:$addr)]>, Requires<[PreV8,IsThumb2]>; -defm t2STC : t2LdStCop<0b1110, 0, 0, "stc", [(int_arm_stc imm:$cop, imm:$CRd, addrmode5:$addr)]>; -defm t2STCL : t2LdStCop<0b1110, 0, 1, "stcl", [(int_arm_stcl imm:$cop, imm:$CRd, 
addrmode5:$addr)]>; -defm t2STC2 : t2LdStCop<0b1111, 0, 0, "stc2", [(int_arm_stc2 imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8,IsThumb2]>; -defm t2STC2L : t2LdStCop<0b1111, 0, 1, "stc2l", [(int_arm_stc2l imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8,IsThumb2]>; +defm t2STC : t2LdStCop<0b1110, 0, 0, "stc", [(int_arm_stc timm:$cop, timm:$CRd, addrmode5:$addr)]>; +defm t2STCL : t2LdStCop<0b1110, 0, 1, "stcl", [(int_arm_stcl timm:$cop, timm:$CRd, addrmode5:$addr)]>; +defm t2STC2 : t2LdStCop<0b1111, 0, 0, "stc2", [(int_arm_stc2 timm:$cop, timm:$CRd, addrmode5:$addr)]>, Requires<[PreV8,IsThumb2]>; +defm t2STC2L : t2LdStCop<0b1111, 0, 1, "stc2l", [(int_arm_stc2l timm:$cop, timm:$CRd, addrmode5:$addr)]>, Requires<[PreV8,IsThumb2]>; } @@ -4368,8 +4386,8 @@ def t2MCR : t2MovRCopro<0b1110, "mcr", 0, (outs), (ins p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2), - [(int_arm_mcr imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn, - imm:$CRm, imm:$opc2)]>, + [(int_arm_mcr timm:$cop, timm:$opc1, GPR:$Rt, timm:$CRn, + timm:$CRm, timm:$opc2)]>, ComplexDeprecationPredicate<"MCR">; def : t2InstAlias<"mcr${p} $cop, $opc1, $Rt, $CRn, $CRm", (t2MCR p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn, @@ -4377,8 +4395,8 @@ def : t2InstAlias<"mcr${p} $cop, $opc1, $Rt, $CRn, $CRm", def t2MCR2 : t2MovRCopro<0b1111, "mcr2", 0, (outs), (ins p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2), - [(int_arm_mcr2 imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn, - imm:$CRm, imm:$opc2)]> { + [(int_arm_mcr2 timm:$cop, timm:$opc1, GPR:$Rt, timm:$CRn, + timm:$CRm, timm:$opc2)]> { let Predicates = [IsThumb2, PreV8]; } def : t2InstAlias<"mcr2${p} $cop, $opc1, $Rt, $CRn, $CRm", @@ -4402,24 +4420,24 @@ def : t2InstAlias<"mrc2${p} $cop, $opc1, $Rt, $CRn, $CRm", (t2MRC2 GPRwithAPSR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm, 0, pred:$p)>; -def : T2v6Pat<(int_arm_mrc imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2), - (t2MRC imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>; +def : T2v6Pat<(int_arm_mrc timm:$cop, timm:$opc1, timm:$CRn, timm:$CRm, timm:$opc2), + (t2MRC p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2)>; -def : T2v6Pat<(int_arm_mrc2 imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2), - (t2MRC2 imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>; +def : T2v6Pat<(int_arm_mrc2 timm:$cop, timm:$opc1, timm:$CRn, timm:$CRm, timm:$opc2), + (t2MRC2 p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2)>; /* from ARM core register to coprocessor */ def t2MCRR : t2MovRRCopro<0b1110, "mcrr", 0, (outs), (ins p_imm:$cop, imm0_15:$opc1, GPR:$Rt, GPR:$Rt2, c_imm:$CRm), - [(int_arm_mcrr imm:$cop, imm:$opc1, GPR:$Rt, GPR:$Rt2, - imm:$CRm)]>; + [(int_arm_mcrr timm:$cop, timm:$opc1, GPR:$Rt, GPR:$Rt2, + timm:$CRm)]>; def t2MCRR2 : t2MovRRCopro<0b1111, "mcrr2", 0, (outs), (ins p_imm:$cop, imm0_15:$opc1, GPR:$Rt, GPR:$Rt2, c_imm:$CRm), - [(int_arm_mcrr2 imm:$cop, imm:$opc1, GPR:$Rt, - GPR:$Rt2, imm:$CRm)]> { + [(int_arm_mcrr2 timm:$cop, timm:$opc1, GPR:$Rt, + GPR:$Rt2, timm:$CRm)]> { let Predicates = [IsThumb2, PreV8]; } @@ -4439,8 +4457,8 @@ def t2MRRC2 : t2MovRRCopro<0b1111, "mrrc2", 1, (outs GPR:$Rt, GPR:$Rt2), def t2CDP : T2Cop<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1, c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2), "cdp", "\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2", - [(int_arm_cdp imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn, - imm:$CRm, imm:$opc2)]> { + [(int_arm_cdp timm:$cop, timm:$opc1, timm:$CRd, timm:$CRn, + timm:$CRm, timm:$opc2)]> { let 
Inst{27-24} = 0b1110; bits<4> opc1; @@ -4465,8 +4483,8 @@ def t2CDP : T2Cop<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1, def t2CDP2 : T2Cop<0b1111, (outs), (ins p_imm:$cop, imm0_15:$opc1, c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2), "cdp2", "\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2", - [(int_arm_cdp2 imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn, - imm:$CRm, imm:$opc2)]> { + [(int_arm_cdp2 timm:$cop, timm:$opc1, timm:$CRd, timm:$CRn, + timm:$CRm, timm:$opc2)]> { let Inst{27-24} = 0b1110; bits<4> opc1; @@ -5087,6 +5105,7 @@ def t2BF_LabelPseudo : t2PseudoInst<(outs ), (ins pclabel:$cp), 0, NoItinerary, []> { let isTerminator = 1; let Predicates = [IsThumb2, HasV8_1MMainline, HasLOB]; + let hasNoSchedulingInfo = 1; } def t2BFi : t2BF<(ins bflabel_u4:$b_label, bflabel_s16:$label, pred:$p), @@ -5217,11 +5236,13 @@ def t2LoopDec : t2PseudoInst<(outs GPRlr:$Rm), (ins GPRlr:$Rn, imm0_7:$size), 4, IIC_Br, []>, Sched<[WriteBr]>; -let isBranch = 1, isTerminator = 1, hasSideEffects = 1 in { +let isBranch = 1, isTerminator = 1, hasSideEffects = 1, Defs = [CPSR] in { +// Set WhileLoopStart and LoopEnd to occupy 8 bytes because they may +// get converted into t2CMP and t2Bcc. def t2WhileLoopStart : t2PseudoInst<(outs), (ins rGPR:$elts, brtarget:$target), - 4, IIC_Br, []>, + 8, IIC_Br, []>, Sched<[WriteBr]>; def t2LoopEnd : @@ -5233,7 +5254,7 @@ def t2LoopEnd : } // end isNotDuplicable class CS<string iname, bits<4> opcode, list<dag> pattern=[]> - : V8_1MI<(outs rGPR:$Rd), (ins GPRwithZR:$Rn, GPRwithZRnosp:$Rm, pred_noal:$fcond), + : V8_1MI<(outs rGPR:$Rd), (ins GPRwithZRnosp:$Rn, GPRwithZRnosp:$Rm, pred_noal:$fcond), AddrModeNone, NoItinerary, iname, "$Rd, $Rn, $Rm, $fcond", "", pattern> { bits<4> Rd; bits<4> Rm; @@ -5255,6 +5276,25 @@ def t2CSINC : CS<"csinc", 0b1001>; def t2CSINV : CS<"csinv", 0b1010>; def t2CSNEG : CS<"csneg", 0b1011>; +let Predicates = [HasV8_1MMainline] in { + def : T2Pat<(ARMcsinc GPRwithZR:$tval, GPRwithZR:$fval, imm0_31:$imm), + (t2CSINC GPRwithZR:$tval, GPRwithZR:$fval, imm0_31:$imm)>; + def : T2Pat<(ARMcsinv GPRwithZR:$tval, GPRwithZR:$fval, imm0_31:$imm), + (t2CSINV GPRwithZR:$tval, GPRwithZR:$fval, imm0_31:$imm)>; + def : T2Pat<(ARMcsneg GPRwithZR:$tval, GPRwithZR:$fval, imm0_31:$imm), + (t2CSNEG GPRwithZR:$tval, GPRwithZR:$fval, imm0_31:$imm)>; + + multiclass ModifiedV8_1CSEL<Instruction Insn, dag modvalue> { + def : T2Pat<(ARMcmov modvalue, GPRwithZR:$tval, cmovpred:$imm), + (Insn GPRwithZR:$tval, GPRwithZR:$fval, imm0_31:$imm)>; + def : T2Pat<(ARMcmov GPRwithZR:$tval, modvalue, cmovpred:$imm), + (Insn GPRwithZR:$tval, GPRwithZR:$fval, + (i32 (inv_cond_XFORM imm:$imm)))>; + } + defm : ModifiedV8_1CSEL<t2CSINC, (add rGPR:$fval, 1)>; + defm : ModifiedV8_1CSEL<t2CSINV, (xor rGPR:$fval, -1)>; + defm : ModifiedV8_1CSEL<t2CSNEG, (sub 0, rGPR:$fval)>; +} // CS aliases. 
let Predicates = [HasV8_1MMainline] in { diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td index a0dd25de07ee..fdd961bfbb2f 100644 --- a/lib/Target/ARM/ARMInstrVFP.td +++ b/lib/Target/ARM/ARMInstrVFP.td @@ -10,7 +10,7 @@ // //===----------------------------------------------------------------------===// -def SDT_CMPFP0 : SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisVT<1, i32>]>; +def SDT_CMPFP0 : SDTypeProfile<0, 1, [SDTCisFP<0>]>; def SDT_VMOVDRR : SDTypeProfile<1, 2, [SDTCisVT<0, f64>, SDTCisVT<1, i32>, SDTCisSameAs<1, 2>]>; def SDT_VMOVRRD : SDTypeProfile<2, 1, [SDTCisVT<0, i32>, SDTCisSameAs<0, 1>, @@ -19,7 +19,7 @@ def SDT_VMOVRRD : SDTypeProfile<2, 1, [SDTCisVT<0, i32>, SDTCisSameAs<0, 1>, def SDT_VMOVSR : SDTypeProfile<1, 1, [SDTCisVT<0, f32>, SDTCisVT<1, i32>]>; def arm_fmstat : SDNode<"ARMISD::FMSTAT", SDTNone, [SDNPInGlue, SDNPOutGlue]>; -def arm_cmpfp : SDNode<"ARMISD::CMPFP", SDT_ARMFCmp, [SDNPOutGlue]>; +def arm_cmpfp : SDNode<"ARMISD::CMPFP", SDT_ARMCmp, [SDNPOutGlue]>; def arm_cmpfp0 : SDNode<"ARMISD::CMPFPw0", SDT_CMPFP0, [SDNPOutGlue]>; def arm_fmdrr : SDNode<"ARMISD::VMOVDRR", SDT_VMOVDRR>; def arm_fmrrd : SDNode<"ARMISD::VMOVRRD", SDT_VMOVRRD>; @@ -324,7 +324,7 @@ defm : VFPDTAnyInstAlias<"vpop${p}", "$r", // However, there is no UAL syntax for them, so we keep them around for // (dis)assembly only. multiclass vfp_ldstx_mult<string asm, bit L_bit> { - let Predicates = [HasFPRegs] in { + let Predicates = [HasFPRegs], hasNoSchedulingInfo = 1 in { // Unknown precision def XIA : AXXI4<(outs), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs, variable_ops), @@ -548,12 +548,12 @@ let Defs = [FPSCR_NZCV] in { def VCMPED : ADuI<0b11101, 0b11, 0b0100, 0b11, 0, (outs), (ins DPR:$Dd, DPR:$Dm), IIC_fpCMP64, "vcmpe", ".f64\t$Dd, $Dm", - [(arm_cmpfp DPR:$Dd, (f64 DPR:$Dm), (i32 1))]>; + [/* For disassembly only; pattern left blank */]>; def VCMPES : ASuI<0b11101, 0b11, 0b0100, 0b11, 0, (outs), (ins SPR:$Sd, SPR:$Sm), IIC_fpCMP32, "vcmpe", ".f32\t$Sd, $Sm", - [(arm_cmpfp SPR:$Sd, SPR:$Sm, (i32 1))]> { + [/* For disassembly only; pattern left blank */]> { // Some single precision VFP instructions may be executed on both NEON and // VFP pipelines on A8. let D = VFPNeonA8Domain; @@ -562,17 +562,17 @@ def VCMPES : ASuI<0b11101, 0b11, 0b0100, 0b11, 0, def VCMPEH : AHuI<0b11101, 0b11, 0b0100, 0b11, 0, (outs), (ins HPR:$Sd, HPR:$Sm), IIC_fpCMP16, "vcmpe", ".f16\t$Sd, $Sm", - [(arm_cmpfp HPR:$Sd, HPR:$Sm, (i32 1))]>; + [/* For disassembly only; pattern left blank */]>; def VCMPD : ADuI<0b11101, 0b11, 0b0100, 0b01, 0, (outs), (ins DPR:$Dd, DPR:$Dm), IIC_fpCMP64, "vcmp", ".f64\t$Dd, $Dm", - [(arm_cmpfp DPR:$Dd, (f64 DPR:$Dm), (i32 0))]>; + [(arm_cmpfp DPR:$Dd, (f64 DPR:$Dm))]>; def VCMPS : ASuI<0b11101, 0b11, 0b0100, 0b01, 0, (outs), (ins SPR:$Sd, SPR:$Sm), IIC_fpCMP32, "vcmp", ".f32\t$Sd, $Sm", - [(arm_cmpfp SPR:$Sd, SPR:$Sm, (i32 0))]> { + [(arm_cmpfp SPR:$Sd, SPR:$Sm)]> { // Some single precision VFP instructions may be executed on both NEON and // VFP pipelines on A8. 
let D = VFPNeonA8Domain; @@ -581,7 +581,7 @@ def VCMPS : ASuI<0b11101, 0b11, 0b0100, 0b01, 0, def VCMPH : AHuI<0b11101, 0b11, 0b0100, 0b01, 0, (outs), (ins HPR:$Sd, HPR:$Sm), IIC_fpCMP16, "vcmp", ".f16\t$Sd, $Sm", - [(arm_cmpfp HPR:$Sd, HPR:$Sm, (i32 0))]>; + [(arm_cmpfp HPR:$Sd, HPR:$Sm)]>; } // Defs = [FPSCR_NZCV] //===----------------------------------------------------------------------===// @@ -611,7 +611,7 @@ let Defs = [FPSCR_NZCV] in { def VCMPEZD : ADuI<0b11101, 0b11, 0b0101, 0b11, 0, (outs), (ins DPR:$Dd), IIC_fpCMP64, "vcmpe", ".f64\t$Dd, #0", - [(arm_cmpfp0 (f64 DPR:$Dd), (i32 1))]> { + [/* For disassembly only; pattern left blank */]> { let Inst{3-0} = 0b0000; let Inst{5} = 0; } @@ -619,7 +619,7 @@ def VCMPEZD : ADuI<0b11101, 0b11, 0b0101, 0b11, 0, def VCMPEZS : ASuI<0b11101, 0b11, 0b0101, 0b11, 0, (outs), (ins SPR:$Sd), IIC_fpCMP32, "vcmpe", ".f32\t$Sd, #0", - [(arm_cmpfp0 SPR:$Sd, (i32 1))]> { + [/* For disassembly only; pattern left blank */]> { let Inst{3-0} = 0b0000; let Inst{5} = 0; @@ -631,7 +631,7 @@ def VCMPEZS : ASuI<0b11101, 0b11, 0b0101, 0b11, 0, def VCMPEZH : AHuI<0b11101, 0b11, 0b0101, 0b11, 0, (outs), (ins HPR:$Sd), IIC_fpCMP16, "vcmpe", ".f16\t$Sd, #0", - [(arm_cmpfp0 HPR:$Sd, (i32 1))]> { + [/* For disassembly only; pattern left blank */]> { let Inst{3-0} = 0b0000; let Inst{5} = 0; } @@ -639,7 +639,7 @@ def VCMPEZH : AHuI<0b11101, 0b11, 0b0101, 0b11, 0, def VCMPZD : ADuI<0b11101, 0b11, 0b0101, 0b01, 0, (outs), (ins DPR:$Dd), IIC_fpCMP64, "vcmp", ".f64\t$Dd, #0", - [(arm_cmpfp0 (f64 DPR:$Dd), (i32 0))]> { + [(arm_cmpfp0 (f64 DPR:$Dd))]> { let Inst{3-0} = 0b0000; let Inst{5} = 0; } @@ -647,7 +647,7 @@ def VCMPZD : ADuI<0b11101, 0b11, 0b0101, 0b01, 0, def VCMPZS : ASuI<0b11101, 0b11, 0b0101, 0b01, 0, (outs), (ins SPR:$Sd), IIC_fpCMP32, "vcmp", ".f32\t$Sd, #0", - [(arm_cmpfp0 SPR:$Sd, (i32 0))]> { + [(arm_cmpfp0 SPR:$Sd)]> { let Inst{3-0} = 0b0000; let Inst{5} = 0; @@ -659,7 +659,7 @@ def VCMPZS : ASuI<0b11101, 0b11, 0b0101, 0b01, 0, def VCMPZH : AHuI<0b11101, 0b11, 0b0101, 0b01, 0, (outs), (ins HPR:$Sd), IIC_fpCMP16, "vcmp", ".f16\t$Sd, #0", - [(arm_cmpfp0 HPR:$Sd, (i32 0))]> { + [(arm_cmpfp0 HPR:$Sd)]> { let Inst{3-0} = 0b0000; let Inst{5} = 0; } @@ -1732,7 +1732,8 @@ def VTOSHS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1110, 0b1010, 0, def VTOUHS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1111, 0b1010, 0, (outs SPR:$dst), (ins SPR:$a, fbits16:$fbits), - IIC_fpCVTSI, "vcvt", ".u16.f32\t$dst, $a, $fbits", []> { + IIC_fpCVTSI, "vcvt", ".u16.f32\t$dst, $a, $fbits", []>, + Sched<[WriteFPCVT]> { // Some single precision VFP instructions may be executed on both NEON and // VFP pipelines on A8. let D = VFPNeonA8Domain; @@ -1740,7 +1741,8 @@ def VTOUHS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1111, 0b1010, 0, def VTOSLS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1110, 0b1010, 1, (outs SPR:$dst), (ins SPR:$a, fbits32:$fbits), - IIC_fpCVTSI, "vcvt", ".s32.f32\t$dst, $a, $fbits", []> { + IIC_fpCVTSI, "vcvt", ".s32.f32\t$dst, $a, $fbits", []>, + Sched<[WriteFPCVT]> { // Some single precision VFP instructions may be executed on both NEON and // VFP pipelines on A8. 
let D = VFPNeonA8Domain; @@ -1748,7 +1750,8 @@ def VTOSLS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1110, 0b1010, 1, def VTOULS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1111, 0b1010, 1, (outs SPR:$dst), (ins SPR:$a, fbits32:$fbits), - IIC_fpCVTSI, "vcvt", ".u32.f32\t$dst, $a, $fbits", []> { + IIC_fpCVTSI, "vcvt", ".u32.f32\t$dst, $a, $fbits", []>, + Sched<[WriteFPCVT]> { // Some single precision VFP instructions may be executed on both NEON and // VFP pipelines on A8. let D = VFPNeonA8Domain; @@ -2297,6 +2300,8 @@ class MovFromVFP<bits<4> opc19_16, dag oops, dag iops, string opc, string asm, let Inst{6-5} = 0b00; let Inst{4} = 1; let Inst{3-0} = 0b0000; + let Unpredictable{7-5} = 0b111; + let Unpredictable{3-0} = 0b1111; } let DecoderMethod = "DecodeForVMRSandVMSR" in { @@ -2370,63 +2375,65 @@ class MovToVFP<bits<4> opc19_16, dag oops, dag iops, string opc, string asm, VFPAI<oops, iops, VFPMiscFrm, IIC_fpSTAT, opc, asm, pattern> { // Instruction operand. - bits<4> src; - - // Encode instruction operand. - let Inst{15-12} = src; + bits<4> Rt; let Inst{27-20} = 0b11101110; let Inst{19-16} = opc19_16; + let Inst{15-12} = Rt; let Inst{11-8} = 0b1010; let Inst{7} = 0; + let Inst{6-5} = 0b00; let Inst{4} = 1; + let Inst{3-0} = 0b0000; let Predicates = [HasVFP2]; + let Unpredictable{7-5} = 0b111; + let Unpredictable{3-0} = 0b1111; } let DecoderMethod = "DecodeForVMRSandVMSR" in { let Defs = [FPSCR] in { let Predicates = [HasFPRegs] in // Application level GPR -> FPSCR - def VMSR : MovToVFP<0b0001 /* fpscr */, (outs), (ins GPRnopc:$src), - "vmsr", "\tfpscr, $src", - [(int_arm_set_fpscr GPRnopc:$src)]>; + def VMSR : MovToVFP<0b0001 /* fpscr */, (outs), (ins GPRnopc:$Rt), + "vmsr", "\tfpscr, $Rt", + [(int_arm_set_fpscr GPRnopc:$Rt)]>; // System level GPR -> FPEXC - def VMSR_FPEXC : MovToVFP<0b1000 /* fpexc */, (outs), (ins GPRnopc:$src), - "vmsr", "\tfpexc, $src", []>; + def VMSR_FPEXC : MovToVFP<0b1000 /* fpexc */, (outs), (ins GPRnopc:$Rt), + "vmsr", "\tfpexc, $Rt", []>; // System level GPR -> FPSID - def VMSR_FPSID : MovToVFP<0b0000 /* fpsid */, (outs), (ins GPRnopc:$src), - "vmsr", "\tfpsid, $src", []>; - def VMSR_FPINST : MovToVFP<0b1001 /* fpinst */, (outs), (ins GPRnopc:$src), - "vmsr", "\tfpinst, $src", []>; - def VMSR_FPINST2 : MovToVFP<0b1010 /* fpinst2 */, (outs), (ins GPRnopc:$src), - "vmsr", "\tfpinst2, $src", []>; + def VMSR_FPSID : MovToVFP<0b0000 /* fpsid */, (outs), (ins GPRnopc:$Rt), + "vmsr", "\tfpsid, $Rt", []>; + def VMSR_FPINST : MovToVFP<0b1001 /* fpinst */, (outs), (ins GPRnopc:$Rt), + "vmsr", "\tfpinst, $Rt", []>; + def VMSR_FPINST2 : MovToVFP<0b1010 /* fpinst2 */, (outs), (ins GPRnopc:$Rt), + "vmsr", "\tfpinst2, $Rt", []>; } let Predicates = [HasV8_1MMainline, Has8MSecExt] in { // System level GPR -> FPSCR with context saving for security extensions - def VMSR_FPCXTNS : MovToVFP<0b1110 /* fpcxtns */, (outs), (ins GPR:$src), - "vmsr", "\tfpcxtns, $src", []>; + def VMSR_FPCXTNS : MovToVFP<0b1110 /* fpcxtns */, (outs), (ins GPR:$Rt), + "vmsr", "\tfpcxtns, $Rt", []>; } let Predicates = [HasV8_1MMainline, Has8MSecExt] in { // System level GPR -> FPSCR with context saving for security extensions - def VMSR_FPCXTS : MovToVFP<0b1111 /* fpcxts */, (outs), (ins GPR:$src), - "vmsr", "\tfpcxts, $src", []>; + def VMSR_FPCXTS : MovToVFP<0b1111 /* fpcxts */, (outs), (ins GPR:$Rt), + "vmsr", "\tfpcxts, $Rt", []>; } let Predicates = [HasV8_1MMainline, HasFPRegs] in { // System level GPR -> FPSCR_NZCVQC def VMSR_FPSCR_NZCVQC : MovToVFP<0b0010 /* fpscr_nzcvqc */, - (outs cl_FPSCR_NZCV:$fpscr_out), 
(ins GPR:$src), - "vmsr", "\tfpscr_nzcvqc, $src", []>; + (outs cl_FPSCR_NZCV:$fpscr_out), (ins GPR:$Rt), + "vmsr", "\tfpscr_nzcvqc, $Rt", []>; } let Predicates = [HasV8_1MMainline, HasMVEInt] in { // System level GPR -> VPR/P0 let Defs = [VPR] in - def VMSR_VPR : MovToVFP<0b1100 /* vpr */, (outs), (ins GPR:$src), - "vmsr", "\tvpr, $src", []>; + def VMSR_VPR : MovToVFP<0b1100 /* vpr */, (outs), (ins GPR:$Rt), + "vmsr", "\tvpr, $Rt", []>; - def VMSR_P0 : MovToVFP<0b1101 /* p0 */, (outs VCCR:$cond), (ins GPR:$src), - "vmsr", "\tp0, $src", []>; + def VMSR_P0 : MovToVFP<0b1101 /* p0 */, (outs VCCR:$cond), (ins GPR:$Rt), + "vmsr", "\tp0, $Rt", []>; } } @@ -2614,7 +2621,8 @@ def VSCCLRMD : VFPXI<(outs), (ins pred:$p, fp_dreglist_with_vpr:$regs, variable_ let Inst{21-16} = 0b011111; let Inst{15-12} = regs{11-8}; let Inst{11-8} = 0b1011; - let Inst{7-0} = regs{7-0}; + let Inst{7-1} = regs{7-1}; + let Inst{0} = 0; let DecoderMethod = "DecodeVSCCLRM"; diff --git a/lib/Target/ARM/ARMInstructionSelector.cpp b/lib/Target/ARM/ARMInstructionSelector.cpp index 4485a474a6df..8e5e474c0f59 100644 --- a/lib/Target/ARM/ARMInstructionSelector.cpp +++ b/lib/Target/ARM/ARMInstructionSelector.cpp @@ -34,7 +34,7 @@ public: ARMInstructionSelector(const ARMBaseTargetMachine &TM, const ARMSubtarget &STI, const ARMRegisterBankInfo &RBI); - bool select(MachineInstr &I, CodeGenCoverage &CoverageInfo) const override; + bool select(MachineInstr &I) override; static const char *getName() { return DEBUG_TYPE; } private: @@ -210,8 +210,8 @@ static const TargetRegisterClass *guessRegClass(unsigned Reg, static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII, MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, const RegisterBankInfo &RBI) { - unsigned DstReg = I.getOperand(0).getReg(); - if (TargetRegisterInfo::isPhysicalRegister(DstReg)) + Register DstReg = I.getOperand(0).getReg(); + if (Register::isPhysicalRegister(DstReg)) return true; const TargetRegisterClass *RC = guessRegClass(DstReg, MRI, TRI, RBI); @@ -236,17 +236,17 @@ static bool selectMergeValues(MachineInstrBuilder &MIB, // We only support G_MERGE_VALUES as a way to stick together two scalar GPRs // into one DPR. - unsigned VReg0 = MIB->getOperand(0).getReg(); + Register VReg0 = MIB->getOperand(0).getReg(); (void)VReg0; assert(MRI.getType(VReg0).getSizeInBits() == 64 && RBI.getRegBank(VReg0, MRI, TRI)->getID() == ARM::FPRRegBankID && "Unsupported operand for G_MERGE_VALUES"); - unsigned VReg1 = MIB->getOperand(1).getReg(); + Register VReg1 = MIB->getOperand(1).getReg(); (void)VReg1; assert(MRI.getType(VReg1).getSizeInBits() == 32 && RBI.getRegBank(VReg1, MRI, TRI)->getID() == ARM::GPRRegBankID && "Unsupported operand for G_MERGE_VALUES"); - unsigned VReg2 = MIB->getOperand(2).getReg(); + Register VReg2 = MIB->getOperand(2).getReg(); (void)VReg2; assert(MRI.getType(VReg2).getSizeInBits() == 32 && RBI.getRegBank(VReg2, MRI, TRI)->getID() == ARM::GPRRegBankID && @@ -268,17 +268,17 @@ static bool selectUnmergeValues(MachineInstrBuilder &MIB, // We only support G_UNMERGE_VALUES as a way to break up one DPR into two // GPRs. 
- unsigned VReg0 = MIB->getOperand(0).getReg(); + Register VReg0 = MIB->getOperand(0).getReg(); (void)VReg0; assert(MRI.getType(VReg0).getSizeInBits() == 32 && RBI.getRegBank(VReg0, MRI, TRI)->getID() == ARM::GPRRegBankID && "Unsupported operand for G_UNMERGE_VALUES"); - unsigned VReg1 = MIB->getOperand(1).getReg(); + Register VReg1 = MIB->getOperand(1).getReg(); (void)VReg1; assert(MRI.getType(VReg1).getSizeInBits() == 32 && RBI.getRegBank(VReg1, MRI, TRI)->getID() == ARM::GPRRegBankID && "Unsupported operand for G_UNMERGE_VALUES"); - unsigned VReg2 = MIB->getOperand(2).getReg(); + Register VReg2 = MIB->getOperand(2).getReg(); (void)VReg2; assert(MRI.getType(VReg2).getSizeInBits() == 64 && RBI.getRegBank(VReg2, MRI, TRI)->getID() == ARM::FPRRegBankID && @@ -833,8 +833,7 @@ void ARMInstructionSelector::renderVFPF64Imm( NewInstBuilder.addImm(FPImmEncoding); } -bool ARMInstructionSelector::select(MachineInstr &I, - CodeGenCoverage &CoverageInfo) const { +bool ARMInstructionSelector::select(MachineInstr &I) { assert(I.getParent() && "Instruction should be in a basic block!"); assert(I.getParent()->getParent() && "Instruction should be in a function!"); @@ -851,7 +850,7 @@ bool ARMInstructionSelector::select(MachineInstr &I, using namespace TargetOpcode; - if (selectImpl(I, CoverageInfo)) + if (selectImpl(I, *CoverageInfo)) return true; MachineInstrBuilder MIB{MF, I}; @@ -874,10 +873,10 @@ bool ARMInstructionSelector::select(MachineInstr &I, MIB.addImm(1).add(predOps(ARMCC::AL)).add(condCodeOp()); if (isSExt) { - unsigned SExtResult = I.getOperand(0).getReg(); + Register SExtResult = I.getOperand(0).getReg(); // Use a new virtual register for the result of the AND - unsigned AndResult = MRI.createVirtualRegister(&ARM::GPRRegClass); + Register AndResult = MRI.createVirtualRegister(&ARM::GPRRegClass); I.getOperand(0).setReg(AndResult); auto InsertBefore = std::next(I.getIterator()); @@ -928,7 +927,7 @@ bool ARMInstructionSelector::select(MachineInstr &I, assert(MRI.getType(SrcReg).getSizeInBits() == 64 && "Unsupported size"); assert(MRI.getType(DstReg).getSizeInBits() <= 32 && "Unsupported size"); - unsigned IgnoredBits = MRI.createVirtualRegister(&ARM::GPRRegClass); + Register IgnoredBits = MRI.createVirtualRegister(&ARM::GPRRegClass); auto InsertBefore = std::next(I.getIterator()); auto MovI = BuildMI(MBB, InsertBefore, I.getDebugLoc(), TII.get(ARM::VMOVRRD)) @@ -1039,7 +1038,7 @@ bool ARMInstructionSelector::select(MachineInstr &I, case G_FCMP: { assert(STI.hasVFP2Base() && "Can't select fcmp without VFP"); - unsigned OpReg = I.getOperand(2).getReg(); + Register OpReg = I.getOperand(2).getReg(); unsigned Size = MRI.getType(OpReg).getSizeInBits(); if (Size == 64 && !STI.hasFP64()) { @@ -1077,12 +1076,12 @@ bool ARMInstructionSelector::select(MachineInstr &I, case G_STORE: case G_LOAD: { const auto &MemOp = **I.memoperands_begin(); - if (MemOp.getOrdering() != AtomicOrdering::NotAtomic) { + if (MemOp.isAtomic()) { LLVM_DEBUG(dbgs() << "Atomic load/store not supported yet\n"); return false; } - unsigned Reg = I.getOperand(0).getReg(); + Register Reg = I.getOperand(0).getReg(); unsigned RegBank = RBI.getRegBank(Reg, MRI, TRI)->getID(); LLT ValTy = MRI.getType(Reg); @@ -1097,9 +1096,9 @@ bool ARMInstructionSelector::select(MachineInstr &I, if (ValSize == 1 && NewOpc == Opcodes.STORE8) { // Before storing a 1-bit value, make sure to clear out any unneeded bits. 
- unsigned OriginalValue = I.getOperand(0).getReg(); + Register OriginalValue = I.getOperand(0).getReg(); - unsigned ValueToStore = MRI.createVirtualRegister(&ARM::GPRRegClass); + Register ValueToStore = MRI.createVirtualRegister(&ARM::GPRRegClass); I.getOperand(0).setReg(ValueToStore); auto InsertBefore = I.getIterator(); @@ -1159,7 +1158,7 @@ bool ARMInstructionSelector::select(MachineInstr &I, case G_PHI: { I.setDesc(TII.get(PHI)); - unsigned DstReg = I.getOperand(0).getReg(); + Register DstReg = I.getOperand(0).getReg(); const TargetRegisterClass *RC = guessRegClass(DstReg, MRI, TRI, RBI); if (!RBI.constrainGenericRegister(DstReg, *RC, MRI)) { break; diff --git a/lib/Target/ARM/ARMLegalizerInfo.cpp b/lib/Target/ARM/ARMLegalizerInfo.cpp index 73a57b297ad6..81414e6d76fe 100644 --- a/lib/Target/ARM/ARMLegalizerInfo.cpp +++ b/lib/Target/ARM/ARMLegalizerInfo.cpp @@ -84,6 +84,8 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) { getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) .legalForCartesianProduct({s8, s16, s32}, {s1, s8, s16}); + getActionDefinitionsBuilder(G_SEXT_INREG).lower(); + getActionDefinitionsBuilder({G_MUL, G_AND, G_OR, G_XOR}) .legalFor({s32}) .minScalar(0, s32); diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index 90a1ce238c3f..4a193fed04a3 100644 --- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -509,7 +509,7 @@ void ARMLoadStoreOpt::UpdateBaseRegUses(MachineBasicBlock &MBB, Offset = MO.getImm() - WordOffset * getImmScale(Opc); // If storing the base register, it needs to be reset first. - unsigned InstrSrcReg = getLoadStoreRegOp(*MBBI).getReg(); + Register InstrSrcReg = getLoadStoreRegOp(*MBBI).getReg(); if (Offset >= 0 && !(IsStore && InstrSrcReg == Base)) MO.setImm(Offset); @@ -859,7 +859,7 @@ MachineInstr *ARMLoadStoreOpt::MergeOpsUpdate(const MergeCandidate &Cand) { // Determine list of registers and list of implicit super-register defs. for (const MachineInstr *MI : Cand.Instrs) { const MachineOperand &MO = getLoadStoreRegOp(*MI); - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); bool IsKill = MO.isKill(); if (IsKill) KilledRegs.insert(Reg); @@ -874,7 +874,7 @@ MachineInstr *ARMLoadStoreOpt::MergeOpsUpdate(const MergeCandidate &Cand) { if (!MO.isReg() || !MO.isDef() || MO.isDead()) continue; assert(MO.isImplicit()); - unsigned DefReg = MO.getReg(); + Register DefReg = MO.getReg(); if (is_contained(ImpDefs, DefReg)) continue; @@ -893,7 +893,7 @@ MachineInstr *ARMLoadStoreOpt::MergeOpsUpdate(const MergeCandidate &Cand) { iterator InsertBefore = std::next(iterator(LatestMI)); MachineBasicBlock &MBB = *LatestMI->getParent(); unsigned Offset = getMemoryOpOffset(*First); - unsigned Base = getLoadStoreBaseOp(*First).getReg(); + Register Base = getLoadStoreBaseOp(*First).getReg(); bool BaseKill = LatestMI->killsRegister(Base); unsigned PredReg = 0; ARMCC::CondCodes Pred = getInstrPredicate(*First, PredReg); @@ -1005,7 +1005,7 @@ void ARMLoadStoreOpt::FormCandidates(const MemOpQueue &MemOps) { const MachineInstr *MI = MemOps[SIndex].MI; int Offset = MemOps[SIndex].Offset; const MachineOperand &PMO = getLoadStoreRegOp(*MI); - unsigned PReg = PMO.getReg(); + Register PReg = PMO.getReg(); unsigned PRegNum = PMO.isUndef() ? 
std::numeric_limits<unsigned>::max() : TRI->getEncodingValue(PReg); unsigned Latest = SIndex; @@ -1052,7 +1052,7 @@ void ARMLoadStoreOpt::FormCandidates(const MemOpQueue &MemOps) { if (NewOffset != Offset + (int)Size) break; const MachineOperand &MO = getLoadStoreRegOp(*MemOps[I].MI); - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (Reg == ARM::SP || Reg == ARM::PC) break; if (Count == Limit) @@ -1261,7 +1261,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineInstr *MI) { if (isThumb1) return false; const MachineOperand &BaseOP = MI->getOperand(0); - unsigned Base = BaseOP.getReg(); + Register Base = BaseOP.getReg(); bool BaseKill = BaseOP.isKill(); unsigned PredReg = 0; ARMCC::CondCodes Pred = getInstrPredicate(*MI, PredReg); @@ -1387,7 +1387,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) { // FIXME: Use LDM/STM with single register instead. if (isThumb1) return false; - unsigned Base = getLoadStoreBaseOp(*MI).getReg(); + Register Base = getLoadStoreBaseOp(*MI).getReg(); bool BaseKill = getLoadStoreBaseOp(*MI).isKill(); unsigned Opcode = MI->getOpcode(); DebugLoc DL = MI->getDebugLoc(); @@ -1512,7 +1512,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSDouble(MachineInstr &MI) const { // Behaviour for writeback is undefined if base register is the same as one // of the others. const MachineOperand &BaseOp = MI.getOperand(2); - unsigned Base = BaseOp.getReg(); + Register Base = BaseOp.getReg(); const MachineOperand &Reg0Op = MI.getOperand(0); const MachineOperand &Reg1Op = MI.getOperand(1); if (Reg0Op.getReg() == Base || Reg1Op.getReg() == Base) @@ -1655,9 +1655,9 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB, return false; const MachineOperand &BaseOp = MI->getOperand(2); - unsigned BaseReg = BaseOp.getReg(); - unsigned EvenReg = MI->getOperand(0).getReg(); - unsigned OddReg = MI->getOperand(1).getReg(); + Register BaseReg = BaseOp.getReg(); + Register EvenReg = MI->getOperand(0).getReg(); + Register OddReg = MI->getOperand(1).getReg(); unsigned EvenRegNum = TRI->getDwarfRegNum(EvenReg, false); unsigned OddRegNum = TRI->getDwarfRegNum(OddReg, false); @@ -1783,8 +1783,8 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { if (isMemoryOp(*MBBI)) { unsigned Opcode = MBBI->getOpcode(); const MachineOperand &MO = MBBI->getOperand(0); - unsigned Reg = MO.getReg(); - unsigned Base = getLoadStoreBaseOp(*MBBI).getReg(); + Register Reg = MO.getReg(); + Register Base = getLoadStoreBaseOp(*MBBI).getReg(); unsigned PredReg = 0; ARMCC::CondCodes Pred = getInstrPredicate(*MBBI, PredReg); int Offset = getMemoryOpOffset(*MBBI); @@ -2121,7 +2121,7 @@ static bool IsSafeAndProfitableToMove(bool isLd, unsigned Base, MachineOperand &MO = I->getOperand(j); if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (MO.isDef() && TRI->regsOverlap(Reg, Base)) return false; if (Reg != Base && !MemRegs.count(Reg)) @@ -2415,7 +2415,7 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) { int Opc = MI.getOpcode(); bool isLd = isLoadSingle(Opc); - unsigned Base = MI.getOperand(1).getReg(); + Register Base = MI.getOperand(1).getReg(); int Offset = getMemoryOpOffset(MI); bool StopHere = false; auto FindBases = [&] (Base2InstMap &Base2Ops, BaseVec &Bases) { diff --git a/lib/Target/ARM/ARMLowOverheadLoops.cpp b/lib/Target/ARM/ARMLowOverheadLoops.cpp index cedf3bd3c74e..e1c5a9c3e223 100644 --- a/lib/Target/ARM/ARMLowOverheadLoops.cpp +++ b/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -11,8 
+11,7 @@ /// The expectation is that the loop contains three pseudo instructions: /// - t2*LoopStart - placed in the preheader or pre-preheader. The do-loop /// form should be in the preheader, whereas the while form should be in the -/// preheaders only predecessor. TODO: Could DoLoopStart get moved into the -/// pre-preheader? +/// preheaders only predecessor. /// - t2LoopDec - placed within in the loop body. /// - t2LoopEnd - the loop latch terminator. /// @@ -35,6 +34,7 @@ using namespace llvm; namespace { class ARMLowOverheadLoops : public MachineFunctionPass { + MachineFunction *MF = nullptr; const ARMBaseInstrInfo *TII = nullptr; MachineRegisterInfo *MRI = nullptr; std::unique_ptr<ARMBasicBlockUtils> BBUtils = nullptr; @@ -52,17 +52,6 @@ namespace { bool runOnMachineFunction(MachineFunction &MF) override; - bool ProcessLoop(MachineLoop *ML); - - void RevertWhile(MachineInstr *MI) const; - - void RevertLoopDec(MachineInstr *MI) const; - - void RevertLoopEnd(MachineInstr *MI) const; - - void Expand(MachineLoop *ML, MachineInstr *Start, - MachineInstr *Dec, MachineInstr *End, bool Revert); - MachineFunctionProperties getRequiredProperties() const override { return MachineFunctionProperties().set( MachineFunctionProperties::Property::NoVRegs); @@ -71,36 +60,156 @@ namespace { StringRef getPassName() const override { return ARM_LOW_OVERHEAD_LOOPS_NAME; } + + private: + bool ProcessLoop(MachineLoop *ML); + + MachineInstr * IsSafeToDefineLR(MachineInstr *MI); + + bool RevertNonLoops(); + + void RevertWhile(MachineInstr *MI) const; + + bool RevertLoopDec(MachineInstr *MI, bool AllowFlags = false) const; + + void RevertLoopEnd(MachineInstr *MI, bool SkipCmp = false) const; + + void Expand(MachineLoop *ML, MachineInstr *Start, + MachineInstr *InsertPt, MachineInstr *Dec, + MachineInstr *End, bool Revert); + }; } - + char ARMLowOverheadLoops::ID = 0; INITIALIZE_PASS(ARMLowOverheadLoops, DEBUG_TYPE, ARM_LOW_OVERHEAD_LOOPS_NAME, false, false) -bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &MF) { - if (!static_cast<const ARMSubtarget&>(MF.getSubtarget()).hasLOB()) +bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) { + const ARMSubtarget &ST = static_cast<const ARMSubtarget&>(mf.getSubtarget()); + if (!ST.hasLOB()) return false; - LLVM_DEBUG(dbgs() << "ARM Loops on " << MF.getName() << " ------------- \n"); + MF = &mf; + LLVM_DEBUG(dbgs() << "ARM Loops on " << MF->getName() << " ------------- \n"); auto &MLI = getAnalysis<MachineLoopInfo>(); - MRI = &MF.getRegInfo(); - TII = static_cast<const ARMBaseInstrInfo*>( - MF.getSubtarget().getInstrInfo()); - BBUtils = std::unique_ptr<ARMBasicBlockUtils>(new ARMBasicBlockUtils(MF)); + MF->getProperties().set(MachineFunctionProperties::Property::TracksLiveness); + MRI = &MF->getRegInfo(); + TII = static_cast<const ARMBaseInstrInfo*>(ST.getInstrInfo()); + BBUtils = std::unique_ptr<ARMBasicBlockUtils>(new ARMBasicBlockUtils(*MF)); BBUtils->computeAllBlockSizes(); - BBUtils->adjustBBOffsetsAfter(&MF.front()); + BBUtils->adjustBBOffsetsAfter(&MF->front()); bool Changed = false; for (auto ML : MLI) { if (!ML->getParentLoop()) Changed |= ProcessLoop(ML); } + Changed |= RevertNonLoops(); return Changed; } +static bool IsLoopStart(MachineInstr &MI) { + return MI.getOpcode() == ARM::t2DoLoopStart || + MI.getOpcode() == ARM::t2WhileLoopStart; +} + +template<typename T> +static MachineInstr* SearchForDef(MachineInstr *Begin, T End, unsigned Reg) { + for(auto &MI : make_range(T(Begin), End)) { + for (auto &MO : MI.operands()) { + if 
(!MO.isReg() || !MO.isDef() || MO.getReg() != Reg) + continue; + return &MI; + } + } + return nullptr; +} + +static MachineInstr* SearchForUse(MachineInstr *Begin, + MachineBasicBlock::iterator End, + unsigned Reg) { + for(auto &MI : make_range(MachineBasicBlock::iterator(Begin), End)) { + for (auto &MO : MI.operands()) { + if (!MO.isReg() || !MO.isUse() || MO.getReg() != Reg) + continue; + return &MI; + } + } + return nullptr; +} + +// Is it safe to define LR with DLS/WLS? +// LR can defined if it is the operand to start, because it's the same value, +// or if it's going to be equivalent to the operand to Start. +MachineInstr *ARMLowOverheadLoops::IsSafeToDefineLR(MachineInstr *Start) { + + auto IsMoveLR = [](MachineInstr *MI, unsigned Reg) { + return MI->getOpcode() == ARM::tMOVr && + MI->getOperand(0).getReg() == ARM::LR && + MI->getOperand(1).getReg() == Reg && + MI->getOperand(2).getImm() == ARMCC::AL; + }; + + MachineBasicBlock *MBB = Start->getParent(); + unsigned CountReg = Start->getOperand(0).getReg(); + // Walk forward and backward in the block to find the closest instructions + // that define LR. Then also filter them out if they're not a mov lr. + MachineInstr *PredLRDef = SearchForDef(Start, MBB->rend(), ARM::LR); + if (PredLRDef && !IsMoveLR(PredLRDef, CountReg)) + PredLRDef = nullptr; + + MachineInstr *SuccLRDef = SearchForDef(Start, MBB->end(), ARM::LR); + if (SuccLRDef && !IsMoveLR(SuccLRDef, CountReg)) + SuccLRDef = nullptr; + + // We've either found one, two or none mov lr instructions... Now figure out + // if they are performing the equilvant mov that the Start instruction will. + // Do this by scanning forward and backward to see if there's a def of the + // register holding the count value. If we find a suitable def, return it as + // the insert point. Later, if InsertPt != Start, then we can remove the + // redundant instruction. + if (SuccLRDef) { + MachineBasicBlock::iterator End(SuccLRDef); + if (!SearchForDef(Start, End, CountReg)) { + return SuccLRDef; + } else + SuccLRDef = nullptr; + } + if (PredLRDef) { + MachineBasicBlock::reverse_iterator End(PredLRDef); + if (!SearchForDef(Start, End, CountReg)) { + return PredLRDef; + } else + PredLRDef = nullptr; + } + + // We can define LR because LR already contains the same value. + if (Start->getOperand(0).getReg() == ARM::LR) + return Start; + + // We've found no suitable LR def and Start doesn't use LR directly. Can we + // just define LR anyway? + const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); + LivePhysRegs LiveRegs(*TRI); + LiveRegs.addLiveOuts(*MBB); + + // Not if we've haven't found a suitable mov and LR is live out. + if (LiveRegs.contains(ARM::LR)) + return nullptr; + + // If LR is not live out, we can insert the instruction if nothing else + // uses LR after it. + if (!SearchForUse(Start, MBB->end(), ARM::LR)) + return Start; + + LLVM_DEBUG(dbgs() << "ARM Loops: Failed to find suitable insertion point for" + << " LR\n"); + return nullptr; +} + bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) { bool Changed = false; @@ -111,15 +220,10 @@ bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) { LLVM_DEBUG(dbgs() << "ARM Loops: Processing " << *ML); - auto IsLoopStart = [](MachineInstr &MI) { - return MI.getOpcode() == ARM::t2DoLoopStart || - MI.getOpcode() == ARM::t2WhileLoopStart; - }; - // Search the given block for a loop start instruction. If one isn't found, // and there's only one predecessor block, search that one too. 
std::function<MachineInstr*(MachineBasicBlock*)> SearchForStart = - [&IsLoopStart, &SearchForStart](MachineBasicBlock *MBB) -> MachineInstr* { + [&SearchForStart](MachineBasicBlock *MBB) -> MachineInstr* { for (auto &MI : *MBB) { if (IsLoopStart(MI)) return &MI; @@ -165,41 +269,62 @@ bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) { Dec = &MI; else if (MI.getOpcode() == ARM::t2LoopEnd) End = &MI; - else if (MI.getDesc().isCall()) + else if (IsLoopStart(MI)) + Start = &MI; + else if (MI.getDesc().isCall()) { // TODO: Though the call will require LE to execute again, does this // mean we should revert? Always executing LE hopefully should be // faster than performing a sub,cmp,br or even subs,br. Revert = true; + LLVM_DEBUG(dbgs() << "ARM Loops: Found call.\n"); + } - if (!Dec) + if (!Dec || End) continue; - // If we find that we load/store LR between LoopDec and LoopEnd, expect - // that the decremented value has been spilled to the stack. Because - // this value isn't actually going to be produced until the latch, by LE, - // we would need to generate a real sub. The value is also likely to be - // reloaded for use of LoopEnd - in which in case we'd need to perform - // an add because it gets negated again by LE! The other option is to - // then generate the other form of LE which doesn't perform the sub. - if (MI.mayLoad() || MI.mayStore()) - Revert = - MI.getOperand(0).isReg() && MI.getOperand(0).getReg() == ARM::LR; + // If we find that LR has been written or read between LoopDec and + // LoopEnd, expect that the decremented value is being used else where. + // Because this value isn't actually going to be produced until the + // latch, by LE, we would need to generate a real sub. The value is also + // likely to be copied/reloaded for use of LoopEnd - in which in case + // we'd need to perform an add because it gets subtracted again by LE! + // The other option is to then generate the other form of LE which doesn't + // perform the sub. + for (auto &MO : MI.operands()) { + if (MI.getOpcode() != ARM::t2LoopDec && MO.isReg() && + MO.getReg() == ARM::LR) { + LLVM_DEBUG(dbgs() << "ARM Loops: Found LR Use/Def: " << MI); + Revert = true; + break; + } + } } if (Dec && End && Revert) break; } + LLVM_DEBUG(if (Start) dbgs() << "ARM Loops: Found Loop Start: " << *Start; + if (Dec) dbgs() << "ARM Loops: Found Loop Dec: " << *Dec; + if (End) dbgs() << "ARM Loops: Found Loop End: " << *End;); + if (!Start && !Dec && !End) { LLVM_DEBUG(dbgs() << "ARM Loops: Not a low-overhead loop.\n"); return Changed; - } if (!(Start && Dec && End)) { - report_fatal_error("Failed to find all loop components"); + } else if (!(Start && Dec && End)) { + LLVM_DEBUG(dbgs() << "ARM Loops: Failed to find all loop components.\n"); + return false; } - if (!End->getOperand(1).isMBB() || - End->getOperand(1).getMBB() != ML->getHeader()) - report_fatal_error("Expected LoopEnd to target Loop Header"); + if (!End->getOperand(1).isMBB()) + report_fatal_error("Expected LoopEnd to target basic block"); + + // TODO Maybe there's cases where the target doesn't have to be the header, + // but for now be safe and revert. + if (End->getOperand(1).getMBB() != ML->getHeader()) { + LLVM_DEBUG(dbgs() << "ARM Loops: LoopEnd is not targetting header.\n"); + Revert = true; + } // The WLS and LE instructions have 12-bits for the label offset. WLS // requires a positive offset, while LE uses negative. 
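Before the offset checks that follow, it may help to see the loop shape the pass is working on. A minimal sketch, assuming a plain counted loop; the function below is illustrative and not taken from the commit, and the comments show roughly where the t2DoLoopStart/t2LoopDec/t2LoopEnd pseudos sit and what they revert to when the checks above fail.

#include <cstdint>

// Roughly, after the hardware-loop pseudos are inserted:
//   preheader: LR = trip count        (t2DoLoopStart, or t2WhileLoopStart
//                                      when the loop may run zero times)
//   body:      ...work...
//              LR = t2LoopDec LR, 1
//              t2LoopEnd LR, body     (branch back while LR != 0)
// If no safe insertion point for the LR definition is found, or the body
// reads/writes LR itself, the pass reverts to an ordinary sub/cmp/branch.
void scale3(int32_t *dst, const int32_t *src, uint32_t n) {
  for (uint32_t i = 0; i < n; ++i)
    dst[i] = src[i] * 3;
}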
@@ -216,41 +341,57 @@ bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) { Revert = true; } - LLVM_DEBUG(dbgs() << "ARM Loops:\n - Found Loop Start: " << *Start - << " - Found Loop Dec: " << *Dec - << " - Found Loop End: " << *End); + MachineInstr *InsertPt = Revert ? nullptr : IsSafeToDefineLR(Start); + if (!InsertPt) { + LLVM_DEBUG(dbgs() << "ARM Loops: Unable to find safe insertion point.\n"); + Revert = true; + } else + LLVM_DEBUG(dbgs() << "ARM Loops: Start insertion point: " << *InsertPt); - Expand(ML, Start, Dec, End, Revert); + Expand(ML, Start, InsertPt, Dec, End, Revert); return true; } // WhileLoopStart holds the exit block, so produce a cmp lr, 0 and then a // beq that branches to the exit branch. -// FIXME: Need to check that we're not trashing the CPSR when generating the -// cmp. We could also try to generate a cbz if the value in LR is also in +// TODO: We could also try to generate a cbz if the value in LR is also in // another low register. void ARMLowOverheadLoops::RevertWhile(MachineInstr *MI) const { LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp: " << *MI); MachineBasicBlock *MBB = MI->getParent(); MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2CMPri)); - MIB.addReg(ARM::LR); + MIB.add(MI->getOperand(0)); MIB.addImm(0); MIB.addImm(ARMCC::AL); - MIB.addReg(ARM::CPSR); + MIB.addReg(ARM::NoRegister); + + MachineBasicBlock *DestBB = MI->getOperand(1).getMBB(); + unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, 254) ? + ARM::tBcc : ARM::t2Bcc; - // TODO: Try to use tBcc instead - MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2Bcc)); + MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(BrOpc)); MIB.add(MI->getOperand(1)); // branch target MIB.addImm(ARMCC::EQ); // condition code MIB.addReg(ARM::CPSR); MI->eraseFromParent(); } -// TODO: Check flags so that we can possibly generate a tSubs or tSub. -void ARMLowOverheadLoops::RevertLoopDec(MachineInstr *MI) const { +bool ARMLowOverheadLoops::RevertLoopDec(MachineInstr *MI, + bool AllowFlags) const { LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to sub: " << *MI); MachineBasicBlock *MBB = MI->getParent(); + + // If nothing uses or defines CPSR between LoopDec and LoopEnd, use a t2SUBS. + bool SetFlags = false; + if (AllowFlags) { + if (auto *Def = SearchForDef(MI, MBB->end(), ARM::CPSR)) { + if (!SearchForUse(MI, MBB->end(), ARM::CPSR) && + Def->getOpcode() == ARM::t2LoopEnd) + SetFlags = true; + } + } + MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2SUBri)); MIB.addDef(ARM::LR); @@ -258,28 +399,39 @@ void ARMLowOverheadLoops::RevertLoopDec(MachineInstr *MI) const { MIB.add(MI->getOperand(2)); MIB.addImm(ARMCC::AL); MIB.addReg(0); - MIB.addReg(0); + + if (SetFlags) { + MIB.addReg(ARM::CPSR); + MIB->getOperand(5).setIsDef(true); + } else + MIB.addReg(0); + MI->eraseFromParent(); + return SetFlags; } // Generate a subs, or sub and cmp, and a branch instead of an LE. -// FIXME: Need to check that we're not trashing the CPSR when generating -// the cmp. 
-void ARMLowOverheadLoops::RevertLoopEnd(MachineInstr *MI) const { +void ARMLowOverheadLoops::RevertLoopEnd(MachineInstr *MI, bool SkipCmp) const { LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp, br: " << *MI); - // Create cmp MachineBasicBlock *MBB = MI->getParent(); - MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), - TII->get(ARM::t2CMPri)); - MIB.addReg(ARM::LR); - MIB.addImm(0); - MIB.addImm(ARMCC::AL); - MIB.addReg(ARM::CPSR); + // Create cmp + if (!SkipCmp) { + MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), + TII->get(ARM::t2CMPri)); + MIB.addReg(ARM::LR); + MIB.addImm(0); + MIB.addImm(ARMCC::AL); + MIB.addReg(ARM::NoRegister); + } + + MachineBasicBlock *DestBB = MI->getOperand(1).getMBB(); + unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, 254) ? + ARM::tBcc : ARM::t2Bcc; - // TODO Try to use tBcc instead. // Create bne - MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2Bcc)); + MachineInstrBuilder MIB = + BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(BrOpc)); MIB.add(MI->getOperand(1)); // branch target MIB.addImm(ARMCC::NE); // condition code MIB.addReg(ARM::CPSR); @@ -287,33 +439,13 @@ void ARMLowOverheadLoops::RevertLoopEnd(MachineInstr *MI) const { } void ARMLowOverheadLoops::Expand(MachineLoop *ML, MachineInstr *Start, + MachineInstr *InsertPt, MachineInstr *Dec, MachineInstr *End, bool Revert) { - auto ExpandLoopStart = [this](MachineLoop *ML, MachineInstr *Start) { - // The trip count should already been held in LR since the instructions - // within the loop can only read and write to LR. So, there should be a - // mov to setup the count. WLS/DLS perform this move, so find the original - // and delete it - inserting WLS/DLS in its place. - MachineBasicBlock *MBB = Start->getParent(); - MachineInstr *InsertPt = Start; - for (auto &I : MRI->def_instructions(ARM::LR)) { - if (I.getParent() != MBB) - continue; - - // Always execute. - if (!I.getOperand(2).isImm() || I.getOperand(2).getImm() != ARMCC::AL) - continue; - - // Only handle move reg, if the trip count it will need moving into a reg - // before the setup instruction anyway. - if (!I.getDesc().isMoveReg() || - !I.getOperand(1).isIdenticalTo(Start->getOperand(0))) - continue; - InsertPt = &I; - break; - } - + auto ExpandLoopStart = [this](MachineLoop *ML, MachineInstr *Start, + MachineInstr *InsertPt) { + MachineBasicBlock *MBB = InsertPt->getParent(); unsigned Opc = Start->getOpcode() == ARM::t2DoLoopStart ? 
ARM::t2DLS : ARM::t2WLS; MachineInstrBuilder MIB = @@ -369,16 +501,54 @@ void ARMLowOverheadLoops::Expand(MachineLoop *ML, MachineInstr *Start, RevertWhile(Start); else Start->eraseFromParent(); - RevertLoopDec(Dec); - RevertLoopEnd(End); + bool FlagsAlreadySet = RevertLoopDec(Dec, true); + RevertLoopEnd(End, FlagsAlreadySet); } else { - Start = ExpandLoopStart(ML, Start); + Start = ExpandLoopStart(ML, Start, InsertPt); RemoveDeadBranch(Start); End = ExpandLoopEnd(ML, Dec, End); RemoveDeadBranch(End); } } +bool ARMLowOverheadLoops::RevertNonLoops() { + LLVM_DEBUG(dbgs() << "ARM Loops: Reverting any remaining pseudos...\n"); + bool Changed = false; + + for (auto &MBB : *MF) { + SmallVector<MachineInstr*, 4> Starts; + SmallVector<MachineInstr*, 4> Decs; + SmallVector<MachineInstr*, 4> Ends; + + for (auto &I : MBB) { + if (IsLoopStart(I)) + Starts.push_back(&I); + else if (I.getOpcode() == ARM::t2LoopDec) + Decs.push_back(&I); + else if (I.getOpcode() == ARM::t2LoopEnd) + Ends.push_back(&I); + } + + if (Starts.empty() && Decs.empty() && Ends.empty()) + continue; + + Changed = true; + + for (auto *Start : Starts) { + if (Start->getOpcode() == ARM::t2WhileLoopStart) + RevertWhile(Start); + else + Start->eraseFromParent(); + } + for (auto *Dec : Decs) + RevertLoopDec(Dec); + + for (auto *End : Ends) + RevertLoopEnd(End); + } + return Changed; +} + FunctionPass *llvm::createARMLowOverheadLoopsPass() { return new ARMLowOverheadLoops(); } diff --git a/lib/Target/ARM/ARMMCInstLower.cpp b/lib/Target/ARM/ARMMCInstLower.cpp index 90c5ad025e56..c92689f4942e 100644 --- a/lib/Target/ARM/ARMMCInstLower.cpp +++ b/lib/Target/ARM/ARMMCInstLower.cpp @@ -74,8 +74,8 @@ bool ARMAsmPrinter::lowerOperand(const MachineOperand &MO, switch (MO.getType()) { default: llvm_unreachable("unknown operand type"); case MachineOperand::MO_Register: - // Ignore all non-CPSR implicit register operands. - if (MO.isImplicit() && MO.getReg() != ARM::CPSR) + // Ignore all implicit register operands. + if (MO.isImplicit()) return false; assert(!MO.getSubReg() && "Subregs should be eliminated!"); MCOp = MCOperand::createReg(MO.getReg()); diff --git a/lib/Target/ARM/ARMMachineFunctionInfo.h b/lib/Target/ARM/ARMMachineFunctionInfo.h index 90d794cd27b1..bb136e92329b 100644 --- a/lib/Target/ARM/ARMMachineFunctionInfo.h +++ b/lib/Target/ARM/ARMMachineFunctionInfo.h @@ -16,6 +16,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/IR/GlobalVariable.h" #include "llvm/Support/ErrorHandling.h" #include <utility> @@ -130,6 +131,10 @@ class ARMFunctionInfo : public MachineFunctionInfo { /// The amount the literal pool has been increasedby due to promoted globals. int PromotedGlobalsIncrease = 0; + /// True if r0 will be preserved by a call to this function (e.g. C++ + /// con/destructors). 
+ bool PreservesR0 = false; + public: ARMFunctionInfo() = default; @@ -247,6 +252,9 @@ public: } DenseMap<unsigned, unsigned> EHPrologueRemappedRegs; + + void setPreservesR0() { PreservesR0 = true; } + bool getPreservesR0() const { return PreservesR0; } }; } // end namespace llvm diff --git a/lib/Target/ARM/ARMParallelDSP.cpp b/lib/Target/ARM/ARMParallelDSP.cpp index 5389d09bf7d7..ae5657a0a2c1 100644 --- a/lib/Target/ARM/ARMParallelDSP.cpp +++ b/lib/Target/ARM/ARMParallelDSP.cpp @@ -1,4 +1,4 @@ -//===- ParallelDSP.cpp - Parallel DSP Pass --------------------------------===// +//===- ARMParallelDSP.cpp - Parallel DSP Pass -----------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -18,13 +18,11 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/LoopAccessAnalysis.h" -#include "llvm/Analysis/LoopPass.h" -#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/OrderedBasicBlock.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/NoFolder.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Pass.h" #include "llvm/PassRegistry.h" #include "llvm/PassSupport.h" @@ -45,54 +43,39 @@ static cl::opt<bool> DisableParallelDSP("disable-arm-parallel-dsp", cl::Hidden, cl::init(false), cl::desc("Disable the ARM Parallel DSP pass")); +static cl::opt<unsigned> +NumLoadLimit("arm-parallel-dsp-load-limit", cl::Hidden, cl::init(16), + cl::desc("Limit the number of loads analysed")); + namespace { - struct OpChain; - struct BinOpChain; + struct MulCandidate; class Reduction; - using OpChainList = SmallVector<std::unique_ptr<OpChain>, 8>; - using ReductionList = SmallVector<Reduction, 8>; - using ValueList = SmallVector<Value*, 8>; - using MemInstList = SmallVector<LoadInst*, 8>; - using PMACPair = std::pair<BinOpChain*,BinOpChain*>; - using PMACPairList = SmallVector<PMACPair, 8>; - using Instructions = SmallVector<Instruction*,16>; - using MemLocList = SmallVector<MemoryLocation, 4>; + using MulCandList = SmallVector<std::unique_ptr<MulCandidate>, 8>; + using MemInstList = SmallVectorImpl<LoadInst*>; + using MulPairList = SmallVector<std::pair<MulCandidate*, MulCandidate*>, 8>; - struct OpChain { + // 'MulCandidate' holds the multiplication instructions that are candidates + // for parallel execution. + struct MulCandidate { Instruction *Root; - ValueList AllValues; - MemInstList VecLd; // List of all load instructions. - MemInstList Loads; + Value* LHS; + Value* RHS; + bool Exchange = false; bool ReadOnly = true; + bool Paired = false; + SmallVector<LoadInst*, 2> VecLd; // Container for loads to widen. - OpChain(Instruction *I, ValueList &vl) : Root(I), AllValues(vl) { } - virtual ~OpChain() = default; + MulCandidate(Instruction *I, Value *lhs, Value *rhs) : + Root(I), LHS(lhs), RHS(rhs) { } - void PopulateLoads() { - for (auto *V : AllValues) { - if (auto *Ld = dyn_cast<LoadInst>(V)) - Loads.push_back(Ld); - } + bool HasTwoLoadInputs() const { + return isa<LoadInst>(LHS) && isa<LoadInst>(RHS); } - unsigned size() const { return AllValues.size(); } - }; - - // 'BinOpChain' holds the multiplication instructions that are candidates - // for parallel execution. - struct BinOpChain : public OpChain { - ValueList LHS; // List of all (narrow) left hand operands. - ValueList RHS; // List of all (narrow) right hand operands. 
- bool Exchange = false; - - BinOpChain(Instruction *I, ValueList &lhs, ValueList &rhs) : - OpChain(I, lhs), LHS(lhs), RHS(rhs) { - for (auto *V : RHS) - AllValues.push_back(V); - } - - bool AreSymmetrical(BinOpChain *Other); + LoadInst *getBaseLoad() const { + return VecLd.front(); + } }; /// Represent a sequence of multiply-accumulate operations with the aim to @@ -100,9 +83,9 @@ namespace { class Reduction { Instruction *Root = nullptr; Value *Acc = nullptr; - OpChainList Muls; - PMACPairList MulPairs; - SmallPtrSet<Instruction*, 4> Adds; + MulCandList Muls; + MulPairList MulPairs; + SetVector<Instruction*> Adds; public: Reduction() = delete; @@ -112,10 +95,35 @@ namespace { /// Record an Add instruction that is a part of the this reduction. void InsertAdd(Instruction *I) { Adds.insert(I); } - /// Record a BinOpChain, rooted at a Mul instruction, that is a part of - /// this reduction. - void InsertMul(Instruction *I, ValueList &LHS, ValueList &RHS) { - Muls.push_back(make_unique<BinOpChain>(I, LHS, RHS)); + /// Create MulCandidates, each rooted at a Mul instruction, that is a part + /// of this reduction. + void InsertMuls() { + auto GetMulOperand = [](Value *V) -> Instruction* { + if (auto *SExt = dyn_cast<SExtInst>(V)) { + if (auto *I = dyn_cast<Instruction>(SExt->getOperand(0))) + if (I->getOpcode() == Instruction::Mul) + return I; + } else if (auto *I = dyn_cast<Instruction>(V)) { + if (I->getOpcode() == Instruction::Mul) + return I; + } + return nullptr; + }; + + auto InsertMul = [this](Instruction *I) { + Value *LHS = cast<Instruction>(I->getOperand(0))->getOperand(0); + Value *RHS = cast<Instruction>(I->getOperand(1))->getOperand(0); + Muls.push_back(std::make_unique<MulCandidate>(I, LHS, RHS)); + }; + + for (auto *Add : Adds) { + if (Add == Acc) + continue; + if (auto *Mul = GetMulOperand(Add->getOperand(0))) + InsertMul(Mul); + if (auto *Mul = GetMulOperand(Add->getOperand(1))) + InsertMul(Mul); + } } /// Add the incoming accumulator value, returns true if a value had not @@ -128,9 +136,17 @@ namespace { return true; } - /// Set two BinOpChains, rooted at muls, that can be executed as a single + /// Set two MulCandidates, rooted at muls, that can be executed as a single /// parallel operation. - void AddMulPair(BinOpChain *Mul0, BinOpChain *Mul1) { + void AddMulPair(MulCandidate *Mul0, MulCandidate *Mul1, + bool Exchange = false) { + LLVM_DEBUG(dbgs() << "Pairing:\n" + << *Mul0->Root << "\n" + << *Mul1->Root << "\n"); + Mul0->Paired = true; + Mul1->Paired = true; + if (Exchange) + Mul1->Exchange = true; MulPairs.push_back(std::make_pair(Mul0, Mul1)); } @@ -141,24 +157,40 @@ namespace { /// Return the add instruction which is the root of the reduction. Instruction *getRoot() { return Root; } + bool is64Bit() const { return Root->getType()->isIntegerTy(64); } + + Type *getType() const { return Root->getType(); } + /// Return the incoming value to be accumulated. This maybe null. Value *getAccumulator() { return Acc; } /// Return the set of adds that comprise the reduction. - SmallPtrSetImpl<Instruction*> &getAdds() { return Adds; } + SetVector<Instruction*> &getAdds() { return Adds; } - /// Return the BinOpChain, rooted at mul instruction, that comprise the + /// Return the MulCandidate, rooted at mul instruction, that comprise the /// the reduction. 
- OpChainList &getMuls() { return Muls; } + MulCandList &getMuls() { return Muls; } - /// Return the BinOpChain, rooted at mul instructions, that have been + /// Return the MulCandidate, rooted at mul instructions, that have been /// paired for parallel execution. - PMACPairList &getMulPairs() { return MulPairs; } + MulPairList &getMulPairs() { return MulPairs; } /// To finalise, replace the uses of the root with the intrinsic call. void UpdateRoot(Instruction *SMLAD) { Root->replaceAllUsesWith(SMLAD); } + + void dump() { + LLVM_DEBUG(dbgs() << "Reduction:\n"; + for (auto *Add : Adds) + LLVM_DEBUG(dbgs() << *Add << "\n"); + for (auto &Mul : Muls) + LLVM_DEBUG(dbgs() << *Mul->Root << "\n" + << " " << *Mul->LHS << "\n" + << " " << *Mul->RHS << "\n"); + LLVM_DEBUG(if (Acc) dbgs() << "Acc in: " << *Acc << "\n") + ); + } }; class WidenedLoad { @@ -176,13 +208,11 @@ namespace { } }; - class ARMParallelDSP : public LoopPass { + class ARMParallelDSP : public FunctionPass { ScalarEvolution *SE; AliasAnalysis *AA; TargetLibraryInfo *TLI; DominatorTree *DT; - LoopInfo *LI; - Loop *L; const DataLayout *DL; Module *M; std::map<LoadInst*, LoadInst*> LoadPairs; @@ -190,13 +220,12 @@ namespace { std::map<LoadInst*, std::unique_ptr<WidenedLoad>> WideLoads; template<unsigned> - bool IsNarrowSequence(Value *V, ValueList &VL); - + bool IsNarrowSequence(Value *V); + bool Search(Value *V, BasicBlock *BB, Reduction &R); bool RecordMemoryOps(BasicBlock *BB); void InsertParallelMACs(Reduction &Reduction); bool AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1, MemInstList &VecMem); - LoadInst* CreateWideLoad(SmallVectorImpl<LoadInst*> &Loads, - IntegerType *LoadTy); + LoadInst* CreateWideLoad(MemInstList &Loads, IntegerType *LoadTy); bool CreateParallelPairs(Reduction &R); /// Try to match and generate: SMLAD, SMLADX - Signed Multiply Accumulate @@ -204,60 +233,38 @@ namespace { /// products to a 32-bit accumulate operand. Optionally, the instruction can /// exchange the halfwords of the second operand before performing the /// arithmetic. 
- bool MatchSMLAD(Loop *L); + bool MatchSMLAD(Function &F); public: static char ID; - ARMParallelDSP() : LoopPass(ID) { } - - bool doInitialization(Loop *L, LPPassManager &LPM) override { - LoadPairs.clear(); - WideLoads.clear(); - return true; - } + ARMParallelDSP() : FunctionPass(ID) { } void getAnalysisUsage(AnalysisUsage &AU) const override { - LoopPass::getAnalysisUsage(AU); + FunctionPass::getAnalysisUsage(AU); AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<ScalarEvolutionWrapperPass>(); AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<TargetLibraryInfoWrapperPass>(); - AU.addRequired<LoopInfoWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<TargetPassConfig>(); - AU.addPreserved<LoopInfoWrapperPass>(); + AU.addPreserved<ScalarEvolutionWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); AU.setPreservesCFG(); } - bool runOnLoop(Loop *TheLoop, LPPassManager &) override { + bool runOnFunction(Function &F) override { if (DisableParallelDSP) return false; - L = TheLoop; + if (skipFunction(F)) + return false; + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); - TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); auto &TPC = getAnalysis<TargetPassConfig>(); - BasicBlock *Header = TheLoop->getHeader(); - if (!Header) - return false; - - // TODO: We assume the loop header and latch to be the same block. - // This is not a fundamental restriction, but lifting this would just - // require more work to do the transformation and then patch up the CFG. - if (Header != TheLoop->getLoopLatch()) { - LLVM_DEBUG(dbgs() << "The loop header is not the loop latch: not " - "running pass ARMParallelDSP\n"); - return false; - } - - if (!TheLoop->getLoopPreheader()) - InsertPreheaderForLoop(L, DT, LI, nullptr, true); - - Function &F = *Header->getParent(); M = F.getParent(); DL = &M->getDataLayout(); @@ -282,17 +289,10 @@ namespace { return false; } - LoopAccessInfo LAI(L, SE, TLI, AA, DT, LI); - LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n"); LLVM_DEBUG(dbgs() << " - " << F.getName() << "\n\n"); - if (!RecordMemoryOps(Header)) { - LLVM_DEBUG(dbgs() << " - No sequential loads found.\n"); - return false; - } - - bool Changes = MatchSMLAD(L); + bool Changes = MatchSMLAD(F); return Changes; } }; @@ -331,40 +331,14 @@ bool ARMParallelDSP::AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1, // TODO: we currently only collect i16, and will support i8 later, so that's // why we check that types are equal to MaxBitWidth, and not <= MaxBitWidth. template<unsigned MaxBitWidth> -bool ARMParallelDSP::IsNarrowSequence(Value *V, ValueList &VL) { - ConstantInt *CInt; - - if (match(V, m_ConstantInt(CInt))) { - // TODO: if a constant is used, it needs to fit within the bit width. - return false; - } - - auto *I = dyn_cast<Instruction>(V); - if (!I) - return false; - - Value *Val, *LHS, *RHS; - if (match(V, m_Trunc(m_Value(Val)))) { - if (cast<TruncInst>(I)->getDestTy()->getIntegerBitWidth() == MaxBitWidth) - return IsNarrowSequence<MaxBitWidth>(Val, VL); - } else if (match(V, m_Add(m_Value(LHS), m_Value(RHS)))) { - // TODO: we need to implement sadd16/sadd8 for this, which enables to - // also do the rewrite for smlad8.ll, but it is unsupported for now. 
- return false; - } else if (match(V, m_ZExtOrSExt(m_Value(Val)))) { - if (cast<CastInst>(I)->getSrcTy()->getIntegerBitWidth() != MaxBitWidth) +bool ARMParallelDSP::IsNarrowSequence(Value *V) { + if (auto *SExt = dyn_cast<SExtInst>(V)) { + if (SExt->getSrcTy()->getIntegerBitWidth() != MaxBitWidth) return false; - if (match(Val, m_Load(m_Value()))) { - auto *Ld = cast<LoadInst>(Val); - - // Check that these load could be paired. - if (!LoadPairs.count(Ld) && !OffsetLoads.count(Ld)) - return false; - - VL.push_back(Val); - VL.push_back(I); - return true; + if (auto *Ld = dyn_cast<LoadInst>(SExt->getOperand(0))) { + // Check that this load could be paired. + return LoadPairs.count(Ld) || OffsetLoads.count(Ld); } } return false; @@ -375,6 +349,9 @@ bool ARMParallelDSP::IsNarrowSequence(Value *V, ValueList &VL) { bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) { SmallVector<LoadInst*, 8> Loads; SmallVector<Instruction*, 8> Writes; + LoadPairs.clear(); + WideLoads.clear(); + OrderedBasicBlock OrderedBB(BB); // Collect loads and instruction that may write to memory. For now we only // record loads which are simple, sign-extended and have a single user. @@ -389,21 +366,24 @@ bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) { Loads.push_back(Ld); } + if (Loads.empty() || Loads.size() > NumLoadLimit) + return false; + using InstSet = std::set<Instruction*>; using DepMap = std::map<Instruction*, InstSet>; DepMap RAWDeps; // Record any writes that may alias a load. const auto Size = LocationSize::unknown(); - for (auto Read : Loads) { - for (auto Write : Writes) { + for (auto Write : Writes) { + for (auto Read : Loads) { MemoryLocation ReadLoc = MemoryLocation(Read->getPointerOperand(), Size); if (!isModOrRefSet(intersectModRef(AA->getModRefInfo(Write, ReadLoc), ModRefInfo::ModRef))) continue; - if (DT->dominates(Write, Read)) + if (OrderedBB.dominates(Write, Read)) RAWDeps[Read].insert(Write); } } @@ -411,17 +391,16 @@ bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) { // Check whether there's not a write between the two loads which would // prevent them from being safely merged. auto SafeToPair = [&](LoadInst *Base, LoadInst *Offset) { - LoadInst *Dominator = DT->dominates(Base, Offset) ? Base : Offset; - LoadInst *Dominated = DT->dominates(Base, Offset) ? Offset : Base; + LoadInst *Dominator = OrderedBB.dominates(Base, Offset) ? Base : Offset; + LoadInst *Dominated = OrderedBB.dominates(Base, Offset) ? Offset : Base; if (RAWDeps.count(Dominated)) { InstSet &WritesBefore = RAWDeps[Dominated]; for (auto Before : WritesBefore) { - // We can't move the second load backward, past a write, to merge // with the first load. - if (DT->dominates(Dominator, Before)) + if (OrderedBB.dominates(Dominator, Before)) return false; } } @@ -431,7 +410,7 @@ bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) { // Record base, offset load pairs. for (auto *Base : Loads) { for (auto *Offset : Loads) { - if (Base == Offset) + if (Base == Offset || OffsetLoads.count(Offset)) continue; if (AreSequentialAccesses<LoadInst>(Base, Offset, *DL, *SE) && @@ -453,7 +432,54 @@ bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) { return LoadPairs.size() > 1; } -// Loop Pass that needs to identify integer add/sub reductions of 16-bit vector +// Search recursively back through the operands to find a tree of values that +// form a multiply-accumulate chain. 
The search records the Add and Mul +// instructions that form the reduction and allows us to find a single value +// to be used as the initial input to the accumlator. +bool ARMParallelDSP::Search(Value *V, BasicBlock *BB, Reduction &R) { + // If we find a non-instruction, try to use it as the initial accumulator + // value. This may have already been found during the search in which case + // this function will return false, signaling a search fail. + auto *I = dyn_cast<Instruction>(V); + if (!I) + return R.InsertAcc(V); + + if (I->getParent() != BB) + return false; + + switch (I->getOpcode()) { + default: + break; + case Instruction::PHI: + // Could be the accumulator value. + return R.InsertAcc(V); + case Instruction::Add: { + // Adds should be adding together two muls, or another add and a mul to + // be within the mac chain. One of the operands may also be the + // accumulator value at which point we should stop searching. + R.InsertAdd(I); + Value *LHS = I->getOperand(0); + Value *RHS = I->getOperand(1); + bool ValidLHS = Search(LHS, BB, R); + bool ValidRHS = Search(RHS, BB, R); + + if (ValidLHS && ValidRHS) + return true; + + return R.InsertAcc(I); + } + case Instruction::Mul: { + Value *MulOp0 = I->getOperand(0); + Value *MulOp1 = I->getOperand(1); + return IsNarrowSequence<16>(MulOp0) && IsNarrowSequence<16>(MulOp1); + } + case Instruction::SExt: + return Search(I->getOperand(0), BB, R); + } + return false; +} + +// The pass needs to identify integer add/sub reductions of 16-bit vector // multiplications. // To use SMLAD: // 1) we first need to find integer add then look for this pattern: @@ -484,88 +510,39 @@ bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) { // If loop invariants are used instead of loads, these need to be packed // before the loop begins. // -bool ARMParallelDSP::MatchSMLAD(Loop *L) { - // Search recursively back through the operands to find a tree of values that - // form a multiply-accumulate chain. The search records the Add and Mul - // instructions that form the reduction and allows us to find a single value - // to be used as the initial input to the accumlator. - std::function<bool(Value*, Reduction&)> Search = [&] - (Value *V, Reduction &R) -> bool { - - // If we find a non-instruction, try to use it as the initial accumulator - // value. This may have already been found during the search in which case - // this function will return false, signaling a search fail. - auto *I = dyn_cast<Instruction>(V); - if (!I) - return R.InsertAcc(V); - - switch (I->getOpcode()) { - default: - break; - case Instruction::PHI: - // Could be the accumulator value. - return R.InsertAcc(V); - case Instruction::Add: { - // Adds should be adding together two muls, or another add and a mul to - // be within the mac chain. One of the operands may also be the - // accumulator value at which point we should stop searching. 
- bool ValidLHS = Search(I->getOperand(0), R); - bool ValidRHS = Search(I->getOperand(1), R); - if (!ValidLHS && !ValidLHS) - return false; - else if (ValidLHS && ValidRHS) { - R.InsertAdd(I); - return true; - } else { - R.InsertAdd(I); - return R.InsertAcc(I); - } - } - case Instruction::Mul: { - Value *MulOp0 = I->getOperand(0); - Value *MulOp1 = I->getOperand(1); - if (isa<SExtInst>(MulOp0) && isa<SExtInst>(MulOp1)) { - ValueList LHS; - ValueList RHS; - if (IsNarrowSequence<16>(MulOp0, LHS) && - IsNarrowSequence<16>(MulOp1, RHS)) { - R.InsertMul(I, LHS, RHS); - return true; - } - } - return false; - } - case Instruction::SExt: - return Search(I->getOperand(0), R); - } - return false; - }; - +bool ARMParallelDSP::MatchSMLAD(Function &F) { bool Changed = false; - SmallPtrSet<Instruction*, 4> AllAdds; - BasicBlock *Latch = L->getLoopLatch(); - for (Instruction &I : reverse(*Latch)) { - if (I.getOpcode() != Instruction::Add) + for (auto &BB : F) { + SmallPtrSet<Instruction*, 4> AllAdds; + if (!RecordMemoryOps(&BB)) continue; - if (AllAdds.count(&I)) - continue; + for (Instruction &I : reverse(BB)) { + if (I.getOpcode() != Instruction::Add) + continue; - const auto *Ty = I.getType(); - if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64)) - continue; + if (AllAdds.count(&I)) + continue; - Reduction R(&I); - if (!Search(&I, R)) - continue; + const auto *Ty = I.getType(); + if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64)) + continue; - if (!CreateParallelPairs(R)) - continue; + Reduction R(&I); + if (!Search(&I, &BB, R)) + continue; - InsertParallelMACs(R); - Changed = true; - AllAdds.insert(R.getAdds().begin(), R.getAdds().end()); + R.InsertMuls(); + LLVM_DEBUG(dbgs() << "After search, Reduction:\n"; R.dump()); + + if (!CreateParallelPairs(R)) + continue; + + InsertParallelMACs(R); + Changed = true; + AllAdds.insert(R.getAdds().begin(), R.getAdds().end()); + } } return Changed; @@ -578,87 +555,57 @@ bool ARMParallelDSP::CreateParallelPairs(Reduction &R) { return false; // Check that the muls operate directly upon sign extended loads. - for (auto &MulChain : R.getMuls()) { - // A mul has 2 operands, and a narrow op consist of sext and a load; thus - // we expect at least 4 items in this operand value list. - if (MulChain->size() < 4) { - LLVM_DEBUG(dbgs() << "Operand list too short.\n"); + for (auto &MulCand : R.getMuls()) { + if (!MulCand->HasTwoLoadInputs()) return false; - } - MulChain->PopulateLoads(); - ValueList &LHS = static_cast<BinOpChain*>(MulChain.get())->LHS; - ValueList &RHS = static_cast<BinOpChain*>(MulChain.get())->RHS; - - // Use +=2 to skip over the expected extend instructions. - for (unsigned i = 0, e = LHS.size(); i < e; i += 2) { - if (!isa<LoadInst>(LHS[i]) || !isa<LoadInst>(RHS[i])) - return false; - } } - auto CanPair = [&](Reduction &R, BinOpChain *PMul0, BinOpChain *PMul1) { - if (!PMul0->AreSymmetrical(PMul1)) - return false; - + auto CanPair = [&](Reduction &R, MulCandidate *PMul0, MulCandidate *PMul1) { // The first elements of each vector should be loads with sexts. If we // find that its two pairs of consecutive loads, then these can be // transformed into two wider loads and the users can be replaced with // DSP intrinsics. 
- for (unsigned x = 0; x < PMul0->LHS.size(); x += 2) { - auto *Ld0 = dyn_cast<LoadInst>(PMul0->LHS[x]); - auto *Ld1 = dyn_cast<LoadInst>(PMul1->LHS[x]); - auto *Ld2 = dyn_cast<LoadInst>(PMul0->RHS[x]); - auto *Ld3 = dyn_cast<LoadInst>(PMul1->RHS[x]); - - if (!Ld0 || !Ld1 || !Ld2 || !Ld3) - return false; + auto Ld0 = static_cast<LoadInst*>(PMul0->LHS); + auto Ld1 = static_cast<LoadInst*>(PMul1->LHS); + auto Ld2 = static_cast<LoadInst*>(PMul0->RHS); + auto Ld3 = static_cast<LoadInst*>(PMul1->RHS); - LLVM_DEBUG(dbgs() << "Loads:\n" - << " - " << *Ld0 << "\n" - << " - " << *Ld1 << "\n" - << " - " << *Ld2 << "\n" - << " - " << *Ld3 << "\n"); - - if (AreSequentialLoads(Ld0, Ld1, PMul0->VecLd)) { - if (AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) { - LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n"); - R.AddMulPair(PMul0, PMul1); - return true; - } else if (AreSequentialLoads(Ld3, Ld2, PMul1->VecLd)) { - LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n"); - LLVM_DEBUG(dbgs() << " exchanging Ld2 and Ld3\n"); - PMul1->Exchange = true; - R.AddMulPair(PMul0, PMul1); - return true; - } - } else if (AreSequentialLoads(Ld1, Ld0, PMul0->VecLd) && - AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) { + if (AreSequentialLoads(Ld0, Ld1, PMul0->VecLd)) { + if (AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) { LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n"); - LLVM_DEBUG(dbgs() << " exchanging Ld0 and Ld1\n"); - LLVM_DEBUG(dbgs() << " and swapping muls\n"); - PMul0->Exchange = true; - // Only the second operand can be exchanged, so swap the muls. - R.AddMulPair(PMul1, PMul0); + R.AddMulPair(PMul0, PMul1); + return true; + } else if (AreSequentialLoads(Ld3, Ld2, PMul1->VecLd)) { + LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n"); + LLVM_DEBUG(dbgs() << " exchanging Ld2 and Ld3\n"); + R.AddMulPair(PMul0, PMul1, true); return true; } + } else if (AreSequentialLoads(Ld1, Ld0, PMul0->VecLd) && + AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) { + LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n"); + LLVM_DEBUG(dbgs() << " exchanging Ld0 and Ld1\n"); + LLVM_DEBUG(dbgs() << " and swapping muls\n"); + // Only the second operand can be exchanged, so swap the muls. 
+ R.AddMulPair(PMul1, PMul0, true); + return true; } return false; }; - OpChainList &Muls = R.getMuls(); + MulCandList &Muls = R.getMuls(); const unsigned Elems = Muls.size(); - SmallPtrSet<const Instruction*, 4> Paired; for (unsigned i = 0; i < Elems; ++i) { - BinOpChain *PMul0 = static_cast<BinOpChain*>(Muls[i].get()); - if (Paired.count(PMul0->Root)) + MulCandidate *PMul0 = static_cast<MulCandidate*>(Muls[i].get()); + if (PMul0->Paired) continue; for (unsigned j = 0; j < Elems; ++j) { if (i == j) continue; - BinOpChain *PMul1 = static_cast<BinOpChain*>(Muls[j].get()); - if (Paired.count(PMul1->Root)) + MulCandidate *PMul1 = static_cast<MulCandidate*>(Muls[j].get()); + if (PMul1->Paired) continue; const Instruction *Mul0 = PMul0->Root; @@ -668,29 +615,19 @@ bool ARMParallelDSP::CreateParallelPairs(Reduction &R) { assert(PMul0 != PMul1 && "expected different chains"); - if (CanPair(R, PMul0, PMul1)) { - Paired.insert(Mul0); - Paired.insert(Mul1); + if (CanPair(R, PMul0, PMul1)) break; - } } } return !R.getMulPairs().empty(); } - void ARMParallelDSP::InsertParallelMACs(Reduction &R) { - auto CreateSMLADCall = [&](SmallVectorImpl<LoadInst*> &VecLd0, - SmallVectorImpl<LoadInst*> &VecLd1, - Value *Acc, bool Exchange, - Instruction *InsertAfter) { + auto CreateSMLAD = [&](LoadInst* WideLd0, LoadInst *WideLd1, + Value *Acc, bool Exchange, + Instruction *InsertAfter) { // Replace the reduction chain with an intrinsic call - IntegerType *Ty = IntegerType::get(M->getContext(), 32); - LoadInst *WideLd0 = WideLoads.count(VecLd0[0]) ? - WideLoads[VecLd0[0]]->getLoad() : CreateWideLoad(VecLd0, Ty); - LoadInst *WideLd1 = WideLoads.count(VecLd1[0]) ? - WideLoads[VecLd1[0]]->getLoad() : CreateWideLoad(VecLd1, Ty); Value* Args[] = { WideLd0, WideLd1, Acc }; Function *SMLAD = nullptr; @@ -704,34 +641,95 @@ void ARMParallelDSP::InsertParallelMACs(Reduction &R) { Intrinsic::getDeclaration(M, Intrinsic::arm_smlald); IRBuilder<NoFolder> Builder(InsertAfter->getParent(), - ++BasicBlock::iterator(InsertAfter)); + BasicBlock::iterator(InsertAfter)); Instruction *Call = Builder.CreateCall(SMLAD, Args); NumSMLAD++; return Call; }; - Instruction *InsertAfter = R.getRoot(); + // Return the instruction after the dominated instruction. + auto GetInsertPoint = [this](Value *A, Value *B) { + assert((isa<Instruction>(A) || isa<Instruction>(B)) && + "expected at least one instruction"); + + Value *V = nullptr; + if (!isa<Instruction>(A)) + V = B; + else if (!isa<Instruction>(B)) + V = A; + else + V = DT->dominates(cast<Instruction>(A), cast<Instruction>(B)) ? B : A; + + return &*++BasicBlock::iterator(cast<Instruction>(V)); + }; + Value *Acc = R.getAccumulator(); - if (!Acc) - Acc = ConstantInt::get(IntegerType::get(M->getContext(), 32), 0); - LLVM_DEBUG(dbgs() << "Root: " << *InsertAfter << "\n" - << "Acc: " << *Acc << "\n"); + // For any muls that were discovered but not paired, accumulate their values + // as before. 
+ IRBuilder<NoFolder> Builder(R.getRoot()->getParent()); + MulCandList &MulCands = R.getMuls(); + for (auto &MulCand : MulCands) { + if (MulCand->Paired) + continue; + + Instruction *Mul = cast<Instruction>(MulCand->Root); + LLVM_DEBUG(dbgs() << "Accumulating unpaired mul: " << *Mul << "\n"); + + if (R.getType() != Mul->getType()) { + assert(R.is64Bit() && "expected 64-bit result"); + Builder.SetInsertPoint(&*++BasicBlock::iterator(Mul)); + Mul = cast<Instruction>(Builder.CreateSExt(Mul, R.getRoot()->getType())); + } + + if (!Acc) { + Acc = Mul; + continue; + } + + // If Acc is the original incoming value to the reduction, it could be a + // phi. But the phi will dominate Mul, meaning that Mul will be the + // insertion point. + Builder.SetInsertPoint(GetInsertPoint(Mul, Acc)); + Acc = Builder.CreateAdd(Mul, Acc); + } + + if (!Acc) { + Acc = R.is64Bit() ? + ConstantInt::get(IntegerType::get(M->getContext(), 64), 0) : + ConstantInt::get(IntegerType::get(M->getContext(), 32), 0); + } else if (Acc->getType() != R.getType()) { + Builder.SetInsertPoint(R.getRoot()); + Acc = Builder.CreateSExt(Acc, R.getType()); + } + + // Roughly sort the mul pairs in their program order. + OrderedBasicBlock OrderedBB(R.getRoot()->getParent()); + llvm::sort(R.getMulPairs(), [&OrderedBB](auto &PairA, auto &PairB) { + const Instruction *A = PairA.first->Root; + const Instruction *B = PairB.first->Root; + return OrderedBB.dominates(A, B); + }); + + IntegerType *Ty = IntegerType::get(M->getContext(), 32); for (auto &Pair : R.getMulPairs()) { - BinOpChain *PMul0 = Pair.first; - BinOpChain *PMul1 = Pair.second; - LLVM_DEBUG(dbgs() << "Muls:\n" - << "- " << *PMul0->Root << "\n" - << "- " << *PMul1->Root << "\n"); - - Acc = CreateSMLADCall(PMul0->VecLd, PMul1->VecLd, Acc, PMul1->Exchange, - InsertAfter); - InsertAfter = cast<Instruction>(Acc); + MulCandidate *LHSMul = Pair.first; + MulCandidate *RHSMul = Pair.second; + LoadInst *BaseLHS = LHSMul->getBaseLoad(); + LoadInst *BaseRHS = RHSMul->getBaseLoad(); + LoadInst *WideLHS = WideLoads.count(BaseLHS) ? + WideLoads[BaseLHS]->getLoad() : CreateWideLoad(LHSMul->VecLd, Ty); + LoadInst *WideRHS = WideLoads.count(BaseRHS) ? + WideLoads[BaseRHS]->getLoad() : CreateWideLoad(RHSMul->VecLd, Ty); + + Instruction *InsertAfter = GetInsertPoint(WideLHS, WideRHS); + InsertAfter = GetInsertPoint(InsertAfter, Acc); + Acc = CreateSMLAD(WideLHS, WideRHS, Acc, RHSMul->Exchange, InsertAfter); } R.UpdateRoot(cast<Instruction>(Acc)); } -LoadInst* ARMParallelDSP::CreateWideLoad(SmallVectorImpl<LoadInst*> &Loads, +LoadInst* ARMParallelDSP::CreateWideLoad(MemInstList &Loads, IntegerType *LoadTy) { assert(Loads.size() == 2 && "currently only support widening two loads"); @@ -758,8 +756,8 @@ LoadInst* ARMParallelDSP::CreateWideLoad(SmallVectorImpl<LoadInst*> &Loads, return; Source->moveBefore(Sink); - for (auto &U : Source->uses()) - MoveBefore(Source, U.getUser()); + for (auto &Op : Source->operands()) + MoveBefore(Op, Source); }; // Insert the load at the point of the original dominating load. @@ -784,57 +782,30 @@ LoadInst* ARMParallelDSP::CreateWideLoad(SmallVectorImpl<LoadInst*> &Loads, // Loads[0] needs trunc while Loads[1] needs a lshr and trunc. // TODO: Support big-endian as well. 
Value *Bottom = IRB.CreateTrunc(WideLoad, Base->getType()); - BaseSExt->setOperand(0, Bottom); + Value *NewBaseSExt = IRB.CreateSExt(Bottom, BaseSExt->getType()); + BaseSExt->replaceAllUsesWith(NewBaseSExt); IntegerType *OffsetTy = cast<IntegerType>(Offset->getType()); Value *ShiftVal = ConstantInt::get(LoadTy, OffsetTy->getBitWidth()); Value *Top = IRB.CreateLShr(WideLoad, ShiftVal); Value *Trunc = IRB.CreateTrunc(Top, OffsetTy); - OffsetSExt->setOperand(0, Trunc); - + Value *NewOffsetSExt = IRB.CreateSExt(Trunc, OffsetSExt->getType()); + OffsetSExt->replaceAllUsesWith(NewOffsetSExt); + + LLVM_DEBUG(dbgs() << "From Base and Offset:\n" + << *Base << "\n" << *Offset << "\n" + << "Created Wide Load:\n" + << *WideLoad << "\n" + << *Bottom << "\n" + << *NewBaseSExt << "\n" + << *Top << "\n" + << *Trunc << "\n" + << *NewOffsetSExt << "\n"); WideLoads.emplace(std::make_pair(Base, - make_unique<WidenedLoad>(Loads, WideLoad))); + std::make_unique<WidenedLoad>(Loads, WideLoad))); return WideLoad; } -// Compare the value lists in Other to this chain. -bool BinOpChain::AreSymmetrical(BinOpChain *Other) { - // Element-by-element comparison of Value lists returning true if they are - // instructions with the same opcode or constants with the same value. - auto CompareValueList = [](const ValueList &VL0, - const ValueList &VL1) { - if (VL0.size() != VL1.size()) { - LLVM_DEBUG(dbgs() << "Muls are mismatching operand list lengths: " - << VL0.size() << " != " << VL1.size() << "\n"); - return false; - } - - const unsigned Pairs = VL0.size(); - - for (unsigned i = 0; i < Pairs; ++i) { - const Value *V0 = VL0[i]; - const Value *V1 = VL1[i]; - const auto *Inst0 = dyn_cast<Instruction>(V0); - const auto *Inst1 = dyn_cast<Instruction>(V1); - - if (!Inst0 || !Inst1) - return false; - - if (Inst0->isSameOperationAs(Inst1)) - continue; - - const APInt *C0, *C1; - if (!(match(V0, m_APInt(C0)) && match(V1, m_APInt(C1)) && C0 == C1)) - return false; - } - - return true; - }; - - return CompareValueList(LHS, Other->LHS) && - CompareValueList(RHS, Other->RHS); -} - Pass *llvm::createARMParallelDSPPass() { return new ARMParallelDSP(); } @@ -842,6 +813,6 @@ Pass *llvm::createARMParallelDSPPass() { char ARMParallelDSP::ID = 0; INITIALIZE_PASS_BEGIN(ARMParallelDSP, "arm-parallel-dsp", - "Transform loops to use DSP intrinsics", false, false) + "Transform functions to use DSP intrinsics", false, false) INITIALIZE_PASS_END(ARMParallelDSP, "arm-parallel-dsp", - "Transform loops to use DSP intrinsics", false, false) + "Transform functions to use DSP intrinsics", false, false) diff --git a/lib/Target/ARM/ARMPredicates.td b/lib/Target/ARM/ARMPredicates.td index 0b6b40de80dd..b008d3e2e296 100644 --- a/lib/Target/ARM/ARMPredicates.td +++ b/lib/Target/ARM/ARMPredicates.td @@ -71,7 +71,7 @@ def HasV8_5a : Predicate<"Subtarget->hasV8_5aOps()">, AssemblerPredicate<"HasV8_5aOps", "armv8.5a">; def NoVFP : Predicate<"!Subtarget->hasVFP2Base()">; def HasVFP2 : Predicate<"Subtarget->hasVFP2Base()">, - AssemblerPredicate<"FeatureVFP2_D16_SP", "VFP2">; + AssemblerPredicate<"FeatureVFP2_SP", "VFP2">; def HasVFP3 : Predicate<"Subtarget->hasVFP3Base()">, AssemblerPredicate<"FeatureVFP3_D16_SP", "VFP3">; def HasVFP4 : Predicate<"Subtarget->hasVFP4Base()">, diff --git a/lib/Target/ARM/ARMRegisterInfo.td b/lib/Target/ARM/ARMRegisterInfo.td index 92ae26b3729d..56055a15483a 100644 --- a/lib/Target/ARM/ARMRegisterInfo.td +++ b/lib/Target/ARM/ARMRegisterInfo.td @@ -180,7 +180,7 @@ def Q15 : ARMReg<15, "q15", [D30, D31]>; // models the APSR when it's 
accessed by some special instructions. In such cases // it has the same encoding as PC. def CPSR : ARMReg<0, "cpsr">; -def APSR : ARMReg<1, "apsr">; +def APSR : ARMReg<15, "apsr">; def APSR_NZCV : ARMReg<15, "apsr_nzcv">; def SPSR : ARMReg<2, "spsr">; def FPSCR : ARMReg<3, "fpscr">; @@ -486,12 +486,20 @@ def DPair : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], // Pseudo-registers representing even-odd pairs of GPRs from R1 to R13/SP. // These are needed by instructions (e.g. ldrexd/strexd) requiring even-odd GPRs. -def Tuples2R : RegisterTuples<[gsub_0, gsub_1], - [(add R0, R2, R4, R6, R8, R10, R12), - (add R1, R3, R5, R7, R9, R11, SP)]>; +def Tuples2Rnosp : RegisterTuples<[gsub_0, gsub_1], + [(add R0, R2, R4, R6, R8, R10), + (add R1, R3, R5, R7, R9, R11)]>; + +def Tuples2Rsp : RegisterTuples<[gsub_0, gsub_1], + [(add R12), (add SP)]>; // Register class representing a pair of even-odd GPRs. -def GPRPair : RegisterClass<"ARM", [untyped], 64, (add Tuples2R)> { +def GPRPair : RegisterClass<"ARM", [untyped], 64, (add Tuples2Rnosp, Tuples2Rsp)> { + let Size = 64; // 2 x 32 bits, we have no predefined type of that size. +} + +// Register class representing a pair of even-odd GPRs, except (R12, SP). +def GPRPairnosp : RegisterClass<"ARM", [untyped], 64, (add Tuples2Rnosp)> { let Size = 64; // 2 x 32 bits, we have no predefined type of that size. } diff --git a/lib/Target/ARM/ARMScheduleA9.td b/lib/Target/ARM/ARMScheduleA9.td index 21d32bde4710..3f0b71afd977 100644 --- a/lib/Target/ARM/ARMScheduleA9.td +++ b/lib/Target/ARM/ARMScheduleA9.td @@ -2239,9 +2239,9 @@ def A9WriteLMfpPostRA : SchedWriteVariant<[ // Distinguish between our multiple MI-level forms of the same // VLDM/VSTM instructions. def A9PreRA : SchedPredicate< - "TargetRegisterInfo::isVirtualRegister(MI->getOperand(0).getReg())">; + "Register::isVirtualRegister(MI->getOperand(0).getReg())">; def A9PostRA : SchedPredicate< - "TargetRegisterInfo::isPhysicalRegister(MI->getOperand(0).getReg())">; + "Register::isPhysicalRegister(MI->getOperand(0).getReg())">; // VLDM represents all destination registers as a single register // tuple, unlike LDM. So the number of write operands is not variadic. 
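As an aside on the ARMParallelDSP rework above: the pattern the pass matches is the 16-bit multiply-accumulate reduction described in its own comments (sign-extended sequential i16 loads feeding multiplies that accumulate into a 32-bit value, pairs of narrow loads widened into one i32 load, the mul/add pairs replaced by SMLAD calls). The sketch below is an illustration only, not code from this commit; it assumes a little-endian target with the DSP extension, assumes the ACLE __smlad intrinsic from <arm_acle.h>, and the names dot16 and dot16_paired are made up for the example.

#include <arm_acle.h>   // assumed: provides __smlad (dual 16x16 multiply-accumulate)
#include <cstdint>
#include <cstring>

// The shape the pass looks for: sign-extended, sequential i16 loads feeding
// multiplies whose results are accumulated into a 32-bit value (n assumed even).
int32_t dot16(const int16_t *a, const int16_t *b, int n, int32_t acc) {
  for (int i = 0; i < n; i += 2) {
    acc += (int32_t)a[i]     * (int32_t)b[i];
    acc += (int32_t)a[i + 1] * (int32_t)b[i + 1];  // sequential with the pair above
  }
  return acc;
}

// Roughly what the transformation produces: each pair of narrow loads becomes
// one widened 32-bit load, and the two multiply-adds collapse into an
// SMLAD-style dual multiply-accumulate.
int32_t dot16_paired(const int16_t *a, const int16_t *b, int n, int32_t acc) {
  for (int i = 0; i + 1 < n; i += 2) {
    int32_t wa, wb;
    memcpy(&wa, &a[i], sizeof wa);  // widened load of a[i], a[i+1]
    memcpy(&wb, &b[i], sizeof wb);  // widened load of b[i], b[i+1]
    acc = __smlad(wa, wb, acc);     // a[i]*b[i] + a[i+1]*b[i+1] + acc
  }
  return acc;
}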
diff --git a/lib/Target/ARM/ARMScheduleM4.td b/lib/Target/ARM/ARMScheduleM4.td index 38c8ea2b4f35..bfa5fc0d7131 100644 --- a/lib/Target/ARM/ARMScheduleM4.td +++ b/lib/Target/ARM/ARMScheduleM4.td @@ -18,6 +18,9 @@ def CortexM4Model : SchedMachineModel { let PostRAScheduler = 1; let CompleteModel = 0; + let UnsupportedFeatures = [IsARM, HasNEON, HasDotProd, HasZCZ, HasMVEInt, + IsNotMClass, HasDPVFP, HasFPARMv8, HasFullFP16, Has8MSecExt, HasV8, + HasV8_3a, HasTrustZone, HasDFB, IsWindows]; } @@ -50,6 +53,7 @@ def : M4UnitL2<WriteMAC16>; def : M4UnitL2<WriteDIV>; def : M4UnitL2I<(instregex "(t|t2)LDM")>; +def : M4UnitL2I<(instregex "(t|t2)LDR")>; // Stores we use a latency of 1 as they have no outputs @@ -78,9 +82,20 @@ def : M4UnitL1<WriteNoop>; def : M4UnitL1<WritePreLd>; def : M4UnitL1I<(instregex "(t|t2)MOV")>; def : M4UnitL1I<(instrs COPY)>; -def : M4UnitL1I<(instregex "t2IT")>; -def : M4UnitL1I<(instregex "t2SEL", "t2USAD8", - "t2(S|Q|SH|U|UQ|UH)(ADD16|ASX|SAX|SUB16|ADD8|SUB8)", "t2USADA8", "(t|t2)REV")>; +def : M4UnitL1I<(instregex "t2IT", "t2MSR", "t2MRS")>; +def : M4UnitL1I<(instregex "t2CLREX")>; +def : M4UnitL1I<(instregex "t2SEL", "t2USAD8", "t2SML[AS]", + "t2(S|Q|SH|U|UQ|UH|QD)(ADD|ASX|SAX|SUB)", "t2USADA8", "(t|t2)REV")>; + +// These instructions are not of much interest to scheduling as they will not +// be generated or it is not very useful to schedule them. They are here to make +// the model more complete. +def : M4UnitL1I<(instregex "t2CDP", "t2LDC", "t2MCR", "t2MRC", "t2MRRC", "t2STC")>; +def : M4UnitL1I<(instregex "tCPS", "t2ISB", "t2DSB", "t2DMB", "t2?HINT$")>; +def : M4UnitL1I<(instregex "t2?UDF$", "tBKPT", "t2DBG")>; +def : M4UnitL1I<(instregex "t?2?Int_eh_sjlj_", "tADDframe", "t?ADJCALL")>; +def : M4UnitL1I<(instregex "CMP_SWAP", "JUMPTABLE", "MEMCPY")>; +def : M4UnitL1I<(instregex "VSETLNi32", "VGETLNi32")>; def : ReadAdvance<ReadALU, 0>; def : ReadAdvance<ReadALUsr, 0>; @@ -112,6 +127,9 @@ def : M4UnitL1<WriteVST1>; def : M4UnitL1<WriteVST2>; def : M4UnitL1<WriteVST3>; def : M4UnitL1<WriteVST4>; +def : M4UnitL1I<(instregex "VMOVS", "FCONSTS", "VCMP", "VNEG", "VABS")>; +def : M4UnitL2I<(instregex "VMOVD")>; +def : M4UnitL1I<(instregex "VMRS", "VMSR", "FMSTAT")>; def : ReadAdvance<ReadFPMUL, 0>; def : ReadAdvance<ReadFPMAC, 0>; diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp index 978faed776b0..09603057b2c8 100644 --- a/lib/Target/ARM/ARMSubtarget.cpp +++ b/lib/Target/ARM/ARMSubtarget.cpp @@ -125,7 +125,7 @@ const CallLowering *ARMSubtarget::getCallLowering() const { return CallLoweringInfo.get(); } -const InstructionSelector *ARMSubtarget::getInstructionSelector() const { +InstructionSelector *ARMSubtarget::getInstructionSelector() const { return InstSelector.get(); } @@ -205,9 +205,9 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { NoARM = true; if (isAAPCS_ABI()) - stackAlignment = 8; + stackAlignment = Align(8); if (isTargetNaCl() || isAAPCS16_ABI()) - stackAlignment = 16; + stackAlignment = Align(16); // FIXME: Completely disable sibcall for Thumb1 since ThumbRegisterInfo:: // emitEpilogue is not ready for them. Thumb tail calls also use t2B, as @@ -253,6 +253,10 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { if (isRWPI()) ReserveR9 = true; + // If MVEVectorCostFactor is still 0 (has not been set to anything else), default it to 2 + if (MVEVectorCostFactor == 0) + MVEVectorCostFactor = 2; + // FIXME: Teach TableGen to deal with these instead of doing it manually here. 
switch (ARMProcFamily) { case Others: @@ -296,13 +300,15 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { LdStMultipleTiming = SingleIssuePlusExtras; MaxInterleaveFactor = 4; if (!isThumb()) - PrefLoopAlignment = 3; + PrefLoopLogAlignment = 3; break; case Kryo: break; case Krait: PreISelOperandLatencyAdjustment = 1; break; + case NeoverseN1: + break; case Swift: MaxInterleaveFactor = 2; LdStMultipleTiming = SingleIssuePlusExtras; diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h index c2b0f052b843..ef460342a69e 100644 --- a/lib/Target/ARM/ARMSubtarget.h +++ b/lib/Target/ARM/ARMSubtarget.h @@ -71,6 +71,7 @@ protected: Exynos, Krait, Kryo, + NeoverseN1, Swift }; enum ARMProcClassEnum { @@ -179,11 +180,9 @@ protected: bool HasVFPv3SP = false; bool HasVFPv4SP = false; bool HasFPARMv8SP = false; - bool HasVFPv2D16 = false; bool HasVFPv3D16 = false; bool HasVFPv4D16 = false; bool HasFPARMv8D16 = false; - bool HasVFPv2D16SP = false; bool HasVFPv3D16SP = false; bool HasVFPv4D16SP = false; bool HasFPARMv8D16SP = false; @@ -450,7 +449,7 @@ protected: /// stackAlignment - The minimum alignment known to hold of the stack frame on /// entry to the function and which must be maintained by every function. - unsigned stackAlignment = 4; + Align stackAlignment = Align(4); /// CPUString - String name of used CPU. std::string CPUString; @@ -469,7 +468,12 @@ protected: int PreISelOperandLatencyAdjustment = 2; /// What alignment is preferred for loop bodies, in log2(bytes). - unsigned PrefLoopAlignment = 0; + unsigned PrefLoopLogAlignment = 0; + + /// The cost factor for MVE instructions, representing the multiple beats an + // instruction can take. The default is 2, (set in initSubtargetFeatures so + // that we can use subtarget features less than 2). + unsigned MVEVectorCostFactor = 0; /// OptMinSize - True if we're optimising for minimum code size, equal to /// the function attribute. @@ -535,7 +539,7 @@ public: } const CallLowering *getCallLowering() const override; - const InstructionSelector *getInstructionSelector() const override; + InstructionSelector *getInstructionSelector() const override; const LegalizerInfo *getLegalizerInfo() const override; const RegisterBankInfo *getRegBankInfo() const override; @@ -600,7 +604,7 @@ public: bool hasARMOps() const { return !NoARM; } - bool hasVFP2Base() const { return HasVFPv2D16SP; } + bool hasVFP2Base() const { return HasVFPv2SP; } bool hasVFP3Base() const { return HasVFPv3D16SP; } bool hasVFP4Base() const { return HasVFPv4D16SP; } bool hasFPARMv8Base() const { return HasFPARMv8D16SP; } @@ -668,6 +672,12 @@ public: bool hasSB() const { return HasSB; } bool genLongCalls() const { return GenLongCalls; } bool genExecuteOnly() const { return GenExecuteOnly; } + bool hasBaseDSP() const { + if (isThumb()) + return hasDSP(); + else + return hasV5TEOps(); + } bool hasFP16() const { return HasFP16; } bool hasD32() const { return HasD32; } @@ -812,7 +822,7 @@ public: /// getStackAlignment - Returns the minimum alignment known to hold of the /// stack frame on entry to the function and which must be maintained by every /// function for this subtarget. 
- unsigned getStackAlignment() const { return stackAlignment; } + Align getStackAlignment() const { return stackAlignment; } unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; } @@ -853,9 +863,9 @@ public: return isROPI() || !isTargetELF(); } - unsigned getPrefLoopAlignment() const { - return PrefLoopAlignment; - } + unsigned getPrefLoopLogAlignment() const { return PrefLoopLogAlignment; } + + unsigned getMVEVectorCostFactor() const { return MVEVectorCostFactor; } bool ignoreCSRForAllocationOrder(const MachineFunction &MF, unsigned PhysReg) const override; diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp index 7f0aae1739b3..5c8007f101d9 100644 --- a/lib/Target/ARM/ARMTargetMachine.cpp +++ b/lib/Target/ARM/ARMTargetMachine.cpp @@ -96,15 +96,16 @@ extern "C" void LLVMInitializeARMTarget() { initializeARMExpandPseudoPass(Registry); initializeThumb2SizeReducePass(Registry); initializeMVEVPTBlockPass(Registry); + initializeMVETailPredicationPass(Registry); initializeARMLowOverheadLoopsPass(Registry); } static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { if (TT.isOSBinFormatMachO()) - return llvm::make_unique<TargetLoweringObjectFileMachO>(); + return std::make_unique<TargetLoweringObjectFileMachO>(); if (TT.isOSWindows()) - return llvm::make_unique<TargetLoweringObjectFileCOFF>(); - return llvm::make_unique<ARMElfTargetObjectFile>(); + return std::make_unique<TargetLoweringObjectFileCOFF>(); + return std::make_unique<ARMElfTargetObjectFile>(); } static ARMBaseTargetMachine::ARMABI @@ -282,7 +283,7 @@ ARMBaseTargetMachine::getSubtargetImpl(const Function &F) const { // creation will depend on the TM and the code generation flags on the // function that reside in TargetOptions. 
resetTargetOptions(F); - I = llvm::make_unique<ARMSubtarget>(TargetTriple, CPU, FS, *this, isLittle, + I = std::make_unique<ARMSubtarget>(TargetTriple, CPU, FS, *this, isLittle, F.hasMinSize()); if (!I->isThumb() && !I->hasARMOps()) @@ -447,8 +448,10 @@ bool ARMPassConfig::addPreISel() { MergeExternalByDefault)); } - if (TM->getOptLevel() != CodeGenOpt::None) + if (TM->getOptLevel() != CodeGenOpt::None) { addPass(createHardwareLoopsPass()); + addPass(createMVETailPredicationPass()); + } return false; } diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp index 2a8ec734a05f..86c8684d14dc 100644 --- a/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -36,8 +36,12 @@ using namespace llvm; #define DEBUG_TYPE "armtti" +static cl::opt<bool> EnableMaskedLoadStores( + "enable-arm-maskedldst", cl::Hidden, cl::init(false), + cl::desc("Enable the generation of masked loads and stores")); + static cl::opt<bool> DisableLowOverheadLoops( - "disable-arm-loloops", cl::Hidden, cl::init(true), + "disable-arm-loloops", cl::Hidden, cl::init(false), cl::desc("Disable the generation of low-overhead loops")); bool ARMTTIImpl::areInlineCompatible(const Function *Caller, @@ -167,6 +171,42 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, if (!SrcTy.isSimple() || !DstTy.isSimple()) return BaseT::getCastInstrCost(Opcode, Dst, Src); + // The extend of a load is free + if (I && isa<LoadInst>(I->getOperand(0))) { + static const TypeConversionCostTblEntry LoadConversionTbl[] = { + {ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0}, + {ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0}, + {ISD::SIGN_EXTEND, MVT::i32, MVT::i8, 0}, + {ISD::ZERO_EXTEND, MVT::i32, MVT::i8, 0}, + {ISD::SIGN_EXTEND, MVT::i16, MVT::i8, 0}, + {ISD::ZERO_EXTEND, MVT::i16, MVT::i8, 0}, + {ISD::SIGN_EXTEND, MVT::i64, MVT::i32, 1}, + {ISD::ZERO_EXTEND, MVT::i64, MVT::i32, 1}, + {ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 1}, + {ISD::ZERO_EXTEND, MVT::i64, MVT::i16, 1}, + {ISD::SIGN_EXTEND, MVT::i64, MVT::i8, 1}, + {ISD::ZERO_EXTEND, MVT::i64, MVT::i8, 1}, + }; + if (const auto *Entry = ConvertCostTableLookup( + LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT())) + return Entry->Cost; + + static const TypeConversionCostTblEntry MVELoadConversionTbl[] = { + {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0}, + {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0}, + {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 0}, + {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 0}, + {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 0}, + {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 0}, + }; + if (SrcTy.isVector() && ST->hasMVEIntegerOps()) { + if (const auto *Entry = + ConvertCostTableLookup(MVELoadConversionTbl, ISD, + DstTy.getSimpleVT(), SrcTy.getSimpleVT())) + return Entry->Cost; + } + } + // Some arithmetic, load and store operations have specific instructions // to cast up/down their types automatically at no extra cost. // TODO: Get these tables to know at least what the related operations are. @@ -313,6 +353,31 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, return Entry->Cost; } + // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one + // instruction, i8->i32 is two. i64 zexts are an VAND with a constant, sext + // are linearised so take more. 
+ static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = { + { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, + { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 }, + { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 10 }, + { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 2 }, + { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, + { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, + { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 10 }, + { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 }, + { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 8 }, + { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 2 }, + }; + + if (SrcTy.isVector() && ST->hasMVEIntegerOps()) { + if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl, + ISD, DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost * ST->getMVEVectorCostFactor(); + } + // Scalar integer conversion costs. static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = { // i16 -> i64 requires two dependent operations. @@ -332,7 +397,10 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, return Entry->Cost; } - return BaseT::getCastInstrCost(Opcode, Dst, Src); + int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy() + ? ST->getMVEVectorCostFactor() + : 1; + return BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src); } int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, @@ -343,8 +411,8 @@ int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32) return 3; - if ((Opcode == Instruction::InsertElement || - Opcode == Instruction::ExtractElement)) { + if (ST->hasNEON() && (Opcode == Instruction::InsertElement || + Opcode == Instruction::ExtractElement)) { // Cross-class copies are expensive on many microarchitectures, // so assume they are expensive by default. if (ValTy->getVectorElementType()->isIntegerTy()) @@ -357,6 +425,17 @@ int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, return std::max(BaseT::getVectorInstrCost(Opcode, ValTy, Index), 2U); } + if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement || + Opcode == Instruction::ExtractElement)) { + // We say MVE moves costs at least the MVEVectorCostFactor, even though + // they are scalar instructions. This helps prevent mixing scalar and + // vector, to prevent vectorising where we end up just scalarising the + // result anyway. + return std::max(BaseT::getVectorInstrCost(Opcode, ValTy, Index), + ST->getMVEVectorCostFactor()) * + ValTy->getVectorNumElements() / 2; + } + return BaseT::getVectorInstrCost(Opcode, ValTy, Index); } @@ -385,7 +464,10 @@ int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, return LT.first; } - return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I); + int BaseCost = ST->hasMVEIntegerOps() && ValTy->isVectorTy() + ? 
ST->getMVEVectorCostFactor() + : 1; + return BaseCost * BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I); } int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE, @@ -397,13 +479,37 @@ int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE, unsigned NumVectorInstToHideOverhead = 10; int MaxMergeDistance = 64; - if (Ty->isVectorTy() && SE && - !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1)) - return NumVectorInstToHideOverhead; + if (ST->hasNEON()) { + if (Ty->isVectorTy() && SE && + !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1)) + return NumVectorInstToHideOverhead; - // In many cases the address computation is not merged into the instruction - // addressing mode. - return 1; + // In many cases the address computation is not merged into the instruction + // addressing mode. + return 1; + } + return BaseT::getAddressComputationCost(Ty, SE, Ptr); +} + +bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, MaybeAlign Alignment) { + if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps()) + return false; + + if (auto *VecTy = dyn_cast<VectorType>(DataTy)) { + // Don't support v2i1 yet. + if (VecTy->getNumElements() == 2) + return false; + + // We don't support extending fp types. + unsigned VecWidth = DataTy->getPrimitiveSizeInBits(); + if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy()) + return false; + } + + unsigned EltWidth = DataTy->getScalarSizeInBits(); + return (EltWidth == 32 && (!Alignment || Alignment >= 4)) || + (EltWidth == 16 && (!Alignment || Alignment >= 2)) || + (EltWidth == 8); } int ARMTTIImpl::getMemcpyCost(const Instruction *I) { @@ -442,78 +548,96 @@ int ARMTTIImpl::getMemcpyCost(const Instruction *I) { int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp) { - if (Kind == TTI::SK_Broadcast) { - static const CostTblEntry NEONDupTbl[] = { - // VDUP handles these cases. - {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1}, - {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1}, - {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, - {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, - {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1}, - {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1}, - - {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1}, - {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1}, - {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1}, - {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}}; - - std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); - - if (const auto *Entry = CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, - LT.second)) - return LT.first * Entry->Cost; - - return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); - } - if (Kind == TTI::SK_Reverse) { - static const CostTblEntry NEONShuffleTbl[] = { - // Reverse shuffle cost one instruction if we are shuffling within a - // double word (vrev) or two if we shuffle a quad word (vrev, vext). 
- {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1}, - {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1}, - {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, - {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, - {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1}, - {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1}, - - {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2}, - {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2}, - {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2}, - {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}}; - - std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); - - if (const auto *Entry = CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, - LT.second)) - return LT.first * Entry->Cost; - - return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); - } - if (Kind == TTI::SK_Select) { - static const CostTblEntry NEONSelShuffleTbl[] = { - // Select shuffle cost table for ARM. Cost is the number of instructions - // required to create the shuffled vector. + if (ST->hasNEON()) { + if (Kind == TTI::SK_Broadcast) { + static const CostTblEntry NEONDupTbl[] = { + // VDUP handles these cases. + {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1}, + + {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}}; + + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); + + if (const auto *Entry = + CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second)) + return LT.first * Entry->Cost; + } + if (Kind == TTI::SK_Reverse) { + static const CostTblEntry NEONShuffleTbl[] = { + // Reverse shuffle cost one instruction if we are shuffling within a + // double word (vrev) or two if we shuffle a quad word (vrev, vext). + {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1}, + + {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2}, + {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2}, + {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2}, + {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}}; + + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); + + if (const auto *Entry = + CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second)) + return LT.first * Entry->Cost; + } + if (Kind == TTI::SK_Select) { + static const CostTblEntry NEONSelShuffleTbl[] = { + // Select shuffle cost table for ARM. Cost is the number of + // instructions + // required to create the shuffled vector. 
- {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1}, - {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, - {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, - {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1}, - {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2}, - {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2}, - {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2}, + {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2}, + {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2}, + {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2}, - {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16}, + {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16}, - {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}}; + {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}}; - std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); - if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl, - ISD::VECTOR_SHUFFLE, LT.second)) - return LT.first * Entry->Cost; - return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); + if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl, + ISD::VECTOR_SHUFFLE, LT.second)) + return LT.first * Entry->Cost; + } + } + if (ST->hasMVEIntegerOps()) { + if (Kind == TTI::SK_Broadcast) { + static const CostTblEntry MVEDupTbl[] = { + // VDUP handles these cases. + {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v8f16, 1}}; + + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); + + if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE, + LT.second)) + return LT.first * Entry->Cost * ST->getMVEVectorCostFactor(); + } } - return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); + int BaseCost = ST->hasMVEIntegerOps() && Tp->isVectorTy() + ? ST->getMVEVectorCostFactor() + : 1; + return BaseCost * BaseT::getShuffleCost(Kind, Tp, Index, SubTp); } int ARMTTIImpl::getArithmeticInstrCost( @@ -567,38 +691,64 @@ int ARMTTIImpl::getArithmeticInstrCost( // Multiplication. }; - if (ST->hasNEON()) + if (ST->hasNEON()) { if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second)) return LT.first * Entry->Cost; - int Cost = BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info, - Opd1PropInfo, Opd2PropInfo); - - // This is somewhat of a hack. The problem that we are facing is that SROA - // creates a sequence of shift, and, or instructions to construct values. - // These sequences are recognized by the ISel and have zero-cost. Not so for - // the vectorized code. Because we have support for v2i64 but not i64 those - // sequences look particularly beneficial to vectorize. - // To work around this we increase the cost of v2i64 operations to make them - // seem less beneficial. - if (LT.second == MVT::v2i64 && - Op2Info == TargetTransformInfo::OK_UniformConstantValue) - Cost += 4; - - return Cost; + int Cost = BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info, + Opd1PropInfo, Opd2PropInfo); + + // This is somewhat of a hack. The problem that we are facing is that SROA + // creates a sequence of shift, and, or instructions to construct values. + // These sequences are recognized by the ISel and have zero-cost. Not so for + // the vectorized code. Because we have support for v2i64 but not i64 those + // sequences look particularly beneficial to vectorize. + // To work around this we increase the cost of v2i64 operations to make them + // seem less beneficial. 
+ if (LT.second == MVT::v2i64 && + Op2Info == TargetTransformInfo::OK_UniformConstantValue) + Cost += 4; + + return Cost; + } + + int BaseCost = ST->hasMVEIntegerOps() && Ty->isVectorTy() + ? ST->getMVEVectorCostFactor() + : 1; + + // The rest of this mostly follows what is done in BaseT::getArithmeticInstrCost, + // without treating floats as more expensive that scalars or increasing the + // costs for custom operations. The results is also multiplied by the + // MVEVectorCostFactor where appropriate. + if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second)) + return LT.first * BaseCost; + + // Else this is expand, assume that we need to scalarize this op. + if (Ty->isVectorTy()) { + unsigned Num = Ty->getVectorNumElements(); + unsigned Cost = getArithmeticInstrCost(Opcode, Ty->getScalarType()); + // Return the cost of multiple scalar invocation plus the cost of + // inserting and extracting the values. + return BaseT::getScalarizationOverhead(Ty, Args) + Num * Cost; + } + + return BaseCost; } int ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace, const Instruction *I) { std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); - if (Src->isVectorTy() && Alignment != 16 && + if (ST->hasNEON() && Src->isVectorTy() && Alignment != 16 && Src->getVectorElementType()->isDoubleTy()) { // Unaligned loads/stores are extremely inefficient. // We need 4 uops for vst.1/vld.1 vs 1uop for vldr/vstr. return LT.first * 4; } - return LT.first; + int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy() + ? ST->getMVEVectorCostFactor() + : 1; + return BaseCost * LT.first; } int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, @@ -893,6 +1043,11 @@ void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, } return; } + // Don't unroll vectorised loop. MVE does not benefit from it as much as + // scalar code. + if (I.getType()->isVectorTy()) + return; + SmallVector<const Value*, 4> Operands(I.value_op_begin(), I.value_op_end()); Cost += getUserCost(&I, Operands); @@ -914,3 +1069,28 @@ void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, if (Cost < 12) UP.Force = true; } + +bool ARMTTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty, + TTI::ReductionFlags Flags) const { + assert(isa<VectorType>(Ty) && "Expected Ty to be a vector type"); + unsigned ScalarBits = Ty->getScalarSizeInBits(); + if (!ST->hasMVEIntegerOps()) + return false; + + switch (Opcode) { + case Instruction::FAdd: + case Instruction::FMul: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + case Instruction::Mul: + case Instruction::FCmp: + return false; + case Instruction::ICmp: + case Instruction::Add: + return ScalarBits < 64 && ScalarBits * Ty->getVectorNumElements() == 128; + default: + llvm_unreachable("Unhandled reduction opcode"); + } + return false; +} diff --git a/lib/Target/ARM/ARMTargetTransformInfo.h b/lib/Target/ARM/ARMTargetTransformInfo.h index 52f6ea4a6e2f..a878fdcfe3c7 100644 --- a/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/lib/Target/ARM/ARMTargetTransformInfo.h @@ -101,9 +101,9 @@ public: /// Floating-point computation using ARMv8 AArch32 Advanced /// SIMD instructions remains unchanged from ARMv7. Only AArch64 SIMD - /// is IEEE-754 compliant, but it's not covered in this target. + /// and Arm MVE are IEEE-754 compliant. 
bool isFPVectorizationPotentiallyUnsafe() { - return !ST->isTargetDarwin(); + return !ST->isTargetDarwin() && !ST->hasMVEFloatOps(); } /// \name Scalar TTI Implementations @@ -122,10 +122,13 @@ public: /// \name Vector TTI Implementations /// @{ - unsigned getNumberOfRegisters(bool Vector) { + unsigned getNumberOfRegisters(unsigned ClassID) const { + bool Vector = (ClassID == 1); if (Vector) { if (ST->hasNEON()) return 16; + if (ST->hasMVEIntegerOps()) + return 8; return 0; } @@ -138,6 +141,8 @@ public: if (Vector) { if (ST->hasNEON()) return 128; + if (ST->hasMVEIntegerOps()) + return 128; return 0; } @@ -148,10 +153,23 @@ public: return ST->getMaxInterleaveFactor(); } + bool isLegalMaskedLoad(Type *DataTy, MaybeAlign Alignment); + + bool isLegalMaskedStore(Type *DataTy, MaybeAlign Alignment) { + return isLegalMaskedLoad(DataTy, Alignment); + } + int getMemcpyCost(const Instruction *I); int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp); + bool useReductionIntrinsic(unsigned Opcode, Type *Ty, + TTI::ReductionFlags Flags) const; + + bool shouldExpandReduction(const IntrinsicInst *II) const { + return false; + } + int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, const Instruction *I = nullptr); diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index 1da9452f1d22..d2c355c1da75 100644 --- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -2275,6 +2275,14 @@ public: return Value >= 1 && Value <= 32; } + bool isMveSaturateOp() const { + if (!isImm()) return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + uint64_t Value = CE->getValue(); + return Value == 48 || Value == 64; + } + bool isITCondCodeNoAL() const { if (!isITCondCode()) return false; ARMCC::CondCodes CC = getCondCode(); @@ -2479,28 +2487,28 @@ public: void addModImmNotOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); uint32_t Enc = ARM_AM::getSOImmVal(~CE->getValue()); Inst.addOperand(MCOperand::createImm(Enc)); } void addModImmNegOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); uint32_t Enc = ARM_AM::getSOImmVal(-CE->getValue()); Inst.addOperand(MCOperand::createImm(Enc)); } void addThumbModImmNeg8_255Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); uint32_t Val = -CE->getValue(); Inst.addOperand(MCOperand::createImm(Val)); } void addThumbModImmNeg1_7Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); uint32_t Val = -CE->getValue(); Inst.addOperand(MCOperand::createImm(Val)); } @@ -2523,19 +2531,19 @@ public: void addFBits16Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); Inst.addOperand(MCOperand::createImm(16 - CE->getValue())); 
} void addFBits32Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); Inst.addOperand(MCOperand::createImm(32 - CE->getValue())); } void addFPImmOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); int Val = ARM_AM::getFP32Imm(APInt(32, CE->getValue())); Inst.addOperand(MCOperand::createImm(Val)); } @@ -2544,7 +2552,7 @@ public: assert(N == 1 && "Invalid number of operands!"); // FIXME: We really want to scale the value here, but the LDRD/STRD // instruction don't encode operands that way yet. - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); Inst.addOperand(MCOperand::createImm(CE->getValue())); } @@ -2552,35 +2560,31 @@ public: assert(N == 1 && "Invalid number of operands!"); // FIXME: We really want to scale the value here, but the VSTR/VLDR_VSYSR // instruction don't encode operands that way yet. - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); Inst.addOperand(MCOperand::createImm(CE->getValue())); } void addImm7Shift0Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); - assert(CE != nullptr && "Invalid operand type!"); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); Inst.addOperand(MCOperand::createImm(CE->getValue())); } void addImm7Shift1Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); - assert(CE != nullptr && "Invalid operand type!"); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); Inst.addOperand(MCOperand::createImm(CE->getValue())); } void addImm7Shift2Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); - assert(CE != nullptr && "Invalid operand type!"); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); Inst.addOperand(MCOperand::createImm(CE->getValue())); } void addImm7Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); - assert(CE != nullptr && "Invalid operand type!"); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); Inst.addOperand(MCOperand::createImm(CE->getValue())); } @@ -2588,7 +2592,7 @@ public: assert(N == 1 && "Invalid number of operands!"); // The immediate is scaled by four in the encoding and is stored // in the MCInst as such. Lop off the low two bits here. - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); Inst.addOperand(MCOperand::createImm(CE->getValue() / 4)); } @@ -2596,7 +2600,7 @@ public: assert(N == 1 && "Invalid number of operands!"); // The immediate is scaled by four in the encoding and is stored // in the MCInst as such. Lop off the low two bits here. 
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); Inst.addOperand(MCOperand::createImm(-(CE->getValue() / 4))); } @@ -2604,7 +2608,7 @@ public: assert(N == 1 && "Invalid number of operands!"); // The immediate is scaled by four in the encoding and is stored // in the MCInst as such. Lop off the low two bits here. - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); Inst.addOperand(MCOperand::createImm(CE->getValue() / 4)); } @@ -2612,7 +2616,7 @@ public: assert(N == 1 && "Invalid number of operands!"); // The constant encodes as the immediate-1, and we store in the instruction // the bits as encoded, so subtract off one here. - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); Inst.addOperand(MCOperand::createImm(CE->getValue() - 1)); } @@ -2620,7 +2624,7 @@ public: assert(N == 1 && "Invalid number of operands!"); // The constant encodes as the immediate-1, and we store in the instruction // the bits as encoded, so subtract off one here. - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); Inst.addOperand(MCOperand::createImm(CE->getValue() - 1)); } @@ -2628,7 +2632,7 @@ public: assert(N == 1 && "Invalid number of operands!"); // The constant encodes as the immediate, except for 32, which encodes as // zero. - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); unsigned Imm = CE->getValue(); Inst.addOperand(MCOperand::createImm((Imm == 32 ? 0 : Imm))); } @@ -2637,7 +2641,7 @@ public: assert(N == 1 && "Invalid number of operands!"); // An ASR value of 32 encodes as 0, so that's how we want to add it to // the instruction as well. - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); int Val = CE->getValue(); Inst.addOperand(MCOperand::createImm(Val == 32 ? 0 : Val)); } @@ -2646,7 +2650,7 @@ public: assert(N == 1 && "Invalid number of operands!"); // The operand is actually a t2_so_imm, but we have its bitwise // negation in the assembly source, so twiddle it here. - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); Inst.addOperand(MCOperand::createImm(~(uint32_t)CE->getValue())); } @@ -2654,7 +2658,7 @@ public: assert(N == 1 && "Invalid number of operands!"); // The operand is actually a t2_so_imm, but we have its // negation in the assembly source, so twiddle it here. - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); Inst.addOperand(MCOperand::createImm(-(uint32_t)CE->getValue())); } @@ -2662,7 +2666,7 @@ public: assert(N == 1 && "Invalid number of operands!"); // The operand is actually an imm0_4095, but we have its // negation in the assembly source, so twiddle it here. 
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); Inst.addOperand(MCOperand::createImm(-(uint32_t)CE->getValue())); } @@ -2671,9 +2675,7 @@ public: Inst.addOperand(MCOperand::createImm(CE->getValue() >> 2)); return; } - - const MCSymbolRefExpr *SR = dyn_cast<MCSymbolRefExpr>(Imm.Val); - assert(SR && "Unknown value type!"); + const MCSymbolRefExpr *SR = cast<MCSymbolRefExpr>(Imm.Val); Inst.addOperand(MCOperand::createExpr(SR)); } @@ -2685,10 +2687,7 @@ public: Inst.addOperand(MCOperand::createImm(CE->getValue())); return; } - - const MCSymbolRefExpr *SR = dyn_cast<MCSymbolRefExpr>(Imm.Val); - - assert(SR && "Unknown value type!"); + const MCSymbolRefExpr *SR = cast<MCSymbolRefExpr>(Imm.Val); Inst.addOperand(MCOperand::createExpr(SR)); return; } @@ -2750,7 +2749,7 @@ public: return; } - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); int Val = CE->getValue(); Inst.addOperand(MCOperand::createImm(Val)); } @@ -3130,7 +3129,7 @@ public: void addPowerTwoOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); Inst.addOperand(MCOperand::createImm(CE->getValue())); } @@ -3225,14 +3224,14 @@ public: assert(N == 1 && "Invalid number of operands!"); // The immediate encodes the type of constant as well as the value. // Mask in that this is an i8 splat. - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); Inst.addOperand(MCOperand::createImm(CE->getValue() | 0xe00)); } void addNEONi16splatOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); // The immediate encodes the type of constant as well as the value. - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); unsigned Value = CE->getValue(); Value = ARM_AM::encodeNEONi16splat(Value); Inst.addOperand(MCOperand::createImm(Value)); @@ -3241,7 +3240,7 @@ public: void addNEONi16splatNotOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); // The immediate encodes the type of constant as well as the value. - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); unsigned Value = CE->getValue(); Value = ARM_AM::encodeNEONi16splat(~Value & 0xffff); Inst.addOperand(MCOperand::createImm(Value)); @@ -3250,7 +3249,7 @@ public: void addNEONi32splatOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); // The immediate encodes the type of constant as well as the value. - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); unsigned Value = CE->getValue(); Value = ARM_AM::encodeNEONi32splat(Value); Inst.addOperand(MCOperand::createImm(Value)); @@ -3259,7 +3258,7 @@ public: void addNEONi32splatNotOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); // The immediate encodes the type of constant as well as the value. 
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); unsigned Value = CE->getValue(); Value = ARM_AM::encodeNEONi32splat(~Value); Inst.addOperand(MCOperand::createImm(Value)); @@ -3267,7 +3266,7 @@ public: void addNEONi8ReplicateOperands(MCInst &Inst, bool Inv) const { // The immediate encodes the type of constant as well as the value. - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); assert((Inst.getOpcode() == ARM::VMOVv8i8 || Inst.getOpcode() == ARM::VMOVv16i8) && "All instructions that wants to replicate non-zero byte " @@ -3298,7 +3297,7 @@ public: void addNEONi32vmovOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); // The immediate encodes the type of constant as well as the value. - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); unsigned Value = encodeNeonVMOVImmediate(CE->getValue()); Inst.addOperand(MCOperand::createImm(Value)); } @@ -3310,7 +3309,7 @@ public: void addNEONvmovi16ReplicateOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); assert((Inst.getOpcode() == ARM::VMOVv4i16 || Inst.getOpcode() == ARM::VMOVv8i16 || Inst.getOpcode() == ARM::VMVNv4i16 || @@ -3327,14 +3326,14 @@ public: void addNEONi32vmovNegOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); // The immediate encodes the type of constant as well as the value. - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); unsigned Value = encodeNeonVMOVImmediate(~CE->getValue()); Inst.addOperand(MCOperand::createImm(Value)); } void addNEONvmovi32ReplicateOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); assert((Inst.getOpcode() == ARM::VMOVv2i32 || Inst.getOpcode() == ARM::VMOVv4i32 || Inst.getOpcode() == ARM::VMVNv2i32 || @@ -3349,7 +3348,7 @@ public: void addNEONi64splatOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); // The immediate encodes the type of constant as well as the value. 
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); uint64_t Value = CE->getValue(); unsigned Imm = 0; for (unsigned i = 0; i < 8; ++i, Value >>= 8) { @@ -3360,20 +3359,28 @@ public: void addComplexRotationEvenOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); Inst.addOperand(MCOperand::createImm(CE->getValue() / 90)); } void addComplexRotationOddOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); Inst.addOperand(MCOperand::createImm((CE->getValue() - 90) / 180)); } + void addMveSaturateOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); + unsigned Imm = CE->getValue(); + assert((Imm == 48 || Imm == 64) && "Invalid saturate operand"); + Inst.addOperand(MCOperand::createImm(Imm == 48 ? 1 : 0)); + } + void print(raw_ostream &OS) const override; static std::unique_ptr<ARMOperand> CreateITMask(unsigned Mask, SMLoc S) { - auto Op = make_unique<ARMOperand>(k_ITCondMask); + auto Op = std::make_unique<ARMOperand>(k_ITCondMask); Op->ITMask.Mask = Mask; Op->StartLoc = S; Op->EndLoc = S; @@ -3382,7 +3389,7 @@ public: static std::unique_ptr<ARMOperand> CreateCondCode(ARMCC::CondCodes CC, SMLoc S) { - auto Op = make_unique<ARMOperand>(k_CondCode); + auto Op = std::make_unique<ARMOperand>(k_CondCode); Op->CC.Val = CC; Op->StartLoc = S; Op->EndLoc = S; @@ -3391,7 +3398,7 @@ public: static std::unique_ptr<ARMOperand> CreateVPTPred(ARMVCC::VPTCodes CC, SMLoc S) { - auto Op = make_unique<ARMOperand>(k_VPTPred); + auto Op = std::make_unique<ARMOperand>(k_VPTPred); Op->VCC.Val = CC; Op->StartLoc = S; Op->EndLoc = S; @@ -3399,7 +3406,7 @@ public: } static std::unique_ptr<ARMOperand> CreateCoprocNum(unsigned CopVal, SMLoc S) { - auto Op = make_unique<ARMOperand>(k_CoprocNum); + auto Op = std::make_unique<ARMOperand>(k_CoprocNum); Op->Cop.Val = CopVal; Op->StartLoc = S; Op->EndLoc = S; @@ -3407,7 +3414,7 @@ public: } static std::unique_ptr<ARMOperand> CreateCoprocReg(unsigned CopVal, SMLoc S) { - auto Op = make_unique<ARMOperand>(k_CoprocReg); + auto Op = std::make_unique<ARMOperand>(k_CoprocReg); Op->Cop.Val = CopVal; Op->StartLoc = S; Op->EndLoc = S; @@ -3416,7 +3423,7 @@ public: static std::unique_ptr<ARMOperand> CreateCoprocOption(unsigned Val, SMLoc S, SMLoc E) { - auto Op = make_unique<ARMOperand>(k_CoprocOption); + auto Op = std::make_unique<ARMOperand>(k_CoprocOption); Op->Cop.Val = Val; Op->StartLoc = S; Op->EndLoc = E; @@ -3424,7 +3431,7 @@ public: } static std::unique_ptr<ARMOperand> CreateCCOut(unsigned RegNum, SMLoc S) { - auto Op = make_unique<ARMOperand>(k_CCOut); + auto Op = std::make_unique<ARMOperand>(k_CCOut); Op->Reg.RegNum = RegNum; Op->StartLoc = S; Op->EndLoc = S; @@ -3432,7 +3439,7 @@ public: } static std::unique_ptr<ARMOperand> CreateToken(StringRef Str, SMLoc S) { - auto Op = make_unique<ARMOperand>(k_Token); + auto Op = std::make_unique<ARMOperand>(k_Token); Op->Tok.Data = Str.data(); Op->Tok.Length = Str.size(); Op->StartLoc = S; @@ -3442,7 +3449,7 @@ public: static std::unique_ptr<ARMOperand> CreateReg(unsigned RegNum, SMLoc S, SMLoc E) { - auto Op = 
make_unique<ARMOperand>(k_Register); + auto Op = std::make_unique<ARMOperand>(k_Register); Op->Reg.RegNum = RegNum; Op->StartLoc = S; Op->EndLoc = E; @@ -3453,7 +3460,7 @@ public: CreateShiftedRegister(ARM_AM::ShiftOpc ShTy, unsigned SrcReg, unsigned ShiftReg, unsigned ShiftImm, SMLoc S, SMLoc E) { - auto Op = make_unique<ARMOperand>(k_ShiftedRegister); + auto Op = std::make_unique<ARMOperand>(k_ShiftedRegister); Op->RegShiftedReg.ShiftTy = ShTy; Op->RegShiftedReg.SrcReg = SrcReg; Op->RegShiftedReg.ShiftReg = ShiftReg; @@ -3466,7 +3473,7 @@ public: static std::unique_ptr<ARMOperand> CreateShiftedImmediate(ARM_AM::ShiftOpc ShTy, unsigned SrcReg, unsigned ShiftImm, SMLoc S, SMLoc E) { - auto Op = make_unique<ARMOperand>(k_ShiftedImmediate); + auto Op = std::make_unique<ARMOperand>(k_ShiftedImmediate); Op->RegShiftedImm.ShiftTy = ShTy; Op->RegShiftedImm.SrcReg = SrcReg; Op->RegShiftedImm.ShiftImm = ShiftImm; @@ -3477,7 +3484,7 @@ public: static std::unique_ptr<ARMOperand> CreateShifterImm(bool isASR, unsigned Imm, SMLoc S, SMLoc E) { - auto Op = make_unique<ARMOperand>(k_ShifterImmediate); + auto Op = std::make_unique<ARMOperand>(k_ShifterImmediate); Op->ShifterImm.isASR = isASR; Op->ShifterImm.Imm = Imm; Op->StartLoc = S; @@ -3487,7 +3494,7 @@ public: static std::unique_ptr<ARMOperand> CreateRotImm(unsigned Imm, SMLoc S, SMLoc E) { - auto Op = make_unique<ARMOperand>(k_RotateImmediate); + auto Op = std::make_unique<ARMOperand>(k_RotateImmediate); Op->RotImm.Imm = Imm; Op->StartLoc = S; Op->EndLoc = E; @@ -3496,7 +3503,7 @@ public: static std::unique_ptr<ARMOperand> CreateModImm(unsigned Bits, unsigned Rot, SMLoc S, SMLoc E) { - auto Op = make_unique<ARMOperand>(k_ModifiedImmediate); + auto Op = std::make_unique<ARMOperand>(k_ModifiedImmediate); Op->ModImm.Bits = Bits; Op->ModImm.Rot = Rot; Op->StartLoc = S; @@ -3506,7 +3513,7 @@ public: static std::unique_ptr<ARMOperand> CreateConstantPoolImm(const MCExpr *Val, SMLoc S, SMLoc E) { - auto Op = make_unique<ARMOperand>(k_ConstantPoolImmediate); + auto Op = std::make_unique<ARMOperand>(k_ConstantPoolImmediate); Op->Imm.Val = Val; Op->StartLoc = S; Op->EndLoc = E; @@ -3515,7 +3522,7 @@ public: static std::unique_ptr<ARMOperand> CreateBitfield(unsigned LSB, unsigned Width, SMLoc S, SMLoc E) { - auto Op = make_unique<ARMOperand>(k_BitfieldDescriptor); + auto Op = std::make_unique<ARMOperand>(k_BitfieldDescriptor); Op->Bitfield.LSB = LSB; Op->Bitfield.Width = Width; Op->StartLoc = S; @@ -3543,16 +3550,15 @@ public: Kind = k_SPRRegisterList; } - // Sort based on the register encoding values. 
- array_pod_sort(Regs.begin(), Regs.end()); - if (Kind == k_RegisterList && Regs.back().second == ARM::APSR) Kind = k_RegisterListWithAPSR; - auto Op = make_unique<ARMOperand>(Kind); - for (SmallVectorImpl<std::pair<unsigned, unsigned>>::const_iterator - I = Regs.begin(), E = Regs.end(); I != E; ++I) - Op->Registers.push_back(I->second); + assert(std::is_sorted(Regs.begin(), Regs.end()) && + "Register list must be sorted by encoding"); + + auto Op = std::make_unique<ARMOperand>(Kind); + for (const auto &P : Regs) + Op->Registers.push_back(P.second); Op->StartLoc = StartLoc; Op->EndLoc = EndLoc; @@ -3563,7 +3569,7 @@ public: unsigned Count, bool isDoubleSpaced, SMLoc S, SMLoc E) { - auto Op = make_unique<ARMOperand>(k_VectorList); + auto Op = std::make_unique<ARMOperand>(k_VectorList); Op->VectorList.RegNum = RegNum; Op->VectorList.Count = Count; Op->VectorList.isDoubleSpaced = isDoubleSpaced; @@ -3575,7 +3581,7 @@ public: static std::unique_ptr<ARMOperand> CreateVectorListAllLanes(unsigned RegNum, unsigned Count, bool isDoubleSpaced, SMLoc S, SMLoc E) { - auto Op = make_unique<ARMOperand>(k_VectorListAllLanes); + auto Op = std::make_unique<ARMOperand>(k_VectorListAllLanes); Op->VectorList.RegNum = RegNum; Op->VectorList.Count = Count; Op->VectorList.isDoubleSpaced = isDoubleSpaced; @@ -3587,7 +3593,7 @@ public: static std::unique_ptr<ARMOperand> CreateVectorListIndexed(unsigned RegNum, unsigned Count, unsigned Index, bool isDoubleSpaced, SMLoc S, SMLoc E) { - auto Op = make_unique<ARMOperand>(k_VectorListIndexed); + auto Op = std::make_unique<ARMOperand>(k_VectorListIndexed); Op->VectorList.RegNum = RegNum; Op->VectorList.Count = Count; Op->VectorList.LaneIndex = Index; @@ -3599,7 +3605,7 @@ public: static std::unique_ptr<ARMOperand> CreateVectorIndex(unsigned Idx, SMLoc S, SMLoc E, MCContext &Ctx) { - auto Op = make_unique<ARMOperand>(k_VectorIndex); + auto Op = std::make_unique<ARMOperand>(k_VectorIndex); Op->VectorIndex.Val = Idx; Op->StartLoc = S; Op->EndLoc = E; @@ -3608,7 +3614,7 @@ public: static std::unique_ptr<ARMOperand> CreateImm(const MCExpr *Val, SMLoc S, SMLoc E) { - auto Op = make_unique<ARMOperand>(k_Immediate); + auto Op = std::make_unique<ARMOperand>(k_Immediate); Op->Imm.Val = Val; Op->StartLoc = S; Op->EndLoc = E; @@ -3620,7 +3626,7 @@ public: unsigned OffsetRegNum, ARM_AM::ShiftOpc ShiftType, unsigned ShiftImm, unsigned Alignment, bool isNegative, SMLoc S, SMLoc E, SMLoc AlignmentLoc = SMLoc()) { - auto Op = make_unique<ARMOperand>(k_Memory); + auto Op = std::make_unique<ARMOperand>(k_Memory); Op->Memory.BaseRegNum = BaseRegNum; Op->Memory.OffsetImm = OffsetImm; Op->Memory.OffsetRegNum = OffsetRegNum; @@ -3637,7 +3643,7 @@ public: static std::unique_ptr<ARMOperand> CreatePostIdxReg(unsigned RegNum, bool isAdd, ARM_AM::ShiftOpc ShiftTy, unsigned ShiftImm, SMLoc S, SMLoc E) { - auto Op = make_unique<ARMOperand>(k_PostIndexRegister); + auto Op = std::make_unique<ARMOperand>(k_PostIndexRegister); Op->PostIdxReg.RegNum = RegNum; Op->PostIdxReg.isAdd = isAdd; Op->PostIdxReg.ShiftTy = ShiftTy; @@ -3649,7 +3655,7 @@ public: static std::unique_ptr<ARMOperand> CreateMemBarrierOpt(ARM_MB::MemBOpt Opt, SMLoc S) { - auto Op = make_unique<ARMOperand>(k_MemBarrierOpt); + auto Op = std::make_unique<ARMOperand>(k_MemBarrierOpt); Op->MBOpt.Val = Opt; Op->StartLoc = S; Op->EndLoc = S; @@ -3658,7 +3664,7 @@ public: static std::unique_ptr<ARMOperand> CreateInstSyncBarrierOpt(ARM_ISB::InstSyncBOpt Opt, SMLoc S) { - auto Op = make_unique<ARMOperand>(k_InstSyncBarrierOpt); + auto Op = 
std::make_unique<ARMOperand>(k_InstSyncBarrierOpt); Op->ISBOpt.Val = Opt; Op->StartLoc = S; Op->EndLoc = S; @@ -3667,7 +3673,7 @@ public: static std::unique_ptr<ARMOperand> CreateTraceSyncBarrierOpt(ARM_TSB::TraceSyncBOpt Opt, SMLoc S) { - auto Op = make_unique<ARMOperand>(k_TraceSyncBarrierOpt); + auto Op = std::make_unique<ARMOperand>(k_TraceSyncBarrierOpt); Op->TSBOpt.Val = Opt; Op->StartLoc = S; Op->EndLoc = S; @@ -3676,7 +3682,7 @@ public: static std::unique_ptr<ARMOperand> CreateProcIFlags(ARM_PROC::IFlags IFlags, SMLoc S) { - auto Op = make_unique<ARMOperand>(k_ProcIFlags); + auto Op = std::make_unique<ARMOperand>(k_ProcIFlags); Op->IFlags.Val = IFlags; Op->StartLoc = S; Op->EndLoc = S; @@ -3684,7 +3690,7 @@ public: } static std::unique_ptr<ARMOperand> CreateMSRMask(unsigned MMask, SMLoc S) { - auto Op = make_unique<ARMOperand>(k_MSRMask); + auto Op = std::make_unique<ARMOperand>(k_MSRMask); Op->MMask.Val = MMask; Op->StartLoc = S; Op->EndLoc = S; @@ -3692,7 +3698,7 @@ public: } static std::unique_ptr<ARMOperand> CreateBankedReg(unsigned Reg, SMLoc S) { - auto Op = make_unique<ARMOperand>(k_BankedReg); + auto Op = std::make_unique<ARMOperand>(k_BankedReg); Op->BankedReg.Val = Reg; Op->StartLoc = S; Op->EndLoc = S; @@ -4253,6 +4259,24 @@ static unsigned getNextRegister(unsigned Reg) { } } +// Insert an <Encoding, Register> pair in an ordered vector. Return true on +// success, or false, if duplicate encoding found. +static bool +insertNoDuplicates(SmallVectorImpl<std::pair<unsigned, unsigned>> &Regs, + unsigned Enc, unsigned Reg) { + Regs.emplace_back(Enc, Reg); + for (auto I = Regs.rbegin(), J = I + 1, E = Regs.rend(); J != E; ++I, ++J) { + if (J->first == Enc) { + Regs.erase(J.base()); + return false; + } + if (J->first < Enc) + break; + std::swap(*I, *J); + } + return true; +} + /// Parse a register list. bool ARMAsmParser::parseRegisterList(OperandVector &Operands, bool EnforceOrder) { @@ -4278,7 +4302,7 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, if (ARMMCRegisterClasses[ARM::QPRRegClassID].contains(Reg)) { Reg = getDRegFromQReg(Reg); EReg = MRI->getEncodingValue(Reg); - Registers.push_back(std::pair<unsigned, unsigned>(EReg, Reg)); + Registers.emplace_back(EReg, Reg); ++Reg; } const MCRegisterClass *RC; @@ -4295,7 +4319,7 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, // Store the register. EReg = MRI->getEncodingValue(Reg); - Registers.push_back(std::pair<unsigned, unsigned>(EReg, Reg)); + Registers.emplace_back(EReg, Reg); // This starts immediately after the first register token in the list, // so we can see either a comma or a minus (range separator) as a legal @@ -4326,7 +4350,11 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, while (Reg != EndReg) { Reg = getNextRegister(Reg); EReg = MRI->getEncodingValue(Reg); - Registers.push_back(std::pair<unsigned, unsigned>(EReg, Reg)); + if (!insertNoDuplicates(Registers, EReg, Reg)) { + Warning(AfterMinusLoc, StringRef("duplicated register (") + + ARMInstPrinter::getRegisterName(Reg) + + ") in register list"); + } } continue; } @@ -4350,11 +4378,16 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, // subset of GPRRegClassId except it contains APSR as well. 
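// Illustrative usage of insertNoDuplicates above (values chosen for
// exposition, not taken from the patch): the helper keeps the
// <Encoding, Register> pairs sorted by encoding and refuses to add a second
// entry with the same encoding, e.g.
//   SmallVector<std::pair<unsigned, unsigned>, 4> Regs;
//   insertNoDuplicates(Regs, 1, ARM::R1); // true,  Regs = {(1,R1)}
//   insertNoDuplicates(Regs, 0, ARM::R0); // true,  Regs = {(0,R0), (1,R1)}
//   insertNoDuplicates(Regs, 1, ARM::R1); // false, Regs unchanged
// parseRegisterList uses the false return to emit its "duplicated register"
// warning while the list stays sorted for the std::is_sorted assertions in
// CreateRegList and the MC code emitter.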
RC = &ARMMCRegisterClasses[ARM::GPRwithAPSRnospRegClassID]; } - if (Reg == ARM::VPR && (RC == &ARMMCRegisterClasses[ARM::SPRRegClassID] || - RC == &ARMMCRegisterClasses[ARM::DPRRegClassID])) { + if (Reg == ARM::VPR && + (RC == &ARMMCRegisterClasses[ARM::SPRRegClassID] || + RC == &ARMMCRegisterClasses[ARM::DPRRegClassID] || + RC == &ARMMCRegisterClasses[ARM::FPWithVPRRegClassID])) { RC = &ARMMCRegisterClasses[ARM::FPWithVPRRegClassID]; EReg = MRI->getEncodingValue(Reg); - Registers.push_back(std::pair<unsigned, unsigned>(EReg, Reg)); + if (!insertNoDuplicates(Registers, EReg, Reg)) { + Warning(RegLoc, "duplicated register (" + RegTok.getString() + + ") in register list"); + } continue; } // The register must be in the same register class as the first. @@ -4371,21 +4404,19 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, else if (!ARMMCRegisterClasses[ARM::GPRwithAPSRnospRegClassID].contains(Reg)) return Error(RegLoc, "register list not in ascending order"); } - if (MRI->getEncodingValue(Reg) == MRI->getEncodingValue(OldReg)) { - Warning(RegLoc, "duplicated register (" + RegTok.getString() + - ") in register list"); - continue; - } // VFP register lists must also be contiguous. if (RC != &ARMMCRegisterClasses[ARM::GPRRegClassID] && RC != &ARMMCRegisterClasses[ARM::GPRwithAPSRnospRegClassID] && Reg != OldReg + 1) return Error(RegLoc, "non-contiguous register range"); EReg = MRI->getEncodingValue(Reg); - Registers.push_back(std::pair<unsigned, unsigned>(EReg, Reg)); + if (!insertNoDuplicates(Registers, EReg, Reg)) { + Warning(RegLoc, "duplicated register (" + RegTok.getString() + + ") in register list"); + } if (isQReg) { EReg = MRI->getEncodingValue(++Reg); - Registers.push_back(std::pair<unsigned, unsigned>(EReg, Reg)); + Registers.emplace_back(EReg, Reg); } } @@ -5702,14 +5733,16 @@ bool ARMAsmParser::parseMemory(OperandVector &Operands) { return false; } - // If we have a '#', it's an immediate offset, else assume it's a register - // offset. Be friendly and also accept a plain integer (without a leading - // hash) for gas compatibility. + // If we have a '#' or '$', it's an immediate offset, else assume it's a + // register offset. Be friendly and also accept a plain integer or expression + // (without a leading hash) for gas compatibility. if (Parser.getTok().is(AsmToken::Hash) || Parser.getTok().is(AsmToken::Dollar) || + Parser.getTok().is(AsmToken::LParen) || Parser.getTok().is(AsmToken::Integer)) { - if (Parser.getTok().isNot(AsmToken::Integer)) - Parser.Lex(); // Eat '#' or '$'. 
+ if (Parser.getTok().is(AsmToken::Hash) || + Parser.getTok().is(AsmToken::Dollar)) + Parser.Lex(); // Eat '#' or '$' E = Parser.getTok().getLoc(); bool isNegative = getParser().getTok().is(AsmToken::Minus); @@ -11308,7 +11341,7 @@ bool ARMAsmParser::parseDirectiveUnwindRaw(SMLoc L) { SmallVector<uint8_t, 16> Opcodes; auto parseOne = [&]() -> bool { - const MCExpr *OE; + const MCExpr *OE = nullptr; SMLoc OpcodeLoc = getLexer().getLoc(); if (check(getLexer().is(AsmToken::EndOfStatement) || Parser.parseExpression(OE), @@ -11694,14 +11727,14 @@ bool ARMAsmParser::parseDirectiveArchExtension(SMLoc L) { { ARM::AEK_CRYPTO, {Feature_HasV8Bit}, {ARM::FeatureCrypto, ARM::FeatureNEON, ARM::FeatureFPARMv8} }, { ARM::AEK_FP, {Feature_HasV8Bit}, - {ARM::FeatureVFP2_D16_SP, ARM::FeatureFPARMv8} }, + {ARM::FeatureVFP2_SP, ARM::FeatureFPARMv8} }, { (ARM::AEK_HWDIVTHUMB | ARM::AEK_HWDIVARM), {Feature_HasV7Bit, Feature_IsNotMClassBit}, {ARM::FeatureHWDivThumb, ARM::FeatureHWDivARM} }, { ARM::AEK_MP, {Feature_HasV7Bit, Feature_IsNotMClassBit}, {ARM::FeatureMP} }, { ARM::AEK_SIMD, {Feature_HasV8Bit}, - {ARM::FeatureNEON, ARM::FeatureVFP2_D16_SP, ARM::FeatureFPARMv8} }, + {ARM::FeatureNEON, ARM::FeatureVFP2_SP, ARM::FeatureFPARMv8} }, { ARM::AEK_SEC, {Feature_HasV6KBit}, {ARM::FeatureTrustZone} }, // FIXME: Only available in A-class, isel not predicated { ARM::AEK_VIRT, {Feature_HasV7Bit}, {ARM::FeatureVirtualization} }, @@ -11775,19 +11808,19 @@ unsigned ARMAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp, // immediate in the syntax. switch (Kind) { default: break; - case MCK__35_0: + case MCK__HASH_0: if (Op.isImm()) if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op.getImm())) if (CE->getValue() == 0) return Match_Success; break; - case MCK__35_8: + case MCK__HASH_8: if (Op.isImm()) if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op.getImm())) if (CE->getValue() == 8) return Match_Success; break; - case MCK__35_16: + case MCK__HASH_16: if (Op.isImm()) if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op.getImm())) if (CE->getValue() == 16) diff --git a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp index 673691ebd93e..eabc26d05f47 100644 --- a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp +++ b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp @@ -314,7 +314,7 @@ static DecodeStatus DecodeVLD3DupInstruction(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder); static DecodeStatus DecodeVLD4DupInstruction(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeNEONModImmInstruction(MCInst &Inst,unsigned Val, +static DecodeStatus DecodeVMOVModImmInstruction(MCInst &Inst,unsigned Val, uint64_t Address, const void *Decoder); static DecodeStatus DecodeMVEModImmInstruction(MCInst &Inst,unsigned Val, uint64_t Address, const void *Decoder); @@ -561,6 +561,8 @@ static DecodeStatus DecodeMVEVCMP(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder); static DecodeStatus DecodeMveVCTP(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeMVEVPNOT(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); static DecodeStatus DecodeMVEOverlappingLongShift(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder); @@ -3445,7 +3447,7 @@ static DecodeStatus DecodeVLD4DupInstruction(MCInst &Inst, unsigned Insn, } static DecodeStatus -DecodeNEONModImmInstruction(MCInst &Inst, unsigned Insn, 
+DecodeVMOVModImmInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -5679,7 +5681,7 @@ static DecodeStatus DecodeVCVTD(MCInst &Inst, unsigned Insn, } } } - return DecodeNEONModImmInstruction(Inst, Insn, Address, Decoder); + return DecodeVMOVModImmInstruction(Inst, Insn, Address, Decoder); } if (!(imm & 0x20)) return MCDisassembler::Fail; @@ -5738,7 +5740,7 @@ static DecodeStatus DecodeVCVTQ(MCInst &Inst, unsigned Insn, } } } - return DecodeNEONModImmInstruction(Inst, Insn, Address, Decoder); + return DecodeVMOVModImmInstruction(Inst, Insn, Address, Decoder); } if (!(imm & 0x20)) return MCDisassembler::Fail; @@ -6481,6 +6483,12 @@ static DecodeStatus DecodeMVEOverlappingLongShift( if (!Check(S, DecoderGPRRegisterClass(Inst, Rm, Address, Decoder))) return MCDisassembler::Fail; + if (fieldFromInstruction (Insn, 6, 3) != 4) + return MCDisassembler::SoftFail; + + if (Rda == Rm) + return MCDisassembler::SoftFail; + return S; } @@ -6503,6 +6511,13 @@ static DecodeStatus DecodeMVEOverlappingLongShift( if (!Check(S, DecoderGPRRegisterClass(Inst, Rm, Address, Decoder))) return MCDisassembler::Fail; + if (Inst.getOpcode() == ARM::MVE_SQRSHRL || + Inst.getOpcode() == ARM::MVE_UQRSHLL) { + unsigned Saturate = fieldFromInstruction(Insn, 7, 1); + // Saturate, the bit position for saturation + Inst.addOperand(MCOperand::createImm(Saturate)); + } + return S; } @@ -6572,3 +6587,11 @@ static DecodeStatus DecodeMveVCTP(MCInst &Inst, unsigned Insn, uint64_t Address, return MCDisassembler::Fail; return S; } + +static DecodeStatus DecodeMVEVPNOT(MCInst &Inst, unsigned Insn, uint64_t Address, + const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + Inst.addOperand(MCOperand::createReg(ARM::VPR)); + Inst.addOperand(MCOperand::createReg(ARM::VPR)); + return S; +} diff --git a/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h b/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h index 7732a6485a85..24a9fabf0979 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h +++ b/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h @@ -518,10 +518,10 @@ namespace ARM_AM { // Valid alignments depend on the specific instruction. //===--------------------------------------------------------------------===// - // NEON Modified Immediates + // NEON/MVE Modified Immediates //===--------------------------------------------------------------------===// // - // Several NEON instructions (e.g., VMOV) take a "modified immediate" + // Several NEON and MVE instructions (e.g., VMOV) take a "modified immediate" // vector operand, where a small immediate encoded in the instruction // specifies a full NEON vector value. These modified immediates are // represented here as encoded integers. The low 8 bits hold the immediate @@ -529,20 +529,20 @@ namespace ARM_AM { // the "Cmode" field of the instruction. The interfaces below treat the // Op and Cmode values as a single 5-bit value. 
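  // Illustrative round trip through the interfaces below (example values,
  // not from the patch): with OpCmode 0xe (an 8-bit element splat) and
  // immediate 0x55,
  //   unsigned Enc = ARM_AM::createVMOVModImm(0xe, 0x55); // == 0xe55
  //   ARM_AM::getVMOVModImmOpCmode(Enc);                  // == 0xe
  //   ARM_AM::getVMOVModImmVal(Enc);                      // == 0x55
  //   unsigned EltBits;
  //   ARM_AM::decodeVMOVModImm(Enc, EltBits);             // == 0x55, EltBits == 8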
- inline unsigned createNEONModImm(unsigned OpCmode, unsigned Val) { + inline unsigned createVMOVModImm(unsigned OpCmode, unsigned Val) { return (OpCmode << 8) | Val; } - inline unsigned getNEONModImmOpCmode(unsigned ModImm) { + inline unsigned getVMOVModImmOpCmode(unsigned ModImm) { return (ModImm >> 8) & 0x1f; } - inline unsigned getNEONModImmVal(unsigned ModImm) { return ModImm & 0xff; } + inline unsigned getVMOVModImmVal(unsigned ModImm) { return ModImm & 0xff; } - /// decodeNEONModImm - Decode a NEON modified immediate value into the + /// decodeVMOVModImm - Decode a NEON/MVE modified immediate value into the /// element value and the element size in bits. (If the element size is /// smaller than the vector, it is splatted into all the elements.) - inline uint64_t decodeNEONModImm(unsigned ModImm, unsigned &EltBits) { - unsigned OpCmode = getNEONModImmOpCmode(ModImm); - unsigned Imm8 = getNEONModImmVal(ModImm); + inline uint64_t decodeVMOVModImm(unsigned ModImm, unsigned &EltBits) { + unsigned OpCmode = getVMOVModImmOpCmode(ModImm); + unsigned Imm8 = getVMOVModImmVal(ModImm); uint64_t Val = 0; if (OpCmode == 0xe) { @@ -572,7 +572,7 @@ namespace ARM_AM { } EltBits = 64; } else { - llvm_unreachable("Unsupported NEON immediate"); + llvm_unreachable("Unsupported VMOV immediate"); } return Val; } diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp index aeab5be78ab4..6196881a9b8f 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp @@ -233,7 +233,7 @@ static const char *checkPCRelOffset(uint64_t Value, int64_t Min, int64_t Max) { const char *ARMAsmBackend::reasonForFixupRelaxation(const MCFixup &Fixup, uint64_t Value) const { - switch ((unsigned)Fixup.getKind()) { + switch (Fixup.getTargetKind()) { case ARM::fixup_arm_thumb_br: { // Relaxing tB to t2B. tB has a signed 12-bit displacement with the // low bit being an implied zero. There's an implied +4 offset for the @@ -870,7 +870,7 @@ bool ARMAsmBackend::shouldForceRelocation(const MCAssembler &Asm, const MCValue &Target) { const MCSymbolRefExpr *A = Target.getSymA(); const MCSymbol *Sym = A ? &A->getSymbol() : nullptr; - const unsigned FixupKind = Fixup.getKind() ; + const unsigned FixupKind = Fixup.getKind(); if (FixupKind == FK_NONE) return true; if (FixupKind == ARM::fixup_arm_thumb_bl) { @@ -1105,28 +1105,28 @@ uint32_t ARMAsmBackendDarwin::generateCompactUnwindEncoding( if (Instrs.empty()) return 0; // Start off assuming CFA is at SP+0. - int CFARegister = ARM::SP; + unsigned CFARegister = ARM::SP; int CFARegisterOffset = 0; // Mark savable registers as initially unsaved DenseMap<unsigned, int> RegOffsets; int FloatRegCount = 0; // Process each .cfi directive and build up compact unwind info. 
for (size_t i = 0, e = Instrs.size(); i != e; ++i) { - int Reg; + unsigned Reg; const MCCFIInstruction &Inst = Instrs[i]; switch (Inst.getOperation()) { case MCCFIInstruction::OpDefCfa: // DW_CFA_def_cfa CFARegisterOffset = -Inst.getOffset(); - CFARegister = MRI.getLLVMRegNum(Inst.getRegister(), true); + CFARegister = *MRI.getLLVMRegNum(Inst.getRegister(), true); break; case MCCFIInstruction::OpDefCfaOffset: // DW_CFA_def_cfa_offset CFARegisterOffset = -Inst.getOffset(); break; case MCCFIInstruction::OpDefCfaRegister: // DW_CFA_def_cfa_register - CFARegister = MRI.getLLVMRegNum(Inst.getRegister(), true); + CFARegister = *MRI.getLLVMRegNum(Inst.getRegister(), true); break; case MCCFIInstruction::OpOffset: // DW_CFA_offset - Reg = MRI.getLLVMRegNum(Inst.getRegister(), true); + Reg = *MRI.getLLVMRegNum(Inst.getRegister(), true); if (ARMMCRegisterClasses[ARM::GPRRegClassID].contains(Reg)) RegOffsets[Reg] = Inst.getOffset(); else if (ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Reg)) { diff --git a/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h b/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h index c4daafe8ee97..6293a2462306 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h +++ b/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h @@ -393,6 +393,9 @@ namespace ARMII { // in an IT block). ThumbArithFlagSetting = 1 << 19, + // Whether an instruction can be included in an MVE tail-predicated loop. + ValidForTailPredication = 1 << 20, + //===------------------------------------------------------------------===// // Code domain. DomainShift = 15, diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp index fda19eea1de6..1fee38821a49 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp @@ -82,7 +82,7 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target, MCSymbolRefExpr::VariantKind Modifier = Target.getAccessVariant(); if (IsPCRel) { - switch ((unsigned)Fixup.getKind()) { + switch (Fixup.getTargetKind()) { default: Ctx.reportFatalError(Fixup.getLoc(), "unsupported relocation on symbol"); return ELF::R_ARM_NONE; @@ -145,7 +145,7 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target, return ELF::R_ARM_THM_BF18; } } - switch ((unsigned)Fixup.getKind()) { + switch (Fixup.getTargetKind()) { default: Ctx.reportFatalError(Fixup.getLoc(), "unsupported relocation on symbol"); return ELF::R_ARM_NONE; @@ -263,5 +263,5 @@ void ARMELFObjectWriter::addTargetSectionFlags(MCContext &Ctx, std::unique_ptr<MCObjectTargetWriter> llvm::createARMELFObjectWriter(uint8_t OSABI) { - return llvm::make_unique<ARMELFObjectWriter>(OSABI); + return std::make_unique<ARMELFObjectWriter>(OSABI); } diff --git a/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp b/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp index 45be1ee96342..a1def61b58d9 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp @@ -1334,12 +1334,12 @@ void ARMInstPrinter::printFPImmOperand(const MCInst *MI, unsigned OpNum, << markup(">"); } -void ARMInstPrinter::printNEONModImmOperand(const MCInst *MI, unsigned OpNum, +void ARMInstPrinter::printVMOVModImmOperand(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O) { unsigned EncodedImm = MI->getOperand(OpNum).getImm(); unsigned EltBits; - uint64_t Val = ARM_AM::decodeNEONModImm(EncodedImm, EltBits); + uint64_t Val = ARM_AM::decodeVMOVModImm(EncodedImm, EltBits); O << markup("<imm:") 
<< "#0x"; O.write_hex(Val); O << markup(">"); @@ -1676,3 +1676,11 @@ void ARMInstPrinter::printExpandedImmOperand(const MCInst *MI, unsigned OpNum, O.write_hex(Val); O << markup(">"); } + +void ARMInstPrinter::printMveSaturateOp(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + uint32_t Val = MI->getOperand(OpNum).getImm(); + assert(Val <= 1 && "Invalid MVE saturate operand"); + O << "#" << (Val == 1 ? 48 : 64); +} diff --git a/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h b/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h index 69026956b60e..eeb811e216fc 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h +++ b/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h @@ -191,7 +191,7 @@ public: const MCSubtargetInfo &STI, raw_ostream &O); void printFPImmOperand(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O); - void printNEONModImmOperand(const MCInst *MI, unsigned OpNum, + void printVMOVModImmOperand(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O); void printImmPlusOneOperand(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O); @@ -262,7 +262,8 @@ public: const MCSubtargetInfo &STI, raw_ostream &O); void printExpandedImmOperand(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O); - + void printMveSaturateOp(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); private: unsigned DefaultAltIdx = ARM::NoRegAltName; }; diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp index dca6fe37d49a..268fe7efd9ce 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp @@ -1720,7 +1720,6 @@ getRegisterListOpValue(const MCInst &MI, unsigned Op, unsigned Reg = MI.getOperand(Op).getReg(); bool SPRRegs = ARMMCRegisterClasses[ARM::SPRRegClassID].contains(Reg); bool DPRRegs = ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Reg); - bool CLRMRegs = MI.getOpcode() == ARM::t2CLRM; unsigned Binary = 0; @@ -1739,21 +1738,13 @@ getRegisterListOpValue(const MCInst &MI, unsigned Op, Binary |= NumRegs * 2; } else { const MCRegisterInfo &MRI = *CTX.getRegisterInfo(); - if (!CLRMRegs) { - assert(std::is_sorted(MI.begin() + Op, MI.end(), - [&](const MCOperand &LHS, const MCOperand &RHS) { - return MRI.getEncodingValue(LHS.getReg()) < - MRI.getEncodingValue(RHS.getReg()); - })); - } - + assert(std::is_sorted(MI.begin() + Op, MI.end(), + [&](const MCOperand &LHS, const MCOperand &RHS) { + return MRI.getEncodingValue(LHS.getReg()) < + MRI.getEncodingValue(RHS.getReg()); + })); for (unsigned I = Op, E = MI.getNumOperands(); I < E; ++I) { - unsigned RegNo; - if (CLRMRegs && MI.getOperand(I).getReg() == ARM::APSR) { - RegNo = 15; - } else { - RegNo = MRI.getEncodingValue(MI.getOperand(I).getReg()); - } + unsigned RegNo = MRI.getEncodingValue(MI.getOperand(I).getReg()); Binary |= 1 << RegNo; } } diff --git a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp index c49885023cb2..ed4000c7e5be 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp @@ -204,7 +204,7 @@ RecordARMScatteredHalfRelocation(MachObjectWriter *Writer, // relocation entry in the low 16 bits of r_address field. 
unsigned ThumbBit = 0; unsigned MovtBit = 0; - switch ((unsigned)Fixup.getKind()) { + switch (Fixup.getTargetKind()) { default: break; case ARM::fixup_arm_movt_hi16: MovtBit = 1; @@ -480,7 +480,7 @@ void ARMMachObjectWriter::recordRelocation(MachObjectWriter *Writer, // PAIR. I.e. it's correct that we insert the high bits of the addend in the // MOVW case here. relocation entries. uint32_t Value = 0; - switch ((unsigned)Fixup.getKind()) { + switch (Fixup.getTargetKind()) { default: break; case ARM::fixup_arm_movw_lo16: case ARM::fixup_t2_movw_lo16: @@ -506,5 +506,5 @@ void ARMMachObjectWriter::recordRelocation(MachObjectWriter *Writer, std::unique_ptr<MCObjectTargetWriter> llvm::createARMMachObjectWriter(bool Is64Bit, uint32_t CPUType, uint32_t CPUSubtype) { - return llvm::make_unique<ARMMachObjectWriter>(Is64Bit, CPUType, CPUSubtype); + return std::make_unique<ARMMachObjectWriter>(Is64Bit, CPUType, CPUSubtype); } diff --git a/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp index b863517c0cca..7b30a61e8ccb 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp @@ -249,12 +249,12 @@ void ARMTargetStreamer::emitTargetAttributes(const MCSubtargetInfo &STI) { : ARM::FK_VFPV3_D16) : (STI.hasFeature(ARM::FeatureFP16) ? ARM::FK_VFPV3XD_FP16 : ARM::FK_VFPV3XD))); - else if (STI.hasFeature(ARM::FeatureVFP2_D16_SP)) + else if (STI.hasFeature(ARM::FeatureVFP2_SP)) emitFPU(ARM::FK_VFPV2); } // ABI_HardFP_use attribute to indicate single precision FP. - if (STI.hasFeature(ARM::FeatureVFP2_D16_SP) && !STI.hasFeature(ARM::FeatureFP64)) + if (STI.hasFeature(ARM::FeatureVFP2_SP) && !STI.hasFeature(ARM::FeatureFP64)) emitAttribute(ARMBuildAttrs::ABI_HardFP_use, ARMBuildAttrs::HardFPSinglePrecision); diff --git a/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp index 054a95dd1e12..900c5fe30364 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp @@ -92,7 +92,7 @@ namespace llvm { std::unique_ptr<MCObjectTargetWriter> createARMWinCOFFObjectWriter(bool Is64Bit) { - return llvm::make_unique<ARMWinCOFFObjectWriter>(Is64Bit); + return std::make_unique<ARMWinCOFFObjectWriter>(Is64Bit); } } // end namespace llvm diff --git a/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp index 2e816bea5e91..b3c8146a9bde 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp @@ -22,20 +22,10 @@ public: std::unique_ptr<MCObjectWriter> OW) : MCWinCOFFStreamer(C, std::move(AB), std::move(CE), std::move(OW)) {} - void EmitAssemblerFlag(MCAssemblerFlag Flag) override; void EmitThumbFunc(MCSymbol *Symbol) override; void FinishImpl() override; }; -void ARMWinCOFFStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) { - switch (Flag) { - default: llvm_unreachable("not implemented"); - case MCAF_SyntaxUnified: - case MCAF_Code16: - break; - } -} - void ARMWinCOFFStreamer::EmitThumbFunc(MCSymbol *Symbol) { getAssembler().setIsThumbFunc(Symbol); } diff --git a/lib/Target/ARM/MLxExpansionPass.cpp b/lib/Target/ARM/MLxExpansionPass.cpp index 4b25986b90a7..cc31929899b4 100644 --- a/lib/Target/ARM/MLxExpansionPass.cpp +++ b/lib/Target/ARM/MLxExpansionPass.cpp @@ -86,8 +86,8 @@ void MLxExpansion::pushStack(MachineInstr *MI) { MachineInstr *MLxExpansion::getAccDefMI(MachineInstr 
*MI) const { // Look past COPY and INSERT_SUBREG instructions to find the // real definition MI. This is important for _sfp instructions. - unsigned Reg = MI->getOperand(1).getReg(); - if (TargetRegisterInfo::isPhysicalRegister(Reg)) + Register Reg = MI->getOperand(1).getReg(); + if (Register::isPhysicalRegister(Reg)) return nullptr; MachineBasicBlock *MBB = MI->getParent(); @@ -97,13 +97,13 @@ MachineInstr *MLxExpansion::getAccDefMI(MachineInstr *MI) const { break; if (DefMI->isCopyLike()) { Reg = DefMI->getOperand(1).getReg(); - if (TargetRegisterInfo::isVirtualRegister(Reg)) { + if (Register::isVirtualRegister(Reg)) { DefMI = MRI->getVRegDef(Reg); continue; } } else if (DefMI->isInsertSubreg()) { Reg = DefMI->getOperand(2).getReg(); - if (TargetRegisterInfo::isVirtualRegister(Reg)) { + if (Register::isVirtualRegister(Reg)) { DefMI = MRI->getVRegDef(Reg); continue; } @@ -114,9 +114,8 @@ MachineInstr *MLxExpansion::getAccDefMI(MachineInstr *MI) const { } unsigned MLxExpansion::getDefReg(MachineInstr *MI) const { - unsigned Reg = MI->getOperand(0).getReg(); - if (TargetRegisterInfo::isPhysicalRegister(Reg) || - !MRI->hasOneNonDBGUse(Reg)) + Register Reg = MI->getOperand(0).getReg(); + if (Register::isPhysicalRegister(Reg) || !MRI->hasOneNonDBGUse(Reg)) return Reg; MachineBasicBlock *MBB = MI->getParent(); @@ -126,8 +125,7 @@ unsigned MLxExpansion::getDefReg(MachineInstr *MI) const { while (UseMI->isCopy() || UseMI->isInsertSubreg()) { Reg = UseMI->getOperand(0).getReg(); - if (TargetRegisterInfo::isPhysicalRegister(Reg) || - !MRI->hasOneNonDBGUse(Reg)) + if (Register::isPhysicalRegister(Reg) || !MRI->hasOneNonDBGUse(Reg)) return Reg; UseMI = &*MRI->use_instr_nodbg_begin(Reg); if (UseMI->getParent() != MBB) @@ -140,8 +138,8 @@ unsigned MLxExpansion::getDefReg(MachineInstr *MI) const { /// hasLoopHazard - Check whether an MLx instruction is chained to itself across /// a single-MBB loop. 
bool MLxExpansion::hasLoopHazard(MachineInstr *MI) const { - unsigned Reg = MI->getOperand(1).getReg(); - if (TargetRegisterInfo::isPhysicalRegister(Reg)) + Register Reg = MI->getOperand(1).getReg(); + if (Register::isPhysicalRegister(Reg)) return false; MachineBasicBlock *MBB = MI->getParent(); @@ -154,8 +152,8 @@ outer_continue: if (DefMI->isPHI()) { for (unsigned i = 1, e = DefMI->getNumOperands(); i < e; i += 2) { if (DefMI->getOperand(i + 1).getMBB() == MBB) { - unsigned SrcReg = DefMI->getOperand(i).getReg(); - if (TargetRegisterInfo::isVirtualRegister(SrcReg)) { + Register SrcReg = DefMI->getOperand(i).getReg(); + if (Register::isVirtualRegister(SrcReg)) { DefMI = MRI->getVRegDef(SrcReg); goto outer_continue; } @@ -163,13 +161,13 @@ outer_continue: } } else if (DefMI->isCopyLike()) { Reg = DefMI->getOperand(1).getReg(); - if (TargetRegisterInfo::isVirtualRegister(Reg)) { + if (Register::isVirtualRegister(Reg)) { DefMI = MRI->getVRegDef(Reg); continue; } } else if (DefMI->isInsertSubreg()) { Reg = DefMI->getOperand(2).getReg(); - if (TargetRegisterInfo::isVirtualRegister(Reg)) { + if (Register::isVirtualRegister(Reg)) { DefMI = MRI->getVRegDef(Reg); continue; } @@ -271,23 +269,23 @@ void MLxExpansion::ExpandFPMLxInstruction(MachineBasicBlock &MBB, MachineInstr *MI, unsigned MulOpc, unsigned AddSubOpc, bool NegAcc, bool HasLane) { - unsigned DstReg = MI->getOperand(0).getReg(); + Register DstReg = MI->getOperand(0).getReg(); bool DstDead = MI->getOperand(0).isDead(); - unsigned AccReg = MI->getOperand(1).getReg(); - unsigned Src1Reg = MI->getOperand(2).getReg(); - unsigned Src2Reg = MI->getOperand(3).getReg(); + Register AccReg = MI->getOperand(1).getReg(); + Register Src1Reg = MI->getOperand(2).getReg(); + Register Src2Reg = MI->getOperand(3).getReg(); bool Src1Kill = MI->getOperand(2).isKill(); bool Src2Kill = MI->getOperand(3).isKill(); unsigned LaneImm = HasLane ? MI->getOperand(4).getImm() : 0; unsigned NextOp = HasLane ? 5 : 4; ARMCC::CondCodes Pred = (ARMCC::CondCodes)MI->getOperand(NextOp).getImm(); - unsigned PredReg = MI->getOperand(++NextOp).getReg(); + Register PredReg = MI->getOperand(++NextOp).getReg(); const MCInstrDesc &MCID1 = TII->get(MulOpc); const MCInstrDesc &MCID2 = TII->get(AddSubOpc); const MachineFunction &MF = *MI->getParent()->getParent(); - unsigned TmpReg = MRI->createVirtualRegister( - TII->getRegClass(MCID1, 0, TRI, MF)); + Register TmpReg = + MRI->createVirtualRegister(TII->getRegClass(MCID1, 0, TRI, MF)); MachineInstrBuilder MIB = BuildMI(MBB, MI, MI->getDebugLoc(), MCID1, TmpReg) .addReg(Src1Reg, getKillRegState(Src1Kill)) diff --git a/lib/Target/ARM/MVETailPredication.cpp b/lib/Target/ARM/MVETailPredication.cpp new file mode 100644 index 000000000000..4db8ab17c49b --- /dev/null +++ b/lib/Target/ARM/MVETailPredication.cpp @@ -0,0 +1,519 @@ +//===- MVETailPredication.cpp - MVE Tail Predication ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// Armv8.1m introduced MVE, M-Profile Vector Extension, and low-overhead +/// branches to help accelerate DSP applications. These two extensions can be +/// combined to provide implicit vector predication within a low-overhead loop. 
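/// As a sketch of the intent (IR value names here are invented for
/// illustration only): a vectorized loop whose masked loads and stores are
/// guarded by a predicate of the form
///   %pred = icmp ule <4 x i32> %induction, %broadcast.splat
/// has that predicate replaced with
///   %pred = call <4 x i1> @llvm.arm.vctp32(i32 %elts.remaining)
/// where %elts.remaining starts at the total element count and is decremented
/// by the vector width each iteration, so the final iteration is implicitly
/// masked down to the leftover elements.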
+/// The HardwareLoops pass inserts intrinsics identifying loops that the +/// backend will attempt to convert into a low-overhead loop. The vectorizer is +/// responsible for generating a vectorized loop in which the lanes are +/// predicated upon the iteration counter. This pass looks at these predicated +/// vector loops, that are targets for low-overhead loops, and prepares it for +/// code generation. Once the vectorizer has produced a masked loop, there's a +/// couple of final forms: +/// - A tail-predicated loop, with implicit predication. +/// - A loop containing multiple VCPT instructions, predicating multiple VPT +/// blocks of instructions operating on different vector types. + +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "ARM.h" +#include "ARMSubtarget.h" + +using namespace llvm; + +#define DEBUG_TYPE "mve-tail-predication" +#define DESC "Transform predicated vector loops to use MVE tail predication" + +static cl::opt<bool> +DisableTailPredication("disable-mve-tail-predication", cl::Hidden, + cl::init(true), + cl::desc("Disable MVE Tail Predication")); +namespace { + +class MVETailPredication : public LoopPass { + SmallVector<IntrinsicInst*, 4> MaskedInsts; + Loop *L = nullptr; + ScalarEvolution *SE = nullptr; + TargetTransformInfo *TTI = nullptr; + +public: + static char ID; + + MVETailPredication() : LoopPass(ID) { } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<ScalarEvolutionWrapperPass>(); + AU.addRequired<LoopInfoWrapperPass>(); + AU.addRequired<TargetPassConfig>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); + AU.addPreserved<LoopInfoWrapperPass>(); + AU.setPreservesCFG(); + } + + bool runOnLoop(Loop *L, LPPassManager&) override; + +private: + + /// Perform the relevant checks on the loop and convert if possible. + bool TryConvert(Value *TripCount); + + /// Return whether this is a vectorized loop, that contains masked + /// load/stores. + bool IsPredicatedVectorLoop(); + + /// Compute a value for the total number of elements that the predicated + /// loop will process. + Value *ComputeElements(Value *TripCount, VectorType *VecTy); + + /// Is the icmp that generates an i1 vector, based upon a loop counter + /// and a limit that is defined outside the loop. + bool isTailPredicate(Instruction *Predicate, Value *NumElements); +}; + +} // end namespace + +static bool IsDecrement(Instruction &I) { + auto *Call = dyn_cast<IntrinsicInst>(&I); + if (!Call) + return false; + + Intrinsic::ID ID = Call->getIntrinsicID(); + return ID == Intrinsic::loop_decrement_reg; +} + +static bool IsMasked(Instruction *I) { + auto *Call = dyn_cast<IntrinsicInst>(I); + if (!Call) + return false; + + Intrinsic::ID ID = Call->getIntrinsicID(); + // TODO: Support gather/scatter expand/compress operations. 
+ return ID == Intrinsic::masked_store || ID == Intrinsic::masked_load; +} + +bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) { + if (skipLoop(L) || DisableTailPredication) + return false; + + Function &F = *L->getHeader()->getParent(); + auto &TPC = getAnalysis<TargetPassConfig>(); + auto &TM = TPC.getTM<TargetMachine>(); + auto *ST = &TM.getSubtarget<ARMSubtarget>(F); + TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); + this->L = L; + + // The MVE and LOB extensions are combined to enable tail-predication, but + // there's nothing preventing us from generating VCTP instructions for v8.1m. + if (!ST->hasMVEIntegerOps() || !ST->hasV8_1MMainlineOps()) { + LLVM_DEBUG(dbgs() << "TP: Not a v8.1m.main+mve target.\n"); + return false; + } + + BasicBlock *Preheader = L->getLoopPreheader(); + if (!Preheader) + return false; + + auto FindLoopIterations = [](BasicBlock *BB) -> IntrinsicInst* { + for (auto &I : *BB) { + auto *Call = dyn_cast<IntrinsicInst>(&I); + if (!Call) + continue; + + Intrinsic::ID ID = Call->getIntrinsicID(); + if (ID == Intrinsic::set_loop_iterations || + ID == Intrinsic::test_set_loop_iterations) + return cast<IntrinsicInst>(&I); + } + return nullptr; + }; + + // Look for the hardware loop intrinsic that sets the iteration count. + IntrinsicInst *Setup = FindLoopIterations(Preheader); + + // The test.set iteration could live in the pre- preheader. + if (!Setup) { + if (!Preheader->getSinglePredecessor()) + return false; + Setup = FindLoopIterations(Preheader->getSinglePredecessor()); + if (!Setup) + return false; + } + + // Search for the hardware loop intrinic that decrements the loop counter. + IntrinsicInst *Decrement = nullptr; + for (auto *BB : L->getBlocks()) { + for (auto &I : *BB) { + if (IsDecrement(I)) { + Decrement = cast<IntrinsicInst>(&I); + break; + } + } + } + + if (!Decrement) + return false; + + LLVM_DEBUG(dbgs() << "TP: Running on Loop: " << *L + << *Setup << "\n" + << *Decrement << "\n"); + bool Changed = TryConvert(Setup->getArgOperand(0)); + return Changed; +} + +bool MVETailPredication::isTailPredicate(Instruction *I, Value *NumElements) { + // Look for the following: + + // %trip.count.minus.1 = add i32 %N, -1 + // %broadcast.splatinsert10 = insertelement <4 x i32> undef, + // i32 %trip.count.minus.1, i32 0 + // %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, + // <4 x i32> undef, + // <4 x i32> zeroinitializer + // ... + // ... + // %index = phi i32 + // %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 + // %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, + // <4 x i32> undef, + // <4 x i32> zeroinitializer + // %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3> + // %pred = icmp ule <4 x i32> %induction, %broadcast.splat11 + + // And return whether V == %pred. + + using namespace PatternMatch; + + CmpInst::Predicate Pred; + Instruction *Shuffle = nullptr; + Instruction *Induction = nullptr; + + // The vector icmp + if (!match(I, m_ICmp(Pred, m_Instruction(Induction), + m_Instruction(Shuffle))) || + Pred != ICmpInst::ICMP_ULE || !L->isLoopInvariant(Shuffle)) + return false; + + // First find the stuff outside the loop which is setting up the limit + // vector.... + // The invariant shuffle that broadcast the limit into a vector. 
+ Instruction *Insert = nullptr; + if (!match(Shuffle, m_ShuffleVector(m_Instruction(Insert), m_Undef(), + m_Zero()))) + return false; + + // Insert the limit into a vector. + Instruction *BECount = nullptr; + if (!match(Insert, m_InsertElement(m_Undef(), m_Instruction(BECount), + m_Zero()))) + return false; + + // The limit calculation, backedge count. + Value *TripCount = nullptr; + if (!match(BECount, m_Add(m_Value(TripCount), m_AllOnes()))) + return false; + + if (TripCount != NumElements) + return false; + + // Now back to searching inside the loop body... + // Find the add with takes the index iv and adds a constant vector to it. + Instruction *BroadcastSplat = nullptr; + Constant *Const = nullptr; + if (!match(Induction, m_Add(m_Instruction(BroadcastSplat), + m_Constant(Const)))) + return false; + + // Check that we're adding <0, 1, 2, 3... + if (auto *CDS = dyn_cast<ConstantDataSequential>(Const)) { + for (unsigned i = 0; i < CDS->getNumElements(); ++i) { + if (CDS->getElementAsInteger(i) != i) + return false; + } + } else + return false; + + // The shuffle which broadcasts the index iv into a vector. + if (!match(BroadcastSplat, m_ShuffleVector(m_Instruction(Insert), m_Undef(), + m_Zero()))) + return false; + + // The insert element which initialises a vector with the index iv. + Instruction *IV = nullptr; + if (!match(Insert, m_InsertElement(m_Undef(), m_Instruction(IV), m_Zero()))) + return false; + + // The index iv. + auto *Phi = dyn_cast<PHINode>(IV); + if (!Phi) + return false; + + // TODO: Don't think we need to check the entry value. + Value *OnEntry = Phi->getIncomingValueForBlock(L->getLoopPreheader()); + if (!match(OnEntry, m_Zero())) + return false; + + Value *InLoop = Phi->getIncomingValueForBlock(L->getLoopLatch()); + unsigned Lanes = cast<VectorType>(Insert->getType())->getNumElements(); + + Instruction *LHS = nullptr; + if (!match(InLoop, m_Add(m_Instruction(LHS), m_SpecificInt(Lanes)))) + return false; + + return LHS == Phi; +} + +static VectorType* getVectorType(IntrinsicInst *I) { + unsigned TypeOp = I->getIntrinsicID() == Intrinsic::masked_load ? 0 : 1; + auto *PtrTy = cast<PointerType>(I->getOperand(TypeOp)->getType()); + return cast<VectorType>(PtrTy->getElementType()); +} + +bool MVETailPredication::IsPredicatedVectorLoop() { + // Check that the loop contains at least one masked load/store intrinsic. + // We only support 'normal' vector instructions - other than masked + // load/stores. + for (auto *BB : L->getBlocks()) { + for (auto &I : *BB) { + if (IsMasked(&I)) { + VectorType *VecTy = getVectorType(cast<IntrinsicInst>(&I)); + unsigned Lanes = VecTy->getNumElements(); + unsigned ElementWidth = VecTy->getScalarSizeInBits(); + // MVE vectors are 128-bit, but don't support 128 x i1. + // TODO: Can we support vectors larger than 128-bits? + unsigned MaxWidth = TTI->getRegisterBitWidth(true); + if (Lanes * ElementWidth != MaxWidth || Lanes == MaxWidth) + return false; + MaskedInsts.push_back(cast<IntrinsicInst>(&I)); + } else if (auto *Int = dyn_cast<IntrinsicInst>(&I)) { + for (auto &U : Int->args()) { + if (isa<VectorType>(U->getType())) + return false; + } + } + } + } + + return !MaskedInsts.empty(); +} + +Value* MVETailPredication::ComputeElements(Value *TripCount, + VectorType *VecTy) { + const SCEV *TripCountSE = SE->getSCEV(TripCount); + ConstantInt *VF = ConstantInt::get(cast<IntegerType>(TripCount->getType()), + VecTy->getNumElements()); + + if (VF->equalsInt(1)) + return nullptr; + + // TODO: Support constant trip counts. 
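  // Worked illustration (assuming a vector factor VF of 4): the vector loop's
  // trip count for N scalar elements has the shape
  //   1 + ((-4 + (4 * ((3 + N) /u 4))) /u 4)  ==  ceil(N / 4)
  // e.g. N == 10 gives (10 + 3) /u 4 == 3, * 4 == 12, - 4 == 8, /u 4 == 2,
  // + 1 == 3 iterations. The matchers below peel that SCEV apart to recover
  // N, the total number of elements processed by the predicated loop.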
+ auto VisitAdd = [&](const SCEVAddExpr *S) -> const SCEVMulExpr* { + if (auto *Const = dyn_cast<SCEVConstant>(S->getOperand(0))) { + if (Const->getAPInt() != -VF->getValue()) + return nullptr; + } else + return nullptr; + return dyn_cast<SCEVMulExpr>(S->getOperand(1)); + }; + + auto VisitMul = [&](const SCEVMulExpr *S) -> const SCEVUDivExpr* { + if (auto *Const = dyn_cast<SCEVConstant>(S->getOperand(0))) { + if (Const->getValue() != VF) + return nullptr; + } else + return nullptr; + return dyn_cast<SCEVUDivExpr>(S->getOperand(1)); + }; + + auto VisitDiv = [&](const SCEVUDivExpr *S) -> const SCEV* { + if (auto *Const = dyn_cast<SCEVConstant>(S->getRHS())) { + if (Const->getValue() != VF) + return nullptr; + } else + return nullptr; + + if (auto *RoundUp = dyn_cast<SCEVAddExpr>(S->getLHS())) { + if (auto *Const = dyn_cast<SCEVConstant>(RoundUp->getOperand(0))) { + if (Const->getAPInt() != (VF->getValue() - 1)) + return nullptr; + } else + return nullptr; + + return RoundUp->getOperand(1); + } + return nullptr; + }; + + // TODO: Can we use SCEV helpers, such as findArrayDimensions, and friends to + // determine the numbers of elements instead? Looks like this is what is used + // for delinearization, but I'm not sure if it can be applied to the + // vectorized form - at least not without a bit more work than I feel + // comfortable with. + + // Search for Elems in the following SCEV: + // (1 + ((-VF + (VF * (((VF - 1) + %Elems) /u VF))<nuw>) /u VF))<nuw><nsw> + const SCEV *Elems = nullptr; + if (auto *TC = dyn_cast<SCEVAddExpr>(TripCountSE)) + if (auto *Div = dyn_cast<SCEVUDivExpr>(TC->getOperand(1))) + if (auto *Add = dyn_cast<SCEVAddExpr>(Div->getLHS())) + if (auto *Mul = VisitAdd(Add)) + if (auto *Div = VisitMul(Mul)) + if (auto *Res = VisitDiv(Div)) + Elems = Res; + + if (!Elems) + return nullptr; + + Instruction *InsertPt = L->getLoopPreheader()->getTerminator(); + if (!isSafeToExpandAt(Elems, InsertPt, *SE)) + return nullptr; + + auto DL = L->getHeader()->getModule()->getDataLayout(); + SCEVExpander Expander(*SE, DL, "elements"); + return Expander.expandCodeFor(Elems, Elems->getType(), InsertPt); +} + +// Look through the exit block to see whether there's a duplicate predicate +// instruction. This can happen when we need to perform a select on values +// from the last and previous iteration. Instead of doing a straight +// replacement of that predicate with the vctp, clone the vctp and place it +// in the block. This means that the VPR doesn't have to be live into the +// exit block which should make it easier to convert this loop into a proper +// tail predicated loop. +static void Cleanup(DenseMap<Instruction*, Instruction*> &NewPredicates, + SetVector<Instruction*> &MaybeDead, Loop *L) { + if (BasicBlock *Exit = L->getUniqueExitBlock()) { + for (auto &Pair : NewPredicates) { + Instruction *OldPred = Pair.first; + Instruction *NewPred = Pair.second; + + for (auto &I : *Exit) { + if (I.isSameOperationAs(OldPred)) { + Instruction *PredClone = NewPred->clone(); + PredClone->insertBefore(&I); + I.replaceAllUsesWith(PredClone); + MaybeDead.insert(&I); + break; + } + } + } + } + + // Drop references and add operands to check for dead. 
+ SmallPtrSet<Instruction*, 4> Dead; + while (!MaybeDead.empty()) { + auto *I = MaybeDead.front(); + MaybeDead.remove(I); + if (I->hasNUsesOrMore(1)) + continue; + + for (auto &U : I->operands()) { + if (auto *OpI = dyn_cast<Instruction>(U)) + MaybeDead.insert(OpI); + } + I->dropAllReferences(); + Dead.insert(I); + } + + for (auto *I : Dead) + I->eraseFromParent(); + + for (auto I : L->blocks()) + DeleteDeadPHIs(I); +} + +bool MVETailPredication::TryConvert(Value *TripCount) { + if (!IsPredicatedVectorLoop()) + return false; + + LLVM_DEBUG(dbgs() << "TP: Found predicated vector loop.\n"); + + // Walk through the masked intrinsics and try to find whether the predicate + // operand is generated from an induction variable. + Module *M = L->getHeader()->getModule(); + Type *Ty = IntegerType::get(M->getContext(), 32); + SetVector<Instruction*> Predicates; + DenseMap<Instruction*, Instruction*> NewPredicates; + + for (auto *I : MaskedInsts) { + Intrinsic::ID ID = I->getIntrinsicID(); + unsigned PredOp = ID == Intrinsic::masked_load ? 2 : 3; + auto *Predicate = dyn_cast<Instruction>(I->getArgOperand(PredOp)); + if (!Predicate || Predicates.count(Predicate)) + continue; + + VectorType *VecTy = getVectorType(I); + Value *NumElements = ComputeElements(TripCount, VecTy); + if (!NumElements) + continue; + + if (!isTailPredicate(Predicate, NumElements)) { + LLVM_DEBUG(dbgs() << "TP: Not tail predicate: " << *Predicate << "\n"); + continue; + } + + LLVM_DEBUG(dbgs() << "TP: Found tail predicate: " << *Predicate << "\n"); + Predicates.insert(Predicate); + + // Insert a phi to count the number of elements processed by the loop. + IRBuilder<> Builder(L->getHeader()->getFirstNonPHI()); + PHINode *Processed = Builder.CreatePHI(Ty, 2); + Processed->addIncoming(NumElements, L->getLoopPreheader()); + + // Insert the intrinsic to represent the effect of tail predication. + Builder.SetInsertPoint(cast<Instruction>(Predicate)); + ConstantInt *Factor = + ConstantInt::get(cast<IntegerType>(Ty), VecTy->getNumElements()); + Intrinsic::ID VCTPID; + switch (VecTy->getNumElements()) { + default: + llvm_unreachable("unexpected number of lanes"); + case 2: VCTPID = Intrinsic::arm_vctp64; break; + case 4: VCTPID = Intrinsic::arm_vctp32; break; + case 8: VCTPID = Intrinsic::arm_vctp16; break; + case 16: VCTPID = Intrinsic::arm_vctp8; break; + } + Function *VCTP = Intrinsic::getDeclaration(M, VCTPID); + Value *TailPredicate = Builder.CreateCall(VCTP, Processed); + Predicate->replaceAllUsesWith(TailPredicate); + NewPredicates[Predicate] = cast<Instruction>(TailPredicate); + + // Add the incoming value to the new phi. + // TODO: This add likely already exists in the loop. + Value *Remaining = Builder.CreateSub(Processed, Factor); + Processed->addIncoming(Remaining, L->getLoopLatch()); + LLVM_DEBUG(dbgs() << "TP: Insert processed elements phi: " + << *Processed << "\n" + << "TP: Inserted VCTP: " << *TailPredicate << "\n"); + } + + // Now clean up. 
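The rewrite in TryConvert swaps the induction-variable compare for a counter of elements still to be processed: the new phi starts at NumElements, steps down by the vector width each iteration, and the inserted vctp call derives the lane mask from it. A scalar mock-up of that counting scheme follows; activeLanes is my stand-in for the intrinsic's semantics, and the toy loop clamps the final step where the real IR simply relies on the loop's own trip count.

    #include <algorithm>
    #include <cstdio>

    // Stand-in for what the inserted arm_vctp32 call encodes: with Remaining
    // elements left, the predicate enables min(Remaining, 4) of the 4 lanes.
    static unsigned activeLanes(unsigned Remaining) {
      return std::min(Remaining, 4u);
    }

    int main() {
      const unsigned NumElements = 10;
      // Mirrors the Processed phi inserted by TryConvert: it starts at
      // NumElements and is decremented every iteration (here by the clamped
      // lane count so the toy counter stays well defined at the tail).
      for (unsigned Remaining = NumElements; Remaining > 0;
           Remaining -= activeLanes(Remaining))
        std::printf("%u element(s) left, %u lane(s) active\n", Remaining,
                    activeLanes(Remaining));
      return 0;
    }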
+ Cleanup(NewPredicates, Predicates, L); + return true; +} + +Pass *llvm::createMVETailPredicationPass() { + return new MVETailPredication(); +} + +char MVETailPredication::ID = 0; + +INITIALIZE_PASS_BEGIN(MVETailPredication, DEBUG_TYPE, DESC, false, false) +INITIALIZE_PASS_END(MVETailPredication, DEBUG_TYPE, DESC, false, false) diff --git a/lib/Target/ARM/MVEVPTBlockPass.cpp b/lib/Target/ARM/MVEVPTBlockPass.cpp new file mode 100644 index 000000000000..bc0a80b177ed --- /dev/null +++ b/lib/Target/ARM/MVEVPTBlockPass.cpp @@ -0,0 +1,278 @@ +//===-- MVEVPTBlockPass.cpp - Insert MVE VPT blocks -----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "ARM.h" +#include "ARMMachineFunctionInfo.h" +#include "ARMSubtarget.h" +#include "MCTargetDesc/ARMBaseInfo.h" +#include "Thumb2InstrInfo.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineInstrBundle.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Support/Debug.h" +#include <cassert> +#include <new> + +using namespace llvm; + +#define DEBUG_TYPE "arm-mve-vpt" + +namespace { + class MVEVPTBlock : public MachineFunctionPass { + public: + static char ID; + const Thumb2InstrInfo *TII; + const TargetRegisterInfo *TRI; + + MVEVPTBlock() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &Fn) override; + + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::NoVRegs); + } + + StringRef getPassName() const override { + return "MVE VPT block insertion pass"; + } + + private: + bool InsertVPTBlocks(MachineBasicBlock &MBB); + }; + + char MVEVPTBlock::ID = 0; + +} // end anonymous namespace + +INITIALIZE_PASS(MVEVPTBlock, DEBUG_TYPE, "ARM MVE VPT block pass", false, false) + +enum VPTMaskValue { + T = 8, // 0b1000 + TT = 4, // 0b0100 + TE = 12, // 0b1100 + TTT = 2, // 0b0010 + TTE = 6, // 0b0110 + TEE = 10, // 0b1010 + TET = 14, // 0b1110 + TTTT = 1, // 0b0001 + TTTE = 3, // 0b0011 + TTEE = 5, // 0b0101 + TTET = 7, // 0b0111 + TEEE = 9, // 0b1001 + TEET = 11, // 0b1011 + TETT = 13, // 0b1101 + TETE = 15 // 0b1111 +}; + +static unsigned VCMPOpcodeToVPT(unsigned Opcode) { + switch (Opcode) { + case ARM::MVE_VCMPf32: + return ARM::MVE_VPTv4f32; + case ARM::MVE_VCMPf16: + return ARM::MVE_VPTv8f16; + case ARM::MVE_VCMPi8: + return ARM::MVE_VPTv16i8; + case ARM::MVE_VCMPi16: + return ARM::MVE_VPTv8i16; + case ARM::MVE_VCMPi32: + return ARM::MVE_VPTv4i32; + case ARM::MVE_VCMPu8: + return ARM::MVE_VPTv16u8; + case ARM::MVE_VCMPu16: + return ARM::MVE_VPTv8u16; + case ARM::MVE_VCMPu32: + return ARM::MVE_VPTv4u32; + case ARM::MVE_VCMPs8: + return ARM::MVE_VPTv16s8; + case ARM::MVE_VCMPs16: + return ARM::MVE_VPTv8s16; + case ARM::MVE_VCMPs32: + return ARM::MVE_VPTv4s32; + + case ARM::MVE_VCMPf32r: + return ARM::MVE_VPTv4f32r; + case 
ARM::MVE_VCMPf16r: + return ARM::MVE_VPTv8f16r; + case ARM::MVE_VCMPi8r: + return ARM::MVE_VPTv16i8r; + case ARM::MVE_VCMPi16r: + return ARM::MVE_VPTv8i16r; + case ARM::MVE_VCMPi32r: + return ARM::MVE_VPTv4i32r; + case ARM::MVE_VCMPu8r: + return ARM::MVE_VPTv16u8r; + case ARM::MVE_VCMPu16r: + return ARM::MVE_VPTv8u16r; + case ARM::MVE_VCMPu32r: + return ARM::MVE_VPTv4u32r; + case ARM::MVE_VCMPs8r: + return ARM::MVE_VPTv16s8r; + case ARM::MVE_VCMPs16r: + return ARM::MVE_VPTv8s16r; + case ARM::MVE_VCMPs32r: + return ARM::MVE_VPTv4s32r; + + default: + return 0; + } +} + +static MachineInstr *findVCMPToFoldIntoVPST(MachineBasicBlock::iterator MI, + const TargetRegisterInfo *TRI, + unsigned &NewOpcode) { + // Search backwards to the instruction that defines VPR. This may or may not + // be a VCMP; we check that after this loop. If we find another instruction + // that reads VPR, we return nullptr. + MachineBasicBlock::iterator CmpMI = MI; + while (CmpMI != MI->getParent()->begin()) { + --CmpMI; + if (CmpMI->modifiesRegister(ARM::VPR, TRI)) + break; + if (CmpMI->readsRegister(ARM::VPR, TRI)) + break; + } + + if (CmpMI == MI) + return nullptr; + NewOpcode = VCMPOpcodeToVPT(CmpMI->getOpcode()); + if (NewOpcode == 0) + return nullptr; + + // Search forward from CmpMI to MI, checking if either register was def'd + if (registerDefinedBetween(CmpMI->getOperand(1).getReg(), std::next(CmpMI), + MI, TRI)) + return nullptr; + if (registerDefinedBetween(CmpMI->getOperand(2).getReg(), std::next(CmpMI), + MI, TRI)) + return nullptr; + return &*CmpMI; +} + +bool MVEVPTBlock::InsertVPTBlocks(MachineBasicBlock &Block) { + bool Modified = false; + MachineBasicBlock::instr_iterator MBIter = Block.instr_begin(); + MachineBasicBlock::instr_iterator EndIter = Block.instr_end(); + + while (MBIter != EndIter) { + MachineInstr *MI = &*MBIter; + unsigned PredReg = 0; + DebugLoc dl = MI->getDebugLoc(); + + ARMVCC::VPTCodes Pred = getVPTInstrPredicate(*MI, PredReg); + + // The idea of the predicate is that None, Then and Else are for use when + // handling assembly language: they correspond to the three possible + // suffixes "", "t" and "e" on the mnemonic. So when instructions are read + // from assembly source or disassembled from object code, you expect to see + // a mixture whenever there's a long VPT block. But in code generation, we + // hope we'll never generate an Else as input to this pass. + assert(Pred != ARMVCC::Else && "VPT block pass does not expect Else preds"); + + if (Pred == ARMVCC::None) { + ++MBIter; + continue; + } + + LLVM_DEBUG(dbgs() << "VPT block created for: "; MI->dump()); + int VPTInstCnt = 1; + ARMVCC::VPTCodes NextPred; + + // Look at subsequent instructions, checking if they can be in the same VPT + // block.
+ ++MBIter; + while (MBIter != EndIter && VPTInstCnt < 4) { + NextPred = getVPTInstrPredicate(*MBIter, PredReg); + assert(NextPred != ARMVCC::Else && + "VPT block pass does not expect Else preds"); + if (NextPred != Pred) + break; + LLVM_DEBUG(dbgs() << " adding : "; MBIter->dump()); + ++VPTInstCnt; + ++MBIter; + }; + + unsigned BlockMask = 0; + switch (VPTInstCnt) { + case 1: + BlockMask = VPTMaskValue::T; + break; + case 2: + BlockMask = VPTMaskValue::TT; + break; + case 3: + BlockMask = VPTMaskValue::TTT; + break; + case 4: + BlockMask = VPTMaskValue::TTTT; + break; + default: + llvm_unreachable("Unexpected number of instruction in a VPT block"); + }; + + // Search back for a VCMP that can be folded to create a VPT, or else create + // a VPST directly + MachineInstrBuilder MIBuilder; + unsigned NewOpcode; + MachineInstr *VCMP = findVCMPToFoldIntoVPST(MI, TRI, NewOpcode); + if (VCMP) { + LLVM_DEBUG(dbgs() << " folding VCMP into VPST: "; VCMP->dump()); + MIBuilder = BuildMI(Block, MI, dl, TII->get(NewOpcode)); + MIBuilder.addImm(BlockMask); + MIBuilder.add(VCMP->getOperand(1)); + MIBuilder.add(VCMP->getOperand(2)); + MIBuilder.add(VCMP->getOperand(3)); + VCMP->eraseFromParent(); + } else { + MIBuilder = BuildMI(Block, MI, dl, TII->get(ARM::MVE_VPST)); + MIBuilder.addImm(BlockMask); + } + + finalizeBundle( + Block, MachineBasicBlock::instr_iterator(MIBuilder.getInstr()), MBIter); + + Modified = true; + } + return Modified; +} + +bool MVEVPTBlock::runOnMachineFunction(MachineFunction &Fn) { + const ARMSubtarget &STI = + static_cast<const ARMSubtarget &>(Fn.getSubtarget()); + + if (!STI.isThumb2() || !STI.hasMVEIntegerOps()) + return false; + + TII = static_cast<const Thumb2InstrInfo *>(STI.getInstrInfo()); + TRI = STI.getRegisterInfo(); + + LLVM_DEBUG(dbgs() << "********** ARM MVE VPT BLOCKS **********\n" + << "********** Function: " << Fn.getName() << '\n'); + + bool Modified = false; + for (MachineBasicBlock &MBB : Fn) + Modified |= InsertVPTBlocks(MBB); + + LLVM_DEBUG(dbgs() << "**************************************\n"); + return Modified; +} + +/// createMVEVPTBlock - Returns an instance of the MVE VPT block +/// insertion pass. +FunctionPass *llvm::createMVEVPTBlockPass() { return new MVEVPTBlock(); } diff --git a/lib/Target/ARM/Thumb1FrameLowering.cpp b/lib/Target/ARM/Thumb1FrameLowering.cpp index 426e9a0ed9b8..956d474f1d79 100644 --- a/lib/Target/ARM/Thumb1FrameLowering.cpp +++ b/lib/Target/ARM/Thumb1FrameLowering.cpp @@ -164,7 +164,7 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF, // to determine the end of the prologue. 
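Reading off the VPTMaskValue entries used by the BlockMask switch in the pass above, the mask for an all-then block of N predicated instructions is a single terminating one bit shifted according to the block size, i.e. 1 << (4 - N). A small check of that reading (the helper name is mine, not part of the pass):

    #include <cassert>

    // For an all-"then" VPT block of Size instructions (1..4), the 4-bit mask
    // carried by VPST/VPT is a lone terminating 1 placed according to the size:
    //   1 -> 0b1000 (T), 2 -> 0b0100 (TT), 3 -> 0b0010 (TTT), 4 -> 0b0001 (TTTT)
    static unsigned allThenBlockMask(unsigned Size) {
      return 1u << (4 - Size);
    }

    int main() {
      assert(allThenBlockMask(1) == 8); // VPTMaskValue::T
      assert(allThenBlockMask(2) == 4); // VPTMaskValue::TT
      assert(allThenBlockMask(3) == 2); // VPTMaskValue::TTT
      assert(allThenBlockMask(4) == 1); // VPTMaskValue::TTTT
      return 0;
    }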
DebugLoc dl; - unsigned FramePtr = RegInfo->getFrameRegister(MF); + Register FramePtr = RegInfo->getFrameRegister(MF); unsigned BasePtr = RegInfo->getBaseRegister(); int CFAOffset = 0; @@ -459,8 +459,8 @@ static bool isCSRestore(MachineInstr &MI, const MCPhysReg *CSRegs) { else if (MI.getOpcode() == ARM::tPOP) { return true; } else if (MI.getOpcode() == ARM::tMOVr) { - unsigned Dst = MI.getOperand(0).getReg(); - unsigned Src = MI.getOperand(1).getReg(); + Register Dst = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(1).getReg(); return ((ARM::tGPRRegClass.contains(Src) || Src == ARM::LR) && ARM::hGPRRegClass.contains(Dst)); } @@ -483,7 +483,7 @@ void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF, assert((unsigned)NumBytes >= ArgRegsSaveSize && "ArgRegsSaveSize is included in NumBytes"); const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF); - unsigned FramePtr = RegInfo->getFrameRegister(MF); + Register FramePtr = RegInfo->getFrameRegister(MF); if (!AFI->hasStackFrame()) { if (NumBytes - ArgRegsSaveSize != 0) diff --git a/lib/Target/ARM/Thumb1InstrInfo.cpp b/lib/Target/ARM/Thumb1InstrInfo.cpp index f57d93a2e83d..fccaa4c9cc8a 100644 --- a/lib/Target/ARM/Thumb1InstrInfo.cpp +++ b/lib/Target/ARM/Thumb1InstrInfo.cpp @@ -80,12 +80,11 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { assert((RC == &ARM::tGPRRegClass || - (TargetRegisterInfo::isPhysicalRegister(SrcReg) && - isARMLowRegister(SrcReg))) && "Unknown regclass!"); + (Register::isPhysicalRegister(SrcReg) && isARMLowRegister(SrcReg))) && + "Unknown regclass!"); if (RC == &ARM::tGPRRegClass || - (TargetRegisterInfo::isPhysicalRegister(SrcReg) && - isARMLowRegister(SrcReg))) { + (Register::isPhysicalRegister(SrcReg) && isARMLowRegister(SrcReg))) { DebugLoc DL; if (I != MBB.end()) DL = I->getDebugLoc(); @@ -108,13 +107,13 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, unsigned DestReg, int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { - assert((RC->hasSuperClassEq(&ARM::tGPRRegClass) || - (TargetRegisterInfo::isPhysicalRegister(DestReg) && - isARMLowRegister(DestReg))) && "Unknown regclass!"); + assert( + (RC->hasSuperClassEq(&ARM::tGPRRegClass) || + (Register::isPhysicalRegister(DestReg) && isARMLowRegister(DestReg))) && + "Unknown regclass!"); if (RC->hasSuperClassEq(&ARM::tGPRRegClass) || - (TargetRegisterInfo::isPhysicalRegister(DestReg) && - isARMLowRegister(DestReg))) { + (Register::isPhysicalRegister(DestReg) && isARMLowRegister(DestReg))) { DebugLoc DL; if (I != MBB.end()) DL = I->getDebugLoc(); diff --git a/lib/Target/ARM/Thumb2ITBlockPass.cpp b/lib/Target/ARM/Thumb2ITBlockPass.cpp index 3143eb9840ed..786fc78d0233 100644 --- a/lib/Target/ARM/Thumb2ITBlockPass.cpp +++ b/lib/Target/ARM/Thumb2ITBlockPass.cpp @@ -87,7 +87,7 @@ static void TrackDefUses(MachineInstr *MI, RegisterSet &Defs, RegisterSet &Uses, for (auto &MO : MI->operands()) { if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (!Reg || Reg == ARM::ITSTATE || Reg == ARM::SP) continue; if (MO.isUse()) @@ -145,8 +145,8 @@ Thumb2ITBlock::MoveCopyOutOfITBlock(MachineInstr *MI, MI->getOperand(1).getSubReg() == 0 && "Sub-register indices still around?"); - unsigned DstReg = MI->getOperand(0).getReg(); - unsigned SrcReg = MI->getOperand(1).getReg(); + Register DstReg = MI->getOperand(0).getReg(); + Register SrcReg = MI->getOperand(1).getReg(); // First check if 
it's safe to move it. if (Uses.count(DstReg) || Defs.count(SrcReg)) @@ -308,131 +308,3 @@ bool Thumb2ITBlock::runOnMachineFunction(MachineFunction &Fn) { /// createThumb2ITBlockPass - Returns an instance of the Thumb2 IT blocks /// insertion pass. FunctionPass *llvm::createThumb2ITBlockPass() { return new Thumb2ITBlock(); } - -#undef DEBUG_TYPE -#define DEBUG_TYPE "arm-mve-vpt" - -namespace { - class MVEVPTBlock : public MachineFunctionPass { - public: - static char ID; - const Thumb2InstrInfo *TII; - const TargetRegisterInfo *TRI; - - MVEVPTBlock() : MachineFunctionPass(ID) {} - - bool runOnMachineFunction(MachineFunction &Fn) override; - - MachineFunctionProperties getRequiredProperties() const override { - return MachineFunctionProperties().set( - MachineFunctionProperties::Property::NoVRegs); - } - - StringRef getPassName() const override { - return "MVE VPT block insertion pass"; - } - - private: - bool InsertVPTBlocks(MachineBasicBlock &MBB); - }; - - char MVEVPTBlock::ID = 0; - -} // end anonymous namespace - -INITIALIZE_PASS(MVEVPTBlock, DEBUG_TYPE, "ARM MVE VPT block pass", false, false) - -enum VPTMaskValue { - T = 8, // 0b1000 - TT = 4, // 0b0100 - TE = 12, // 0b1100 - TTT = 2, // 0b0010 - TTE = 6, // 0b0110 - TEE = 10, // 0b1010 - TET = 14, // 0b1110 - TTTT = 1, // 0b0001 - TTTE = 3, // 0b0011 - TTEE = 5, // 0b0101 - TTET = 7, // 0b0111 - TEEE = 9, // 0b1001 - TEET = 11, // 0b1011 - TETT = 13, // 0b1101 - TETE = 15 // 0b1111 -}; - -bool MVEVPTBlock::InsertVPTBlocks(MachineBasicBlock &Block) { - bool Modified = false; - MachineBasicBlock::iterator MBIter = Block.begin(); - MachineBasicBlock::iterator EndIter = Block.end(); - - while (MBIter != EndIter) { - MachineInstr *MI = &*MBIter; - unsigned PredReg = 0; - DebugLoc dl = MI->getDebugLoc(); - - ARMVCC::VPTCodes Pred = getVPTInstrPredicate(*MI, PredReg); - - // The idea of the predicate is that None, Then and Else are for use when - // handling assembly language: they correspond to the three possible - // suffixes "", "t" and "e" on the mnemonic. So when instructions are read - // from assembly source or disassembled from object code, you expect to see - // a mixture whenever there's a long VPT block. But in code generation, we - // hope we'll never generate an Else as input to this pass. 
- - assert(Pred != ARMVCC::Else && "VPT block pass does not expect Else preds"); - - if (Pred == ARMVCC::None) { - ++MBIter; - continue; - } - - MachineInstrBuilder MIBuilder = - BuildMI(Block, MBIter, dl, TII->get(ARM::MVE_VPST)); - // The mask value for the VPST instruction is T = 0b1000 = 8 - MIBuilder.addImm(VPTMaskValue::T); - - MachineBasicBlock::iterator VPSTInsertPos = MIBuilder.getInstr(); - int VPTInstCnt = 1; - ARMVCC::VPTCodes NextPred; - - do { - ++MBIter; - NextPred = getVPTInstrPredicate(*MBIter, PredReg); - } while (NextPred != ARMVCC::None && NextPred == Pred && ++VPTInstCnt < 4); - - MachineInstr *LastMI = &*MBIter; - finalizeBundle(Block, VPSTInsertPos.getInstrIterator(), - ++LastMI->getIterator()); - - Modified = true; - LLVM_DEBUG(dbgs() << "VPT block created for: "; MI->dump();); - - ++MBIter; - } - return Modified; -} - -bool MVEVPTBlock::runOnMachineFunction(MachineFunction &Fn) { - const ARMSubtarget &STI = - static_cast<const ARMSubtarget &>(Fn.getSubtarget()); - - if (!STI.isThumb2() || !STI.hasMVEIntegerOps()) - return false; - - TII = static_cast<const Thumb2InstrInfo *>(STI.getInstrInfo()); - TRI = STI.getRegisterInfo(); - - LLVM_DEBUG(dbgs() << "********** ARM MVE VPT BLOCKS **********\n" - << "********** Function: " << Fn.getName() << '\n'); - - bool Modified = false; - for (MachineBasicBlock &MBB : Fn) - Modified |= InsertVPTBlocks(MBB); - - LLVM_DEBUG(dbgs() << "**************************************\n"); - return Modified; -} - -/// createMVEVPTBlock - Returns an instance of the MVE VPT block -/// insertion pass. -FunctionPass *llvm::createMVEVPTBlockPass() { return new MVEVPTBlock(); } diff --git a/lib/Target/ARM/Thumb2InstrInfo.cpp b/lib/Target/ARM/Thumb2InstrInfo.cpp index 5a965f7a6b9b..af1f0aeb27ba 100644 --- a/lib/Target/ARM/Thumb2InstrInfo.cpp +++ b/lib/Target/ARM/Thumb2InstrInfo.cpp @@ -159,9 +159,9 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, // Thumb2 STRD expects its dest-registers to be in rGPR. Not a problem for // gsub_0, but needs an extra constraint for gsub_1 (which could be sp // otherwise). - if (TargetRegisterInfo::isVirtualRegister(SrcReg)) { + if (Register::isVirtualRegister(SrcReg)) { MachineRegisterInfo *MRI = &MF.getRegInfo(); - MRI->constrainRegClass(SrcReg, &ARM::GPRPair_with_gsub_1_in_GPRwithAPSRnospRegClass); + MRI->constrainRegClass(SrcReg, &ARM::GPRPairnospRegClass); } MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::t2STRDi8)); @@ -200,10 +200,9 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, // Thumb2 LDRD expects its dest-registers to be in rGPR. Not a problem for // gsub_0, but needs an extra constraint for gsub_1 (which could be sp // otherwise). 
- if (TargetRegisterInfo::isVirtualRegister(DestReg)) { + if (Register::isVirtualRegister(DestReg)) { MachineRegisterInfo *MRI = &MF.getRegInfo(); - MRI->constrainRegClass(DestReg, - &ARM::GPRPair_with_gsub_1_in_GPRwithAPSRnospRegClass); + MRI->constrainRegClass(DestReg, &ARM::GPRPairnospRegClass); } MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::t2LDRDi8)); @@ -211,7 +210,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, AddDReg(MIB, DestReg, ARM::gsub_1, RegState::DefineNoRead, TRI); MIB.addFrameIndex(FI).addImm(0).addMemOperand(MMO).add(predOps(ARMCC::AL)); - if (TargetRegisterInfo::isPhysicalRegister(DestReg)) + if (Register::isPhysicalRegister(DestReg)) MIB.addReg(DestReg, RegState::ImplicitDefine); return; } @@ -470,12 +469,17 @@ immediateOffsetOpcode(unsigned opcode) bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, unsigned FrameReg, int &Offset, - const ARMBaseInstrInfo &TII) { + const ARMBaseInstrInfo &TII, + const TargetRegisterInfo *TRI) { unsigned Opcode = MI.getOpcode(); const MCInstrDesc &Desc = MI.getDesc(); unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask); bool isSub = false; + MachineFunction &MF = *MI.getParent()->getParent(); + const TargetRegisterClass *RegClass = + TII.getRegClass(Desc, FrameRegIdx, TRI, MF); + // Memory operands in inline assembly always use AddrModeT2_i12. if (Opcode == ARM::INLINEASM || Opcode == ARM::INLINEASM_BR) AddrMode = ARMII::AddrModeT2_i12; // FIXME. mode for thumb2? @@ -554,7 +558,7 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, // register then we change to an immediate version. unsigned NewOpc = Opcode; if (AddrMode == ARMII::AddrModeT2_so) { - unsigned OffsetReg = MI.getOperand(FrameRegIdx+1).getReg(); + Register OffsetReg = MI.getOperand(FrameRegIdx + 1).getReg(); if (OffsetReg != 0) { MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); return Offset == 0; @@ -645,10 +649,21 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, MachineOperand &ImmOp = MI.getOperand(FrameRegIdx+1); // Attempt to fold address computation - // Common case: small offset, fits into instruction. + // Common case: small offset, fits into instruction. We need to make sure + // the register class is correct too, for instructions like the MVE + // VLDRH.32, which only accepts low tGPR registers. int ImmedOffset = Offset / Scale; unsigned Mask = (1 << NumBits) - 1; - if ((unsigned)Offset <= Mask * Scale) { + if ((unsigned)Offset <= Mask * Scale && + (Register::isVirtualRegister(FrameReg) || + RegClass->contains(FrameReg))) { + if (Register::isVirtualRegister(FrameReg)) { + // Make sure the register class for the virtual register is correct + MachineRegisterInfo *MRI = &MF.getRegInfo(); + if (!MRI->constrainRegClass(FrameReg, RegClass)) + llvm_unreachable("Unable to constrain virtual register class."); + } + // Replace the FrameIndex with fp/sp MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); if (isSub) { @@ -681,7 +696,8 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, } Offset = (isSub) ? 
-Offset : Offset; - return Offset == 0; + return Offset == 0 && (Register::isVirtualRegister(FrameReg) || + RegClass->contains(FrameReg)); } ARMCC::CondCodes llvm::getITInstrPredicate(const MachineInstr &MI, diff --git a/lib/Target/ARM/Thumb2SizeReduction.cpp b/lib/Target/ARM/Thumb2SizeReduction.cpp index 37a85fa38417..c5a62aa33990 100644 --- a/lib/Target/ARM/Thumb2SizeReduction.cpp +++ b/lib/Target/ARM/Thumb2SizeReduction.cpp @@ -300,7 +300,7 @@ Thumb2SizeReduce::canAddPseudoFlagDep(MachineInstr *Use, bool FirstInSelfLoop) { for (const MachineOperand &MO : CPSRDef->operands()) { if (!MO.isReg() || MO.isUndef() || MO.isUse()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (Reg == 0 || Reg == ARM::CPSR) continue; Defs.insert(Reg); @@ -309,7 +309,7 @@ Thumb2SizeReduce::canAddPseudoFlagDep(MachineInstr *Use, bool FirstInSelfLoop) { for (const MachineOperand &MO : Use->operands()) { if (!MO.isReg() || MO.isUndef() || MO.isDef()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (Defs.count(Reg)) return false; } @@ -380,7 +380,7 @@ static bool VerifyLowRegs(MachineInstr *MI) { const MachineOperand &MO = MI->getOperand(i); if (!MO.isReg() || MO.isImplicit()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (Reg == 0 || Reg == ARM::CPSR) continue; if (isPCOk && Reg == ARM::PC) @@ -464,11 +464,11 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI, // For this reason we can't reuse the logic at the end of this function; we // have to implement the MI building here. bool IsStore = Entry.WideOpc == ARM::t2STR_POST; - unsigned Rt = MI->getOperand(IsStore ? 1 : 0).getReg(); - unsigned Rn = MI->getOperand(IsStore ? 0 : 1).getReg(); + Register Rt = MI->getOperand(IsStore ? 1 : 0).getReg(); + Register Rn = MI->getOperand(IsStore ? 0 : 1).getReg(); unsigned Offset = MI->getOperand(3).getImm(); unsigned PredImm = MI->getOperand(4).getImm(); - unsigned PredReg = MI->getOperand(5).getReg(); + Register PredReg = MI->getOperand(5).getReg(); assert(isARMLowRegister(Rt)); assert(isARMLowRegister(Rn)); @@ -496,7 +496,7 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI, return true; } case ARM::t2LDMIA: { - unsigned BaseReg = MI->getOperand(0).getReg(); + Register BaseReg = MI->getOperand(0).getReg(); assert(isARMLowRegister(BaseReg)); // For the non-writeback version (this one), the base register must be @@ -524,7 +524,7 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI, break; case ARM::t2LDMIA_RET: { - unsigned BaseReg = MI->getOperand(1).getReg(); + Register BaseReg = MI->getOperand(1).getReg(); if (BaseReg != ARM::SP) return false; Opc = Entry.NarrowOpc2; // tPOP_RET @@ -537,7 +537,7 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI, case ARM::t2STMDB_UPD: { OpNum = 0; - unsigned BaseReg = MI->getOperand(1).getReg(); + Register BaseReg = MI->getOperand(1).getReg(); if (BaseReg == ARM::SP && (Entry.WideOpc == ARM::t2LDMIA_UPD || Entry.WideOpc == ARM::t2STMDB_UPD)) { @@ -743,11 +743,11 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI, // are optimizing for size. return false; - unsigned Reg0 = MI->getOperand(0).getReg(); - unsigned Reg1 = MI->getOperand(1).getReg(); + Register Reg0 = MI->getOperand(0).getReg(); + Register Reg1 = MI->getOperand(1).getReg(); // t2MUL is "special". The tied source operand is second, not first. 
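The rewriteT2FrameIndex change above folds the frame index into the instruction only when the byte offset fits the unsigned immediate field (NumBits bits scaled by Scale) and the frame register is acceptable to the instruction's register class. Below is a sketch of just the range test; the 12-bit/byte and 7-bit/halfword pairings are examples for illustration, not read out of the TableGen definitions, and the real code additionally splits off any remainder that does not fit.

    #include <cstdio>

    // Fit test used when folding a frame index into an immediate offset: the
    // instruction has a NumBits-wide unsigned immediate scaled by Scale bytes,
    // so the largest directly encodable byte offset is ((1 << NumBits) - 1) * Scale.
    static bool fitsImmediate(unsigned Offset, unsigned NumBits, unsigned Scale) {
      unsigned Mask = (1u << NumBits) - 1;
      return Offset <= Mask * Scale;
    }

    int main() {
      // A 12-bit, byte-scaled field: offsets up to 4095 fold directly.
      std::printf("%d %d\n", fitsImmediate(4095, 12, 1), fitsImmediate(4096, 12, 1));
      // A 7-bit field scaled by 2 (halfword-sized accesses): up to 254 bytes.
      std::printf("%d %d\n", fitsImmediate(254, 7, 2), fitsImmediate(255, 7, 2));
      return 0;
    }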
if (MI->getOpcode() == ARM::t2MUL) { - unsigned Reg2 = MI->getOperand(2).getReg(); + Register Reg2 = MI->getOperand(2).getReg(); // Early exit if the regs aren't all low regs. if (!isARMLowRegister(Reg0) || !isARMLowRegister(Reg1) || !isARMLowRegister(Reg2)) @@ -782,7 +782,7 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI, if (Imm > Limit) return false; } else { - unsigned Reg2 = MI->getOperand(2).getReg(); + Register Reg2 = MI->getOperand(2).getReg(); if (Entry.LowRegs2 && !isARMLowRegister(Reg2)) return false; } @@ -868,7 +868,7 @@ Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI, continue; const MachineOperand &MO = MI->getOperand(i); if (MO.isReg()) { - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (!Reg || Reg == ARM::CPSR) continue; if (Entry.LowRegs1 && !isARMLowRegister(Reg)) diff --git a/lib/Target/ARM/ThumbRegisterInfo.cpp b/lib/Target/ARM/ThumbRegisterInfo.cpp index a96417ffce4d..b0ba58d8dc4a 100644 --- a/lib/Target/ARM/ThumbRegisterInfo.cpp +++ b/lib/Target/ARM/ThumbRegisterInfo.cpp @@ -107,8 +107,9 @@ void ThumbRegisterInfo::emitLoadConstPool( MachineFunction &MF = *MBB.getParent(); const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>(); if (STI.isThumb1Only()) { - assert((isARMLowRegister(DestReg) || isVirtualRegister(DestReg)) && - "Thumb1 does not have ldr to high register"); + assert( + (isARMLowRegister(DestReg) || Register::isVirtualRegister(DestReg)) && + "Thumb1 does not have ldr to high register"); return emitThumb1LoadConstPool(MBB, MBBI, dl, DestReg, SubIdx, Val, Pred, PredReg, MIFlags); } @@ -141,7 +142,7 @@ static void emitThumbRegPlusImmInReg( unsigned LdReg = DestReg; if (DestReg == ARM::SP) assert(BaseReg == ARM::SP && "Unexpected!"); - if (!isARMLowRegister(DestReg) && !MRI.isVirtualRegister(DestReg)) + if (!isARMLowRegister(DestReg) && !Register::isVirtualRegister(DestReg)) LdReg = MF.getRegInfo().createVirtualRegister(&ARM::tGPRRegClass); if (NumBytes <= 255 && NumBytes >= 0 && CanChangeCC) { @@ -371,7 +372,7 @@ bool ThumbRegisterInfo::rewriteFrameIndex(MachineBasicBlock::iterator II, if (Opcode == ARM::tADDframe) { Offset += MI.getOperand(FrameRegIdx+1).getImm(); - unsigned DestReg = MI.getOperand(0).getReg(); + Register DestReg = MI.getOperand(0).getReg(); emitThumbRegPlusImmediate(MBB, II, dl, DestReg, FrameReg, Offset, TII, *this); @@ -509,7 +510,7 @@ void ThumbRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, if (MI.mayLoad()) { // Use the destination register to materialize sp + offset. - unsigned TmpReg = MI.getOperand(0).getReg(); + Register TmpReg = MI.getOperand(0).getReg(); bool UseRR = false; if (Opcode == ARM::tLDRspi) { if (FrameReg == ARM::SP || STI.genExecuteOnly()) |
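Most of the remaining hunks are the mechanical switch from plain unsigned register numbers to the Register wrapper and its static isVirtualRegister / isPhysicalRegister helpers. The toy class below only mimics the underlying idea, that virtual register numbers carry a tag in the top bit so the virtual/physical split is a plain bit test on the id; it is a sketch, not the real llvm::Register.

    #include <cassert>
    #include <cstdint>

    // Toy stand-in for the Register wrapper used throughout these hunks (not
    // the real class): virtual register numbers are tagged in the top bit, so
    // isVirtual/isPhysical reduce to a bit test on the raw id.
    class ToyRegister {
      uint32_t Id;
      static constexpr uint32_t VirtualFlag = 1u << 31;

    public:
      constexpr ToyRegister(uint32_t Id = 0) : Id(Id) {}
      static constexpr ToyRegister makeVirtual(uint32_t Index) {
        return ToyRegister(Index | VirtualFlag);
      }
      constexpr bool isVirtual() const { return (Id & VirtualFlag) != 0; }
      constexpr bool isPhysical() const { return Id != 0 && !isVirtual(); }
      constexpr operator uint32_t() const { return Id; } // old unsigned uses keep compiling
    };

    int main() {
      ToyRegister Phys(42);                          // e.g. a target-defined GPR id
      ToyRegister Virt = ToyRegister::makeVirtual(7);
      assert(Phys.isPhysical() && !Phys.isVirtual());
      assert(Virt.isVirtual() && !Virt.isPhysical());
      assert(!ToyRegister().isPhysical());           // 0 means "no register"
      return 0;
    }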